In [None]:
#| default_exp model
# Reload imported modules automatically before each cell runs, so edits made to
# source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# | export
import os
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.base import ClassifierMixin, BaseEstimator
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import joblib
import yaml
import numpy as np
from pathlib import Path
from dataclasses import dataclass

In [None]:
# | export
def create_line_features(lines: list[str]) -> pd.DataFrame:
    """Build the per-line feature table for one group (document) of text lines.

    Arguments:
        lines -- the text lines of the group, in reading order

    Returns:
        a dataframe with one row per input line holding the line position, its
        raw length, four boolean boundary flags, the original text and two
        length deltas (`diff_len_prev`, `diff_max_len`).
    """
    sentence_enders = (".", "?", "!")
    bullet_prefixes = ("-", "•", "o ")

    records = []
    for position, text in enumerate(lines):
        raw_length = len(text)
        core = text.strip()
        # Every flag must produce a value even for a blank line, otherwise that
        # line would be dropped and the features would fall out of sync with the
        # input. All the string tests below evaluate to False on "".
        records.append(
            (
                position,
                raw_length,
                core.endswith(sentence_enders),
                core.endswith("-"),
                core[:1].isupper(),
                core.startswith(bullet_prefixes),
                text,
            )
        )

    assert len(records) == len(lines), "all lines must be processed."

    features = pd.DataFrame(
        records,
        columns=[
            "row",
            "txt_len",
            "end_with_end_sent",
            "end_with_hyphen",
            "start_with_upper",
            "start_with_bullet",
            "line_txt",
        ],
    )

    lengths = features["txt_len"]
    # Length change relative to the previous line; the first line has no
    # predecessor, so its own length is used instead.
    features["diff_len_prev"] = lengths.diff().fillna(lengths)
    # Distance to the longest line of the group.
    features["diff_max_len"] = lengths.max() - lengths

    return features


def do_prepare_train_data_grp(df: pd.DataFrame) -> pd.DataFrame:
    """Compute the line features of a single group and attach its target column.

    The training data is a sequence of text lines; every line belongs to a group,
    which plays the role of a document. This function handles one such group.

    Arguments:
        df -- the raw training rows of one group (needs `line_txt` and `new_paragraph`).

    Returns:
        the feature dataframe of the group with `new_paragraph` as last column.
    """
    group_features = create_line_features(df["line_txt"].tolist())
    # The features are indexed 0..n-1, so the target is re-indexed the same way
    # to line up row by row in the concat.
    targets = df["new_paragraph"].reset_index(drop=True)
    return pd.concat([group_features, targets], axis=1)


def prepare_train_data(lines_df: pd.DataFrame) -> pd.DataFrame:
    """Turn raw labelled lines into the per-line feature table, group by group.

    Note that missing `line_txt` values are replaced by "" directly on the frame
    passed in, so the caller's dataframe is modified.

    Arguments:
        lines_df -- raw data with at least the `grp`, `line_txt` and `new_paragraph` columns

    Returns:
        a dataframe of line features (plus the target) for all groups, with the
        index produced by the group-wise apply moved back into columns.
    """
    lines_df["line_txt"] = lines_df["line_txt"].fillna("")
    per_group = lines_df.groupby("grp").apply(do_prepare_train_data_grp)
    return per_group.reset_index()


In [None]:
# Load the raw, ";"-separated line datasets.
train_lines_df = pd.read_csv("../data/train.csv", sep=";")
test_lines_df = pd.read_csv("../data/test.csv", sep=";")

# Compute the per-line features for both splits.
train_df = prepare_train_data(train_lines_df)
test_df = prepare_train_data(test_lines_df)

# Sanity checks: feature extraction must keep every line and every label.
assert train_df.shape[0] == train_lines_df.shape[0], "Line nb in train should be the same"
assert test_df.shape[0] == test_lines_df.shape[0], "Line nb in test should be the same"
assert train_df["new_paragraph"].notna().all(), "there should be no NA target value in training data"
assert test_df["new_paragraph"].notna().all(), "there should be no NA target value in test data"


In [None]:
# | export
from typing import Final

# Names of the columns used as model inputs. They are a subset of the columns
# built by `create_line_features`; `row` and `line_txt` are not part of it.
# The same list is used for training and for prediction.
X_COLS: Final[list[str]] = [
    "txt_len",
    "end_with_end_sent",
    "end_with_hyphen",
    "start_with_upper",
    "start_with_bullet",
    "diff_len_prev",
    "diff_max_len",
]


def df_to_x_y(train_df: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
    """Split a prepared dataframe into model inputs and target.

    Arguments:
        train_df -- prepared dataframe containing the `X_COLS` feature columns
            and the `new_paragraph` target column

    Returns:
        a `(x, y)` tuple where `x` is the dataframe restricted to `X_COLS` and
        `y` is the `new_paragraph` series. Both are pandas objects, not arrays.
    """
    y_col = "new_paragraph"
    return train_df[X_COLS], train_df[y_col]


def cross_validate_model(clf: BaseEstimator, train_df: pd.DataFrame, cv: int = 3) -> list[float]:
    """Cross-validate `clf` on the prepared training data.

    Arguments:
        clf -- the estimator to evaluate
        train_df -- prepared training dataframe (features and target)
        cv -- value forwarded to `cross_val_score` as its `cv` argument

    Returns:
        the scores returned by `cross_val_score`, as a plain list.
    """
    features, target = df_to_x_y(train_df)
    fold_scores = cross_val_score(clf, features, target, cv=cv)
    return fold_scores.tolist()


@dataclass
class ModelScores:
    """Evaluation scores collected for one trained model."""

    # Scores obtained by cross-validation on the training data.
    cv_scores: list[float]
    # Accuracy of the fitted model on the training data.
    train_accuracy: float
    # Accuracy of the fitted model on the test data.
    test_accuracy: float


def save_experiment(clf: BaseEstimator, scores: ModelScores, model_name: str, version: str) -> None:
    """Persist a trained model and its evaluation metadata as one experiment.

    Two files are written to `../experiments/<model_name>/<version>/`:
    `clf.joblib` (the estimator) and `metadata.yaml` (name, version, scores and
    the estimator hyper-parameters). Existing files are overwritten.

    Arguments:
        clf -- the trained estimator to store
        scores -- cross-validation, train and test scores of the model
        model_name -- experiment family name, used as directory name
        version -- experiment version, used as sub-directory name
    """
    exp_dir: Path = Path("../experiments") / model_name / version
    exp_dir.mkdir(parents=True, exist_ok=True)

    joblib.dump(clf, exp_dir / "clf.joblib")

    metadata = {
        "name": model_name,
        "version": version,
        # Scores may arrive as NumPy scalars (e.g. from metric functions); they
        # are converted to built-in floats so the YAML stays plain numbers
        # instead of Python-specific object tags.
        "cv_scores": [float(score) for score in scores.cv_scores],
        "train_accuracy": float(scores.train_accuracy),
        "test_accuracy": float(scores.test_accuracy),
        "hparams": clf.get_params(),
    }
    with open(exp_dir / "metadata.yaml", "w", encoding="utf-8") as f:
        # sort_keys=False keeps the keys in the order defined above.
        yaml.dump(metadata, f, sort_keys=False)


def save_model(clf: BaseEstimator):
    """Store `clf` as the selected model in `../model/clf.joblib`.

    The directory is created when missing; an existing file is overwritten.

    Arguments:
        clf -- the estimator to persist
    """
    target = Path("../model") / "clf.joblib"
    os.makedirs(target.parent, exist_ok=True)
    joblib.dump(clf, target)


def train_model(
    clf: BaseEstimator, train_df: pd.DataFrame, test_df: pd.DataFrame, model_name: str, version: str, cv: int = 3
) -> BaseEstimator:
    """Cross-validate, fit and evaluate `clf`, then record the run as an experiment.

    Arguments:
        clf -- the estimator to train (it is fitted in place)
        train_df -- prepared training dataframe
        test_df -- prepared test dataframe
        model_name -- experiment family name passed to `save_experiment`
        version -- experiment version passed to `save_experiment`
        cv -- cross-validation setting passed to `cross_validate_model`

    Returns:
        the fitted estimator.
    """
    # Cross-validation scores are computed before the final fit on all training rows.
    fold_scores = cross_validate_model(clf, train_df, cv=cv)

    features_train, target_train = df_to_x_y(train_df)
    clf.fit(features_train, target_train)

    features_test, target_test = df_to_x_y(test_df)
    scores = ModelScores(
        cv_scores=fold_scores,
        train_accuracy=accuracy_score(target_train, clf.predict(features_train)),
        test_accuracy=accuracy_score(target_test, clf.predict(features_test)),
    )

    save_experiment(clf, scores, model_name, version)
    return clf


In [None]:
# Run one experiment: train the candidate model and record it (model file and
# metadata) through `train_model`. Several models can be compared this way; the
# best one is then promoted with `save_model`.
# NOTE(review): the experiment is named "xg" but the estimator is scikit-learn's
# GradientBoostingClassifier — confirm the name is intended.
clf = train_model(GradientBoostingClassifier(), train_df, test_df, model_name="xg", version="1.0", cv=3)

# Uncomment to promote this classifier as the selected model:
# save_model(clf)

In [None]:
import joblib

# Persist the trained classifier as the selected model (same location as `save_model`).
model_dir = Path("../model")
os.makedirs(model_dir, exist_ok=True)
model_path: Path = model_dir / "clf.joblib"

# `dump` was never imported on its own; it lives in the `joblib` module.
joblib.dump(clf, model_path)


In [None]:
# Cross-validate a fresh, unfitted classifier on the training data.
# `x` and `y` are built here so the cell does not depend on a later cell.
x, y = df_to_x_y(train_df)
clf = GradientBoostingClassifier()
cross_val_score(clf, x, y, cv=3).tolist()

In [None]:
# Train with the current `train_model` signature (estimator, train and test
# data, experiment name and version), then report the accuracy on the training set.
# Note: this records the "xg"/"1.0" experiment files again.
clf = train_model(GradientBoostingClassifier(), train_df, test_df, model_name="xg", version="1.0", cv=3)
x, y = df_to_x_y(train_df)
clf.score(x, y)


In [None]:
# Score of the current classifier on the held-out test data.
x_test, y_test = df_to_x_y(test_df)
clf.score(x_test, y_test)


In [None]:
import numpy as np


In [None]:
# | export
def load_model(version: str) -> ClassifierMixin:
    """Load the persisted paragraph classifier for a given version.

    Arguments:
        version -- version directory to read from, e.g. "1.0"

    Returns:
        the classifier stored in `../models/paragraph_clf/<version>/clf.joblib`.
    """
    # NOTE(review): this directory is written by neither `save_experiment`
    # (../experiments/...) nor `save_model` (../model/) — presumably the file is
    # copied there separately; confirm.
    # joblib files are pickle-based: only load files from a trusted source.
    clf = joblib.load(f"../models/paragraph_clf/{version}/clf.joblib")
    return clf


In [None]:
# Load version "1.0" back from disk and display the estimator as a quick check.
lclf = load_model("1.0")
lclf


In [None]:
# | export
def prepare_paragraph_from_txt_lines(clf: ClassifierMixin, lines: list[str]) -> str:
    """Rebuild paragraph text from raw lines using the paragraph-start classifier.

    Every line is classified from its features. A line predicted as the start of
    a new paragraph (prediction equal to 1) is preceded by a newline; any other
    line is appended to the current paragraph. A line ending with a hyphen
    (ignoring trailing whitespace, like the `end_with_hyphen` feature) is treated
    as a word split over two lines: the hyphen is dropped and the next line is
    glued on directly. Other continuation lines are separated by a single space
    unless whitespace is already present at the junction.

    Arguments:
        clf -- fitted classifier exposing `predict` on the `X_COLS` features
        lines -- the text lines of one document, in reading order

    Returns:
        the reconstructed text; an empty string when `lines` is empty.
    """
    if not lines:
        # Nothing to classify: avoid calling `predict` on an empty feature table.
        return ""

    lines_df = create_line_features(lines)
    preds = clf.predict(lines_df[X_COLS]).tolist()

    # Pieces are collected and joined once instead of growing a string in a loop.
    pieces: list[str] = []
    last_char = ""  # last character emitted so far, "" while nothing was emitted
    glue_next = False  # True right after a de-hyphenated line
    for line, pred in zip(lines, preds):
        trimmed = line.rstrip()
        split_word = trimmed.endswith("-")
        if split_word:
            line = trimmed[:-1]

        if pred == 1:
            pieces.append("\n")
            last_char = "\n"
            glue_next = False

        if not line:
            # Blank line: contributes no text and leaves the junction state untouched.
            continue

        junction_has_space = last_char == "" or last_char.isspace() or line[0].isspace()
        if not glue_next and not junction_has_space:
            # Keep the last word of the previous line and the first word of this
            # one from being merged together.
            pieces.append(" ")

        pieces.append(line)
        last_char = line[-1]
        glue_next = split_word

    return "".join(pieces)

In [None]:
# Visual check: rebuild the text of the training lines with the current classifier.
# All training lines are passed as one sequence here, not group by group.
print(prepare_paragraph_from_txt_lines(clf, train_lines_df.line_txt.values.tolist()))


In [None]:
# Export the cells marked `# | export` to the Python module with nbdev.
# NOTE(review): the path assumes this notebook is saved as core.ipynb — confirm.
import nbdev

nbdev.nbdev_export("core.ipynb")