# Train models notebook

In [None]:
from typing import Union
from pathlib import Path
import pandas as pd
import os
from sklearn.base import BaseEstimator
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.base import ClassifierMixin, BaseEstimator
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
import joblib
import yaml
import numpy as np
from pathlib import Path
from dataclasses import dataclass
from paragraph_detective.data_prep import prepare_data_from_doc, create_line_features
from paragraph_detective.model import MODEL_FILE_NAME, df_to_x_y


In [None]:
def prepare_train_data_grp(df: pd.DataFrame) -> pd.DataFrame:
    """Build the feature rows for one group of training lines.

    The training data is a sequence of text lines; every line belongs to a group,
    which plays the role of a document. This function handles a single group.

    Arguments:
        df -- raw training lines of one group (columns line_txt and new_paragraph), without any features.

    Returns:
        a dataframe holding the line features of the group followed by the new_paragraph target column.
    """
    features_df = create_line_features(df["line_txt"].tolist())
    # Drop the group's original index so the target lines up with the feature rows
    # (assumes create_line_features returns a default 0..n-1 index -- TODO confirm).
    target = df["new_paragraph"].reset_index(drop=True)
    return pd.concat([features_df, target], axis=1)


def prepare_train_data(lines_df: pd.DataFrame) -> pd.DataFrame:
    """Prepare raw train data and create features.

    Missing line texts are replaced by empty strings, then features are built
    group by group with prepare_train_data_grp.

    Arguments:
        lines_df -- raw train data (columns grp, line_txt and new_paragraph). It is not modified.

    Returns:
        a dataframe of line features, with the group key and the position inside the group
        restored as regular columns by reset_index.
    """
    # assign() works on a copy, so the caller's dataframe is left untouched.
    cleaned_df = lines_df.assign(line_txt=lines_df.line_txt.fillna(""))
    df = cleaned_df.groupby("grp").apply(prepare_train_data_grp).reset_index()
    return df

In [None]:
def prepare_data_from_csv(file_path: Union[str, Path]) -> pd.DataFrame:
    """Load a ';'-separated CSV file of raw lines and turn it into line features.

    Arguments:
        file_path -- location of the CSV file.

    Returns:
        the dataframe produced by prepare_train_data for the lines of the file.
    """
    raw_lines_df = pd.read_csv(file_path, sep=";")
    return prepare_train_data(raw_lines_df)

In [None]:
train_df = prepare_data_from_csv(Path("../data/train.csv"))
test_df = prepare_data_from_csv("../data/test.csv")

# Sanity checks: feature creation must keep exactly one row per raw CSV line.
# The prepared frames are compared with the raw files; comparing a frame with itself can never fail.
raw_train_line_nb = len(pd.read_csv("../data/train.csv", sep=";"))
raw_test_line_nb = len(pd.read_csv("../data/test.csv", sep=";"))
assert len(train_df) == raw_train_line_nb, "Line nb in train should be the same"
assert len(test_df) == raw_test_line_nb, "Line nb in test should be the same"

# A missing target means features and targets got misaligned during preparation.
assert not train_df.new_paragraph.isna().any(), "there should be no NA target value in training data"
assert not test_df.new_paragraph.isna().any(), "there should be no NA target value in test data"

## Model training experiments

In [None]:
def cross_validate_model(clf: BaseEstimator, train_df: pd.DataFrame, cv: int = 3) -> list[float]:
    """Cross-validate a classifier on the training data.

    Arguments:
        clf -- the estimator to evaluate.
        train_df -- prepared training data, split into features and target by df_to_x_y.
        cv -- value forwarded to cross_val_score as its cv argument.

    Returns:
        the scores returned by cross_val_score, as a plain list of floats.
    """
    features, target = df_to_x_y(train_df)
    fold_scores = cross_val_score(clf, features, target, cv=cv)
    return fold_scores.tolist()


@dataclass
class ModelScores:
    """Evaluation results of one classifier on the train and test sets.

    train_model fills every field; save_experiment writes them to metadata.yaml.
    """

    # scores returned by cross-validation on the training data
    cv_scores: list[float]
    # share of correctly classified lines
    train_accuracy: float
    test_accuracy: float
    # confusion matrices as nested lists (sklearn confusion_matrix(...).tolist())
    train_conf_matrix: list[list[int]]
    test_conf_matrix: list[list[int]]
    # recall, precision and F1 as computed by the sklearn metric functions
    train_recall: float
    test_recall: float
    train_precision: float
    test_precision: float
    train_f1: float
    test_f1: float


def get_experiment_path(name: str, version: str) -> Path:
    """Return the directory of an experiment: ../experiments/<name>/<version>.

    Arguments:
        name -- model name of the experiment.
        version -- version of the experiment.

    Returns:
        the path of the experiment directory, relative to the current working directory.
    """
    return Path("../experiments", name, version)


def save_experiment(
    clf: BaseEstimator,
    scores: ModelScores,
    model_name: str,
    version: str,
    train_error_df: pd.DataFrame,
    test_error_df: pd.DataFrame,
) -> None:
    """Save a trained model together with its scores and error analysis.

    Everything is written to the experiment directory given by get_experiment_path,
    which is created when missing: the serialized model, a metadata.yaml file
    (name, version, scores and hyper-parameters) and the train/test error CSV files.

    Arguments:
        clf -- the fitted classifier.
        scores -- evaluation results of the classifier.
        model_name -- name of the experiment.
        version -- version of the experiment.
        train_error_df -- per-line error analysis on the training data.
        test_error_df -- per-line error analysis on the test data.
    """
    exp_dir_path = get_experiment_path(model_name, version)
    os.makedirs(exp_dir_path, exist_ok=True)

    joblib.dump(clf, exp_dir_path / MODEL_FILE_NAME)

    # Test metrics must come from the test_* fields; they used to be copied from the train_* ones.
    metadata = {
        "name": model_name,
        "version": version,
        "cv_scores": scores.cv_scores,
        "train_accuracy": scores.train_accuracy,
        "train_confusion_matrix": scores.train_conf_matrix,
        "train_f1": scores.train_f1,
        "train_precision": scores.train_precision,
        "train_recall": scores.train_recall,
        "test_accuracy": scores.test_accuracy,
        "test_confusion_matrix": scores.test_conf_matrix,
        "test_f1": scores.test_f1,
        "test_precision": scores.test_precision,
        "test_recall": scores.test_recall,
        "hparams": clf.get_params(),
    }
    # sort_keys=False keeps the keys in the order written above.
    with open(exp_dir_path / "metadata.yaml", "w") as f:
        yaml.dump(metadata, f, sort_keys=False)

    train_error_df.to_csv(exp_dir_path / "train_error.csv", index=False)
    test_error_df.to_csv(exp_dir_path / "test_error.csv", index=False)


def save_model(clf: BaseEstimator):
    """Serialize the classifier into the ../model directory, creating the directory when missing.

    Arguments:
        clf -- the classifier to save.
    """
    target_dir = Path("../model")
    target_dir.mkdir(parents=True, exist_ok=True)
    joblib.dump(clf, target_dir / MODEL_FILE_NAME)


def analyze_error(data_df: pd.DataFrame, y_pred: np.ndarray) -> pd.DataFrame:
    """Label every line with the outcome of its prediction.

    Arguments:
        data_df -- data with the true target in the new_paragraph column (1 = positive, 0 = negative).
                   It is not modified.
        y_pred -- predicted targets, one per row of data_df and in the same order.

    Returns:
        a copy of data_df with two extra columns: prediction, and error holding
        "TP", "FN", "TN" or "FP" ("0" when the true target is neither 0 nor 1).
    """
    df = data_df.copy()
    # Assign by position: predictions follow the row order, whatever the index of data_df is.
    df["prediction"] = np.asarray(y_pred)

    is_correct = df.new_paragraph == df.prediction
    is_positive = df.new_paragraph == 1
    is_negative = df.new_paragraph == 0
    error_cond = [
        is_correct & is_positive,  # actual 1, predicted 1
        ~is_correct & is_positive,  # actual 1, predicted otherwise -> missed positive
        is_correct & is_negative,  # actual 0, predicted 0
        ~is_correct & is_negative,  # actual 0, predicted otherwise -> false alarm
    ]
    error_cat = ["TP", "FN", "TN", "FP"]
    # The default must be a string: newer NumPy versions refuse to mix string choices with
    # the implicit integer default 0. "0" is what older versions produced for unmatched rows.
    df["error"] = np.select(error_cond, error_cat, default="0")
    return df


def _evaluate_split(clf: BaseEstimator, x, y, data_df: pd.DataFrame) -> tuple[dict, pd.DataFrame]:
    """Score a fitted classifier on one data split.

    Arguments:
        clf -- the fitted classifier.
        x -- features of the split, as returned by df_to_x_y.
        y -- true targets of the split, as returned by df_to_x_y.
        data_df -- the dataframe x and y were extracted from.

    Returns:
        a dict with the keys accuracy, precision, recall, f1 and conf_matrix,
        and the per-line error dataframe built by analyze_error.
    """
    y_pred = clf.predict(x)
    metrics = {
        "accuracy": float(accuracy_score(y, y_pred)),
        "precision": float(precision_score(y, y_pred)),
        "recall": float(recall_score(y, y_pred)),
        "f1": float(f1_score(y, y_pred)),
        "conf_matrix": confusion_matrix(y, y_pred).tolist(),
    }
    return metrics, analyze_error(data_df, y_pred)


def train_model(
    clf: BaseEstimator,
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    model_name: str,
    version: str,
    cv: int = 3,
) -> tuple[BaseEstimator, ModelScores]:
    """Run one experiment: cross-validate, fit, evaluate and save a classifier.

    Arguments:
        clf -- the classifier to train; it is fitted in place.
        train_df -- prepared training data.
        test_df -- prepared test data.
        model_name -- name of the experiment.
        version -- version of the experiment.
        cv -- value forwarded to cross-validation as its cv argument.

    Returns:
        the fitted classifier and its scores on the train and test data.
    """
    # Cross-validation runs before the final fit on the whole training set.
    cv_scores = cross_validate_model(clf, train_df, cv=cv)

    x_train, y_train = df_to_x_y(train_df)
    clf.fit(x_train, y_train)
    train_metrics, train_error_df = _evaluate_split(clf, x_train, y_train, train_df)

    x_test, y_test = df_to_x_y(test_df)
    test_metrics, test_error_df = _evaluate_split(clf, x_test, y_test, test_df)

    scores = ModelScores(
        cv_scores=cv_scores,
        train_accuracy=train_metrics["accuracy"],
        train_conf_matrix=train_metrics["conf_matrix"],
        train_recall=train_metrics["recall"],
        train_precision=train_metrics["precision"],
        train_f1=train_metrics["f1"],
        test_accuracy=test_metrics["accuracy"],
        test_conf_matrix=test_metrics["conf_matrix"],
        test_recall=test_metrics["recall"],
        test_precision=test_metrics["precision"],
        test_f1=test_metrics["f1"],
    )

    save_experiment(clf, scores, model_name, version, train_error_df, test_error_df)
    return clf, scores


In [None]:
# Run one experiment per candidate model; the best one is then selected and saved into ../model.
# NOTE: "xgb" is scikit-learn's GradientBoostingClassifier, not XGBoost. Both names are kept
# unchanged because they are used as experiment directory names.
models = [("xgb", GradientBoostingClassifier()), ("random_forrest", RandomForestClassifier())]

# The data does not depend on the model: load and prepare it once instead of once per model.
train_df = prepare_data_from_csv(Path("../data/train.csv"))
test_df = prepare_data_from_csv("../data/test.csv")

for name, model in models:
    clf, scores = train_model(model, train_df, test_df, model_name=name, version="1.0", cv=3)

    print(scores)

In [None]:
def load_experiment(from_experiment: bool = False, name: str = "", version: str = "") -> BaseEstimator:
    """Load a saved classifier.

    Arguments:
        from_experiment -- when True, load from the experiment directory of (name, version);
                           otherwise load the model stored in ../model.
        name -- experiment name, only used when from_experiment is True.
        version -- experiment version, only used when from_experiment is True.

    Returns:
        the deserialized classifier.
    """
    model_dir = get_experiment_path(name, version) if from_experiment else Path("../model")
    return joblib.load(model_dir / MODEL_FILE_NAME)


In [None]:
def train_model(train_df: pd.DataFrame):
    """Fit a RandomForestClassifier on the training data and save it with save_model.

    Arguments:
        train_df -- prepared training data.
    """
    # NOTE(review): this rebinds the name train_model and hides the experiment function of the
    # same name defined in an earlier cell (different signature) -- consider renaming one of them.
    features, target = df_to_x_y(train_df)
    final_clf = RandomForestClassifier()
    final_clf.fit(features, target)
    save_model(final_clf)


# Fit the final model on the training data loaded in an earlier cell and save it.
train_model(train_df)
