In [None]:
# | default_exp model
# Enable IPython's autoreload so edits to imported modules are picked up live.
%load_ext autoreload
%autoreload 2

In [None]:
# | export
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Final, Union

import joblib
import numpy as np
import pandas as pd
import yaml
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score

from paragraph_detective.data_prep import create_line_features, prepare_data_from_doc

In [None]:
# | export
# File name of the serialized classifier; `get_model` joins it onto the model directory.
MODEL_FILE_NAME: Final[str] = "clf.joblib"
# Feature columns handed to the classifier. The same list is used to slice labelled
# frames (`df_to_x_y`) and per-document line features (`clean_doc_paragraphs`).
X_COLS: Final[list[str]] = [
    "txt_len",
    "end_with_end_sent",
    "end_with_hyphen",
    "start_with_upper",
    "start_with_bullet",
    "diff_len_prev",
    "diff_max_len",
]


def df_to_x_y(
    train_df: pd.DataFrame, y_col: str = "new_paragraph"
) -> tuple[pd.DataFrame, pd.Series]:
    """Split a labelled frame into the feature matrix and the target column.

    Args:
        train_df: Frame containing every column listed in ``X_COLS`` plus ``y_col``.
            Any other columns are ignored.
        y_col: Name of the target column. Defaults to ``"new_paragraph"``.

    Returns:
        ``(x, y)`` where ``x`` is ``train_df`` restricted to ``X_COLS`` (a DataFrame,
        columns in ``X_COLS`` order) and ``y`` is the ``y_col`` column (a Series).
        No conversion to NumPy arrays is performed.

    Raises:
        KeyError: If a feature column or ``y_col`` is missing from ``train_df``.
    """
    return train_df[X_COLS], train_df[y_col]


def get_model(model_path: Union[str, Path, None] = None) -> BaseEstimator:
    """Load the persisted classifier from disk.

    Args:
        model_path: Location of the joblib file. When omitted, the historical
            default ``Path("../model") / MODEL_FILE_NAME`` is used; that path is
            relative to the current working directory, so pass an explicit path
            when calling from anywhere other than the notebook folder.

    Returns:
        The object that ``joblib.load`` deserializes from the file.

    Raises:
        FileNotFoundError: If no file exists at the resolved location.
    """
    path = Path("../model") / MODEL_FILE_NAME if model_path is None else Path(model_path)
    if not path.is_file():
        # Report the absolute path: the default is CWD-relative and easy to get wrong.
        raise FileNotFoundError(f"Model file not found: {path.resolve()}")
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code.
    # Only load model files from a trusted source.
    return joblib.load(path)

In [None]:
# | export
def clean_doc_paragraphs(clf: BaseEstimator, doc_path: Union[str, Path]) -> str:
    """Re-join the lines of a document according to the classifier's predictions.

    The document is turned into per-line features by ``prepare_data_from_doc``;
    the ``X_COLS`` columns of that frame are passed to ``clf.predict``. Each line
    then has a single trailing ``"-"`` removed (if present) and is appended to the
    result, preceded by ``"\n"`` when its prediction equals 1.

    Args:
        clf: Fitted estimator exposing ``predict``.
        doc_path: Path of the document to process.

    Returns:
        The concatenated text.
    """
    features_df, doc_lines = prepare_data_from_doc(doc_path)
    labels = clf.predict(features_df[X_COLS]).tolist()

    pieces = []
    for line, label in zip(doc_lines, labels):
        # Drop one trailing hyphen from the end of the line.
        if line.endswith("-"):
            line = line[:-1]
        # A prediction of 1 marks the start of a new paragraph.
        pieces.append("\n" + line if label == 1 else line)

    # NOTE(review): continuation lines are glued together with no separator, and a
    # first line predicted as 1 yields a leading "\n". Whether that is intended
    # depends on what `prepare_data_from_doc` leaves at the end of each line —
    # confirm before changing.
    return "".join(pieces)

In [None]:
# Manual check: load the saved classifier and print the cleaned text of a sample document.
clf = get_model()
file_path = Path("../test_data/doc_a.txt")  # also read by the next cell to show the raw file
doc_txt = clean_doc_paragraphs(clf, file_path)
print(doc_txt)

# TODO: there should be a \n after Title in the printed output.

In [None]:
# Print the unprocessed contents of the same file.
raw_content = file_path.read_text()

print(raw_content)

In [None]:
# This cell has no `# | export` directive — presumably it stays notebook-only
# and is not written to the generated module (confirm against the nbdev docs).
import nbdev

# Run nbdev's export step on this notebook.
nbdev.nbdev_export("model.ipynb")