In [None]:
#| default_exp data_prep
%load_ext autoreload
%autoreload 2

In [None]:
# | exporti
import os
import pandas as pd
import joblib
import yaml
from typing import Union, Final
import numpy as np
from pathlib import Path
from dataclasses import dataclass


In [None]:
# | export

# Names of the columns of the dataframe built by `create_line_features`,
# listed in the order in which that function creates them.
LINE_FEAT_NAMES: Final[list[str]] = [
    "row",
    "txt_len",
    "end_with_end_sent",
    "end_with_hyphen",
    "start_with_upper",
    "start_with_bullet",
    "line_txt",
    "diff_len_prev",
    "diff_max_len",
]


def create_line_features(lines: list[str]) -> pd.DataFrame:
    """Build the per-line feature table for one group (document) of text lines.

    Arguments:
        lines -- the text lines of the document, in reading order

    Returns:
        a dataframe with one row per input line: its position, its raw length,
        boolean flags describing how the stripped text ends and starts, the
        line text itself and two length-difference columns.
    """
    base_columns = [
        "row",
        "txt_len",
        "end_with_end_sent",
        "end_with_hyphen",
        "start_with_upper",
        "start_with_bullet",
        "line_txt",
    ]

    records = []
    for position, text in enumerate(lines):
        length = len(text)
        stripped = text.strip()
        if stripped:
            last_char = stripped[-1]
            ends_sentence = last_char in (".", "?", "!")
            ends_hyphen = last_char == "-"
            starts_upper = stripped[0].isupper()
            starts_bullet = stripped.startswith(("-", "•", "o "))
        else:
            # A blank line must still yield a row: every flag falls back to
            # False rather than the line being dropped from the output.
            ends_sentence = ends_hyphen = starts_upper = starts_bullet = False
        records.append(
            (position, length, ends_sentence, ends_hyphen, starts_upper, starts_bullet, text)
        )

    assert len(records) == len(lines), "all lines must be processed."

    features = pd.DataFrame(records, columns=base_columns)

    # Length change relative to the previous line; the first line has no
    # predecessor, so its own length is used instead of NaN.
    features["diff_len_prev"] = features["txt_len"].diff().fillna(features["txt_len"])
    # Gap between each line and the longest line of the group.
    features["diff_max_len"] = features["txt_len"].max() - features["txt_len"]

    return features


def prepare_train_data_grp(df: pd.DataFrame) -> pd.DataFrame:
    """Compute the line features of one group and attach the training target.

    Training data provides examples of sequences of text lines. Each line
    belongs to a group, which plays the role of a document. This function
    processes a single group.

    Arguments:
        df -- raw training rows of one group, without any features; must
            provide the `line_txt` and `new_paragraph` columns.

    Returns:
        a dataframe with one row per line of the group: the columns produced
        by `create_line_features` followed by `new_paragraph`.
    """
    lines = df["line_txt"].tolist()
    lines_feats_df = create_line_features(lines)
    # The feature frame is freshly built and indexed 0..n-1, while the group may
    # carry any index. Dropping the group's index makes concat pair the rows by
    # position, without building a throwaway frame just to discard its index.
    target = df["new_paragraph"].reset_index(drop=True)
    return pd.concat([lines_feats_df, target], axis=1)


def prepare_train_data(lines_df: pd.DataFrame) -> pd.DataFrame:
    """Prepare raw train data and create the line features, group by group.

    Arguments:
        lines_df -- raw train data; must provide the `grp`, `line_txt` and
            `new_paragraph` columns. The frame is not modified.

    Returns:
        a dataframe of line features for all groups, with the group index
        moved back into regular columns by `reset_index`.
    """
    # Missing text is replaced by an empty string so that the feature code can
    # take its length and strip it. `assign` returns a new frame, which keeps
    # the caller's dataframe untouched.
    filled_df = lines_df.assign(line_txt=lines_df["line_txt"].fillna(""))
    return filled_df.groupby("grp").apply(prepare_train_data_grp).reset_index()


In [None]:
# | export
def prepare_data_from_csv(file_path: Union[str, Path]) -> pd.DataFrame:
    """Read a `;`-separated CSV of raw lines and return its prepared features.

    Arguments:
        file_path -- location of the CSV file to read

    Returns:
        the dataframe produced by `prepare_train_data` for the file's rows.
    """
    return prepare_train_data(pd.read_csv(file_path, sep=";"))


In [None]:
train_csv = Path("../data/train.csv")
test_csv = "../data/test.csv"
train_df = prepare_data_from_csv(train_csv)
test_df = prepare_data_from_csv(test_csv)
# Compare against the row count of the raw CSV so that the check can actually
# detect lines lost during preparation.
assert len(train_df) == len(pd.read_csv(train_csv, sep=";")), "Line nb in train should be the same"
assert len(test_df) == len(pd.read_csv(test_csv, sep=";")), "Line nb in test should be the same"
assert len(train_df[train_df.new_paragraph.isna()]) == 0, "there should be no NA target value in training data"
assert len(test_df[test_df.new_paragraph.isna()]) == 0, "there should be no NA target value in test data"


In [None]:
# | export
def prepare_data_from_doc(file_path: Union[str, Path], encoding: str = "utf-8") -> pd.DataFrame:
    """Read a plain-text document and compute the features of its lines.

    Arguments:
        file_path -- location of the text file to read
        encoding -- text encoding of the file. It is passed explicitly so that
            decoding (and therefore the detection of the non-ASCII bullet
            character) does not depend on the platform's default locale.

    Returns:
        the dataframe produced by `create_line_features`, one row per line.
    """
    with open(file_path, "r", encoding=encoding) as f:
        # Splitting on "\n" keeps a trailing empty line when the file ends
        # with a newline, so the row count matches the raw text exactly.
        lines: list[str] = f.read().split("\n")
    # The file is no longer needed once its content has been read.
    return create_line_features(lines)


In [None]:
prep_df = prepare_data_from_doc(Path("../test_data/doc_a.txt"))
# Every column of the prepared frame must be a declared feature name.
assert all(col in LINE_FEAT_NAMES for col in prep_df.columns)

In [None]:
# Run nbdev's export on this notebook.
# NOTE(review): presumably this writes the cells tagged `# | export` /
# `# | exporti` to the module named by `default_exp` — confirm against the
# nbdev documentation.
import nbdev

nbdev.nbdev_export("prepare.ipynb")