## Imports

In [53]:
import sys
from pathlib import Path
import os
import typing as tp
from collections.abc import Callable
from copy import deepcopy
import time
from pathlib import Path

import joblib
from sklearn import metrics
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.base import BaseEstimator
from sklearn.model_selection import KFold
from tqdm.autonotebook import tqdm

In [54]:
import warnings

# Install a blanket "ignore" filter for every warning category in this kernel.
warnings.filterwarnings("ignore")

# When no -W options were supplied to the interpreter, add the simple "ignore"
# filter as well and export it through the environment so that subprocesses
# started from this kernel are affected too.
if len(sys.warnoptions) == 0:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"

## Data loading

In [55]:
# Folder that holds train.tsv / val.tsv / test.tsv (a relative path).
# Use Path('.') instead when the files sit in the current directory.
DATA_FOLDER = Path('..') / 'datasets'

In [56]:
def _read_split(file_name: str) -> pd.DataFrame:
    """Read one tab-separated, header-less file from DATA_FOLDER into (text, label) columns."""
    return pd.read_csv(
        DATA_FOLDER / file_name,
        sep='\t',
        header=None,
        names=['text', 'label'],
    )


df_train, df_val, df_test = (
    _read_split(file_name) for file_name in ('train.tsv', 'val.tsv', 'test.tsv')
)

# Row counts of the three splits, displayed as the cell output
tuple(len(frame) for frame in (df_train, df_val, df_test))

(12240, 1000, 860)

## Auxiliary functions 

In [57]:
def train_validate_split(
        model: "BaseEstimator",
        data_train: list,
        target_train: list,
        data_val: list,
        target_val: list,
        scorer: Callable[[tp.Any, tp.Any], float],
        *,
        verbose: int = 0,
) -> dict[str, tp.Any]:
    """Fit ``model`` on the train split and score its predictions on the validation split.

    :param model: object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place
    :param data_train: train samples
    :param target_train: train targets
    :param data_val: validation samples to predict on
    :param target_val: validation targets to score against
    :param scorer: function to score prediction. args: target, prediction
    :param verbose: keyword-only; any truthy value prints the score and the fit time.
        Callers in this notebook pass ``verbose=...``, so the parameter must be accepted.
    :return: dict with keys ``pred_val`` (predictions for ``data_val``),
        ``score`` (value returned by ``scorer``) and ``time`` (seconds spent in ``fit`` only)
    """
    data_train, data_val = np.asarray(data_train), np.asarray(data_val)
    target_train, target_val = np.asarray(target_train), np.asarray(target_val)

    # Time the fit only. perf_counter is monotonic, so a system clock adjustment
    # during a long fit cannot distort (or negate) the measured duration.
    start_time = time.perf_counter()
    model.fit(data_train, target_train)
    fit_seconds = time.perf_counter() - start_time

    # Predict on the held-out split and score against its targets
    pred_val = model.predict(data_val)
    score = scorer(target_val, pred_val)

    if verbose:
        print(f'score={score}, fit_time={fit_seconds:.3f}s')

    return {
        'pred_val': pred_val,
        'score': score,
        'time': fit_seconds,
    }

In [58]:
def cv_kfold(
        model: "BaseEstimator",
        data: list,
        target: list,
        scorer: Callable[[tp.Any, tp.Any], float],
        k: int = 5,
        *,
        random_state: int = 42,
) -> dict[str, tp.Any]:
    """Fit and score a fresh copy of ``model`` on each fold of a shuffled k-fold split.

    :param model: Model to be trained; it is deep-copied per fold and never fitted itself
    :param data: train data to perform k-fold cv
    :param target: target train data
    :param scorer: function to score prediction. args: target, prediction
    :param k: number of folds in cross validation
    :param random_state: fixed random state used to shuffle the folds
    :return: dict with results of cv:
        ``train_pred`` - out-of-fold prediction for every sample,
        ``mean_score`` - mean of the per-fold validation scores,
        ``oof_scores`` - score over all samples predicted so far, one entry per fold,
        ``mean_oof_score`` - mean of ``oof_scores``,
        ``full_oof_scores`` - per-fold validation scores wrapped in a one-element list,
        ``oof_score`` - score of the complete out-of-fold prediction,
        ``times`` / ``mean_time`` - per-fold fit times and their mean
    """
    data = np.asarray(data)
    target = np.asarray(target)
    n_samples = data.shape[0]

    # The buffer stores predicted labels, so it must use the target's dtype;
    # the dtype of the input samples (e.g. text) is unrelated to the labels.
    pred_train = np.empty(n_samples, dtype=target.dtype)
    # Marks which positions of pred_train already hold a real prediction
    is_predicted = np.zeros(n_samples, dtype=bool)

    fold_scores, cumulative_oof_scores, times = [], [], []

    kf = KFold(n_splits=k, shuffle=True, random_state=np.random.RandomState(random_state))
    for train_index, val_index in kf.split(data):
        # Every fold trains its own copy, leaving the caller's model untouched
        fold_result = train_validate_split(
            deepcopy(model),
            data[train_index], target[train_index],
            data[val_index], target[val_index],
            scorer,
        )

        times.append(fold_result['time'])
        fold_scores.append(fold_result['score'])

        # Save the out-of-fold predictions
        pred_train[val_index] = fold_result['pred_val']
        is_predicted[val_index] = True

        # Score only positions that were actually predicted; the remainder of an
        # np.empty buffer is uninitialised and must never reach the scorer.
        cumulative_oof_scores.append(
            scorer(target[is_predicted], pred_train[is_predicted])
        )

    return {
        'train_pred': pred_train,
        'mean_score': sum(score / float(k) for score in fold_scores),
        'mean_oof_score': np.mean(cumulative_oof_scores),
        'oof_scores': cumulative_oof_scores,
        'full_oof_scores': [fold_scores],
        'oof_score': scorer(target, pred_train),
        'times': times,
        'mean_time': np.mean(times),
    }

## Model selection

In [59]:
# One shared seed for the estimators that use randomness, so that re-running the
# notebook reproduces the same scores.
RANDOM_STATE = 42

# Text vectorizers to compare, all with default settings
vectorizers = {
    'Count': CountVectorizer(),
    'Tfidf': TfidfVectorizer(),
    # NOTE(review): the NaiveBayes/Hashing cell of the result tables is empty;
    # presumably HashingVectorizer's default signed output is rejected by
    # MultinomialNB - confirm before relying on that combination.
    'Hashing': HashingVectorizer(),
}
# Classifiers to compare, default hyper-parameters apart from the seed
models = {
    'LogReg': LogisticRegression(),
    'SVM': SVC(),
    'LinearSVM': LinearSVC(random_state=RANDOM_STATE),
    'RandomForest': RandomForestClassifier(random_state=RANDOM_STATE),
    'GradientBoosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'NaiveBayes': MultinomialNB(),
    'KNeighbors': KNeighborsClassifier(),
}

In [60]:
%%time 

# Result grids: one row per classifier, one column per vectorizer
df_scores = pd.DataFrame(columns=vectorizers.keys(), index=models.keys())
df_time = pd.DataFrame(columns=vectorizers.keys(), index=models.keys())


def scorer(y_true, y_pred) -> float:
    """Macro-averaged F1 of ``y_pred`` against ``y_true``."""
    return metrics.f1_score(y_true, y_pred, average='macro')


# NOTE: joblib.Memory() created without a location does not cache anything;
# give it a directory if the fitted vectorizers should be reused across classifiers.
joblib_memory = joblib.Memory()

# (model name, vectorizer name) -> exception, for every combination that failed
fit_errors = {}

for vec_name, vec in tqdm(vectorizers.items(), total=len(vectorizers), desc='vectorizer'):
    for model_name, model in tqdm(models.items(), total=len(models), desc='models'):
        # Fresh copies, so fitted state from one combination never leaks into the next
        pipeline = Pipeline(
            steps=[
                ('vec', deepcopy(vec)),
                ('cls', deepcopy(model)),
            ],
            memory=joblib_memory,
        )

        # Train on the full train set, score on the dedicated validation set.
        # (cv_kfold(pipeline, df_train['text'], df_train['label'], scorer=scorer, k=5)
        # is the k-fold alternative; it reports 'oof_score' and 'mean_time'.)
        try:
            validate_results = train_validate_split(
                pipeline,
                df_train['text'], df_train['label'],
                df_val['text'], df_val['label'],
                scorer,
            )
        except Exception as error:
            # Keep the sweep going, but make the failure visible instead of hiding it
            fit_errors[(model_name, vec_name)] = error
            tqdm.write(f'{model_name} + {vec_name} failed: {error!r}')
            score, fit_time = None, None
        else:
            score, fit_time = validate_results['score'], validate_results['time']

        df_scores.loc[model_name, vec_name] = score
        df_time.loc[model_name, vec_name] = fit_time

vectorizer:   0%|                                                                                                 | 0/3 [00:00<?, ?it/s]
models:   0%|                                                                                                     | 0/7 [00:00<?, ?it/s][A
models:  14%|█████████████▎                                                                               | 1/7 [00:00<00:03,  1.66it/s][A
models:  29%|██████████████████████████▌                                                                  | 2/7 [00:23<01:07, 13.49s/it][A
models:  43%|███████████████████████████████████████▊                                                     | 3/7 [00:24<00:30,  7.75s/it][A
models:  57%|█████████████████████████████████████████████████████▏                                       | 4/7 [00:37<00:29,  9.92s/it][A
models:  71%|██████████████████████████████████████████████████████████████████▍                          | 5/7 [00:39<00:14,  7.26s/it][A
models:  86%|██████████

CPU times: user 18min 49s, sys: 16.7 s, total: 19min 5s
Wall time: 18min 11s





In [61]:
# Validation macro-F1 for every classifier (rows) x vectorizer (columns) combination
df_scores

Unnamed: 0,Count,Tfidf,Hashing
LogReg,0.689854,0.676762,0.639036
SVM,0.612472,0.664,0.633952
LinearSVM,0.691414,0.695577,0.679687
RandomForest,0.669319,0.671942,0.563775
GradientBoosting,0.63987,0.633718,0.633318
NaiveBayes,0.662126,0.504388,
KNeighbors,0.514452,0.592787,0.558429


In [87]:
# Print the score table as LaTeX rows: \text{model} & score & score & score \\
for model_name, row in df_scores.iterrows():
    # Test for missing values explicitly: a truthiness check would also turn a
    # genuine 0.0 score into '-'.
    cells = ' & '.join(f'{value:.3f}' if pd.notna(value) else '-' for value in row)
    print(r'\text{' + model_name + '}', '&', cells, r'\\')

\text{LogReg} & 0.690 & 0.677 & 0.639 \\
\text{SVM} & 0.612 & 0.664 & 0.634 \\
\text{LinearSVM} & 0.691 & 0.696 & 0.680 \\
\text{RandomForest} & 0.669 & 0.672 & 0.564 \\
\text{GradientBoosting} & 0.640 & 0.634 & 0.633 \\
\text{NaiveBayes} & 0.662 & 0.504 & - \\
\text{KNeighbors} & 0.514 & 0.593 & 0.558 \\


In [62]:
# Seconds spent fitting each classifier (rows) x vectorizer (columns) pipeline
df_time

Unnamed: 0,Count,Tfidf,Hashing
LogReg,0.571754,0.500931,11.501481
SVM,21.151801,41.121188,33.87945
LinearSVM,0.890761,0.290308,0.241149
RandomForest,13.13926,11.265821,906.895696
GradientBoosting,2.507715,5.887582,34.143957
NaiveBayes,0.227256,0.232428,
KNeighbors,0.22036,0.212619,0.16012
