## Imports

In [12]:
import sys

import joblib
from sklearn import metrics
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from tqdm.auto import tqdm

sys.path.append('..')
from utils import cv_kfold, train_validate_split

In [13]:
import warnings
# Install a blanket "ignore" filter: from here on no warning of any category is shown.
# NOTE(review): this also hides warnings raised while the estimators below are being
# fitted, so problems they would report go unnoticed — consider filtering by category.
warnings.filterwarnings("ignore")

## Data Loading

In [14]:
def read_file(filename: str) -> pd.DataFrame:
    """Parse a whitespace-delimited dataset file into a DataFrame.

    Every non-blank line is split on whitespace and mapped to one row:

    * token 0      -> ``'class'``
    * token 1      -> ``'sent'``
    * token 2      -> dropped (not used by this notebook)
    * tokens 3..   -> ``'text'``, re-joined with single spaces

    Args:
        filename: Path of the text file to read.

    Returns:
        A DataFrame with string columns ``['class', 'sent', 'text']``, one row
        per non-blank line, in file order. An empty file yields an empty frame
        that still carries the three columns.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-blank line has fewer than two tokens.
    """
    rows = []
    # Context manager guarantees the handle is closed even if parsing fails.
    with open(filename) as handle:
        for line in handle:
            tokens = line.split()  # split once instead of three times per line
            if not tokens:
                # Blank lines (e.g. a trailing newline at EOF) carry no record.
                continue
            rows.append((tokens[0], tokens[1], ' '.join(tokens[3:])))
    return pd.DataFrame(rows, columns=['class', 'sent', 'text'])

In [15]:
# Parse both dataset splits with the same reader, then show their row counts.
df_train, df_test = map(
    read_file,
    ('../datasets/train.txt', '../datasets/test.txt'),
)

len(df_train), len(df_test)

(5400, 600)

In [16]:
# Pull the raw arrays out of the training frame:
#   X      <- 'text'  column (model input)
#   y      <- 'class' column (target used in the experiments below)
#   y_sent <- 'sent'  column
X, y, y_sent = (df_train[column].values for column in ('text', 'class', 'sent'))

## Experiments

In [22]:
# Single seed shared by every estimator that draws random numbers, so that
# re-running the notebook reproduces the same score table.
RANDOM_STATE = 42

# Text vectorizers under comparison, keyed by the column label used in the result tables.
vectorizers = {
    'Count': CountVectorizer(),
    'Tfidf': TfidfVectorizer(),
    # NOTE(review): per the scikit-learn docs, HashingVectorizer defaults to
    # alternate_sign=True and can therefore emit negative feature values, which
    # MultinomialNB does not accept — confirm whether that pairing is intended.
    'Hashing': HashingVectorizer(),
}

# Classifiers under comparison, keyed by the row label used in the result tables.
# Estimators without a seed argument below are left at their defaults.
models = {
    'LogReg': LogisticRegression(),
    'SVM': SVC(),
    'LinearSVM': LinearSVC(random_state=RANDOM_STATE),
    'RandomForest': RandomForestClassifier(random_state=RANDOM_STATE),
    'ExtraTrees': ExtraTreesClassifier(random_state=RANDOM_STATE),
    'GradientBoosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'NaiveBayes': MultinomialNB(),
    'KNeighbors': KNeighborsClassifier(),
}

In [23]:
# Benchmark every (model, vectorizer) pair with k-fold cross-validation.
# Rows are models, columns are vectorizers; cells stay NaN until that pair has
# been evaluated successfully.
N_FOLDS = 5

df_scores = pd.DataFrame(columns=list(vectorizers), index=list(models), dtype=float)
df_time = pd.DataFrame(columns=list(vectorizers), index=list(models), dtype=float)

# Pairs that raised during evaluation, mapped to the exception, so that a NaN
# caused by a failure can be told apart from a pair that was never reached.
failures = {}


def scorer(*args):
    """Micro-averaged F1; positional arguments are forwarded to `metrics.f1_score`."""
    return metrics.f1_score(*args, average='micro')


# NOTE(review): joblib.Memory() without a location performs no caching (see the
# joblib docs), so `memory=` below is effectively a pass-through. Pointing it at
# a directory would cache fitted vectorizers but would also distort `df_time`.
joblib_memory = joblib.Memory()

for model_name, model in tqdm(models.items(), total=len(models), desc='models'):
    for vec_name, vec in tqdm(vectorizers.items(), total=len(vectorizers), desc='vectorizer'):
        pipeline = Pipeline(
            steps=[
                ('vec', vec),
                ('cls', model),
            ],
            memory=joblib_memory,
        )

        try:
            # k-fold CV via the project helper; a single hold-out evaluation via
            # `train_validate_split` was the earlier alternative used here.
            kfold_result = cv_kfold(pipeline, X, y, scorer=scorer, k=N_FOLDS)
            df_scores.loc[model_name, vec_name] = kfold_result['oof_score']
            df_time.loc[model_name, vec_name] = kfold_result['mean_time']
        except Exception as exc:
            # Best effort: one failing pair must not abort the whole sweep, but
            # the reason is recorded and reported instead of being discarded.
            failures[(model_name, vec_name)] = exc
            print(f'{model_name} + {vec_name} failed: {exc!r}')
            df_scores.loc[model_name, vec_name] = np.nan
            df_time.loc[model_name, vec_name] = np.nan

df_scores

models:   0%|          | 0/8 [00:00<?, ?it/s]

vectorizer:   0%|          | 0/3 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

vectorizer:   0%|          | 0/3 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

vectorizer:   0%|          | 0/3 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

vectorizer:   0%|          | 0/3 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

vectorizer:   0%|          | 0/3 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

vectorizer:   0%|          | 0/3 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [24]:
# Show the 'mean_time' value cv_kfold reported for each model (row) / vectorizer (column)
# pair; units are not visible here — presumably seconds, confirm in utils.
# Blank cells are pairs that either failed or were never evaluated.
df_time

Unnamed: 0,Count,Tfidf,Hashing
LogReg,3.096869,2.601949,72.520418
SVM,11.659743,17.306558,10.384465
LinearSVM,1.240577,0.557531,0.725248
RandomForest,5.302111,5.129231,209.46437
ExtraTrees,7.842815,7.88569,354.351009
GradientBoosting,25.689576,57.326416,
NaiveBayes,,,
KNeighbors,,,


In [25]:
# Show the 'oof_score' value cv_kfold reported for each model (row) / vectorizer (column)
# pair, computed with the micro-averaged F1 scorer.
# Blank cells are pairs that either failed or were never evaluated.
df_scores

Unnamed: 0,Count,Tfidf,Hashing
LogReg,0.879815,0.911296,0.866852
SVM,0.757407,0.902778,0.861852
LinearSVM,0.87037,0.918704,0.905556
RandomForest,0.872593,0.87537,0.850926
ExtraTrees,0.879815,0.883148,0.857963
GradientBoosting,0.847222,0.847037,
NaiveBayes,,,
KNeighbors,,,
