## Imports

In [112]:
import sys

import spacy
from sklearn import metrics
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import LinearSVC
from tqdm.auto import tqdm

sys.path.append('..')
from utils import cv_kfold, train_validate_split

In [113]:
import warnings
warnings.filterwarnings("ignore")

In [101]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
     |████████████████████████████████| 12.8 MB 1.0 MB/s            
You should consider upgrading via the '/Users/kblack/Projects/rug/lft-assignment-1/.venv/bin/python -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## Data Loading

In [114]:
def read_file(filename: str) -> pd.DataFrame:
    return pd.DataFrame([
        (l.split()[0], l.split()[1], ' '.join(l.split()[3:]))
        for l in open(filename)
        ], columns=['class', 'sent', 'text']
    )

In [115]:
df_train = read_file('../datasets/train.txt')
df_test = read_file('../datasets/test.txt')

len(df_train), len(df_test)

(5400, 600)

In [116]:
X = df_train['text'].values
y = df_train['class'].values
y_sent = df_train['sent'].values

## Features pipelines

In [117]:
class TextLength(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array([[float(len(i))] for i in X])


class AverageTokensLength(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array([[np.mean([len(j) for j in i.split()])] for i in X])


class PredictSentiment(BaseEstimator, TransformerMixin):
    def __init__(self, X, y_sent):
        self.text_to_sentiment = {i: sent for i, sent in zip(X, y_sent)}
        self.sent_to_num = {'pos': 1, 'neg': -1, '': 0}

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array([
            [self.sent_to_num[self.text_to_sentiment[i]]] for i in X
        ])


nlp = spacy.load('en_core_web_sm')

def spacy_tokenizer(doc):
    return [x.orth_ for x in nlp(doc)]

def spacy_lemmatizer(doc):
    return [x.lemma_ for x in nlp(doc)]

In [109]:
_base_model = ('clf', LinearSVC())
_base_vec = ('vec', TfidfVectorizer())

features = {
    'Baseline': Pipeline([_base_vec, _base_model]),
    'Text Length': Pipeline([('feat', FeatureUnion([_base_vec, ('len', TextLength())])), _base_model]),
    'Avg Tokens Length': Pipeline([('feat', FeatureUnion([_base_vec, ('len', AverageTokensLength())])), _base_model]),
    'Sentiment': Pipeline([('feat', FeatureUnion([_base_vec, ('sent', PredictSentiment(X, y_sent))])), _base_model]),
    'StopWords Filter': Pipeline([('feat', FeatureUnion([('vec', TfidfVectorizer(stop_words='english'))])), _base_model]),
    'NGrams 1-3': Pipeline([('feat', FeatureUnion([('vec', TfidfVectorizer(ngram_range=(1, 3)))])), _base_model]),
    'Add CountVectorizer': Pipeline([('feat', FeatureUnion([_base_vec, ('vec2', CountVectorizer())])), _base_model]),
    'Spacy Tokenizer': Pipeline([('feat', FeatureUnion([('vec', TfidfVectorizer(tokenizer=spacy_tokenizer))])), _base_model]),
    'Spacy Lemmatizer': Pipeline([('feat', FeatureUnion([('vec', TfidfVectorizer(tokenizer=spacy_lemmatizer))])), _base_model]),
}

In [110]:
df_scores = pd.DataFrame(columns=['score', 'time'], index=features.keys())
scorer = lambda *x: metrics.f1_score(*x, average='micro')

for features_name, features_pipeline in tqdm(features.items(), total=len(features), desc='Pipelines'):
    try:
        kfold_result = cv_kfold(features_pipeline, X, y, scorer=scorer, k=5)
        df_scores.loc[features_name, 'score'] = kfold_result['oof_score']
        df_scores.loc[features_name, 'time'] = kfold_result['mean_time']

        # X_train, X_val, y_train, y_val = train_test_split(X, y)
        # validate_results = train_validate_split(features_pipeline, X_train, y_train, X_val, y_val, scorer, verbose=0)
        # df_scores.loc[features_name, 'score'] = validate_results['score']
        # df_scores.loc[features_name, 'time'] = validate_results['time']
    except Exception as e:
        df_scores.loc[features_name, 'score'] = None
        df_scores.loc[features_name, 'time'] = None

df_scores['diff'] = df_scores['score'] - df_scores.loc['Baseline', 'score']

df_scores

Pipelines:   0%|          | 0/9 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Folds:   0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,score,time,diff
Baseline,0.918704,0.923271,0.0
Text Length,0.306667,6.766604,-0.612037
Avg Tokens Length,0.92037,1.949838,0.001667
Sentiment,0.919444,0.785033,0.000741
StopWords Filter,0.91963,0.471536,0.000926
NGrams 1-3,0.918704,5.424568,0.0
Add CountVectorizer,0.871296,2.258375,-0.047407
Spacy Tokenizer,0.917407,80.435213,-0.001296
Spacy Lemmatizer,0.915,83.836742,-0.003704
