## Imports

In [None]:
import sys

import spacy
from sklearn import metrics
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm
from nltk.stem.snowball import SnowballStemmer

sys.path.append('..')
from utils import cv_kfold, train_validate_split

In [None]:
import warnings
# Silence every warning category for the rest of the session so that library
# warnings do not flood the cell outputs.
# NOTE(review): this also hides deprecation/convergence warnings that may be
# relevant when interpreting results.
warnings.filterwarnings("ignore")

In [None]:
# IPython shell escape: download the small English spaCy model that is loaded
# later via spacy.load('en_core_web_sm').
# NOTE(review): `python` here is whatever is first on PATH — confirm it is the
# same interpreter the notebook kernel runs on.
!python -m spacy download en_core_web_sm

## Data Loading

In [None]:
def read_file(filename: str) -> pd.DataFrame:
    """Load a whitespace-delimited dataset file into a DataFrame.

    Every non-blank line is split on whitespace: token 0 becomes ``class``,
    token 1 becomes ``sent`` and tokens 3 onwards are re-joined with single
    spaces into ``text``. Token 2 is not used.

    Args:
        filename: Path of the UTF-8 encoded file to read.

    Returns:
        A DataFrame with the columns ``class``, ``sent`` and ``text``, one
        row per non-blank input line.

    Raises:
        IndexError: If a non-blank line contains fewer than two tokens.
    """
    rows = []
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(filename, encoding='utf8') as handle:
        for line in handle:
            tokens = line.split()  # split once instead of three times per line
            if not tokens:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            rows.append((tokens[0], tokens[1], ' '.join(tokens[3:])))
    return pd.DataFrame(rows, columns=['class', 'sent', 'text'])

In [None]:
# Parse the train and test splits with read_file (columns: class, sent, text).
df_train = read_file('../datasets/train.txt')
df_test = read_file('../datasets/test.txt')

# Bare expression: in a notebook this displays the row counts of both frames.
len(df_train), len(df_test)

In [None]:
# Pull the training columns out as NumPy arrays.
X = df_train['text'].values  # text of each training row
y = df_train['class'].values  # first token of each line; used as the target below
y_sent = df_train['sent'].values  # second token of each line; consumed by PredictSentiment

## Feature pipelines

In [None]:
class TextLength(BaseEstimator, TransformerMixin):
    """Stateless transformer that emits the character length of every text."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return an array holding one single-element row ``[len(text)]`` per item of ``X``."""
        char_counts = map(len, X)
        return np.array([[count] for count in char_counts])

class WordCount(BaseEstimator, TransformerMixin):
    """Stateless transformer that emits the number of whitespace-separated tokens per text."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return an array holding one single-element row ``[token_count]`` per item of ``X``."""
        rows = []
        for text in X:
            tokens = text.split()
            rows.append([len(tokens)])
        return np.array(rows)
    
class AverageTokensLength(BaseEstimator, TransformerMixin):
    """Stateless transformer that emits the mean whitespace-token length of every text."""

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return an array holding one single-element row ``[mean_token_length]`` per item of ``X``.

        A text without any token (empty or whitespace-only) yields ``0.0``.
        """
        rows = []
        for text in X:
            token_lengths = [len(token) for token in text.split()]
            # np.mean([]) is NaN (and emits a RuntimeWarning); a NaN feature
            # would propagate into the scaler / classifier, so fall back to 0.
            rows.append([np.mean(token_lengths) if token_lengths else 0.0])
        return np.array(rows)


class PredictSentiment(BaseEstimator, TransformerMixin):
    """Transformer that looks up a known sentiment label for each text and encodes it as a number.

    The lookup table is built from the ``X`` / ``y_sent`` pairs given to the
    constructor; nothing is learned in ``fit``.

    NOTE(review): the feature is a lookup of provided labels, not a prediction;
    texts that were not passed to the constructor have no label — confirm this
    is the intended experiment.

    Args:
        X: Iterable of texts.
        y_sent: Iterable of sentiment labels aligned with ``X``.
    """

    def __init__(self, X, y_sent):
        # Keep the constructor arguments as same-named attributes:
        # BaseEstimator.get_params() reads them back via getattr, so without
        # these assignments get_params()/clone() raise AttributeError.
        self.X = X
        self.y_sent = y_sent
        self.text_to_sentiment = dict(zip(X, y_sent))
        self.sent_to_num = {'pos': 1, 'neg': -1, '': 0}

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return an array holding one single-element row with the numeric sentiment of each text.

        Texts missing from the lookup table are treated as having the empty
        label ``''`` and labels missing from ``sent_to_num`` map to ``0``,
        instead of raising ``KeyError``.
        """
        rows = []
        for text in X:
            label = self.text_to_sentiment.get(text, '')
            rows.append([self.sent_to_num.get(label, 0)])
        return np.array(rows)


# Module-level NLP resources, created once and shared by the tokenizer helpers below.
nlp = spacy.load('en_core_web_sm')  # requires the en_core_web_sm model to be installed
stemmer = SnowballStemmer(language='english')

def spacy_tokenizer(doc):
    """Run ``doc`` through the module-level spaCy pipeline and return each token's ``orth_`` string."""
    parsed = nlp(doc)
    return [token.orth_ for token in parsed]

def spacy_lemmatizer(doc):
    """Run ``doc`` through the module-level spaCy pipeline and return each token's ``lemma_`` string."""
    parsed = nlp(doc)
    return [token.lemma_ for token in parsed]

def nltk_stems(doc):
    """Return the Snowball stem of every token in ``doc``.

    Args:
        doc: Either a raw text string or an iterable of already-split tokens.

    Returns:
        A list with one stem per token.
    """
    if isinstance(doc, str):
        # A vectorizer passes the raw document string to its ``tokenizer``;
        # iterating over a string yields single characters, so split it into
        # word tokens first. The pattern is scikit-learn's default
        # ``token_pattern``, which keeps this tokenization comparable with
        # the default TfidfVectorizer.
        import re
        doc = re.findall(r"(?u)\b\w\w+\b", doc)
    return [stemmer.stem(token) for token in doc]

def spacy_pos(doc):
    """Run ``doc`` through the module-level spaCy pipeline and return each token's ``pos_`` tag."""
    parsed = nlp(doc)
    return [tok.pos_ for tok in parsed]

def spacy_ne(doc):
    """Run ``doc`` through the module-level spaCy pipeline and return the ``label_`` of every entity in ``.ents``."""
    entities = nlp(doc).ents
    return [entity.label_ for entity in entities]

In [None]:
# Kept for backward compatibility with any other cell that references them;
# the pipelines below build fresh steps instead of sharing these instances.
_base_model = ('clf', LinearSVC())
_base_vec = ('vec', TfidfVectorizer())


def _clf_step():
    """Return a new, unfitted ``('clf', LinearSVC())`` step."""
    return ('clf', LinearSVC())


def _tfidf_step(**kwargs):
    """Return a new ``('vec', TfidfVectorizer(**kwargs))`` step."""
    return ('vec', TfidfVectorizer(**kwargs))


def _tag_step(name, tokenizer):
    """Return a ``CountVectorizer`` step that counts the tags produced by ``tokenizer``.

    ``lowercase=False`` because the vectorizer otherwise lowercases the raw
    document *before* handing it to the tokenizer, and spaCy's tagging and
    entity recognition depend on the original casing.
    """
    return (name, CountVectorizer(tokenizer=tokenizer, lowercase=False))


def _feature_pipeline(*transformers):
    """Return ``Pipeline([('feat', FeatureUnion(transformers)), clf])`` with a fresh classifier."""
    return Pipeline([('feat', FeatureUnion(list(transformers))), _clf_step()])


# Experiment name -> pipeline. Each pipeline owns its own estimator instances
# so fitting one experiment never changes the state of another.
features = {
    'Baseline': Pipeline([_tfidf_step(), _clf_step()]),
    'Text Length': _feature_pipeline(_tfidf_step(), ('len', TextLength())),
    'Avg Tokens Length': _feature_pipeline(_tfidf_step(), ('avglen', AverageTokensLength())),
    'Word Count': _feature_pipeline(_tfidf_step(), ('wcnt', WordCount())),
    'Sentiment': _feature_pipeline(_tfidf_step(), ('sent', PredictSentiment(X, y_sent))),
    'StopWords Filter': _feature_pipeline(_tfidf_step(stop_words='english')),
    'NGrams 1-3': _feature_pipeline(_tfidf_step(ngram_range=(1, 3))),
    'Add CountVectorizer': _feature_pipeline(_tfidf_step(), ('vec2', CountVectorizer())),
    'Spacy Tokenizer': _feature_pipeline(_tfidf_step(tokenizer=spacy_tokenizer)),
    'Spacy Lemmatizer': _feature_pipeline(_tfidf_step(tokenizer=spacy_lemmatizer)),
    'NGrams 1-2 + StopWords Filter + sublinear_tf=True': _feature_pipeline(
        _tfidf_step(stop_words="english", sublinear_tf=True, ngram_range=(1, 2)),
    ),
    'Character-level': _feature_pipeline(
        _tfidf_step(analyzer='char_wb', ngram_range=(3, 8), min_df=3),
    ),
    'NLTK Stemming': _feature_pipeline(_tfidf_step(tokenizer=nltk_stems)),
    'Lemmatizing + POS Tags': _feature_pipeline(
        _tfidf_step(tokenizer=spacy_lemmatizer),
        _tag_step('posvec', spacy_pos),
    ),
    'Lemmatizing + NE Tags': _feature_pipeline(
        _tfidf_step(tokenizer=spacy_lemmatizer),
        _tag_step('nevec', spacy_ne),
    ),
    'Lemmatizing + POS Tags + NE Tags': _feature_pipeline(
        _tfidf_step(tokenizer=spacy_lemmatizer),
        _tag_step('nevec', spacy_ne),
        # Previously used spacy_ne here as well, so POS tags were never included.
        _tag_step('posvec', spacy_pos),
    ),
    'Lemmatizing + Text Length + Avg Token Length + Word Count': _feature_pipeline(
        _tfidf_step(tokenizer=spacy_lemmatizer),
        # The numeric features live on very different scales than TF-IDF
        # weights, so they are standardized before being concatenated.
        ('num_feat_sc', Pipeline([
            ('num_feat', FeatureUnion([
                ('len', TextLength()),
                ('avglen', AverageTokensLength()),
                ('wcnt', WordCount()),
            ])),
            ('sc', StandardScaler()),
        ])),
    ),
}

In [None]:
# One row per experiment; filled in by the loop below.
df_scores = pd.DataFrame(columns=['score', 'time'], index=features.keys())


def scorer(*x):
    """Micro-averaged F1; all positional arguments are forwarded to ``metrics.f1_score``."""
    return metrics.f1_score(*x, average='micro')


for features_name, features_pipeline in tqdm(features.items(), total=len(features), desc='Pipelines'):
    try:
        kfold_result = cv_kfold(features_pipeline, X, y, scorer=scorer, k=5)
        df_scores.loc[features_name, 'score'] = kfold_result['oof_score']
        df_scores.loc[features_name, 'time'] = kfold_result['mean_time']
    except Exception as exc:
        # Best effort: one broken pipeline must not abort the whole comparison,
        # but the failure is reported instead of being swallowed silently.
        print(f"Pipeline {features_name!r} failed: {exc!r}", file=sys.stderr)
        # NaN (not None) keeps the columns usable in arithmetic below.
        df_scores.loc[features_name, 'score'] = np.nan
        df_scores.loc[features_name, 'time'] = np.nan

# The frame was created without a dtype, so its columns are object-typed;
# convert them to numeric so the subtraction below is plain float arithmetic.
df_scores['score'] = pd.to_numeric(df_scores['score'], errors='coerce')
df_scores['time'] = pd.to_numeric(df_scores['time'], errors='coerce')

# Improvement (positive) or regression (negative) relative to the baseline pipeline.
df_scores['diff'] = df_scores['score'] - df_scores.loc['Baseline', 'score']

df_scores