# Semantic Features
I'm going to try adding semantic features: 
- vector representations of words
- vector representations of sentences

First thing to try is glove embeddings for words. We'll compare to the baseline

In [37]:
import pandas as pd
import numpy as np
import spacy
import pickle

from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

import hyper.eval as evl
from hyper.transformers import FlairTransformer, SpacyTransformer
from flair.embeddings import TransformerDocumentEmbeddings

Read in the data

In [8]:
data = pd.read_csv("../data/processed.csv", sep="\t", dtype={"content": "string", "label": bool})
X = data["content"]
y = data["label"]

## Baseline

In [23]:
base_pipe = make_pipeline(
    CountVectorizer(), 
    LogisticRegression(max_iter=300)
)

In [40]:
res = evl.evaluate_algorithm(X, y, base_pipe)
res

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.5s remaining:    3.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.6s finished


{'fit_time': array([2.05309916, 2.48872709, 2.27170467, 2.42785668, 2.40229273]),
 'score_time': array([0.06390119, 0.04425859, 0.08256388, 0.04086161, 0.04082966]),
 'test_accuracy': array([0.72093023, 0.70542636, 0.70542636, 0.80620155, 0.72093023]),
 'test_precision': array([0.63043478, 0.63888889, 0.63888889, 0.78947368, 0.66666667]),
 'test_recall': array([0.60416667, 0.47916667, 0.47916667, 0.63829787, 0.46808511]),
 'test_f1': array([0.61702128, 0.54761905, 0.54761905, 0.70588235, 0.55      ])}

In [16]:
res["test_accuracy"].mean()

0.7317829457364341

These scores match what is in the baseline notebook. This means we have reproducibility, which is good.

## Glove embeddings
with zeugma

In [48]:
from zeugma.embeddings import EmbeddingTransformer
glove = EmbeddingTransformer("glove")

In [31]:
glove_pipeline = make_pipeline(
    glove,
    LogisticRegression(max_iter=300)
)

In [33]:
res = evl.evaluate_algorithm(X, y, glove_pipeline)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.1min remaining:  1.6min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.2min finished


In [34]:
res

{'fit_time': array([1.91023254, 1.98548007, 1.78395462, 1.71965981, 1.65102267]),
 'score_time': array([0.44476247, 0.4780736 , 0.50054312, 0.32994771, 0.29263854]),
 'test_accuracy': array([0.62015504, 0.68992248, 0.65116279, 0.7751938 , 0.74418605]),
 'test_precision': array([0.48275862, 0.65384615, 0.56521739, 0.82142857, 0.69444444]),
 'test_recall': array([0.29166667, 0.35416667, 0.27083333, 0.4893617 , 0.53191489]),
 'test_f1': array([0.36363636, 0.45945946, 0.36619718, 0.61333333, 0.60240964])}

In [9]:
res["test_accuracy"].mean()

0.696124031007752

So this is not as good as the baseline

## Building a Transformer using spacy embeddings

In [10]:
import spacy
from hyper.transformers import SpacyTransformer

In [4]:
spacy_model = spacy.load("en_core_web_md")

In [25]:
spacy_transformer = SpacyTransformer(spacy_model, "en_core_web_md")

In [26]:
space = make_pipeline(
    spacy_transformer,
    LogisticRegression(max_iter=300),
)

In [9]:
results = evl.evaluate_algorithm(X, y, space)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  2.1min remaining:  3.1min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.4min finished


In [11]:
results

{'fit_time': array([61.96490002, 64.77607632, 67.31588554, 75.45301342, 74.95549393]),
 'score_time': array([15.12033987, 14.11581182, 13.83320427,  8.49561477,  7.1852808 ]),
 'test_accuracy': array([0.74418605, 0.69767442, 0.72093023, 0.82170543, 0.7751938 ]),
 'test_precision': array([0.66666667, 0.64516129, 0.65      , 0.81578947, 0.78125   ]),
 'test_recall': array([0.625     , 0.41666667, 0.54166667, 0.65957447, 0.53191489]),
 'test_f1': array([0.64516129, 0.50632911, 0.59090909, 0.72941176, 0.63291139])}

In [12]:
results["test_accuracy"].mean()

0.751937984496124

A bit better than the baseline and much better than the glove embeddings.

## Feature union

In [19]:
from sklearn.pipeline import Pipeline, FeatureUnion

In [27]:
pipeline = Pipeline([
    ("union", FeatureUnion(
        transformer_list=[
            ("base_vectorizer", CountVectorizer()),
            ("spacy_vectorizer", spacy_transformer),
        ],
    )),
    ("log_reg", LogisticRegression(max_iter=300)),
])

In [37]:
results = evl.evaluate_algorithm(X, y, pipeline)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  2.2min remaining:  3.4min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.4min finished


In [42]:
res

{'fit_time': array([2.05309916, 2.48872709, 2.27170467, 2.42785668, 2.40229273]),
 'score_time': array([0.06390119, 0.04425859, 0.08256388, 0.04086161, 0.04082966]),
 'test_accuracy': array([0.72093023, 0.70542636, 0.70542636, 0.80620155, 0.72093023]),
 'test_precision': array([0.63043478, 0.63888889, 0.63888889, 0.78947368, 0.66666667]),
 'test_recall': array([0.60416667, 0.47916667, 0.47916667, 0.63829787, 0.46808511]),
 'test_f1': array([0.61702128, 0.54761905, 0.54761905, 0.70588235, 0.55      ])}

In [58]:
res["test_accuracy"].mean()

0.7317829457364341

This isn't working as intended. This is exactly the same score as with the base classifier. I think it's because the outputs of the two transformers aren't the same shape. 

In [49]:
union_transformer = FeatureUnion(
        transformer_list=[
            ("glove_vectorizer", glove),
            ("spacy_vectorizer", spacy_transformer),
        ],
    )

In [52]:
pipeline = make_pipeline(
    union_transformer,
    LogisticRegression()
)

In [53]:
results = evl.evaluate_algorithm(X, y, pipeline, verbosity=0)

In [57]:
results

{'fit_time': array([75.09200478, 77.23983812, 75.42570567, 78.75791216, 72.82453895]),
 'score_time': array([21.33738518, 18.13628912, 17.34909534,  9.07141471,  8.17491913]),
 'test_accuracy': array([0.72868217, 0.70542636, 0.68992248, 0.82945736, 0.78294574]),
 'test_precision': array([0.65116279, 0.67857143, 0.6       , 0.85714286, 0.77142857]),
 'test_recall': array([0.58333333, 0.39583333, 0.5       , 0.63829787, 0.57446809]),
 'test_f1': array([0.61538462, 0.5       , 0.54545455, 0.73170732, 0.65853659])}

In [56]:
results["test_accuracy"].mean()

0.7472868217054264

Which is slightly worse than when using only the spacy vectorizer. So the glove vector isn't really doing anything for me.

## Flair transformer

In [20]:
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings
from flair.data import Sentence
from hyper.transformers import FlairTransformer

In [27]:
glove_embedding = WordEmbeddings("glove")
document_embeddings = DocumentPoolEmbeddings([glove_embedding])

In [15]:
f_transformer = trans.FlairTransformer(document_embeddings)

In [25]:
pipe = make_pipeline(
    f_transformer, 
    LogisticRegression(max_iter=300)
)

In [26]:
results = evl.evaluate_algorithm(X, y, pipe)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   37.1s remaining:   55.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   41.9s finished


In [28]:
results["test_accuracy"].mean()

0.7348837209302326

So compared to the other glove embeddings, these are much better. But they're not quite as good as the spacy embeddings. They're much faster than the spacy embeddings though. 

### Flair transformer with FlairEmbeddings

In [58]:
from flair.embeddings import FlairEmbeddings

In [59]:
flair_embeddings = FlairEmbeddings("news-forward")
pooled_flair_embeddings = DocumentPoolEmbeddings([flair_embeddings])

In [64]:
f_transformer = trans.FlairTransformer(pooled_flair_embeddings)

In [65]:
pipe = make_pipeline(
    f_transformer,
    LogisticRegression(max_iter=300)
)

This has to train an NN I think so is super slow (I accidentally left it running for hours, didn't finish).

### Flair transformer with TransformerDocumentEmbeddings

In [11]:
from flair.embeddings import TransformerDocumentEmbeddings

In [12]:
doc_embeddings = TransformerDocumentEmbeddings("distilbert-base-uncased")

In [13]:
transformer = FlairTransformer(doc_embeddings)

In [14]:
pipe = make_pipeline(
    transformer,
    LogisticRegression(max_iter=300)
)

In [15]:
res = evl.evaluate_algorithm(X, y, pipe, n_jobs=1)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END  accuracy: (test=0.775) f1: (test=0.688) precision: (test=0.711) recall: (test=0.667) total time= 2.3min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.4min remaining:    0.0s


[CV] END  accuracy: (test=0.806) f1: (test=0.706) precision: (test=0.811) recall: (test=0.625) total time= 2.5min


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  4.9min remaining:    0.0s


[CV] END  accuracy: (test=0.783) f1: (test=0.702) precision: (test=0.717) recall: (test=0.688) total time= 2.3min
[CV] END  accuracy: (test=0.837) f1: (test=0.759) precision: (test=0.825) recall: (test=0.702) total time= 2.3min
[CV] END  accuracy: (test=0.822) f1: (test=0.763) precision: (test=0.740) recall: (test=0.787) total time= 2.3min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 11.8min finished


In [17]:
res

{'fit_time': array([112.4225204 , 116.11454725, 109.02963853, 110.20191193,
        112.06278419]),
 'score_time': array([28.45146108, 32.09502554, 28.80259061, 26.86412263, 24.91251326]),
 'test_accuracy': array([0.7751938 , 0.80620155, 0.78294574, 0.8372093 , 0.82170543]),
 'test_precision': array([0.71111111, 0.81081081, 0.7173913 , 0.825     , 0.74      ]),
 'test_recall': array([0.66666667, 0.625     , 0.6875    , 0.70212766, 0.78723404]),
 'test_f1': array([0.68817204, 0.70588235, 0.70212766, 0.75862069, 0.7628866 ])}

In [18]:
res["test_accuracy"].mean()

0.8046511627906977

This is the most accurate, the issue is, I can't run it in parallel without it blowing up my computer.

## A union of the two best so far
Going to do a feature union of the dilbert base uncased sentence level embeddings and the spacy transformer

In [9]:
spacy_model = spacy.load("en_core_web_md")
spacy_transformer = SpacyTransformer(spacy_model, "en_core_web_md")

In [10]:
doc_embeddings = TransformerDocumentEmbeddings("distilbert-base-uncased")
flair_transformer = FlairTransformer(doc_embeddings)

In [11]:
union_transformer = FeatureUnion(
        transformer_list=[
            ("spacy_transformer", spacy_transformer),
            ("flair_transformer", flair_transformer),
        ],
    )

In [12]:
log_reg_pipeline = make_pipeline(
    union_transformer,
    LogisticRegression(max_iter=300)
)

In [None]:
log_results = evl.evaluate_algorithm(X, y, log_reg_pipeline, n_jobs=1)
print(log_results)
print(log_results["test_accuracy"].mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END  accuracy: (test=0.814) f1: (test=0.745) precision: (test=0.761) recall: (test=0.729) total time= 3.7min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.8min remaining:    0.0s


[CV] END  accuracy: (test=0.767) f1: (test=0.659) precision: (test=0.725) recall: (test=0.604) total time= 3.5min


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  7.5min remaining:    0.0s


[CV] END  accuracy: (test=0.760) f1: (test=0.674) precision: (test=0.681) recall: (test=0.667) total time= 6.2min
[CV] END  accuracy: (test=0.853) f1: (test=0.776) precision: (test=0.868) recall: (test=0.702) total time= 5.5min


In [7]:
np.array([0.814, 0.767, 0.760, 0.853]).mean()

0.7985

This keeps crashing but looks like accuracy is going to be something like 0.8

That isn't any better than the transfer document embeddings by themselves.

### Same but different workflow

In [18]:
X_transformed = union_transformer.transform(X)

In [44]:
model = LogisticRegression(max_iter=300)

In [45]:
resutls = evl.evaluate_algorithm(X_transformed, y, model)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    3.6s remaining:    5.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.8s finished


In [46]:
resutls

{'fit_time': array([1.02787304, 0.98183155, 0.95653915, 1.08852792, 0.88862753]),
 'score_time': array([0.00394225, 0.00742865, 0.00881505, 0.00313354, 0.01115441]),
 'test_accuracy': array([0.82170543, 0.76744186, 0.75968992, 0.85271318, 0.8372093 ]),
 'test_precision': array([0.77777778, 0.725     , 0.68085106, 0.86842105, 0.76      ]),
 'test_recall': array([0.72916667, 0.60416667, 0.66666667, 0.70212766, 0.80851064]),
 'test_f1': array([0.75268817, 0.65909091, 0.67368421, 0.77647059, 0.78350515])}

In [47]:
resutls["test_accuracy"].mean()

0.8077519379844962