In [1]:
import pandas as pd
import spacy 

from tqdm import tqdm
tqdm.pandas()
import matplotlib.pyplot as plt
import numpy as np

import fasttext

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier


from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

from sklearn.model_selection import learning_curve

from statistics import mean

from nltk.tokenize import RegexpTokenizer

import os

import fasttext as fasttext_lib
import thesis_helper

# Project helper object; later cells call its tagging and evaluation methods.
functions = thesis_helper.Thesis_Helper()

# Location of the pre-trained fastText binary. Set the FASTTEXT_MODEL_PATH
# environment variable to point at another copy (for example
# "C:/Users/Ivo/Downloads/cc.en.300.bin/cc.en.300.bin" on the Windows machine)
# instead of editing this cell; the default is the original macOS path.
FASTTEXT_MODEL_PATH = os.environ.get(
    "FASTTEXT_MODEL_PATH", "/Users/ivowings/Downloads/cc.en.300.bin"
)

# The rest of the notebook indexes the loaded model through the name
# `fasttext`, so that binding is kept. Loading through the `fasttext_lib`
# alias keeps this cell re-runnable: once `fasttext` refers to the model,
# `fasttext.load_model(...)` would be looked up on the model object rather
# than on the module.
fasttext = fasttext_lib.load_model(FASTTEXT_MODEL_PATH)



In [2]:
# Path of the annotated token CSV that the next cell loads into `df`.
annotations = (
    '/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/'
    'Combined/Taxonomy/Normal/Annotated/'
    'combined_annotations_token.csv'
)

In [3]:
# Load the annotation table; the separator is left at pandas' default (',').
df = pd.read_csv(annotations)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,tokens,label
0,assist,0
1,their,0
2,development,0
3,enable,0
4,them,0


In [4]:
# Helpers that turn a text into one fastText vector built from its token vectors
def fasttext_retriever_sum(text):
    """Return the element-wise sum of the fastText vectors of the tokens in ``text``.

    The text is split into runs of word characters (``\\w+``) and every token
    is looked up in the module-level ``fasttext`` model.

    Args:
        text: String to embed.

    Returns:
        A 1-D numpy array. When ``text`` contains no word characters, a zero
        vector of the model's dimension is returned, so every call yields a
        vector of the same length (a bare ``sum([])`` would yield the int 0).
    """
    tokenizer = RegexpTokenizer(r'\w+')
    vectors = [fasttext[token] for token in tokenizer.tokenize(text)]
    if not vectors:
        # Nothing to add up: keep the return type and shape consistent.
        return np.zeros(fasttext.get_dimension(), dtype=np.float32)
    return sum(vectors)
    

def fasttext_retriever_average(text):
    """Return the element-wise mean of the fastText vectors of the tokens in ``text``.

    The text is split into runs of word characters (``\\w+``) and every token
    is looked up in the module-level ``fasttext`` model.

    Args:
        text: String to embed.

    Returns:
        A 1-D numpy array. When ``text`` contains no word characters there is
        nothing to average; a zero vector of the model's dimension is returned
        instead of dividing by zero.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    vectors = [fasttext[token] for token in tokenizer.tokenize(text)]
    if not vectors:
        # Avoid ZeroDivisionError on token-less input and keep the shape fixed.
        return np.zeros(fasttext.get_dimension(), dtype=np.float32)
    return sum(vectors) / len(vectors)

# Aggregation strategy applied to each row's text when the feature matrix is
# built: fasttext_retriever_average here; fasttext_retriever_sum is the
# alternative defined above.
mode = fasttext_retriever_average

In [5]:
#Retrieving the FastText vectors
# One vector per row of df['tokens'], computed with the strategy bound to `mode`.
token_vectors = df['tokens'].progress_apply(mode)

# Stack all row vectors into a 2-D array in a single step. Expanding each row
# with `.apply(pd.Series)` creates one Series object per row, which is far
# slower on a table of this size. Every row must yield a vector of the same
# length for the stacking to succeed.
x = pd.DataFrame(np.vstack(token_vectors.tolist()), index=token_vectors.index)
x.shape

100%|██████████| 208742/208742 [00:04<00:00, 43626.53it/s]
100%|██████████| 208742/208742 [00:35<00:00, 5930.86it/s] 


(208742, 300)

In [6]:
%%time

# Evaluate on the embedding-only feature matrix `x` against df['label'].
# model_performance is defined in thesis_helper, not in this notebook; the
# recorded output below shows six evaluation steps and one result row per
# classifier. This cell is expensive: the recorded wall time is about 21.5 h.
functions.model_performance(x, df['label'])
# NOTE(review): an earlier variant of this call used `test_x` / `test_y`,
# which are not defined anywhere in this notebook.

  0%|          | 0/6 [00:00<?, ?it/s]

Starting model evaluation


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  4.3min finished
 17%|█▋        | 1/6 [04:20<21:42, 260.54s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 45.6min finished
 33%|███▎      | 2/6 [49:57<1:54:29, 1717.31s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   16.8s finished
 50%|█████     | 3/6 [50:14<47:02, 940.86s/it]   [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 12.2min finished
 67%|██████▋   | 4/6 [1:02:28<28:38, 859.32s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 1134.3min finished
 83%|████████▎ | 5/6 [19:56:48<6:58:12, 25092.32s/it][Parallel(n_jobs=-

CPU times: user 3.44 s, sys: 3.84 s, total: 7.28 s
Wall time: 21h 27min 20s





Unnamed: 0,Classifier,Precision,Recall,F1
0,LR,0.512934,0.336663,0.332569
1,GBC,0.390735,0.338492,0.336196
2,SGD,0.318396,0.333333,0.325694
3,RF,0.611341,0.379101,0.405054
4,SVM,0.454158,0.33589,0.330983
5,MLP,0.557196,0.382552,0.409067


In [7]:
# Part-of-speech features: tag each row's text, then reduce the tags with
# functions.sequence_counter. Both helpers come from thesis_helper;
# presumably the counter returns a mapping of tag -> count -- verify there.
df['pos'] = (
    df['tokens']
    .progress_apply(functions.pos_tagger)
    .progress_apply(functions.sequence_counter)
)

# Expand the per-row results into one column per key; keys missing from a row
# become 0 and all columns are cast to integers.
pos_dicts = df['pos'].apply(pd.Series).fillna(0).astype(int)

# Dependency features: same pipeline with the dependency tagger.
df['dep'] = (
    df['tokens']
    .progress_apply(functions.dep_tagger)
    .progress_apply(functions.sequence_counter)
)

dep_dicts = df['dep'].apply(pd.Series).fillna(0).astype(int)

# Combine both count tables; the suffixes only apply to column names that
# occur in both frames.
x_pos = pos_dicts.join(dep_dicts, lsuffix='_gram', rsuffix='_pos')

# Append the count features to the embedding matrix.
# NOTE(review): this rebinds `x`, so running this cell again without first
# rebuilding the embeddings joins the count columns a second time.
x = x.join(x_pos, lsuffix='_embedding', rsuffix='_pos')

100%|██████████| 208742/208742 [14:15<00:00, 243.90it/s]
100%|██████████| 208742/208742 [14:58<00:00, 232.39it/s]
100%|██████████| 208742/208742 [15:00<00:00, 231.80it/s]
100%|██████████| 208742/208742 [15:03<00:00, 231.11it/s]


In [8]:
# Repeat the evaluation on `x` as left by the previous cell (embeddings joined
# with the POS/dependency counts). Missing values are replaced with 0 before
# the frame is handed to the helper.
functions.model_performance(x.fillna(0), df['label'])

  0%|          | 0/6 [00:00<?, ?it/s]

Starting model evaluation


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  7.2min finished
 17%|█▋        | 1/6 [07:12<36:01, 432.23s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 55.0min finished
 33%|███▎      | 2/6 [1:02:13<2:21:19, 2119.96s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   30.0s finished
 50%|█████     | 3/6 [1:02:43<58:17, 1165.69s/it]  [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 16.1min finished
 67%|██████▋   | 4/6 [1:18:49<36:13, 1086.95s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 1169.8min finished
 83%|████████▎ | 5/6 [20:48:37<7:13:25, 26005.29s/it][Parallel(n_j

Unnamed: 0,Classifier,Precision,Recall,F1
0,LR,0.509342,0.336433,0.332109
1,GBC,0.390735,0.338492,0.336196
2,SGD,0.318396,0.333333,0.325694
3,RF,0.612928,0.379103,0.405071
4,SVM,0.344329,0.333516,0.326079
5,MLP,0.543989,0.38108,0.406448
