In [1]:
# Data handling and progress bars.
import pandas as pd
from tqdm import tqdm
# Registers `progress_apply` on pandas objects; the cells below rely on it.
tqdm.pandas()

# Plotting, numerics, and scikit-learn estimators / splitting / metrics.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

# Project-local helper module; `functions` is used below for model evaluation
# (`model_performance`) and for tagging (`pos_tagger`, `dep_tagger`, `sequence_counter`).
import thesis_helper
functions = thesis_helper.Thesis_Helper()

# Pretrained embeddings fetched through the gensim downloader.
# NOTE(review): despite the variable name `word2vec`, the requested model id
# ("glove-wiki-gigaword-300") is a GloVe model, not a word2vec one.
from gensim.models import Word2Vec
import gensim.downloader as api
word2vec = api.load("glove-wiki-gigaword-300") 


In [2]:
# Location of the token-level annotation CSV.
# The original absolute path only exists on one machine, so it can be overridden
# by setting the THESIS_ANNOTATIONS_CSV environment variable; when the variable is
# unset the original path is used unchanged.
import os
annotations = os.environ.get(
    'THESIS_ANNOTATIONS_CSV',
    '/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/Combined/Taxonomy/Normal/Annotated/combined_annotations_token.csv',
)

In [3]:
# Read the comma-separated annotation file into a DataFrame and preview the first rows.
df = pd.read_csv(
    annotations,
    sep=',',
)
df.head()

Unnamed: 0,tokens,label
0,assist,0
1,their,0
2,development,0
3,enable,0
4,them,0


In [4]:
from nltk import RegexpTokenizer
def word2vec_vocab_check(text):
    """Return True when every word token of ``text`` has a pretrained vector.

    ``text`` is split into ``\\w+`` tokens and each token is looked up in the
    module-level ``word2vec`` vectors object (the object returned by
    ``gensim.downloader.load``).

    Parameters
    ----------
    text : str
        Raw text to check.

    Returns
    -------
    bool
        ``True`` if ``text`` yields at least one token and all tokens are in the
        vocabulary; ``False`` otherwise (including when there are no tokens).

    Notes
    -----
    The previous version indexed ``word2vec.wv[tokens]`` inside a bare
    ``except``. ``.wv`` on a vectors object is deprecated (and removed in
    gensim 4), so the resulting ``AttributeError`` would have been swallowed and
    every row reported as out-of-vocabulary. Membership tests avoid both the
    deprecated accessor and the catch-all handler.
    """
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    # No tokens means nothing to embed; the original also returned False here
    # because stacking zero vectors raised inside its try block.
    if not tokens:
        return False
    return all(token in word2vec for token in tokens)

#Function to retrieve the summed pretrained word vectors (loaded through gensim) of a text
def word2vec_retriever_sum(text):
    """Return the element-wise sum of the vectors of all word tokens in ``text``.

    Parameters
    ----------
    text : str
        Raw text; it is split into ``\\w+`` tokens.

    Returns
    -------
    numpy.ndarray
        One vector: the sum over the token vectors looked up in the
        module-level ``word2vec`` object.

    Raises
    ------
    ValueError
        If ``text`` contains no word tokens (same exception type the original
        raised when stacking zero vectors).
    KeyError
        If a token is not in the vocabulary; filter with
        ``word2vec_vocab_check`` first.
    """
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    if not tokens:
        raise ValueError("text contains no word tokens to embed")
    # Index the vectors object directly: the `.wv` accessor on it is deprecated
    # (removed in gensim 4). Indexing with a list stacks one row per token.
    return word2vec[tokens].sum(axis=0)
    
def word2vec_retriever_average(text):
    """Return the element-wise mean of the vectors of all word tokens in ``text``.

    Parameters
    ----------
    text : str
        Raw text; it is split into ``\\w+`` tokens.

    Returns
    -------
    numpy.ndarray
        One vector: the mean over the token vectors looked up in the
        module-level ``word2vec`` object.

    Raises
    ------
    ValueError
        If ``text`` contains no word tokens (same exception type the original
        raised when stacking zero vectors).
    KeyError
        If a token is not in the vocabulary; filter with
        ``word2vec_vocab_check`` first.
    """
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    if not tokens:
        raise ValueError("text contains no word tokens to embed")
    # Index the vectors object directly: the `.wv` accessor on it is deprecated
    # (removed in gensim 4). Indexing with a list stacks one row per token.
    return word2vec[tokens].mean(axis=0)

In [5]:
#Dropping every row whose tokens are not all covered by the pretrained vectors.
#The boolean helper column only lives inside this chained expression.
df = (
    df.assign(vocab_check=df['tokens'].progress_apply(word2vec_vocab_check))
    .loc[lambda frame: frame.vocab_check == True]
    .drop(columns=['vocab_check'])
)
df.shape

  word2vec.wv[tokens]
100%|██████████| 208742/208742 [00:03<00:00, 59724.32it/s]


(206918, 2)

In [6]:
# Pooling function used to turn a text into one vector:
# `word2vec_retriever_sum` or `word2vec_retriever_average`.
mode = word2vec_retriever_sum

#Retrieving the word2vec vectors
# One pooled vector per row of `df`, kept as a Series so the row index is preserved.
vectors = df['tokens'].progress_apply(mode)
assert len(vectors) > 0, "no rows left to embed after the vocabulary filter"

# Stack all vectors into a 2-D array in one call. The previous
# `.progress_apply(pd.Series)` built a separate Series for every row, which was
# by far the slowest step of this cell. Columns are numbered 0..dim-1 and the
# index matches `df`, as before.
x = pd.DataFrame(np.vstack(vectors.tolist()), index=vectors.index)
x.shape

  wordvectors = sum(word2vec.wv[tokens])
100%|██████████| 206918/206918 [00:05<00:00, 37270.47it/s]
100%|██████████| 206918/206918 [00:44<00:00, 4660.50it/s] 


(206918, 300)

In [7]:
%%time
# Evaluate the classifiers on the embedding-only feature matrix `x` against `df['label']`.
# NOTE(review): `model_performance` lives in `thesis_helper` (not shown here); judging by
# the logged output it fits six classifiers with parallel jobs and returns a
# Precision/Recall/F1 table per classifier — confirm against the helper.
functions.model_performance(x, df['label'])

  0%|          | 0/6 [00:00<?, ?it/s]

Starting model evaluation
We are at classifier  LogisticRegression(max_iter=10000000000000000000000, random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 11.8min finished
 17%|█▋        | 1/6 [11:49<59:07, 709.51s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 56.1min finished
 33%|███▎      | 2/6 [1:07:58<2:31:36, 2274.06s/it]

We are at classifier  SGDClassifier(random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   27.5s finished
 50%|█████     | 3/6 [1:08:26<1:02:24, 1248.29s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  RandomForestClassifier(random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 17.4min finished
 67%|██████▋   | 4/6 [1:25:51<38:56, 1168.27s/it]  [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  SVC(decision_function_shape='ovo', random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 575.2min finished
 83%|████████▎ | 5/6 [11:01:06<3:39:53, 13193.04s/it]

We are at classifier  MLPClassifier(alpha=1e-05, hidden_layer_sizes=(15,),
              max_iter=10000000000000000000000, random_state=456,
              solver='lbfgs')


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 13.0min finished
100%|██████████| 6/6 [11:14:05<00:00, 6740.92s/it]   


CPU times: user 3.18 s, sys: 6.67 s, total: 9.85 s
Wall time: 11h 14min 5s


Unnamed: 0,Classifier,Precision,Recall,F1
0,LR,0.510403,0.341143,0.341378
1,GBC,0.425754,0.377324,0.390439
2,SGD,0.318608,0.333333,0.325804
3,RF,0.599791,0.37609,0.399973
4,SVM,0.590277,0.362385,0.378063
5,MLP,0.570847,0.375611,0.399106


In [8]:
def _tag_count_frame(column, tagger):
    """Tag every row of ``df['tokens']`` and expand the per-row tag counts into columns.

    The raw tagger output is first stored in ``df[column]`` and then replaced by
    the result of ``functions.sequence_counter``; each counted entry is expanded
    into its own column, with missing entries filled with 0 and cast to int.
    Presumably ``sequence_counter`` returns a tag -> count mapping per row —
    TODO confirm against ``thesis_helper``.
    """
    df[column] = df['tokens'].progress_apply(tagger)
    df[column] = df[column].progress_apply(functions.sequence_counter)
    expanded = df[[column]][column].apply(pd.Series)
    return expanded.fillna(0).astype(int)


# Part-of-speech counts first, then dependency counts (same order as before).
pos_dicts = _tag_count_frame('pos', functions.pos_tagger)
dep_dicts = _tag_count_frame('dep', functions.dep_tagger)

# Suffixes are only applied to column names that occur in both frames.
# NOTE(review): the labels '_gram' / '_pos' do not describe the POS / dependency
# frames they are attached to — kept as-is to preserve column names.
x_pos = pos_dicts.join(dep_dicts,lsuffix='_gram', rsuffix='_pos')

# Append the tag-count features to the embedding features (joined on the row index).
x = x.join(x_pos, lsuffix='_embedding', rsuffix='_pos')

100%|██████████| 206918/206918 [14:47<00:00, 233.21it/s] 
100%|██████████| 206918/206918 [14:49<00:00, 232.68it/s]
100%|██████████| 206918/206918 [14:44<00:00, 233.98it/s]
100%|██████████| 206918/206918 [14:48<00:00, 232.76it/s]


In [9]:
# Re-run the same evaluation on the combined embedding + POS/dependency features.
# Missing values in the joined feature matrix are replaced with 0 before evaluation.
functions.model_performance(x.fillna(0), df['label'])

  0%|          | 0/6 [00:00<?, ?it/s]

Starting model evaluation
We are at classifier  LogisticRegression(max_iter=10000000000000000000000, random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 13.3min finished
 17%|█▋        | 1/6 [13:17<1:06:26, 797.38s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 43.9min finished
 33%|███▎      | 2/6 [57:14<2:05:17, 1879.33s/it]

We are at classifier  SGDClassifier(random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   28.3s finished
 50%|█████     | 3/6 [57:42<51:42, 1034.22s/it]  [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  RandomForestClassifier(random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 18.8min finished
 67%|██████▋   | 4/6 [1:16:31<35:43, 1071.53s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  SVC(decision_function_shape='ovo', random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 833.0min finished
 83%|████████▎ | 5/6 [15:09:31<5:11:48, 18708.25s/it]

We are at classifier  MLPClassifier(alpha=1e-05, hidden_layer_sizes=(15,),
              max_iter=10000000000000000000000, random_state=456,
              solver='lbfgs')


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 12.6min finished
100%|██████████| 6/6 [15:22:09<00:00, 9221.51s/it]   


Unnamed: 0,Classifier,Precision,Recall,F1
0,LR,0.519362,0.341197,0.341487
1,GBC,0.425754,0.377324,0.390439
2,SGD,0.318608,0.33333,0.325803
3,RF,0.604236,0.376094,0.399995
4,SVM,0.590984,0.36174,0.37712
5,MLP,0.560824,0.374237,0.396754
