In [1]:
import pandas as pd
import spacy 

from tqdm import tqdm
tqdm.pandas()
import matplotlib.pyplot as plt
import numpy as np

import fasttext

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier


from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

from sklearn.model_selection import learning_curve

from statistics import mean

from nltk.tokenize import RegexpTokenizer

import thesis_helper
import os

# Project helper object; its methods are used further down for POS/dependency
# tagging and for evaluating the classifiers.
functions = thesis_helper.Thesis_Helper()

# Pre-trained fastText binary. The location is machine-specific, so it can be
# overridden without editing the notebook by setting FASTTEXT_MODEL_PATH, e.g.
# to "C:/Users/Ivo/Downloads/cc.en.300.bin/cc.en.300.bin" on the Windows machine.
fasttext_model_path = os.environ.get(
    "FASTTEXT_MODEL_PATH",
    "/Users/ivowings/Downloads/cc.en.300.bin",
)

# NOTE: this rebinds the name `fasttext` from the imported module to the loaded
# model. Later cells index the model as `fasttext[token]`, so the name is kept;
# re-run this whole cell (including the import) if the model must be reloaded.
fasttext = fasttext.load_model(fasttext_model_path)





In [2]:
# Path of the CSV file holding the annotated candidate skills; it is read with
# pd.read_csv(..., sep=';') in the next cell.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
annotations = '/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/Combined/Taxonomy/Normal/Annotated/combined_annotations.csv'

In [3]:
# Load the annotated candidates (semicolon-separated file).
df2 = pd.read_csv(annotations, sep=';')

# Replace missing or blank context cells with the placeholder token 'empty'.
# read_csv turns empty fields into NaN, and astype(str) would convert NaN into
# the literal string 'nan' (which the blank-regex never matches), so the NaNs
# are mapped to '' first and then caught by the same regex as whitespace-only
# cells.
for context_column in ('left_context', 'right_context'):
    df2[context_column] = (
        df2[context_column]
        .fillna('')
        .astype(str)
        .replace(r'^\s*$', 'empty', regex=True)
    )

# Single string "<left> | <candidate> | <right>" per row.
df2['concatenated'] = df2['left_context'] + ' | ' + df2['candidate_skill'] + ' | ' + df2['right_context']
print('Number of annotated rows ',df2.shape[0])

Number of annotated rows  20836


In [4]:
# Work on a copy so that the columns added to df in later cells do not alter
# df2, the frame as loaded and prepared above.
df = df2.copy()
# Debug switch: uncomment to run the rest of the notebook on the first 1000 rows only.
#df = df.head(1000)

In [5]:
#Functions that turn a text into a single fastText vector
def fasttext_retriever_sum(text):
    """Return the element-wise sum of the fastText vectors of all tokens in `text`.

    The text is split into tokens with the regular expression ``\\w+`` and each
    token is looked up in the loaded fastText model (the global ``fasttext``).

    Parameters
    ----------
    text : str
        Text to embed.

    Returns
    -------
    numpy.ndarray
        Sum of the token vectors. When the text contains no ``\\w+`` token
        (empty or punctuation-only), a zero vector of the model's dimension is
        returned instead of the integer 0 that ``sum([])`` would produce, so
        every row has the same shape.
    """
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    if not tokens:
        return np.zeros(fasttext.get_dimension(), dtype=np.float32)
    return sum(fasttext[token] for token in tokens)
    

def fasttext_retriever_average(text):
    """Return the element-wise mean of the fastText vectors of all tokens in `text`.

    The text is split into tokens with the regular expression ``\\w+`` and each
    token is looked up in the loaded fastText model (the global ``fasttext``).

    Parameters
    ----------
    text : str
        Text to embed.

    Returns
    -------
    numpy.ndarray
        Mean of the token vectors. When the text contains no ``\\w+`` token
        (empty or punctuation-only), a zero vector of the model's dimension is
        returned; previously this case raised ZeroDivisionError.
    """
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    if not tokens:
        return np.zeros(fasttext.get_dimension(), dtype=np.float32)
    vectors = [fasttext[token] for token in tokens]
    return sum(vectors) / len(vectors)

# Pooling function applied to each text column when the feature matrix is built
# in the next cell; assign fasttext_retriever_sum here to use summed instead of
# averaged token vectors.
mode = fasttext_retriever_average

In [6]:
def _embed_column(frame, column, suffix):
    """Embed one text column with `mode` and expand it to one column per dimension.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the text column.
    column : str
        Name of the text column to embed.
    suffix : str
        Label appended to every dimension index, giving column names such as
        '0_left', '1_left', ...

    Returns
    -------
    pandas.DataFrame
        One row per row of `frame` (same index), one column per vector dimension.
    """
    vectors = frame[column].progress_apply(mode)
    matrix = np.vstack(vectors.tolist())
    names = [f'{dim}_{suffix}' for dim in range(matrix.shape[1])]
    return pd.DataFrame(matrix, index=frame.index, columns=names)


#Retrieving the fastText vectors of the left context, the candidate and the right context
x_left = _embed_column(df, 'left_context', 'left')
x_middle = _embed_column(df, 'candidate_skill', 'middle')
x_right = _embed_column(df, 'right_context', 'right')

# Feature matrix: left | sep | middle | sep2 | right, in that column order.
# 'sep' and 'sep2' are constant marker columns (value 5) between the segments.
# Every column gets an explicit, unique string name. Chained joins with
# suffixes only rename overlapping columns, which left the right-context
# columns with bare integer labels and the frame with mixed int/str names.
x = pd.concat(
    [
        x_left,
        pd.Series(5, index=df.index, name='sep'),
        x_middle,
        pd.Series(5, index=df.index, name='sep2'),
        x_right,
    ],
    axis=1,
)

100%|██████████| 20836/20836 [00:02<00:00, 8857.98it/s] 
100%|██████████| 20836/20836 [00:07<00:00, 2693.61it/s]
100%|██████████| 20836/20836 [00:02<00:00, 8695.09it/s] 
100%|██████████| 20836/20836 [00:06<00:00, 3467.54it/s]
100%|██████████| 20836/20836 [00:01<00:00, 18755.63it/s]
100%|██████████| 20836/20836 [00:06<00:00, 3430.78it/s]


In [8]:
%%time


# Evaluate the classifiers on the embedding-only feature matrix x against the
# 'label' column. The recorded output below shows six classifiers being run and
# a Precision/Recall/F1 table; the evaluation protocol itself lives in
# thesis_helper (not visible here).
functions.model_performance(x, df['label'])

  0%|          | 0/6 [00:00<?, ?it/s]

Starting model evaluation
We are at classifier  LogisticRegression(max_iter=10000000000000000000000, random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  7.5min finished
 17%|█▋        | 1/6 [07:29<37:27, 449.57s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 33.1min finished
 33%|███▎      | 2/6 [40:36<1:30:15, 1353.96s/it]

We are at classifier  SGDClassifier(random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   22.0s finished
 50%|█████     | 3/6 [40:58<37:17, 745.83s/it]   [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  RandomForestClassifier(random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.3min finished
 67%|██████▋   | 4/6 [44:19<17:41, 530.59s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  SVC(decision_function_shape='ovo', random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  9.3min finished
 83%|████████▎ | 5/6 [53:38<09:00, 540.93s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  MLPClassifier(alpha=1e-05, hidden_layer_sizes=(15,),
              max_iter=10000000000000000000000, random_state=456,
              solver='lbfgs')


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 74.0min finished
100%|██████████| 6/6 [2:07:37<00:00, 1276.25s/it]

CPU times: user 937 ms, sys: 989 ms, total: 1.93 s
Wall time: 2h 7min 37s





Unnamed: 0,Classifier,Precision,Recall,F1
0,LR,0.771407,0.669013,0.703131
1,GBC,0.640726,0.511614,0.523877
2,SGD,0.768749,0.661034,0.695577
3,RF,0.891047,0.716867,0.763324
4,SVM,0.788481,0.525298,0.581297
5,MLP,0.706772,0.708012,0.698384


In [9]:
# Tag every candidate skill (POS tags, then dependency labels), count the tag
# sequences per candidate and expand the counts into one integer column per key.
tag_counts = {}
for feature, tagger in (('pos', functions.pos_tagger), ('dep', functions.dep_tagger)):
    # df[feature] ends up holding the output of sequence_counter for each row.
    df[feature] = (
        df['candidate_skill']
        .progress_apply(tagger)
        .progress_apply(functions.sequence_counter)
    )
    # Keys absent for a row become 0 so every column is a plain integer count.
    tag_counts[feature] = df[feature].apply(pd.Series).fillna(0).astype(int)

pos_dicts = tag_counts['pos']
dep_dicts = tag_counts['dep']

# The suffixes only take effect for column names present on both sides of a join.
x_pos = pos_dicts.join(dep_dicts, lsuffix='_gram', rsuffix='_pos')

# Append the count features to the embedding features built earlier.
x = x.join(x_pos, lsuffix='_embedding', rsuffix='_pos')

100%|██████████| 20836/20836 [01:30<00:00, 230.46it/s]
100%|██████████| 20836/20836 [01:30<00:00, 231.00it/s]
100%|██████████| 20836/20836 [01:28<00:00, 235.89it/s]
100%|██████████| 20836/20836 [01:30<00:00, 231.28it/s]


In [10]:
# Repeat the evaluation now that x also contains the POS/dependency count
# columns joined in the previous cell, so the resulting table can be compared
# with the embedding-only run.
functions.model_performance(x, df['label'])

  0%|          | 0/6 [00:00<?, ?it/s]

Starting model evaluation
We are at classifier  LogisticRegression(max_iter=10000000000000000000000, random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.5min finished
 17%|█▋        | 1/6 [03:32<17:43, 212.73s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 30.4min finished
 33%|███▎      | 2/6 [33:54<1:17:16, 1159.23s/it]

We are at classifier  SGDClassifier(random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   22.3s finished
 50%|█████     | 3/6 [34:16<32:00, 640.16s/it]   [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  RandomForestClassifier(random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.1min finished
 67%|██████▋   | 4/6 [37:25<15:23, 461.91s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  SVC(decision_function_shape='ovo', random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  9.5min finished
 83%|████████▎ | 5/6 [46:57<08:21, 501.63s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  MLPClassifier(alpha=1e-05, hidden_layer_sizes=(15,),
              max_iter=10000000000000000000000, random_state=456,
              solver='lbfgs')


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 56.4min finished
100%|██████████| 6/6 [1:43:19<00:00, 1033.21s/it]


Unnamed: 0,Classifier,Precision,Recall,F1
0,LR,0.788633,0.694697,0.722844
1,GBC,0.708793,0.529121,0.545065
2,SGD,0.763639,0.678936,0.695425
3,RF,0.890255,0.717174,0.76232
4,SVM,0.854887,0.612439,0.664539
5,MLP,0.721357,0.746519,0.724686
