In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

import thesis_helper
functions = thesis_helper.Thesis_Helper()

from gensim.models import Word2Vec
import gensim.downloader as api
word2vec = api.load("glove-wiki-gigaword-300") 


In [2]:
annotations = '/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/Combined/Taxonomy/Normal/Annotated/combined_annotations.csv'

In [3]:
df = pd.read_csv(annotations,sep=';')

#Filling any empty context columns with 'empty'
df['left_context'] = df['left_context'].astype(str).replace(r'^\s*$', 'empty', regex=True)
df['right_context'] = df['right_context'].astype(str).replace(r'^\s*$', 'empty', regex=True)

df['concatenated'] = df['left_context'] + ' | ' + df['candidate_skill'] + ' | ' + df['right_context']
print('Number of annotated rows ',df.shape[0])

Number of annotated rows  20836


In [4]:
from nltk import RegexpTokenizer
def word2vec_vocab_check(text):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens =  tokenizer.tokenize(text)
    try:
        word2vec.wv[tokens]
        return True
    except:
        return False

#Function to retrieve word2vec vectors from spacy
def word2vec_retriever_sum(text):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens =  tokenizer.tokenize(text)
    wordvectors = sum(word2vec.wv[tokens])
    return wordvectors
    
def word2vec_retriever_average(text):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens =  tokenizer.tokenize(text)
    wordvectors = word2vec.wv[tokens]
    average = sum(wordvectors)/len(wordvectors)
    return average

In [5]:
#Removing out of vocabulary word2vec words
df['vocab_check_left'] = df['left_context'].progress_apply(word2vec_vocab_check)
df['vocab_check_middle'] = df['candidate_skill'].progress_apply(word2vec_vocab_check)
df['vocab_check_right'] = df['right_context'].progress_apply(word2vec_vocab_check)
df = df[(df.vocab_check_left==True) & (df.vocab_check_middle==True) & (df.vocab_check_right==True)]
df = df.drop(columns=['vocab_check_left', 'vocab_check_middle', 'vocab_check_right'])

  word2vec.wv[tokens]
100%|██████████| 20836/20836 [00:00<00:00, 41319.62it/s]
100%|██████████| 20836/20836 [00:00<00:00, 52338.72it/s]
100%|██████████| 20836/20836 [00:00<00:00, 46241.41it/s]


In [6]:
mode = word2vec_retriever_sum
#Retrieving the word2vec vectors
x_left = pd.DataFrame(df['left_context'].progress_apply(mode))
x_left = x_left['left_context'].progress_apply(pd.Series)

#Retrieving the word2vec vectors
x_right = pd.DataFrame(df['right_context'].progress_apply(mode))
x_right = x_right['right_context'].progress_apply(pd.Series)

#Retrieving the word2vec vectors
x_middle = pd.DataFrame(df['candidate_skill'].progress_apply(mode))
x_middle = x_middle['candidate_skill'].progress_apply(pd.Series)

x = x_left
x['sep'] = 5
x = x.join(x_middle,lsuffix='_left', rsuffix='_middle')
x['sep2'] = 5
x = x.join(x_right,lsuffix='_middle', rsuffix='_right')

  wordvectors = sum(word2vec.wv[tokens])
100%|██████████| 19340/19340 [00:00<00:00, 31118.11it/s]
100%|██████████| 19340/19340 [00:03<00:00, 6001.02it/s] 
100%|██████████| 19340/19340 [00:00<00:00, 33570.62it/s]
100%|██████████| 19340/19340 [00:03<00:00, 5755.21it/s] 
100%|██████████| 19340/19340 [00:00<00:00, 39761.60it/s]
100%|██████████| 19340/19340 [00:03<00:00, 5963.40it/s] 


In [7]:
%%time

y=df['label']
functions.model_performance(x, df['label'])

  0%|          | 0/6 [00:00<?, ?it/s][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Starting model evaluation
We are at classifier  LogisticRegression(max_iter=10000000000000000000000, random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  6.6min finished
 17%|█▋        | 1/6 [06:36<33:01, 396.39s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 28.3min finished
 33%|███▎      | 2/6 [34:53<1:17:25, 1161.30s/it]

We are at classifier  SGDClassifier(random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   40.1s finished
 50%|█████     | 3/6 [35:33<32:28, 649.39s/it]   [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  RandomForestClassifier(random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.7min finished
 67%|██████▋   | 4/6 [39:16<16:02, 481.20s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  SVC(decision_function_shape='ovo', random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  6.9min finished
 83%|████████▎ | 5/6 [46:09<07:36, 456.60s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  MLPClassifier(alpha=1e-05, hidden_layer_sizes=(15,),
              max_iter=10000000000000000000000, random_state=456,
              solver='lbfgs')


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.8min finished
100%|██████████| 6/6 [48:59<00:00, 489.91s/it]

CPU times: user 619 ms, sys: 920 ms, total: 1.54 s
Wall time: 48min 59s





Unnamed: 0,Classifier,Precision,Recall,F1
0,LR,0.669678,0.657569,0.653461
1,GBC,0.609466,0.590652,0.587595
2,SGD,0.687821,0.633017,0.6376
3,RF,0.903621,0.647457,0.700919
4,SVM,0.812628,0.58796,0.613423
5,MLP,0.672795,0.681995,0.668668


In [8]:
df['pos'] = df['candidate_skill'].progress_apply(functions.pos_tagger)
df['pos'] = df['pos'].progress_apply(functions.sequence_counter)

pos_dicts = df[['pos']]
pos_dicts = pos_dicts['pos'].apply(pd.Series)
pos_dicts = pos_dicts.fillna(0).astype(int)

df['dep'] = df['candidate_skill'].progress_apply(functions.dep_tagger)
df['dep'] = df['dep'].progress_apply(functions.sequence_counter)

dep_dicts = df[['dep']]
dep_dicts = dep_dicts['dep'].apply(pd.Series)
dep_dicts = dep_dicts.fillna(0).astype(int)

x_pos = pos_dicts.join(dep_dicts,lsuffix='_gram', rsuffix='_pos')

x = x.join(x_pos, lsuffix='_embedding', rsuffix='_pos')

100%|██████████| 19340/19340 [01:27<00:00, 221.72it/s]
100%|██████████| 19340/19340 [01:27<00:00, 222.29it/s]
100%|██████████| 19340/19340 [01:25<00:00, 225.83it/s]
100%|██████████| 19340/19340 [01:19<00:00, 241.84it/s]


In [9]:
functions.model_performance(x.fillna(0), df['label'])

  0%|          | 0/6 [00:00<?, ?it/s]

Starting model evaluation
We are at classifier  LogisticRegression(max_iter=10000000000000000000000, random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  6.8min finished
 17%|█▋        | 1/6 [06:48<34:01, 408.20s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 28.4min finished
 33%|███▎      | 2/6 [35:10<1:17:58, 1169.63s/it]

We are at classifier  SGDClassifier(random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   38.0s finished
 50%|█████     | 3/6 [35:48<32:38, 652.98s/it]   [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  RandomForestClassifier(random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.5min finished
 67%|██████▋   | 4/6 [39:20<15:57, 478.72s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  SVC(decision_function_shape='ovo', random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  7.2min finished
 83%|████████▎ | 5/6 [46:30<07:40, 460.98s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  MLPClassifier(alpha=1e-05, hidden_layer_sizes=(15,),
              max_iter=10000000000000000000000, random_state=456,
              solver='lbfgs')


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.6min finished
100%|██████████| 6/6 [49:05<00:00, 490.92s/it]


Unnamed: 0,Classifier,Precision,Recall,F1
0,LR,0.683421,0.673337,0.667682
1,GBC,0.614695,0.594783,0.592165
2,SGD,0.705618,0.640695,0.651384
3,RF,0.904931,0.64828,0.701714
4,SVM,0.818069,0.593068,0.619631
5,MLP,0.71019,0.710516,0.701539
