In [1]:
import thesis_helper
functions = thesis_helper.Thesis_Helper()
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [2]:


import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from flair.data import Sentence

from flair.embeddings import ELMoEmbeddings

# init embedding
embedding = ELMoEmbeddings('small')

In [3]:
annotations = '/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/Combined/Taxonomy/Normal/Annotated/combined_annotations.csv'

In [4]:
df = pd.read_csv(annotations,sep=';')

#Filling any empty context columns with 'empty'
df['left_context'] = df['left_context'].fillna('empty')
df['right_context'] = df['right_context'].fillna('empty')

df['concatenated'] = df['left_context'] + ' | ' + df['candidate_skill'] + ' | ' + df['right_context']
print('Number of annotated rows ',df.shape[0])

Number of annotated rows  20836


In [6]:
#1 token has 1536 columns
# there are more than 3 tokens
def ELMo_embedder(text):

    string = Sentence(text)
    embedding.embed(string)

    # Creating a list which stores the indexes of the | symbols
    bar_indexes = []
    #Creating a list which stores the embedding_tensors
    embedding_tensors = []

    #Checking the sentence object for the | symbols and storing their indexes
    for x in range(1,len(string)+1):
        if '|' in str(string.get_token(x)):
            bar_indexes.append(x)

    #Collecting the embeddings for every index between the indexes in bar_indexes
    word_embedding_indexes = range(bar_indexes[0]+1,bar_indexes[1])
    for x in word_embedding_indexes:
        embedding_tensors.append(pd.Series(string[x].embedding))
        embedding_tensors.append(222)

    #Removing last 666 from list
    embedding_tensors.pop()

    #Turning the elements from embedding_tensors into dataframe rows
    row = pd.DataFrame()
    for x in range(0,len(embedding_tensors)):
        row = row.append(pd.DataFrame(pd.Series(embedding_tensors[x])))

    row = row.transpose().reset_index(drop=True)
    
    #Changing the column names in order to make pd.concat work later
    row.columns = [x for x in range(0,len(row.columns))]
    return row

In [7]:
df['embeddings'] = df['concatenated'].progress_apply(ELMo_embedder)

100%|██████████| 20836/20836 [17:49<00:00, 19.49it/s]


In [8]:
x_elmo = pd.concat(df['embeddings'].tolist()).reset_index(drop=True)
x_elmo = x_elmo.fillna(0)

In [14]:
%%time

functions.model_performance(x_elmo, df['label'])

  0%|          | 0/6 [00:00<?, ?it/s]

Starting model evaluation
We are at classifier  LogisticRegression(max_iter=10000000000000000000000, random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 134.3min finished
 17%|█▋        | 1/6 [2:14:15<11:11:16, 8055.27s/it]

We are at classifier  GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 56.8min finished
 33%|███▎      | 2/6 [3:11:02<5:54:45, 5321.28s/it] 

We are at classifier  SGDClassifier(random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.9min finished
 50%|█████     | 3/6 [3:13:54<2:28:30, 2970.11s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  RandomForestClassifier(random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  4.1min finished
 67%|██████▋   | 4/6 [3:18:01<1:03:10, 1895.03s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  SVC(decision_function_shape='ovo', random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 41.6min finished
 83%|████████▎ | 5/6 [3:59:37<35:11, 2111.42s/it]  

We are at classifier  MLPClassifier(alpha=1e-05, hidden_layer_sizes=(15,),
              max_iter=10000000000000000000000, random_state=456,
              solver='lbfgs')


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 197.2min finished
100%|██████████| 6/6 [7:16:47<00:00, 4367.88s/it]


CPU times: user 2.08 s, sys: 3.85 s, total: 5.93 s
Wall time: 7h 16min 47s


Unnamed: 0,Classifier,Precision,Recall,F1
0,LR,0.776105,0.745958,0.748975
1,GBC,0.582314,0.532398,0.540956
2,SGD,0.736549,0.700224,0.69623
3,RF,0.892777,0.589646,0.640032
4,SVM,0.534732,0.424707,0.436034
5,MLP,0.721221,0.733649,0.719403


In [9]:
df['pos'] = df['candidate_skill'].progress_apply(functions.pos_tagger)
df['pos'] = df['pos'].progress_apply(functions.sequence_counter)

pos_dicts = df[['pos']]
pos_dicts = pos_dicts['pos'].apply(pd.Series)
pos_dicts = pos_dicts.fillna(0).astype(int)

df['dep'] = df['candidate_skill'].progress_apply(functions.dep_tagger)
df['dep'] = df['dep'].progress_apply(functions.sequence_counter)

dep_dicts = df[['dep']]
dep_dicts = dep_dicts['dep'].apply(pd.Series)
dep_dicts = dep_dicts.fillna(0).astype(int)

x_pos = pos_dicts.join(dep_dicts,lsuffix='_gram', rsuffix='_pos')

x = x_elmo.join(x_pos, lsuffix='_embedding', rsuffix='_pos')
print(x.shape[1])

100%|██████████| 20836/20836 [02:06<00:00, 164.10it/s]
100%|██████████| 20836/20836 [02:07<00:00, 163.88it/s]
100%|██████████| 20836/20836 [02:09<00:00, 160.35it/s]
100%|██████████| 20836/20836 [02:04<00:00, 167.21it/s]


3132


In [10]:
%%time

functions.model_performance(x.fillna(0), df['label'])

  0%|          | 0/6 [00:00<?, ?it/s]

Starting model evaluation
We are at classifier  LogisticRegression(max_iter=10000000000000000000000, random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 139.3min finished
 17%|█▋        | 1/6 [2:19:17<11:36:25, 8357.17s/it]

We are at classifier  GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 47.8min finished
 33%|███▎      | 2/6 [3:07:05<5:41:53, 5128.41s/it] 

We are at classifier  SGDClassifier(random_state=456)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.0min finished
 50%|█████     | 3/6 [3:09:04<2:22:03, 2841.12s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  RandomForestClassifier(random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.7min finished
 67%|██████▋   | 4/6 [3:12:45<1:00:12, 1806.50s/it][Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


We are at classifier  SVC(decision_function_shape='ovo', random_state=456)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 32.9min finished
 83%|████████▎ | 5/6 [3:45:39<31:07, 1867.04s/it]  

We are at classifier  MLPClassifier(alpha=1e-05, hidden_layer_sizes=(15,),
              max_iter=10000000000000000000000, random_state=456,
              solver='lbfgs')


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 527.3min finished
100%|██████████| 6/6 [12:32:55<00:00, 7529.29s/it] 


CPU times: user 2.33 s, sys: 3.29 s, total: 5.62 s
Wall time: 12h 32min 56s


Unnamed: 0,Classifier,Precision,Recall,F1
0,LR,0.777494,0.747703,0.750066
1,GBC,0.58005,0.527629,0.536423
2,SGD,0.765536,0.691828,0.702302
3,RF,0.888194,0.597745,0.650726
4,SVM,0.534698,0.425535,0.437014
5,MLP,0.725682,0.736529,0.720458
