In [1]:
import pandas as pd
import spacy 

from tqdm import tqdm
tqdm.pandas()
import matplotlib.pyplot as plt
import numpy as np

import fasttext

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier


from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

from sklearn.model_selection import learning_curve

from statistics import mean

from nltk.tokenize import RegexpTokenizer

# Load the pretrained fastText binary from an absolute, machine-specific path.
# NOTE: this rebinds the name `fasttext` from the imported module to the loaded
# model object; later cells index it as `fasttext[token]`. After this line the
# module itself is no longer reachable under that name.
# (File name suggests English Common Crawl vectors with 300 dimensions - confirm.)
fasttext = fasttext.load_model("/Users/ivowings/Downloads/cc.en.300.bin")





In [2]:
# Absolute, machine-specific path to the annotations CSV that is read with
# sep=';' in the next cell.
annotations = '/Users/ivowings/Sync/Thesis/Datasources/Preprocessed/Combined/Taxonomy/Normal/Annotated/combined_annotations.csv'
# Alternative path, presumably the same file on a Windows machine:
#annotations = 'D:/Sync/Thesis/Datasources/Preprocessed/Combined/Taxonomy/Normal/Annotated/combined_annotations.csv'

In [3]:
# Load the semicolon-separated annotation file.
df2 = pd.read_csv(annotations, sep=';')

# Fill empty context cells with the placeholder token 'empty'.
# read_csv parses empty fields as NaN, and NaN.astype(str) is the string 'nan',
# which the whitespace-only regex below never matches. Converting NaN to ''
# first makes truly empty cells receive the placeholder as intended.
for context_column in ('left_context', 'right_context'):
    df2[context_column] = (
        df2[context_column]
        .fillna('')
        .astype(str)
        .replace(r'^\s*$', 'empty', regex=True)
    )

# Single display string: left context | candidate | right context.
df2['concatenated'] = df2['left_context'] + ' | ' + df2['candidate_skill'] + ' | ' + df2['right_context']
print('Number of annotated rows ',df2.shape[0])

Number of annotated rows  20836


In [4]:
# Work on an independent copy so the loaded frame `df2` is left as read.
df = df2.copy()
# Uncomment to restrict the run to the first 1000 rows (quick experiments).
#df = df.head(1000)

In [5]:
#Functions to pool fastText word vectors over the tokens of a text
def fasttext_retriever_sum(text):
    """Return the element-wise sum of the fastText vectors of the tokens in ``text``.

    The text is split into runs of word characters, so punctuation and
    separators are dropped before the vector lookup.

    Parameters
    ----------
    text : str
        Text to embed.

    Returns
    -------
    numpy.ndarray
        Sum of the per-token vectors. When ``text`` contains no word
        characters, a zero vector of the model's dimension is returned so
        that every row yields a vector of the same length.
    """
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    if not tokens:
        # Without this guard sum([]) would be the plain int 0, which cannot be
        # expanded into one column per embedding dimension downstream.
        # float32 is assumed to match the dtype of fastText's vectors.
        return np.zeros(fasttext.get_dimension(), dtype=np.float32)
    return sum(fasttext[token] for token in tokens)
    

def fasttext_retriever_average(text):
    """Return the element-wise mean of the fastText vectors of the tokens in ``text``.

    The text is split into runs of word characters, so punctuation and
    separators are dropped before the vector lookup.

    Parameters
    ----------
    text : str
        Text to embed.

    Returns
    -------
    numpy.ndarray
        Mean of the per-token vectors. When ``text`` contains no word
        characters, a zero vector of the model's dimension is returned
        instead of dividing by a token count of zero.
    """
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    if not tokens:
        # Guard against ZeroDivisionError on texts such as '...' or '|'.
        # float32 is assumed to match the dtype of fastText's vectors.
        return np.zeros(fasttext.get_dimension(), dtype=np.float32)
    token_vectors = [fasttext[token] for token in tokens]
    return sum(token_vectors) / len(token_vectors)

# Pooling function applied to every text column in the next cell; assign
# fasttext_retriever_sum here to sum token vectors instead of averaging them.
mode = fasttext_retriever_average

In [6]:
# Embed each text column with the selected pooling function (`mode`) and expand
# every per-row vector into one DataFrame column per vector component.
# Evaluation order is left, right, candidate - the order of the progress bars.
x_left, x_right, x_middle = (
    df[text_column].progress_apply(mode).to_frame()[text_column].progress_apply(pd.Series)
    for text_column in ('left_context', 'right_context', 'candidate_skill')
)

# Assemble the feature matrix as: left | sep | candidate | sep2 | right.
# `x` starts out as the very same object as `x_left`, so the constant 'sep'
# column is written into `x_left` too.
x = x_left
x['sep'] = 5
x = x.join(x_middle, lsuffix='_left', rsuffix='_middle')
x['sep2'] = 5
x = x.join(x_right, lsuffix='_middle', rsuffix='_right')

100%|██████████| 20836/20836 [00:02<00:00, 9804.00it/s] 
100%|██████████| 20836/20836 [00:10<00:00, 1978.73it/s]
100%|██████████| 20836/20836 [00:03<00:00, 5869.54it/s]
100%|██████████| 20836/20836 [00:05<00:00, 3775.67it/s] 
100%|██████████| 20836/20836 [00:00<00:00, 24734.12it/s]
100%|██████████| 20836/20836 [00:05<00:00, 4062.04it/s] 


In [7]:
def _numbered(prefix, count=300):
    """Return the labels '<prefix>_0' ... '<prefix>_<count-1>' as a list."""
    return [f'{prefix}_{position}' for position in range(count)]


# Labels for the feature matrix: 300 numbered columns per text segment with a
# single separator column between consecutive segments (902 labels in total).
left_context_columns = _numbered('left_context')
separator_1 = ['separator_1']
candidate_skill_columns = _numbered('candidate_skill')
separator_2 = ['separator_2']
right_context_columns = _numbered('right_context')
column_names = [
    *left_context_columns,
    *separator_1,
    *candidate_skill_columns,
    *separator_2,
    *right_context_columns,
]

In [8]:
# Replace the labels produced by the joins with the descriptive names built
# above; x must have exactly len(column_names) columns for this assignment.
x.columns = column_names

In [9]:
# Random forest with library-default hyperparameters; the seed is fixed so
# repeated fits are comparable, and n_jobs=-1 requests all available cores.
RF = RandomForestClassifier(random_state=456, n_jobs=-1)

In [10]:
# Hold out 20% of the rows for testing with a fixed seed. No `stratify`
# argument is passed, so class proportions may differ between the two splits.
X_train,X_test,y_train,y_test=train_test_split(x, df['label'], test_size=0.2,random_state=456)

In [11]:
# Fit the forest on the training split (the fitted estimator's repr is the
# cell output).
RF.fit(X_train,y_train)

RandomForestClassifier(n_jobs=-1, random_state=456)

In [12]:
# Put the labels back next to the held-out features so rows can be drawn per class.
test = X_test.join(y_test)

# Draw 30 held-out rows for each label value with a fixed seed.
# (Variable names suggest 0 = not a skill, 1 = soft skill, 2 = hard skill.)
not_skill, soft_skill, hard_skill = (
    test[test.label == label_value].sample(30, random_state=456)
    for label_value in (0, 1, 2)
)

# Replace the test split with the 90-row class-balanced sample.
balanced_sample = pd.concat([not_skill, soft_skill, hard_skill])
X_test = balanced_sample.drop(columns=['label'])
y_test = balanced_sample[['label']]

In [None]:
import shap

# Compute SHAP values for exactly the rows that are plotted. Values computed
# on X_train cannot be paired with X_test in summary_plot because the two have
# different numbers of rows; X_test is also much smaller here, which keeps the
# tree explainer's runtime manageable.
explainer = shap.TreeExplainer(RF)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test)