In [1]:
import os
import pandas as pd
import numpy as np
import gensim.downloader
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize 

from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

from eval import evaluate

In [2]:
# Load the pretrained 'word2vec-google-news-300' embeddings through gensim's
# downloader; later cells index this object by word and test membership with `in`.
word_vector = gensim.downloader.load('word2vec-google-news-300')

In [3]:
# Directory containing the preprocessed train/test CSV files read by the cells below.
data_folder = "data/preprocessed"

## Task1 - Single

### load train df

In [4]:
# Single-word task, training split: the first CSV column is used as the row index.
train_path = os.path.join(data_folder, "lcp_single_train_preprocessed.csv")
df = pd.read_csv(train_path, index_col=0)
df.head()

Unnamed: 0_level_0,corpus,sentence,token,complexity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3ZLW647WALVGE8EBR50EGUBPU4P32A,bible,behold came river seven cattle sleek fat fed m...,river,0.0
34R0BODSP1ZBN3DVY8J8XSIY551E5C,bible,fellow bondservant brother prophet keep word book,brother,0.0
3S1WOPCJFGTJU2SGNAN2Y213N6WJE3,bible,man lord land said u know honest men leave one...,brother,0.05
3BFNCI9LYKQN09BHXHH9CLSX5KP738,bible,shimei sixteen son six daughter brother didnt ...,brother,0.15
3G5RUKN2EC3YIWSKUXZ8ZVH95R49N2,bible,put brother far,brother,0.263889


In [5]:
# Collapse duplicate tokens into one row each by averaging their complexity.
# Only the numeric `complexity` column is aggregated: the frame also carries the
# text columns `corpus` and `sentence`, and calling .mean() on those raises a
# TypeError on pandas >= 2.0 (older versions silently dropped them).
df = df.groupby('token')['complexity'].mean().reset_index()
print("unique tokens : {}".format(len(df)))

# Report how many unique tokens have no pretrained embedding.
count = [w in word_vector for w in df['token']]
print("{} tokens not in vocab".format(len(df) - sum(count)))

# Build the (n_tokens, dim) feature matrix; tokens that are not in the
# vocabulary fall back to the embedding stored under the key 'unk'.
vectors = np.array(
    [word_vector[w] if w in word_vector else word_vector['unk'] for w in df['token']]
)

unique tokens : 2856
130 tokens not in vocab


### load test df

In [6]:
# Single-word task, test split: the first CSV column is used as the row index.
test_df = pd.read_csv(
    os.path.join(data_folder, "lcp_single_test_preprocessed.csv"),
    index_col=0,
)

# Per-row flag: does the token have a pretrained embedding?
count = [w in word_vector for w in test_df['token']]
print("{} tokens not in vocab".format(len(test_df)-sum(count)))

# Feature matrix for the test rows; unknown tokens use the 'unk' embedding.
testdf_vectors = np.array(
    [word_vector[w] if w in word_vector else word_vector['unk'] for w in test_df['token']]
)

35 tokens not in vocab


### Baseline models

In [7]:
# Directory that receives the prediction files for the single-word task.
submission_folder = "predictions/single"

# Linear Regression
# Fit on the training embeddings, then predict complexity for the test embeddings.
reg = LinearRegression().fit(X=vectors, y=df['complexity'].to_numpy())
y_pred = reg.predict(testdf_vectors)

# One (ID, predicted complexity) row per test item, written without index or header.
pred = pd.DataFrame({
    "ID": test_df.index,
    "complexity": y_pred,
})
pred.to_csv(f"{submission_folder}/linear_regression_baseline.csv", index=False, header=False)

In [8]:
# Gradient Boosting
reg = GradientBoostingRegressor().fit(vectors, np.array(df['complexity']))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(submission_folder+"/gradient_boosting_baseline.csv", index=False, header=False)

In [9]:
# SVM regressor
reg = SVR().fit(vectors, np.array(df['complexity']))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(submission_folder+"/SVM_baseline.csv", index=False, header=False)

In [10]:
# MLP Regressor
regr = MLPRegressor(hidden_layer_sizes=(150)).fit(vectors, np.array(df['complexity']))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(submission_folder+"/MLP_baseline.csv", index=False, header=False)

### evaluate baseline models

In [11]:
# Score the prediction files in `submission_folder` against the labelled test set;
# the printed report lists pearson, spearman, mae, mse and r2 per prediction file.
evaluate(submission_folder, "references/lcp_single_test_labelled_preprocessed.csv")


For file gradient_boosting_baseline.csv
pearson  :  0.6475249120262303
spearman :  0.625628747127938
mae      :  0.0746743972587837
mse      :  0.009492645287416529
r2       :  0.4134939111449625

For file SVM_baseline.csv
pearson  :  0.6553534622949289
spearman :  0.609787415042385
mae      :  0.07787217407123892
mse      :  0.009971588727391003
r2       :  0.3839022393551641

For file linear_regression_baseline.csv
pearson  :  0.6639716433790339
spearman :  0.6391991602453316
mae      :  0.07316730116994907
mse      :  0.009196928531600866
r2       :  0.4317648643525217

For file MLP_baseline.csv
pearson  :  0.6553534622949289
spearman :  0.609787415042385
mae      :  0.07787217407123892
mse      :  0.009971588727391003
r2       :  0.3839022393551641


## Task2 - Multi 

### load train df

In [12]:
# Multi-word task, training split: the first CSV column is used as the row index.
train_path = os.path.join(data_folder, "lcp_multi_train_preprocessed.csv")
df = pd.read_csv(train_path, index_col=0)
df.head()

Unnamed: 0_level_0,corpus,sentence,token,complexity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3S37Y8CWI80N8KVM53U4E6JKCDC4WE,bible,seventh day sabbath yahweh god shall work son ...,seventh day,0.027778
3WGCNLZJKF877FYC1Q6COKNWTDWD11,bible,let man test own work take pride neighbor,own work,0.05
3UOMW19E6D6WQ5TH2HDD74IVKTP5CB,bible,understanding made heaven loving kindness endu...,loving kindness,0.05
36JW4WBR06KF9AXMUL4N476OMF8FHD,bible,remember god also spare according greatness lo...,loving kindness,0.05
3HRWUH63QU2FH9Q8R7MRNFC7JX2N5A,bible,loving kindness better life lip shall praise,loving kindness,0.075


In [13]:
# Represent each multi-word token by the average of its word embeddings.

# Collapse duplicate tokens into one row each by averaging their complexity.
# Only the numeric `complexity` column is aggregated: the frame also carries the
# text columns `corpus` and `sentence`, and calling .mean() on those raises a
# TypeError on pandas >= 2.0 (older versions silently dropped them).
df = df.groupby('token')['complexity'].mean().reset_index()
print("unique tokens : {}".format(len(df)))

# Count out-of-vocabulary words over every word of every token. Indexing
# split()[0] / split()[1] directly would raise IndexError on a one-word token
# and ignore any word past the second; for two-word tokens the count is unchanged.
oov_words = sum(
    word not in word_vector
    for token in df['token']
    for word in token.split()
)
print("{} tokens not in vocab".format(oov_words))

# One row per token; width follows the embedding model instead of a literal 300.
vectors = np.zeros((len(df), word_vector.vector_size))

for i, token in enumerate(df['token']):
    words = token.split()
    for word in words:
        # Words without an embedding contribute the 'unk' vector.
        vectors[i] += word_vector[word] if word in word_vector else word_vector['unk']
    vectors[i] /= len(words)

unique tokens : 1244
33 tokens not in vocab


### load test df

In [14]:
# Multi-word task, test split: the first CSV column is used as the row index.
test_df = pd.read_csv(os.path.join(data_folder, "lcp_multi_test_preprocessed.csv"), index_col=0)

# Count out-of-vocabulary words over every word of every token. Indexing
# split()[0] / split()[1] directly would raise IndexError on a one-word token
# and ignore any word past the second; for two-word tokens the count is unchanged.
oov_words = sum(
    word not in word_vector
    for token in test_df['token']
    for word in token.split()
)
print("{} tokens not in vocab".format(oov_words))

# One row per test item: the average of its word embeddings. The width follows
# the embedding model instead of a literal 300.
testdf_vectors = np.zeros((len(test_df), word_vector.vector_size))
for i, token in enumerate(test_df['token']):
    words = token.split()
    for word in words:
        # Words without an embedding contribute the 'unk' vector.
        testdf_vectors[i] += word_vector[word] if word in word_vector else word_vector['unk']
    testdf_vectors[i] /= len(words)

9 tokens not in vocab


### Baseline models

In [15]:
# Directory that receives the prediction files for the multi-word task.
submission_folder = "predictions/multi"

# Linear Regression
# Fit on the averaged training embeddings, then predict for the test embeddings.
reg = LinearRegression().fit(X=vectors, y=df['complexity'].to_numpy())
y_pred = reg.predict(testdf_vectors)

# One (ID, predicted complexity) row per test item, written without index or header.
pred = pd.DataFrame({
    "ID": test_df.index,
    "complexity": y_pred,
})
pred.to_csv(f"{submission_folder}/linear_regression_baseline.csv", index=False, header=False)

In [16]:
# Gradient Boosting
reg = GradientBoostingRegressor().fit(vectors, np.array(df['complexity']))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(submission_folder+"/gradient_boosting_baseline.csv", index=False, header=False)

In [17]:
# SVM regressor
reg = SVR().fit(vectors, np.array(df['complexity']))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(submission_folder+"/SVM_baseline.csv", index=False, header=False)

In [18]:
# MLP Regressor
regr = MLPRegressor(hidden_layer_sizes=(150)).fit(vectors, np.array(df['complexity']))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(submission_folder+"/MLP_baseline.csv", index=False, header=False)

### evaluate baseline models

In [19]:
# Score the prediction files in `submission_folder` against the labelled test set;
# the printed report lists pearson, spearman, mae, mse and r2 per prediction file.
evaluate(submission_folder, "references/lcp_multi_test_labelled_preprocessed.csv")


For file gradient_boosting_baseline.csv
pearson  :  0.7266561437943603
spearman :  0.6739876494256528
mae      :  0.08769937390012252
mse      :  0.012007532002936954
r2       :  0.50252863779856

For file SVM_baseline.csv
pearson  :  0.7308921844093375
spearman :  0.7011304191917298
mae      :  0.08428310274484914
mse      :  0.011455105899844999
r2       :  0.5254156195658017

For file linear_regression_baseline.csv
pearson  :  0.6605462208340676
spearman :  0.6464161980672312
mae      :  0.09322292165368064
mse      :  0.013838118656466326
r2       :  0.42668753773434287

For file MLP_baseline.csv
pearson  :  0.7308921844093375
spearman :  0.7011304191917298
mae      :  0.08428310274484914
mse      :  0.011455105899844999
r2       :  0.5254156195658017
