In [1]:
import os
import pandas as pd
import numpy as np
import gensim.downloader
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize 

from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

from eval import evaluate

In [2]:
word_vector = gensim.downloader.load('word2vec-google-news-300')

In [3]:
data_folder = "data/preprocessed"

## Task1 - Single

### load train df

In [4]:
df = pd.read_csv(os.path.join(data_folder, "lcp_single_train_preprocessed.csv"), index_col=0)
df.head()

Unnamed: 0_level_0,corpus,sentence,token,complexity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3ZLW647WALVGE8EBR50EGUBPU4P32A,bible,bible behold came river seven cattle sleek fat...,river,0.0
34R0BODSP1ZBN3DVY8J8XSIY551E5C,bible,bible fellow bondservant brother prophet keep ...,brother,0.0
3S1WOPCJFGTJU2SGNAN2Y213N6WJE3,bible,bible man lord land said u know honest men lea...,brother,0.05
3BFNCI9LYKQN09BHXHH9CLSX5KP738,bible,bible shimei sixteen son six daughter brother ...,brother,0.15
3G5RUKN2EC3YIWSKUXZ8ZVH95R49N2,bible,bible put brother far,brother,0.263889


In [5]:
# take mean of duplicate tokens
# token not in vocab are considered as 'unk'

df = df.groupby('token').mean().reset_index()
print("unique tokens : {}".format(len(df)))

count = [True if w in word_vector else False for w in df['token']]
print("{} tokens not in vocab".format(len(df)-sum(count)))

vectors = np.array(list(df['token'].apply(lambda x:word_vector[x] if x in word_vector else word_vector['unk'])))

unique tokens : 2856
130 tokens not in vocab


### load test df

In [6]:
test_df = pd.read_csv(os.path.join(data_folder, "lcp_single_test_preprocessed.csv"), index_col=0)
count = [True if w in word_vector else False for w in test_df['token']]
print("{} tokens not in vocab".format(len(test_df)-sum(count)))

testdf_vectors = np.array(list(test_df['token'].apply(lambda x:word_vector[x] if x in word_vector else word_vector['unk'])))

35 tokens not in vocab


### Baseline models

In [7]:
submission_folder = "predictions/single"

# Linear Regression
reg = LinearRegression().fit(vectors, np.array(df['complexity']))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(submission_folder+"/linear_regression_baseline.csv", index=False, header=False)

In [8]:
# Gradient Boosting
reg = GradientBoostingRegressor().fit(vectors, np.array(df['complexity']))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(submission_folder+"/gradient_boosting_baseline.csv", index=False, header=False)

In [9]:
# SVM regressor
reg = SVR().fit(vectors, np.array(df['complexity']))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(submission_folder+"/SVM_baseline.csv", index=False, header=False)

In [10]:
# MLP Regressor
regr = MLPRegressor(hidden_layer_sizes=(150)).fit(vectors, np.array(df['complexity']))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(submission_folder+"/MLP_baseline.csv", index=False, header=False)

### evaluate baseline models

In [11]:
evaluate(submission_folder, "references/lcp_single_test_labelled_preprocessed.csv")


For file gradient_boosting_baseline.csv
pearson  :  0.6469066684805035
spearman :  0.62493976278848
mae      :  0.0747125087856786
mse      :  0.009504483612266126
r2       :  0.4127624765030997

For file SVM_baseline.csv
pearson  :  0.6553534622949289
spearman :  0.609787415042385
mae      :  0.07787217407123892
mse      :  0.009971588727391003
r2       :  0.3839022393551641

For file linear_regression_baseline.csv
pearson  :  0.6639716433790339
spearman :  0.6391991602453316
mae      :  0.07316730116994907
mse      :  0.009196928531600866
r2       :  0.4317648643525217

For file MLP_baseline.csv
pearson  :  0.6553534622949289
spearman :  0.609787415042385
mae      :  0.07787217407123892
mse      :  0.009971588727391003
r2       :  0.3839022393551641
