## Initial Setup - Imports and Downloads

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip -d embeddings

--2021-04-12 06:24:16--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-04-12 06:24:16--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-04-12 06:24:17--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [3]:
import os
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.svm import SVR
from sklearn import preprocessing
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor

In [4]:
FOLDER_PATH = "/content/drive/MyDrive/CS60075-Team28-Task-1"
DATA_FOLDER = os.path.join(FOLDER_PATH,"data/preprocessed")

In [5]:
# import evaluate function
import sys
sys.path.append(FOLDER_PATH)
from eval import evaluate

## Getting Data 

In [6]:
data = pd.read_csv(os.path.join(DATA_FOLDER, "lcp_single_train_preprocessed.csv"), index_col=0)
data['token'] = data['token'].astype(str)
data['sentence'] = data['sentence'].astype(str)

data.head()

Unnamed: 0_level_0,corpus,sentence,token,complexity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3ZLW647WALVGE8EBR50EGUBPU4P32A,bible,behold came river seven cattle sleek fat fed m...,river,0.0
34R0BODSP1ZBN3DVY8J8XSIY551E5C,bible,fellow bondservant brother prophet keep word book,brother,0.0
3S1WOPCJFGTJU2SGNAN2Y213N6WJE3,bible,man lord land said u know honest men leave one...,brother,0.05
3BFNCI9LYKQN09BHXHH9CLSX5KP738,bible,shimei sixteen son six daughter brother didnt ...,brother,0.15
3G5RUKN2EC3YIWSKUXZ8ZVH95R49N2,bible,put brother far,brother,0.263889


In [7]:
data_multi = pd.read_csv(os.path.join(DATA_FOLDER, "lcp_multi_train_preprocessed.csv"), index_col=0)
data_multi['token'] = data_multi['token'].astype(str)
data_multi['sentence'] = data_multi['sentence'].astype(str)

data_multi.head()

Unnamed: 0_level_0,corpus,sentence,token,complexity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3S37Y8CWI80N8KVM53U4E6JKCDC4WE,bible,seventh day sabbath yahweh god shall work son ...,seventh day,0.027778
3WGCNLZJKF877FYC1Q6COKNWTDWD11,bible,let man test own work take pride neighbor,own work,0.05
3UOMW19E6D6WQ5TH2HDD74IVKTP5CB,bible,understanding made heaven loving kindness endu...,loving kindness,0.05
36JW4WBR06KF9AXMUL4N476OMF8FHD,bible,remember god also spare according greatness lo...,loving kindness,0.05
3HRWUH63QU2FH9Q8R7MRNFC7JX2N5A,bible,loving kindness better life lip shall praise,loving kindness,0.075


## Functions to Read GloVe Embeddings and Extract them According to sentence

In [8]:
def read_glove_vector(glove_vec):
  with open(glove_vec, 'r', encoding='UTF-8') as f:
    words = set()
    word_to_vec_map = {}
    for line in f:
      w_line = line.split()
      curr_word = w_line[0]
      word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)

  return word_to_vec_map

word_to_vec_map = read_glove_vector('embeddings/glove.6B.300d.txt')
print(len(word_to_vec_map)," words loaded!")

400000  words loaded!


In [9]:
def get_embeddings(sentences, tokens):
    token_emb = []
    for s,t in zip(sentences, tokens):
        
        # fill unk by nan
        # calculate mean over non nan embeddings
        # fill unk by the mean embedding of sentence
        # pad 0 vectors till max_len
        
        temp_emb = [ word_to_vec_map[x] if x in word_to_vec_map else np.full((300,), np.nan) for x in t.split() ]
        
        # calculate mean for filling null values <unk>
        temp_sent_emb = [ word_to_vec_map[x] if x in word_to_vec_map else np.full((300,), np.nan) for x in s.split() ]
        mean_emb = np.nanmean(np.array(temp_sent_emb), axis=0)
        
        # single or multi - will be converted to (1,300) 
        temp_emb = np.mean(np.array([ mean_emb if np.isnan(x[0]) else x for x in temp_emb ]), axis=0)

        token_emb.append(temp_emb)

    return np.array(token_emb)

## Testing Single Word Complexity Predictions - with Single Token Dataset 

In [10]:
SUBMISSION_FOLDER = os.path.join(FOLDER_PATH,"predictions/baselines/single")

if( not os.path.exists(SUBMISSION_FOLDER)):
    os.makedirs(SUBMISSION_FOLDER)

In [11]:
sentences_train_list = list(data['sentence'])
complexity_train_list = list(data['complexity'])
tokens_train_list = list(data['token'])

vectors = get_embeddings(sentences_train_list, tokens_train_list)
print(vectors.shape)

(7662, 300)


In [12]:
test_df = pd.read_csv(os.path.join(DATA_FOLDER, "lcp_single_test_preprocessed.csv"), index_col=0)

In [13]:
test_df['token'] = test_df['token'].astype(str)
test_df['sentence'] = test_df['sentence'].astype(str)
sentences_test_list = list(test_df['sentence'])
test_tokens_list = list(test_df['token'])

testdf_vectors = get_embeddings(sentences_test_list, test_tokens_list)
testdf_vectors.shape

(917, 300)

In [14]:
# Linear Regression
reg = LinearRegression().fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/linear_regression_baseline.csv", index=False, header=False)

# Gradient Boosting
reg = GradientBoostingRegressor().fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/gradient_boosting_baseline.csv", index=False, header=False)

# AdaBoost
reg = AdaBoostRegressor().fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/ada_boost_baseline.csv", index=False, header=False)


# SVM regressor
reg = SVR().fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/SVM_baseline.csv", index=False, header=False)

# MLP Regressor
reg = MLPRegressor(hidden_layer_sizes=(150)).fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/MLP_baseline.csv", index=False, header=False)

In [15]:
evaluate(SUBMISSION_FOLDER, FOLDER_PATH+"/references/lcp_single_test_labelled_preprocessed.csv")


For file linear_regression_baseline.csv
pearson  :  0.6817130060790865
spearman :  0.6622243983222191
mae      :  0.0725450964271907
mse      :  0.008759716679890811
r2       :  0.45877813677379853

For file gradient_boosting_baseline.csv
pearson  :  0.7203520903570985
spearman :  0.6948938413555582
mae      :  0.06731888763261062
mse      :  0.007795421637461341
r2       :  0.5183574106972986

For file ada_boost_baseline.csv
pearson  :  0.6777051422207718
spearman :  0.6451366278370756
mae      :  0.0718233174638877
mse      :  0.00891662105013616
r2       :  0.4490837506748203

For file SVM_baseline.csv
pearson  :  0.7097033380749598
spearman :  0.6839183037224942
mae      :  0.07039859014661132
mse      :  0.008047758067602407
r2       :  0.5027667246201666

For file MLP_baseline.csv
pearson  :  0.5156106888626774
spearman :  0.46423932585188654
mae      :  0.10786861350295623
mse      :  0.01897024918777512
r2       :  -0.17208284085746972


In [23]:
'''
For file gradient_boosting_baseline.csv
pearson  :  0.7203520903570985
spearman :  0.6948938413555582
mae      :  0.06731888763261062
mse      :  0.007795421637461341
r2       :  0.5183574106972986
'''

'\nFor file gradient_boosting_baseline.csv\npearson  :  0.7203520903570985\nspearman :  0.6948938413555582\nmae      :  0.06731888763261062\nmse      :  0.007795421637461341\nr2       :  0.5183574106972986\n'

## Testing Single Word Complexity Predictions - with Single + Multi Token Dataset

In [16]:
SUBMISSION_FOLDER = os.path.join(FOLDER_PATH,"predictions/baselines/single")

if( not os.path.exists(SUBMISSION_FOLDER)):
    os.makedirs(SUBMISSION_FOLDER)

In [17]:
data = pd.concat([data, data_multi])

In [18]:
sentences_train_list = list(data['sentence'])
complexity_train_list = list(data['complexity'])
tokens_train_list = list(data['token'])

vectors = get_embeddings(sentences_train_list, tokens_train_list)
print(vectors.shape)

(9179, 300)


In [19]:
reg = LinearRegression().fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/linear_regression_baseline.csv", index=False, header=False)

# Gradient Boosting
reg = GradientBoostingRegressor().fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/gradient_boosting_baseline.csv", index=False, header=False)

# AdaBoost
reg = AdaBoostRegressor().fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/ada_boost_baseline.csv", index=False, header=False)


# SVM regressor
reg = SVR().fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/SVM_baseline.csv", index=False, header=False)

# MLP Regressor
reg = MLPRegressor(hidden_layer_sizes=(150)).fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/MLP_baseline.csv", index=False, header=False)

In [20]:
evaluate(SUBMISSION_FOLDER, FOLDER_PATH+"/references/lcp_single_test_labelled_preprocessed.csv")


For file linear_regression_baseline.csv
pearson  :  0.6715029466644609
spearman :  0.6548482413646795
mae      :  0.0759823204469714
mse      :  0.009499099203375246
r2       :  0.4130951539079504

For file gradient_boosting_baseline.csv
pearson  :  0.6911351486877488
spearman :  0.6707113273501418
mae      :  0.07188171267210111
mse      :  0.008743365608190055
r2       :  0.459788392905934

For file ada_boost_baseline.csv
pearson  :  0.651843952414161
spearman :  0.6341639740279196
mae      :  0.08485453903761837
mse      :  0.011637487364086556
r2       :  0.2809741656460959

For file SVM_baseline.csv
pearson  :  0.6678429730298119
spearman :  0.6568058055125834
mae      :  0.07496900141336407
mse      :  0.009234749134759981
r2       :  0.42942810643464235

For file MLP_baseline.csv
pearson  :  0.5327112377402287
spearman :  0.4959083037345019
mae      :  0.10635874201963097
mse      :  0.01960525183945077
r2       :  -0.21131668035851603


In [25]:
'''
For file gradient_boosting_baseline.csv
pearson  :  0.6911351486877488
spearman :  0.6707113273501418
mae      :  0.07188171267210111
mse      :  0.008743365608190055
r2       :  0.459788392905934
'''

'\nFor file gradient_boosting_baseline.csv\npearson  :  0.6911351486877488\nspearman :  0.6707113273501418\nmae      :  0.07188171267210111\nmse      :  0.008743365608190055\nr2       :  0.459788392905934\n'

## Testing Multi Word Complexity Prediction - Multi Token Dataset

In [26]:
SUBMISSION_FOLDER = os.path.join(FOLDER_PATH,"predictions/baselines/multi")

if( not os.path.exists(SUBMISSION_FOLDER)):
    os.makedirs(SUBMISSION_FOLDER)

In [27]:
sentences_train_list = list(data_multi['sentence'])
complexity_train_list = list(data_multi['complexity'])
tokens_train_list = list(data_multi['token'])

In [28]:
vectors = get_embeddings(sentences_train_list, tokens_train_list)
print(vectors.shape)

(1517, 300)


In [29]:
test_df = pd.read_csv(os.path.join(DATA_FOLDER, "lcp_multi_test_preprocessed.csv"), index_col=0)

In [30]:
test_df['token'] = test_df['token'].astype(str)
test_df['sentence'] = test_df['sentence'].astype(str)
sentences_test_list = list(test_df['sentence'])
test_tokens_list = list(test_df['token'])
testdf_vectors = get_embeddings(sentences_test_list, test_tokens_list)
testdf_vectors.shape

(184, 300)

In [31]:
# Linear Regression
reg = LinearRegression().fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/linear_regression_baseline.csv", index=False, header=False)

# Gradient Boosting
reg = GradientBoostingRegressor().fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/gradient_boosting_baseline.csv", index=False, header=False)

# AdaBoost
reg = AdaBoostRegressor().fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/ada_boost_baseline.csv", index=False, header=False)


# SVM regressor
reg = SVR().fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/SVM_baseline.csv", index=False, header=False)

# MLP Regressor
reg = MLPRegressor(hidden_layer_sizes=(150)).fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/MLP_baseline.csv", index=False, header=False)

In [32]:
evaluate(SUBMISSION_FOLDER, FOLDER_PATH+"/references/lcp_multi_test_labelled_preprocessed.csv")


For file linear_regression_baseline.csv
pearson  :  0.7384735689252496
spearman :  0.7096751511559939
mae      :  0.08420414291697727
mse      :  0.011177467719053704
r2       :  0.5369181534723191

For file gradient_boosting_baseline.csv
pearson  :  0.7823878559554673
spearman :  0.7546944249875555
mae      :  0.07924415256123464
mse      :  0.00953696812319659
r2       :  0.6048839576394317

For file ada_boost_baseline.csv
pearson  :  0.7988214260459509
spearman :  0.7780559066252843
mae      :  0.07713078050256739
mse      :  0.009342698342090006
r2       :  0.6129325435285244

For file SVM_baseline.csv
pearson  :  0.7747686594582374
spearman :  0.7563503882622532
mae      :  0.07906457106883466
mse      :  0.009776174954316456
r2       :  0.5949736323456092

For file MLP_baseline.csv
pearson  :  0.5617861056424609
spearman :  0.5658144254122174
mae      :  0.1236007498123032
mse      :  0.024975175150413018
r2       :  -0.03472007405490074


In [33]:
'''
For file ada_boost_baseline.csv
pearson  :  0.7988214260459509
spearman :  0.7780559066252843
mae      :  0.07713078050256739
mse      :  0.009342698342090006
r2       :  0.6129325435285244
'''

'\nFor file ada_boost_baseline.csv\npearson  :  0.7988214260459509\nspearman :  0.7780559066252843\nmae      :  0.07713078050256739\nmse      :  0.009342698342090006\nr2       :  0.6129325435285244\n'

## Testing Multi Word Predictions - with Single + Multi Dataset

In [34]:
SUBMISSION_FOLDER = os.path.join(FOLDER_PATH,"predictions/baselines/multi")

if( not os.path.exists(SUBMISSION_FOLDER)):
    os.makedirs(SUBMISSION_FOLDER)

In [35]:
sentences_train_list = list(data['sentence'])
complexity_train_list = list(data['complexity'])
tokens_train_list = list(data['token'])

vectors = get_embeddings(sentences_train_list, tokens_train_list)
print(vectors.shape)

(9179, 300)


In [36]:
reg = LinearRegression().fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/linear_regression_baseline.csv", index=False, header=False)

# Gradient Boosting
reg = GradientBoostingRegressor().fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/gradient_boosting_baseline.csv", index=False, header=False)

# AdaBoost
reg = AdaBoostRegressor().fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/ada_boost_baseline.csv", index=False, header=False)


# SVM regressor
reg = SVR().fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/SVM_baseline.csv", index=False, header=False)

# MLP Regressor
reg = MLPRegressor(hidden_layer_sizes=(150)).fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(testdf_vectors)

pred = pd.DataFrame({"ID":test_df.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/MLP_baseline.csv", index=False, header=False)

In [37]:
evaluate(SUBMISSION_FOLDER, FOLDER_PATH+"/references/lcp_multi_test_labelled_preprocessed.csv")


For file linear_regression_baseline.csv
pearson  :  0.8179201155507047
spearman :  0.8049744407871943
mae      :  0.10626544460880245
mse      :  0.0162353528148035
r2       :  0.3273702640454511

For file gradient_boosting_baseline.csv
pearson  :  0.8193283866008076
spearman :  0.8090107910611813
mae      :  0.10516079075843644
mse      :  0.01603721066214149
r2       :  0.3355792820659891

For file ada_boost_baseline.csv
pearson  :  0.7715057753961843
spearman :  0.7468502086000744
mae      :  0.1002614955390914
mse      :  0.015038504834717827
r2       :  0.3769556072163568

For file SVM_baseline.csv
pearson  :  0.8137261535536578
spearman :  0.8074492340697415
mae      :  0.07244008552665147
mse      :  0.008700322647294615
r2       :  0.6395461317210687

For file MLP_baseline.csv
pearson  :  0.6534750568487009
spearman :  0.632187822519071
mae      :  0.10611963154645704
mse      :  0.01823920316785689
r2       :  0.24435085884732366


In [38]:
'''
For file gradient_boosting_baseline.csv
pearson  :  0.8193283866008076
spearman :  0.8090107910611813
mae      :  0.10516079075843644
mse      :  0.01603721066214149
r2       :  0.3355792820659891
'''

'\nFor file gradient_boosting_baseline.csv\npearson  :  0.8193283866008076\nspearman :  0.8090107910611813\nmae      :  0.10516079075843644\nmse      :  0.01603721066214149\nr2       :  0.3355792820659891\n'