## Initial Setup - Imports and Downloads

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip -d embeddings

--2021-04-12 06:49:04--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-04-12 06:49:04--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-04-12 06:49:04--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [3]:
import os
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.svm import SVR
from sklearn import preprocessing
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor

In [4]:
FOLDER_PATH = "/content/drive/MyDrive/CS60075-Team28-Task-1"
DATA_FOLDER = os.path.join(FOLDER_PATH,"data/preprocessed")

In [5]:
# import evaluate function
import sys
sys.path.append(FOLDER_PATH)
from eval import evaluate

## Functions to Read GloVe Embeddings and Extract them According to sentence

In [6]:
def read_glove_vector(glove_vec):
  with open(glove_vec, 'r', encoding='UTF-8') as f:
    words = set()
    word_to_vec_map = {}
    for line in f:
      w_line = line.split()
      curr_word = w_line[0]
      word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)

  return word_to_vec_map

word_to_vec_map = read_glove_vector('embeddings/glove.6B.300d.txt')
print(len(word_to_vec_map)," words loaded!")

400000  words loaded!


In [7]:
def get_embeddings(sentences, tokens):
    token_emb = []
    for s,t in zip(sentences, tokens):
        
        # fill unk by nan
        # calculate mean over non nan embeddings
        # fill unk by the mean embedding of sentence
        # pad 0 vectors till max_len
        
        temp_emb = [ word_to_vec_map[x] if x in word_to_vec_map else np.full((300,), np.nan) for x in t.split() ]
        
        # calculate mean for filling null values <unk>
        temp_sent_emb = [ word_to_vec_map[x] if x in word_to_vec_map else np.full((300,), np.nan) for x in s.split() ]
        mean_emb = np.nanmean(np.array(temp_sent_emb), axis=0)
        
        # single or multi - will be converted to (1,300) 
        temp_emb = np.mean(np.array([ mean_emb if np.isnan(x[0]) else x for x in temp_emb ]), axis=0)

        token_emb.append(temp_emb)

    return np.array(token_emb)

## Load data with Features

In [8]:
f1 = pd.read_csv(os.path.join(FOLDER_PATH, "data/extra_features/lcp_single_train_features.csv"), index_col=0)
f1['token'] = f1['token'].astype(str)
f1['sentence'] = f1['sentence'].astype(str)
f1.set_index("id", inplace=True)

# drop unwanted features
f1.drop(['parse', 'lemma'], axis=1, inplace=True)

print(f1.columns)

f2 = pd.read_csv(os.path.join(FOLDER_PATH, "data/added_corpus_presence/lcp_single_train_preprocessed.csv"), index_col=0)
f2['token'] = f2['token'].astype(str)
f2['sentence'] = f2['sentence'].astype(str)
print(f2.columns)

features = f1.merge(f2, on=['id','sentence', 'corpus', 'token', 'complexity'])

Index(['sentence', 'corpus', 'token', 'complexity', 'token_length',
       'syllables', 'pos', 'dep num', 'synonyms', 'hypernyms', 'hyponyms',
       'google frequency'],
      dtype='object')
Index(['corpus', 'sentence', 'token', 'complexity', 'biomedical', 'bible',
       'subtitles', 'wiki', 'familarity'],
      dtype='object')


In [9]:
# fill pos nan by NN, as they are in majority
features['pos'] = features['pos'].fillna('NN')
features['token_length'] = features['token_length'].fillna(0)

# categorical encoding
labels = dict(features['pos'].value_counts())
labels = { k:i for i,k in enumerate(labels)}
labels['POS'] = len(labels)
print(labels)


def get_vowels(word):
    val = 0
    for w in word:
        if(w in ['A', 'a', 'E', 'e', 'I', 'i', 'O', 'o', 'U','u']):
            val+=1
    return val

features['token_vowels'] = features['token'].apply(lambda x: get_vowels(x) )


features['pos'] = features['pos'].apply((lambda x: labels[x]))

# scaler = preprocessing.StandardScaler()
# features[['token_length', 'syllables', 'pos', 'dep num', 'synonyms', 'hypernyms', 'hyponyms', 'google frequency', 'familarity', 'token_vowels']] =  \
#     scaler.fit_transform(features[['token_length', 'syllables', 'pos', 'dep num', 'synonyms', 'hypernyms', 'hyponyms', 'google frequency', 'familarity', 'token_vowels']])


features.head()

{'NN': 0, 'JJ': 1, 'NNP': 2, 'NNS': 3, 'VBG': 4, 'VB': 5, 'RB': 6, 'VBP': 7, 'UH': 8, 'CD': 9, 'VBN': 10, 'FW': 11, 'VBZ': 12, 'IN': 13, 'JJR': 14, 'VBD': 15, 'PRP': 16, 'GW': 17, 'MD': 18, 'LS': 19, 'WRB': 20, 'JJS': 21, 'AFX': 22, 'POS': 23}


Unnamed: 0_level_0,sentence,corpus,token,complexity,token_length,syllables,pos,dep num,synonyms,hypernyms,hyponyms,google frequency,biomedical,bible,subtitles,wiki,familarity,token_vowels
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
3ZLW647WALVGE8EBR50EGUBPU4P32A,behold came river seven cattle sleek fat fed m...,bible,river,0.0,5.0,2,0,0,1,1,0,173.485953,1,0,0,0,565,2
34R0BODSP1ZBN3DVY8J8XSIY551E5C,fellow bondservant brother prophet keep word book,bible,brother,0.0,7.0,2,0,3,5,1,3,112.198857,1,0,0,0,598,2
3S1WOPCJFGTJU2SGNAN2Y213N6WJE3,man lord land said u know honest men leave one...,bible,brother,0.05,7.0,2,0,1,5,1,3,112.198857,1,0,0,0,598,2
3BFNCI9LYKQN09BHXHH9CLSX5KP738,shimei sixteen son six daughter brother didnt ...,bible,brother,0.15,7.0,2,0,4,5,1,3,112.198857,1,0,0,0,598,2
3G5RUKN2EC3YIWSKUXZ8ZVH95R49N2,put brother far,bible,brother,0.263889,7.0,2,0,2,5,1,3,112.198857,1,0,0,0,598,2


In [10]:
multi_f1 = pd.read_csv(os.path.join(FOLDER_PATH, "data/extra_features/lcp_multi_train_split_features.csv"), index_col=0)
multi_f1['token'] = multi_f1['token'].astype(str)
multi_f1['sentence'] = multi_f1['sentence'].astype(str)
multi_f1.set_index("id", inplace=True)

# drop unwanted features
multi_f1.drop(['parse', 'token1', 'token2', 'lemma1', 'lemma2', 'Unnamed: 0.1'], axis=1, inplace=True)

multi_f2 = pd.read_csv(os.path.join(FOLDER_PATH, "data/added_corpus_presence/lcp_multi_train_preprocessed.csv"), index_col=0)
multi_f2['token'] = multi_f2['token'].astype(str)
multi_f2['sentence'] = multi_f2['sentence'].astype(str)

multi_features = multi_f1.merge(multi_f2, on=['id','sentence', 'corpus', 'token', 'complexity'])
multi_features.head(2)

Unnamed: 0_level_0,sentence,corpus,token,complexity,token_length,syllables,pos1,dep num1,pos2,dep num2,synonyms1,hypernyms1,hyponyms1,synonyms2,hypernyms2,hyponyms2,google frequency1,google frequency2,token_vowels,biomedical,bible,subtitles,wiki,familarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
3S37Y8CWI80N8KVM53U4E6JKCDC4WE,seventh day sabbath yahweh god shall work son ...,bible,seventh day,0.027778,11,3,JJ,0,NN,1,4,1,0,10,1,7,24.522564,682.298213,3,1,0,1,0,297.5
3WGCNLZJKF877FYC1Q6COKNWTDWD11,let man test own work take pride neighbor,bible,own work,0.05,8,2,JJ,0,NN,3,2,0,1,34,1,27,0.0,1022.711588,2,1,1,1,1,600.5


In [11]:
# fill pos nan by NN, as they are in majority
multi_features['pos2'] = multi_features['pos2'].fillna('NN')

multi_features['pos1'] = multi_features['pos1'].apply((lambda x: labels[x]))
multi_features['pos2'] = multi_features['pos2'].apply((lambda x: labels[x]))

# scaler = preprocessing.StandardScaler()
# multi_features[['token_length', 'syllables', 'pos1', 'pos2', 'dep num1', 'dep num2', 'synonyms1', 'synonyms2', 'hypernyms1', 'hypernyms2', 'hyponyms1', 'hyponyms2', 'google frequency1', 'google frequency2', 'familarity', 'token_vowels']] =  \
#     scaler.fit_transform(multi_features[['token_length', 'syllables', 'pos1', 'pos2', 'dep num1', 'dep num2', 'synonyms1', 'synonyms2', 'hypernyms1', 'hypernyms2', 'hyponyms1', 'hyponyms2', 'google frequency1', 'google frequency2', 'familarity', 'token_vowels']])


multi_features.head()

Unnamed: 0_level_0,sentence,corpus,token,complexity,token_length,syllables,pos1,dep num1,pos2,dep num2,synonyms1,hypernyms1,hyponyms1,synonyms2,hypernyms2,hyponyms2,google frequency1,google frequency2,token_vowels,biomedical,bible,subtitles,wiki,familarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
3S37Y8CWI80N8KVM53U4E6JKCDC4WE,seventh day sabbath yahweh god shall work son ...,bible,seventh day,0.027778,11,3,1,0,0,1,4,1,0,10,1,7,24.522564,682.298213,3,1,0,1,0,297.5
3WGCNLZJKF877FYC1Q6COKNWTDWD11,let man test own work take pride neighbor,bible,own work,0.05,8,2,1,0,0,3,2,0,1,34,1,27,0.0,1022.711588,2,1,1,1,1,600.5
3UOMW19E6D6WQ5TH2HDD74IVKTP5CB,understanding made heaven loving kindness endu...,bible,loving kindness,0.05,15,4,4,0,0,2,10,1,12,3,1,4,339.132903,14.221491,4,1,1,1,1,566.5
36JW4WBR06KF9AXMUL4N476OMF8FHD,remember god also spare according greatness lo...,bible,loving kindness,0.05,15,4,4,1,0,2,10,1,12,3,1,4,339.132903,14.221491,4,1,1,1,1,566.5
3HRWUH63QU2FH9Q8R7MRNFC7JX2N5A,loving kindness better life lip shall praise,bible,loving kindness,0.075,15,4,4,0,0,0,10,1,12,3,1,4,339.132903,14.221491,4,1,1,1,1,566.5


In [13]:
features['pos1'] = features['pos'].copy()
features['pos2'] = features['pos'] 

features['dep num1'] = features['dep num'] 
features['dep num2'] = features['dep num'] 

features['synonyms1'] = features['synonyms'] 
features['synonyms2'] = features['synonyms'] 

features['hypernyms1'] = features['hypernyms'] 
features['hypernyms2'] = features['hypernyms'] 

features['hyponyms1'] = features['hyponyms'] 
features['hyponyms2'] = features['hyponyms'] 

features['google frequency1'] = features['google frequency'] 
features['google frequency2'] = features['google frequency1'] 

features.drop(['pos','dep num', 'synonyms', 'hyponyms', 'hypernyms', 'google frequency'], axis=1, inplace=True)

features = features.append( multi_features)
print(len(features))

scaler = preprocessing.StandardScaler()
features[['token_length', 'syllables', 'pos1', 'pos2', 'dep num1', 'dep num2', 'synonyms1', 'synonyms2', 'hypernyms1', 'hypernyms2', 'hyponyms1', 'hyponyms2', 'google frequency1', 'google frequency2', 'familarity', 'token_vowels']] =  \
    scaler.fit_transform(features[['token_length', 'syllables', 'pos1', 'pos2', 'dep num1', 'dep num2', 'synonyms1', 'synonyms2', 'hypernyms1', 'hypernyms2', 'hyponyms1', 'hyponyms2', 'google frequency1', 'google frequency2', 'familarity', 'token_vowels']])

features.head()

9179


Unnamed: 0_level_0,sentence,corpus,token,complexity,token_length,syllables,biomedical,bible,subtitles,wiki,familarity,token_vowels,pos1,pos2,dep num1,dep num2,synonyms1,synonyms2,hypernyms1,hypernyms2,hyponyms1,hyponyms2,google frequency1,google frequency2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
3ZLW647WALVGE8EBR50EGUBPU4P32A,behold came river seven cattle sleek fat fed m...,bible,river,0.0,-0.797042,-0.505404,1,0,0,0,1.276865,-0.689858,-0.322563,-0.248693,-0.707506,-0.831106,-0.7294,-0.754824,0.336733,0.195761,-0.333525,-0.369612,0.419993,0.466245
34R0BODSP1ZBN3DVY8J8XSIY551E5C,fellow bondservant brother prophet keep word book,bible,brother,0.0,-0.337058,-0.505404,1,0,0,0,1.405225,-0.689858,-0.322563,-0.248693,1.70999,1.365895,-0.024553,-0.063647,0.336733,0.195761,-0.168281,-0.211593,0.088143,0.097424
3S1WOPCJFGTJU2SGNAN2Y213N6WJE3,man lord land said u know honest men leave one...,bible,brother,0.05,-0.337058,-0.505404,1,0,0,0,1.405225,-0.689858,-0.322563,-0.248693,0.098326,-0.098772,-0.024553,-0.063647,0.336733,0.195761,-0.168281,-0.211593,0.088143,0.097424
3BFNCI9LYKQN09BHXHH9CLSX5KP738,shimei sixteen son six daughter brother didnt ...,bible,brother,0.15,-0.337058,-0.505404,1,0,0,0,1.405225,-0.689858,-0.322563,-0.248693,2.515822,2.098228,-0.024553,-0.063647,0.336733,0.195761,-0.168281,-0.211593,0.088143,0.097424
3G5RUKN2EC3YIWSKUXZ8ZVH95R49N2,put brother far,bible,brother,0.263889,-0.337058,-0.505404,1,0,0,0,1.405225,-0.689858,-0.322563,-0.248693,0.904158,0.633561,-0.024553,-0.063647,0.336733,0.195761,-0.168281,-0.211593,0.088143,0.097424


## Testing Single Word Complexity Predictions

In [14]:
sentences_train_list = list(features['sentence'])
complexity_train_list = list(features['complexity'])
tokens_train_list = list(features['token'])

vectors = get_embeddings(sentences_train_list, tokens_train_list)
print(vectors.shape)

# f_vectors = features[['token_length', 'token_vowels', 'syllables', 'pos', 'dep num', 'synonyms', 'hypernyms', 'hyponyms', 'google frequency', 'biomedical', 'bible', 'subtitles', 'wiki', 'familarity']].values
# or 

f_vectors = features[['token_length', 'token_vowels', 'syllables', 'pos1', 'pos2', 'dep num1', 'dep num2', 
                        'synonyms1', 'synonyms2', 'hypernyms1', 'hypernyms2', 'hyponyms1', 'hyponyms2', 
                        'google frequency1', 'google frequency2', 
                        'biomedical', 'bible', 'subtitles', 'wiki', 'familarity']].values

print(f_vectors.shape)
vectors = np.concatenate((vectors, f_vectors), axis=1)

(9179, 300)
(9179, 20)


In [15]:
SUBMISSION_FOLDER = os.path.join(FOLDER_PATH,"predictions/baselines/features/single")

if( not os.path.exists(SUBMISSION_FOLDER)):
    os.makedirs(SUBMISSION_FOLDER)

In [19]:
test_f1 = pd.read_csv(os.path.join(FOLDER_PATH, "data/extra_features/lcp_single_test_features.csv"), index_col=0)
test_f1['token'] = test_f1['token'].astype(str)
test_f1['sentence'] = test_f1['sentence'].astype(str)
test_f1.set_index("id", inplace=True)

# drop unwanted features
test_f1.drop(['parse', 'lemma'], axis=1, inplace=True)

test_f2 = pd.read_csv(os.path.join(FOLDER_PATH, "data/added_corpus_presence/lcp_single_test_preprocessed.csv"), index_col=0)
test_f2['token'] = test_f2['token'].astype(str)
test_f2['sentence'] = test_f2['sentence'].astype(str)

test_features = test_f1.merge(test_f2, on=['id','sentence', 'corpus', 'token'])

In [20]:
# fill pos nan by NN, as they are in majority
test_features['pos'] = test_features['pos'].fillna('NN')
test_features['token_length'] = test_features['token_length'].fillna(0)

test_features['pos'] = test_features['pos'].apply((lambda x: labels[x]))

def get_vowels(word):
    val = 0
    for w in word:
        if(w in ['A', 'a', 'E', 'e', 'I', 'i', 'O', 'o', 'U','u']):
            val+=1
    return val

test_features['token_vowels'] = test_features['token'].apply(lambda x: get_vowels(x) )

test_features['pos1'] = test_features['pos'].copy()
test_features['pos2'] = test_features['pos'] 

test_features['dep num1'] = test_features['dep num'] 
test_features['dep num2'] = test_features['dep num'] 

test_features['synonyms1'] = test_features['synonyms'] 
test_features['synonyms2'] = test_features['synonyms'] 

test_features['hypernyms1'] = test_features['hypernyms'] 
test_features['hypernyms2'] = test_features['hypernyms'] 

test_features['hyponyms1'] = test_features['hyponyms'] 
test_features['hyponyms2'] = test_features['hyponyms'] 

test_features['google frequency1'] = test_features['google frequency'] 
test_features['google frequency2'] = test_features['google frequency'] 

test_features.drop(['pos','dep num', 'synonyms', 'hyponyms', 'hypernyms', 'google frequency'], axis=1, inplace=True)


test_features[['token_length', 'syllables', 'pos1', 'pos2', 'dep num1', 'dep num2', 'synonyms1', 'synonyms2', 'hypernyms1', 'hypernyms2', 'hyponyms1', 'hyponyms2', 'google frequency1', 'google frequency2', 'familarity', 'token_vowels']] =  \
    scaler.transform(test_features[['token_length', 'syllables', 'pos1', 'pos2', 'dep num1', 'dep num2', 'synonyms1', 'synonyms2', 'hypernyms1', 'hypernyms2', 'hyponyms1', 'hyponyms2', 'google frequency1', 'google frequency2', 'familarity', 'token_vowels']])


In [21]:
sentences_test_list = list(test_features['sentence'])
test_tokens_list = list(test_features['token'])

test_vectors = get_embeddings(sentences_test_list, test_tokens_list)
print(test_vectors.shape)

# test_f_vectors = test_features[['token_length', 'token_vowels', 'syllables', 'pos', 'dep num', 'synonyms', 'hypernyms', 'hyponyms', 'google frequency', 'biomedical', 'bible', 'subtitles', 'wiki', 'familarity']].values

test_f_vectors = test_features[['token_length', 'token_vowels', 'syllables', 'pos1', 'pos2', 'dep num1', 'dep num2', 
                        'synonyms1', 'synonyms2', 'hypernyms1', 'hypernyms2', 'hyponyms1', 'hyponyms2', 
                        'google frequency1', 'google frequency2', 
                        'biomedical', 'bible', 'subtitles', 'wiki', 'familarity']].values

test_vectors = np.concatenate((test_vectors, test_f_vectors), axis=1)
print(test_vectors.shape)

(917, 300)
(917, 320)


In [22]:
# # # Gradient Boosting
reg = GradientBoostingRegressor(n_estimators=250).fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(test_vectors)

pred = pd.DataFrame({"ID":test_features.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/gradient_boosting_baseline.csv", index=False, header=False)

# Linear Regression
reg = LinearRegression().fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(test_vectors)

pred = pd.DataFrame({"ID":test_features.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/linear_regression_baseline.csv", index=False, header=False)

# # xgb Regression
# from xgboost import XGBRegressor
# reg = XGBRegressor(objective ='reg:squarederror', n_estimators=250).fit(vectors, np.array(complexity_train_list))
# y_pred = reg.predict(test_vectors)

# pred = pd.DataFrame({"ID":test_features.index, "complexity":y_pred})
# pred.to_csv(SUBMISSION_FOLDER+"/xgb_regression_baseline.csv", index=False, header=False)


# # AdaBoost
reg = AdaBoostRegressor().fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(test_vectors)

pred = pd.DataFrame({"ID":test_features.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/ada_boost_baseline.csv", index=False, header=False)

# SVM regressor
reg = SVR().fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(test_vectors)

pred = pd.DataFrame({"ID":test_features.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/SVM_baseline.csv", index=False, header=False)

# MLP Regressor
reg = MLPRegressor(hidden_layer_sizes=(150)).fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(test_vectors)

pred = pd.DataFrame({"ID":test_features.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/MLP_baseline.csv", index=False, header=False)

In [23]:
evaluate(SUBMISSION_FOLDER, FOLDER_PATH+"/references/lcp_single_test_labelled_preprocessed.csv")


For file gradient_boosting_baseline.csv
pearson  :  0.7327576203283477
spearman :  0.7005231534638561
mae      :  0.06711175279397771
mse      :  0.007541193308956854
r2       :  0.5340649883126738

For file linear_regression_baseline.csv
pearson  :  0.6873590114098053
spearman :  0.6756264174128696
mae      :  0.07243309012545256
mse      :  0.008763922190523868
r2       :  0.45851829797035115

For file ada_boost_baseline.csv
pearson  :  0.6939101001270307
spearman :  0.6816027968019428
mae      :  0.07241640271567897
mse      :  0.008909605803147074
r2       :  0.44951718992693

For file SVM_baseline.csv
pearson  :  0.7039790652215149
spearman :  0.6825872094709771
mae      :  0.0707195319367411
mse      :  0.008285997725066644
r2       :  0.48804701209759505

For file MLP_baseline.csv
pearson  :  0.4797932186467725
spearman :  0.4510805101895456
mae      :  0.11374913835819352
mse      :  0.02234535372724599
r2       :  -0.38061474139587914


In [32]:
'''
For file gradient_boosting_baseline.csv
pearson  :  0.7327576203283477
spearman :  0.7005231534638561
mae      :  0.06711175279397771
mse      :  0.007541193308956854
r2       :  0.5340649883126738
'''

'\nFor file gradient_boosting_baseline.csv\npearson  :  0.7327576203283477\nspearman :  0.7005231534638561\nmae      :  0.06711175279397771\nmse      :  0.007541193308956854\nr2       :  0.5340649883126738\n'

## Tesing Multi Word Complexity

In [24]:
sentences_train_list = list(multi_features['sentence'])
complexity_train_list = list(multi_features['complexity'])
tokens_train_list = list(multi_features['token'])

vectors = get_embeddings(sentences_train_list, tokens_train_list)
print(vectors.shape)

f_vectors = multi_features[['token_length', 'token_vowels', 'syllables', 'pos1', 'pos2', 'dep num1', 'dep num2', 
                        'synonyms1', 'synonyms2', 'hypernyms1', 'hypernyms2', 'hyponyms1', 'hyponyms2', 
                        'google frequency1', 'google frequency2', 
                        'biomedical', 'bible', 'subtitles', 'wiki', 'familarity']].values
print(f_vectors.shape)
vectors = np.concatenate((vectors, f_vectors), axis=1)

(1517, 300)
(1517, 20)


In [25]:
SUBMISSION_FOLDER = os.path.join(FOLDER_PATH,"predictions/baselines/features/multi")

if( not os.path.exists(SUBMISSION_FOLDER)):
    os.makedirs(SUBMISSION_FOLDER)

In [26]:
test_multi_f1 = pd.read_csv(os.path.join(FOLDER_PATH, "data/extra_features/lcp_multi_test_split_features.csv"), index_col=0)
test_multi_f1['token'] = test_multi_f1['token'].astype(str)
test_multi_f1['sentence'] = test_multi_f1['sentence'].astype(str)
test_multi_f1.set_index("id", inplace=True)

# drop unwanted features
test_multi_f1.drop(['parse', 'token1', 'token2', 'lemma1', 'lemma2', 'Unnamed: 0.1'], axis=1, inplace=True)

test_multi_f2 = pd.read_csv(os.path.join(FOLDER_PATH, "data/added_corpus_presence/lcp_multi_test_preprocessed.csv"), index_col=0)
test_multi_f2['token'] = test_multi_f2['token'].astype(str)
test_multi_f2['sentence'] = test_multi_f2['sentence'].astype(str)

test_multi_features = test_multi_f1.merge(test_multi_f2, on=['id','sentence', 'corpus', 'token'])
test_multi_features['token'] = test_multi_f2['token'].astype(str)
test_multi_features.head(2)

Unnamed: 0_level_0,sentence,corpus,token,token_length,syllables,pos1,dep num1,pos2,dep num2,synonyms1,hypernyms1,hyponyms1,synonyms2,hypernyms2,hyponyms2,google frequency1,google frequency2,token_vowels,biomedical,bible,subtitles,wiki,familarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
3A9LA2FRWSEW9WO7UFA9AE6VQK3XHL,come intending bring bound chief priest,bible,chief priest,12,2,JJ,0,NN,2,4,1,10,2,2,11,0.0,39.121551,4,1,0,1,0,483.0
302U8RURJZ1WF35NXY44RD66WL4NVH,day lord take away beauty anklet headband cres...,bible,crescent necklace,17,4,NN,1,NN,2,2,1,0,1,1,3,4.830131,4.021996,5,1,1,1,1,268.0


In [27]:
test_multi_features['pos2'] = test_multi_features['pos2'].fillna('NN')

test_multi_features['pos1'] = test_multi_features['pos1'].apply((lambda x: labels[x]))
test_multi_features['pos2'] = test_multi_features['pos2'].apply((lambda x: labels[x]))

test_multi_features[['token_length', 'syllables', 'pos1', 'pos2', 'dep num1', 'dep num2', 'synonyms1', 'synonyms2', 'hypernyms1', 'hypernyms2', 'hyponyms1', 'hyponyms2', 'google frequency1', 'google frequency2', 'familarity', 'token_vowels']] =  \
    scaler.transform(test_multi_features[['token_length', 'syllables', 'pos1', 'pos2', 'dep num1', 'dep num2', 'synonyms1', 'synonyms2', 'hypernyms1', 'hypernyms2', 'hyponyms1', 'hyponyms2', 'google frequency1', 'google frequency2', 'familarity', 'token_vowels']])


test_multi_features.head()

Unnamed: 0_level_0,sentence,corpus,token,token_length,syllables,pos1,dep num1,pos2,dep num2,synonyms1,hypernyms1,hyponyms1,synonyms2,hypernyms2,hyponyms2,google frequency1,google frequency2,token_vowels,biomedical,bible,subtitles,wiki,familarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
3A9LA2FRWSEW9WO7UFA9AE6VQK3XHL,come intending bring bound chief priest,bible,chief priest,0.812903,-0.505404,0.277033,-0.707506,-0.248693,0.633561,-0.200765,0.336733,0.217289,-0.58203,3.160928,0.209791,-0.519379,-0.34235,0.393219,1,0,1,0,0.957911
302U8RURJZ1WF35NXY44RD66WL4NVH,day lord take away beauty anklet headband cres...,bible,crescent necklace,1.962864,0.69426,-0.322563,0.098326,-0.248693,0.633561,-0.553188,0.336733,-0.333525,-0.754824,0.195761,-0.211593,-0.493225,-0.553577,0.934757,1,1,1,1,0.121629
3UDTAB6HH6ZVX00DTRXAOJLWX0B094,unclean shall take ash burning sin offering ru...,bible,sin offering,0.812903,0.69426,-0.322563,1.70999,2.390326,-0.098772,0.504082,0.336733,-0.278444,1.837089,0.195761,-0.369612,-0.238985,0.113318,0.393219,1,0,1,0,0.05356
3L2OEKSTW9ASGQDOW725GFK5P77Y8D,precious treasure oil dwelling wise foolish ma...,bible,precious treasure,1.962864,0.69426,0.277033,-0.707506,-0.248693,-0.098772,-0.024553,-2.087483,-0.333525,0.109147,0.195761,-0.15892,-0.393608,-0.489569,2.559373,1,0,1,1,0.076898
39N6W9XWRDN795J6F5ET8S13DQKYGT,long god shall adversary reproach,bible,adversary reproach,2.192856,1.893924,-0.322563,-0.707506,-0.248693,2.098228,-0.7294,0.336733,-0.113199,-0.409235,0.195761,-0.264266,-0.489295,-0.547988,1.476296,1,0,1,1,-0.920806


In [28]:
sentences_test_list = list(test_multi_features['sentence'])
test_tokens_list = list(test_multi_features['token'])

test_vectors = get_embeddings(sentences_test_list, test_tokens_list)
print(test_vectors.shape)

test_f_vectors = test_multi_features[['token_length', 'token_vowels', 'syllables', 'pos1', 'pos2', 'dep num1', 'dep num2', 
                        'synonyms1', 'synonyms2', 'hypernyms1', 'hypernyms2', 'hyponyms1', 'hyponyms2', 
                        'google frequency1', 'google frequency2', 
                        'biomedical', 'bible', 'subtitles', 'wiki', 'familarity']].values
test_vectors = np.concatenate((test_vectors, test_f_vectors), axis=1)
print(test_vectors.shape)

(184, 300)
(184, 320)


In [29]:
reg = LinearRegression().fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(test_vectors)

pred = pd.DataFrame({"ID":test_multi_features.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/linear_regression_baseline.csv", index=False, header=False)

# Gradient Boosting
reg = GradientBoostingRegressor(n_estimators=100).fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(test_vectors)

pred = pd.DataFrame({"ID":test_multi_features.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/gradient_boosting_baseline.csv", index=False, header=False)

# AdaBoost
reg = AdaBoostRegressor().fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(test_vectors)

pred = pd.DataFrame({"ID":test_multi_features.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/ada_boost_baseline.csv", index=False, header=False)

# SVM regressor
reg = SVR().fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(test_vectors)

pred = pd.DataFrame({"ID":test_multi_features.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/SVM_baseline.csv", index=False, header=False)

# MLP Regressor
reg = MLPRegressor(hidden_layer_sizes=(150)).fit(vectors, np.array(complexity_train_list))
y_pred = reg.predict(test_vectors)

pred = pd.DataFrame({"ID":test_multi_features.index, "complexity":y_pred})
pred.to_csv(SUBMISSION_FOLDER+"/MLP_baseline.csv", index=False, header=False)

In [30]:
evaluate(SUBMISSION_FOLDER, FOLDER_PATH+"/references/lcp_multi_test_labelled_preprocessed.csv")


For file linear_regression_baseline.csv
pearson  :  0.730243938549043
spearman :  0.6997649142287377
mae      :  0.08612029690470382
mse      :  0.011363087082887057
r2       :  0.52922795387472

For file gradient_boosting_baseline.csv
pearson  :  0.7775428358026494
spearman :  0.7475872283891783
mae      :  0.08626575847804864
mse      :  0.011010708764663023
r2       :  0.5438269673884285

For file ada_boost_baseline.csv
pearson  :  0.7648474859163565
spearman :  0.7502571614605091
mae      :  0.08956356841439955
mse      :  0.012417509570243967
r2       :  0.48554329069889246

For file SVM_baseline.csv
pearson  :  0.7210195529865164
spearman :  0.715618549910612
mae      :  0.1545108952206832
mse      :  0.03618523973084165
r2       :  -0.4991524066797841

For file MLP_baseline.csv
pearson  :  0.24488732175520614
spearman :  0.21205995707562106
mae      :  0.29056876904625023
mse      :  0.12700353282624
r2       :  -4.261749080827882


In [31]:
'''
For file gradient_boosting_baseline.csv
pearson  :  0.7775428358026494
spearman :  0.7475872283891783
mae      :  0.08626575847804864
mse      :  0.011010708764663023
r2       :  0.5438269673884285
'''

'\nFor file gradient_boosting_baseline.csv\npearson  :  0.7775428358026494\nspearman :  0.7475872283891783\nmae      :  0.08626575847804864\nmse      :  0.011010708764663023\nr2       :  0.5438269673884285\n'