In [1]:
# Only run this cell when you are on Colab
# !pip install syllables
# from google.colab import drive
# drive.mount('/content/drive')

# %cd /content/drive/MyDrive/nlp_project


In [2]:
import os
import nltk
import pandas as pd
import numpy as np
import gensim.downloader
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize 

import syllables 
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

from eval import evaluate

In [3]:
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's
# downloader. The later cells index it by word and use the entry for 'unk'
# as the fallback for out-of-vocabulary words.
word_vector = gensim.downloader.load('word2vec-google-news-300')

In [4]:
# Fetch NLTK's 'punkt' tokenizer data (a no-op when already present, see output).
# NOTE(review): presumably needed for `word_tokenize` imported above, which is
# not called in the cells visible here - confirm it is still required.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ankit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# **Add features for single words**

In [5]:
def add_single_features(name,isTrain) :
    """Add hand-crafted token features to a single-word split and save it.

    Reads ``data/preprocessed/<name>``, derives features from the ``token``
    column and writes the augmented frame to ``data/added_features/<name>``
    (without the positional index). The output directory is created when it
    does not exist yet.

    Columns added:
        token_length          -- number of characters in the token
        token_syllable        -- ``syllables.estimate(token)``
        token_vowels          -- number of a/e/i/o/u characters (either case)
        token_frequency       -- number of rows in this same file with that token
        token_contain_numeral -- True when the token contains a digit 0-9
        nearest_class         -- only when ``isTrain``: ``round(4*complexity)+1``

    Rows whose token is not a string (for example a cell pandas parsed as NaN)
    get 0 / False for every token feature.

    Parameters
    ----------
    name : str
        CSV file name inside ``data/preprocessed``; reused for the output file.
    isTrain : bool
        When true, the ``complexity`` column is read and ``nearest_class`` is
        added. Presumably ``complexity`` lies in [0, 1], which would give
        classes 1..5 - confirm against the data.

    Returns
    -------
    None
        The result is only written to disk.
    """
    df = pd.read_csv("data/preprocessed/" + name)
    tokens = list(df["token"])

    # NOTE(review): frequencies are counted inside the file being processed, so
    # train and test frequencies come from corpora of different sizes.
    frequency_of_word = {}
    for word in tokens:
        if isinstance(word, str):
            frequency_of_word[word] = frequency_of_word.get(word, 0) + 1

    vowels = set("aeiouAEIOU")
    digits = set("0123456789")

    def vowel_count(text):
        """Count the vowel characters in ``text``."""
        return sum(1 for ch in text if ch in vowels)

    def contain_numeral(text):
        """Return True when ``text`` contains at least one digit."""
        return any(ch in digits for ch in text)

    token_length = []
    token_syllable = []
    token_vowels = []
    token_frequency = []
    token_contain_numeral = []

    for word in tokens:
        if not isinstance(word, str):
            # Missing / non-text token: neutral values for every feature.
            token_length.append(0)
            token_syllable.append(0)
            token_vowels.append(0)
            token_frequency.append(0)
            token_contain_numeral.append(False)
            continue
        token_length.append(len(word))
        token_syllable.append(syllables.estimate(word))
        token_vowels.append(vowel_count(word))
        token_frequency.append(frequency_of_word[word])
        token_contain_numeral.append(contain_numeral(word))

    df['token_length'] = token_length
    df['token_syllable'] = token_syllable
    df['token_vowels'] = token_vowels
    df['token_frequency'] = token_frequency
    df['token_contain_numeral'] = token_contain_numeral
    if isTrain :
        # The class does not depend on the token, so it is computed for every row.
        df['nearest_class'] = [round(4*value)+1 for value in df["complexity"].to_numpy()]

    # to_csv cannot create missing parent directories itself.
    os.makedirs("data/added_features", exist_ok=True)
    df.to_csv("data/added_features/"+name,index=False)

In [6]:
# Build the feature-augmented CSVs for both single-word splits. Only the
# training split gets the extra `nearest_class` column (isTrain=True).
add_single_features("lcp_single_test_preprocessed.csv",False)
add_single_features("lcp_single_train_preprocessed.csv",True)

# **Models on single word**

In [7]:
# Load the feature-augmented single-word splits. The first CSV column (`id`)
# becomes the index, which is later used as the prediction ID.
data_folder = "data/added_features"
train_csv = os.path.join(data_folder, "lcp_single_train_preprocessed.csv")
test_csv = os.path.join(data_folder, "lcp_single_test_preprocessed.csv")
df_Train = pd.read_csv(train_csv, index_col=0)
df_Test = pd.read_csv(test_csv, index_col=0)
df_Test.head()

Unnamed: 0_level_0,corpus,sentence,token,token_length,token_syllable,token_vowels,token_frequency,token_contain_numeral
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
39TX062QX1OHFOH8FUL76K5L7D3X3S,bible,speak much prince world come nothing,prince,6,2,2,5,False
3CIS7GGG65JS8I3AZ9RG54AE4MUUEA,bible,house shall turned others field wife together ...,inhabitant,10,4,4,3,False
379OL9DBSSESUVWY1Z8JGBFG9BTY92,bible,stranger terrible nation cut left mountain val...,bough,5,1,2,2,False
3DFYDSXB2W00JYP2DA272KN69UQUJV,bible,sharpen tongue like sword aim arrow deadly word,arrow,5,2,2,3,False
31YWE12TE0CZG7IVH6OXJ1H1CFPX7X,bible,obey leader submit watch behalf soul give acco...,account,7,2,3,4,False


In [8]:
# Hand-crafted features as a float matrix. The explicit cast matters: the bool
# `token_contain_numeral` column mixed with the integer columns would otherwise
# make pandas return an object-dtype array, and the stacked matrix with it.
stat_feature = df_Train[['token_length','token_syllable','token_vowels','token_frequency','token_contain_numeral']].to_numpy(dtype=float)

# Fallback vector for words missing from the embedding vocabulary.
unk_vector = word_vector['unk']

# Embedding of the target token.
embed_word = np.array([word_vector[tok] if tok in word_vector else unk_vector for tok in df_Train['token']])
# Mean embedding over the whitespace-separated words of each sentence.
embed_sentence = np.array([
    sum(word_vector[w] if w in word_vector else unk_vector for w in sent.split()) / len(sent.split())
    for sent in df_Train['sentence']
])
# 5 stat columns followed by an equal-weight blend of token and sentence embeddings.
Train_Vector = np.hstack((stat_feature,.5*embed_word+.5*embed_sentence))
Train_Vector.shape

(7662, 305)

In [9]:
# Hand-crafted features as a float matrix. The explicit cast matters: the bool
# `token_contain_numeral` column mixed with the integer columns would otherwise
# make pandas return an object-dtype array, and the stacked matrix with it.
stat_feature = df_Test[['token_length','token_syllable','token_vowels','token_frequency','token_contain_numeral']].to_numpy(dtype=float)

# Fallback vector for words missing from the embedding vocabulary.
unk_vector = word_vector['unk']

# Embedding of the target token.
embed_word = np.array([word_vector[tok] if tok in word_vector else unk_vector for tok in df_Test['token']])
# Mean embedding over the whitespace-separated words of each sentence.
embed_sentence = np.array([
    sum(word_vector[w] if w in word_vector else unk_vector for w in sent.split()) / len(sent.split())
    for sent in df_Test['sentence']
])
# The two 0.5 blend weights must stay identical to the ones used for Train_Vector.
Test_Vector = np.hstack((stat_feature,.5*embed_word+.5*embed_sentence))
Test_Vector.shape

(917, 305)

In [10]:
# Folder that receives one headerless prediction CSV per single-word model.
submission_folder = "predictions/single_with_features"
# DataFrame.to_csv does not create missing directories, so make sure it exists.
os.makedirs(submission_folder, exist_ok=True)

In [11]:
# Linear Regression
# Ordinary least squares on the stacked feature matrix. Predictions are written
# as headerless (ID, complexity) rows.
y_train = np.array(df_Train['complexity'])
reg = LinearRegression()
reg.fit(Train_Vector, y_train)
y_pred = reg.predict(Test_Vector)

pred = pd.DataFrame(data={"ID": df_Test.index, "complexity": y_pred})
pred.to_csv(submission_folder+"/linear_regression_with_features.csv", header=False, index=False)

In [12]:
# SVM regressor
# Support-vector regression with scikit-learn's default settings. Predictions
# are written as headerless (ID, complexity) rows.
y_train = np.array(df_Train['complexity'])
reg = SVR()
reg.fit(Train_Vector, y_train)
y_pred = reg.predict(Test_Vector)

pred = pd.DataFrame(data={"ID": df_Test.index, "complexity": y_pred})
pred.to_csv(submission_folder+"/SVM_with_features.csv", header=False, index=False)

In [13]:
# Gradient Boosting
# random_state is fixed so that Restart & Run All reproduces the same model
# (the estimator permutes features randomly when searching for splits).
reg = GradientBoostingRegressor(random_state=0).fit(Train_Vector, np.array(df_Train['complexity']))
y_pred = reg.predict(Test_Vector)

# Headerless (ID, complexity) rows.
pred = pd.DataFrame({"ID":df_Test.index, "complexity":y_pred})
pred.to_csv(submission_folder+"/gradient_boosting_with_features.csv", index=False, header=False)

In [14]:
# MLP Regressor
# One hidden layer of 150 units. `(150,)` is the intended one-element tuple
# (`(150)` is just the int 150). random_state fixes the weight initialisation
# and shuffling so the reported scores are reproducible.
reg = MLPRegressor(hidden_layer_sizes=(150,), random_state=0).fit(Train_Vector, np.array(df_Train['complexity']))
y_pred = reg.predict(Test_Vector)

# Headerless (ID, complexity) rows.
pred = pd.DataFrame({"ID":df_Test.index, "complexity":y_pred})
pred.to_csv(submission_folder+"/MLP_with_features.csv", index=False, header=False)

### evaluate baseline models

In [15]:
# Score the prediction files in `submission_folder` against the labelled
# single-word test set. `evaluate` comes from the project-local `eval` module;
# the output below shows it reporting pearson, spearman, mae, mse and r2 per file.
evaluate(submission_folder, "references/lcp_single_test_labelled_preprocessed.csv")


For file MLP_with_features.csv
pearson  :  0.6057132793111173
spearman :  0.5503812552403398
mae      :  0.08905447567691986
mse      :  0.013068293427130534
r2       :  0.19257136089172333

For file linear_regression_with_features.csv
pearson  :  0.6311508461209863
spearman :  0.6077421054566633
mae      :  0.07661968054084875
mse      :  0.009917703426107611
r2       :  0.3872315597232592

For file gradient_boosting_with_features.csv
pearson  :  0.6510099362950927
spearman :  0.6197440617933732
mae      :  0.07254862857837216
mse      :  0.009343434124671625
r2       :  0.4227129699873944

For file SVM_with_features.csv
pearson  :  0.6744168783485909
spearman :  0.6307686935582127
mae      :  0.07273162781364834
mse      :  0.008875871431429068
r2       :  0.4516014787439301


## ***Add features for multi-word expressions (2 words)***

In [29]:
def add_multiword_features(name,isTrain) :
    """Add hand-crafted token features to a multi-word split and save it.

    Reads ``data/preprocessed/<name>``, splits every ``token`` cell on
    whitespace and aggregates per-word features over the resulting words. The
    augmented frame is written to ``data/added_features/<name>`` (without the
    positional index); the output directory is created when missing.

    Columns added (each one aggregated over the words of the expression):
        token_length          -- summed character count (spaces excluded)
        token_syllable        -- summed ``syllables.estimate(word)``
        token_vowels          -- summed count of a/e/i/o/u (either case)
        token_frequency       -- summed per-word frequency, counted over all
                                 words of the ``token`` column of this file
        token_contain_numeral -- True when any word contains a digit 0-9
        nearest_class         -- only when ``isTrain``: ``round(4*complexity)+1``

    For two-word expressions the values equal the previous pairwise sums.
    Expressions with any other number of words are aggregated the same way
    instead of raising on unpacking, and non-string cells (e.g. NaN) get
    0 / False for every token feature instead of raising on ``.split()``.

    Parameters
    ----------
    name : str
        CSV file name inside ``data/preprocessed``; reused for the output file.
    isTrain : bool
        When true, the ``complexity`` column is read and ``nearest_class`` is
        added. Presumably ``complexity`` lies in [0, 1], which would give
        classes 1..5 - confirm against the data.

    Returns
    -------
    None
        The result is only written to disk.
    """
    df = pd.read_csv("data/preprocessed/" + name)

    # Split each expression once; a non-string cell contributes no words.
    words_per_row = [
        token.split() if isinstance(token, str) else []
        for token in df["token"]
    ]

    # NOTE(review): frequencies are counted inside the file being processed, so
    # train and test frequencies come from corpora of different sizes.
    frequency_of_word = {}
    for words in words_per_row:
        for word in words:
            frequency_of_word[word] = frequency_of_word.get(word, 0) + 1

    vowels = set("aeiouAEIOU")
    digits = set("0123456789")

    def vowel_count(text):
        """Count the vowel characters in ``text``."""
        return sum(1 for ch in text if ch in vowels)

    def contain_numeral(text):
        """Return True when ``text`` contains at least one digit."""
        return any(ch in digits for ch in text)

    # sum()/any() over an empty word list give 0/False, which is exactly the
    # neutral fallback wanted for missing tokens.
    df['token_length'] = [sum(len(w) for w in words) for words in words_per_row]
    df['token_syllable'] = [sum(syllables.estimate(w) for w in words) for words in words_per_row]
    df['token_vowels'] = [sum(vowel_count(w) for w in words) for words in words_per_row]
    df['token_frequency'] = [sum(frequency_of_word[w] for w in words) for words in words_per_row]
    df['token_contain_numeral'] = [any(contain_numeral(w) for w in words) for words in words_per_row]
    if isTrain :
        df['nearest_class'] = [round(4*value)+1 for value in df["complexity"].to_numpy()]

    # to_csv cannot create missing parent directories itself.
    os.makedirs("data/added_features", exist_ok=True)
    df.to_csv("data/added_features/"+name,index=False)

In [30]:
# Build the feature-augmented CSVs for both multi-word splits. Only the
# training split gets the extra `nearest_class` column (isTrain=True).
add_multiword_features("lcp_multi_test_preprocessed.csv",False)
add_multiword_features("lcp_multi_train_preprocessed.csv",True)

## ***Models on multi-word (2 - Word)***

In [31]:
# Load the feature-augmented multi-word splits. The first CSV column (`id`)
# becomes the index, which is later used as the prediction ID.
# Note: this rebinds df_Train / df_Test from the single-word section.
data_folder = "data/added_features"
train_csv = os.path.join(data_folder, "lcp_multi_train_preprocessed.csv")
test_csv = os.path.join(data_folder, "lcp_multi_test_preprocessed.csv")
df_Train = pd.read_csv(train_csv, index_col=0)
df_Test = pd.read_csv(test_csv, index_col=0)
df_Test.head()

Unnamed: 0_level_0,corpus,sentence,token,token_length,token_syllable,token_vowels,token_frequency,token_contain_numeral
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3A9LA2FRWSEW9WO7UFA9AE6VQK3XHL,bible,come intending bring bound chief priest,chief priest,11,2,4,7,False
302U8RURJZ1WF35NXY44RD66WL4NVH,bible,day lord take away beauty anklet headband cres...,crescent necklace,16,5,5,2,False
3UDTAB6HH6ZVX00DTRXAOJLWX0B094,bible,unclean shall take ash burning sin offering ru...,sin offering,11,4,4,10,False
3L2OEKSTW9ASGQDOW725GFK5P77Y8D,bible,precious treasure oil dwelling wise foolish ma...,precious treasure,16,5,8,2,False
39N6W9XWRDN795J6F5ET8S13DQKYGT,bible,long god shall adversary reproach,adversary reproach,17,6,6,2,False


In [32]:
# Hand-crafted features as a float matrix. The explicit cast matters: the bool
# `token_contain_numeral` column mixed with the integer columns would otherwise
# make pandas return an object-dtype array, and the stacked matrix with it.
stat_feature = df_Train[['token_length','token_syllable','token_vowels','token_frequency','token_contain_numeral']].to_numpy(dtype=float)

# Fallback vector for words missing from the embedding vocabulary.
unk_vector = word_vector['unk']

# Mean embedding over the words of the target expression.
embed_word = np.array([
    sum(word_vector[w] if w in word_vector else unk_vector for w in tok.split()) / len(tok.split())
    for tok in df_Train['token']
])
# Mean embedding over the whitespace-separated words of each sentence.
embed_sentence = np.array([
    sum(word_vector[w] if w in word_vector else unk_vector for w in sent.split()) / len(sent.split())
    for sent in df_Train['sentence']
])
# 5 stat columns followed by an equal-weight blend of token and sentence embeddings.
Train_Vector = np.hstack((stat_feature,.5*embed_word+.5*embed_sentence))
Train_Vector.shape

(1517, 305)

In [33]:
# Hand-crafted features as a float matrix. The explicit cast matters: the bool
# `token_contain_numeral` column mixed with the integer columns would otherwise
# make pandas return an object-dtype array, and the stacked matrix with it.
stat_feature = df_Test[['token_length','token_syllable','token_vowels','token_frequency','token_contain_numeral']].to_numpy(dtype=float)

# Fallback vector for words missing from the embedding vocabulary.
unk_vector = word_vector['unk']

# Mean embedding over the words of the target expression.
embed_word = np.array([
    sum(word_vector[w] if w in word_vector else unk_vector for w in tok.split()) / len(tok.split())
    for tok in df_Test['token']
])
# Mean embedding over the whitespace-separated words of each sentence.
embed_sentence = np.array([
    sum(word_vector[w] if w in word_vector else unk_vector for w in sent.split()) / len(sent.split())
    for sent in df_Test['sentence']
])
# The two 0.5 blend weights must stay identical to the ones used for Train_Vector.
Test_Vector = np.hstack((stat_feature,.5*embed_word+.5*embed_sentence))
Test_Vector.shape

(184, 305)

In [34]:
# Folder that receives one headerless prediction CSV per multi-word model.
submission_folder = "predictions/multiword_with_features"
# DataFrame.to_csv does not create missing directories, so make sure it exists.
os.makedirs(submission_folder, exist_ok=True)

In [35]:
# Linear Regression
# Ordinary least squares on the stacked multi-word feature matrix. Predictions
# are written as headerless (ID, complexity) rows.
y_train = np.array(df_Train['complexity'])
reg = LinearRegression()
reg.fit(Train_Vector, y_train)
y_pred = reg.predict(Test_Vector)

pred = pd.DataFrame(data={"ID": df_Test.index, "complexity": y_pred})
pred.to_csv(submission_folder+"/linear_regression_with_features.csv", header=False, index=False)

In [36]:
# SVM regressor
# Support-vector regression with scikit-learn's default settings. Predictions
# are written as headerless (ID, complexity) rows.
y_train = np.array(df_Train['complexity'])
reg = SVR()
reg.fit(Train_Vector, y_train)
y_pred = reg.predict(Test_Vector)

pred = pd.DataFrame(data={"ID": df_Test.index, "complexity": y_pred})
pred.to_csv(submission_folder+"/SVM_with_features.csv", header=False, index=False)

In [37]:
# Gradient Boosting
# random_state is fixed so that Restart & Run All reproduces the same model
# (the estimator permutes features randomly when searching for splits).
reg = GradientBoostingRegressor(random_state=0).fit(Train_Vector, np.array(df_Train['complexity']))
y_pred = reg.predict(Test_Vector)

# Headerless (ID, complexity) rows.
pred = pd.DataFrame({"ID":df_Test.index, "complexity":y_pred})
pred.to_csv(submission_folder+"/gradient_boosting_with_features.csv", index=False, header=False)

In [38]:
# MLP Regressor
# One hidden layer of 150 units. `(150,)` is the intended one-element tuple
# (`(150)` is just the int 150). random_state fixes the weight initialisation
# and shuffling so the reported scores are reproducible.
reg = MLPRegressor(hidden_layer_sizes=(150,), random_state=0).fit(Train_Vector, np.array(df_Train['complexity']))
y_pred = reg.predict(Test_Vector)

# Headerless (ID, complexity) rows.
pred = pd.DataFrame({"ID":df_Test.index, "complexity":y_pred})
pred.to_csv(submission_folder+"/MLP_with_features.csv", index=False, header=False)

In [39]:
# Score the prediction files in `submission_folder` against the labelled
# multi-word test set. `evaluate` comes from the project-local `eval` module;
# the output below shows it reporting pearson, spearman, mae, mse and r2 per file.
evaluate(submission_folder, "references/lcp_multi_test_labelled_preprocessed.csv")


For file MLP_with_features.csv
pearson  :  0.6475028865558714
spearman :  0.6342185188430863
mae      :  0.09616392954292978
mse      :  0.015062593128924379
r2       :  0.37595763056893994

For file linear_regression_with_features.csv
pearson  :  0.6419859078673502
spearman :  0.6249874798605937
mae      :  0.09300396735257285
mse      :  0.014517452349322627
r2       :  0.398542781801825

For file gradient_boosting_with_features.csv
pearson  :  0.7223224401942772
spearman :  0.6899679064160567
mae      :  0.08551579078634078
mse      :  0.012048012220533717
r2       :  0.5008515447052295

For file SVM_with_features.csv
pearson  :  0.6427020682357478
spearman :  0.6163305512019623
mae      :  0.0962766986144412
mse      :  0.014806430533685208
r2       :  0.386570432197744
