In [1]:
import pandas as pd
import numpy as np
import nltk
import spacy
from nltk.corpus import wordnet
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.corpus import cmudict
from collections import defaultdict
import math
import torch
import transformers
from transformers import BertTokenizer,BertModel


In [16]:
# Fetch the 'omw-1.4' NLTK data package (a no-op when it is already present,
# as the captured output below shows).
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
import re
import warnings
from string import punctuation

from nltk.tokenize.api import TokenizerI
from nltk.util import ngrams
class SyllableTokenizer(TokenizerI):
    """
    Syllabifies words based on the Sonority Sequencing Principle (SSP).

        >>> from nltk.tokenize import SyllableTokenizer
        >>> from nltk import word_tokenize
        >>> SSP = SyllableTokenizer()
        >>> SSP.tokenize('justification')
        ['jus', 'ti', 'fi', 'ca', 'tion']
        >>> text = "This is a foobar-like sentence."
        >>> [SSP.tokenize(token) for token in word_tokenize(text)]
        [['This'], ['is'], ['a'], ['foo', 'bar', '-', 'li', 'ke'], ['sen', 'ten', 'ce'], ['.']]
    """

    def __init__(self, lang="en", sonority_hierarchy=False):
        """
        Build the character -> sonority level lookup table.

        :param lang: Language parameter, default is English, 'en'. A built-in
                     hierarchy exists only for 'en'; for any other language a
                     ``sonority_hierarchy`` has to be supplied.
        :type lang: str
        :param sonority_hierarchy: Sonority hierarchy according to the
                                   Sonority Sequencing Principle, most
                                   sonorous level first.
        :type sonority_hierarchy: list(str)
        """
        # The hierarchy is given in descending order of sonority and its first
        # level is taken as the vowel inventory. If the vowels are spread over
        # several levels, set ``self.vowels`` to all of them after construction.
        if not sonority_hierarchy and lang == "en":
            sonority_hierarchy = [
                "aeiouy",  # vowels.
                "lmnrw",  # sonorant consonants (liquids, nasals, glide).
                "zvsf",  # fricatives.
                "bcdgtkpqxhj",  # stops and the remaining consonants.
            ]

        self.vowels = sonority_hierarchy[0]
        self.phoneme_map = {}
        level_count = len(sonority_hierarchy)
        for i, level in enumerate(sonority_hierarchy):
            # Earlier levels are more sonorous, hence the higher value.
            sonority_level = level_count - i
            for c in level:
                self.phoneme_map[c] = sonority_level
                self.phoneme_map[c.upper()] = sonority_level

    def assign_values(self, token):
        """
        Assigns each phoneme its value from the sonority hierarchy.
        Note: Sentence/text has to be tokenized first.

        Characters missing from the hierarchy are treated as vowels (highest
        sonority, with a warning) unless they are punctuation, which gets -1.
        Such characters are also remembered in ``self.vowels``.

        :param token: Single word or token
        :type token: str
        :return: List of tuples, first element is character/phoneme and
                 second is the sonority value.
        :rtype: list(tuple(str, int))
        """
        syllables_values = []
        for c in token:
            try:
                syllables_values.append((c, self.phoneme_map[c]))
            except KeyError:
                if c not in punctuation:
                    warnings.warn(
                        "Character not defined in sonority_hierarchy,"
                        " assigning as vowel: '{}'".format(c)
                    )
                    syllables_values.append((c, max(self.phoneme_map.values())))
                    # Register the character only once. Appending it on every
                    # occurrence inflated the vowel count used by tokenize(),
                    # so repeated calls on one instance could give different
                    # results for the same token.
                    if c not in self.vowels:
                        self.vowels += c
                else:  # If it's a punctuation, assign -1.
                    syllables_values.append((c, -1))
        return syllables_values

    def validate_syllables(self, syllable_list):
        """
        Ensures each syllable has at least one vowel.
        A vowel-less fragment is merged into the preceding syllable, or held
        back and prepended to the first vowel-bearing syllable when nothing
        precedes it. Punctuation marks are kept as separate items. The
        characters of the input are never dropped or reordered.

        :param syllable_list: Single word or token broken up into syllables.
        :type syllable_list: list(str)
        :return: Single word or token broken up into syllables
                 (with added syllables if necessary)
        :rtype: list(str)
        """
        valid_syllables = []
        front = ""  # leading vowel-less fragments waiting for a syllable
        for syllable in syllable_list:
            if not syllable:
                # Consecutive punctuation marks leave an empty fragment
                # between them; it is not a syllable.
                continue
            if syllable in punctuation:
                if front:
                    # Emit pending consonants before the punctuation mark;
                    # otherwise they would be lost, since nothing is
                    # prepended once the result is non-empty.
                    valid_syllables.append(front)
                    front = ""
                valid_syllables.append(syllable)
                continue
            # Plain membership test instead of a regex built from raw characters.
            has_vowel = any(vowel in syllable for vowel in self.vowels)
            if has_vowel:
                if not valid_syllables:
                    valid_syllables.append(front + syllable)
                    front = ""
                else:
                    valid_syllables.append(syllable)
            elif not valid_syllables:
                front += syllable
            else:
                valid_syllables[-1] += syllable

        if front:
            # No vowel-bearing syllable ever followed: keep the fragment.
            valid_syllables.append(front)

        return valid_syllables

    def tokenize(self, token):
        """
        Apply the SSP to return a list of syllables.
        Note: Sentence/text has to be tokenized first.

        :param token: Single word or token
        :type token: str
        :return syllable_list: Single word or token broken up into syllables.
        :rtype: list(str)
        """
        # assign values from hierarchy
        syllables_values = self.assign_values(token)

        # if only one vowel return word
        if sum(token.count(x) for x in self.vowels) <= 1:
            return [token]

        syllable_list = []
        syllable = syllables_values[0][0]  # start syllable with first phoneme
        for trigram in ngrams(syllables_values, n=3):
            phonemes, values = zip(*trigram)
            # Sonority of previous, focal and following phoneme
            prev_value, focal_value, next_value = values
            # Focal phoneme.
            focal_phoneme = phonemes[1]

            # These cases trigger syllable break.
            if focal_value == -1:  # If it's a punctuation, just break.
                syllable_list.append(syllable)
                syllable_list.append(focal_phoneme)
                syllable = ""
            elif prev_value >= focal_value == next_value:
                # Sonority plateau: close the syllable after the focal phoneme.
                syllable += focal_phoneme
                syllable_list.append(syllable)
                syllable = ""
            elif prev_value > focal_value < next_value:
                # Sonority trough: the focal phoneme opens the next syllable.
                syllable_list.append(syllable)
                syllable = focal_phoneme
            else:
                # no syllable break
                syllable += focal_phoneme

        syllable += syllables_values[-1][0]  # append last phoneme
        syllable_list.append(syllable)

        return self.validate_syllables(syllable_list)


In [22]:
class Features:
    """
    Builds the feature table for a lexical-complexity TSV file.

    The file is expected to provide the columns ``id``, ``sentence``,
    ``token``, ``complexity`` and ``subcorpus`` (renamed to ``corpus``);
    these are the columns the methods below read or drop.

    Three resources are created once, when the class statement executes, and
    are shared by all instances: the GloVe lookup table, the BERT tokenizer
    and the BERT model.
    """

    # word -> GloVe vector (float32). Vectors are expected to have 300
    # components, matching the zero vector used for unknown words.
    emmbed_dict = {}
    # Explicit UTF-8: without it the platform default encoding is used, which
    # can fail on or corrupt non-ASCII vocabulary entries.
    with open('../input/glove300/glove.42B.300d.txt','r',encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Tolerate blank lines instead of failing on values[0].
                continue
            word = values[0]
            vector = np.asarray(values[1:],'float32')
            emmbed_dict[word]=vector
    tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')
    model=BertModel.from_pretrained('bert-base-uncased')

    def __init__(self,path):
        """
        Load the data set.

        :param path: Location of the tab-separated input file.
        """
        self.df=pd.read_csv(path,encoding='utf-8', delimiter='\t', quotechar='\t', keep_default_na=False)
        self.df=self.df.rename(columns={'subcorpus':'corpus'})
        # Upper bound on the encoded sentence length, special tokens included.
        self.maxlen=512

    def remove_whitespace(self,text):
        """Lower-case ``text`` and collapse every whitespace run to one space."""
        return " ".join(text.lower().strip().split())

    def remove_punct(self,text):
        """Keep only the ``\\w+`` runs of ``text``, joined by single spaces."""
        word_tokenizer=RegexpTokenizer(r"\w+")
        return " ".join(word_tokenizer.tokenize(text))

    def remove_stopword(self,text):
        """Drop NLTK English stopwords from a whitespace-separated ``text``."""
        # A set gives constant-time membership tests.
        en_stopwords=set(stopwords.words('english'))
        return " ".join(t for t in text.split() if t not in en_stopwords)

    def synonyms(self,word):
        """Return the number of distinct lemma names over all WordNet synsets of ``word``."""
        names={lemma.name() for syn in wordnet.synsets(word) for lemma in syn.lemmas()}
        return len(names)

    def antonyms(self,word):
        """
        Return the number of distinct antonym names of ``word`` in WordNet.

        Only the first antonym of each lemma is considered.
        """
        names=set()
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                opposites=lemma.antonyms()
                if opposites:
                    names.add(opposites[0].name())
        return len(names)

    def hypernyms(self,word):
        """Return the number of hypernyms of the first WordNet synset of ``word`` (0 if none)."""
        # Explicit "no synset" check instead of a bare ``except`` that hid
        # every other failure as well.
        results=wordnet.synsets(word)
        if not results:
            return 0
        return len(results[0].hypernyms())

    def hyponyms(self,word):
        """Return the number of hyponyms of the first WordNet synset of ``word`` (0 if none)."""
        results=wordnet.synsets(word)
        if not results:
            return 0
        return len(results[0].hyponyms())

    def clean(self):
        """
        Normalise the text columns of ``self.df``.

        NaN cells become the string "null"; sentences are lower-cased,
        whitespace-normalised and stripped of punctuation; tokens are
        lower-cased and whitespace-normalised.
        """
        self.df=self.df.replace(to_replace=np.nan, value="null")
        self.df['sentence']=self.df['sentence'].apply(self.remove_whitespace)
        self.df['sentence']=self.df['sentence'].apply(self.remove_punct)
        self.df['token']=self.df['token'].apply(self.remove_whitespace)

    def tf(self):
        """
        Return ``log10(1 + count)`` for every token.

        ``count`` is the number of times the token occurs as a
        whitespace-separated word over all sentences of the data set.
        """
        token_freq=defaultdict(int)
        for sen in self.df['sentence']:
            for s in sen.split():
                token_freq[s]+=1
        return [math.log10(1+token_freq[t]) for t in self.df['token']]

    def vowels(self):
        """Return, per token, the number of characters among ``a e i o u`` (lower-case only)."""
        vowel='aeiou'
        return [sum(1 for s in token if s in vowel) for token in self.df['token']]

    def token_len(self):
        """Return the character length of every token."""
        return [len(t) for t in self.df['token']]

    def syllable_count(self):
        """
        Return the syllable count of every token.

        The first CMU dictionary pronunciation is used when the token is
        listed (phones ending in a digit are counted); otherwise the token is
        split with ``SyllableTokenizer`` and the pieces are counted.
        """
        d=cmudict.dict()
        res=[]
        for word in self.df['token']:
            pronunciations=d.get(word.lower())
            if pronunciations:
                val=sum(1 for phone in pronunciations[0] if phone[-1].isdigit())
            else:
                # A fresh tokenizer per word is deliberate: the tokenizer
                # extends its own vowel inventory when it meets unknown
                # characters, so sharing one instance could let earlier words
                # influence later ones.
                val=len(SyllableTokenizer().tokenize(word))
            res.append(val)
        return res

    def pos_tagger(self):
        """
        Return ``self.df`` extended with one-hot ``NOUN``/``ADJ``/``VERB``/``ADV`` columns.

        For each row the sentence is tagged with spaCy; the first sentence
        token whose text equals the target token and whose tag is one of the
        four classes sets the corresponding column to 1.
        """
        tagger=spacy.load('en_core_web_sm')
        length=self.df.shape[0]
        pos_tags={'NOUN':[0]*length,'ADJ':[0]*length,'VERB':[0]*length,'ADV':[0]*length}
        # Row positions (not index labels) address the lists, so this also
        # works when the frame does not carry a default 0..n-1 index.
        for position,(sentence,token) in enumerate(zip(self.df['sentence'],self.df['token'])):
            for tok in tagger(sentence):
                if tok.text==token and tok.pos_ in pos_tags:
                    pos_tags[tok.pos_][position]+=1
                    break
        temp_df=pd.DataFrame.from_dict(pos_tags)
        temp_df.index=self.df.index  # align with the source rows for concat
        return pd.concat([self.df,temp_df],axis=1)

    def pos_tagger1(self):
        """
        Return an ordinal part-of-speech code per row.

        The code is taken from the first sentence token whose text equals the
        target token; unlisted tags and unmatched tokens yield 0.
        """
        tagger=spacy.load('en_core_web_sm')
        pos_tags={'NOUN':11,'ADJ':10,'VERB':9,'ADV':8,'ADP':7,'CONJ':6,'CCONJ':5,'AUX':4,'DET':3,'PRON':2,'SCONJ':1}
        res=[]
        for sentence,token in zip(self.df['sentence'],self.df['token']):
            cur=0
            for tok in tagger(sentence):
                if tok.text==token:
                    cur=pos_tags.get(tok.pos_,0)
                    break
            res.append(cur)
        return res

    def glove_token(self,token):
        """Return the GloVe vector of ``token``, or a 300-component zero vector if unknown."""
        vector=Features.emmbed_dict.get(token)
        if vector is None:
            return np.zeros((300,))
        return vector

    def glove_embedding(self):
        """Return an array with one GloVe row (float64) per token."""
        # Adding to a float64 zero vector keeps the dtype uniform for known
        # (float32) and unknown (zero) tokens alike.
        return np.array([np.zeros((300,))+self.glove_token(t) for t in self.df['token']])

    def bert_no_of_tokens(self):
        """Return the number of BERT word pieces of every token."""
        return [len(Features.tokenizer.tokenize(t)) for t in self.df['token']]

    def index_of_token(self,text,token):
        """
        Locate ``token`` among the BERT word pieces of ``text``.

        :param text: Sentence to search in.
        :param token: Target word.
        :return: Positions (in the piece sequence without special tokens) of
                 the first run of pieces that spells ``token`` exactly; an
                 empty list when no such run lies within the first
                 ``self.maxlen - 2`` pieces.
        :rtype: list(int)
        """
        tokens=Features.tokenizer.tokenize(text)
        limit=self.maxlen-2  # leaves room for the two special tokens
        ids=[]
        i=0
        while i<len(tokens) and i<limit:
            if not token.startswith(tokens[i]):
                i+=1
                continue
            # Candidate start: extend it with '##' continuation pieces for as
            # long as the concatenation is still a prefix of the target.
            s=tokens[i]
            ids.append(i)
            i+=1
            while i<len(tokens) and i<limit and tokens[i].startswith('##') and token.startswith(s+tokens[i][2:]):
                s+=tokens[i][2:]
                ids.append(i)
                i+=1
            if s==token:
                break
            # Only a partial match: discard it and continue the scan.
            ids.clear()
        return ids

    def contextual_features(self):
        """
        Return BERT-based features, one row per data row.

        Each row is the concatenation of the second-to-last hidden-state
        vectors of the first three word pieces of the target token
        (zero-padded when the token has fewer pieces or is not found),
        followed by the model's pooled output for the sentence.

        :rtype: numpy.ndarray
        """
        res1=[]
        res2=[]
        DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
        Features.model=Features.model.to(DEVICE)
        Features.model.eval()
        for _,row in self.df.iterrows():
            with torch.no_grad():
                # Same length limit as index_of_token, so located positions
                # always exist in the encoded sequence.
                emb=Features.tokenizer(row['sentence'],return_tensors='pt',truncation=True,max_length=self.maxlen).to(DEVICE)
                pred=Features.model(input_ids=emb['input_ids'],attention_mask=emb['attention_mask'],output_hidden_states=True)
                layer=pred.hidden_states[-2][0]
                ids=self.index_of_token(row['sentence'],row['token'])
                # +1 skips the leading special token that the encoding call
                # adds but tokenize() (used by index_of_token) does not.
                pieces=[layer[idx+1].reshape(1,768) for idx in ids[:3]]
                while len(pieces)<3:
                    pieces.append(torch.zeros(1,768).to(DEVICE))
                emb_token=torch.hstack(tuple(pieces))

            res1.append(emb_token)
            res2.append(pred.pooler_output)
        res1=np.array(torch.cat(res1).detach().cpu())
        res2=np.array(torch.cat(res2).detach().cpu())

        return np.hstack((res1,res2))

    def preprocess(self):
        """
        Run the whole pipeline and return the model-ready table.

        Column order: hand-crafted lexical features, part-of-speech
        indicators, BERT piece count, corpus dummies, GloVe and BERT vectors
        (integer column labels) and finally ``complexity``.

        :rtype: pandas.DataFrame
        """
        self.clean()
        self.df['synonyms'] = self.df['token'].apply(self.synonyms)
        self.df['antonyms'] = self.df['token'].apply(self.antonyms)
        # NOTE: the hypernym count feature is currently disabled.
        #self.df['hypernyms'] = self.df['token'].apply(self.hypernyms)
        self.df['hyponyms'] = self.df['token'].apply(self.hyponyms)
        self.df['vowels']=self.vowels()
        self.df['token_len']=self.token_len()
        self.df['tf']=self.tf()
        self.df['syllable']=self.syllable_count()
        self.df=self.pos_tagger()
        self.df['bert_token_count']=self.bert_no_of_tokens()
        dummy=pd.get_dummies(self.df['corpus'],drop_first=True,prefix='corpus_')
        df_y=self.df['complexity']
        df_x=self.df.drop(columns=['corpus','id','sentence','token','complexity'])
        df_x = pd.concat([df_x, dummy],axis=1)
        np_vectors=np.hstack((self.glove_embedding(),self.contextual_features()))
        # Reuse the row index so the column-wise concat aligns row by row.
        temp_df=pd.DataFrame(np_vectors,index=df_x.index)
        df_x=pd.concat([df_x,temp_df],axis=1)
        return pd.concat([df_x,df_y],axis=1)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
# Build the feature table for the trial split and store it as CSV.
single_trail = Features(path='../input/lcp-trail/lcp_single_trial.tsv').preprocess()
single_trail.to_csv(path_or_buf='single_trail.csv')

In [25]:
# Build the feature table for the training split and store it as CSV.
single_train = Features(path='../input/lcp-train/lcp_single_train.tsv').preprocess()
single_train.to_csv(path_or_buf='single_train.csv')

In [26]:
# Build the feature table for the test split and store it as CSV.
single_test = Features(path='../input/lcp-test/lcp_single_test.tsv').preprocess()
single_test.to_csv(path_or_buf='single_test.csv')

In [3]:
#import necessary libraries
from sklearn.metrics import r2_score
from scipy.stats import pearsonr,spearmanr
from sklearn.metrics import mean_squared_error,mean_absolute_error

# Compute and print the evaluation metrics for a set of predictions
def scores(actuall_labels, predicted_labels):
    """
    Print evaluation metrics comparing predictions with the true labels.

    Printed, in order: Pearson correlation, R2, Spearman correlation,
    mean squared error and mean absolute error. Nothing is returned.

    :param actuall_labels: Ground-truth values.
    :param predicted_labels: Predicted values, in the same order.
    """
    metric_table = (
        ('Pearson Score: ', pearsonr),
        ('R2_Score: ', r2_score),
        ('Spearmanr Score: ', spearmanr),
        ('Mean Squared Error(MSE): ', mean_squared_error),
        ('Mean Absolute Error(MAE): ', mean_absolute_error),
    )
    for label, metric in metric_table:
        print(label, metric(actuall_labels, predicted_labels))


In [4]:
# Reload the stored feature tables; the first CSV column holds the saved row index.
train = pd.read_csv('./single_train.csv', index_col=[0])
test = pd.read_csv('./single_test.csv', index_col=[0])


In [5]:
# Separate the regression target from the feature columns (training split).
train_y = train['complexity']
train_x = train.drop(columns=['complexity'])

In [6]:
# Separate the regression target from the feature columns (test split).
test_y = test['complexity']
test_x = test.drop(columns=['complexity'])

In [7]:
# Convert the frames and series into plain NumPy arrays for the estimator.
trainx, trainy = np.array(train_x), np.array(train_y)
testx, testy = np.array(test_x), np.array(test_y)


In [8]:
from sklearn.ensemble import GradientBoostingRegressor

# Fit a gradient-boosted regressor on the training features (fixed seed for
# reproducibility) and report its metrics on the test split.
gbr = GradientBoostingRegressor(n_estimators=100, random_state=19)
gbr.fit(trainx, trainy)
pred4 = gbr.predict(testx)
scores(testy, pred4)

Pearson Score:  (0.7398624271653934, 1.1029380343801114e-159)
R2_Score:  0.5462761316286369
Spearmanr Score:  SpearmanrResult(correlation=0.7100449178537698, pvalue=1.5367627681660304e-141)
Mean Squared Error(MSE):  0.0073435550333192864
Mean Absolute Error(MAE):  0.06569798774817318
