In [146]:
import nltk
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
    
nltk.download('senseval')

[nltk_data] Error loading senseval2: Package 'senseval2' not found in
[nltk_data]     index


False

In [2]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/ihar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from nltk.corpus import wordnet as wn

In [4]:
from nltk.corpus import senseval as se

In [5]:
se.fileids()

['hard.pos',
 'interest.pos',
 'line.pos',
 'serve.pos',
 'test.pos',
 'train.pos',
 'trial.pos']

In [622]:
from typing import List, Tuple, Optional

def get_word_senses(word: str, pos: str = None) -> List[Tuple[str, str]]:
    """
    Get all possible word sences with their dictionary definition
    :param word: a word to check
    :param pos: a part of speech of that word
    """

    w = wn.morphy(word.lower(), pos)
    sences = []
    if w:
        lemmas = wn.lemmas(w, pos)
        for l in lemmas:
            sences.append((l.key(), l.synset().definition()))
            
    return sences


In [623]:
get_word_senses('art')

[('art%1:06:00::',
  'the products of human creativity; works of art collectively'),
 ('art%1:04:00::', 'the creation of beautiful or significant things'),
 ('art%1:09:00::',
  'a superior skill that you can learn by study and practice and observation'),
 ('art%1:10:00::',
  'photographs or other visual representations in a printed publication')]

In [645]:
def get_data(filename):
    X, y = [], []
    for instance in se.instances(filename):
        word = instance.word.split('.')[0]
        
        sense_chosen = None
        for sense in instance.senses:
            if word in sense:
                sense_chosen = sense
                
        if not sense_chosen:
            # Example in data: word = 'circuit' and senses are ('lap%1:04:01::',)
            if len(instance.senses[0]) > 1: # do not include sense = 'U'
                sense_chosen = instance.senses[0]
                
        if sense_chosen:
            sense_chosen = word + '%' + sense_chosen.split('%')[-1]
            
            senses = get_word_senses(word)
            # Select only senses that are present in wordnet
            for sense in senses:
                if sense_chosen == sense[0]:
                    X.append((word, instance.context))
                    y.append(sense_chosen)
                    break
        
    return X, y
    
X, y = get_data('train.pos')

In [646]:
from collections import defaultdict

common_sence = defaultdict(lambda: defaultdict(int))
all_ys = set()
sense_ids = defaultdict(list)

for i, sample in enumerate(X):
    common_sence[sample[0]][y[i]] += 1
    all_ys.add(y[i])
    sense_ids[y[i]].append(i)

score = 0
for i, sample in enumerate(X):
    pred = max(common_sence[sample[0]].items(), key=lambda x: x[1])[0]
    if pred == y[i]:
        score += 1.
        
print(f'Acc. of most common guess {score / len(X)}')

Acc. of most common guess 0.5165228113440198


In [675]:
from nltk.corpus import stopwords
from typing import Set, Callable
from nltk import word_tokenize

english_stopwords = set(stopwords.words('english'))


def simple_intersection(words_1: List[str], words_2: str) -> float:
    """
    Find how many words are shared between two lists
    """
        
    score = 1
    for word in words_1:
        if word in words_2:
            score += 1.
            
    return score / (max(len(words_1), len(words_2)) + 1)

def simplified_lesk(obj_word: str, context: List[str], stopwords: Set[str] = english_stopwords,
                   score_func: Callable = simple_intersection,
                   return_scores: bool = False,
                   prob_multiply: bool = False,
                   alpha: float = 0.4) -> [Optional[str], List[Tuple[str, float]]]:
    """
    Find context overlap with Wordnet difinitions in order to identify a sence of a word
    :param obj_word: a word to check
    :param context: a list of words describing context where the word was used
    :param stopwords: a list of stopwords to omit from definitions and context
    :param scoring_function: a function to compare two defintions and return their similarity score
    :param prob_multiply: whether to add probability of sence into score calculation
    :param alpha: how much value to give to Lesk score
    :return a wordnet sence or a list of score for each sence
    """
    
    def clear_words(words):
        cleared_words = []
        for word in words:
            w = wn.morphy(word.lower())
            if w and w not in stopwords and w not in obj_word:
                cleared_words.append(w)
            elif not w and word.lower() not in stopwords:
                cleared_words.append(word.lower())
        return cleared_words
    
    cleared_context = clear_words(context)
    senses = get_word_senses(obj_word)
    
    max_score = -1e9
    best_sense = None
    
    scores = []

    sum_scores = 0
    sum_probs = np.sum(list(common_sence[obj_word].values()))

    for sense in senses:
        if sense[0] not in all_ys:
            continue
            
        score = score_func(clear_words(word_tokenize(sense[1].lower())), cleared_context)
            
        scores.append((score, sense[0]))
        sum_scores += score
    
    # print(scores)
    for i, score in enumerate(scores):
        if prob_multiply:
            new_score = alpha * (score[0] / sum_scores) +\
                (1 - alpha) * (common_sence[obj_word][score[1]] / sum_probs)
            scores[i] = (new_score, score[1])
            
        if scores[i][0] > max_score:
            max_score = scores[i][0]
            best_sense = scores[i][1]
            
    if return_scores:
        return scores
    else:
        return best_sense
    

In [676]:
score = 0

mappings = defaultdict(lambda: defaultdict(int))
for i, sample in enumerate(X):
    pred = simplified_lesk(sample[0], sample[1], prob_multiply=False)
    mappings[y[i]][pred] += 1
    if pred == y[i]:
        score += 1.
        
print(f'Acc. of lesk with simple distance {score / len(X)}')

Acc. of lesk with simple distance 0.3409371146732429


In [672]:
mappings

defaultdict(<function __main__.<lambda>>,
            {'art%1:04:00::': defaultdict(int,
                         {'art%1:04:00::': 4,
                          'art%1:06:00::': 35,
                          'art%1:09:00::': 2}),
             'art%1:06:00::': defaultdict(int,
                         {'art%1:04:00::': 4,
                          'art%1:06:00::': 80,
                          'art%1:09:00::': 3}),
             'art%1:09:00::': defaultdict(int,
                         {'art%1:04:00::': 2,
                          'art%1:06:00::': 48,
                          'art%1:09:00::': 5}),
             'authority%1:07:00::': defaultdict(int,
                         {'authority%1:07:00::': 45,
                          'authority%1:09:00::': 2,
                          'authority%1:10:00::': 1,
                          'authority%1:18:00::': 6,
                          'authority%1:18:01::': 11}),
             'authority%1:07:02::': defaultdict(int,
                        

Lesk performs worse then most common approach

In [649]:
score = 0

mappings = defaultdict(lambda: defaultdict(int))
for i, sample in enumerate(X):
    pred = simplified_lesk(sample[0], sample[1], prob_multiply=True)
    mappings[y[i]][pred] += 1
    if pred == y[i]:
        score += 1.
        
print(f'Acc. of lesk with simple distance and probality of a sense {score / len(X)}')

Acc. of lesk with simple distance and probality of a sense 0.5187422934648582


After adding probability of a word into score equation, we still achieve results similar to just most common guess

In [650]:
from gensim.models.keyedvectors import KeyedVectors

word2vec_weights_path = '/home/ihar/kaggle/NLP_HW_1/GoogleNews-vectors-negative300.bin'

w2v_model = KeyedVectors.load_word2vec_format(word2vec_weights_path, binary=True)
w2v_model.init_sims(replace=True)  # normalize vectors

In [686]:
import numpy as np

def vectorise(words: List[str]) -> np.array:
    """
    Identify the vector values for each word in the given document
    :param words: a document to vectorise
    :return: a mean vector of word2vec vectors of the document
    """
    word_vecs = []
    for word in words:
        try:
            vec = w2v_model[word]
            word_vecs.append(vec)
        except KeyError:
            # Ignore, if the word doesn't exist in the vocabulary
            pass

    # Assuming that document vector is the mean of all the word vectors
    if len(word_vecs) == 0:
        return None
    vector = np.mean(word_vecs, axis=0)
    return vector

def cosine_sim(vec_a: np.array, vec_b: np.array):
    """
    Find the cosine similarity distance between two vectors.
    :param vec_a: a vector
    :param vec_b: a vector
    :return: a cosine similarity
    """
    csim = np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
    if np.isnan(np.sum(csim)):
        return 0
    return csim

def word2vec_cosdist(words_1: List[str], words_2: str) -> float:
    """
    Find how many words are shared between two lists
    """
        
    doc1, doc2 = vectorise(words_1), vectorise(words_2)
    
    score = cosine_sim(doc1, doc2)

    return score

In [687]:
score = 0

mappings = defaultdict(lambda: defaultdict(int))
for i, sample in enumerate(X):
    pred = simplified_lesk(sample[0], sample[1], score_func=word2vec_cosdist)

    mappings[y[i]][pred] += 1
        
    if pred == y[i]:
        score += 1.
        
print(f'Acc. of lesk with Word2Vec distance {score / len(X)}')

Acc. of lesk with Word2Vec distance 0.28865598027127004


In [674]:
mappings

defaultdict(<function __main__.<lambda>>,
            {'art%1:04:00::': defaultdict(int,
                         {'art%1:04:00::': 34,
                          'art%1:06:00::': 4,
                          'art%1:09:00::': 3}),
             'art%1:06:00::': defaultdict(int,
                         {'art%1:04:00::': 75,
                          'art%1:06:00::': 5,
                          'art%1:09:00::': 7}),
             'art%1:09:00::': defaultdict(int,
                         {'art%1:04:00::': 41,
                          'art%1:06:00::': 6,
                          'art%1:09:00::': 8}),
             'authority%1:07:00::': defaultdict(int,
                         {'authority%1:07:00::': 31,
                          'authority%1:14:00::': 3,
                          'authority%1:18:00::': 1,
                          'authority%1:18:01::': 30}),
             'authority%1:07:02::': defaultdict(int,
                         {'authority%1:07:00::': 6, 'authority%1:18:01::': 6

In [653]:
score = 0

mappings = defaultdict(lambda: defaultdict(int))
for i, sample in enumerate(X):
    pred = simplified_lesk(sample[0], sample[1], score_func=word2vec_cosdist, prob_multiply=True, 
                          alpha=0.6)
    mappings[y[i]][pred] += 1
        
    if pred == y[i]:
        score += 1.
        
print(f'Acc. of lesk with Word2Vec distance {score / len(X)}')

Acc. of lesk with Word2Vec distance 0.523921085080148


# ML approach

Use raw scores gerenerated by wor2dvec alogrithm and sense distrubtion in order to train a classifier

In [654]:
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

X = np.array(X)


word_labels = preprocessing.LabelEncoder().fit(X[:,0])
context_tf = TfidfVectorizer(min_df=0.05)
sentences = []

for sample in X:
    sentences.append(' '.join(sample[1]))

context_tf = context_tf.fit(sentences)    
 
X_new = []
y_new = []
max_len = 20

for j, sample in enumerate(X):

    new_sample = []
    new_sample.append(word_labels.transform([sample[0]])[0])
    
    pred = simplified_lesk(sample[0], sample[1], score_func=word2vec_cosdist, return_scores=True)

    max_len = max(max_len, len(pred))
    new_sample.extend(np.array(pred)[:, 0].astype(np.float32))
   
    for i in range(max_len - len(pred)):
        new_sample.append(0.)
        
    y_ind = None
    for i, y_pred in enumerate(pred):
        new_sample.append(common_sence[sample[0]][y_pred[1]])
        if y_pred[1] == y[j]:
            y_ind = i
            
    for i in range(max_len - len(pred)):
        new_sample.append(0.)
            
    if y_ind:
        X_new.append(np.array(new_sample))
        y_new.append(y_ind)
    
X_new = np.array(X_new)
y_new = np.array(y_new)
print(max_len)

20


In [655]:
from sklearn.model_selection import train_test_split
X_train, y_train = X_new, y_new

In [656]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
from sklearn.utils import class_weight
import gc


n_folds = 5
from sklearn.metrics.scorer import make_scorer

def rmsle_cv(model_class):
    """
    A function to test a model performance using k-fold cross validation
    """
    
    kf = StratifiedKFold(n_folds, shuffle=True)
    scores = []
    for train_index, test_index in kf.split(X_train, y_train):
        X, y = X_train[train_index], y_train[train_index]
        X, X_val, y, y_val = train_test_split(X, y, test_size=0.1, stratify=y)
        
        model = model_class(None)
    
        model.fit(X, y, eval_set=(X_val, y_val))
        X, y = X_train[test_index], y_train[test_index]
        y_pred = model.predict(X)
        score = accuracy_score(y, y_pred)
        scores.append(score)
        print(score)
  
    return(np.array(scores))

In [657]:
import catboost


model_cat = lambda class_weights: catboost.CatBoostClassifier(task_type='GPU', loss_function='MultiClass',
                                                              class_weights=class_weights,
                                            iterations=1000, max_depth=6, thread_count=12, verbose=False,
                                                          bagging_temperature=0.05,
                                                              learning_rate=0.05,
                                                         )


In [658]:
score = rmsle_cv(model_cat)
print("CatBoost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))



0.5553319919517102
0.5181451612903226
0.5476673427991886
0.5218718209562564
0.5412844036697247
CatBoost score: 0.5369 (0.0145)



In [659]:
import lightgbm as lgb
model_lgb = lambda class_weights: lgb.LGBMClassifier(
    num_leaves=200,
    learning_rate=0.05, n_estimators=115,
    bagging_fraction = 0.8,
    bagging_freq = 3,
    min_data_in_leaf=80,
    device='gpu',
    # is_unbalance=True,
    metrics=['acc'],
    n_jobs=-1,
    verbose=0,
)
score = rmsle_cv(model_lgb)



[1]	valid_0's multi_logloss: 2.78399
[2]	valid_0's multi_logloss: 2.66567
[3]	valid_0's multi_logloss: 2.56947
[4]	valid_0's multi_logloss: 2.48596
[5]	valid_0's multi_logloss: 2.41318
[6]	valid_0's multi_logloss: 2.3498
[7]	valid_0's multi_logloss: 2.29311
[8]	valid_0's multi_logloss: 2.24231
[9]	valid_0's multi_logloss: 2.19677
[10]	valid_0's multi_logloss: 2.15248
[11]	valid_0's multi_logloss: 2.1133
[12]	valid_0's multi_logloss: 2.07788
[13]	valid_0's multi_logloss: 2.04482
[14]	valid_0's multi_logloss: 2.01283
[15]	valid_0's multi_logloss: 1.98309
[16]	valid_0's multi_logloss: 1.95725
[17]	valid_0's multi_logloss: 1.93299
[18]	valid_0's multi_logloss: 1.91025
[19]	valid_0's multi_logloss: 1.88885
[20]	valid_0's multi_logloss: 1.86683
[21]	valid_0's multi_logloss: 1.84574
[22]	valid_0's multi_logloss: 1.82702
[23]	valid_0's multi_logloss: 1.80842
[24]	valid_0's multi_logloss: 1.79081
[25]	valid_0's multi_logloss: 1.77345
[26]	valid_0's multi_logloss: 1.75723
[27]	valid_0's multi_lo

[105]	valid_0's multi_logloss: 1.26371
[106]	valid_0's multi_logloss: 1.26239
[107]	valid_0's multi_logloss: 1.26224
[108]	valid_0's multi_logloss: 1.26097
[109]	valid_0's multi_logloss: 1.25947
[110]	valid_0's multi_logloss: 1.25813
[111]	valid_0's multi_logloss: 1.25791
[112]	valid_0's multi_logloss: 1.25741
[113]	valid_0's multi_logloss: 1.25683
[114]	valid_0's multi_logloss: 1.25762
[115]	valid_0's multi_logloss: 1.25701
0.513622603430878
[1]	valid_0's multi_logloss: 2.77456
[2]	valid_0's multi_logloss: 2.65072
[3]	valid_0's multi_logloss: 2.54508
[4]	valid_0's multi_logloss: 2.45638
[5]	valid_0's multi_logloss: 2.37756
[6]	valid_0's multi_logloss: 2.30923
[7]	valid_0's multi_logloss: 2.25011
[8]	valid_0's multi_logloss: 2.19627
[9]	valid_0's multi_logloss: 2.14716
[10]	valid_0's multi_logloss: 2.10198
[11]	valid_0's multi_logloss: 2.0604
[12]	valid_0's multi_logloss: 2.02304
[13]	valid_0's multi_logloss: 1.98674
[14]	valid_0's multi_logloss: 1.9537
[15]	valid_0's multi_logloss: 1.

[90]	valid_0's multi_logloss: 1.36182
[91]	valid_0's multi_logloss: 1.36058
[92]	valid_0's multi_logloss: 1.35958
[93]	valid_0's multi_logloss: 1.35874
[94]	valid_0's multi_logloss: 1.35796
[95]	valid_0's multi_logloss: 1.35627
[96]	valid_0's multi_logloss: 1.35635
[97]	valid_0's multi_logloss: 1.3544
[98]	valid_0's multi_logloss: 1.35369
[99]	valid_0's multi_logloss: 1.352
[100]	valid_0's multi_logloss: 1.35091
[101]	valid_0's multi_logloss: 1.34935
[102]	valid_0's multi_logloss: 1.34871
[103]	valid_0's multi_logloss: 1.34836
[104]	valid_0's multi_logloss: 1.34847
[105]	valid_0's multi_logloss: 1.34859
[106]	valid_0's multi_logloss: 1.34889
[107]	valid_0's multi_logloss: 1.34894
[108]	valid_0's multi_logloss: 1.34836
[109]	valid_0's multi_logloss: 1.34761
[110]	valid_0's multi_logloss: 1.34793
[111]	valid_0's multi_logloss: 1.34805
[112]	valid_0's multi_logloss: 1.34752
[113]	valid_0's multi_logloss: 1.34755
[114]	valid_0's multi_logloss: 1.34686
[115]	valid_0's multi_logloss: 1.34589

In [660]:
print("LGBM score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

LGBM score: 0.5329 (0.0112)



In [661]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models, class_weights):
        self.models = models
        self.class_weights = class_weights
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y, eval_set):
        y_new = y.copy()
        self.models_ = [x(self.class_weights) for x, _ in self.models]
        self.weights = [weight for _, weight in self.models]
        
        # Train cloned base models
        for model in self.models_:
            
            model.fit(X, y_new,
                      eval_set=eval_set)


        return self
    
    # Now we do the predictions for cloned models and average them
    
    def predict(self, X):
        sum_weight = 0
        for weight in self.weights:
            sum_weight += weight
        predictions = np.add(*[
            model.predict_proba(X) * (self.weights[i] / sum_weight) for i, model in enumerate(self.models_)
        ])
  
        
        return np.argmax(predictions, axis=1) + 1 

In [664]:
averaged_models = lambda class_weights: AveragingModels(models = ((model_cat, 1), (model_lgb, 1)),
                                                        class_weights=class_weights)

score = rmsle_cv(averaged_models)



[1]	valid_0's multi_logloss: 2.78678
[2]	valid_0's multi_logloss: 2.66461
[3]	valid_0's multi_logloss: 2.56583
[4]	valid_0's multi_logloss: 2.47783
[5]	valid_0's multi_logloss: 2.40212
[6]	valid_0's multi_logloss: 2.33579
[7]	valid_0's multi_logloss: 2.27584
[8]	valid_0's multi_logloss: 2.22212
[9]	valid_0's multi_logloss: 2.17411
[10]	valid_0's multi_logloss: 2.12839
[11]	valid_0's multi_logloss: 2.08708
[12]	valid_0's multi_logloss: 2.04823
[13]	valid_0's multi_logloss: 2.01179
[14]	valid_0's multi_logloss: 1.97885
[15]	valid_0's multi_logloss: 1.94769
[16]	valid_0's multi_logloss: 1.91923
[17]	valid_0's multi_logloss: 1.89154
[18]	valid_0's multi_logloss: 1.86622
[19]	valid_0's multi_logloss: 1.84029
[20]	valid_0's multi_logloss: 1.81707
[21]	valid_0's multi_logloss: 1.79529
[22]	valid_0's multi_logloss: 1.77496
[23]	valid_0's multi_logloss: 1.75554
[24]	valid_0's multi_logloss: 1.73729
[25]	valid_0's multi_logloss: 1.72042
[26]	valid_0's multi_logloss: 1.70437
[27]	valid_0's multi_

[103]	valid_0's multi_logloss: 1.37891
[104]	valid_0's multi_logloss: 1.37849
[105]	valid_0's multi_logloss: 1.37799
[106]	valid_0's multi_logloss: 1.37759
[107]	valid_0's multi_logloss: 1.37722
[108]	valid_0's multi_logloss: 1.37655
[109]	valid_0's multi_logloss: 1.376
[110]	valid_0's multi_logloss: 1.37487
[111]	valid_0's multi_logloss: 1.37416
[112]	valid_0's multi_logloss: 1.37329
[113]	valid_0's multi_logloss: 1.37258
[114]	valid_0's multi_logloss: 1.37179
[115]	valid_0's multi_logloss: 1.37086
0.5338042381432896
[1]	valid_0's multi_logloss: 2.78612
[2]	valid_0's multi_logloss: 2.67184
[3]	valid_0's multi_logloss: 2.57927
[4]	valid_0's multi_logloss: 2.49642
[5]	valid_0's multi_logloss: 2.42564
[6]	valid_0's multi_logloss: 2.36309
[7]	valid_0's multi_logloss: 2.30509
[8]	valid_0's multi_logloss: 2.25229
[9]	valid_0's multi_logloss: 2.20505
[10]	valid_0's multi_logloss: 2.16359
[11]	valid_0's multi_logloss: 2.12495
[12]	valid_0's multi_logloss: 2.08866
[13]	valid_0's multi_logloss:

[89]	valid_0's multi_logloss: 1.36583
[90]	valid_0's multi_logloss: 1.36462
[91]	valid_0's multi_logloss: 1.36345
[92]	valid_0's multi_logloss: 1.36261
[93]	valid_0's multi_logloss: 1.36144
[94]	valid_0's multi_logloss: 1.36076
[95]	valid_0's multi_logloss: 1.36055
[96]	valid_0's multi_logloss: 1.35903
[97]	valid_0's multi_logloss: 1.3583
[98]	valid_0's multi_logloss: 1.35816
[99]	valid_0's multi_logloss: 1.35755
[100]	valid_0's multi_logloss: 1.35686
[101]	valid_0's multi_logloss: 1.35588
[102]	valid_0's multi_logloss: 1.3564
[103]	valid_0's multi_logloss: 1.35545
[104]	valid_0's multi_logloss: 1.35514
[105]	valid_0's multi_logloss: 1.35551
[106]	valid_0's multi_logloss: 1.35439
[107]	valid_0's multi_logloss: 1.35392
[108]	valid_0's multi_logloss: 1.35338
[109]	valid_0's multi_logloss: 1.35297
[110]	valid_0's multi_logloss: 1.35206
[111]	valid_0's multi_logloss: 1.352
[112]	valid_0's multi_logloss: 1.35151
[113]	valid_0's multi_logloss: 1.35089
[114]	valid_0's multi_logloss: 1.34982
[

In [665]:
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

 Averaged base models score: 0.5452 (0.0089)

