## Задание 1 Реализовать алгоритм Леска и проверить его на реальном датасете (8 баллов)
Ворднет можно использовать для дизамбигуации. Самый простой алгоритм дизамбигуации - алгоритм Леска. В нём нужное значение слова находится через пересечение слов контекста, в котором употреблено это слово, с определениями значений слова из ворднета. Значение с максимальным пересечением - нужное.

Реализуйте его

In [21]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\addre\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from pymorphy2 import MorphAnalyzer
from string import punctuation
import json, os, re, sys
from collections import Counter
import numpy as np
from razdel import tokenize as razdel_tokenize
import gensim
import pandas as pd
from sklearn.metrics import adjusted_rand_score
import tqdm
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')
morph = MorphAnalyzer()
punct = punctuation+'«»—…“”*№–'
stops = set(stopwords.words('russian'))

def normalize(text):
    
    words = [token.text.strip(punct) for token in list(razdel_tokenize(text))]
    words = [morph.parse(word)[0].normal_form for word in words if word and word not in stops]

    return words

In [48]:
def lesk(word, sentence):
    bestsense = 0
    maxoverlap = 0
    synsets = wn.synsets(word)
    
    for i, syns in enumerate(synsets):
        senses = word_tokenize(syns.definition())
        overlap = len(set(senses).intersection(set(sentence)))
        if overlap > maxoverlap:
            maxoverlap = overlap
            bestsense = i
        
    return bestsense

In [66]:
def get_words_in_context(words, n=3):
    neighbours = [] 
    for i in range(len(words)):
        if i-n<0:
            words_after = words[i+1:i+n+1]
            words_before = words[0:i]
            ws = words_before + words_after
            lst = list([words[i], ws])
            neighbours.append(lst)   
        
        else:
            words_after = words[i+1:i+n+1]
            words_before = words[i-n:i]
            ws = words_before + words_after
            lst = list([words[i], ws])
            neighbours.append(lst)
    return neighbours

In [41]:
corpus_wsd = []

corpus = open('corpus_wsd_50k.txt').read().split('\n\n')
for sent in corpus:
    corpus_wsd.append([s.split('\t') for s in sent.split('\n')])

In [64]:
corpus_wsd[1]

[['', 'have', 'Have'],
 ['', 'you', 'you'],
 ['permit%2:41:00::', 'permit', 'permitted'],
 ['', 'it', 'it'],
 ['', 'to', 'to'],
 ['become%2:42:01::', 'become', 'become'],
 ['', 'a', 'a'],
 ['giveaway%1:21:00::', 'giveaway', 'giveaway'],
 ['program%1:09:01::', 'program', 'program'],
 ['rather%4:02:02::', 'rather', 'rather'],
 ['', 'than', 'than'],
 ['', 'one', 'one'],
 ['', 'that', 'that'],
 ['have%2:42:00::', 'have', 'has'],
 ['', 'the', 'the'],
 ['goal%1:09:00::', 'goal', 'goal'],
 ['', 'of', 'of'],
 ['improved%3:00:00::', 'improved', 'improved'],
 ['employee%1:18:00::', 'employee', 'employee'],
 ['morale%1:26:00::', 'morale', 'morale'],
 ['', 'and', 'and'],
 ['', ',', ','],
 ['consequently%4:02:00::', 'consequently', 'consequently'],
 ['', ',', ','],
 ['increased%3:00:00::', 'increased', 'increased'],
 ['productivity%1:07:00::', 'productivity', 'productivity'],
 ['', '?', '?']]

In [67]:
corpus_wsd_short = corpus_wsd[:1000]

In [72]:
wordnet_list = []
lesk_list = []

for i, sent in enumerate(corpus_wsd_short):
    if sent[0][0]:
        context = []
        
        for w in sent:
            if '%' in w[0]:
                context.append(w[1])
                wn_var = wn.lemma_from_key(w[0]).synset()
                wordnet_list.append(wn_var)
        
        words_in_context = get_words_in_context(context, n=3)
        
        for st in words_in_context:
            i = lesk(st[0], st[1])
            lesk_var = wn.synsets(st[0])[i]
            lesk_list.append(lesk_var)

In [73]:
d = {'wordnet': wordnet_list, 'lesk': lesk_list}
df = pd.DataFrame(data=d)

In [74]:
df.head(10)

Unnamed: 0,wordnet,lesk
0,Synset('be.v.01'),Synset('beryllium.n.01')
1,Synset('bigger.s.01'),Synset('bigger.s.01')
2,Synset('fancy.a.01'),Synset('fancy.n.02')
3,Synset('truly.r.01'),Synset('truly.r.01')
4,Synset('want.v.02'),Synset('need.n.01')
5,Synset('exist.v.01'),Synset('beryllium.n.01')
6,Synset('other.a.01'),Synset('other.a.01')
7,Synset('cheap.a.01'),Synset('cheap.a.01')
8,Synset('communication.n.01'),Synset('communication.n.01')
9,Synset('technique.n.01'),Synset('technique.n.01')


In [75]:
match = np.zeros(df.shape[0])

for i, syn in enumerate(df['wordnet'].values):
    if df['lesk'][i] == syn:
        match[i] = 1
    else:
        match[i] = 0

In [76]:
df['match'] = match
df.head(10)

Unnamed: 0,wordnet,lesk,match
0,Synset('be.v.01'),Synset('beryllium.n.01'),0.0
1,Synset('bigger.s.01'),Synset('bigger.s.01'),1.0
2,Synset('fancy.a.01'),Synset('fancy.n.02'),0.0
3,Synset('truly.r.01'),Synset('truly.r.01'),1.0
4,Synset('want.v.02'),Synset('need.n.01'),0.0
5,Synset('exist.v.01'),Synset('beryllium.n.01'),0.0
6,Synset('other.a.01'),Synset('other.a.01'),1.0
7,Synset('cheap.a.01'),Synset('cheap.a.01'),1.0
8,Synset('communication.n.01'),Synset('communication.n.01'),1.0
9,Synset('technique.n.01'),Synset('technique.n.01'),1.0


In [77]:
true = df[df['match'] == 1].shape[0]
al = df['match'].shape[0]

In [78]:
print(true/al)

0.5414615675880349
