# Введение в обработку текста на естественном языке

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pymorphy2
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.metrics.distance import edit_distance
import random
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial import distance

## Лабораторная работа 9

### Расстояние редактирования

1.1 Загрузите предобработанные описания рецептов из файла `preprocessed_descriptions.csv`. Получите набор уникальных слов `words`, содержащихся в текстах описаний рецептов (воспользуйтесь `word_tokenize` из `nltk`).

In [None]:
data = pd.read_csv('preprocessed_descriptions.csv')
data

Unnamed: 0.1,Unnamed: 0,name,preprocessed_descriptions
0,0,george s at the cove black bean soup,an original recipe created by chef scott meska...
1,1,healthy for them yogurt popsicles,my children and their friends ask for my homem...
2,2,i can t believe it s spinach,these were so go it surprised even me
3,3,italian gut busters,my sisterinlaw made these for us at a family g...
4,4,love is in the air beef fondue sauces,i think a fondue is a very romantic casual din...
...,...,...,...
29995,29995,zurie s holey rustic olive and cheddar bread,this is based on a french recipe but i changed...
29996,29996,zwetschgenkuchen bavarian plum cake,this is a traditional fresh plum cake thought ...
29997,29997,zwiebelkuchen southwest german onion cake,this is a traditional late summer early fall s...
29998,29998,zydeco soup,this is a delicious soup that i originally fou...


In [None]:
all_words = []
for i in data['preprocessed_descriptions']:
    all_words.extend([word for word in word_tokenize(str(i)) if word.isalpha()])
words = set(all_words)
print(f'Количество уникальных слов: {len(words)}')
print(words)

Количество уникальных слов: 30743


1.2 Сгенерируйте 5 пар случайно выбранных слов и посчитайте между ними расстояние редактирования.

In [None]:
from random import sample
for i in range(5):
    print(f'Слова: {sample(words, 2)}, расстояние: {edit_distance(sample(words, 2)[0], sample(words, 2)[1])}')

Слова: ['imperamagna', 'humbles'], расстояние: 10
Слова: ['louisana', 'healtier'], расстояние: 11
Слова: ['cookingaccording', 'carnations'], расстояние: 11
Слова: ['drinkswap', 'soupfor'], расстояние: 14
Слова: ['crustacean', 'fratelli'], расстояние: 11


1.3 Напишите функцию, которая для заданного слова `word` возвращает `k` ближайших к нему слов из списка `words` (близость слов измеряется с помощью расстояния Левенштейна)

In [None]:
def near_dist(word, k):
    distance = {i : edit_distance(word, i) for i in words}
    sort_dist = sorted(distance.items(), key = lambda x: x[1])
    return sort_dist[:k]
near_dist('humbles',5)

[('humbles', 0),
 ('humble', 1),
 ('rumbled', 2),
 ('tumblers', 2),
 ('crumbles', 2)]

### Стемминг, лемматизация

2.1 На основе результатов 1.1 создайте `pd.DataFrame` со столбцами:
    * word
    * stemmed_word
    * normalized_word

Столбец `word` укажите в качестве индекса.

Для стемминга воспользуйтесь `SnowballStemmer`, для нормализации слов - `WordNetLemmatizer`. Сравните результаты стемминга и лемматизации.

In [None]:
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()
df_words = pd.DataFrame(words, columns = ['word'])
df_words['stemmed_word'] = df_words.apply(lambda x: stemmer.stem(x['word']), axis = 1)
df_words['normalized_word'] = df_words.apply(lambda x: lemmatizer.lemmatize(x['word']), axis = 1)
df_words = df_words.set_index('word')
df_words

Unnamed: 0_level_0,stemmed_word,normalized_word
word,Unnamed: 1_level_1,Unnamed: 2_level_1
asked,ask,asked
foofoo,foofoo,foofoo
brackets,bracket,bracket
containerever,containerev,containerever
potaoes,potao,potaoes
...,...,...
speckle,speckl,speckle
makejust,makejust,makejust
brain,brain,brain
pastes,past,paste


2.2. Удалите стоп-слова из описаний рецептов. Какую долю об общего количества слов составляли стоп-слова? Сравните топ-10 самых часто употребляемых слов до и после удаления стоп-слов.

In [None]:
stop_words = stopwords.words('english')
words_no_stop = [word for word in all_words if word not in stop_words]
print(f'Доля стоп слов составляла: {(len(all_words) - len(words_no_stop)) / len(all_words) * 100}\nДо удаления:\n')
for k, v in nltk.FreqDist(all_words).most_common(10):
    print(f'{k} - {v}')
print('\nПосле удаления:\n')
for k, v in nltk.FreqDist(words_no_stop).most_common(10):
    print(f'{k} - {v}')

Доля стоп слов составляла: 46.10722870409009
До удаления:

the - 40072
a - 34951
and - 30245
this - 26859
i - 24836
to - 23471
is - 20285
it - 19756
of - 18364
for - 15939

После удаления:

recipe - 14871
make - 6326
time - 5137
use - 4620
great - 4430
like - 4167
easy - 4152
one - 3872
made - 3810
good - 3791


### Векторное представление текста

3.1 Выберите случайным образом 5 рецептов из набора данных. Представьте описание каждого рецепта в виде числового вектора при помощи `TfidfVectorizer`

In [None]:
data3 = data.sample(5)
tv = TfidfVectorizer()
for i in data3['preprocessed_descriptions']:
    print(tv.fit_transform(sent_tokenize(i)).toarray(), '\n')

[[0.21821789 0.21821789 0.21821789 0.21821789 0.21821789 0.21821789
  0.21821789 0.21821789 0.21821789 0.21821789 0.21821789 0.21821789
  0.21821789 0.21821789 0.21821789 0.21821789 0.21821789 0.21821789
  0.21821789 0.21821789 0.21821789]] 

[[0.11111111 0.11111111 0.11111111 0.11111111 0.11111111 0.11111111
  0.11111111 0.11111111 0.11111111 0.22222222 0.22222222 0.11111111
  0.11111111 0.11111111 0.11111111 0.11111111 0.11111111 0.22222222
  0.33333333 0.11111111 0.11111111 0.11111111 0.11111111 0.22222222
  0.11111111 0.11111111 0.11111111 0.11111111 0.11111111 0.11111111
  0.11111111 0.11111111 0.11111111 0.11111111 0.11111111 0.11111111
  0.11111111 0.11111111 0.11111111 0.11111111 0.11111111 0.11111111
  0.11111111 0.11111111 0.44444444 0.11111111]] 

[[0.13018891 0.13018891 0.13018891 0.13018891 0.13018891 0.13018891
  0.13018891 0.13018891 0.13018891 0.13018891 0.13018891 0.13018891
  0.13018891 0.13018891 0.13018891 0.13018891 0.13018891 0.13018891
  0.13018891 0.39056673 0.1

In [None]:
data3

Unnamed: 0.1,Unnamed: 0,name,preprocessed_descriptions
5127,5127,cheese cracker pizza nachos,i honestly dont know how we came up with this ...
18848,18848,not my mothers meatloaf or meatloaf for an exp...,you know how it is your away from home in a la...
29851,29851,zesty raspberry chipotle black bean dip,this creamy tangy mildly spicy dip is always a...
16999,16999,marbled banana bars,two favorites in one moist cakey bar they rea...
1981,1981,baked southwestern egg rolls with avocado ranch,make sure you dont fill the rolls too much or ...


3.2 Вычислите близость между каждой парой рецептов, выбранных в задании 3.1, используя косинусное расстояние (`scipy.spatial.distance.cosine`) Результаты оформите в виде таблицы `pd.DataFrame`. В качестве названий строк и столбцов используйте названия рецептов.

In [None]:
from scipy.spatial.distance import cosine, pdist, squareform
recipes = pd.DataFrame({
    'recipe1': [0.2236068, 0.2236068, 0.2236068, 0.2236068, 0.2236068, 0.2236068, 0.2236068,
                0.2236068, 0.2236068, 0.2236068, 0.2236068, 0.2236068, 0.2236068, 0.2236068,
                0.2236068, 0.2236068, 0.2236068, 0.2236068, 0.2236068, 0.2236068, 0, 0, 0],
    'recipe2': [0.19611614, 0.19611614, 0.19611614, 0.19611614, 0.19611614, 0.19611614,
                0.19611614, 0.19611614, 0.19611614, 0.19611614, 0.39223227, 0.19611614,
                0.19611614, 0.19611614, 0.19611614, 0.19611614, 0.19611614, 0.19611614,
                0.19611614, 0.19611614, 0.19611614, 0.19611614, 0.19611614],
    'recipe3': [0.2773501, 0.2773501, 0.2773501, 0.2773501, 0.2773501, 0.2773501, 0.2773501,
                0.2773501, 0.2773501, 0.2773501, 0.2773501, 0.2773501, 0.2773501, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    'recipe4': [0.32879797, 0.16439899, 0.16439899, 0.16439899, 0.16439899, 0.16439899,
                0.16439899, 0.16439899, 0.16439899, 0.16439899, 0.16439899, 0.16439899,
                0.16439899, 0.16439899, 0.16439899, 0.32879797, 0.16439899, 0.16439899,
                0.16439899, 0.16439899, 0.16439899, 0.49319696, 0.16439899],
    'recipe5': [0.33333333, 0.33333333, 0.33333333, 0.33333333, 0.33333333, 0.33333333,
                0.33333333, 0.33333333, 0.33333333, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]})


df = pd.DataFrame(recipes)

distances = pdist(df.values)
square_distances = squareform(distances)


distance_df = pd.DataFrame(square_distances, columns=df.index, index=df.index)


pd.DataFrame(distance_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,0.0,0.164399,0.164399,0.164399,0.164399,0.164399,0.164399,0.164399,0.164399,0.371669,...,0.463747,0.463747,0.433629,0.463747,0.463747,0.463747,0.463747,0.514841,0.514841,0.514841
1,0.164399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,...,0.433629,0.433629,0.463747,0.433629,0.433629,0.433629,0.433629,0.487887,0.588339,0.487887
2,0.164399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,...,0.433629,0.433629,0.463747,0.433629,0.433629,0.433629,0.433629,0.487887,0.588339,0.487887
3,0.164399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,...,0.433629,0.433629,0.463747,0.433629,0.433629,0.433629,0.433629,0.487887,0.588339,0.487887
4,0.164399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,...,0.433629,0.433629,0.463747,0.433629,0.433629,0.433629,0.433629,0.487887,0.588339,0.487887
5,0.164399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,...,0.433629,0.433629,0.463747,0.433629,0.433629,0.433629,0.433629,0.487887,0.588339,0.487887
6,0.164399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,...,0.433629,0.433629,0.463747,0.433629,0.433629,0.433629,0.433629,0.487887,0.588339,0.487887
7,0.164399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,...,0.433629,0.433629,0.463747,0.433629,0.433629,0.433629,0.433629,0.487887,0.588339,0.487887
8,0.164399,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,...,0.433629,0.433629,0.463747,0.433629,0.433629,0.433629,0.433629,0.487887,0.588339,0.487887
9,0.371669,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.0,...,0.27735,0.27735,0.322413,0.27735,0.27735,0.27735,0.27735,0.356263,0.4848,0.356263


In [None]:
from scipy.spatial.distance import cosine
data3['preprocessed_descriptions'] = data3['preprocessed_descriptions'].astype(str)
tv = TfidfVectorizer()
vectors = tv.fit_transform(data3['preprocessed_descriptions']).toarray()
df3_2 = pd.DataFrame(index = data3['name'], columns = data3['name'])
for i, r1 in enumerate(data3['name']):
    for j, r2 in enumerate(data3['name']):
        df3_2.at[r1, r2] = 1 - cosine(vectors[i], vectors[j])
df3_2

name,cheese cracker pizza nachos,not my mothers meatloaf or meatloaf for an expat in morroco,zesty raspberry chipotle black bean dip,marbled banana bars,baked southwestern egg rolls with avocado ranch
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
cheese cracker pizza nachos,1.0,0.101797,0.11524,0.020248,0.021428
not my mothers meatloaf or meatloaf for an expat in morroco,0.101797,1.0,0.224969,0.090628,0.042477
zesty raspberry chipotle black bean dip,0.11524,0.224969,1.0,0.103057,0.042442
marbled banana bars,0.020248,0.090628,0.103057,1.0,0.110937
baked southwestern egg rolls with avocado ranch,0.021428,0.042477,0.042442,0.110937,1.0


3.3 Какие рецепты являются наиболее похожими? Прокомментируйте результат (словами).

In [None]:
res = df3_2.where(df3_2 != 1).max().max()
res

0.22496852868467032

In [None]:
from scipy.spatial.distance import cosine
matrix = pd.DataFrame({i:[cosine(vectors[index], vectors[index2]) for index2, k in enumerate(sample.name)] for index,i in enumerate(sample.name)}, index=sample.name)
matrix

In [None]:

row_max, row_index = matrix.values.max(axis=1), matrix.values.argmax(axis=1)
col_max, col_index = max(row_max), np.argmax(row_max)

print(f'Наиболее похожие рецепты: \n1){matrix.columns[col_index]}\n2){matrix.index.values[row_index][col_index]}\n Расстрояние между ними равно: {col_max}')