# Подготовка

In [None]:
import nltk
nltk.download(['gutenberg'])

In [None]:
ls /root/nltk_data/corpora/gutenberg

In [None]:
import re
import nltk
import math
import numpy as np
import pandas as pd

# Некоторые функции NLTK

## Токенизация

In [None]:
some_text = """We produce about two million dollars for each hour we work.  The
fifty hours is one conservative estimate for how long it we take
to get any etext selected, entered, proofread, edited, copyright
searched and analyzed, the copyright letters written, etc.  This
projected audience is one hundred million readers.  If our value
per text is nominally estimated at one dollar, then we produce 2
million dollars per hour this year we, will have to do four text
files per month:  thus upping our productivity from one million.
The Goal of Project Gutenberg is to Give Away One Trillion Etext
Files by the December 31, 2001.  [10,000 x 100,000,000=Trillion]
This is ten thousand titles each to one hundred million readers,
which is 10% of the expected number of computer users by the end
of the year 2001."""

In [None]:
nltk.download('punkt')

In [None]:
sentences = nltk.sent_tokenize(some_text)
sentences

In [None]:
words = [nltk.word_tokenize(s) for s in sentences]
words

## Лемматизация и стемминг слова

In [None]:
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
!python -m spacy download ru_core_news_sm

In [None]:
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.corpus import wordnet

In [None]:
stemmer = PorterStemmer()
snowball_en = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

In [None]:
word = 'dogs'
print(stemmer.stem(word))
print(snowball_en.stem(word))
print(lemmatizer.lemmatize(word))

In [None]:
word = 'walked'
print(stemmer.stem(word))
print(snowball_en.stem(word))
print(lemmatizer.lemmatize(word, wordnet.VERB))

In [None]:
word = 'drove'
print(stemmer.stem(word))
print(snowball_en.stem(word))
print(lemmatizer.lemmatize(word, wordnet.VERB))

In [None]:
word = 'seen'
print(stemmer.stem(word))
print(snowball_en.stem(word))
print(lemmatizer.lemmatize(word, wordnet.VERB))

In [None]:
word = 'домами'
print(stemmer.stem(word))
print(snowball_en.stem(word))
print(WordNetLemmatizer().lemmatize(word))

In [None]:
import spacy
snowball_ru = SnowballStemmer('russian')
model = spacy.load("ru_core_news_sm")

In [None]:
word = 'собаки'
print(snowball_ru.stem(word))
for token in model(word):
  print(token.lemma_)

In [None]:
word = 'собаками'
print(snowball_ru.stem(word))
for token in model(word):
  print(token.lemma_)

In [None]:
word = 'ходил'
print(snowball_ru.stem(word))
for token in model(word):
  print(token.lemma_)

In [None]:
word = 'прохаживал'
print(snowball_ru.stem(word))
for token in model(word):
  print(token.lemma_)

In [None]:
word = 'прохаживался'
print(snowball_ru.stem(word))
for token in model(word):
  print(token.lemma_)

## Стоп-слова

In [None]:
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords

# NLTK's English stop-word list is lowercase, so tokens are lowercased before
# the membership test; otherwise capitalised stop words such as "We" would
# slip through while "we" is removed.
stop_words = set(stopwords.words("english"))
# Variant 1: list comprehension.
filtered_words = [word for word in words[0] if word.lower() not in stop_words]
# Variant 2: the same filter expressed with filter() and a lambda.
filtered_words_2 = list(filter(lambda s: s.lower() not in stop_words, words[0]))
print(words[0])
print(filtered_words)
print(filtered_words_2)

In [None]:
print(stopwords.raw('russian')[:30])

# Мешок слов
По сути, это мультимножество (счётчик слов), используемое как представление текста.

In [None]:
reviews = [
           'This pasta is very tasty and affordable.',
           'This pasta is not tasty and is affordable.',
           'This pasta is delicious and cheap.',
           'Pasta is tasty and pasta tastes good.',
]

In [None]:
from itertools import chain

# Flatten the per-review token lists into a single stream of tokens.
words = chain.from_iterable(nltk.word_tokenize(review) for review in reviews)
# Lowercase every token and keep each distinct one once.
unique_words = {token.lower() for token in words}
unique_words

In [None]:
import pandas as pd
vocabulary = pd.Series(list(unique_words))
vocabulary

In [None]:
lookup = pd.Series({v: k for k, v in vocabulary.items()})
lookup

In [None]:
review_words = [nltk.word_tokenize(r) for r in reviews]
review_words[0]

In [None]:
[[w.lower() for w in review] for review in review_words]

In [None]:
[[lookup[w.lower()] for w in review] for review in review_words]


In [None]:
from collections import Counter
[Counter(lookup[w.lower()] for w in review) for review in review_words]


In [None]:
def word_frequencies(words, lookup):
    """Count how often each vocabulary entry occurs in ``words``.

    Args:
        words: iterable of string tokens.
        lookup: ``pd.Series`` mapping a token (index) to its integer id
            (value).

    Returns:
        ``pd.Series`` indexed by the ids in ``lookup.values`` holding the
        number of occurrences of each id.

    Raises:
        KeyError: if a token is in ``lookup`` neither verbatim nor lowercased.
    """
    counters = pd.Series(0, index=lookup.values)
    known = lookup.index
    for w in words:
        # The vocabulary is built from lowercased tokens, so a raw token such
        # as "This" must fall back to its lowercase form.
        key = w if w in known else w.lower()
        counters[lookup[key]] += 1
    return counters

word_frequencies(review_words[0], lookup)

In [None]:
freqs = pd.DataFrame([word_frequencies(r_w, lookup) for r_w in review_words])
freqs

In [None]:
freqs.columns = lookup.index
freqs

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(reviews)
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

In [None]:
vectorizer.vocabulary_

In [None]:
# Sort the hand-built count table by column name and drop the "." column.
# NOTE(review): the name `df` is rebound to a function (`def df(word, docs)`)
# further down in this notebook, so this DataFrame is only reachable under
# this name until that cell runs.
df = freqs.reindex(sorted(freqs.columns), axis=1).drop(columns='.')
df

In [None]:
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(reviews)
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# N-граммы

In [None]:
review_words[0]

In [None]:
from nltk import ngrams

bigrams = ngrams(review_words[0], 2)
trigrams = ngrams(review_words[0], 3)
fourgrams = ngrams(review_words[0], 4)

bigrams, trigrams, fourgrams

In [None]:
list(bigrams), list(trigrams), list(fourgrams)

In [None]:
from collections import Counter
alice_words = nltk.word_tokenize(nltk.corpus.gutenberg.raw("carroll-alice.txt"))
ng = ngrams(alice_words, 2)
Counter(ng).most_common(30)

In [None]:
vectorizer = CountVectorizer(ngram_range=(2, 2))
X = vectorizer.fit_transform([nltk.corpus.gutenberg.raw("carroll-alice.txt")])
X

In [None]:
pd.Series(X.toarray()[0], index=vectorizer.get_feature_names_out()).sort_values(ascending=False).head(20)


# TF-IDF

In [None]:
reviews

In [None]:
# Drop the last token of every review (the trailing ".") and lowercase the
# rest, so that "Pasta" and "pasta" are counted as the same term -- the same
# normalisation scikit-learn's vectorizers apply by default.
docs = [[w.lower() for w in r_w[:-1]] for r_w in review_words]
docs

In [None]:
docs[0]

In [None]:
def tf(word, doc):
    """Return the raw term frequency: how many times ``word`` occurs in ``doc``."""
    occurrences = doc.count(word)
    return occurrences

tf('pasta', docs[0]), len(docs[0])

In [None]:
def df(word, docs):
    """Return the document frequency: the number of ``docs`` containing ``word``."""
    containing = [doc for doc in docs if word in doc]
    return len(containing)

df('pasta', docs)

In [None]:
def idf(word, docs):
    """Return the smoothed inverse document frequency of ``word``.

    Computed as ``ln((1 + N) / (1 + df)) + 1``, where ``N`` is the number of
    documents and ``df`` is the number of documents containing ``word``.
    """
    total_docs = len(docs)
    containing_docs = df(word, docs)
    smoothed_ratio = (total_docs + 1) / (containing_docs + 1)
    return 1 + math.log(smoothed_ratio)

idf('pasta', docs) 

In [None]:
def tf_idf(word, doc, docs):
    """Return the TF-IDF weight of ``word`` in ``doc`` relative to ``docs``.

    The weight is the product of ``tf(word, doc)`` and ``idf(word, docs)``.
    """
    term_count = tf(word, doc)
    inverse_doc_freq = idf(word, docs)
    return term_count * inverse_doc_freq

In [None]:
[tf_idf(w, docs[0], docs) for w in docs[0]]

In [None]:
v1 = list(zip(docs[0], [tf_idf(w, docs[0], docs) for w in docs[0]]))
v1

In [None]:
import math
norm = math.sqrt(sum(v**2 for _, v in v1))
[(k, v/norm) for k, v in v1]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
values = tfidf_vectorizer.fit_transform(reviews)

feature_names = tfidf_vectorizer.get_feature_names_out()
pd.DataFrame(values.toarray(), columns = feature_names)

## Извлечение ключевых слов

In [None]:
names = nltk.corpus.gutenberg.fileids()
names

In [None]:
texts = [nltk.corpus.gutenberg.raw(n) for n in names]

In [None]:
corpus = pd.DataFrame({'Name': names, 'Text': texts})
corpus

In [None]:
# corpus['Text'] = corpus['Text'].apply(lambda t: t.lower())

In [None]:
corpus

In [None]:
vectorizer=CountVectorizer()
vectors = vectorizer.fit_transform(corpus['Text'])
vectors

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer().fit(vectors)

In [None]:
feature_names = vectorizer.get_feature_names_out()
feature_names[2000:2010]

In [None]:
doc = corpus["Text"][0]

In [None]:
tf_idf_vector=tfidf.transform(vectorizer.transform([doc]))
tf_idf_vector

In [None]:
from scipy.sparse import coo_matrix
from typing import Dict

def vector_to_dict(vector: coo_matrix) -> Dict[int, float]:
    """Map each stored column index of a COO matrix to its stored value.

    Row indices are ignored; if a column index is stored more than once, the
    value stored last wins.
    """
    column_ids = vector.col
    stored_values = vector.data
    return dict(zip(column_ids, stored_values))


In [None]:
token_scores = vector_to_dict(tf_idf_vector.tocoo())
token_scores = pd.DataFrame(token_scores.items(), columns=["word_id", "score"])
token_scores = token_scores.sort_values("score", ascending=False)
token_scores

In [None]:
token_scores['word'] = np.array(feature_names)[token_scores.word_id]
token_scores.head(10)

In [None]:
token_scores.tail(10)

In [None]:
token_scores.head()[['word', 'score']].values

In [None]:
def get_keywords(text, n=10, tfidf=tfidf, vectorizer=vectorizer):
    """Return the ``n`` highest-scoring TF-IDF terms of ``text``.

    Args:
        text: raw document text.
        n: maximum number of keywords to return.
        tfidf: fitted transformer whose ``transform`` turns count vectors
            into TF-IDF vectors. The default is bound to the notebook-level
            ``tfidf`` object at the time this function is defined.
        vectorizer: fitted vectorizer providing ``transform`` and
            ``get_feature_names_out``. The default is bound the same way.

    Returns:
        dict mapping word -> score rounded to 3 decimals, ordered by
        descending score. Empty when no token of ``text`` is in the
        vectorizer's vocabulary.
    """
    # Generate the TF-IDF row for the given document.
    scores = tfidf.transform(vectorizer.transform([text])).tocoo()
    if scores.nnz == 0:
        # Nothing to rank; also avoids indexing with an empty, untyped array.
        return {}
    feature_names = np.asarray(vectorizer.get_feature_names_out())
    token_scores = pd.DataFrame({
        "word": feature_names[scores.col],
        "score": scores.data,
    })
    top = token_scores.sort_values("score", ascending=False).head(n)
    # Round while building the result instead of assigning back into the
    # head() slice, which would be a chained assignment.
    return dict(zip(top["word"], np.round(top["score"], 3)))

In [None]:
keywords = get_keywords(corpus["Text"][7])

In [None]:
for k in keywords:
    print(k, keywords[k])

In [None]:
corpus['Keywords'] = corpus["Text"].map(get_keywords)
corpus.Keywords[7]

In [None]:
corpus['kw'] = corpus["Keywords"].map(lambda d: " ".join(d.keys()))
corpus.kw[7]

In [None]:
corpus[["Name", "kw"]]

In [None]:
pd.set_option("max_colwidth", 200)

In [None]:
corpus[["Name", "Keywords"]]

In [None]:
pd.set_option("max_colwidth", 80)

## Сходство документов

In [None]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([('count', vectorizer), ('idf', tfidf)])
tf_idf_vector = pipe.transform(corpus.Text)
tf_idf_vector

In [None]:
from scipy.spatial import distance
print(distance.euclidean([10, 10], [13, 14]))


In [None]:
# toarray() returns arrays of shape (1, n_features); flatten them because
# scipy.spatial.distance.euclidean is documented for 1-D vectors.
distance.euclidean(tf_idf_vector[7].toarray().ravel(), tf_idf_vector[8].toarray().ravel())

In [None]:
corpus

In [None]:
a = corpus[['Name']].reset_index()
cross = a.merge(a, how='cross')
cross

In [None]:
from itertools import product

product_ = pd.DataFrame(product(corpus.index, corpus.index), columns=['id1', 'id2'])
product_

In [None]:
corpus.Name.loc[product_.id1]

In [None]:
product_['Name1'] = corpus.Name.loc[product_.id1].values
product_['Name2'] = corpus.Name.loc[product_.id2].values
product_

In [None]:
def euclidean_distance(id1, id2, tf_idf_vector=tf_idf_vector):
    """Return the Euclidean distance between rows ``id1`` and ``id2``.

    Args:
        id1: row position of the first document.
        id2: row position of the second document.
        tf_idf_vector: matrix whose rows support ``.toarray()``. The default
            is bound to the notebook-level ``tf_idf_vector`` at the time this
            function is defined.

    Returns:
        The distance computed by ``scipy.spatial.distance.euclidean``.
    """
    # toarray() yields shape (1, n_features); flatten to the 1-D vectors that
    # scipy.spatial.distance.euclidean is documented to take.
    row1 = tf_idf_vector[id1].toarray().ravel()
    row2 = tf_idf_vector[id2].toarray().ravel()
    return distance.euclidean(row1, row2)

product_['Distance'] = product_.apply(lambda x: euclidean_distance(x.id1, x.id2), axis=1)
product_

In [None]:
result = product_.sort_values(by=['Distance'])
result

In [None]:
result[result['Distance'] > 0].head(10)

# Задание
1. Извлечь ключевые слова из всех текстов корпуса после удаления стоп-слов
2. Найти ключевые триграммы для текстов (без устранения стоп-слов)
