In [1]:
import nltk
import numpy as np
import re
from math import log
import pymorphy3
from nltk.tokenize import word_tokenize as wt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Fetch the NLTK resources this notebook relies on; the recorded output shows
# the calls only report "already up-to-date" when the data is present.
# 'stopwords' backs stopwords.words('russian') used in preprocessing().
nltk.download('stopwords')
# NOTE(review): no WordNet usage is visible in this part of the notebook —
# confirm whether this download is still needed.
nltk.download('wordnet')
# 'punkt' is presumably required by word_tokenize (imported above as `wt`).
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ayanami/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ayanami/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/ayanami/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

***Лемматизация, удаление стоп-слов и токенизация по словам***

In [2]:
def preprocessing(text: list[str]) -> list[list[str]]:
    """Tokenize, drop Russian stop words and lemmatize each input string.

    Args:
        text: Documents to process, one string per document.

    Returns:
        One list of lemmas per input document, in input order.

    Side effects:
        Prints the lemma list of every document as it is produced.
    """
    russian_stop_words = set(stopwords.words('russian'))
    analyzer = pymorphy3.MorphAnalyzer()

    lemmatized_docs = []
    for document in text:
        # Stop words are matched case-insensitively on the raw token; the
        # surviving tokens are reduced to the normal form of their first parse.
        lemmas = [
            analyzer.parse(token)[0].normal_form
            for token in wt(document)
            if token.lower() not in russian_stop_words
        ]
        print(f"Лематизация текста:\n {lemmas}\n")
        lemmatized_docs.append(lemmas)
    return lemmatized_docs

***Словарь слов***

In [3]:
def make_dict(texts: list[list[str]]) -> dict[str, int]:
    """Map every distinct word of the corpus to a column index.

    Args:
        texts: Tokenized documents (a list of word lists).

    Returns:
        A dict from word to its position in the sorted vocabulary, so the
        indices run from 0 to ``len(vocabulary) - 1`` in sorted word order.
    """
    vocabulary = set()
    for document in texts:
        vocabulary.update(document)
    ordered_words = sorted(vocabulary)
    return dict(zip(ordered_words, range(len(ordered_words))))

***Bag of Words***

In [4]:
def count_num_words(texts: list[list[str]], word_dict: dict[str, int]) -> np.array:
    """Build a Bag-of-Words count matrix for the tokenized documents.

    Args:
        texts: Tokenized documents (a list of word lists).
        word_dict: Mapping from word to its column index.

    Returns:
        A float array of shape ``(len(texts), len(word_dict))`` where entry
        ``[i, j]`` is how many times the word with index ``j`` occurs in
        document ``i``. Words missing from ``word_dict`` are ignored.
    """
    bow_array = np.zeros((len(texts), len(word_dict)))
    # Each `counts_row` is a view into `bow_array`, so updating it in place
    # fills the matrix directly.
    for counts_row, document in zip(bow_array, texts):
        known_words = (token for token in document if token in word_dict)
        for token in known_words:
            counts_row[word_dict[token]] += 1
    return bow_array

***TF-IDF***

In [5]:
def count_tfidf(texts: np.ndarray, word_dict: dict[str, int]) -> np.ndarray:
    """Compute a TF-IDF matrix from a Bag-of-Words count matrix.

    For document ``i`` and term ``j``::

        tf[i, j]  = count[i, j] / (total count in document i)
        idf[j]    = log(n_docs / (doc_freq[j] + 1))
        tfidf     = tf * idf

    where ``doc_freq[j]`` is the number of documents with a positive count
    for term ``j``. With this smoothing, ``idf`` is zero for a term found in
    ``n_docs - 1`` documents and negative for a term found in all of them.

    Args:
        texts: Count matrix of shape ``(n_docs, n_terms)``; despite the name
            this is the Bag-of-Words array, not raw text.
        word_dict: Word-to-column mapping. It is not used by the computation
            and is kept only so existing callers keep working.

    Returns:
        A float array of shape ``(n_docs, n_terms)`` with the TF-IDF weights.
        Rows whose total count is not positive are all zeros.

    Raises:
        ValueError: If ``texts`` is not two-dimensional.
    """
    # Work on the argument itself rather than on a notebook-level global, so
    # the function gives the right answer for whichever matrix is passed in.
    counts = np.asarray(texts, dtype=float)
    if counts.ndim != 2:
        raise ValueError(
            f"expected a 2-D count matrix, got an array with {counts.ndim} dimension(s)"
        )

    n_docs, n_terms = counts.shape
    if n_docs == 0:
        # No documents: avoid log(0) and return an empty matrix.
        return np.zeros((0, n_terms))

    doc_freq = np.count_nonzero(counts > 0, axis=0)
    idf = np.array([log(n_docs / (freq + 1)) for freq in doc_freq], dtype=float)

    totals = counts.sum(axis=1, keepdims=True)
    # Rows with a non-positive total keep tf == 0 instead of dividing by zero.
    tf = np.divide(counts, totals, out=np.zeros_like(counts), where=totals > 0)
    return tf * idf

In [6]:
# Read the corpus: one document per line, with every character that is neither
# a word character nor whitespace removed, then surrounding whitespace trimmed.
# The encoding is stated explicitly because the file holds Cyrillic text and
# the default used by open() depends on the platform locale.
with open("res/text.txt", 'r', encoding="utf-8") as file:
    text = [re.sub(r'[^\w\s]','',line).strip() for line in file]
print(text)
# Tokenize, remove stop words and lemmatize every line.
res = preprocessing(text)

['У лукоморья дуб зелёный', 'Златая цепь на дубе том', 'И днём и ночью кот учёный', 'Всё ходит по цепи кругом', 'Идёт направо  песнь заводит', 'Налево  сказку говорит']
Лематизация текста:
 ['лукоморье', 'дуб', 'зелёный']

Лематизация текста:
 ['златой', 'цепь', 'дуб']

Лематизация текста:
 ['день', 'ночью', 'кот', 'учёный']

Лематизация текста:
 ['всё', 'ходить', 'цепь', 'кругом']

Лематизация текста:
 ['идти', 'направо', 'песня', 'заводить']

Лематизация текста:
 ['налево', 'сказка', 'говорить']



In [7]:
# Build the vocabulary, the Bag-of-Words counts and the TF-IDF weights from the
# lemmatized documents, then print each of them in turn.
word_dict = make_dict(res)
bow_array = count_num_words(res, word_dict)
tfidf_array = count_tfidf(bow_array, word_dict)

print(f"Словарь слов:\n{word_dict}")
print(f"Bag of Words:\n{bow_array}")
print(f"TF-IDF:\n{tfidf_array}")

Словарь слов:
{'всё': 0, 'говорить': 1, 'день': 2, 'дуб': 3, 'заводить': 4, 'зелёный': 5, 'златой': 6, 'идти': 7, 'кот': 8, 'кругом': 9, 'лукоморье': 10, 'налево': 11, 'направо': 12, 'ночью': 13, 'песня': 14, 'сказка': 15, 'учёный': 16, 'ходить': 17, 'цепь': 18}
Bag of Words:
[[0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1.]
 [0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0.]]
TF-IDF:
[[0.         0.         0.         0.23104906 0.         0.3662041
  0.         0.         0.         0.         0.3662041  0.
  0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.         0.         0.23104906 0.         0.
  0.3662041  0.         0.         0.         0.         0.
  0.         0.         0.       