In [27]:
import string

import nltk
import numpy as np
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

In [28]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kgalanov\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kgalanov\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kgalanov\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [29]:
text_data = open('data.txt', 'r').read()

text_data = text_data.lower()

tokens = word_tokenize(text_data)

stop_words = set(stopwords.words("russian"))
tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]

lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(token) for token in tokens]

tokens

['спасибо',
 'скачали',
 'книгу',
 'бесплатной',
 'электронной',
 'библиотеке',
 'royallib.ru',
 'http',
 '//royallib.ru',
 'книги',
 'автора',
 'http',
 '//royallib.ru/author/tolstoy_lev.html',
 'эта',
 'книга',
 'других',
 'форматах',
 'http',
 '//royallib.ru/book/tolstoy_lev/voyna_i_mir_kniga_1.html',
 'приятного',
 'чтения',
 'первый',
 'часть',
 'первая',
 'i',
 '–',
 'eh',
 'bien',
 'mon',
 'prince',
 'g',
 '234',
 'ne',
 'et',
 'lucques',
 'ne',
 'sont',
 'plus',
 'que',
 'de',
 'apanage',
 'de',
 'поместья',
 'de',
 'la',
 'famille',
 'buonaparte',
 'non',
 'je',
 'vous',
 'pr',
 '233',
 'viens',
 'que',
 'si',
 'vous',
 'ne',
 'me',
 'dites',
 'pa',
 'que',
 'nous',
 'avon',
 'la',
 'guerre',
 'si',
 'vous',
 'vous',
 'permettez',
 'encore',
 'de',
 'pallier',
 'toutes',
 'le',
 'infamy',
 'toutes',
 'le',
 'atrocit',
 '233',
 's',
 'de',
 'cet',
 'antichrist',
 'ma',
 'parole',
 'j',
 '’',
 'y',
 'crois',
 '–',
 'je',
 'ne',
 'vous',
 'connais',
 'plus',
 'vous',
 'n',
 '’',


In [30]:
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
data = fetch_20newsgroups(categories=categories)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data.data)

classifier = SVC(kernel='linear')
classifier.fit(X, data.target)

kmeans = KMeans(n_clusters=len(categories))
kmeans.fit(X)

new_data = ["Atheism is a non-prophet organization", "OpenGL on the GPU is fast"]
X_new = vectorizer.transform(new_data)
predicted = classifier.predict(X_new)

print(f"Predicted categories for new data: {np.array(data.target_names)[predicted]}")

cluster_centers_indices = np.argsort(kmeans.cluster_centers_.sum(axis=1))
print("Most relevant categories based on cluster centers:")
for i, center_idx in enumerate(cluster_centers_indices):
    print(f"Cluster {i}: {data.target_names[center_idx]}")

Predicted categories for new data: ['alt.atheism' 'comp.graphics']
Most relevant categories based on cluster centers:
Cluster 0: comp.graphics
Cluster 1: alt.atheism
Cluster 2: sci.med
Cluster 3: soc.religion.christian


In [36]:
documents = [
    "Machine learning is the study of computer algorithms that improve automatically through experience.",
    "Data science is an interdisciplinary field focused on extracting knowledge from data sets.",
    "Python is a widely used high-level programming language for general-purpose programming.",
]

tokenized_documents = [word_tokenize(doc.lower()) for doc in documents]

word2vec_model = Word2Vec(sentences=tokenized_documents, vector_size=100, window=5, min_count=1, workers=4)

word_vector = word2vec_model.wv["learning"]
print("Word vector for 'learning':", word_vector)

tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokenized_documents)]
doc2vec_model = Doc2Vec(tagged_documents, vector_size=100, window=5, min_count=1, workers=4, epochs=20)

doc_vector = doc2vec_model.dv[0]
print("Document vector for document 0:", doc_vector)

Word vector for 'learning': [-7.1909428e-03  4.2328904e-03  2.1633946e-03  7.4407146e-03
 -4.8892652e-03 -4.5643463e-03 -6.0981740e-03  3.2993674e-03
 -4.4994629e-03  8.5228849e-03 -4.2888271e-03 -9.1054197e-03
 -4.8163556e-03  6.4164903e-03 -6.3713240e-03 -5.2615367e-03
 -7.3044109e-03  6.0222615e-03  3.3575939e-03  2.8483903e-03
 -3.1385506e-03  6.0308911e-03 -6.1527453e-03 -1.9801008e-03
 -5.9830821e-03 -9.9568011e-04 -2.0209861e-03  8.4859459e-03
  7.8001023e-05 -8.5753258e-03 -5.4290984e-03 -6.8759858e-03
  2.6923812e-03  9.4566476e-03 -5.8159959e-03  8.2650259e-03
  8.5320519e-03 -7.0626391e-03 -8.8832127e-03  9.4691841e-03
  8.3743641e-03 -4.6908916e-03 -6.7260410e-03  7.8421365e-03
  3.7633455e-03  8.0955038e-03 -7.5715459e-03 -9.5250849e-03
  1.5774060e-03 -9.8057678e-03 -4.8858845e-03 -3.4601032e-03
  9.6209226e-03  8.6235693e-03 -2.8356076e-03  5.8268728e-03
  8.2370946e-03 -2.2629809e-03  9.5285419e-03  7.1602152e-03
  2.0415008e-03 -3.8487636e-03 -5.0817500e-03 -3.0516528e