In [1]:
import math
import re
from collections import Counter

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import adjusted_rand_score
from sklearn.model_selection import train_test_split

In [2]:
# Read the labelled text corpus and count the distinct categories; the count is
# reused later as the number of clusters.
df = pd.read_csv(filepath_or_buffer='res/bbc-text.csv')
category_num = df['category'].nunique(dropna=True)

In [3]:
# Preview the loaded frame; the rich display below is truncated to the first
# and last rows.
df

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
2220,business,cars pull down us retail figures us retail sal...
2221,politics,kilroy unveils immigration policy ex-chatshow ...
2222,entertainment,rem announce new glasgow concert us band rem h...
2223,politics,how political squabbles snowball it s become c...


In [4]:
# Shared NLTK resources used by preprocess(): the English Snowball stemmer and
# the English stop-word collection. The stop words are kept in a set so the
# per-token `token not in stop_words` check is a hash lookup rather than a scan
# over the whole list.
stemmer = SnowballStemmer('english')
stop_words = set(stopwords.words('english'))

In [5]:
# Whole alphabetic words only: a run of letters with a word boundary on both
# sides. "time." yields "time" and "high-definition" yields both halves, while
# tokens that mix letters with digits or underscores ("mp3", "5m") never match.
_WORD_RE = re.compile(r"\b[^\W\d_]+\b")


def preprocess(text):
    """Normalise one document for the bag-of-words pipeline.

    The text is lower-cased, split into alphabetic words, filtered against
    ``stop_words`` and stemmed with ``stemmer``.

    Words are extracted with a regular expression instead of
    ``str.split()`` + ``str.isalpha()``: the latter silently discards every
    word that touches punctuation (for example the last word of each
    sentence), which removes real vocabulary from the corpus.

    Args:
        text: Raw document text. Non-string values (e.g. a missing cell read
            as NaN) are treated as an empty document.

    Returns:
        The stemmed tokens joined by single spaces; ``''`` when nothing is left.
    """
    if not isinstance(text, str):
        return ''
    tokens = _WORD_RE.findall(text.lower())
    stemmed = [stemmer.stem(token) for token in tokens if token not in stop_words]
    return ' '.join(stemmed)

In [None]:
# Run every raw document through preprocess() and keep the result next to the
# original text.
df['cleaned_text'] = df['text'].map(preprocess)

In [None]:
# Fit a TF-IDF vocabulary on the cleaned corpus and vectorise it in one pass.
vectorizer = TfidfVectorizer()
corpus = df['cleaned_text']
X = vectorizer.fit_transform(raw_documents=corpus)

In [None]:
# Cluster the TF-IDF vectors into as many groups as there are categories.
# n_init is pinned explicitly: its default differs between scikit-learn
# releases, so leaving it unset makes the clustering (and the ARI computed from
# it) depend on the installed version.
kmeans = KMeans(n_clusters=category_num, n_init=10, random_state=228)
# fit_predict() fits the model and returns the cluster index of every sample.
clusters = kmeans.fit_predict(X)

In [None]:
# Compare the cluster assignment with the true categories; the score does not
# depend on how cluster ids are numbered.
ari = adjusted_rand_score(labels_true=df['category'], labels_pred=clusters)
print(f"Adjusted Rand Index: {ari}")

In [None]:
import matplotlib.pyplot as plt

# Project the (densified) TF-IDF matrix onto two principal components so the
# clusters can be drawn on a plane.
pca = PCA(n_components=2, random_state=228)
X_reduced = pca.fit_transform(X.toarray())
first_component, second_component = X_reduced[:, 0], X_reduced[:, 1]

plot_df = pd.DataFrame({
    'x': first_component,
    'y': second_component,
    'cluster': clusters,
    'category': df['category'],
})

# One point per document, coloured by its cluster id.
fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(
    plot_df['x'],
    plot_df['y'],
    c=plot_df['cluster'],
    cmap='viridis',
    alpha=0.6,
)

legend_handles, legend_labels = scatter.legend_elements()
ax.legend(legend_handles, legend_labels, title="Кластеры")
ax.set_title(f"Визуализация кластеров (ARI: {ari:.3f})")
ax.set_xlabel("X")
ax.set_ylabel("Y")
plt.show()

In [None]:
# 70 / 15 / 15 split into train, validation and test parts.
# Both steps are stratified by category so that every part keeps the class
# proportions of the full corpus; with an unstratified split the small
# validation and test parts can end up with a skewed class mix.
# NOTE(review): the texts split here are the stemmed, stop-word-filtered
# `cleaned_text`; if they are meant for a pretrained transformer, consider
# splitting the raw `text` column instead — confirm the intent.
X_train, X_temp, y_train, y_temp = train_test_split(
    df['cleaned_text'],
    df['category'],
    test_size=0.3,
    random_state=228,
    stratify=df['category'],
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=228,
    stratify=y_temp,
)

In [None]:
# Report the size of each part (absolute and as a share of the corpus) and the
# class balance of the training labels.
total_rows = len(df)
n_train, n_val, n_test = len(X_train), len(X_val), len(X_test)

print(f"Всего данных: {total_rows}")
print(f"Тренировочные: {n_train} ({n_train/total_rows:.0%})")
print(f"Валидационные: {n_val} ({n_val/total_rows:.0%})")
print(f"Тестовые: {n_test} ({n_test/total_rows:.0%})")
print("\nКоличество примеров по категориям в y_train:")
print(y_train.value_counts())

In [None]:
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer
from datasets import Dataset

# Map category names to integer class ids; le.classes_ keeps the id -> name order.
le = LabelEncoder()
df['label'] = le.fit_transform(df['category'])

# One frame per part, built from the train / validation / test split made earlier.
train_df = pd.DataFrame({'text': X_train, 'label': le.transform(y_train)})
val_df = pd.DataFrame({'text': X_val, 'label': le.transform(y_val)})
test_df = pd.DataFrame({'text': X_test, 'label': le.transform(y_test)})

# Convert each part as-is. Concatenating the three frames and splitting them
# again at random would throw the existing partition away, so the model would
# be trained and evaluated on different rows than the ones reported for
# X_train / X_val / X_test.
# preserve_index=False keeps the pandas row index out of the dataset columns.
dataset = {
    'train': Dataset.from_pandas(train_df, preserve_index=False),
    'validation': Dataset.from_pandas(val_df, preserve_index=False),
    'test': Dataset.from_pandas(test_df, preserve_index=False),
}

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(example):
    """Tokenize the ``"text"`` field of a batch with the BERT tokenizer.

    Sequences are truncated and padded to a fixed maximum length, so every
    encoded example has the same size.
    """
    encoded = tokenizer(example["text"], truncation=True, padding="max_length")
    return encoded


# Tokenize every part in batched mode, keeping the same dictionary keys.
tokenized_datasets = {}
for split_name, split_data in dataset.items():
    tokenized_datasets[split_name] = split_data.map(tokenize_function, batched=True)

In [None]:
import torch
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Same seed as the rest of the notebook.
SEED = 228

# Seed the RNGs *before* the model is created: the classification head added on
# top of the pretrained encoder is randomly initialised inside
# from_pretrained(), and the Trainer seeds only when it is constructed, which
# is too late to make that initialisation reproducible.
set_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(le.classes_)
).to(device)


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds one
            row of class scores per example and ``labels`` the true class ids.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    predictions, labels = eval_pred
    # The predicted class is the index of the highest score in each row.
    preds = predictions.argmax(axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted")
    }


# Evaluate, log and checkpoint once per epoch; after training the checkpoint
# with the best validation accuracy is reloaded.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    seed=SEED
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

In [None]:
# Score the trained model on the held-out test part and report both metrics.
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
test_accuracy = test_results['eval_accuracy']
test_f1 = test_results['eval_f1']
print(f"\nТочность на тесте: {test_accuracy:.3f}")
print(f"F1-мера на тесте: {test_f1:.3f}")

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Predict on the test part; the predicted class is the arg-max of the scores.
preds_output = trainer.predict(tokenized_datasets["test"])
y_pred = np.argmax(preds_output.predictions, axis=1)
y_true = preds_output.label_ids

# Pass the full list of class ids explicitly. Without `labels`, the matrix only
# contains the classes that occur in y_true / y_pred, so a class missing from
# the test part would leave the matrix smaller than `display_labels` and the
# axis names would no longer line up with the rows and columns.
class_ids = np.arange(len(le.classes_))
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)
disp.plot(cmap="Blues", xticks_rotation=45)
plt.title("Матрица ошибок")
plt.show()

In [None]:
from IPython.display import Markdown, display

# Markdown summary of the experiment. The accuracy, F1 and loss figures are
# interpolated from `test_results`, so the text always reflects the latest
# test evaluation.
text = f"""
**Вывод:**

- Для обучения, валидации, тестирования была использована дообученная модель ***bert-base-uncased***.
- Точность ***accuracy = {test_results['eval_accuracy']:.3f}***, как и ***f1 = {test_results['eval_f1']:.3f}***, говорит о том, что модель с задачей классификации справилась как минимум неплохо.
- Низкий ***loss***, стремящийся к ***{test_results['eval_loss']:.3f}***, подтверждает хорошее обучение.
"""

# Render the summary as formatted Markdown in the cell output.
display(Markdown(text))