In [32]:
# pip install datasets catboost

In [78]:
import re
import string
import random
import numpy as np
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score, balanced_accuracy_score

import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<torch._C.Generator at 0x7ba85ecb9530>

In [79]:
def remove_stop_words(text):
  stop_words = stopwords.words("english")
  text = " ".join(word for word in text.split(" ") if word not in stop_words)
  return text

def stem_text(text):
  stemmer = nltk.SnowballStemmer("english")
  text = " ".join(stemmer.stem(word) for word in text.split(" "))
  return text


def clean_text(text):
  text = text.lower()
  text = re.sub(r"\[.*?\]", "", text)
  text = re.sub(r"https?://\S+|www\.\S+", " ", text)
  text = re.sub(r"<.*?>+", " ", text)
  text = re.sub(r"[%s]" % re.escape(string.punctuation), " ", text)
  text = re.sub(r"\n", " ", text)
  text = re.sub(r"\w*\d\w*", " ", text)
  text = re.sub(r" +", " ", text)
  text = remove_stop_words(text)
  text = stem_text(text)
  return text

def sample_data(dataset, num_samples_per_class, labels):
    sampled_data = []
    for label in range(len(labels)):
        class_data = [example for example in dataset if example["label"] == label]
        sampled_data.extend(random.sample(class_data, num_samples_per_class))
        random.shuffle(sampled_data)
    return sampled_data

def format_instruction(example):
    instruction = """
    Instruction: Determine the category of the given text (provided below).
    Choose exactly one category from the following options: World, Sports, Business, or Sci/Tech.
    Your output should be a single word representing the selected category."""
    text = example["text"]
    output = labels[example["label"]]
    return {
        "instruction": instruction,
        "text": text,
        "output": output
    }

## Data Loading

In [80]:
labels = ["World", "Sports", "Business", "Sci/Tech"]

dataset = load_dataset("ag_news")

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})


In [81]:
train_samples = sample_data(dataset["train"], 400, labels)
test_samples = sample_data(dataset["test"], 100, labels)

formatted_train = [format_instruction(sample) for sample in train_samples]
formatted_test = [format_instruction(sample) for sample in test_samples]

# save_to_file(formatted_train, "small_train_data.txt")
# save_to_file(formatted_test, "small_test_data.txt")

In [82]:
train_texts = [o['text'] for o in formatted_train]
test_texts = [o['text'] for o in formatted_test]

train_outputs = [o['output'] for o in formatted_train]
test_outputs = [o['output'] for o in formatted_test]

df_train = pd.DataFrame({'text': train_texts, 'label': train_outputs})
df_test = pd.DataFrame({'text': test_texts, 'label': test_outputs})

df_train['text'] = df_train['text'].astype(str)
df_test['text'] = df_test['text'].astype(str)

df_train['cleaned_text'] = df_train['text'].apply(lambda x: clean_text(x))
df_test['cleaned_text'] = df_test['text'].apply(lambda x: clean_text(x))

In [83]:
df_train['text'][13]

'Search Engine Milestones for August 2004 Notable news and announcements from the web search world during the past month. '

### SVM + Tfidf

In [122]:
vectorizer = TfidfVectorizer(max_features=3500)
x_train_tfidf = vectorizer.fit_transform(df_train['cleaned_text'])
x_test_tfidf = vectorizer.transform(df_test['cleaned_text'])

model = svm.SVC(kernel='linear', probability=True, C=0.5)
model.fit(x_train_tfidf, df_train['label'])

In [123]:
test_predictions = model.predict(x_test_tfidf)
overall_accuracy = accuracy_score(df_test['label'], test_predictions)
weighted_accuracy = balanced_accuracy_score(df_test['label'], test_predictions)
print("Total Accuracy:", overall_accuracy)
print("Weighted Accuracy", weighted_accuracy)

Total Accuracy: 0.855
Weighted Accuracy 0.855


### CatBoost + Tfidf

In [86]:
from catboost import CatBoostClassifier

x_train_tfidf = x_train_tfidf.toarray()
x_test_tfidf = x_test_tfidf.toarray()

In [133]:
catboost_model = CatBoostClassifier(
    iterations=7000,
    learning_rate=0.03,
    depth=6,
    task_type="GPU",
    loss_function="MultiClass",
    verbose=100
)

In [134]:
catboost_model.fit(
    x_train_tfidf,
    df_train['label'],
    eval_set=(x_test_tfidf, df_test['label']),
    use_best_model=True,
    early_stopping_rounds=50
)

0:	learn: 1.3756079	test: 1.3777536	best: 1.3777536 (0)	total: 97.9ms	remaining: 11m 25s
100:	learn: 0.9927630	test: 1.0540977	best: 1.0540977 (100)	total: 3.29s	remaining: 3m 44s
200:	learn: 0.8564317	test: 0.9419499	best: 0.9419499 (200)	total: 5.8s	remaining: 3m 16s
300:	learn: 0.7740094	test: 0.8785787	best: 0.8785787 (300)	total: 8.38s	remaining: 3m 6s
400:	learn: 0.7176921	test: 0.8384872	best: 0.8384872 (400)	total: 9.74s	remaining: 2m 40s
500:	learn: 0.6722822	test: 0.8072815	best: 0.8072810 (499)	total: 10.7s	remaining: 2m 18s
600:	learn: 0.6367200	test: 0.7825732	best: 0.7825732 (600)	total: 13.8s	remaining: 2m 26s
700:	learn: 0.6059185	test: 0.7625755	best: 0.7625755 (700)	total: 15.4s	remaining: 2m 18s
800:	learn: 0.5798681	test: 0.7469247	best: 0.7468788 (799)	total: 16.4s	remaining: 2m 6s
900:	learn: 0.5588514	test: 0.7355867	best: 0.7355867 (900)	total: 17.3s	remaining: 1m 56s
1000:	learn: 0.5402034	test: 0.7257684	best: 0.7257586 (999)	total: 18.2s	remaining: 1m 49s
110

<catboost.core.CatBoostClassifier at 0x7ba753772530>

In [135]:
y_pred = catboost_model.predict(x_test_tfidf)
y_pred_proba = catboost_model.predict_proba(x_test_tfidf)

accuracy = accuracy_score(df_test['label'], y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8225


### CatBoost + BERT embeddings

In [136]:
from transformers import AutoTokenizer, AutoModel
import torch

bert_model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
model = AutoModel.from_pretrained(bert_model_name)

def get_bert_embeddings_batch(texts, model, tokenizer, max_length=128, batch_size=16, device="cuda"):
    model.eval()
    model.to(device)
    embeddings = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i + batch_size]
        inputs = tokenizer(batch_texts, padding='max_length', truncation=True,
                           max_length=max_length, return_tensors="pt").to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.extend(cls_embeddings)

    return embeddings

In [137]:
train_embeddings = get_bert_embeddings_batch(
    texts=df_train['cleaned_text'].tolist(),
    model=model,
    tokenizer=tokenizer,
    max_length=128,
    batch_size=16,
    device="cuda"
)

test_embeddings = get_bert_embeddings_batch(
    texts=df_test['cleaned_text'].tolist(),
    model=model,
    tokenizer=tokenizer,
    max_length=128,
    batch_size=16,
    device="cuda"
)

100%|██████████| 100/100 [00:12<00:00,  7.92it/s]
100%|██████████| 25/25 [00:03<00:00,  7.66it/s]


In [138]:
x_train = train_embeddings
y_train = df_train['label']
x_test = test_embeddings
y_test = df_test['label']

catboost_model_bert = CatBoostClassifier(
    iterations=3000,
    learning_rate=0.1,
    depth=6,
    loss_function="MultiClass",
    verbose=50,
    task_type="GPU"
)

catboost_model_bert.fit(
    x_train,
    y_train,
    eval_set=(x_test, y_test),
    early_stopping_rounds=50,
    use_best_model=True
)

0:	learn: 1.3028867	test: 1.3078564	best: 1.3078564 (0)	total: 22.6ms	remaining: 1m 7s
50:	learn: 0.3971889	test: 0.5756845	best: 0.5756845 (50)	total: 1.01s	remaining: 58.5s
100:	learn: 0.2569678	test: 0.5130727	best: 0.5130727 (100)	total: 1.92s	remaining: 55.2s
150:	learn: 0.1946265	test: 0.4871980	best: 0.4871980 (150)	total: 3.89s	remaining: 1m 13s
200:	learn: 0.1552318	test: 0.4731175	best: 0.4731175 (200)	total: 5.96s	remaining: 1m 23s
250:	learn: 0.1270661	test: 0.4637890	best: 0.4637890 (250)	total: 6.79s	remaining: 1m 14s
300:	learn: 0.1055007	test: 0.4563731	best: 0.4563128 (298)	total: 7.61s	remaining: 1m 8s
350:	learn: 0.0903589	test: 0.4487520	best: 0.4486456 (346)	total: 8.47s	remaining: 1m 3s
400:	learn: 0.0768570	test: 0.4431409	best: 0.4431409 (400)	total: 9.29s	remaining: 1m
450:	learn: 0.0671008	test: 0.4417788	best: 0.4417788 (450)	total: 10.1s	remaining: 57.1s
500:	learn: 0.0585880	test: 0.4406841	best: 0.4404406 (486)	total: 10.9s	remaining: 54.6s
550:	learn: 0.0

<catboost.core.CatBoostClassifier at 0x7ba753773250>

In [139]:
# Предсказания и оценка
y_pred_bert = catboost_model_bert.predict(x_test)
accuracy = accuracy_score(y_test, y_pred_bert)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8450
