In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [13]:
data = pd.read_csv("dataset/sample.csv")
print(data.head())
print(data.info())

         class                                               text
0  arrangement  СОГЛАШЕНИЕ N 8\nо расторжении трудового догово...
1  arrangement  Соглашение о предоставлении опциона на заключе...
2  arrangement  Соглашение\nо реструктуризации задолженности\n...
3  arrangement  Дополнительное соглашение\r\nк договору купли-...
4  arrangement  Соглашение\nо расторжении договора об оказании...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 501 entries, 0 to 500
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   class   501 non-null    object
 1   text    501 non-null    object
dtypes: object(2)
memory usage: 8.0+ KB
None


In [14]:
X = data["text"]
y = data["class"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('russian'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Приведение к нижнему регистру
    text = text.lower()
    # Удаление пунктуации и спецсимволов
    text = re.sub(r'[^\w\s]', '', text)
    # Удаление числовых символов
    text = re.sub(r'\d+', '', text)
    # Токенизация
    tokens = nltk.word_tokenize(text)
    # Удаление стоп-слов и лемматизация
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    # Объединение токенов обратно в строку
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

# Применение предобработки к текстам
X_train = X_train.apply(preprocess_text)
X_test = X_test.apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Сергей\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Сергей\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
vectorizer = TfidfVectorizer(max_features=10000, min_df=5, max_df=0.7, ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [44]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Определение сетки параметров
param_grid = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}

# Создание модели логистической регрессии
model = LogisticRegression(random_state=42, max_iter=1000)

# Поиск по сетке параметров
grid_search = GridSearchCV(estimator=model, 
                           param_grid=param_grid, 
                           cv=5,  # 5-fold cross-validation
                           scoring='accuracy',  # Оптимизируем по accuracy
                           n_jobs=-1,  # Используем все доступные ядра процессора
                           verbose=2) 

grid_search.fit(X_train_vec, y_train)

# Лучшие найденные параметры
print("Best parameters: ", grid_search.best_params_)

# Лучшая модель
best_model = grid_search.best_estimator_

# Оценка качества лучшей модели на тестовой выборке
y_pred = best_model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average="weighted"))
print("Recall:", recall_score(y_test, y_pred, average="weighted"))
print("F1-score:", f1_score(y_test, y_pred, average="weighted"))

Fitting 5 folds for each of 8 candidates, totalling 40 fits


20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "D:\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "D:\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



Best parameters:  {'C': 100, 'penalty': 'l2'}
Accuracy: 0.9801980198019802
Precision: 0.9738723872387239
Recall: 0.9801980198019802
F1-score: 0.9759547383309759


  _warn_prf(average, modifier, msg_start, len(result))


In [47]:
%%time
#modelLR = LogisticRegression(C=10, max_iter=1000)
modelLR = LogisticRegression(C=100, penalty='l2', max_iter=1000)
modelLR.fit(X_train_vec, y_train)

CPU times: total: 594 ms
Wall time: 582 ms


LogisticRegression(C=100, max_iter=1000)

In [48]:
y_pred = modelLR.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average="weighted"))
print("Recall:", recall_score(y_test, y_pred, average="weighted"))
print("F1-score:", f1_score(y_test, y_pred, average="weighted"))

Accuracy: 0.9801980198019802
Precision: 0.9738723872387239
Recall: 0.9801980198019802
F1-score: 0.9759547383309759


  _warn_prf(average, modifier, msg_start, len(result))


CatBoost

In [25]:
%%time
from catboost import CatBoostClassifier

# Создание и обучение модели CatBoost
modelCB = CatBoostClassifier(iterations=300, 
                           #eval_metric='Accuracy',
                           learning_rate=0.05,
                           random_seed=42,
                           logging_level='Verbose',  # Устанавливаем уровень логирования
                           metric_period=50,
                           od_type='Iter',  # Тип детектора переобучения
                           od_wait=100)  # Число итераций между проверками условия останова

modelCB.fit(X_train_vec, y_train)

0:	learn: 2.2936170	total: 335ms	remaining: 1m 40s
50:	learn: 0.6788766	total: 16.6s	remaining: 1m 21s
100:	learn: 0.3516000	total: 34.6s	remaining: 1m 8s
150:	learn: 0.2059833	total: 54.2s	remaining: 53.5s
200:	learn: 0.1309667	total: 1m 14s	remaining: 36.6s
250:	learn: 0.0960295	total: 1m 34s	remaining: 18.5s
299:	learn: 0.0797905	total: 1m 54s	remaining: 0us
CPU times: total: 12min 38s
Wall time: 1min 55s


<catboost.core.CatBoostClassifier at 0x298fc4566e0>

In [24]:
# Предсказания на тестовой выборке
y_pred = modelCB.predict(X_test_vec)

# Расчет метрик
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average="weighted"))
print("Recall:", recall_score(y_test, y_pred, average="weighted"))
print("F1-score:", f1_score(y_test, y_pred, average="weighted"))

Accuracy: 0.9405940594059405
Precision: 0.9372372763592149
Recall: 0.9405940594059405
F1-score: 0.9345106875379596


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.model_selection import GridSearchCV

# Определение сетки параметров
param_grid = {
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [100, 300, 500],
    'l2_leaf_reg': [1, 3, 5, 7, 9]
}

# Создание модели CatBoost
model = CatBoostClassifier(eval_metric='Accuracy',  # Изменяем метрику на Accuracy
                           logging_level='Verbose',  # Устанавливаем уровень логирования
                           metric_period=50,
                           random_seed=42)

# Поиск по сетке параметров
grid_search = GridSearchCV(estimator=model, 
                           param_grid=param_grid, 
                           cv=5,  # 5-fold cross-validation
                           scoring='accuracy',  # Оптимизируем по Accuracy
                           n_jobs=-1,  # Используем все доступные ядра процессора
                           verbose=2) 

grid_search.fit(X_train_vec, y_train)

# Лучшие найденные параметры
print("Best parameters: ", grid_search.best_params_)

# Лучшая модель
best_model = grid_search.best_estimator_

# Оценка качества лучшей модели на тестовой выборке
y_pred = best_model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average="weighted"))
print("Recall:", recall_score(y_test, y_pred, average="weighted"))
print("F1-score:", f1_score(y_test, y_pred, average="weighted"))

Transformers

In [17]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [9]:
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [10]:
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding='max_length', max_length=512)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding='max_length', max_length=512)

In [21]:
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score

from torch.optim import AdamW
from tqdm import tqdm
import torch

# Загрузка предобученной модели и токенизатора
model_name = 'DeepPavlov/rubert-base-cased-sentence'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=11)  # 11 классов

# Перевод текстов в последовательности токенов
def tokenize(texts, max_length=512):
    input_ids = []
    attention_masks = []
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
                            text,
                            add_special_tokens=True,
                            max_length=max_length,
                            pad_to_max_length=True,
                            return_attention_mask=True,
                            return_tensors='pt')
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

# Токенизация текстов
train_input_ids, train_attention_masks = tokenize(X_train.tolist())
val_input_ids, val_attention_masks = tokenize(X_test.tolist())

# Создание датасетов и даталоадеров
#train_dataset = torch.utils.data.TensorDataset(train_input_ids, train_attention_masks, torch.tensor(y_train.values))
train_dataset = torch.utils.data.TensorDataset(train_input_ids, train_attention_masks, torch.tensor(y_train, dtype=torch.long))
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=8)

#val_dataset = torch.utils.data.TensorDataset(val_input_ids, val_attention_masks, torch.tensor(y_test.values))
val_dataset = torch.utils.data.TensorDataset(val_input_ids, val_attention_masks, torch.tensor(y_test, dtype=torch.long))
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=8)

# Перенос модели на GPU, если доступно
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Оптимизатор и планировщик скорости обучения
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 5
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Цикл обучения
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch+1}'):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        model.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch+1} average training loss: {avg_train_loss:.2f}')

    model.eval()
    accuracies = []
    for batch in val_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)
        logits = outputs.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        accuracies.append(flat_accuracy(logits, label_ids))
    print(f'Validation accuracy: {np.mean(accuracies):.2f}')

# Оценка на тестовой выборке
model.eval()
predictions , true_labels = [], []
for batch in val_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    
    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)
    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    predictions.append(logits)
    true_labels.append(label_ids)

print(f'Accuracy: {flat_accuracy(np.concatenate(predictions, axis=0), np.concatenate(true_labels, axis=0)):.2f}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-sentence and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Epoch 1: 100%|███████████████████████████████████████████████████████████████████████| 50/50 [1:01:25<00:00, 73.71s/it]


Epoch 1 average training loss: 1.92


NameError: name 'flat_accuracy' is not defined

In [None]:
#pip install --upgrade urllib3

In [22]:
from sklearn.metrics import accuracy_score

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

print(f'Accuracy: {flat_accuracy(np.concatenate(predictions, axis=0), np.concatenate(true_labels, axis=0)):.2f}')

NameError: name 'predictions' is not defined

In [24]:
model.eval()
predictions , true_labels = [], []
for batch in val_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    
    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)
    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    predictions.append(logits)
    true_labels.append(label_ids)

In [25]:
print(f'Accuracy: {flat_accuracy(np.concatenate(predictions, axis=0), np.concatenate(true_labels, axis=0)):.2f}')

Accuracy: 0.88
