In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Conv1D, GlobalMaxPooling1D, Dropout
from tensorflow.keras.utils import to_categorical
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping


#from google.colab import drive
#drive.mount('/content/drive')

#md = '/content/drive/My Drive/novosti/prep_dataset.csv'

In [2]:
# 1. Загрузка данных
data = pd.read_csv('prep_dataset.csv')  # Замените на путь к вашему файлу
texts = data['main_text'].values
labels = data['category'].values

# 2. Предобработка данных
# Преобразование меток в числовой формат
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)
num_classes = len(label_encoder.classes_)

X_train_val, X_test, y_train_val, y_test = train_test_split(texts, labels_encoded, test_size=0.2, stratify=labels_encoded, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, stratify=y_train_val, random_state=42)

In [3]:
# 3. Векторизация текста (для RNN и CNN) и EarlyStopping
max_length = 100  # Максимальная длина последовательности
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_seq, maxlen=max_length)
X_val_padded = pad_sequences(X_val_seq, maxlen=max_length)
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length)

y_train_one_hot = to_categorical(y_train, num_classes=num_classes)
y_val_one_hot = to_categorical(y_val, num_classes=num_classes)
y_test_one_hot = to_categorical(y_test, num_classes=num_classes)

In [4]:
early_stopping = EarlyStopping(
    monitor='val_loss', # Отслеживаем значение val_loss (потери на валидационной выборке)
    patience=2,          # Сколько эпох ждать улучшения, прежде чем остановить обучение
    restore_best_weights=True # Восстанавливаем веса модели, соответствующие лучшей эпохе
)

In [6]:
from keras.layers import GRU

# Модель GRU
def create_gru_model():
    model = Sequential()
    model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_length))
    model.add(GRU(64))  # Используем GRU вместо LSTM
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [7]:
# Обучение GRU
gru_model = create_gru_model()
history_gru = gru_model.fit(
    X_train_padded, y_train_one_hot,
    validation_data=(X_val_padded, y_val_one_hot),
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping]
)
gru_loss, gru_accuracy = gru_model.evaluate(X_test_padded, y_test_one_hot)



Epoch 1/10
[1m1628/1628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m616s[0m 375ms/step - accuracy: 0.5261 - loss: 1.3398 - val_accuracy: 0.8230 - val_loss: 0.5742
Epoch 2/10
[1m1628/1628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m644s[0m 389ms/step - accuracy: 0.8872 - loss: 0.3679 - val_accuracy: 0.8318 - val_loss: 0.5544
Epoch 3/10
[1m1628/1628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m682s[0m 389ms/step - accuracy: 0.9408 - loss: 0.1909 - val_accuracy: 0.8242 - val_loss: 0.6244
Epoch 4/10
[1m1628/1628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m670s[0m 382ms/step - accuracy: 0.9594 - loss: 0.1281 - val_accuracy: 0.8278 - val_loss: 0.6802
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 19ms/step - accuracy: 0.8355 - loss: 0.5481


In [8]:
print(f"Точность GRU: {gru_accuracy:.4f}")

Точность GRU: 0.8331


In [9]:
from keras.layers import Bidirectional

# Модель Bidirectional LSTM
def create_bi_lstm_model():
    model = Sequential()
    model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_length))
    model.add(Bidirectional(LSTM(64)))  # Двунаправленный LSTM
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [10]:
# Обучение Bidirectional LSTM
bi_lstm_model = create_bi_lstm_model()
history_bi_lstm = bi_lstm_model.fit(
    X_train_padded, y_train_one_hot,
    validation_data=(X_val_padded, y_val_one_hot),
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping]
)
bi_lstm_loss, bi_lstm_accuracy = bi_lstm_model.evaluate(X_test_padded, y_test_one_hot)

Epoch 1/10




[1m1628/1628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m712s[0m 433ms/step - accuracy: 0.5710 - loss: 1.2494 - val_accuracy: 0.8136 - val_loss: 0.6080
Epoch 2/10
[1m1628/1628[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m744s[0m 434ms/step - accuracy: 0.8816 - loss: 0.3886 - val_accuracy: 0.8295 - val_loss: 0.5741
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 34ms/step - accuracy: 0.8161 - loss: 0.6075


In [11]:
print(f"Точность Bidirectional LSTM: {bi_lstm_accuracy:.4f}")

Точность Bidirectional LSTM: 0.8132


In [12]:
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer

# Извлечение признаков с помощью TF-IDF
vectorizer = TfidfVectorizer(max_features=5000) # Ограничиваем количество признаков
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

In [14]:
# Обучение XGBoost
model_xgb = xgb.XGBClassifier(objective='multi:softmax', num_class=num_classes, random_state=42)
model_xgb.fit(X_train_tfidf, y_train)
accuracy_xgb = model_xgb.score(X_test_tfidf, y_test)

In [15]:
print(f"Точность XGBoost: {accuracy_xgb}")

Точность XGBoost: 0.8649115732473068


In [16]:
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

model_svm = SVC(kernel='linear', C=1, decision_function_shape='ovr', random_state=42) # Можно экспериментировать с ядром и параметрами C
model_svm.fit(X_train_tfidf, y_train)
accuracy_svm = model_svm.score(X_test_tfidf, y_test)

In [17]:
print(f"Accuracy SVM: {accuracy_svm}")

Accuracy SVM: 0.8801198225704245
