In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertModel
import torch

# 1. Load the data
# The CSV is read for two columns below: 'Text' (the comment) and
# 'IsToxic' (the toxicity flag used as the target).
data = pd.read_csv('токсик коментс.csv')

# Quick look at the first rows
print("Пример данных:")
print(data.head())

# Features / target, followed by an 80/20 hold-out split with a fixed seed
X, y = data['Text'], data['IsToxic']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Размер тренировочного набора: {len(X_train)}")
print(f"Размер тестового набора: {len(X_test)}")

# 2. BERT setup
# Load the pretrained tokenizer first, then the matching pretrained encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Embedding extraction helper
def get_bert_embeddings(texts, batch_size=32):
    """
    Convert texts into BERT embeddings, one vector per text.

    The texts are encoded in mini-batches rather than in a single forward
    pass, so the memory needed for padding and activations is bounded by
    ``batch_size`` instead of by the size of the whole dataset.

    Args:
        texts (iterable of str): Texts to encode (e.g. a list or a pandas
            Series of strings).
        batch_size (int): Number of texts per forward pass. Defaults to 32.

    Returns:
        numpy.ndarray: 2-D array with one row per input text. Each row is
        the last-layer hidden state of the first ([CLS]) token.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = list(texts)
    if not texts:
        # Nothing to encode: keep the 2-D (n_texts, hidden_size) layout so
        # callers can treat the result like any other embedding matrix.
        return torch.empty((0, bert_model.config.hidden_size)).numpy()

    cls_chunks = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Padding is applied per batch; inputs longer than 128 tokens are truncated.
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=128)
        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            outputs = bert_model(**inputs)
        # Use the [CLS] token (position 0) as the representation of each text.
        cls_chunks.append(outputs.last_hidden_state[:, 0, :])

    return torch.cat(cls_chunks, dim=0).numpy()

# 3. Embedding extraction
print("Извлечение эмбеддингов для тренировочных данных...")
X_train_embeddings = get_bert_embeddings(X_train)

print("Извлечение эмбеддингов для тестовых данных...")
X_test_embeddings = get_bert_embeddings(X_test)


def _train_and_report(model, training_banner, results_banner):
    """
    Fit ``model`` on the training embeddings and report its test performance.

    Prints ``training_banner``, fits on ``X_train_embeddings`` / ``y_train``,
    predicts on ``X_test_embeddings``, then prints ``results_banner``, the
    accuracy as a percentage and the classification report.

    Returns:
        tuple: ``(predictions, accuracy)`` for the test set.
    """
    print(training_banner)
    model.fit(X_train_embeddings, y_train)

    predictions = model.predict(X_test_embeddings)
    accuracy = accuracy_score(y_test, predictions)
    print(results_banner)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(classification_report(y_test, predictions))
    return predictions, accuracy


# 4. Logistic Regression: train and evaluate
lr_model = LogisticRegression(max_iter=1000)
lr_predictions, lr_accuracy = _train_and_report(
    lr_model,
    "Обучение модели Logistic Regression...",
    "\nLogistic Regression Results:",
)

# 5. Random Forest: train and evaluate
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_predictions, rf_accuracy = _train_and_report(
    rf_model,
    "Обучение модели Random Forest...",
    "\nRandom Forest Results:",
)

# 6. Side-by-side comparison of the two models
print("\nСравнение моделей:")
print(f"Logistic Regression Accuracy: {lr_accuracy * 100:.2f}%")
print(f"Random Forest Accuracy: {rf_accuracy * 100:.2f}%")



Пример данных:
              CommentId      VideoId  \
0  Ugg2KwwX0V8-aXgCoAEC  04kJtp6pVXI   
1  Ugg2s5AzSPioEXgCoAEC  04kJtp6pVXI   
2  Ugg3dWTOxryFfHgCoAEC  04kJtp6pVXI   
3  Ugg7Gd006w1MPngCoAEC  04kJtp6pVXI   
4  Ugg8FfTbbNF8IngCoAEC  04kJtp6pVXI   

                                                Text  IsToxic  IsAbusive  \
0  If only people would just take a step back and...    False      False   
1  Law enforcement is not trained to shoot to app...     True       True   
2  \nDont you reckon them 'black lives matter' ba...     True       True   
3  There are a very large number of people who do...    False      False   
4  The Arab dude is absolutely right, he should h...    False      False   

   IsThreat  IsProvocative  IsObscene  IsHatespeech  IsRacist  IsNationalist  \
0     False          False      False         False     False          False   
1     False          False      False         False     False          False   
2     False          False       True         F

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

ImportError: 
BertModel requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.


In [3]:
# Install into the environment of the running kernel; the version is pinned
# so that a re-run installs the same release.
%pip install -q transformers==4.47.1

Collecting transformers
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.24.0 (from transformers)
  Downloading huggingface_hub-0.27.0-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp312-none-win_amd64.whl.metadata (3.9 kB)
Collecting tqdm>=4.27 (from transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.24.0->transformers)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting typing-extensions>=3.7.4.3 (from huggingf

In [5]:
# CPU-only PyTorch wheels, installed into the environment of the running kernel.
# Restart the kernel after this cell so that transformers can find torch.
%pip install -q torch==2.5.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch
  Downloading https://download.pytorch.org/whl/cpu/torch-2.5.1%2Bcpu-cp312-cp312-win_amd64.whl (205.4 MB)
     ---------------------------------------- 0.0/205.4 MB ? eta -:--:--
     ---------------------------------------- 0.0/205.4 MB ? eta -:--:--
     ---------------------------------------- 0.3/205.4 MB ? eta -:--:--
     ---------------------------------------- 0.3/205.4 MB ? eta -:--:--
     -------------------------------------- 0.5/205.4 MB 621.2 kB/s eta 0:05:30
     -------------------------------------- 0.5/205.4 MB 621.2 kB/s eta 0:05:30
     -------------------------------------- 0.5/205.4 MB 621.2 kB/s eta 0:05:30
     -------------------------------------- 0.5/205.4 MB 621.2 kB/s eta 0:05:30
     -------------------------------------- 0.5/205.4 MB 621.2 kB/s eta 0:05:30
     -------------------------------------- 0.5/205.4 MB 621.2 kB/s eta 0:05:30
     -------------------------------------- 0.5/

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

# 1. Load the data
data = pd.read_csv('токсик коментс.csv')  # replace with the path to your data
X = data['Text']
y = data['IsToxic']

# 2. Train/test split (80/20, fixed seed so the hold-out set is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. TF-IDF text vectorization: unigrams and bigrams, at most 5000 features
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# 4. Build and tune the models
# The vectorizer is a pipeline step, so it is fitted as part of each pipeline
# fit. The same vectorizer object appears in both pipelines; GridSearchCV fits
# clones of the pipeline, so the two searches do not share fitted state.

# Logistic regression, tuned over regularization strength and solver
lr_pipeline = Pipeline([
    ('tfidf', tfidf),
    ('lr', LogisticRegression(max_iter=1000))
])
lr_params = {
    'lr__C': [0.1, 1, 10],
    'lr__solver': ['liblinear', 'lbfgs']
}
lr_grid = GridSearchCV(lr_pipeline, lr_params, cv=3, scoring='accuracy')
lr_grid.fit(X_train, y_train)

# Random forest, tuned over forest size, tree depth and split threshold.
# random_state is fixed so that the search and the reported scores are
# reproducible from run to run.
rf_pipeline = Pipeline([
    ('tfidf', tfidf),
    ('rf', RandomForestClassifier(random_state=42))
])
rf_params = {
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5, 10]
}
rf_grid = GridSearchCV(rf_pipeline, rf_params, cv=3, scoring='accuracy')
rf_grid.fit(X_train, y_train)

# 5. Evaluate the tuned models on the hold-out set
# best_estimator_ is the pipeline selected by each grid search.
lr_best, rf_best = lr_grid.best_estimator_, rf_grid.best_estimator_

y_pred_lr = lr_best.predict(X_test)
y_pred_rf = rf_best.predict(X_test)

# Accuracies first, then the per-class reports, in the same model order.
for _label, _preds in (
    ("Logistic Regression Accuracy:", y_pred_lr),
    ("Random Forest Accuracy:", y_pred_rf),
):
    print(_label, accuracy_score(y_test, _preds))

for _header, _preds in (
    ("\nClassification Report (Logistic Regression):\n", y_pred_lr),
    ("\nClassification Report (Random Forest):\n", y_pred_rf),
):
    print(_header, classification_report(y_test, _preds))


Logistic Regression Accuracy: 0.685
Random Forest Accuracy: 0.645

Classification Report (Logistic Regression):
               precision    recall  f1-score   support

       False       0.63      0.77      0.70        93
        True       0.76      0.61      0.67       107

    accuracy                           0.69       200
   macro avg       0.69      0.69      0.68       200
weighted avg       0.70      0.69      0.68       200


Classification Report (Random Forest):
               precision    recall  f1-score   support

       False       0.58      0.87      0.70        93
        True       0.80      0.45      0.57       107

    accuracy                           0.65       200
   macro avg       0.69      0.66      0.64       200
weighted avg       0.70      0.65      0.63       200

