In [None]:
#Импорт библиотек
import os
import pandas as pd
import numpy as np
import re
import torch.nn.functional as F
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, recall_score
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import warnings

In [None]:
# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')
# Ask the Hugging Face libraries to run offline.
# NOTE(review): `transformers` is already imported in the first cell, so a
# library that reads these variables at import time may not see them — confirm.
# The from_pretrained calls further down also pass local_files_only=True.
os.environ['TRANSFORMERS_OFFLINE'] = '1'
os.environ['HF_DATASETS_OFFLINE'] = '1'
# Repeats the tqdm import from the first cell.
from tqdm import tqdm
# Calls tqdm's pandas hook with disable=True (presumably to keep progress bars
# out of the output — nothing visible here uses progress_apply).
tqdm.pandas(disable=True)

In [None]:
# ==================== КОНФИГУРАЦИЯ ====================
# Single place for every tunable value used by train_and_validate().
CONFIG = {
    'data': {
        # Input CSVs; both are read with pd.read_csv and need a 'text' column
        # (train also needs 'label', test also needs 'id').
        'train_path': "/kaggle/input/train-dataset/train_v2_drcat_02.csv",
        'test_path': "/kaggle/input/llm-detect-ai-generated-text/test_essays.csv",
        # Local directory passed to AutoTokenizer / AutoModel.from_pretrained.
        'bert_path': "/kaggle/input/bert-basing",
        # Tokenizer padding/truncation length handed to TextDataset.
        'max_length': 128,
        # Seed for the StratifiedKFold shuffle.
        'random_state': 42,
        # Number of cross-validation folds; test predictions are averaged over them.
        'n_folds': 3
    },
    'tfidf': {
        # Arguments forwarded to TfidfVectorizer.
        'max_features': 10000,
        'ngram_range': (1, 2)
    },
    'svd': {
        # TruncatedSVD output size; also used as the model's tfidf_dim.
        'n_components': 300
    },
    'model': {
        # Constructor arguments of EnhancedHybridModel.
        'hidden_dim': 128,
        'dropout': 0.3,
        'lr': {
            # Learning rate for the tfidf projections, attention and classifier.
            'model': 1e-4,
            # NOTE(review): applied to model.bert_proj only; the BERT encoder's
            # own parameters are not in the optimizer — confirm this is intended.
            'bert': 2e-5
        }
    },
    'training': {
        'batch_size': 32,
        # Maximum epochs per fold.
        'epochs': 3,
        # Warm-up steps of the linear LR schedule.
        'warmup_steps': 100,
        # Epochs without validation-AUC improvement before early stopping.
        'patience': 2
    }
}

In [None]:
# ==================== КЛАСС ДАТАСЕТА ====================
class TextDataset(Dataset):
    """Dataset yielding TF-IDF/SVD features and BERT token tensors for each text.

    The TF-IDF -> SVD features are computed once, in a single batched call, when
    the dataset is built (the vectorizer and SVD must therefore already be
    fitted). Previously every ``__getitem__`` call re-ran both sklearn
    transformers on a single densified row, repeating the same work each epoch.

    Args:
        texts: Sequence of strings, indexable by integer position.
        tfidf_vectorizer: Fitted object with ``transform(list_of_texts)``.
        svd: Fitted object with ``transform(matrix)`` returning a 2-D array.
        tokenizer: Callable compatible with a Hugging Face tokenizer.
        max_length: Padding/truncation length passed to the tokenizer.
        labels: Optional sequence of numeric labels aligned with ``texts``.
    """

    def __init__(self, texts, tfidf_vectorizer, svd, tokenizer, max_length=128, labels=None):
        self.texts = texts
        self.labels = labels
        self.tfidf_vectorizer = tfidf_vectorizer
        self.svd = svd
        self.tokenizer = tokenizer
        self.max_length = max_length
        self._tfidf_features = self._compute_tfidf_features()

    def _compute_tfidf_features(self):
        """Return a float32 (n_texts, n_components) array of SVD-reduced TF-IDF."""
        if len(self.texts) == 0:
            # Nothing to transform; avoid calling the transformers on empty input.
            return np.zeros((0, 0), dtype=np.float32)
        # The TF-IDF output is handed to the SVD as-is (no dense
        # n_texts x vocabulary intermediate is created).
        tfidf_matrix = self.tfidf_vectorizer.transform(self.texts)
        return np.asarray(self.svd.transform(tfidf_matrix), dtype=np.float32)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 'tfidf_features', 'input_ids', 'attention_mask'
        and, when labels were given, 'labels' (a float tensor of shape (1,))."""
        text = self.texts[idx]

        # BERT tokenization, padded/truncated to a fixed length
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        item = {
            # torch.tensor copies, so callers cannot corrupt the cached features.
            'tfidf_features': torch.tensor(self._tfidf_features[idx], dtype=torch.float32),
            # return_tensors='pt' adds a leading batch axis of size 1; drop it.
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0)
        }

        if self.labels is not None:
            item['labels'] = torch.FloatTensor([self.labels[idx]])

        return item

In [None]:
# ==================== 1. ПОДГОТОВКА ДАННЫХ ====================
# Boilerplate phrases and placeholders stripped from essays, compiled once at
# import time. Order matters: the "sincerely, [your name]" pattern must run
# before the generic bracket pattern removes "[your name]".
_ARTIFACT_PATTERNS = tuple(
    re.compile(pattern, flags=re.IGNORECASE)
    for pattern in (
        r'sincerely,\s*\[your name\]',
        r'as an (8th|eighth\-grade) student',
        r'(writing|today) to express',
        r'hey there! so,',
        r'first impressions are',
        r'a four\-day school week',
        r'reduce traffic congestion[,\.]',
        r'i will explore',
        r'\[.*?\]',
        r'\b(please|kindly|thank you)\b',
        r'\dth grade',
        r'positive attitude is',
        r'personal growth and',
        r'career at a',
    )
)


def clean_text(text):
    """Remove known artifact phrases from ``text`` and normalise whitespace.

    Args:
        text: The essay text. ``None`` and NaN/NA (what pandas yields for an
            empty CSV cell) are treated as an empty string; any other
            non-string value is converted with ``str()``.

    Returns:
        The text with every artifact pattern removed (case-insensitively) and
        all whitespace runs collapsed to single spaces, with no leading or
        trailing whitespace.
    """
    if not isinstance(text, str):
        # Missing values would otherwise make re.sub raise a TypeError.
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    for pattern in _ARTIFACT_PATTERNS:
        text = pattern.sub('', text)

    # split()/join both collapses inner whitespace and trims the ends.
    return ' '.join(text.split())

def augment_data(df, n_samples=2000, random_state=42):
    """Augment ``df`` with synthetic essays spliced from AI/human text pairs.

    For each sampled (AI, human) pair two rows are appended:
    first half of the AI text + second half of the human text (label 1), and
    first half of the human text + second half of the AI text (label 0).

    Args:
        df: DataFrame with a 'text' column and a 'label' column (1 = AI, 0 = human).
        n_samples: Upper bound on the number of rows to add; ``n_samples // 2``
            pairs are used, capped by the size of the smaller class.
        random_state: Seed for the sampling so the augmentation is reproducible.
            Pass ``None`` to get a different sample on every call.

    Returns:
        A new DataFrame holding the original rows followed by the mixed rows,
        with a fresh 0..n-1 index (so index labels are unique). ``df`` itself is
        not modified. Columns other than 'text'/'label' are NaN for added rows.
    """
    # Filter each class once instead of once for the count and once for the sample.
    ai_pool = df.loc[df['label'] == 1, 'text']
    human_pool = df.loc[df['label'] == 0, 'text']

    # Pairs are formed one-to-one, so the smaller class bounds the pair count.
    n_pairs = max(0, min(n_samples // 2, len(ai_pool), len(human_pool)))
    if n_pairs == 0:
        return df.reset_index(drop=True)

    ai_texts = ai_pool.sample(n_pairs, random_state=random_state).tolist()
    human_texts = human_pool.sample(n_pairs, random_state=random_state).tolist()

    mixed_samples = []
    for ai, human in zip(ai_texts, human_texts):
        # Splice the halves of the two texts together
        mixed_ai_human = ai[:len(ai)//2] + human[len(human)//2:]
        mixed_human_ai = human[:len(human)//2] + ai[len(ai)//2:]

        mixed_samples.extend([
            {'text': mixed_ai_human, 'label': 1},
            {'text': mixed_human_ai, 'label': 0}
        ])

    # ignore_index avoids duplicate index labels between old and new rows.
    return pd.concat([df, pd.DataFrame(mixed_samples)], ignore_index=True)

def analyze_data(train_df, test_df):
    """Print the label distribution and plot text length per label.

    Args:
        train_df: DataFrame with 'label' and 'text' columns. It is only read;
            the helper column used for plotting lives in a private frame, so
            the caller's DataFrame is not modified.
        test_df: Accepted for interface compatibility; currently unused.
    """
    print("\n=== АНАЛИЗ ДАННЫХ ===")

    # Label distribution
    print("\nРаспределение меток:")
    print(train_df['label'].value_counts(normalize=True))

    # Text lengths, collected in a separate frame with a fresh index so the
    # plot does not depend on (possibly duplicated) index labels of train_df.
    plot_df = pd.DataFrame({
        'label': train_df['label'].to_numpy(),
        'length': train_df['text'].apply(len).to_numpy(),
    })
    plt.figure(figsize=(10, 4))
    sns.boxplot(x='label', y='length', data=plot_df)
    plt.title("Распределение длины текстов")
    plt.show()

In [None]:
# ==================== 2. МОДЕЛЬ ====================
class EnhancedHybridModel(nn.Module):
    """Hybrid classifier fusing TF-IDF/SVD features with a BERT embedding.

    Both inputs are projected to ``hidden_dim``, combined by a learned
    attention weighting over the two modalities, and passed to a small MLP
    ending in a sigmoid, so the output is a probability in [0, 1].

    Args:
        tfidf_dim: Size of the TF-IDF/SVD feature vector.
        bert_dim: Size of the BERT embedding vector.
        hidden_dim: Common projection size.
        dropout: Dropout probability used in the projections and classifier.
    """
    def __init__(self, tfidf_dim, bert_dim, hidden_dim=128, dropout=0.3):
        super().__init__()
        # Feature projections
        self.tfidf_proj1 = nn.Linear(tfidf_dim, hidden_dim)
        self.tfidf_proj2 = nn.Sequential(
            nn.BatchNorm1d(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout)
        )

        self.bert_proj = nn.Sequential(
            nn.Linear(bert_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout)
        )

        # Attention over the two modalities (simplified version).
        # The scored tensor is (batch, 2, 1): the softmax must run over dim=1
        # (the modality axis). dim=0 normalised across the batch, which made a
        # sample's prediction depend on the other samples in its batch.
        self.attention = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, 1),
            nn.Softmax(dim=1)
        )

        # Classifier head
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim//2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim//2, 1),
            nn.Sigmoid()
        )

    def forward(self, tfidf_features, bert_features):
        """Return probabilities of shape (batch, 1).

        Args:
            tfidf_features: Tensor of shape (batch, tfidf_dim).
            bert_features: Tensor of shape (batch, bert_dim).
        """
        # Project both inputs to hidden_dim
        tfidf_out = self.tfidf_proj1(tfidf_features)
        tfidf_out = self.tfidf_proj2(tfidf_out)
        bert_out = self.bert_proj(bert_features)

        # Fuse via attention: (batch, 2, hidden) weighted by (batch, 2, 1)
        combined = torch.stack([tfidf_out, bert_out], dim=1)
        attn_weights = self.attention(combined)
        attended = (combined * attn_weights).sum(dim=1)

        return self.classifier(attended)

In [None]:
# ==================== 3. ОБУЧЕНИЕ ====================
def evaluate(model, val_loader, bert_model, device):
    """Score ``model`` on a validation loader.

    Both networks are switched to eval mode and run without gradients. The
    first-token embedding of BERT's last hidden state is fed to ``model``
    together with the batch's TF-IDF features.

    Returns:
        Dict with 'auc' (on raw probabilities) and 'f1', 'acc', 'recall_ai'
        (on predictions thresholded at 0.5).
    """
    model.eval()
    bert_model.eval()

    scores = []
    targets = []
    with torch.no_grad():
        for raw_batch in val_loader:
            on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}

            encoded = bert_model(
                input_ids=on_device['input_ids'],
                attention_mask=on_device['attention_mask'])
            first_token = encoded.last_hidden_state[:, 0, :]

            probabilities = model(on_device['tfidf_features'], first_token)

            scores.extend(probabilities.cpu().numpy().flatten())
            targets.extend(on_device['labels'].cpu().numpy().flatten())

    preds = np.array(scores)
    labels = np.array(targets)
    # Hard decisions shared by every threshold-based metric.
    hard_preds = (preds > 0.5).astype(int)

    metrics = {}
    metrics['auc'] = roc_auc_score(labels, preds)
    metrics['f1'] = f1_score(labels, hard_preds)
    metrics['acc'] = accuracy_score(labels, hard_preds)
    metrics['recall_ai'] = recall_score(labels, hard_preds, pos_label=1)
    return metrics

def train_epoch(model, train_loader, bert_model, criterion, optimizer, scheduler, device):
    """Train ``model`` for one pass over ``train_loader``.

    BERT is only put in train mode and run under autograd when at least one of
    its trainable parameters is registered with ``optimizer``. Otherwise it is
    a frozen feature extractor: it runs in eval mode (dropout off, matching
    what ``evaluate`` sees) and without building a backward graph that nothing
    would ever consume.

    Args:
        model: The hybrid classifier being optimised.
        train_loader: Yields dicts with 'tfidf_features', 'input_ids',
            'attention_mask' and 'labels' tensors.
        bert_model: Encoder whose first-token embedding is fed to ``model``.
        criterion: Loss taking (outputs, labels).
        optimizer: Stepped once per batch.
        scheduler: LR scheduler, stepped once per batch after the optimizer.
        device: Device the batch tensors are moved to.

    Returns:
        Mean loss per batch (sum of batch losses / number of batches).
    """
    model.train()

    optimized_ids = {id(p) for group in optimizer.param_groups for p in group['params']}
    bert_is_trained = any(
        p.requires_grad and id(p) in optimized_ids for p in bert_model.parameters())
    bert_model.train(bert_is_trained)

    total_loss = 0

    for batch in tqdm(train_loader, desc='Training'):
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()

        # Gradients flow through BERT only when it is actually being optimised.
        with torch.set_grad_enabled(bert_is_trained):
            bert_outputs = bert_model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'])
            bert_features = bert_outputs.last_hidden_state[:, 0, :]

        outputs = model(batch['tfidf_features'], bert_features)

        loss = criterion(outputs, batch['labels'])
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    return total_loss / len(train_loader)

In [None]:
def train_and_validate():
    """Run the whole pipeline: load data, cross-validate, predict, write submission.

    Reads the train/test CSVs named in CONFIG, cleans and augments the training
    texts, then for every stratified fold fits TF-IDF + SVD on the fold's
    training part, trains an EnhancedHybridModel on top of a frozen BERT
    encoder with early stopping on validation AUC, and predicts the test set
    with the best checkpoint. Fold predictions are averaged and written to
    'submission.csv'; per-fold checkpoints go to 'best_model_fold{fold}.pt'.
    """
    # Initialisation
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Seed every source of randomness used below (pandas sampling falls back to
    # numpy's global state; DataLoader shuffling and weight init use torch's).
    seed = CONFIG['data']['random_state']
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Load and clean the data
    train = pd.read_csv(CONFIG['data']['train_path'])
    test = pd.read_csv(CONFIG['data']['test_path'])
    train['text'] = train['text'].apply(clean_text)
    test['text'] = test['text'].apply(clean_text)

    # Augment; reset the index so added rows do not share labels with old ones.
    train = augment_data(train).reset_index(drop=True)
    analyze_data(train, test)

    # TF-IDF and SVD (refitted inside every fold)
    tfidf = TfidfVectorizer(
        max_features=CONFIG['tfidf']['max_features'],
        ngram_range=CONFIG['tfidf']['ngram_range'])
    svd = TruncatedSVD(
        n_components=CONFIG['svd']['n_components'],
        random_state=seed)

    # Cross-validation
    skf = StratifiedKFold(
        n_splits=CONFIG['data']['n_folds'],
        shuffle=True,
        random_state=CONFIG['data']['random_state'])

    # The tokenizer does not change between folds, so load it once.
    tokenizer = AutoTokenizer.from_pretrained(CONFIG['data']['bert_path'], local_files_only=True)
    batch_size = CONFIG['training']['batch_size']

    fold_metrics = []
    test_preds = np.zeros(len(test))

    for fold, (train_idx, val_idx) in enumerate(skf.split(train['text'], train['label'])):
        print(f"\n=== Fold {fold+1}/{CONFIG['data']['n_folds']} ===")

        # Split the data
        train_df, val_df = train.iloc[train_idx], train.iloc[val_idx]

        # Fit TF-IDF and SVD on the training part of the fold only
        tfidf_matrix = tfidf.fit_transform(train_df['text'])
        svd.fit(tfidf_matrix)

        # Datasets and loaders
        train_dataset = TextDataset(
            train_df['text'].tolist(),
            tfidf, svd, tokenizer,
            max_length=CONFIG['data']['max_length'],
            labels=train_df['label'].tolist())

        val_dataset = TextDataset(
            val_df['text'].tolist(),
            tfidf, svd, tokenizer,
            max_length=CONFIG['data']['max_length'],
            labels=val_df['label'].tolist())

        test_dataset = TextDataset(
            test['text'].tolist(),
            tfidf, svd, tokenizer,
            max_length=CONFIG['data']['max_length'])

        # BatchNorm1d cannot normalise a single sample in training mode, so a
        # trailing batch of exactly one element is dropped.
        drop_single = (
            len(train_dataset) > batch_size and len(train_dataset) % batch_size == 1)
        train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True,
            drop_last=drop_single)

        val_loader = DataLoader(
            val_dataset,
            batch_size=batch_size)

        test_loader = DataLoader(
            test_dataset,
            batch_size=batch_size)

        # Models
        bert_model = AutoModel.from_pretrained(CONFIG['data']['bert_path'], local_files_only=True).to(device)
        # None of BERT's parameters are given to the optimizer below and only
        # `model` is checkpointed, so BERT is a fixed feature extractor. Freeze
        # it explicitly so no gradients are computed or stored for it.
        bert_model.requires_grad_(False)

        model = EnhancedHybridModel(
            tfidf_dim=CONFIG['svd']['n_components'],
            # Read the embedding size from the loaded encoder instead of assuming 768.
            bert_dim=bert_model.config.hidden_size,
            hidden_dim=CONFIG['model']['hidden_dim'],
            dropout=CONFIG['model']['dropout']).to(device)

        # Optimizer with per-group learning rates.
        # NOTE(review): the 'bert' rate is applied to model.bert_proj, not to
        # the encoder itself — confirm this is intended.
        optimizer = torch.optim.AdamW([
            {'params': model.tfidf_proj1.parameters(), 'lr': CONFIG['model']['lr']['model']},
            {'params': model.tfidf_proj2.parameters(), 'lr': CONFIG['model']['lr']['model']},
            {'params': model.bert_proj.parameters(), 'lr': CONFIG['model']['lr']['bert']},
            {'params': model.attention.parameters(), 'lr': CONFIG['model']['lr']['model']},
            {'params': model.classifier.parameters(), 'lr': CONFIG['model']['lr']['model']}
        ])

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=CONFIG['training']['warmup_steps'],
            num_training_steps=len(train_loader)*CONFIG['training']['epochs'])

        criterion = nn.BCELoss()

        # Training. best_auc starts below any valid AUC so the first epoch
        # always writes a checkpoint; otherwise an AUC of exactly 0 would leave
        # none (or a stale file from an earlier run) to load afterwards.
        checkpoint_path = f'best_model_fold{fold}.pt'
        best_auc = float('-inf')
        patience = 0

        for epoch in range(CONFIG['training']['epochs']):
            train_loss = train_epoch(
                model, train_loader, bert_model,
                criterion, optimizer, scheduler, device)

            val_metrics = evaluate(model, val_loader, bert_model, device)
            print(f"\nEpoch {epoch+1}:")
            print(f"Train Loss: {train_loss:.4f}")
            print(f"Val AUC: {val_metrics['auc']:.4f}")
            print(f"Val Recall (AI): {val_metrics['recall_ai']:.4f}")

            # Early stopping
            if val_metrics['auc'] > best_auc:
                best_auc = val_metrics['auc']
                patience = 0
                torch.save(model.state_dict(), checkpoint_path)
            else:
                patience += 1
                if patience >= CONFIG['training']['patience']:
                    print(f"Early stopping at epoch {epoch+1}")
                    break

        fold_metrics.append(best_auc)

        # Test prediction with the best checkpoint of this fold
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
        model.eval()
        # Do not rely on evaluate() having been the last call to touch BERT.
        bert_model.eval()

        fold_preds = []
        with torch.no_grad():
            for batch in test_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                bert_outputs = bert_model(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'])

                outputs = model(
                    batch['tfidf_features'],
                    bert_outputs.last_hidden_state[:, 0, :])

                fold_preds.extend(outputs.cpu().numpy().flatten())

        # Each fold contributes an equal share to the averaged prediction.
        test_preds += np.array(fold_preds) / CONFIG['data']['n_folds']

    # Save the results
    submission = pd.DataFrame({
        'id': test['id'],
        'generated': test_preds
    })
    submission.to_csv('submission.csv', index=False)

    print("\n=== ИТОГОВЫЕ МЕТРИКИ ===")
    print(f"Средний AUC по фолдам: {np.mean(fold_metrics):.4f} (±{np.std(fold_metrics):.4f})")

In [None]:
# Entry point: run the full training / validation / submission pipeline when
# this code is executed as the main module.
if __name__ == "__main__":
    train_and_validate()