In [None]:
#Импорт библиотек

import os
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Central experiment configuration, assembled one section at a time.
CONFIG = {}

# Input CSV locations, the shared random seed, and the number of CV folds.
CONFIG['data'] = {
    'train_path': "/kaggle/input/train-dataset/train_v2_drcat_02.csv",
    'test_path': "/kaggle/input/llm-detect-ai-generated-text/test_essays.csv",
    'random_state': 42,
    'n_folds': 3,
}

# TF-IDF vectorizer settings: vocabulary cap and n-gram range (unigrams + bigrams).
CONFIG['tfidf'] = {
    'max_features': 10000,
    'ngram_range': (1, 2),
}

# Number of components kept by the TruncatedSVD step.
CONFIG['svd'] = {
    'n_components': 300,
}

# Classifier selection plus per-classifier constructor keyword arguments.
CONFIG['model'] = {
    'type': 'logistic',  # one of 'logistic', 'random_forest' or 'svm'
    'params': {
        'logistic': {'C': 1.0, 'max_iter': 1000},
        'random_forest': {'n_estimators': 100, 'max_depth': 5},
        'svm': {'C': 1.0, 'kernel': 'linear'},
    },
}


In [None]:
# Подготовка датасетов

# Boilerplate phrases and template leftovers stripped from every essay.
# Compiled once (case-insensitive) so that applying clean_text to a whole
# DataFrame column does not re-resolve each pattern for every row.
_ARTIFACT_PATTERNS = tuple(
    re.compile(pattern, flags=re.IGNORECASE)
    for pattern in (
        r'sincerely,\s*\[your name\]',
        r'as an (8th|eighth\-grade) student',
        r'(writing|today) to express',
        r'hey there! so,',
        r'first impressions are',
        r'a four\-day school week',
        # NOTE(review): this character class also matches a literal backslash;
        # `[,.]` was probably intended. Kept unchanged to preserve behavior.
        r'reduce traffic congestion[,\\.]',
        r'i will explore',
        r'\[.*?\]',  # any bracketed placeholder such as "[Your Name]"
        r'\b(please|kindly|thank you)\b',
        r'\dth grade',
        r'positive attitude is',
        r'personal growth and',
        r'career at a',
    )
)


def clean_text(text):
    """Remove known boilerplate artifacts from a text and normalize whitespace.

    Every pattern in ``_ARTIFACT_PATTERNS`` is deleted (case-insensitively, in
    order), then all runs of whitespace are collapsed to single spaces and
    leading/trailing whitespace is dropped.

    Args:
        text: The raw text. ``None`` and float NaN (what pandas yields for an
            empty CSV cell) are treated as an empty text; any other non-string
            value is converted with ``str()``.

    Returns:
        str: The cleaned text; ``''`` for missing input.
    """
    # Missing values would otherwise make re.sub raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    for pattern in _ARTIFACT_PATTERNS:
        text = pattern.sub('', text)

    # split()/join already removes leading and trailing whitespace.
    return ' '.join(text.split())

def augment_data(df, n_samples=2000, random_state=None):
    """Augment a labelled data set with texts spliced from AI and human essays.

    Up to ``n_samples // 2`` texts are sampled from each class (fewer if a
    class is smaller than that). The samples are paired up, and each pair
    yields two new rows:

    * first half of the AI text + second half of the human text, label 1;
    * first half of the human text + second half of the AI text, label 0.

    The new rows only carry ``text`` and ``label``; any other columns of
    ``df`` are left missing (NaN) for them.

    Args:
        df: DataFrame with a ``text`` column of strings and a ``label`` column
            where 1 marks AI-generated and 0 marks human-written text.
        n_samples: Upper bound on the number of generated rows.
        random_state: Seed (or random state object) forwarded to
            ``Series.sample`` for both classes. With the default ``None``
            pandas draws from the global NumPy random state, as before.

    Returns:
        pandas.DataFrame: The rows of ``df`` followed by the generated rows,
        with a fresh 0..n-1 index so that index labels stay unique.
    """
    ai_pool = df.loc[df['label'] == 1, 'text']
    human_pool = df.loc[df['label'] == 0, 'text']

    n_ai_samples = min(n_samples // 2, len(ai_pool))
    n_human_samples = min(n_samples // 2, len(human_pool))

    ai_texts = ai_pool.sample(n_ai_samples, random_state=random_state).tolist()
    human_texts = human_pool.sample(n_human_samples, random_state=random_state).tolist()

    mixed_samples = []
    # zip stops at the shorter list, so surplus samples of the larger class are unused.
    for ai, human in zip(ai_texts, human_texts):
        ai_mid, human_mid = len(ai) // 2, len(human) // 2
        mixed_samples.append({'text': ai[:ai_mid] + human[human_mid:], 'label': 1})
        mixed_samples.append({'text': human[:human_mid] + ai[ai_mid:], 'label': 0})

    if not mixed_samples:
        # Nothing to add (one class is empty or n_samples < 2); avoid
        # concatenating an empty, column-less frame.
        return df.reset_index(drop=True)

    # ignore_index prevents the generated rows from reusing labels 0..k-1
    # that already exist in df's index.
    return pd.concat([df, pd.DataFrame(mixed_samples)], ignore_index=True)

def analyze_data(train_df, test_df):
    """Print the label distribution and plot text length per label.

    Prints the normalized value counts of ``train_df['label']`` and shows a
    box plot of character length of ``train_df['text']`` grouped by label.
    The input frames are not modified.

    Args:
        train_df: DataFrame with ``text`` and ``label`` columns.
        test_df: Currently unused; accepted so existing callers keep working.

    Returns:
        None
    """
    print("\n=== АНАЛИЗ ДАННЫХ ===")
    print("\nРаспределение меток:")
    print(train_df['label'].value_counts(normalize=True))

    # Build a separate plotting frame instead of adding a 'length' column to
    # the caller's DataFrame. Plain arrays are used so that a non-unique index
    # on train_df cannot trigger index alignment; .str.len() tolerates
    # missing texts where len() would raise.
    length_df = pd.DataFrame({
        'label': train_df['label'].to_numpy(),
        'length': train_df['text'].str.len().to_numpy(),
    })

    fig, ax = plt.subplots(figsize=(10, 4))
    sns.boxplot(x='label', y='length', data=length_df, ax=ax)
    ax.set_title("Распределение длины текстов")
    plt.show()

In [None]:
#Модель

def _build_model(model_type):
    """Instantiate the classifier named by ``model_type`` using CONFIG parameters.

    Raises:
        ValueError: If ``model_type`` is not 'logistic', 'random_forest' or 'svm'.
    """
    seed = CONFIG['data']['random_state']
    params = CONFIG['model']['params']

    if model_type == 'logistic':
        return LogisticRegression(**params['logistic'], random_state=seed)
    if model_type == 'random_forest':
        return RandomForestClassifier(**params['random_forest'], random_state=seed)
    if model_type == 'svm':
        # probability=True is needed because the pipeline calls predict_proba.
        return SVC(**params['svm'], probability=True, random_state=seed)
    raise ValueError(f"Unknown model type: {model_type}")


def train_and_validate():
    """Run cross-validated training and write averaged test predictions.

    Steps:
      1. Load the train/test CSVs named in CONFIG and clean their texts.
      2. Print/plot a short analysis of the cleaned training data.
      3. For each stratified fold: augment the *training part only*, fit the
         TF-IDF -> SVD -> scaler -> classifier pipeline, report validation
         AUC / F1 / accuracy / recall at a 0.5 threshold, and add the fold's
         test probabilities (divided by the number of folds) to the running
         average.
      4. Write ``submission_tfidf_svd.csv`` with columns ``id`` and
         ``generated`` and print the mean and standard deviation of fold AUC.

    Augmentation is done inside the fold loop so that spliced halves of
    validation essays never end up in the training data and validation is
    scored on real essays only.

    Side effects: seeds the global NumPy random state (so the sampling done
    by augmentation is reproducible), prints metrics, shows a plot, and
    writes the submission file to the working directory.

    Raises:
        ValueError: If CONFIG['model']['type'] names an unknown model. This is
            checked before any data is loaded.
    """
    seed = CONFIG['data']['random_state']
    n_folds = CONFIG['data']['n_folds']

    # Build the model first so a bad config fails before the slow data loading.
    model = _build_model(CONFIG['model']['type'])

    # Load and clean the data.
    train = pd.read_csv(CONFIG['data']['train_path'])
    test = pd.read_csv(CONFIG['data']['test_path'])
    train['text'] = train['text'].apply(clean_text)
    test['text'] = test['text'].apply(clean_text)

    analyze_data(train, test)

    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(
            max_features=CONFIG['tfidf']['max_features'],
            ngram_range=CONFIG['tfidf']['ngram_range'])),
        # TruncatedSVD uses a randomized solver by default; seed it so that
        # repeated runs give the same components.
        ('svd', TruncatedSVD(
            n_components=CONFIG['svd']['n_components'],
            random_state=seed)),
        ('scaler', StandardScaler()),
        ('model', model)
    ])

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

    fold_metrics = []
    test_preds = np.zeros(len(test))

    for fold, (train_idx, val_idx) in enumerate(skf.split(train['text'], train['label'])):
        print(f"\n=== Fold {fold+1}/{n_folds} ===")

        train_df, val_df = train.iloc[train_idx], train.iloc[val_idx]

        # Augment only the training part of the fold. The global NumPy RNG is
        # seeded because pandas' sample() falls back to it when no
        # random_state is passed.
        np.random.seed(seed + fold)
        train_df = augment_data(train_df)

        # fit() refits every pipeline step, so reusing the object across folds is safe.
        pipeline.fit(train_df['text'], train_df['label'])

        val_probs = pipeline.predict_proba(val_df['text'])[:, 1]
        val_labels = val_df['label'].values
        val_pred_labels = (val_probs > 0.5).astype(int)

        auc = roc_auc_score(val_labels, val_probs)
        f1 = f1_score(val_labels, val_pred_labels)
        acc = accuracy_score(val_labels, val_pred_labels)
        recall_ai = recall_score(val_labels, val_pred_labels, pos_label=1)

        print(f"Val AUC: {auc:.4f}")
        print(f"Val F1: {f1:.4f}")
        print(f"Val Accuracy: {acc:.4f}")
        print(f"Val Recall (AI): {recall_ai:.4f}")

        fold_metrics.append(auc)

        # Each fold contributes 1/n_folds of the final test probability.
        test_preds += pipeline.predict_proba(test['text'])[:, 1] / n_folds

    submission = pd.DataFrame({
        'id': test['id'],
        'generated': test_preds
    })
    submission.to_csv('submission_tfidf_svd.csv', index=False)

    print("\n=== ИТОГОВЫЕ МЕТРИКИ ===")
    print(f"Средний AUC по фолдам: {np.mean(fold_metrics):.4f} (±{np.std(fold_metrics):.4f})")

In [None]:
# Entry point: run the full train / validate / submit pipeline only when this
# code is executed as the main module, not when it is imported.
if __name__ == "__main__":
    train_and_validate()