In [None]:
import re
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset
from rouge_score import rouge_scorer

from src.data_utils import clean_string
from src.data_utils import save_results_to_file
from src.data_utils import save_selection_to_file
from src.lstm_model import LSTMLanguageModel
from src.lstm_model import calculate_rouge_batch


# --- Tokenization ---
MODEL_NAME = 'distilgpt2'  # checkpoint name passed to the Hugging Face loaders
VOCAB_SIZE = 50257  # for the GPT-2 tokenizer
MAX_SEQUENCE_LEN = 80  # max_length used when tokenizing

# --- LSTM architecture ---
EMBEDDING_DIM = 256
HIDDEN_DIM = 256
NUM_LAYERS = 2

# --- Optimization ---
BATCH_SIZE = 256
LEARNING_RATE = 0.001
NUM_EPOCHS = 5

# --- Output / runtime ---
OUTPUT_FILE = 'results/all_experiments.json'
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
###### STAGE 1. Load the dataset, clean, tokenize, split
# Load the dataset.
#
# Because of problems starting the VM, training is run locally on the CPU on a
# small sample, just to demonstrate that the code works.
#
raw = pd.read_csv(
    './data/tweets_small.txt',
    sep='\t',
    header=None,
    names=['tweets'],
    on_bad_lines='skip',  # rows pandas cannot parse are dropped instead of raising
)

tweets = raw['tweets']
# Clean the texts: clean_string is applied to every row of the single column.
cleaned_tweets = tweets.apply(clean_string)

# Write the cleaned texts back out, one per row, without index or header.
df = cleaned_tweets.to_frame(name='tweets')
df.to_csv('./data/cleaned_tweets_small.txt', index=False, header=False, encoding='utf-8')

In [None]:
# Build the tokenizer for MODEL_NAME. AutoTokenizer is already imported in the
# notebook's import cell, so it is not re-imported here.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Left-padding is needed for decoder-only models. Because of it, padding sits at
# the START of every row, so real tokens must be located via the attention mask.
tokenizer.padding_side = 'left'

In [None]:
# Tokenize the texts
def tokenize_texts(texts):
    """Encode a collection of texts with the notebook-level ``tokenizer``.

    Args:
        texts: Object exposing ``.tolist()`` (the notebook passes a pandas
            Series) whose elements are the strings to encode.

    Returns:
        The tokenizer's output for the whole collection, requested as PyTorch
        tensors (``return_tensors="pt"``), with truncation enabled at
        ``MAX_SEQUENCE_LEN`` tokens and padding enabled.
    """
    encode_options = dict(
        truncation=True,
        padding=True,
        max_length=MAX_SEQUENCE_LEN,
        return_tensors="pt",
    )
    return tokenizer(texts.tolist(), **encode_options)


# Tokenize all cleaned texts
tokenized_data = tokenize_texts(cleaned_tweets)
input_ids, attention_mask = tokenized_data['input_ids'], tokenized_data['attention_mask']

# Split into train (80%), validation (10%), test (10%).
# Step 1: hold out 10% of all rows as the test split.
X_temp, X_test, mask_temp, mask_test = train_test_split(
    input_ids,
    attention_mask,
    test_size=0.1,
    random_state=42,
)

# Step 2: take 0.111 (= 10% / 90%) of the remainder as the validation split.
X_train, X_val, mask_train, mask_val = train_test_split(
    X_temp,
    mask_temp,
    test_size=0.111,
    random_state=42,
)

print("Размеры выборок:")
for split_label, split_ids in (("Train", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_label}: {split_ids.shape[0]} samples")

# Hand every split (ids + mask) to save_selection_to_file with its target path.
for split_ids, split_mask, split_path in (
    (X_train, mask_train, './data/train.csv'),
    (X_val, mask_val, './data/val.csv'),
    (X_test, mask_test, './data/test.csv'),
):
    save_selection_to_file(split_ids, split_mask, tokenizer, split_path)

In [None]:
# Build a TensorDataset and a DataLoader for each split.
# Every dataset item is an (input_ids, attention_mask) pair.

# Training data is reshuffled on each pass over the loader.
train_dataset = TensorDataset(X_train, mask_train)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)

# Validation and test data keep their order.
val_dataset = TensorDataset(X_val, mask_val)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(X_test, mask_test)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
#####################################################
#        Stage 3. Working with the LSTM model       #
#####################################################

print("=" * 60)
print("ЧАСТЬ 1: ОБУЧЕНИЕ LSTM МОДЕЛИ")
print("=" * 60)

# Seed PyTorch before anything random happens (weight initialisation, DataLoader
# shuffling, sampling during generation) so that a Restart & Run All reproduces
# the same numbers. 42 matches the random_state used for the data split.
torch.manual_seed(42)

# The embedding/output size is hard-coded; fail early with a clear message if the
# tokenizer can produce ids outside that range instead of crashing mid-training.
assert len(tokenizer) <= VOCAB_SIZE, (
    f"VOCAB_SIZE={VOCAB_SIZE} is smaller than the tokenizer vocabulary ({len(tokenizer)})"
)

# Model initialisation
lstm_model = LSTMLanguageModel(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS).to(DEVICE)
# Positions whose target is the padding token do not contribute to the loss.
# NOTE(review): pad_token is set to eos_token in this notebook, so genuine EOS
# targets are ignored as well — confirm this is intended.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(lstm_model.parameters(), lr=LEARNING_RATE)

In [None]:
# Train the model.
# Per-epoch history: mean training loss and the ROUGE dict returned for validation.
train_losses = []
val_rouge_scores = []

print("Начинаем обучение LSTM модели...")
for epoch in range(NUM_EPOCHS):
    # Re-enter training mode at the start of every epoch.
    lstm_model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{NUM_EPOCHS}')

    for batch_inputs, batch_masks in progress_bar:
        batch_inputs, batch_masks = batch_inputs.to(DEVICE), batch_masks.to(DEVICE)

        # Next-token prediction: inputs drop the last token, targets drop the first.
        X, y = batch_inputs[:, :-1], batch_inputs[:, 1:]

        optimizer.zero_grad()
        # The model returns a pair; only the first element is used here.
        output, _ = lstm_model(X)

        # Flatten to (N, VOCAB_SIZE) scores and (N,) targets for the loss function.
        flat_scores = output.reshape(-1, VOCAB_SIZE)
        flat_targets = y.reshape(-1)
        loss = criterion(flat_scores, flat_targets)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        progress_bar.set_postfix({'loss': f'{batch_loss:.4f}'})

    # Mean loss over the batches of this epoch.
    avg_loss = total_loss / len(train_loader)
    train_losses.append(avg_loss)

    # Validation: compute ROUGE on the validation loader.
    rouge_scores = calculate_rouge_batch(
        lstm_model, tokenizer, val_loader, DEVICE, BATCH_SIZE, 'lstm', num_samples=50
    )
    val_rouge_scores.append(rouge_scores)

    rouge_summary = ', '.join(
        f'{rouge_label} = {rouge_scores[rouge_key]:.4f}'
        for rouge_key, rouge_label in (
            ('rouge1', 'ROUGE-1'),
            ('rouge2', 'ROUGE-2'),
            ('rougeL', 'ROUGE-L'),
        )
    )
    print(f'Epoch {epoch+1}: Loss = {avg_loss:.4f}, {rouge_summary}')


In [None]:
# Evaluate on the test split
print("\nТестирование на тестовой выборке...")
lstm_test_rouge = calculate_rouge_batch(
    lstm_model, tokenizer, test_loader, DEVICE, BATCH_SIZE, 'lstm', num_samples=100
)
for rouge_key, rouge_label in (('rouge1', 'ROUGE-1'), ('rouge2', 'ROUGE-2'), ('rougeL', 'ROUGE-L')):
    print(f"Test {rouge_label}: {lstm_test_rouge[rouge_key]:.4f}")

# Save the results to the shared results file, tagged with the LSTM hyperparameters.
lstm_extra_info = {'hidden_dim': HIDDEN_DIM, 'num_layers': NUM_LAYERS}
save_results_to_file(
    lstm_test_rouge,
    OUTPUT_FILE,
    'LSTM_custom',
    MAX_SEQUENCE_LEN,
    'lstm',
    experiment_name='LSTM_256_hidden',
    additional_info=lstm_extra_info,
)

In [None]:
# Generation examples
print("\nПримеры автодополнений:")
# LSTM generation examples
print("\nПримеры автодополнений LSTM:")

# Only sampling happens below, so put the model in evaluation mode.
lstm_model.eval()

# Never index past the end of a very small test split.
for i in range(min(3, len(X_test))):
    sample_input = X_test[i]
    # The tokenizer pads on the LEFT, so the real tokens are not at the start of
    # the row. Select them through the attention mask instead of slicing from 0,
    # otherwise the prompt/reference would consist mostly of padding.
    real_ids = sample_input[mask_test[i].bool()]
    real_length = real_ids.shape[0]
    # First 75% of the real tokens form the prompt, the rest is the reference.
    prompt_length = int(real_length * 0.75)
    if prompt_length < 1:
        # Too short to split into prompt + continuation; an empty prompt cannot
        # be fed to the model.
        print(f"\nLSTM Пример {i+1}: пропущен (слишком короткий текст)")
        continue

    prompt_ids = real_ids[:prompt_length]
    reference_ids = real_ids[prompt_length:]

    prompt_text = tokenizer.decode(prompt_ids, skip_special_tokens=True)
    reference_text = tokenizer.decode(reference_ids, skip_special_tokens=True)

    # The model lives on DEVICE, so the prompt must be moved there as well;
    # gradients are not needed for generation.
    with torch.no_grad():
        generated = lstm_model.generate(
            prompt_ids.unsqueeze(0).to(DEVICE), max_new_tokens=20, temperature=0.7
        )
    # Keep only what follows the prompt
    # (assumes generate returns prompt + new tokens — TODO confirm).
    generated_tokens = generated[0, prompt_ids.shape[0]:]
    generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    print(f"\nLSTM Пример {i+1}:")
    print(f"Промпт: '{prompt_text}'")
    print(f"Эталон: '{reference_text}'")
    print(f"Сгенерировано: '{generated_text}'")

In [None]:
#####################################################
#     Stage 4. Using a pretrained transformer       #
#####################################################
section_rule = "=" * 60
print("\n" + section_rule)
print("ЧАСТЬ 2: ТЕСТИРОВАНИЕ TRANSFORMER МОДЕЛИ")
print(section_rule)

# Load the pretrained model, move it to DEVICE and switch it to evaluation mode.
transformer_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
transformer_model = transformer_model.to(DEVICE)
transformer_model.eval()

# Compute ROUGE metrics on the test split
print("Тестирование Transformer на тестовой выборке...")
transformer_test_rouge = calculate_rouge_batch(
    transformer_model, tokenizer, test_loader, DEVICE, BATCH_SIZE, 'transformer',
    num_samples=100,
)

for rouge_key, rouge_label in (('rouge1', 'ROUGE-1'), ('rouge2', 'ROUGE-2'), ('rougeL', 'ROUGE-L')):
    print(f"Transformer Test {rouge_label}: {transformer_test_rouge[rouge_key]:.4f}")

# Save the results — same output file as the LSTM experiment.
save_results_to_file(
    transformer_test_rouge,
    OUTPUT_FILE,
    'distilgpt2',
    MAX_SEQUENCE_LEN,
    model_type='transformer',
    experiment_name='DistilGPT2_baseline',
)

# Additional statistics
print("\n=== ДОПОЛНИТЕЛЬНАЯ СТАТИСТИКА ===")
print(f"Размер тренировочной выборки: {X_train.shape[0]} примеров")
print(f"Размер валидационной выборки: {X_val.shape[0]} примеров")
print(f"Общий размер датасета: {len(cleaned_tweets)} твитов")

# Show where the results were written
print("\nФайлы с результатами:")
print(f"- Детальный JSON: {OUTPUT_FILE}")


In [None]:
# Transformer generation examples
print("\nПримеры автодополнений Transformer:")

# Never index past the end of a very small test split.
for i in range(min(3, len(X_test))):
    sample_input = X_test[i]
    # The tokenizer pads on the LEFT, so the real tokens are not at the start of
    # the row. Select them through the attention mask instead of slicing from 0,
    # otherwise the prompt/reference would consist mostly of padding.
    real_ids = sample_input[mask_test[i].bool()]
    real_length = real_ids.shape[0]
    # First 75% of the real tokens form the prompt, the rest is the reference.
    prompt_length = int(real_length * 0.75)
    if prompt_length < 1:
        # Too short to split into prompt + continuation; generation needs at
        # least one prompt token.
        print(f"\nTransformer Пример {i+1}: пропущен (слишком короткий текст)")
        continue

    prompt_ids = real_ids[:prompt_length]
    reference_ids = real_ids[prompt_length:]

    prompt_text = tokenizer.decode(prompt_ids, skip_special_tokens=True)
    reference_text = tokenizer.decode(reference_ids, skip_special_tokens=True)

    with torch.no_grad():
        generated = transformer_model.generate(
            prompt_ids.unsqueeze(0).to(DEVICE),
            # The prompt now holds real tokens only, so every position is attended to.
            attention_mask=torch.ones_like(prompt_ids).unsqueeze(0).to(DEVICE),
            max_new_tokens=20,
            do_sample=True,
            top_p=0.95,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )

    # The returned sequence starts with the prompt; keep only the continuation.
    generated_tokens = generated[0, prompt_ids.shape[0]:]
    generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    print(f"\nTransformer Пример {i+1}:")
    print(f"Промпт: '{prompt_text}'")
    print(f"Эталон: '{reference_text}'")
    print(f"Сгенерировано: '{generated_text}'")

In [None]:
# ========== RESULTS COMPARISON ==========

print("\n" + "=" * 60)
print("СРАВНЕНИЕ РЕЗУЛЬТАТОВ")
print("=" * 60)

# Print both models side by side for each metric, with a blank line between
# metric groups (but not after the last one).
for position, (rouge_key, rouge_label) in enumerate(
    (('rouge1', 'ROUGE-1'), ('rouge2', 'ROUGE-2'), ('rougeL', 'ROUGE-L'))
):
    if position > 0:
        print()
    print(f"LSTM {rouge_label}:     {lstm_test_rouge[rouge_key]:.4f}")
    print(f"Transformer {rouge_label}: {transformer_test_rouge[rouge_key]:.4f}")