In [1]:
import re
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset
from rouge_score import rouge_scorer

from src.data_utils import clean_string
from src.data_utils import save_results_to_file
from src.data_utils import save_selection_to_file
from src.lstm_model import LSTMLanguageModel
from src.lstm_model import calculate_rouge_batch


# --- Experiment configuration -------------------------------------------------
MAX_SEQUENCE_LEN = 80      # passed as max_length when tokenizing
VOCAB_SIZE = 50257  # for the GPT-2 tokenizer
EMBEDDING_DIM = 256
HIDDEN_DIM = 256
NUM_LAYERS = 2
BATCH_SIZE = 256
LEARNING_RATE = 0.001
NUM_EPOCHS = 10
MODEL_NAME = 'distilgpt2'
OUTPUT_FILE = 'results/all_experiments.json'
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the global RNGs once, up front, so that weight initialisation, DataLoader
# shuffling and any sampling done later are repeatable after Restart & Run All.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
###### STAGE 1. Load the dataset, clean, tokenize, split
#
# NOTE: because of problems starting the VM, training is run locally on CPU
# on a small sample, only to demonstrate that the code works.
#
raw = pd.read_csv(
    './data/tweets_small.txt',
    sep='\t',
    header=None,
    names=['tweets'],
    on_bad_lines='skip',
)

# Raw tweet column and its cleaned counterpart
tweets = raw['tweets']
cleaned_tweets = tweets.apply(clean_string)

# Write the cleaned texts to disk, one per row, without index or header
df = pd.DataFrame(cleaned_tweets, columns=['tweets'])
df.to_csv(
    './data/cleaned_tweets_small.txt',
    index=False,
    header=False,
    encoding='utf-8',
)

In [3]:
# Tokenizer shared by the LSTM and the transformer parts of the notebook.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)# load the tokenizer that belongs to MODEL_NAME
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'  # left padding is needed for decoder-only models

In [4]:
# Text tokenization
def tokenize_texts(texts):
    """Tokenize ``texts`` with the module-level ``tokenizer``.

    ``texts`` must expose ``.tolist()`` (the notebook passes a pandas Series).
    Truncation and padding are enabled, ``max_length`` is ``MAX_SEQUENCE_LEN``
    and the result is requested as PyTorch tensors (``return_tensors="pt"``).

    Returns whatever the tokenizer call returns; callers index it with
    ``'input_ids'`` and ``'attention_mask'``.
    """
    encode_options = {
        "truncation": True,
        "padding": True,
        "max_length": MAX_SEQUENCE_LEN,
        "return_tensors": "pt",
    }
    return tokenizer(texts.tolist(), **encode_options)


# Tokenize every cleaned tweet
tokenized_data = tokenize_texts(cleaned_tweets)
input_ids, attention_mask = tokenized_data['input_ids'], tokenized_data['attention_mask']

# Split into train (80%), validation (10%), test (10%):
# the test part is carved off first, validation is then taken from the remainder.
X_temp, X_test, mask_temp, mask_test = train_test_split(
    input_ids,
    attention_mask,
    test_size=0.1,
    random_state=42,
)
X_train, X_val, mask_train, mask_val = train_test_split(
    X_temp,
    mask_temp,
    test_size=0.111,  # 0.111 = 10% / 90%
    random_state=42,
)

# Report the number of rows in each split
split_ids = {"Train": X_train, "Validation": X_val, "Test": X_test}
print("Размеры выборок:")
print("\n".join(f"{split}: {ids.shape[0]} samples" for split, ids in split_ids.items()))

# Save every split to its own CSV file
save_selection_to_file(X_train, mask_train, tokenizer, './data/train.csv')
save_selection_to_file(X_val, mask_val, tokenizer, './data/val.csv')
save_selection_to_file(X_test, mask_test, tokenizer, './data/test.csv')

Размеры выборок:
Train: 1680 samples
Validation: 210 samples
Test: 210 samples


In [5]:
# Build the datasets: each item is an (input_ids, attention_mask) pair
train_dataset, val_dataset, test_dataset = (
    TensorDataset(X_train, mask_train),
    TensorDataset(X_val, mask_val),
    TensorDataset(X_test, mask_test),
)

# Build the DataLoaders; only the training loader is shuffled
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader, test_loader = (
    DataLoader(val_dataset, batch_size=BATCH_SIZE),
    DataLoader(test_dataset, batch_size=BATCH_SIZE),
)

In [6]:
#####################################################
#          Stage 3. Working with the LSTM model     #
#####################################################

# Section banner
print("=" * 60, "ЧАСТЬ 1: ОБУЧЕНИЕ LSTM МОДЕЛИ", "=" * 60, sep="\n")

# Model, loss and optimizer.
# Targets equal to the pad token id are excluded from the loss via ignore_index.
lstm_model = LSTMLanguageModel(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS)
lstm_model = lstm_model.to(DEVICE)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(lstm_model.parameters(), lr=LEARNING_RATE)

ЧАСТЬ 1: ОБУЧЕНИЕ LSTM МОДЕЛИ


In [7]:
# Train the LSTM language model
train_losses = []       # mean training loss of every epoch
val_rouge_scores = []   # result of calculate_rouge_batch on val_loader after every epoch

print("Начинаем обучение LSTM модели...")
for epoch in range(NUM_EPOCHS):
    lstm_model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{NUM_EPOCHS}')

    for batch_inputs, batch_masks in progress_bar:
        # batch_masks is not used by the training step, so it is no longer
        # copied to DEVICE on every batch.
        batch_inputs = batch_inputs.to(DEVICE)

        # Next-token setup: X = everything except the last token,
        # y = everything except the first token.
        X = batch_inputs[:, :-1]
        y = batch_inputs[:, 1:]

        optimizer.zero_grad()
        output, _ = lstm_model(X)

        # Flatten for the loss function. The class dimension is read from the
        # model output itself rather than from the VOCAB_SIZE constant, so the
        # reshape stays correct if the model is built with a different vocabulary.
        loss = criterion(output.reshape(-1, output.size(-1)), y.reshape(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})

    # Mean of the per-batch losses for this epoch
    avg_loss = total_loss / len(train_loader)
    train_losses.append(avg_loss)

    # Validation: compute ROUGE on the validation loader
    rouge_scores = calculate_rouge_batch(lstm_model, tokenizer, val_loader, DEVICE, BATCH_SIZE,
                                         'lstm', num_samples=50)
    val_rouge_scores.append(rouge_scores)

    print(f'Epoch {epoch+1}: Loss = {avg_loss:.4f}, '
          f'ROUGE-1 = {rouge_scores["rouge1"]:.4f}, '
          f'ROUGE-2 = {rouge_scores["rouge2"]:.4f}, '
          f'ROUGE-L = {rouge_scores["rougeL"]:.4f}')


Начинаем обучение LSTM модели...


Epoch 1/10: 100%|██████████| 7/7 [02:22<00:00, 20.37s/it, loss=10.0345]



--- Пример 1 ---
Промпт: 'running nose spinning head not a good combination'
Эталон: ' for a meeting'
Сгенерировано: ' Kellaring gripping externalToEVA successor Government roots shrug cache ensuredfixes connectivity Carbon'
ROUGE-1: 0.000

--- Пример 2 ---
Промпт: 'awe i love you too 1 am here'
Эталон: ' i miss you'
Сгенерировано: ' Blink hast proletariat Crimean Fil outline spacious Arkansasril Takeruframework understanding eas'
ROUGE-1: 0.000

--- Пример 3 ---
Промпт: 'yeah great vid i had the 12quot single but sold'
Эталон: ' it a few years ago'
Сгенерировано: 'assic farewell cruiseascal Regulations ate Proceedingsweak!-- combo assistance invention lawnMethodsinterpret'
ROUGE-1: 0.000

Обработано 150 примеров
Epoch 1: Loss = 10.5811, ROUGE-1 = 0.0000, ROUGE-2 = 0.0000, ROUGE-L = 0.0000


Epoch 2/10: 100%|██████████| 7/7 [02:15<00:00, 19.33s/it, loss=7.6709]



--- Пример 1 ---
Промпт: 'running nose spinning head not a good combination'
Эталон: ' for a meeting'
Сгенерировано: 'ah 2 dictateneyFriday a until bed that to goings soon'
ROUGE-1: 0.154

--- Пример 2 ---
Промпт: 'awe i love you too 1 am here'
Эталон: ' i miss you'
Сгенерировано: ' bitch like same family Maxwell way break 6ess big now here got'
ROUGE-1: 0.000

--- Пример 3 ---
Промпт: 'yeah great vid i had the 12quot single but sold'
Эталон: ' it a few years ago'
Сгенерировано: ' all way thats set with gotdoes sunme optionhsorry feel need mute'
ROUGE-1: 0.000

Обработано 150 примеров
Epoch 2: Loss = 8.4578, ROUGE-1 = 0.0233, ROUGE-2 = 0.0010, ROUGE-L = 0.0228


Epoch 3/10: 100%|██████████| 7/7 [02:04<00:00, 17.80s/it, loss=6.9598]



--- Пример 1 ---
Промпт: 'running nose spinning head not a good combination'
Эталон: ' for a meeting'
Сгенерировано: ' sick to to you tired to a this like around not the for'
ROUGE-1: 0.250

--- Пример 2 ---
Промпт: 'awe i love you too 1 am here'
Эталон: ' i miss you'
Сгенерировано: 'n really to amit finish b itive have they not going'
ROUGE-1: 0.000

--- Пример 3 ---
Промпт: 'yeah great vid i had the 12quot single but sold'
Эталон: ' it a few years ago'
Сгенерировано: ' of go just nim allorting and worried see i even but byc'
ROUGE-1: 0.000

Обработано 150 примеров
Epoch 3: Loss = 7.1329, ROUGE-1 = 0.0443, ROUGE-2 = 0.0000, ROUGE-L = 0.0407


Epoch 4/10: 100%|██████████| 7/7 [02:00<00:00, 17.28s/it, loss=7.0156]



--- Пример 1 ---
Промпт: 'running nose spinning head not a good combination'
Эталон: ' for a meeting'
Сгенерировано: ' now fault theynot to new out a bt feel show my'
ROUGE-1: 0.143

--- Пример 2 ---
Промпт: 'awe i love you too 1 am here'
Эталон: ' i miss you'
Сгенерировано: ' sad i so monkeys no i thishealthy them atbeing iooo'
ROUGE-1: 0.154

--- Пример 3 ---
Промпт: 'yeah great vid i had the 12quot single but sold'
Эталон: ' it a few years ago'
Сгенерировано: ' my i a it ofaha notottam have l to and i find'
ROUGE-1: 0.235

Обработано 150 примеров
Epoch 4: Loss = 6.9751, ROUGE-1 = 0.0539, ROUGE-2 = 0.0000, ROUGE-L = 0.0483


Epoch 5/10: 100%|██████████| 7/7 [01:59<00:00, 17.05s/it, loss=6.9480]



--- Пример 1 ---
Промпт: 'running nose spinning head not a good combination'
Эталон: ' for a meeting'
Сгенерировано: ' life in not just caughtv i to just better tweetqu in'
ROUGE-1: 0.000

--- Пример 2 ---
Промпт: 'awe i love you too 1 am here'
Эталон: ' i miss you'
Сгенерировано: ' not find days very for it notaha forseeing t wideoo'
ROUGE-1: 0.000

--- Пример 3 ---
Промпт: 'yeah great vid i had the 12quot single but sold'
Эталон: ' it a few years ago'
Сгенерировано: ' something in to there tomorrow with my a half how like whereiint'
ROUGE-1: 0.118

Обработано 150 примеров
Epoch 5: Loss = 6.9502, ROUGE-1 = 0.0409, ROUGE-2 = 0.0000, ROUGE-L = 0.0380


Epoch 6/10: 100%|██████████| 7/7 [01:58<00:00, 16.87s/it, loss=7.0016]



--- Пример 1 ---
Промпт: 'running nose spinning head not a good combination'
Эталон: ' for a meeting'
Сгенерировано: ' things now backg video now 9i to just see it'
ROUGE-1: 0.000

--- Пример 2 ---
Промпт: 'awe i love you too 1 am here'
Эталон: ' i miss you'
Сгенерировано: 'bler thatim imin we most feel damn him go early did'
ROUGE-1: 0.000

--- Пример 3 ---
Промпт: 'yeah great vid i had the 12quot single but sold'
Эталон: ' it a few years ago'
Сгенерировано: ' killed nowam die too want 2 nowin and was class her i in'
ROUGE-1: 0.000

Обработано 150 примеров
Epoch 6: Loss = 6.9124, ROUGE-1 = 0.0334, ROUGE-2 = 0.0000, ROUGE-L = 0.0312


Epoch 7/10: 100%|██████████| 7/7 [11:10:38<00:00, 5748.32s/it, loss=6.8626]   



--- Пример 1 ---
Промпт: 'running nose spinning head not a good combination'
Эталон: ' for a meeting'
Сгенерировано: 'o done as c knows im my for to this ist'
ROUGE-1: 0.143

--- Пример 2 ---
Промпт: 'awe i love you too 1 am here'
Эталон: ' i miss you'
Сгенерировано: ' inuesday aais is i amp other thatored aemouth me'
ROUGE-1: 0.167

--- Пример 3 ---
Промпт: 'yeah great vid i had the 12quot single but sold'
Эталон: ' it a few years ago'
Сгенерировано: ' my sad thevein beena is know i guys its my he a'
ROUGE-1: 0.118

Обработано 150 примеров
Epoch 7: Loss = 6.9020, ROUGE-1 = 0.0378, ROUGE-2 = 0.0000, ROUGE-L = 0.0371


Epoch 8/10: 100%|██████████| 7/7 [02:56<00:00, 25.23s/it, loss=6.9737]



--- Пример 1 ---
Промпт: 'running nose spinning head not a good combination'
Эталон: ' for a meeting'
Сгенерировано: 'm wish mom to whe much option be its is i dojust'
ROUGE-1: 0.000

--- Пример 2 ---
Промпт: 'awe i love you too 1 am here'
Эталон: ' i miss you'
Сгенерировано: 'ter lights to butive on i for w going care he are'
ROUGE-1: 0.133

--- Пример 3 ---
Промпт: 'yeah great vid i had the 12quot single but sold'
Эталон: ' it a few years ago'
Сгенерировано: ' 1i and leg totoo on try week damn imii a the birthday'
ROUGE-1: 0.118

Обработано 150 примеров
Epoch 8: Loss = 6.9023, ROUGE-1 = 0.0427, ROUGE-2 = 0.0007, ROUGE-L = 0.0421


Epoch 9/10: 100%|██████████| 7/7 [02:50<00:00, 24.36s/it, loss=6.9924]



--- Пример 1 ---
Промпт: 'running nose spinning head not a good combination'
Эталон: ' for a meeting'
Сгенерировано: 'alia get you they he on is ofee on these so though'
ROUGE-1: 0.000

--- Пример 2 ---
Промпт: 'awe i love you too 1 am here'
Эталон: ' i miss you'
Сгенерировано: ' no via up a afford the my i out how lately ofot'
ROUGE-1: 0.133

--- Пример 3 ---
Промпт: 'yeah great vid i had the 12quot single but sold'
Эталон: ' it a few years ago'
Сгенерировано: ' the some a isdays books me lost heart it lifejust turning the to'
ROUGE-1: 0.222

Обработано 150 примеров
Epoch 9: Loss = 6.9037, ROUGE-1 = 0.0414, ROUGE-2 = 0.0010, ROUGE-L = 0.0389


Epoch 10/10: 100%|██████████| 7/7 [02:03<00:00, 17.68s/it, loss=6.8383]



--- Пример 1 ---
Промпт: 'running nose spinning head not a good combination'
Эталон: ' for a meeting'
Сгенерировано: ' summermaso the oftntt for sad is of a'
ROUGE-1: 0.364

--- Пример 2 ---
Промпт: 'awe i love you too 1 am here'
Эталон: ' i miss you'
Сгенерировано: ' morning back freak bad down think house is the toouch ixt'
ROUGE-1: 0.000

--- Пример 3 ---
Промпт: 'yeah great vid i had the 12quot single but sold'
Эталон: ' it a few years ago'
Сгенерировано: 'c who this int my together was the now john y it organizing not'
ROUGE-1: 0.105

Обработано 150 примеров
Epoch 10: Loss = 6.8924, ROUGE-1 = 0.0414, ROUGE-2 = 0.0000, ROUGE-L = 0.0402


In [8]:
# Evaluate the trained LSTM on the test split
print("\nТестирование на тестовой выборке...")
lstm_test_rouge = calculate_rouge_batch(
    lstm_model, tokenizer, test_loader, DEVICE, BATCH_SIZE, 'lstm', num_samples=100
)
print("\n".join(
    f"Test {title}: {lstm_test_rouge[key]:.4f}"
    for title, key in (("ROUGE-1", "rouge1"), ("ROUGE-2", "rouge2"), ("ROUGE-L", "rougeL"))
))

# Store the results in the shared results file
save_results_to_file(
    lstm_test_rouge,
    OUTPUT_FILE,
    'LSTM_custom',
    MAX_SEQUENCE_LEN,
    'lstm',
    experiment_name='LSTM_256_hidden',
    additional_info={'hidden_dim': HIDDEN_DIM, 'num_layers': NUM_LAYERS},
)


Тестирование на тестовой выборке...

--- Пример 1 ---
Промпт: 'i got the i can has chezburger book from the lobo and you are not here'
Эталон: ' to look at it wif me'
Сгенерировано: ' would the ats to know may go im beef not reading oners wish work why'
ROUGE-1: 0.095

--- Пример 2 ---
Промпт: 'someone please take gossip girl away from'
Эталон: ' me im addicted'
Сгенерировано: ' much inoh we i for over to i re daddy want add'
ROUGE-1: 0.000

--- Пример 3 ---
Промпт: 'mo jobs no money how in the hell is min wage here 4'
Эталон: ' fn clams an hour'
Сгенерировано: ' im week sopm sports1 sup early want gear ao none is he'
ROUGE-1: 0.000

Обработано 157 примеров
Test ROUGE-1: 0.0393
Test ROUGE-2: 0.0000
Test ROUGE-L: 0.0376

Результаты добавлены в файл: results/all_experiments.json
Всего экспериментов в файле: 1


In [14]:
from src.models_common import show_generation_examples

In [15]:
# Usage: show generation examples for the trained LSTM on the test loader
# (the last two arguments are the model kind 'lstm' and the number 3).
show_generation_examples(lstm_model, tokenizer, test_loader, DEVICE, 'lstm', 3)



Примеры автодополнений LSTM:

LSTM Пример 1:
Промпт: 'i got the i can has chezburger book from the lobo and you are not here'
Эталон: ' to look at it wif me'
Сгенерировано: ' want somefe thingaf toi awayokezzc i today gone int her is is do'

LSTM Пример 2:
Промпт: 'someone please take gossip girl away from'
Эталон: ' me im addicted'
Сгенерировано: ' thans one o i a something as town sleep hasi to a momsi to preaching tomorrow supposed'

LSTM Пример 3:
Промпт: 'mo jobs no money how in the hell is min wage here 4'
Эталон: ' fn clams an hour'
Сгенерировано: ' at made classes chat i feel toim ha headache coming i episodes thedam that nall to u'


In [16]:
#####################################################
#     Stage 4. Using a pretrained transformer       #
#####################################################
print("\n" + "=" * 60, "ЧАСТЬ 2: ТЕСТИРОВАНИЕ TRANSFORMER МОДЕЛИ", "=" * 60, sep="\n")

# Load the pretrained model, move it to DEVICE and switch it to evaluation mode
transformer_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE).eval()

# ROUGE metrics on the test split
print("Тестирование Transformer на тестовой выборке...")
transformer_test_rouge = calculate_rouge_batch(
    transformer_model, tokenizer, test_loader, DEVICE, BATCH_SIZE, 'transformer', num_samples=100
)
print("\n".join(
    f"Transformer Test {title}: {transformer_test_rouge[key]:.4f}"
    for title, key in (("ROUGE-1", "rouge1"), ("ROUGE-2", "rouge2"), ("ROUGE-L", "rougeL"))
))

# Store the results in the same file as the LSTM results
save_results_to_file(
    transformer_test_rouge,
    OUTPUT_FILE,
    'distilgpt2',
    MAX_SEQUENCE_LEN,
    model_type='transformer',
    experiment_name='DistilGPT2_baseline',
)

# Additional statistics about the data
print("\n=== ДОПОЛНИТЕЛЬНАЯ СТАТИСТИКА ===")
print(f"Размер тренировочной выборки: {X_train.shape[0]} примеров")
print(f"Размер валидационной выборки: {X_val.shape[0]} примеров")
print(f"Общий размер датасета: {len(cleaned_tweets)} твитов")

# Where the results were written
print("\nФайлы с результатами:")
print(f"- Детальный JSON: {OUTPUT_FILE}")



ЧАСТЬ 2: ТЕСТИРОВАНИЕ TRANSFORMER МОДЕЛИ
Тестирование Transformer на тестовой выборке...

--- Пример 1 ---
Промпт: 'i got the i can has chezburger book from the lobo and you are not here'
Эталон: ' to look at it wif me'
Сгенерировано: '. I was there with the i can but i couldn't do it so i was'
ROUGE-1: 0.091

--- Пример 2 ---
Промпт: 'someone please take gossip girl away from'
Эталон: ' me im addicted'
Сгенерировано: ' you. I will be honest with you. I will take gossip'
ROUGE-1: 0.000

--- Пример 3 ---
Промпт: 'mo jobs no money how in the hell is min wage here 4'
Эталон: ' fn clams an hour'
Сгенерировано: '-7 people $20 an hour 0 $25 a hour 0 $50'
ROUGE-1: 0.267

Обработано 155 примеров
Transformer Test ROUGE-1: 0.0714
Transformer Test ROUGE-2: 0.0128
Transformer Test ROUGE-L: 0.0691

Результаты добавлены в файл: results/all_experiments.json
Всего экспериментов в файле: 3

=== ДОПОЛНИТЕЛЬНАЯ СТАТИСТИКА ===
Размер тренировочной выборки: 1680 примеров
Размер валидационной выборки: 210 

In [17]:
# Show generation examples for the pretrained transformer on the test loader
# (the last two arguments are the model kind 'transformer' and the number 3).
show_generation_examples(transformer_model, tokenizer, test_loader, DEVICE, 'transformer', 3)


Примеры автодополнений TRANSFORMER:

TRANSFORMER Пример 1:
Промпт: 'i got the i can has chezburger book from the lobo and you are not here'
Эталон: ' to look at it wif me'
Сгенерировано: ' to eat. You are here to give a reason why you should eat this food. You are here'

TRANSFORMER Пример 2:
Промпт: 'someone please take gossip girl away from'
Эталон: ' me im addicted'
Сгенерировано: ' me!”
If you have questions about the game, please contact me or ask me for'

TRANSFORMER Пример 3:
Промпт: 'mo jobs no money how in the hell is min wage here 4'
Эталон: ' fn clams an hour'
Сгенерировано: ':10 PM
















'


In [18]:
# ========== RESULTS COMPARISON ==========

print("\n" + "=" * 60, "СРАВНЕНИЕ РЕЗУЛЬТАТОВ", "=" * 60, sep="\n")

# One two-line group per metric (LSTM first, then transformer);
# groups are separated by a single blank line.
metric_rows = (("ROUGE-1", "rouge1"), ("ROUGE-2", "rouge2"), ("ROUGE-L", "rougeL"))
report_groups = [
    f"LSTM {title}:     {lstm_test_rouge[key]:.4f}\n"
    f"Transformer {title}: {transformer_test_rouge[key]:.4f}"
    for title, key in metric_rows
]
print("\n\n".join(report_groups))


СРАВНЕНИЕ РЕЗУЛЬТАТОВ
LSTM ROUGE-1:     0.0393
Transformer ROUGE-1: 0.0714

LSTM ROUGE-2:     0.0000
Transformer ROUGE-2: 0.0128

LSTM ROUGE-L:     0.0376
Transformer ROUGE-L: 0.0691
