In [2]:

from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from src.data_utils import load_and_clean_data

data_dir = Path('./data/')
raw_data_path = data_dir / 'tweets.txt'

clean_df = load_and_clean_data(raw_data_path)


train_df, temp_df = train_test_split(clean_df, test_size=0.2, random_state=42)

val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)


print("4. Сохранение файлов...")
train_path = data_dir / 'train.csv'
val_path = data_dir / 'val.csv'
test_path = data_dir / 'test.csv'

train_df.to_csv(train_path, index=False)
val_df.to_csv(val_path, index=False)
test_df.to_csv(test_path, index=False)

print("\nПроцесс завершен! Файлы сохранены:")
print(f"  - Обучающая выборка: {train_path} ({len(train_df)} строк)")
print(f"  - Валидационная выборка: {val_path} ({len(val_df)} строк)")
print(f"  - Тестовая выборка: {test_path} ({len(test_df)} строк)")


print("\n--- Пример данных из созданного train.csv ---")
print(train_df.head())

1. Чтение сырого файла: data/tweets.txt
2. Очистка текста...
Готово. Получено 1596801 чистых строк.
4. Сохранение файлов...

Процесс завершен! Файлы сохранены:
  - Обучающая выборка: data/train.csv (1277440 строк)
  - Валидационная выборка: data/val.csv (159680 строк)
  - Тестовая выборка: data/test.csv (159681 строк)

--- Пример данных из созданного train.csv ---
                                                     text
86449      or you could be like me work six days in a row
989848                 good morning have a wonderful week
869848  hope this works or a simple pic what a capacit...
938814  just read the sweet message my best friend sen...
75206   why is no one shopping am on my lunch isnt out...


In [3]:
from pathlib import Path
from torch.utils.data import DataLoader
from src.next_token_dataset import TextDataset, PadCollate

data_dir = Path('./data/')
BATCH_SIZE = 128

train_dataset = TextDataset(file_path=data_dir / 'train.csv')

vocab = train_dataset.word2idx
pad_idx = vocab[train_dataset.pad_token]
vocab_size = train_dataset.vocab_size

val_dataset = TextDataset(file_path=data_dir / 'val.csv', vocab=vocab)
test_dataset = TextDataset(file_path=data_dir / 'test.csv', vocab=vocab)

collate_fn = PadCollate(pad_idx=pad_idx)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

print(f"Размер словаря: {vocab_size}")
print("DataLoader'ы готовы.")

inputs, targets = next(iter(train_loader))
print(f"Размер батча (вход): {inputs.shape}")
print(f"Размер батча (цель): {targets.shape}")

Размер словаря: 20000
DataLoader'ы готовы.
Размер батча (вход): torch.Size([128, 26])
Размер батча (цель): torch.Size([128, 26])


In [4]:
import torch
from src.lstm_model import LSTMModel

EMBEDDING_DIM = 128
HIDDEN_DIM = 256
NUM_LAYERS = 2
DROPOUT = 0.3

device = "cuda" if torch.cuda.is_available() else "cpu"

model = LSTMModel(
    vocab_size=vocab_size,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    dropout_prob=DROPOUT
).to(device)


inputs, targets = next(iter(train_loader))
inputs, targets = inputs.to(device), targets.to(device)

output = model(inputs)

print("Модель создана и успешно обработала один батч данных.")
print(f"Размер входа: {inputs.shape}")
print(f"Размер выхода (логитов): {output.shape}")
print(f"Ожидаемый размер выхода: ({BATCH_SIZE}, {inputs.shape[1]}, {vocab_size})")


idx2word = train_dataset.idx2word



Модель создана и успешно обработала один батч данных.
Размер входа: torch.Size([128, 26])
Размер выхода (логитов): torch.Size([128, 26, 20000])
Ожидаемый размер выхода: (128, 26, 20000)


In [5]:
import torch.optim as optim
import torch.nn as nn
from src.lstm_train import train_loop

LEARNING_RATE = 0.001
NUM_EPOCHS = 5 

criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

train_loop(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    criterion=criterion,
    num_epochs=NUM_EPOCHS,
    device=device,
    vocab=vocab,
    idx2word=idx2word
)

  from .autonotebook import tqdm as notebook_tqdm
Training: 100%|██████████| 9980/9980 [08:49<00:00, 18.84it/s]
Validation Loss: 100%|██████████| 1248/1248 [00:25<00:00, 48.27it/s]



Epoch 1/5 | Time: 555.67s
Train Loss: 5.3679
Validation Loss: 5.0174
Model saved to models/best_lstm_model.pth


Training: 100%|██████████| 9980/9980 [08:57<00:00, 18.56it/s]
Validation Loss: 100%|██████████| 1248/1248 [00:25<00:00, 48.55it/s]



Epoch 2/5 | Time: 563.46s
Train Loss: 4.9695
Validation Loss: 4.8791
Model saved to models/best_lstm_model.pth


Training: 100%|██████████| 9980/9980 [08:57<00:00, 18.55it/s]
Validation Loss: 100%|██████████| 1248/1248 [00:25<00:00, 48.69it/s]



Epoch 3/5 | Time: 563.56s
Train Loss: 4.8627
Validation Loss: 4.8186
Model saved to models/best_lstm_model.pth


Training: 100%|██████████| 9980/9980 [08:57<00:00, 18.57it/s]
Validation Loss: 100%|██████████| 1248/1248 [00:25<00:00, 48.03it/s]



Epoch 4/5 | Time: 563.49s
Train Loss: 4.8038
Validation Loss: 4.7877
Model saved to models/best_lstm_model.pth


Training: 100%|██████████| 9980/9980 [08:58<00:00, 18.52it/s]
Validation Loss: 100%|██████████| 1248/1248 [00:26<00:00, 47.87it/s]



Epoch 5/5 | Time: 564.81s
Train Loss: 4.7648
Validation Loss: 4.7668
Model saved to models/best_lstm_model.pth

Calculating ROUGE on validation set with the best model...


Downloading builder script: 6.27kB [00:00, 11.1MB/s]
Calculating ROUGE: 100%|██████████| 20/20 [00:06<00:00,  2.93it/s]


ROUGE Scores: {'rouge1': np.float64(0.7341423209622171), 'rouge2': np.float64(0.6808170117078052), 'rougeL': np.float64(0.7341640330187014), 'rougeLsum': np.float64(0.7341614718383911)}


In [6]:
from pathlib import Path


model_path = Path('./models/best_lstm_model.pth')
model.load_state_dict(torch.load(model_path, map_location=device))


test_prompts = [
    "i feel so",
    "today is a",
    "i want to",
    "what are you",
    "this is the"
]

print("\n--- Примеры генерации текста ---")
for prompt in test_prompts:
    generated_text = model.generate(
        start_seq=prompt,
        max_len=15,
        vocab=vocab,
        idx2word=idx2word,
        device=device
    )
    print(f"PROMPT: '{prompt}'")
    print(f"GENERATED: '{generated_text}'\n")


--- Примеры генерации текста ---
PROMPT: 'i feel so'
GENERATED: '<bos> i feel so bad for you'

PROMPT: 'today is a'
GENERATED: '<bos> today is a good day'

PROMPT: 'i want to'
GENERATED: '<bos> i want to go to the beach but i cant'

PROMPT: 'what are you'
GENERATED: '<bos> what are you doing today'

PROMPT: 'this is the'
GENERATED: '<bos> this is the first time i was in the <unk>'



In [8]:

from src.eval_transformer_pipeline import evaluate_transformer
from transformers import pipeline

print("--- Оценка качества distilgpt2 на валидационной выборке ---")
val_file_path = data_dir / 'val.csv'
transformer_rouge_scores = evaluate_transformer(val_file_path, device, limit_rows=2560)

print("\nROUGE Scores для distilgpt2:")
print(transformer_rouge_scores)


print("\n\n--- Примеры генерации текста с помощью distilgpt2 ---")
generator = pipeline("text-generation", model="distilgpt2", device=0 if device == "cuda" else -1)

test_prompts = [
    "i feel so",
    "today is a",
    "i want to",
    "what are you",
    "this is the"
]

for prompt in test_prompts:
    result = generator(
        prompt, 
        max_length=20,
        num_return_sequences=1,
        pad_token_id=generator.tokenizer.eos_token_id,
        do_sample=True,
        top_k=50
    )
    print(f"PROMPT: '{prompt}'")
    print(f"GENERATED: '{result[0]['generated_text']}'\n")

--- Оценка качества distilgpt2 на валидационной выборке ---


Device set to use cuda:0
Evaluating Transformer:   0%|          | 8/2560 [00:00<02:20, 18.20it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Evaluating Transformer: 100%|██████████| 2560/2560 [01:46<00:00, 24.08it/s]



ROUGE Scores для distilgpt2:
{'rouge1': np.float64(0.6428079579346148), 'rouge2': np.float64(0.5871336061540489), 'rougeL': np.float64(0.6428515842464073), 'rougeLsum': np.float64(0.6422893195632884)}


--- Примеры генерации текста с помощью distilgpt2 ---


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take pre

PROMPT: 'i feel so'
GENERATED: 'i feel so happy with the way I've been doing this.


"I feel like I've had my best friend's wedding present for the last 2 years," he said, "but I'm not sure what's going on with the wedding. I'm not sure how I'll get that done."'

PROMPT: 'today is a'
GENERATED: 'today is a way to look at the social and economic issues associated with a change in the political and economic environment."'



Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


PROMPT: 'i want to'
GENERATED: 'i want to know why we are here.‰























































































































































































































































'

PROMPT: 'what are you'
GENERATED: 'what are you going to do to get that back in the court?”




This is a great example of how the court is going to treat both sides of the argument.'

PROMPT: 'this is the'
GENERATED: 'this is the same as that. I have done a lot of research about the human brain. If you can find out more about human brain structure, please feel free to send me a message.

This article is from the archive of our partner.















































































































































































































'



In [10]:
from src.eval_transformer_pipeline import evaluate_transformer
import torch


device = "cuda" if torch.cuda.is_available() else "cpu"
val_path = './data/val.csv'

print("Запуск оценки модели distilgpt2 на валидационной выборке...")

transformer_rouge_scores = evaluate_transformer(val_path, device, limit_rows=1000)

print("\n--- ROUGE метрики для distilgpt2 ---")
print(transformer_rouge_scores)

Запуск оценки модели distilgpt2 на валидационной выборке...


Device set to use cuda:0
Evaluating Transformer: 100%|██████████| 1000/1000 [00:40<00:00, 24.58it/s]



--- ROUGE метрики для distilgpt2 ---
{'rouge1': np.float64(0.6449691258211186), 'rouge2': np.float64(0.5882963689757161), 'rougeL': np.float64(0.6443800144601144), 'rougeLsum': np.float64(0.6443285582174039)}


In [11]:
from transformers import pipeline


generator = pipeline(
    "text-generation",
    model="distilgpt2",
    device=0 if device == "cuda" else -1
)


test_prompts = [
    "i feel so",
    "today is a",
    "i want to",
    "what are you",
    "this is the"
]

print("\n--- Примеры генерации текста (distilgpt2) ---")
for prompt in test_prompts:
    result = generator(
        prompt,
        max_length=20, 
        num_return_sequences=1,
        pad_token_id=generator.tokenizer.eos_token_id,
        do_sample=True,
        top_k=50
    )
    generated_text = result[0]['generated_text']
    print(f"PROMPT:    '{prompt}'")
    print(f"GENERATED: '{generated_text}'\n")

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



--- Примеры генерации текста (distilgpt2) ---


Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


PROMPT:    'i feel so'
GENERATED: 'i feel so much better.
But as my friend, I've been to this place for the past few years. I've always wanted to make sure I didn't feel so bad for myself. I try to make sure I don't feel bad for myself.
I do feel good for people who want me to be good. I try to take care of myself in a way that makes people happy.
Let me know how you feel about yourself & how you feel about yourself.'



Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


PROMPT:    'today is a'
GENERATED: 'today is a new project that is going to bring back the old magic of life: the secret life of the past and the future.


The project is a new project that is going to bring back the old magic of life: the secret life of the past and the future. The first time you were able to see the history of the past, you had to come back to the past and see how the present has changed.
The final project will consist of the following:
The present
The history of the past
The history of the past
The history of the past
The history of the past
The history of the past
The history of the past
The history of the past
The history of the past
The history of the past
The history of the past
The history of the past
The history of the past
The history of the past
The history of the past
The history of the past
The history of the past
The history of the past
The history of the past
The history of the past
The history of the past
The history of the past
The history of the past


Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


PROMPT:    'what are you'
GENERATED: 'what are you getting?





























































































































































































































































'

PROMPT:    'this is the'
GENERATED: 'this is the only way to see if the user is the same person.



















































































































































































































































'



In [9]:
from src.eval_lstm import calculate_rouge
from pathlib import Path

model_path = Path('./models/best_lstm_model.pth')
model.load_state_dict(torch.load(model_path, map_location=device))

print("--- Финальная оценка лучшей модели (LSTM) на тестовом датасете ---")


final_rouge_scores = calculate_rouge(model, test_loader, vocab, idx2word, device, limit_batches=100)

print("\nИтоговые ROUGE Scores на тестовой выборке:")
print(f"ROUGE-1: {final_rouge_scores['rouge1']:.4f}")
print(f"ROUGE-2: {final_rouge_scores['rouge2']:.4f}")

--- Финальная оценка лучшей модели (LSTM) на тестовом датасете ---


Calculating ROUGE: 100%|██████████| 100/100 [00:35<00:00,  2.86it/s]



Итоговые ROUGE Scores на тестовой выборке:
ROUGE-1: 0.7329
ROUGE-2: 0.6801


## Итоговые выводы
1. Качество на конкретной задаче: Если цель максимально точно дополнять короткие посты в том же стиле, что и исходные данные, то легкая LSTM-модель является явным фаворитом. Она лучше понимает контекст и стиль датасета, что подтверждается и метриками, и ручным анализом.
2. Общая способность к генерации: Модель distilgpt2, безусловно, является более мощной и "креативной". Она способна генерировать гораздо более сложный и разнообразный текст. Однако для данной бизнес-задачи эта сложность является избыточной и даже вредной, так как нарушает стилистику.
3. Ресурсы и ограничения: Наша LSTM-модель обучалась за разумное время, она легкая и быстрая в работе. distilgpt2 требует значительно больше ресурсов (памяти, вычислительной мощности), что может стать проблемой при внедрении.

Рекомендуется использовать разработанную и обученную LSTM-модель.