# Утилиты для загрузки/очистки/разбиения датасета. (data_utils.py)

In [1]:
"""
data_utils.py
Утилиты для загрузки/очистки/разбиения датасета.

структура:
- data/raw_dataset.csv
- data/dataset_processed.csv
- data/train.csv, data/val.csv, data/test.csv
"""
import os
import re
import pandas as pd
from sklearn.model_selection import train_test_split


# Dataset file locations (relative paths, resolved against the working directory).
RAW_PATH = "data/raw_dataset.csv"  # raw input, read line by line by clear_datatset()
PROCESSED_PATH = "data/dataset_processed.csv"  # cleaned texts written by clear_datatset()
TRAIN_PATH = "data/train.csv"  # the three splits below are written by split_dataset()
VAL_PATH = "data/val.csv"
TEST_PATH = "data/test.csv"


def clear_datatset():
    """Read the raw dataset, normalise every line and save the cleaned texts.

    Each line of RAW_PATH is lower-cased, every run of characters other than
    ``a-z``, ``0-9`` and space is replaced by a single space, whitespace runs
    are collapsed and the result is stripped. Lines that end up empty are
    dropped. The cleaned texts are written to PROCESSED_PATH (one per row,
    no index, no header) and returned as a pandas Series named "text".

    The cleaning is required by the assignment, although the tokenizer in
    use could most likely learn from uncleaned text as well.
    """
    non_alnum = re.compile(r"[^a-z0-9 ]+")
    whitespace_run = re.compile(r"\s+")

    def _clean_text(text):
        # Anything that is not a string becomes an empty text (dropped below).
        if not isinstance(text, str):
            return ""
        lowered = text.lower()
        only_alnum = non_alnum.sub(" ", lowered)
        return whitespace_run.sub(" ", only_alnum).strip()

    # Undecodable bytes are skipped silently instead of aborting the load.
    texts = []
    with open(RAW_PATH, "r", encoding="utf-8", errors="ignore") as raw_file:
        for raw_line in raw_file:
            texts.append(raw_line.rstrip("\n"))

    raw_texts = pd.DataFrame({"text": texts})["text"]
    cleaned = raw_texts.map(_clean_text)

    # Keep only the texts that still contain something after cleaning.
    has_content = cleaned.str.len() > 0
    df_clean = cleaned[has_content].reset_index(drop=True)

    df_clean.to_csv(PROCESSED_PATH, index=False, header=False)
    return df_clean


def split_dataset(df_clean, test_size=0.8):
    """Split the cleaned texts into train/val/test parts and save them.

    NOTE: despite its name, ``test_size`` is the fraction that goes to the
    TRAIN part: ``1 - test_size`` is what gets held out. The held-out part
    is then divided in half between validation and test. With the default
    of 0.8 this gives an 80/10/10 split.

    Both splits are shuffled with ``random_state=42``. The three parts are
    written to TRAIN_PATH, VAL_PATH and TEST_PATH (no index, no header) and
    returned as ``(train_df, val_df, test_df)``.
    """
    split_options = {"random_state": 42, "shuffle": True}

    # First cut: train versus everything that is held out.
    train_df, holdout_df = train_test_split(
        df_clean, test_size=1-test_size, **split_options
    )
    # Second cut: the held-out part is shared equally by val and test.
    val_df, test_df = train_test_split(
        holdout_df, test_size=0.5, **split_options
    )

    outputs = ((train_df, TRAIN_PATH), (val_df, VAL_PATH), (test_df, TEST_PATH))
    for part, path in outputs:
        part.to_csv(path, index=False, header=False)

    return train_df, val_df, test_df


def load_split_texts(path):
    """Read one saved split file back as a Series of strings named "text".

    The split files hold one cleaned text per row with no header.
    ``dtype=str`` keeps digit-only rows (e.g. "123") as text, and
    ``na_filter=False`` keeps rows such as "nan", "null" or "na" as text
    instead of turning them into missing values.
    """
    frame = pd.read_csv(path, header=None, dtype=str, na_filter=False)
    return frame[0].rename("text")


def prepare_all_data(force: bool = False, debug_test: bool = False):
    """Return the train/val/test texts, building the files when needed.

    Args:
        force: rebuild everything from the raw dataset even if the split
            files already exist.
        debug_test: rebuild, but keep only the first 20000 cleaned texts.

    Returns:
        ``(train_df, val_df, test_df)``. Both branches return pandas Series
        of strings, so the result can be consumed the same way regardless
        of whether the data was cached or freshly built.

    NOTE: a ``debug_test`` run overwrites the split files with the reduced
    data set, so a later call without ``force`` will read that subset.
    """
    splits_cached = (
        os.path.isfile(TRAIN_PATH)
        and os.path.isfile(VAL_PATH)
        and os.path.isfile(TEST_PATH)
    )

    if not force and not debug_test and splits_cached:
        print("All data files already exist, simple read")

        # Read back as Series of str. A plain read_csv would give DataFrames,
        # and list(DataFrame) yields column labels rather than the texts.
        train_df = load_split_texts(TRAIN_PATH)
        val_df = load_split_texts(VAL_PATH)
        test_df = load_split_texts(TEST_PATH)

    else:
        print("Create cleared data file")
        df = clear_datatset()

        # In debug mode keep only the first 20000 rows to speed things up.
        if debug_test:
            df = df.head(20000)
            print(f"First 5 texts:\n{df.head()}\ntotal length = {len(df)}\n")

        print("Split data files")
        train_df, val_df, test_df = split_dataset(df)

    print(f"Lenght: train={len(train_df)}, val={len(val_df)}, test={len(test_df)}")
    return train_df, val_df, test_df


# if __name__ == "__main__":
#     prepare_all_data(force=True)



# Torch Dataset для задачи next-token prediction (автодополнение текста). (next_token_dataset.py)

In [2]:
"""
next_token_dataset.py
Torch Dataset для задачи next-token prediction (автодополнение текста).
"""
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


class NextTokenDataset(Dataset):
    """Dataset of token-id sequences for next-token prediction.

    Every item is a dict with ``input_ids`` (all tokens except the last)
    and ``labels`` (the same sequence advanced by one position), both as
    ``torch.long`` tensors.
    """

    def __init__(self, ids_list, eos_id):
        self.eos_id = eos_id
        self.ids_list = ids_list

    def __len__(self):
        return len(self.ids_list)

    def __getitem__(self, idx):
        tokens = torch.tensor(self.ids_list[idx], dtype=torch.long)

        # A sequence shorter than two tokens cannot form an (input, label)
        # pair, so it is replaced by two EOS tokens.
        if tokens.numel() < 2:
            tokens = torch.tensor([self.eos_id] * 2, dtype=torch.long)

        inputs = tokens[:-1]   # X
        targets = tokens[1:]   # Y: X advanced by one token
        return {"input_ids": inputs, "labels": targets}


def collate_fn(batch, pad_id: int):
    """Pad a list of dataset items into one batch.

    Args:
        batch: list of dicts with 1-D tensors ``input_ids`` and ``labels``.
        pad_id: token id used to pad ``input_ids``.

    Returns:
        dict with
        - ``input_ids``: ``[B, T]`` padded with ``pad_id``;
        - ``labels``: ``[B, T]`` padded with -100 (ignored by the loss);
        - ``attention_mask``: ``[B, T]`` long tensor, 1 on real tokens;
        - ``lengths``: ``[B]`` unpadded length of every input.
    """
    xs = [item["input_ids"] for item in batch]
    ys = [item["labels"] for item in batch]
    lengths = torch.tensor([len(x) for x in xs], dtype=torch.long)

    x_pad = pad_sequence(xs, batch_first=True, padding_value=pad_id)
    y_pad = pad_sequence(ys, batch_first=True, padding_value=-100)  # for ignore_index

    # Build the mask from the true lengths, not from ``x_pad != pad_id``:
    # pad_id may be a real token (e.g. pad == eos for GPT-2 tokenizers), and
    # comparing values would mask those real positions as padding.
    positions = torch.arange(x_pad.size(1), device=x_pad.device)
    attention_mask = (
        positions.unsqueeze(0) < lengths.to(x_pad.device).unsqueeze(1)
    ).long()

    return {"input_ids": x_pad, "attention_mask": attention_mask, "labels": y_pad, "lengths": lengths}


def GenDataLoaders(train_df, val_df, test_df, tokenizer, batch_size: int = 256, max_length: int = 128):
    """Build train/val/test DataLoaders for next-token prediction.

    The tokenizer is passed in (rather than created here) so that exactly
    the same tokenizer instance can be reused for the later comparison with
    the transformer model.

    Args:
        train_df, val_df, test_df: the texts of each split. Either an
            iterable of strings (e.g. a pandas Series) or a 2-D table whose
            first column holds the texts.
        tokenizer: callable tokenizer that returns a mapping with
            ``"input_ids"`` and exposes ``pad_token_id`` / ``eos_token_id``.
        batch_size: batch size of all three loaders.
        max_length: texts are truncated to this many tokens.

    Returns:
        ``(train_loader, val_loader, test_loader)``; only the train loader
        shuffles.
    """
    from functools import partial

    def _texts(data):
        # list(DataFrame) yields the column labels, not the rows, so a 2-D
        # table is reduced to its first column before being iterated.
        if getattr(data, "ndim", 1) == 2 and hasattr(data, "iloc"):
            data = data.iloc[:, 0]
        return [str(text) for text in data]

    def _encode(data):
        # Pre-tokenize once; padding is done per batch in collate_fn.
        return tokenizer(
            _texts(data), truncation=True, max_length=max_length, padding=False
        )["input_ids"]

    train_ids = _encode(train_df)
    val_ids = _encode(val_df)
    test_ids = _encode(test_df)

    pad_id = tokenizer.pad_token_id
    eos_id = tokenizer.eos_token_id

    # partial instead of a lambda: same behaviour, and unlike a lambda it can
    # be pickled should the loaders ever be given worker processes.
    collate = partial(collate_fn, pad_id=pad_id)

    def _loader(ids, shuffle):
        return DataLoader(
            NextTokenDataset(ids, eos_id=eos_id),
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=collate,
        )

    train_loader = _loader(train_ids, shuffle=True)
    val_loader = _loader(val_ids, shuffle=False)
    test_loader = _loader(test_ids, shuffle=False)

    return train_loader, val_loader, test_loader



# LSTM-модель для языкового моделирования (next token). (lstm_model.py)

In [3]:
"""
lstm_model.py
LSTM-модель для языкового моделирования (next token).
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class LSTMNextToken(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        embedding_dim: int = 128,
        hidden_size: int = 256,
        num_layers: int = 1,
        dropout: float = 0.1,
        pad_id: int = 0,
    ):
        super().__init__()
        self.vocab_size = vocab_size
        self.pad_id = pad_id

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_id)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, input_ids: torch.Tensor, lengths: torch.Tensor):
        """
        input_ids: [B, T] (X)
        lengths:   [B]
        return logits: [B, T, V] — предсказание следующего токена для каждого t
        """
        emb = self.embedding(input_ids)  # [B,T,E]

        packed = pack_padded_sequence(
            emb, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed)
        out, _ = pad_packed_sequence(packed_out, batch_first=True)  # [B,T,H]

        logits = self.fc(out)  # [B,T,V]
        return logits

    @torch.no_grad()
    def generate(
        self,
        prefix_ids: torch.Tensor,      # [T] или [1,T]
        lengths: torch.Tensor | None = None,
        max_new_tokens: int = 20,
        eos_id: int | None = None,
        temperature: float = 1.0,
        top_k: int | None = 50,
        do_sample: bool = True,
    ) -> torch.Tensor:
        """
        Возвращает ids = prefix + продолжение.
        Простой авторегрессионный цикл: каждый раз берём logits последней позиции.
        """
        self.eval()
        device = next(self.parameters()).device

        if prefix_ids.dim() == 1:
            ids = prefix_ids.unsqueeze(0).to(device)  # [1,T]
        else:
            ids = prefix_ids.to(device)

        if lengths is None:
            lengths = torch.tensor([ids.size(1)], device=device)

        for _ in range(max_new_tokens):
            logits = self.forward(ids, lengths=lengths)         # [1,T,V]
            next_logits = logits[:, -1, :]                      # [1,V]

            if temperature != 1.0:
                next_logits = next_logits / max(temperature, 1e-8)

            if top_k is not None:
                v, ix = torch.topk(next_logits, k=top_k, dim=-1)
                filt = torch.full_like(next_logits, float("-inf"))
                filt.scatter_(1, ix, v)
                next_logits = filt

            if do_sample:
                probs = F.softmax(next_logits, dim=-1)
                next_id = torch.multinomial(probs, num_samples=1)  # [1,1]
            else:
                next_id = torch.argmax(next_logits, dim=-1, keepdim=True)  # [1,1]

            ids = torch.cat([ids, next_id], dim=1)  # [1, T+1]
            lengths = lengths + 1

            if eos_id is not None and next_id.item() == eos_id:
                break

        return ids.squeeze(0)  # [T_total]


# Оценка LSTM модели: код замера и вывода метрики ROUGE. (ROUGE + генерация 3/4→1/4) (eval_lstm.py)

In [4]:
"""
eval_lstm.py
Оценка LSTM модели: код замера и вывода метрики ROUGE. (ROUGE + генерация 3/4→1/4).
"""
import torch
from rouge_score import rouge_scorer


def _ids_to_text(tokenizer, ids):
    """Decode token ids (1-D tensor or list) to stripped text, skipping special tokens."""
    id_list = ids.tolist() if isinstance(ids, torch.Tensor) else ids
    decoded = tokenizer.decode(id_list, skip_special_tokens=True)
    return decoded.strip()


@torch.no_grad()
def evaluate_rouge_3of4(
    model,
    dataloader,
    tokenizer,
    device,
    max_new_tokens: int = 32,
    do_sample: bool = False,
    top_k: int | None = None,
    temperature: float = 1.0,
    limit_batches: int | None = None,
):
    """
    Score the model's continuations with ROUGE-1 / ROUGE-2 (F-measure).

    For every sample the full token sequence is rebuilt from the inputs and
    the last label, then cut into a prefix (first 3/4) and a target (last
    1/4). The model continues the prefix and the generated suffix is
    compared with the target text.

    Samples are skipped when they have fewer than 4 input tokens, when the
    last label is the ignore value -100, or when either decoded text is
    empty.

    Returns a dict with the mean ``rouge1_f`` and ``rouge2_f`` and the
    number of scored samples ``n_samples`` (all zeros if nothing was scored).
    """
    model.eval()
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2"], use_stemmer=True)

    rouge1_sum = 0.0
    rouge2_sum = 0.0
    scored = 0

    for batch_index, batch in enumerate(dataloader):
        if limit_batches is not None and batch_index >= limit_batches:
            break

        inputs = batch["input_ids"].to(device)       # [B,T]
        targets = batch["labels"].to(device)         # [B,T] (-100 on padding)
        seq_lengths = batch["lengths"].to(device)    # [B]

        for row in range(inputs.size(0)):
            seq_len = int(seq_lengths[row].item())
            if seq_len < 4:
                continue

            # The full sequence has seq_len + 1 tokens: all the inputs plus
            # the label of the last input position.
            final_label = targets[row, seq_len - 1].item()
            if final_label == -100:
                continue

            tail = torch.tensor([final_label], device=device, dtype=torch.long)
            sequence = torch.cat([inputs[row, :seq_len], tail])  # [seq_len+1]

            split_at = max(1, int(0.75 * sequence.numel()))
            prompt = sequence[:split_at]           # first 3/4
            reference_ids = sequence[split_at:]    # last 1/4

            # Never generate more tokens than the target has, which keeps
            # the comparison fair.
            budget = min(max_new_tokens, int(reference_ids.numel()))
            generated = model.generate(
                prefix_ids=prompt,
                lengths=torch.tensor([prompt.numel()], device=device),
                max_new_tokens=budget,
                eos_id=tokenizer.eos_token_id,
                temperature=temperature,
                top_k=top_k,
                do_sample=do_sample,
            )

            # Drop the prompt, keep only what the model added.
            continuation = generated[prompt.numel():]

            reference = _ids_to_text(tokenizer, reference_ids)
            hypothesis = _ids_to_text(tokenizer, continuation)
            if not reference or not hypothesis:
                continue

            result = scorer.score(reference, hypothesis)
            rouge1_sum += result["rouge1"].fmeasure
            rouge2_sum += result["rouge2"].fmeasure
            scored += 1

    if scored == 0:
        return {"rouge1_f": 0.0, "rouge2_f": 0.0, "n_samples": 0}

    return {
        "rouge1_f": rouge1_sum / scored,
        "rouge2_f": rouge2_sum / scored,
        "n_samples": scored,
    }


# Обучение LSTM модели (lstm_train.py)

In [5]:
"""
lstm_train.py
Обучение LSTM модели.
"""

import os
import copy
import torch
import torch.nn as nn
from torch.optim import Adam
from tqdm import tqdm

# from eval_lstm import evaluate_rouge_3of4


def train_lstm(
    model,
    train_loader,
    val_loader,
    tokenizer,
    device,
    n_epochs: int = 5,
    lr: float = 1e-3,
    grad_clip: float = 1.0,
    eval_every_epochs: int = 1,
    max_new_tokens_eval: int = 32,
    save_dir: str = "models",
    save_name: str = "lstm_best.pt",
):
    """Train the LSTM language model and keep the best checkpoint.

    Every epoch runs one pass over ``train_loader`` (Adam, cross-entropy
    that ignores label -100, gradient-norm clipping) and one pass over
    ``val_loader`` to measure the validation loss. Whenever the validation
    loss improves, the weights are saved to ``save_dir/save_name``.

    Args:
        model: module called as ``model(input_ids, lengths)`` that returns
            logits of shape [B, T, V].
        train_loader, val_loader: loaders yielding dicts with
            ``input_ids``, ``lengths`` and ``labels``.
        tokenizer: used for the ROUGE evaluation; its ``name_or_path`` (if
            any) is stored in the checkpoint.
        device: device to train on.
        n_epochs: number of epochs.
        lr: learning rate for Adam.
        grad_clip: maximum gradient norm.
        eval_every_epochs: compute ROUGE on the validation set every this
            many epochs; 0 or a negative value turns ROUGE off.
        max_new_tokens_eval: generation cap for the ROUGE evaluation.
        save_dir: checkpoint directory, created if it does not exist.
        save_name: checkpoint file name.

    Returns:
        The model with the best (lowest validation loss) weights loaded.
    """
    # Create the checkpoint directory up front so that a missing folder is
    # not discovered only after a whole epoch of training.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    model.to(device)

    # Labels are padded with -100, hence ignore_index=-100.
    loss_fn = nn.CrossEntropyLoss(ignore_index=-100)
    optimizer = Adam(model.parameters(), lr=lr)

    best_val_loss = float("inf")
    best_state = None

    for epoch in range(n_epochs):
        model.train()
        total_loss = 0.0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{n_epochs} [train]"):
            inputs = batch["input_ids"].to(device)     # [B,T]
            lengths = batch["lengths"].to(device)      # [B]
            labels = batch["labels"].to(device)        # [B,T]

            optimizer.zero_grad()

            logits = model(inputs, lengths)            # [B,T,V]
            # CrossEntropyLoss expects [N,C,*], so reshape to [B,V,T].
            loss = loss_fn(logits.transpose(1, 2), labels)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()

            total_loss += loss.item()

        # Mean of the per-batch losses (not weighted by batch size).
        avg_train_loss = total_loss / max(1, len(train_loader))

        # Validation by loss (fast).
        model.eval()
        total_val_loss = 0.0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{n_epochs} [val]"):
                inputs = batch["input_ids"].to(device)
                lengths = batch["lengths"].to(device)
                labels = batch["labels"].to(device)

                logits = model(inputs, lengths)
                loss = loss_fn(logits.transpose(1, 2), labels)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / max(1, len(val_loader))

        # ROUGE is slower, so it can be computed only every few epochs.
        # The ``> 0`` guard avoids a ZeroDivisionError for eval_every_epochs=0.
        rouge_str = ""
        if eval_every_epochs > 0 and (epoch + 1) % eval_every_epochs == 0:
            rouge = evaluate_rouge_3of4(
                model=model,
                dataloader=val_loader,
                tokenizer=tokenizer,
                device=device,
                max_new_tokens=max_new_tokens_eval,
                do_sample=False,
                top_k=None,
                temperature=1.0,
                # limit_batches=50,  # enable for a quick test run
            )
            rouge_str = f", ROUGE1-F={rouge['rouge1_f']:.4f}, ROUGE2-F={rouge['rouge2_f']:.4f} (n={rouge['n_samples']})"

        print(
            f"Epoch {epoch+1}: Train Loss={avg_train_loss:.4f}, Val Loss={avg_val_loss:.4f}{rouge_str}"
        )

        # -------- save best checkpoint --------
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss

            # Deep copy: state_dict() returns references to the live
            # parameters, which keep changing in later epochs.
            best_state = copy.deepcopy(model.state_dict())

            ckpt_path = os.path.join(save_dir, save_name)
            torch.save(
                {
                    "epoch": epoch + 1,
                    "model_state_dict": best_state,
                    "optimizer_state_dict": optimizer.state_dict(),
                    "val_loss": best_val_loss,
                    "tokenizer_name": getattr(tokenizer, "name_or_path", None),
                },
                ckpt_path,
            )
            print(f"Saved best checkpoint to: {ckpt_path} (val_loss={best_val_loss:.4f})")

    # Finally, load the best weights back into the model.
    if best_state is not None:
        model.load_state_dict(best_state)

    return model

# Оценка трансформера distilgpt2 (eval_transformer_pipeline.py)

In [6]:
"""
src/eval_transformer_pipeline.py
оценка трансформера "distilgpt2"
"""
import torch
from rouge_score import rouge_scorer
from transformers import pipeline


def _decode(tokenizer, ids):
    """Turn token ids (tensor or list) into stripped text without special tokens."""
    if isinstance(ids, torch.Tensor):
        return tokenizer.decode(ids.tolist(), skip_special_tokens=True).strip()
    return tokenizer.decode(ids, skip_special_tokens=True).strip()


@torch.no_grad()
def evaluate_distilgpt2_pipeline_rouge_3of4(
    dataloader,
    gen, # pipeline object
    max_new_tokens_cap: int = 64,
    do_sample: bool = True,
    top_k: int = 50,
    top_p: float = 0.95,
    temperature: float = 0.9,
    limit_batches: int | None = 50,
):
    """Score a text-generation pipeline with ROUGE-1 / ROUGE-2 (F-measure).

    For every sample the full token sequence is rebuilt from the inputs and
    the last label and cut into a prefix (first 3/4) and a target (last
    1/4). Both are decoded with the pipeline's tokenizer; the pipeline
    continues the prefix text and its output is compared with the target.

    Samples are skipped when they have fewer than 4 input tokens, when the
    last label is -100, or when the prefix, the target or the generated
    text is empty.

    Returns a dict with the mean ``rouge1_f`` and ``rouge2_f`` and the
    number of scored samples ``n_samples`` (all zeros if nothing was scored).
    """
    tok = gen.tokenizer
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2"], use_stemmer=True)

    rouge1_sum = 0.0
    rouge2_sum = 0.0
    scored = 0

    for batch_index, batch in enumerate(dataloader):
        if limit_batches is not None and batch_index >= limit_batches:
            break

        inputs = batch["input_ids"]       # [B,T]
        targets = batch["labels"]         # [B,T]
        seq_lengths = batch["lengths"]    # [B]

        for row in range(inputs.size(0)):
            seq_len = int(seq_lengths[row].item())
            if seq_len < 4:
                continue

            final_label = targets[row, seq_len - 1].item()
            if final_label == -100:
                continue

            sequence = torch.cat(
                [inputs[row, :seq_len], torch.tensor([final_label])]
            )  # [seq_len+1]
            split_at = max(1, int(0.75 * sequence.numel()))
            reference_ids = sequence[split_at:]

            prefix_text = _decode(tok, sequence[:split_at])
            ref_text = _decode(tok, reference_ids)
            if not (prefix_text and ref_text):
                continue

            # Generate at most as many tokens as the target contains.
            budget = min(int(reference_ids.numel()), max_new_tokens_cap)

            outputs = gen(
                prefix_text,
                max_new_tokens=budget,           # limits only the newly added tokens
                do_sample=do_sample,
                top_k=top_k,
                top_p=top_p,
                temperature=temperature,
                num_return_sequences=1,
                return_full_text=False,          # return the continuation only, without the prefix
            )
            hyp_text = outputs[0]["generated_text"].strip()
            if not hyp_text:
                continue

            result = scorer.score(ref_text, hyp_text)
            rouge1_sum += result["rouge1"].fmeasure
            rouge2_sum += result["rouge2"].fmeasure
            scored += 1

    if scored == 0:
        return {"rouge1_f": 0.0, "rouge2_f": 0.0, "n_samples": 0}

    return {
        "rouge1_f": rouge1_sum / scored,
        "rouge2_f": rouge2_sum / scored,
        "n_samples": scored,
    }



  from .autonotebook import tqdm as notebook_tqdm


# Запуск всех операций (executor.py)

In [7]:
import torch
from transformers import pipeline, AutoTokenizer

# from data_utils import prepare_all_data
# from next_token_dataset import GenDataLoaders
# from lstm_model import LSTMNextToken
# from lstm_train import train_lstm
# from eval_lstm import evaluate_rouge_3of4
# from eval_transformer_pipeline import evaluate_distilgpt2_pipeline_rouge_3of4

# When True, prepare_all_data keeps only the first 20000 cleaned texts.
debug_test = True

torch.manual_seed(42)

# 1) Tokenizer
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "right"

# GPT-2 tokenizers usually come without a PAD token, so EOS is reused for it.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

vocab_size = len(tokenizer)
pad_id = tokenizer.pad_token_id or 0
eos_id = tokenizer.eos_token_id

# 2) Devices
device = "cuda" if torch.cuda.is_available() else "cpu"
pipeline_device = 0 if torch.cuda.is_available() else -1  # pipeline expects a device index, -1 = CPU

# 3) Data
train_df, val_df, test_df = prepare_all_data(force=True, debug_test=debug_test)
train_loader, val_loader, test_loader = GenDataLoaders(train_df, val_df, test_df, tokenizer)

# 4) LSTM train
model = LSTMNextToken(
    vocab_size=vocab_size,
    embedding_dim=128,
    hidden_size=128,
    num_layers=1,
    dropout=0.1,
    pad_id=pad_id,
)

model = train_lstm(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    tokenizer=tokenizer,
    device=device,
    n_epochs=5,
    lr=1e-3,
    grad_clip=1.0,
    eval_every_epochs=1,
    max_new_tokens_eval=20,
)

# To load the saved model for inference later:
# ckpt = torch.load("models/lstm_best.pt", map_location=device)
# model.load_state_dict(ckpt["model_state_dict"])
# model.eval()

rouge_lstm_val = evaluate_rouge_3of4(
    model, val_loader, tokenizer, device,
    max_new_tokens=32,
    do_sample=False
)
print("LSTM VAL:", rouge_lstm_val)

# 5) distilgpt2 pipeline (ONE instance, reuse everywhere)
gpt2_gen = pipeline(
    "text-generation",
    model="distilgpt2",
    tokenizer=tokenizer,   # the very same tokenizer object, to be safe
    device=pipeline_device,
)

# 6) distilgpt2 eval
rouge_gpt2_val = evaluate_distilgpt2_pipeline_rouge_3of4(
    dataloader=val_loader,
    gen=gpt2_gen,
    max_new_tokens_cap=64,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.9,
    limit_batches=None,
)
print("GPT2 VAL:", rouge_gpt2_val)


# 7) Side-by-side examples
list_examples = [
    "i am going",
    "tomorrow i will",
    "this movie is",
    "Company Google is",
    "If you compare Google and Yandex, you could say that",
    "Our mentor is very smart and",
    "The distilgpt2 model is very strange, but it allows",
    "I would like",
]

for prefix in list_examples:
    # LSTM: generate token ids and decode them
    prefix_ids = tokenizer(prefix, add_special_tokens=False)["input_ids"]
    prefix_ids = torch.tensor(prefix_ids, dtype=torch.long, device=device)

    lstm_ids = model.generate(
        prefix_ids,
        max_new_tokens=20,
        do_sample=True,
        top_k=50,
        temperature=1.0,
        eos_id=eos_id,
    )
    # Cut the continuation off at the token level. Slicing the decoded string
    # by len(prefix) characters is only correct when decoding reproduces the
    # prefix character for character, which is not guaranteed.
    lstm_suffix = tokenizer.decode(
        lstm_ids[prefix_ids.numel():].tolist(), skip_special_tokens=True
    ).strip()

    # GPT2 pipeline: ask for the continuation only
    out = gpt2_gen(
        prefix,
        max_new_tokens=20,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.9,
        num_return_sequences=1,
        return_full_text=False,
    )
    gpt2_suffix = out[0]["generated_text"].strip()

    print("PREFIX:", prefix)
    print("LSTM +:", lstm_suffix)
    print("GPT2 +:", gpt2_suffix)
    print("-" * 60)



Create cleared data file
First 5 texts:
0    switchfoot http twitpic com 2y1zl awww that s ...
1    is upset that he can t update his facebook by ...
2    kenichan i dived many times for the ball manag...
3       my whole body feels itchy and like its on fire
4    nationwideclass no it s not behaving at all i ...
Name: text, dtype: object
total length = 20000

Split data files
Lenght: train=16000, val=2000, test=2000


Epoch 1/5 [train]: 100%|██████████| 63/63 [05:04<00:00,  4.83s/it]
Epoch 1/5 [val]: 100%|██████████| 8/8 [00:15<00:00,  1.89s/it]


Epoch 1: Train Loss=8.7930, Val Loss=7.3892, ROUGE1-F=0.0478, ROUGE2-F=0.0000 (n=1920)
Saved best checkpoint to: models\lstm_best.pt (val_loss=7.3892)


Epoch 2/5 [train]: 100%|██████████| 63/63 [05:02<00:00,  4.80s/it]
Epoch 2/5 [val]: 100%|██████████| 8/8 [00:14<00:00,  1.86s/it]


Epoch 2: Train Loss=7.2491, Val Loss=7.2348, ROUGE1-F=0.0267, ROUGE2-F=0.0033 (n=1920)
Saved best checkpoint to: models\lstm_best.pt (val_loss=7.2348)


Epoch 3/5 [train]: 100%|██████████| 63/63 [05:00<00:00,  4.77s/it]
Epoch 3/5 [val]: 100%|██████████| 8/8 [00:14<00:00,  1.85s/it]


Epoch 3: Train Loss=7.0847, Val Loss=7.0943, ROUGE1-F=0.0332, ROUGE2-F=0.0036 (n=1920)
Saved best checkpoint to: models\lstm_best.pt (val_loss=7.0943)


Epoch 4/5 [train]: 100%|██████████| 63/63 [05:01<00:00,  4.79s/it]
Epoch 4/5 [val]: 100%|██████████| 8/8 [00:14<00:00,  1.86s/it]


Epoch 4: Train Loss=6.9298, Val Loss=6.9561, ROUGE1-F=0.0649, ROUGE2-F=0.0050 (n=1920)
Saved best checkpoint to: models\lstm_best.pt (val_loss=6.9561)


Epoch 5/5 [train]: 100%|██████████| 63/63 [05:12<00:00,  4.97s/it]
Epoch 5/5 [val]: 100%|██████████| 8/8 [00:15<00:00,  1.90s/it]


Epoch 5: Train Loss=6.7795, Val Loss=6.8307, ROUGE1-F=0.0673, ROUGE2-F=0.0053 (n=1920)
Saved best checkpoint to: models\lstm_best.pt (val_loss=6.8307)
LSTM VAL: {'rouge1_f': 0.06725882760349697, 'rouge2_f': 0.005282486610611611, 'n_samples': 1920}


Device set to use cpu


GPT2 VAL: {'rouge1_f': 0.06550649469167424, 'rouge2_f': 0.006085652538621535, 'n_samples': 1903}
PREFIX: i am going
LSTM +: to it in the time is of up one 3 we it is in work awww to work for
GPT2 +: to be.”

[image via Getty Images]
------------------------------------------------------------
PREFIX: tomorrow i will
LSTM +: the you i was from the morning good not was but it i and now i so he but i
GPT2 +: keep this for the long term.
But if you have any questions, then just write this:
------------------------------------------------------------
PREFIX: this movie is
LSTM +: in the i was out i have i don t am was to the and of s day no i
GPT2 +: a pretty good one as I haven't heard much about it before.) I'm very impressed with this
------------------------------------------------------------
PREFIX: Company Google is
LSTM +: it is my now it is too it be good day is so i miss you could feel i think
GPT2 +: making changes to its search engine and search engine to replace its search engi