In [None]:
import ast
import os
import random

import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer

# Index the records of movie_lines.txt: first field -> fifth field.
# Later code looks these keys up with the IDs listed in each conversation and
# uses the values as utterance text.
lines = {}
with open("movie_lines.txt", encoding="ISO-8859-1") as lines_file:
    for record in lines_file:
        fields = record.strip().split(" +++$+++ ")
        if len(fields) != 5:
            # Only records with exactly five fields are indexed.
            continue
        line_id, text = fields[0], fields[4]
        lines[line_id] = text

# Collect one list per record of movie_conversations.txt. The fourth field
# is a Python list literal; its items are used later as keys into `lines`.
convs = []
with open("movie_conversations.txt", encoding="ISO-8859-1") as f:
    for line in f:
        p = line.strip().split(" +++$+++ ")
        if len(p) == 4:
            # literal_eval parses the list literal without executing code,
            # unlike eval(), which would run any expression found in the file.
            convs.append(ast.literal_eval(p[3]))

# Build (prompt, reply) examples from every two consecutive entries of a
# conversation whose IDs are both present in `lines`.
pairs = []
for conversation in convs:
    for prompt_id, reply_id in zip(conversation, conversation[1:]):
        if prompt_id not in lines or reply_id not in lines:
            continue
        prompt = lines[prompt_id].strip()
        reply = lines[reply_id].strip()
        prompt_words = len(prompt.split())
        reply_words = len(reply.split())
        # Keep only exchanges where both sides have 4 to 24
        # whitespace-separated words.
        if 3 < prompt_words < 25 and 3 < reply_words < 25:
            pairs.append({"input": prompt, "output": reply})

# Checkpoint to fine-tune and its tokenizer.
model_id = "microsoft/DialoGPT-medium"
tk = AutoTokenizer.from_pretrained(model_id)
# Give the tokenizer a pad token by reusing EOS; tokenize() pads every
# example with padding="max_length".
tk.pad_token = tk.eos_token

# Train on a random subset of at most 5000 pairs. random.sample raises
# ValueError when asked for more items than the population holds, so the
# request is capped at the number of pairs that passed the filter.
sample_size = min(5000, len(pairs))
data = Dataset.from_list(random.sample(pairs, sample_size))

def tokenize(item):
    """Turn one prompt/reply pair into a fixed-length causal-LM example.

    The text is ``input + EOS + output + EOS``, truncated or padded to 64
    tokens by the module-level tokenizer ``tk``.

    Args:
        item: Mapping with string values under ``"input"`` and ``"output"``.

    Returns:
        The tokenizer output with an added ``"labels"`` list. Labels equal
        ``input_ids`` where the attention mask is set and ``-100`` elsewhere.
    """
    txt = item["input"] + tk.eos_token + item["output"] + tk.eos_token
    res = tk(txt, truncation=True, padding="max_length", max_length=64)
    # The pad token is the EOS token in this file, so padding cannot be told
    # apart from real EOS tokens by id. Use the attention mask instead and
    # label padding with -100 so those positions are excluded from the loss.
    res["labels"] = [
        token_id if attended else -100
        for token_id, attended in zip(res["input_ids"], res["attention_mask"])
    ]
    return res

# Load the pretrained causal-LM weights for the chosen checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_id)

# Apply tokenize() to each sampled pair, one example per call.
tokenized_data = data.map(function=tokenize)

# Training configuration: checkpoints go to ./results, two epochs, and no
# reporting integration.
args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    # Gradients from 8 single-example batches are accumulated per update.
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    # fp16 mixed precision needs a CUDA device; enabling it unconditionally
    # makes the script fail on CPU-only machines, so fall back to full
    # precision when no GPU is available.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

# Fine-tune the model on the tokenized pairs.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_data,
    tokenizer=tk,
)
trainer.train()

# Save the fine-tuned model and its tokenizer to the same directory.
final_model_dir = "./final_model"
trainer.save_model(final_model_dir)
tk.save_pretrained(final_model_dir)
     