In [2]:

# Load raw input and label text files
with open('/kaggle/input/t5model/input_texts.txt', 'r') as f:
    raw_inputs = f.readlines()

with open('/kaggle/input/t5model/label_texts.txt', 'r') as f:
    raw_labels = f.readlines()

# Step 1: Remove custom [sos] and [eos] tokens
cleaned_inputs = [line.strip().replace('[sos]', '').replace('[eos]', '') for line in raw_inputs]
cleaned_labels = [line.strip().replace('[sos]', '').replace('[eos]', '') for line in raw_labels]

# Step 2: Add "chat:" prefix to inputs only
final_inputs = [f"chat: {line.strip()}" for line in cleaned_inputs]
final_labels = [line.strip() for line in cleaned_labels]  # Don't put 'chat:' in labels

# Step 3: Train-test split
from sklearn.model_selection import train_test_split
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    final_inputs, final_labels, test_size=0.2, random_state=42
)


In [3]:
!pip install transformers -q

import os
import torch
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.optim import AdamW   # ✅ NEW

from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import warnings

warnings.filterwarnings("ignore")

# Load input and target texts
with open('/kaggle/input/t5model/input_texts.txt', 'r') as f:
    input_lines = f.readlines()

with open('/kaggle/input/t5model/label_texts.txt', 'r') as f:
    label_lines = f.readlines()

print("Sample Inputs:", input_lines[:3])
print("Sample Labels:", label_lines[:3])

# Split data into training and testing
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    input_lines, label_lines, test_size=0.2, random_state=42
)

# Initialize tokenizer and model
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenization function
def encode_data(sources, targets, tokenizer, max_len=128):
    source_enc = tokenizer(
        sources, padding=True, truncation=True, max_length=max_len, return_tensors="pt"
    )
    target_enc = tokenizer(
        targets, padding=True, truncation=True, max_length=max_len, return_tensors="pt"
    )
    return source_enc, target_enc

# Encode training and validation data
train_encodings, train_labels = encode_data(train_inputs, train_targets, tokenizer)
val_encodings, val_labels = encode_data(val_inputs, val_targets, tokenizer)

# Custom dataset class
class ChatbotDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return self.encodings["input_ids"].size(0)

    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": self.labels["input_ids"][idx]
        }

# Create datasets and dataloaders
train_dataset = ChatbotDataset(train_encodings, train_labels)
val_dataset = ChatbotDataset(val_encodings, val_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Optimizer setup
optimizer = AdamW(model.parameters(), lr=5e-6)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
print("\n🧠 Starting training...\n")
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attn_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attn_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs} — 🔥 Avg Training Loss: {avg_loss:.4f}")

# Evaluation loop (first batch)
print("\n🔍 Evaluating on sample test batch...\n")
model.eval()

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attn_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        preds = model.generate(input_ids=input_ids, attention_mask=attn_mask, max_length=50)

        inputs_decoded = [tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
        labels_decoded = [
            tokenizer.decode(label[label != -100], skip_special_tokens=True)
            for label in labels
        ]
        preds_decoded = [tokenizer.decode(p, skip_special_tokens=True) for p in preds]

        for inp, tgt, pred in zip(inputs_decoded, labels_decoded, preds_decoded):
            print("🗨️  User    :", inp.strip())
            print("✅ Expected:", tgt.strip())
            print("🤖 Predicted:", pred.strip())
            print("-" * 60)
        break  # Only evaluate one batch

2025-07-14 10:18:34.593370: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752488314.774567      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752488314.825890      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Sample Inputs: ['[sos] hi, how are you doing? [eos]\n', "[sos] i'm fine. how about yourself? [eos]\n", "[sos] i'm pretty good. thanks for asking. [eos]\n"]
Sample Labels: ["[sos] i'm fine. how about yourself? [eos]\n", "[sos] i'm pretty good. thanks for asking. [eos]\n", '[sos] no problem. so how have you been? [eos]\n']


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]


🧠 Starting training...



Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1/10 — 🔥 Avg Training Loss: 4.6526
Epoch 2/10 — 🔥 Avg Training Loss: 1.7010
Epoch 3/10 — 🔥 Avg Training Loss: 1.3167
Epoch 4/10 — 🔥 Avg Training Loss: 1.0876
Epoch 5/10 — 🔥 Avg Training Loss: 0.9752
Epoch 6/10 — 🔥 Avg Training Loss: 0.9262
Epoch 7/10 — 🔥 Avg Training Loss: 0.8960
Epoch 8/10 — 🔥 Avg Training Loss: 0.8725
Epoch 9/10 — 🔥 Avg Training Loss: 0.8576
Epoch 10/10 — 🔥 Avg Training Loss: 0.8420

🔍 Evaluating on sample test batch...

🗨️  User    : [sos] would you like to see a movie with me and my friend? [eos]
✅ Expected: [sos] do you know what movie you're going to watch? [eos]
🤖 Predicted: [sos] i'm not sure. [eos]
------------------------------------------------------------
🗨️  User    : [sos] and we can afford it! [eos]
✅ Expected: [sos] so are we going to buy it? [eos]
🤖 Predicted: [sos] we can afford it! [eos]
------------------------------------------------------------
🗨️  User    : [sos] i'm not being nosey. i'm just asking. [eos]
✅ Expected: [sos] i really don't t

In [4]:
for i in range(5):
    print(f"> INPUT:  {train_inputs[i]}")
    print(f"> TARGET: {train_targets[i]}\n")


> INPUT:  [sos] it's supposed to start at about eight. [eos]

> TARGET: [sos] how many invitations has she given out? [eos]


> INPUT:  [sos] i don't know. [eos]

> TARGET: [sos] when did you lose it? [eos]


> INPUT:  [sos] what's going on? [eos]

> TARGET: [sos] nothing really, you? [eos]


> INPUT:  [sos] did you hear the news? [eos]

> TARGET: [sos] what happened? [eos]


> INPUT:  [sos] i was crossing the street. [eos]

> TARGET: [sos] were you in a crosswalk? [eos]




In [None]:
def chat(input_text, max_length=50, do_sample=False):
    # Match training format exactly
    prompt = f"[sos] {input_text.strip()} [eos]"
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128
    ).to(device)

    output_ids = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        num_beams=4,
        do_sample=do_sample,
        no_repeat_ngram_size=2,
        early_stopping=True
    )

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return reply
print("🤖 Chatbot is ready! Type 'exit' to quit.")
while True:
    user_input = input("🧑 You: ")
    if user_input.lower() in ["exit", "quit"]:
        print("👋 Goodbye!")
        break
    response = chat(user_input)
    print(f"🤖 Bot: {response}")


🤖 Chatbot is ready! Type 'exit' to quit.


🧑 You:  What's going on?


🤖 Bot: [sos] i don't know what's going on in the future. [eoses]


🧑 You:  Did you hear the news


🤖 Bot: [sos] i'm going to be able to read the news. [eoses]
