In [9]:
import os

base_dirs = [
    "data/processed",
    "data/vocab",
    "src"
]

for d in base_dirs:
    os.makedirs(d, exist_ok=True)

print("✅ Folder structure created.")

✅ Folder structure created.


In [10]:
import os
import zipfile
import urllib.request

url = "http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip"
output_path = "data/raw/cornell.zip"
extract_path = "data/raw/cornell"

# Create folders
os.makedirs("data/raw", exist_ok=True)

# Download
if not os.path.exists(output_path):
    urllib.request.urlretrieve(url, output_path)
    print("✅ Download complete.")

# Extract
with zipfile.ZipFile(output_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)
    print("✅ Extraction complete.")


✅ Extraction complete.


In [11]:
from pathlib import Path

# Load movie lines
lines_path = Path(extract_path) / "cornell movie-dialogs corpus/movie_lines.txt"
convo_path = Path(extract_path) / "cornell movie-dialogs corpus/movie_conversations.txt"

# Map line IDs to text
id2line = {}
with open(lines_path, "r", encoding="iso-8859-1") as f:
    for line in f:
        parts = line.strip().split(" +++$+++ ")
        if len(parts) == 5:
            line_id, text = parts[0], parts[4]
            id2line[line_id] = text

# Get conversation sequences
conversations = []
with open(convo_path, "r", encoding="iso-8859-1") as f:
    for line in f:
        parts = line.strip().split(" +++$+++ ")
        if len(parts) == 4:
            line_ids_str = parts[3]
            line_ids = eval(line_ids_str)
            conversations.append(line_ids)

# Create pairs: (input, response)
pairs = []
for conv in conversations:
    for i in range(len(conv) - 1):
        first = id2line.get(conv[i], "").strip()
        second = id2line.get(conv[i+1], "").strip()
        if first and second:
            pairs.append((first, second))

print(f"✅ Total pairs: {len(pairs)}")
print("Example:")
print("👤", pairs[0][0])
print("🤖", pairs[0][1])


✅ Total pairs: 221282
Example:
👤 Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
🤖 Well, I thought we'd start with pronunciation, if that's okay with you.


In [162]:
import os
import re
import torch
import json
from pathlib import Path
from sklearn.model_selection import train_test_split
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders, processors

# === CONFIG ===
lines_path = "data/raw/cornell/cornell movie-dialogs corpus/movie_lines.txt"
convo_path = "data/raw/cornell/cornell movie-dialogs corpus/movie_conversations.txt"
vocab_dir = "data/vocab"
out_dir = "data/processed"
vocab_size = 30000

# === CLEAN FUNCTION ===
def clean(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s\.\?!']", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# === LOAD LINES ===
id2line = {}
with open(lines_path, "r", encoding="iso-8859-1") as f:
    for line in f:
        parts = line.strip().split(" +++$+++ ")
        if len(parts) == 5:
            line_id, text = parts[0], parts[4]
            id2line[line_id] = clean(text)

# === LOAD CONVERSATIONS ===
conversations = []
with open(convo_path, "r", encoding="iso-8859-1") as f:
    for line in f:
        parts = line.strip().split(" +++$+++ ")
        if len(parts) == 4:
            ids = eval(parts[3])
            conversations.append(ids)

# === CREATE (input, response) PAIRS ===
pairs = []
for conv in conversations:
    for i in range(len(conv) - 1):
        first = id2line.get(conv[i], "")
        second = id2line.get(conv[i + 1], "")
        if first and second:
            pairs.append((first, second))

print(f"✅ Loaded {len(pairs)} dialogue pairs")
pairs= pairs[:1000]  # Limit to first 1000 pairs for testing

# === TRAIN TOKENIZER ===
def train_tokenizer(pairs, vocab_size, save_path):
    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["[PAD]", "[SOS]", "[EOS]", "[UNK]"]
    )

    all_texts = [s for p in pairs for s in p]
    tokenizer.train_from_iterator(all_texts, trainer=trainer)

    tokenizer.post_processor = processors.TemplateProcessing(
        single="[SOS] $A [EOS]",
        pair="[SOS] $A [EOS] [SOS] $B [EOS]",
        special_tokens=[
            ("[SOS]", tokenizer.token_to_id("[SOS]")),
            ("[EOS]", tokenizer.token_to_id("[EOS]"))
        ]
    )

    
    # tokenizer.decoder = decoders.BPE()
    Path(save_path).mkdir(parents=True, exist_ok=True)
    tokenizer.save(str(Path(save_path) / "chatbot_tokenizer.json"))

    return tokenizer

tokenizer = train_tokenizer(pairs, vocab_size, vocab_dir)
print("✅ Tokenizer trained and saved")

# === ENCODE & SPLIT ===
def encode_pairs(pairs, tokenizer):
    encoded = []
    for src, tgt in pairs:
        src_ids = tokenizer.encode(src).ids
        tgt_ids = tokenizer.encode(tgt).ids
        encoded.append((src_ids, tgt_ids))
    return encoded

encoded_pairs = encode_pairs(pairs, tokenizer)
print(f"✅ Encoded {len(encoded_pairs)} pairs")

# === SPLIT DATA ===
train, temp = train_test_split(encoded_pairs, test_size=0.2, random_state=42)
val, test = train_test_split(temp, test_size=0.5, random_state=42)

Path(out_dir).mkdir(parents=True, exist_ok=True)
torch.save(train, f"{out_dir}/train.pt")
torch.save(val, f"{out_dir}/val.pt")
torch.save(test, f"{out_dir}/test.pt")

print("✅ Saved train, val, and test splits")


✅ Loaded 221282 dialogue pairs
✅ Tokenizer trained and saved
✅ Encoded 1000 pairs
✅ Saved train, val, and test splits


In [163]:
import torch
from tokenizers import Tokenizer
from pathlib import Path

# Load tokenizer
tokenizer = Tokenizer.from_file("data/vocab/chatbot_tokenizer.json")

# Load sample data
dataset = torch.load("data/processed/train.pt")

# Function to decode token IDs
def decode_ids(ids):
    return tokenizer.decode(ids, skip_special_tokens=True)

# Show 5 samples
for i in range(5):
    input_ids, output_ids = dataset[i]
    input_text = decode_ids(input_ids)
    output_text = decode_ids(output_ids)
    
    print(f"👤 Input  {i+1}: {input_text}")
    print(f"🤖 Output {i+1}: {output_text}")
    print("-" * 60)


👤 Input  1: well no ...
🤖 Output 1: then that ' s all you had to say .
------------------------------------------------------------
👤 Input  2: but we do have a lack of notaries . you should contact my administration .
🤖 Output 2: don bobadilla is already a judge my dear don cristobal .
------------------------------------------------------------
👤 Input  3: so ... the station is empty ?
🤖 Output 3: yeah . this way .
------------------------------------------------------------
👤 Input  4: utapan won ' t you speak to me ? you used to know how to speak to me .
🤖 Output 4: you never learned how to speak my language .
------------------------------------------------------------
👤 Input  5: she just saw two of her friends killed ! they probably threatened her .
🤖 Output 5: is that all there is ?
------------------------------------------------------------


  dataset = torch.load("data/processed/train.pt")


In [137]:
import torch
from tokenizers import Tokenizer
from pathlib import Path

# Load tokenizer
tokenizer = Tokenizer.from_file("data/vocab/chatbot_tokenizer.json")

# Load dataset
dataset = torch.load("data/processed/train.pt")

# Print 5 samples
for i in range(5):
    input_ids, output_ids = dataset[i]
    
    # Decode
    input_text = tokenizer.decode(input_ids, skip_special_tokens=True)
    output_text = tokenizer.decode(output_ids, skip_special_tokens=True)
    
    print(f"🟦 Sample {i+1}")
    print("🔢 Input IDs :", input_ids)
    print("🗣️  Input Text:", input_text)
    print("🔢 Output IDs:", output_ids)
    print("🤖 Output Text:", output_text)
    print("-" * 80)


🟦 Sample 1
🔢 Input IDs : [1, 228, 57, 88, 2]
🗣️  Input Text: well no ...
🔢 Output IDs: [1, 203, 76, 5, 35, 106, 45, 281, 53, 162, 6, 2]
🤖 Output Text: then that ' s all you had to say .
--------------------------------------------------------------------------------
🟦 Sample 2
🔢 Input IDs : [1, 135, 67, 77, 117, 17, 1802, 79, 2935, 6, 45, 357, 1555, 100, 3699, 6, 2]
🗣️  Input Text: but we do have a lack of notaries . you should contact my administration .
🔢 Output IDs: [1, 99, 1156, 55, 505, 17, 2306, 100, 2061, 99, 2570, 6, 2]
🤖 Output Text: don bobadilla is already a judge my dear don cristobal .
--------------------------------------------------------------------------------
🟦 Sample 3
🔢 Input IDs : [1, 82, 88, 51, 1225, 55, 2262, 16, 2]
🗣️  Input Text: so ... the station is empty ?
🔢 Output IDs: [1, 183, 6, 131, 169, 6, 2]
🤖 Output Text: yeah . this way .
--------------------------------------------------------------------------------
🟦 Sample 4
🔢 Input IDs : [1, 2527, 340, 5, 36, 

  dataset = torch.load("data/processed/train.pt")


In [138]:
vocab = tokenizer.get_vocab()
sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])  # sort by ID
for token, idx in sorted_vocab[:10]:  # top 10 tokens
    print(f"{idx:>3} → {token}")

tokenizer.token_to_id("hello")

  0 → [PAD]
  1 → [SOS]
  2 → [EOS]
  3 → [UNK]
  4 → !
  5 → '
  6 → .
  7 → 0
  8 → 1
  9 → 2


In [30]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random
from tokenizers import Tokenizer

In [31]:
train_data = torch.load("data/processed/train.pt")
val_data = torch.load("data/processed/val.pt")
test_data = torch.load("data/processed/test.pt")

print("Training set size:", len(train_data))
print("Validation set size:", len(val_data))
print("Test set size:", len(test_data))


Training set size: 800
Validation set size: 100
Test set size: 100


  train_data = torch.load("data/processed/train.pt")
  val_data = torch.load("data/processed/val.pt")
  test_data = torch.load("data/processed/test.pt")


In [None]:

max_len_input = max(len(x) for x, _ in train_data)
max_len_target = max(len(y) for _, y in train_data)

print("Max length of input:", max_len_input)
print("Max length of target:", max_len_target)

print("Vocab size:", tokenizer.get_vocab_size())

In [33]:
class ChatbotDataset(Dataset):
    def __init__(self, data, pad_token_id, max_len=40):
        self.data = data
        self.pad_token_id = pad_token_id
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src, tgt = self.data[idx]
        if len(src) > self.max_len:
            src = src[:self.max_len]
        if len(tgt) > self.max_len:
            tgt = tgt[:self.max_len]

        src = src[:self.max_len] + [self.pad_token_id] * (self.max_len - len(src))
        tgt = tgt[:self.max_len] + [self.pad_token_id] * (self.max_len - len(tgt))
        return torch.tensor(src), torch.tensor(tgt)


In [34]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)

    def forward(self, x):
        embed = self.embedding(x)
        _, (hidden, cell) = self.lstm(embed)
        return hidden, cell


In [35]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        x = x.unsqueeze(1)
        embed = self.embedding(x)
        out, (hidden, cell) = self.lstm(embed, (hidden, cell))
        logits = self.fc(out.squeeze(1))
        return logits, hidden, cell


In [36]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, pad_token_id, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.pad_token_id = pad_token_id

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size, tgt_len = tgt.shape
        vocab_size = self.decoder.fc.out_features
        outputs = torch.zeros(batch_size, tgt_len, vocab_size).to(self.device)

        hidden, cell = self.encoder(src)
        input_token = tgt[:, 0]  # <sos>

        for t in range(1, tgt_len):
            output, hidden, cell = self.decoder(input_token, hidden, cell)
            outputs[:, t] = output
            top1 = output.argmax(1)
            input_token = tgt[:, t] if random.random() < teacher_forcing_ratio else top1

        return outputs


In [63]:
# ...existing code...
train_dataset = train_dataset.remove_columns(
    [col for col in train_dataset.column_names if col not in ['input_ids', 'labels']]
)
val_dataset = val_dataset.remove_columns(
    [col for col in val_dataset.column_names if col not in ['input_ids', 'labels']]
)
# ...existing code...

In [64]:
print(train_dataset)

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 9025
})


In [65]:
# Setup hyperparameters
embed_size = 256
hidden_size = 512
batch_size = 32
num_epochs = 100
# pad_id = tokenizer.token_to_id("[PAD]")  # you should have this from preprocessing
# vocab_size = tokenizer.get_vocab_size()

pad_id = tokenizer.pad_token_id
vocab_size = tokenizer.vocab_size

# Datasets
# train_dataset = ChatbotDataset(train_data, pad_id)
# val_dataset = ChatbotDataset(val_data, pad_id)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Model + optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = Encoder(vocab_size, embed_size, hidden_size)
decoder = Decoder(vocab_size, embed_size, hidden_size)
model = Seq2Seq(encoder, decoder, pad_id, device).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=pad_id)


In [None]:
print("Training dataset size:", len(train_dataset))
print("Batch size:", batch_size)
print(train_data[0][0])
print(pad_id)
print(train_dataset[1][0], train_dataset[1][1])  # Check shape of first sample

In [None]:
def train_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    # for src, tgt in loader:
    for batch in loader:
        src, tgt = batch['input_ids'], batch['labels']
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, tgt)
        loss = criterion(output[:, 1:].reshape(-1, vocab_size), tgt[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

for epoch in range(1, num_epochs + 1):
    loss = train_epoch(model, train_loader, optimizer, criterion)
    print(f"Epoch {epoch} | Loss: {loss:.4f}")


In [None]:
torch.save(model.state_dict(), "model.pt")

In [12]:
# Redefine the encoder and decoder
tokenizer = Tokenizer.from_file("data/vocab/chatbot_tokenizer.json")
pad_id = tokenizer.token_to_id("[PAD]")

embed_size = 256
hidden_size = 512
vocab_size = tokenizer.get_vocab_size()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = Encoder(vocab_size, embed_size, hidden_size)
decoder = Decoder(vocab_size, embed_size, hidden_size)

# Recreate the Seq2Seq model
model = Seq2Seq(encoder, decoder, pad_id, device).to(device)

# Load the saved model weights
model.load_state_dict(torch.load("model.pt"))
model.eval()

print("Model loaded successfully!")

Model loaded successfully!


  model.load_state_dict(torch.load("model.pt"))


In [19]:
def generate_response(model, tokenizer, input_text, max_len=40):
    model.eval()
    ids = tokenizer.encode(input_text).ids
    ids = ids[:max_len] + [pad_id] * (max_len - len(ids))
    src = torch.tensor(ids).unsqueeze(0).to(device)

    hidden, cell = model.encoder(src)
    input_token = torch.tensor([tokenizer.token_to_id("[SOS]")]).to(device)

    output_ids = []
    for _ in range(max_len):
        output, hidden, cell = model.decoder(input_token, hidden, cell)
        top1 = output.argmax(1).item()
        if top1 == tokenizer.token_to_id("[EOS]"):
            break
        output_ids.append(top1)
        input_token = torch.tensor([top1]).to(device)

    # print(output_ids)
    # print([tokenizer.id_to_token(id) for id in output_ids])
    # print(tokenizer.decode(output_ids))

    # return tokenizer.decode(output_ids)
    return output_ids

# Try it

input_text = tokenizer.decode(test_data[19][0], skip_special_tokens=True)
print("Input Text:", input_text)
output_text = tokenizer.decode(test_data[19][1], skip_special_tokens=True)
print("Output Text:", output_text)
print("Generating response...")
response_ids = generate_response(model, tokenizer, input_text)
for id in response_ids:
    print(tokenizer.id_to_token(id), end=" ")
    
print("\n")

Input Text: what ' s normal ?
Output Text: bogey lowenstein ' s party is normal but you ' re too busy listening to bitches who need prozac to know that .
Generating response...
i ' m getting trashed man . isn ' t that what you ' re supposed to do at a party ? 



In [181]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np

In [182]:
def evaluate(model, data_loader, tokenizer, pad_id):
    model.eval()
    total_tokens = 0
    correct_tokens = 0
    bleu_scores = []

    chencherry = SmoothingFunction()

    with torch.no_grad():
        for src, tgt in data_loader:
            src, tgt = src.to(device), tgt.to(device)
            batch_size, seq_len = tgt.size()

            output = model(src, tgt, teacher_forcing_ratio=0.0)  # no teacher forcing
            predictions = output.argmax(-1)

            for i in range(batch_size):
                pred_seq = predictions[i].tolist()
                true_seq = tgt[i].tolist()

                # Remove padding and special tokens
                pred_seq = [tok for tok in pred_seq if tok != pad_id and tok != tokenizer.token_to_id("[SOS]")]
                true_seq = [tok for tok in true_seq if tok != pad_id and tok != tokenizer.token_to_id("[SOS]")]

                # Token Accuracy
                min_len = min(len(pred_seq), len(true_seq))
                total_tokens += min_len
                correct_tokens += sum([1 for p, t in zip(pred_seq, true_seq) if p == t])

                # BLEU Score
                ref = [tokenizer.decode(true_seq).split()]
                hyp = tokenizer.decode(pred_seq).split()
                bleu = sentence_bleu(ref, hyp, smoothing_function=chencherry.method1)
                bleu_scores.append(bleu)

    token_accuracy = correct_tokens / total_tokens if total_tokens > 0 else 0
    avg_bleu = np.mean(bleu_scores)
    return token_accuracy, avg_bleu


In [183]:
val_accuracy, val_bleu = evaluate(model, val_loader, tokenizer, pad_id)
print(f"Validation Token Accuracy: {val_accuracy:.2%}")
print(f"Validation BLEU Score: {val_bleu:.4f}")

Validation Token Accuracy: 3.87%
Validation BLEU Score: 0.0112


In [184]:
test_dataset = ChatbotDataset(test_data, pad_id)
test_loader = DataLoader(test_dataset, batch_size=32)

test_accuracy, test_bleu = evaluate(model, test_loader, tokenizer, pad_id)
print(f"Test Token Accuracy: {test_accuracy:.2%}")
print(f"Test BLEU Score: {test_bleu:.4f}")


Test Token Accuracy: 4.37%
Test BLEU Score: 0.0105


In [None]:
!pip install transformers datasets sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   ---------------------------------------- 991.5/991.5 kB 5.8 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: C:\Users\japne\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Using T5 tokenizer + Model


In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer_t5 = T5Tokenizer.from_pretrained("t5-small")
model_t5 = T5ForConditionalGeneration.from_pretrained("t5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
def chat_with_t5(input_text, max_length=50):
    input_ids = tokenizer_t5.encode("chat: " + input_text, return_tensors="pt")
    output_ids = model_t5.generate(input_ids, max_length=max_length, num_beams=5, early_stopping=True)
    return tokenizer_t5.decode(output_ids[0], skip_special_tokens=True)

# Test the T5 model with some sample inputs
print("User: Hello!")
print("Bot:", chat_with_t5("Hello!"))


print("\nUser: Tell me a joke.")
print("Bot:", chat_with_t5("Tell me a joke."))

print("\nUser: What is your name?")
print("Bot:", chat_with_t5("What is your name?"))


User: Hello!
Bot: chat: Hello!

User: Tell me a joke.
Bot: chat: Tell me a joke.

User: What is your name?
Bot: chat: What is your name?


Fine tuning it on Cornell


In [24]:
from datasets import Dataset
from transformers import T5Tokenizer
import pandas as pd
import re
import os

# Example dialogue pairs
# Replace this with your real (input, response) pairs
# data = [
#     {"input": "chat: Hello!", "output": "Hi there!"},
#     {"input": "chat: How are you?", "output": "I'm doing well, thank you."},
#     {"input": "chat: What's your name?", "output": "I'm a chatbot."}
#     # Add more pairs from Cornell/PersonaChat here
# ]

def clean_text(text):
    text = text.lower().strip()
    text = text.replace("\n", " ")
    text = re.sub(r"[^a-zA-Z0-9?.!,¿]+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

data= pairs 
# Clean the data
data = [(clean_text(src), clean_text(tgt)) for src, tgt in data]

df = pd.DataFrame(data)
df.columns = ['input', 'output']
df['input'] = "chat: " + df['input']  # Prefix for T5
df['output'] = df['output']  # No prefix for output
dataset = Dataset.from_pandas(df)

# Load tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")



# Tokenize
def preprocess(example):
    input_enc = tokenizer(example['input'], truncation=True, padding='max_length', max_length=32)
    target_enc = tokenizer(example['output'], truncation=True, padding='max_length', max_length=32)
    input_enc['labels'] = target_enc['input_ids']
    return input_enc

tokenized_dataset = dataset.map(preprocess)

Map:   0%|          | 0/221282 [00:00<?, ? examples/s]

In [25]:
# Save the tokenized dataset
output_dir = "data/processed/cornell_t5_dataset"
os.makedirs(output_dir, exist_ok=True)
tokenized_dataset.save_to_disk(output_dir)
print(f"Tokenized dataset saved to {output_dir}")

Saving the dataset (0/1 shards):   0%|          | 0/221282 [00:00<?, ? examples/s]

Tokenized dataset saved to data/processed/cornell_t5_dataset


In [31]:
# Show the dataset along with cols
print(tokenized_dataset[0])
print(tokenized_dataset.column_names)
print(tokenized_dataset[0]['input'])  # Check the input
print(tokenized_dataset[0]['labels'])  # Check the labels
print(tokenizer.decode(tokenized_dataset[0]['input_ids']))  # Decode the input IDs
print(tokenizer.decode(tokenized_dataset[0]['labels']))  # Decode the labels


{'input': 'chat: can we make this quick? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad. again.', 'output': 'well i thought we d start with pronunciation if that s okay with you.', 'input_ids': [3582, 10, 54, 62, 143, 48, 1704, 58, 3, 12907, 4515, 3, 5543, 9249, 11, 11, 60, 210, 18595, 17, 17, 33, 578, 46, 3, 5828, 21315, 989, 1162, 452, 1733, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [168, 3, 23, 816, 62, 3, 26, 456, 28, 30637, 3, 99, 24, 3, 7, 8957, 28, 25, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
['input', 'output', 'input_ids', 'attention_mask', 'labels']
chat: can we make this quick? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad. again.
[168, 3, 23, 816, 62, 3, 26, 456, 28, 30637, 3, 99, 24, 3, 7, 8957, 28, 25, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
chat: can we make this quick? roxanne

In [38]:
train_test = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_val = train_test['train'].train_test_split(test_size=0.2, seed=42)
train_dataset = train_val['train']
val_dataset = train_val['test']
test_dataset = train_test['test']
print("Train dataset size:", len(train_dataset))
print("Validation dataset size:", len(val_dataset))
print("Test dataset size:", len(test_dataset))


Train dataset size: 141620
Validation dataset size: 35405
Test dataset size: 44257


In [45]:
print(test_dataset[0]['input'])

chat: that is not true mrs. ayala. your route is compromised. perhaps it is time for me to deal with other distributors in california.


In [25]:
# show a sample
print(tokenized_dataset[0]['input'])
print(tokenized_dataset[0]['output'])
print(tokenized_dataset[0]['input_ids'])
print(tokenized_dataset[0]['labels'])

# print lengths
print("Input length:", len(tokenized_dataset[0]['input_ids']))
print("Output length:", len(tokenized_dataset[0]['labels']))
print("Input length:", len(tokenized_dataset[0]['input']))
print("Output length:", len(tokenized_dataset[0]['output']))

chat: can we make this quick? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad. again.
well, i thought we d start with pronunciation, if that s okay with you.
[3582, 10, 54, 62, 143, 48, 1704, 58, 3, 12907, 4515, 3, 5543, 9249, 11, 11, 60, 210, 18595, 17, 17, 33, 578, 46, 3, 5828, 21315, 989, 1162, 452, 1733, 1]
[168, 6, 3, 23, 816, 62, 3, 26, 456, 28, 30637, 6, 3, 99, 24, 3, 7, 8957, 28, 25, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Input length: 32
Output length: 32
Input length: 136
Output length: 71


In [39]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

model = T5ForConditionalGeneration.from_pretrained("t5-small")

training_args = TrainingArguments(
    output_dir="./t5-chatbot",
    per_device_train_batch_size=4,
    num_train_epochs=10,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=100,
    save_total_limit=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [None]:
trainer.train()
model.save_pretrained("t5-chatbot-model")
tokenizer.save_pretrained("t5-chatbot-model")

In [66]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load model
model = T5ForConditionalGeneration.from_pretrained("t5-chatbot-model")
tokenizer = T5Tokenizer.from_pretrained("t5-chatbot-model")

def chat(prompt):
    input_text = "chat: " + prompt
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(input_ids, max_length=32, num_beams=5, early_stopping=True)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Try chatting
# print(chat("Hello! How are you?"))
# print(chat("Tell me a joke."))
# print(chat("What is your name?"))
# print(chat("What is the weather like today?"))
# print(chat("Can you tell me a story?"))

# Use this on test set

input_text = test_dataset[110]['input']
input_ids = test_dataset[110]['input_ids']
output_ids = test_dataset[110]['labels']
output_text = test_dataset[110]['output']

print("Input Text:", input_text)
print("Input IDs:", input_ids)
print("Output IDs:", output_ids)
print("Output Text:", output_text)
print("Decoded Input Text:", tokenizer.decode(input_ids))
print("Decoded Output Text:", tokenizer.decode(output_ids))
print("Chatbot Response:", chat(input_text))


Input Text: chat: it s that woman from the coach!
Input IDs: [3582, 10, 34, 3, 7, 24, 2335, 45, 8, 3763, 55, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Output IDs: [3, 23, 3, 195, 36, 17227, 15, 26, 233, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Output Text: i ll be damned...
Decoded Input Text: chat: it s that woman from the coach!</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
Decoded Output Text: i ll be damned...</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
Chatbot Response: i m not a coach.


In [53]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset
from tqdm import tqdm

# Load your test data (replace this with your own test set)
# Example format: list of (input, expected_output)
# For Cornell, it might be something like:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# from test_dataset keep only 'input' and 'output' columns
test_data = [(item['input'], item['output']) for item in test_dataset]
test_data = test_data[:100]  # Limit to first 100 for testing

# Evaluation loop
def evaluate(model, tokenizer, test_data):
    results = []
    for source, expected in tqdm(test_data):
        input_ids = tokenizer.encode(source, return_tensors="pt").to(device)

        with torch.no_grad():
            output_ids = model.generate(
                input_ids,
                max_length=50,
                num_beams=4,
                early_stopping=True
            )

        generated = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        results.append({
            "Input": source,
            "Expected": expected,
            "Generated": generated
        })

    return results

# Run evaluation
results = evaluate(model, tokenizer, test_data)

# Print results
for r in results:
    print(f"Input: {r['Input']}")
    print(f"Expected: {r['Expected']}")
    print(f"Generated: {r['Generated']}\n")

100%|██████████| 100/100 [00:59<00:00,  1.68it/s]

Input: chat: that is not true mrs. ayala. your route is compromised. perhaps it is time for me to deal with other distributors in california.
Expected: i don t think you re going to do that.
Generated: i don t know. i don t know. i don t know. i don t know.

Input: chat: this is a total catastrastroke. as of this moment i am stumped i admit i am stumped and treed both the hound dogs have me surrounded.
Expected: poor thing don t cry rose. i know you feel awful but don t cry honey nobody s perfect. who s the father dear?
Generated: i m stumped and treed.

Input: chat: because i m tired of not understanding things. cops mafia and butlers forcing me to bust my ass to steal something which it turns out i really didn t steal it s fucked up.
Expected: you re not thinking of going to...
Generated: i m tired of not understanding things.

Input: chat: which one is me? the horse?
Expected: get out of here.
Generated: i m a horse. i m a horse.

Input: chat: stop this now. i ll do it. i swear.
Exp




In [54]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

smoothie = SmoothingFunction().method4

scores = []
for r in results:
    ref = [r["Expected"].split()]
    hyp = r["Generated"].split()
    score = sentence_bleu(ref, hyp, smoothing_function=smoothie)
    scores.append(score)

print(f"Average BLEU Score: {sum(scores)/len(scores):.4f}")


Average BLEU Score: 0.0112


Using DailyDialog+T5

In [71]:
from datasets import load_dataset

dataset = load_dataset("daily_dialog",trust_remote_code=True)

In [72]:
from transformers import T5Tokenizer
import re

tokenizer = T5Tokenizer.from_pretrained("t5-small")

def clean_text(text):
    text = text.lower().strip()
    text = re.sub(r"[^a-z0-9?.!,¿' ]+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text

def preprocess_function(example):
    inputs = []
    targets = []
    dialogue = example["dialog"]
    for i in range(len(dialogue) - 1):
        input_text = "chat: " + clean_text(dialogue[i])
        target_text = clean_text(dialogue[i + 1])
        inputs.append(input_text)
        targets.append(target_text)
    return {"input_texts": inputs, "target_texts": targets}

processed_dataset = dataset["train"].map(preprocess_function, remove_columns=dataset["train"].column_names)


In [73]:
def tokenize_function(example):
    model_inputs = tokenizer(example["input_texts"], max_length=40, truncation=True, padding="max_length")
    labels = tokenizer(example["target_texts"], max_length=40, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = processed_dataset.map(tokenize_function)

In [74]:
print(tokenized_dataset.column_names)

['input_texts', 'target_texts', 'input_ids', 'attention_mask', 'labels']


In [75]:
# check how many sentences are there per row
# make a new dataset with only 1 sentence per row
def flatten_dataset(dataset):
    flattened_data = []
    for i in range(len(dataset)):
        input_texts = dataset[i]["input_texts"]
        target_texts = dataset[i]["target_texts"]
        input_ids = dataset[i]["input_ids"]
        labels = dataset[i]["labels"]
        attention_mask = dataset[i]["attention_mask"]

        for j in range(len(input_texts)):
            flattened_data.append({
                "input_text": input_texts[j],
                "target_text": target_texts[j],
                "input_ids": input_ids[j],
                "attention_mask": attention_mask[j],
                "labels": labels[j]

            })
    return flattened_data

flattened_data = flatten_dataset(tokenized_dataset)


In [76]:
# convert to dataset
from datasets import Dataset
import pandas as pd

flattened_dataset = pd.DataFrame(flattened_data)
hf_dataset = Dataset.from_pandas(flattened_dataset)


In [77]:
train_test = hf_dataset.train_test_split(test_size=0.1)
train_dataset = train_test['train']
eval_dataset = train_test['test']
print("Train dataset size:", len(train_dataset))
print("Validation dataset size:", len(eval_dataset))

Train dataset size: 68446
Validation dataset size: 7606


In [78]:
print(tokenized_dataset[0]['input_texts'][8])

chat: good.let ' s go now .


In [79]:
# Show a sample from the dataset
print(tokenized_dataset[0]["input_texts"][0])  # Input text
print(tokenized_dataset[0]["target_texts"][0])  # Target text
print(tokenized_dataset[0]["input_ids"][0])  # Tokenized input IDs
print(tokenized_dataset[0]["labels"][0])  # Tokenized target IDs
print(tokenizer.decode(tokenized_dataset[0]["input_ids"][0]))  # Decoded input text
print(tokenizer.decode(tokenized_dataset[0]["labels"][0]))  # Decoded target text

print("Input length:", len(tokenized_dataset[0]["input_ids"][0]))  # Length of input IDs
print("Output length:", len(tokenized_dataset[0]["labels"][0]))  # Length of target IDs
print("Input length:", tokenized_dataset[0]["attention_mask"][0])  # Length of input text


chat: say , jim , how about going for a few beers after dinner ?
you know that is tempting but is really not good for our fitness .
[3582, 10, 497, 3, 6, 3, 354, 603, 3, 6, 149, 81, 352, 21, 3, 9, 360, 36, 277, 227, 2634, 3, 58, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[25, 214, 24, 19, 24873, 68, 19, 310, 59, 207, 21, 69, 4639, 3, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
chat: say, jim, how about going for a few beers after dinner?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
you know that is tempting but is really not good for our fitness.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
Input length: 40
Output length: 40
Input length: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [81]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

model = T5ForConditionalGeneration.from_pretrained("t5-small")

train_dataset = train_dataset.shuffle(seed=42).select(range(1000))  # Select first 1000 samples
eval_dataset = eval_dataset.shuffle(seed=42).select(range(100))  # Select first 100 samples

train_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"]
)

eval_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"]
)

training_args = TrainingArguments(
    output_dir="./t5-dailydialog",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # For demonstration; use validation set in practice
)

trainer.train()
model.save_pretrained("t5-dailydialog-model")
tokenizer.save_pretrained("t5-dailydialog-model")

Step,Training Loss


('t5-dailydialog-model\\tokenizer_config.json',
 't5-dailydialog-model\\special_tokens_map.json',
 't5-dailydialog-model\\spiece.model',
 't5-dailydialog-model\\added_tokens.json')

In [None]:
def chat(prompt):
    input_ids = tokenizer("chat: " + prompt, return_tensors="pt").input_ids
    outputs = model.generate(input_ids, max_length=50, do_sample=True, top_k=50, top_p=0.95)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print(chat("Hello! How are you?"))
print(chat("Tell me a joke."))
print(chat("What is your name?"))
print(chat("What is the weather like today?"))

# Evaluate the model on the val set
def evaluate(model, tokenizer, dataset):
    model.eval()
    results = []
    for example in dataset:
        input_ids = example["input_ids"].unsqueeze(0)
        attention_mask = example["attention_mask"].unsqueeze(0)

        with torch.no_grad():
            output_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=50)

        generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        expected_text = tokenizer.decode(example["labels"], skip_special_tokens=True)
        results.append({
            "Input": tokenizer.decode(input_ids[0], skip_special_tokens=True),
            "Expected": expected_text,
            "Generated": generated_text
        })
    return results
results = evaluate(model, tokenizer, eval_dataset[:10])
for r in results:
    print(f"Input: {r['Input']}")
    print(f"Expected: {r['Expected']}")
    print(f"Generated: {r['Generated']}\n")

i love it.
we're not able to make our own decision on the subject of a conversation between people who have a business, we will. We can get a lot of advice for other individuals.
en anglais?
can you picture your holiday on your beach beach?
Input: chat: have you tried an outlet?
Expected: why didn't i think of that?
Generated: i have a problem with a.

Input: chat: suit wrote me a letter.
Expected: what did she say?
Generated: 

Input: chat: what do you think of this one?
Expected: eh, so so.
Generated: i'm not sure if you're going to be able to.

Input: chat: oh, i thought i could make a right turn on red here.
Expected: no, sir. the sign says no turn on red.
Generated: i thought i could make a right turn on red.

Input: chat: sorry. i feel like sitting out the next dance.
Expected: ok. let's get something to drink.
Generated: i'm not sure. but i'm not sure.

Input: chat: i don't know what to suggest. there're so many attractions, and they all sound interesting. one exciting program m

Using BlenderBot by Facebook

In [4]:
from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration

model_name = "facebook/blenderbot-400M-distill"
tokenizer = BlenderbotTokenizer.from_pretrained(model_name)
model = BlenderbotForConditionalGeneration.from_pretrained(model_name)

def chat(text):
    inputs = tokenizer(text, return_tensors="pt")
    reply_ids = model.generate(**inputs)
    reply = tokenizer.decode(reply_ids[0], skip_special_tokens=True)
    return reply

print(chat("Hey! How are you today?"))
print(chat("Can you tell me a fun fact?"))


tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/127k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/62.9k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/310k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/730M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/730M [00:00<?, ?B/s]

 I'm doing well, thank you. How about yourself? Do you have any plans for the weekend?
 Sure, one of my favorite facts is that the word "fact" was first recorded in the 10th century in a Latin manuscript from the Southern Italy town of Gaeta in Lazio.


In [5]:
# Sample conversation
print(chat("What is your favorite movie?"))
print(chat("Tell me a joke."))
print(chat("What is the weather like today?"))
print(chat("Can you tell me a story?"))
print(chat("What is your favorite color?"))

 I don't really have a favorite movie, but I do like action movies. What about you?
 What do you call a deer with no teeth?  A duck.  A pig.
 It is a little chilly, but not too bad.  How about where you are?
 Sure, I was in a car accident and had to get stitches in my head.
 My favorite color is blue.  What is yours?  Do you have a favorite color as well?


In [6]:
save_path = "./blenderbot_400M"

tokenizer.save_pretrained(save_path)
model.save_pretrained(save_path)



Applying seq2seq on medical dataset


In [21]:
import pandas as pd

df = pd.read_csv("icliniq_data.csv")
df.head()

Unnamed: 0,ID,Legacy ID,Title,Author Name,Speciality,Abstract,Affiliation,Publish At,Question,Answer
0,01JF4TEQHFJ5MEKH1QNM9PRT92,897840,What are effective therapies for metastatic br...,Dr. Pawar Satyajit Jalinder,Medical oncology,Metastatic breast cancer occurs when cancer sp...,icliniq,2024-12-15T15:00:05,"Hello doctor,\nMy mother was diagnosed with st...","Hello,\nWelcome to icliniq.com.\nI can underst..."
1,01JF4Q0WT937DQDC2MGR40MTDN,3595800,How does HIV spread?,Dr. Basti Bharatesh Devendra,Dermatology,HIV spreads by certain body fluids from an inf...,icliniq,2024-12-15T14:00:04,"Hello doctor,Last night I went for dinner and ...","Hello,\nWelcome to icliniq.com.\nI read your q..."
2,01JF4KK3NKG5ZRRZ3G0QFXFAB7,3099775,Can recurrent hoarseness without GERD indicate...,Dr. Akshay. B. K.,Otolaryngology (E.N.T),Recurrent hoarseness may result from vocal str...,icliniq,2024-12-15T13:00:04,"Hi doctor,I am a 59-year-old male and a nonsmo...","Hi,\nWelcome to icliniq.com.\nI have read your..."
3,01JF4G52SP8WMFBKZ5V075R066,65337,Is long-term Pantocid-IT use safe?,Dr. Kunal Das,Medical Gastroenterology,Long-term use of Pantocid-IT may cause nutrien...,icliniq,2024-12-15T12:00:04,"Hi doctor,\nI am a 35-year-old male. My height...","Hi,\nWelcome to icliniq.com.\nI have read your..."
4,01JF4CQ76T1P7713XQ099A432S,3876269,Can type 2 diabetes resolve after delivery?,Dr. Nitesh Goyal,Pulmonology (Asthma Doctors),"After giving birth, a person with type 1 diabe...",icliniq,2024-12-15T11:00:04,"Hi doctor,My sister delivered a baby one month...","Hi,\nWelcome to icliniq.com.\nI have gone thro..."


In [22]:
pairs = []
for i in range(len(df)):
    input_text = df.iloc[i]["Question"]
    output_text = df.iloc[i]["Answer"]
    pairs.append((input_text, output_text))
print(pairs[:5])  # Show first 5 pairs
print(len(pairs))  # Show total number of pairs
pairs = pairs[:10000]  # Limit to first 1000 pairs for testing

[('Hello doctor,\nMy mother was diagnosed with stage 3 breast cancer, which was completely removed. However, two years later, she experienced metastasis that spread to the brain. Now, the cancer has spread to the liver, and she has developed jaundice. Her bilirubin levels have increased from 7.5 to 10. Is there anything that can be done with medication? The common bile duct (CBD) is not dilated; she also has ascites, for which a combination of Spironolactone and Torasemide will be administered. Can anything be done to manage the jaundice?\nKindly suggest.', 'Hello,\nWelcome to icliniq.com.\nI can understand your concern.\nI have reviewed your reports and your query. To better assist, I would like to know a few things:\n\n\nWhat treatments has she received so far?\n\n\nWhat is the hormonal status of the tumor?\n\n\nHow is her overall condition?\n\n\nFrom what I can see, she has already received Palbociclib and Capecitabine, which suggests the tumor might be hormone receptor-positive and

In [23]:
from datasets import Dataset
from transformers import T5Tokenizer
import pandas as pd
import re
import os

# Example dialogue pairs
# Replace this with your real (input, response) pairs
# data = [
#     {"input": "chat: Hello!", "output": "Hi there!"},
#     {"input": "chat: How are you?", "output": "I'm doing well, thank you."},
#     {"input": "chat: What's your name?", "output": "I'm a chatbot."}
#     # Add more pairs from Cornell/PersonaChat here
# ]

def clean_text(text):
    text = text.lower().strip()
    text = text.replace("\n", " ")
    text = re.sub(r"[^a-zA-Z0-9?.!,¿]+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

data= pairs 
# Clean the data
data = [(clean_text(src), clean_text(tgt)) for src, tgt in data]

df = pd.DataFrame(data)
df.columns = ['input', 'output']
df['input'] = "chat: " + df['input']  # Prefix for T5
df['output'] = df['output']  # No prefix for output
dataset = Dataset.from_pandas(df)

# Load tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")


# Tokenize
def preprocess(example):
    input_enc = tokenizer(example['input'], truncation=True, padding='max_length', max_length=32)
    target_enc = tokenizer(example['output'], truncation=True, padding='max_length', max_length=32)
    input_enc['labels'] = target_enc['input_ids']
    return input_enc

tokenized_dataset = dataset.map(preprocess)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [24]:
# Save the tokenized dataset
output_dir = "data/processed/icliniq_t5_dataset"
os.makedirs(output_dir, exist_ok=True)
tokenized_dataset.save_to_disk(output_dir)
print(f"Tokenized dataset saved to {output_dir}")

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Tokenized dataset saved to data/processed/icliniq_t5_dataset


In [27]:
# shoew the dataset along with cols
print(tokenized_dataset)

Dataset({
    features: ['input', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 10000
})


In [40]:
train_test = tokenized_dataset.train_test_split(test_size=0.05, seed=42)
train_val = train_test['train'].train_test_split(test_size=0.05, seed=42)
train_dataset = train_val['train']
val_dataset = train_val['test']
test_dataset = train_test['test']
print("Train dataset size:", len(train_dataset))
print("Validation dataset size:", len(val_dataset))
print("Test dataset size:", len(test_dataset))


Train dataset size: 9025
Validation dataset size: 475
Test dataset size: 500
