In [2]:
import os
from pathlib import Path
from glob import glob

Step 1 - Load Cornell files and build QA pairs

In [3]:
lines_file = Path(r"E:\70 Days 70 Project\Chatbot with Seq2Seq Model\cornell movie-dialogs corpus\movie_lines.txt")
convs_file = Path(r"E:\70 Days 70 Project\Chatbot with Seq2Seq Model\cornell movie-dialogs corpus\movie_conversations.txt")

In [4]:
MIN_LEN=1
MAX_LEN=20

"""
movie_lines.txt format (roughly):
lineID +++$+++ characterID +++$+++ movieID +++$+++ characterName +++$+++ text
 0                  1                 2                 3                 4
"""

def load_lines(lines_file):
    id2line = {}
    with open(lines_file, 'r', encoding='iso-8859-1') as f:
        for line in f:
            parts = line.strip().split(" +++$+++ ")
            if len(parts) == 5:
                line_id = parts[0] # ID of the line
                text = parts[4] # Text of the line
                id2line[line_id] = text
    #PRINT SAMPLE LINES
    sample_keys = list(id2line.keys())[:3]
    print("Sample lines:")
    for k in sample_keys:
        print(" ", k, "->", id2line[k])
    return id2line

In [20]:
"""
movie_conversations.txt format:
character1ID +++$+++ character2ID +++$+++ movieID +++$+++ utteranceIDs
utteranceIDs is like:
    ['L194', 'L195', 'L196', ...]
"""
import re

def load_conversations(convs_file):
    conversations=[]
    with open(convs_file, 'r', encoding='iso-8859-1') as f:
        for line in f:
            parts = line.strip().split(" +++$+++ ")
            if len(parts) == 4:
                line_ids_str = parts[3] # This is a string like "['L194', 'L195', ...]"
                line_ids = re.findall(r"L\d+", line_ids_str)  # extract all line IDs like L1234
                if len(line_ids) >= 2:
                    conversations.append(line_ids)
    print("Sample conversation lineIDs:", conversations[0:3])
    return conversations

In [21]:
id2line=load_lines(lines_file)
all_conversations_list=load_conversations(convs_file)

Sample lines:
  L1045 -> They do not!
  L1044 -> They do to!
  L985 -> I hope so.
Sample conversation lineIDs: [['L194', 'L195', 'L196', 'L197'], ['L198', 'L199'], ['L200', 'L201', 'L202', 'L203']]


In [24]:
def build_pairs(all_conversations_list,id2line):
    pairs=[]
    for conv in all_conversations_list:
        for i in range(len(conv)-1):
            dialog_1_id=conv[i]
            dialog_2_id=conv[i+1]
            
            dialog_1_text=id2line.get(dialog_1_id,"").strip()
            dialog_2_text=id2line.get(dialog_2_id,"").strip()
            
            if dialog_1_text and dialog_2_text:
                pairs.append((dialog_1_text,dialog_2_text))
                
    print(" Speaker A:", pairs[0][0])
    print(" Speaker B:", pairs[0][1])
    return pairs
            

In [25]:
all_pairs=build_pairs(all_conversations_list,id2line)

 Speaker A: Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.
 Speaker B: Well, I thought we'd start with pronunciation, if that's okay with you.


In [26]:
def normalize_text(s):
    s = s.lower().strip()
    # separate punctuation slightly: "hi!" -> "hi !"
    s = re.sub(r"([.!?])", r" \1", s)
    # remove weird characters (keep a-z, punctuation)
    s = re.sub(r"[^a-z.!?]+", " ", s)
    # remove extra spaces
    s = re.sub(r"\s+", " ", s).strip()
    return s

In [27]:
def filter_pairs(pairs, min_len=MIN_LEN, max_len=MAX_LEN):
    filtered = []
    for q, a in pairs:
        qn = normalize_text(q)
        an = normalize_text(a)

        q_len = len(qn.split())
        a_len = len(an.split())

        if min_len <= q_len <= max_len and min_len <= a_len <= max_len:
            filtered.append((qn, an))
    print("Sample cleaned pair:")
    print(" Q:", filtered[0][0])
    print(" A:", filtered[0][1])
    return filtered

In [28]:
filter_pairs=filter_pairs(all_pairs, MIN_LEN, MAX_LEN)

Sample cleaned pair:
 Q: well i thought we d start with pronunciation if that s okay with you .
 A: not the hacking and gagging and spitting part . please .


Build Vocab

In [29]:
class Vocabulary:
    def __init__(self):
        # Special tokens with reserved indices
        self.PAD_token = 0
        self.SOS_token = 1
        self.EOS_token = 2
        self.UNK_token = 3

        # Initialize mappings with special tokens 
        self.word2index = {
            "<PAD>": self.PAD_token,
            "<SOS>": self.SOS_token,
            "<EOS>": self.EOS_token,
            "<UNK>": self.UNK_token
        }

        self.index2word = {
            self.PAD_token: "<PAD>",
            self.SOS_token: "<SOS>",
            self.EOS_token: "<EOS>",
            self.UNK_token: "<UNK>"
        }

        self.word2count = {}

        self.next_num = 4  # position for next new word (as special tokens are 0-3)

    def add_sentence(self, sentence):

        for word in sentence.split():
            self.add_word(word)

    # this function adds a word to the vocabulary based on whether it's new or existing
    def add_word(self, word):
        if word not in self.word2index:
            # new word
            self.word2index[word] = self.next_num
            self.index2word[self.next_num] = word
            self.word2count[word] = 1 # word counter
            self.next_num += 1 # increment for next new word
        else:
            # existing word
            self.word2count[word] += 1


In [31]:
# Build vocabulary
vocab = Vocabulary()

for q, a in filter_pairs:
    vocab.add_sentence(q)
    vocab.add_sentence(a)
    
print("Total words in vocab:", vocab.next_num) # as next_num indicates total words

# show some examples
for word in list(vocab.word2index.keys())[:10]:
    print(word, "->", vocab.word2index[word])

Total words in vocab: 33922
<PAD> -> 0
<SOS> -> 1
<EOS> -> 2
<UNK> -> 3
well -> 4
i -> 5
thought -> 6
we -> 7
d -> 8
start -> 9


In [33]:
def sentence_to_indices(vocab, sentence):
    indices = [vocab.SOS_token]  # start with SOS
    for word in sentence.split():
        if word in vocab.word2index: 
            indices.append(vocab.word2index[word])
        else:
            indices.append(vocab.UNK_token)  # unknown word
    indices.append(vocab.EOS_token)  # end with EOS
    return indices

In [41]:
vocab.word2count["can"], vocab.word2index["can"]

(11101, 66)

In [39]:
test_sentence = filter_pairs[0][0]
print("Sentence:", test_sentence)

idxs = sentence_to_indices(vocab, test_sentence)
print("Indexes:", idxs)

print("Back to words:")
print([vocab.index2word[i] for i in idxs])


Sentence: well i thought we d start with pronunciation if that s okay with you .
Indexes: [1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 10, 16, 17, 2]
Back to words:
['<SOS>', 'well', 'i', 'thought', 'we', 'd', 'start', 'with', 'pronunciation', 'if', 'that', 's', 'okay', 'with', 'you', '.', '<EOS>']


In [None]:
import torch
def pad_sequence(seq, max_len):
    # Convert tensor to list if needed
    if isinstance(seq, torch.Tensor):
        seq = seq.tolist()
    return seq + [vocab.PAD_token] * (max_len - len(seq))

In [62]:
seq1 = sentence_to_indices(vocab, "hi .")
seq2 = sentence_to_indices(vocab, "how are you ?")

max_len = max(len(seq1), len(seq2))

print("Before padding:")
print(seq1)
print(seq2)

print("\nAfter padding:")
print(pad_sequence(seq1, max_len))
print(pad_sequence(seq2, max_len))


Before padding:
[1, 133, 17, 2]
[1, 27, 286, 16, 35, 2]

After padding:
[1, 133, 17, 2, 0, 0]
[1, 27, 286, 16, 35, 2]


In [88]:
import torch
from torch.utils.data import Dataset, DataLoader


class ChatDataset(Dataset):
    def __init__(self, pairs, vocab):
        self.pairs = pairs
        self.vocab = vocab
        
    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        q, a = self.pairs[idx]
        q_idxs = sentence_to_indices(self.vocab, q)
        a_idxs = sentence_to_indices(self.vocab, a)
        return torch.tensor(q_idxs, dtype=torch.long), torch.tensor(a_idxs, dtype=torch.long)
    

In [89]:
def collate_fn(batch):
    # sort by input length (important for LSTM)
    
    # This ensures pack_padded_sequence receives sequences in the correct order
    batch.sort(key=lambda x: len(x[0]), reverse=True) # longest to shortest descending required by pack_padded_sequence

    input_seqs, target_seqs = zip(*batch)

    # codes to find the list of length of all seq and the max from it
    input_lengths = [len(seq) for seq in input_seqs]
    target_lengths = [len(seq) for seq in target_seqs]

    max_input_len = max(input_lengths)
    max_target_len = max(target_lengths)

    # new input for appending padded sequence
    padded_inputs = []
    padded_targets = []

    for inp, tgt in zip(input_seqs, target_seqs):
        padded_inputs.append(pad_sequence(inp, max_input_len))
        padded_targets.append(pad_sequence(tgt, max_target_len))

    # convert to tensors
    input_tensor = torch.tensor(padded_inputs, dtype=torch.long)
    target_tensor = torch.tensor(padded_targets, dtype=torch.long)

    return input_tensor, target_tensor, input_lengths, target_lengths


In [90]:
BATCH_SIZE = 32

dataset = ChatDataset(filter_pairs, vocab)

dataloader = DataLoader(dataset,batch_size=BATCH_SIZE,shuffle=True,collate_fn=collate_fn)


In [91]:
next(iter(dataloader))

(tensor([[    1, 15701,    14,  1252,    17,    17,  2697,   155,   217,   132,
            178,    47,   240,  4556,    17,    17,    17,     2],
         [    1,   459,  7223,    21,  6504, 18984,   603,   500,   158,   190,
            459,  2198,  3227,    17,    17,    17,     2,     0],
         [    1,   400,    39,   149,    13,   372,  1509,    17,     5,   235,
            620,    47,   175,    17,     2,     0,     0,     0],
         [    1,     5,   255,   101,   256,   101,   132,    17,   206,    16,
           3676,    17,    17,    17,     2,     0,     0,     0],
         [    1,   296,   721,  3417, 15289,   296,   721,     7,    60,   112,
            591,   185,  7426,    17,     2,     0,     0,     0],
         [    1,   260,   424,     5,   125,    40,  3500,    17,     5,   125,
           1438,   312,    17,     2,     0,     0,     0,     0],
         [    1,    64,    14,   580,    83,  2214,   127,    19,  2976,  6816,
             17,     2,     0,     0, 

Encoder

In [108]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class Encoder(nn.Module):
    def __init__(self, vocab_size,embedding_dim,hidden_dim,num_layers=1,dropout=.3):
        super().__init__()
        
        self.hidden_dim=hidden_dim
        self.num_layers=num_layers
        
        self.embedding=nn.Embedding(num_embeddings=vocab_size,embedding_dim=embedding_dim,padding_idx=0)
        self.lstm=nn.LSTM(input_size=embedding_dim,hidden_size=hidden_dim,num_layers=num_layers,batch_first=True,dropout= dropout)
        
    def forward(self,input,input_length):
        embedded=self.embedding(input)
        # print("Embedded shape:", embedded.shape)
        
        packed=pack_padded_sequence(embedded,input_length,batch_first=True,enforce_sorted=True) # Packs variable-length sequences so the LSTM ignores padded time steps and processes only real tokens in each sequence
        
        packed_outputs,(hidden,cell)=self.lstm(packed)
        
        outputs,_=pad_packed_sequence(packed_outputs,batch_first=True) # Unpacks the LSTM outputs back to padded sequences for batch alignment

            # print("Encoder outputs shape:", outputs.shape)
            # print("Encoder hidden shape:", hidden.shape)
            # print("Encoder cell shape:", cell.shape)
        
        return outputs,(hidden,cell)
        

In [109]:
VOCAB_SIZE = vocab.next_num
EMBEDDING_DIM = 256
HIDDEN_DIM = 512
NUM_LAYERS = 1

encoder = Encoder(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS
)




In [110]:
encoder.eval()  # evaluation mode

batch = next(iter(dataloader))
input_tensor, target_tensor, input_lengths, target_lengths = batch

print("Input tensor shape:", input_tensor.shape)

with torch.no_grad():
    encoder_outputs, (hidden, cell) = encoder(input_tensor, input_lengths)


Input tensor shape: torch.Size([32, 22])


In [111]:
class Decoder(nn.Module):
    def __init__(self,vocab_size,embedding_dim,hidden_dim, num_layers=1,dropout=.3):
        super().__init__()
        
        self.hidden_dim=hidden_dim
        self.num_layers=num_layers
        
        self.embedding=nn.Embedding(num_embeddings=vocab_size,embedding_dim=embedding_dim,padding_idx=0)
        self.lstm=nn.LSTM(input_size=embedding_dim,hidden_size=hidden_dim,num_layers=num_layers,batch_first=True,dropout= dropout)
        # fc layer to map LSTM outputs to vocab size
        self.fc_out=nn.Linear(hidden_dim,vocab_size)
        self.dropout=nn.Dropout(dropout)
        
    def forward(self,input,hidden,cell):
        # input shape: (batch_size) -> we process one time step at a time
        input=input.unsqueeze(1)  # (batch_size, 1) # add time step dimension
        
        embedded=self.dropout(self.embedding(input))  # (batch_size, 1, embedding_dim)
        
        outputs,(hidden,cell)=self.lstm(embedded,(hidden,cell))  # outputs: (batch_size, 1, hidden_dim)
        
        predictions=self.fc_out(outputs.squeeze(1))  # (batch_size, vocab_size) remove the time step dimension
        
        return predictions,hidden,cell

In [112]:
decoder = Decoder(
    vocab_size=vocab.next_num,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS
)


In [113]:
import random

def decode_training(decoder,encoder_hidden,encoder_cell,target_tensor,teacher_forcing_ratio=0.5):
    
    batch_size = target_tensor.size(0)
    max_len = target_tensor.size(1)

    decoder_input = torch.full((batch_size,),vocab.SOS_token,dtype=torch.long,device=target_tensor.device)

    hidden = encoder_hidden
    cell = encoder_cell

    all_outputs = []

    for t in range(max_len):
        
        logits, hidden, cell = decoder(decoder_input, hidden, cell)
        
        all_outputs.append(logits.unsqueeze(1))

        use_teacher_forcing = random.random() < teacher_forcing_ratio

        if use_teacher_forcing:
            decoder_input = target_tensor[:, t]
        else:
            decoder_input = logits.argmax(dim=1)

    outputs = torch.cat(all_outputs, dim=1)
    # shape: (batch_size, max_len, vocab_size)

    return outputs


In [114]:
import torch.optim as optim

encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=vocab.PAD_token)


In [115]:
def train_step(encoder,decoder,input_tensor,target_tensor,input_lengths, encoder_optimizer,decoder_optimizer,criterion,teacher_forcing_ratio=0.5):
    
    encoder.train()
    decoder.train()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # ENCODER
    encoder_outputs, (hidden, cell) = encoder(input_tensor, input_lengths)

    # DECODER
    decoder_outputs = decode_training(decoder,hidden,cell,target_tensor,teacher_forcing_ratio)
    # decoder_outputs: (batch, seq_len, vocab_size)

    # LOSS
    loss = criterion(decoder_outputs.view(-1, decoder_outputs.size(-1)),target_tensor.view(-1))

    # ----- Backprop -----
    loss.backward()

    # Gradient clipping (important for RNNs)
    torch.nn.utils.clip_grad_norm_(encoder.parameters(), 1.0)
    torch.nn.utils.clip_grad_norm_(decoder.parameters(), 1.0)

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item()

In [118]:
# Setup GPU device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")

# Move models to device
encoder = encoder.to(device)
decoder = decoder.to(device)

Using device: cpu
GPU available: False


In [None]:
from tqdm import tqdm

EPOCHS = 10
PRINT_EVERY = 100

for epoch in range(1, EPOCHS + 1):
    total_loss = 0

    for i, batch in tqdm(enumerate(dataloader, start=1), total=len(dataloader)):
        input_tensor, target_tensor, input_lengths, target_lengths = batch
        
        # Move tensors to GPU
        input_tensor = input_tensor.to(device)
        target_tensor = target_tensor.to(device)

        loss = train_step(
            encoder,
            decoder,
            input_tensor,
            target_tensor,
            input_lengths,
            encoder_optimizer,
            decoder_optimizer,
            criterion,
            teacher_forcing_ratio=0.5
        )

        total_loss += loss

        if i % PRINT_EVERY == 0:
            print(
                f"Epoch [{epoch}/{EPOCHS}] "
                f"Step [{i}/{len(dataloader)}] "
                f"Loss: {loss:.4f}"
            )

    avg_loss = total_loss / len(dataloader)
    print(f"\n Epoch {epoch} completed | Avg Loss: {avg_loss:.4f}\n")

In [None]:
def chat(
    encoder,
    decoder,
    sentence,
    vocab,
    device,
    max_len=20
):
    encoder.eval()
    decoder.eval()

    with torch.no_grad():
        # Prepare input
        input_seq = sentence_to_indices(vocab, normalize_text(sentence))
        input_tensor = torch.tensor(input_seq).unsqueeze(0).to(device)
        input_lengths = [len(input_seq)]

        # Encoder
        encoder_outputs, (hidden, cell) = encoder(input_tensor, input_lengths)

        # Decoder
        decoder_input = torch.tensor([vocab.SOS_token]).to(device)
        decoded_words = []

        for _ in range(max_len):
            logits, hidden, cell = decoder(decoder_input, hidden, cell)
            next_token = logits.argmax(dim=1).item()

            if next_token == vocab.EOS_token:
                break

            decoded_words.append(vocab.index2word[next_token])
            decoder_input = torch.tensor([next_token]).to(device)

    return " ".join(decoded_words)

while True:
    user_input = input("You: ")
    if user_input.lower() in ["quit", "exit"]:
        break

    reply = chat(encoder, decoder, user_input, vocab, device)
    print("Bot:", reply)


You:  im hungry

Bot: <SOS> you got a .

You:  i wanna go out tonight

Bot: <SOS> yeah .

You:  quit
