In [None]:
import ast
import re

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
# Download and extract the dataset
!wget http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip
!unzip cornell_movie_dialogs_corpus.zip


In [None]:
# Load the dataset.
# Each file is read inside a `with` block so its handle is closed as soon as the
# content is in memory. Undecodable bytes are dropped (errors="ignore") and the
# text is split into one record per line.
with open("cornell movie-dialogs corpus/movie_lines.txt", encoding="utf-8", errors="ignore") as lines_file:
    lines = lines_file.read().split("\n")
with open("cornell movie-dialogs corpus/movie_conversations.txt", encoding="utf-8", errors="ignore") as conversations_file:
    conversations = conversations_file.read().split("\n")

# Extract pairs of questions and answers

# Map every line id (first field) to its utterance text (fifth field).
# Records that do not split into exactly five fields are skipped.
id_to_line = {}
for line in lines:
    parts = line.split(" +++$+++ ")
    if len(parts) == 5:
        id_to_line[parts[0]] = parts[4]

# Every two consecutive utterances of a conversation form one (question, answer) pair.
pairs = []
for convo in conversations:
    parts = convo.split(" +++$+++ ")
    if len(parts) != 4:
        continue
    # The fourth field is a Python-style list literal of line ids.
    # ast.literal_eval parses it without executing code, unlike eval(), which
    # would run whatever expression the downloaded file happens to contain.
    ids = ast.literal_eval(parts[3])
    for question_id, answer_id in zip(ids, ids[1:]):
        # A referenced line may have been skipped above as malformed; drop the
        # pair instead of aborting the whole load with a KeyError.
        if question_id in id_to_line and answer_id in id_to_line:
            pairs.append((id_to_line[question_id], id_to_line[answer_id]))

# Show the first five question/answer pairs as a quick sanity check
print("\nSample Question-Answer Pairs:")
for question, answer in pairs[:5]:
    print(f"Q: {question}")
    print(f"A: {answer}\n")


In [None]:
# Clean text
def clean_text(text):
    """Return ``text`` lower-cased with all punctuation and symbols deleted.

    Only the characters a-z / A-Z, 0-9 and whitespace survive; everything else
    is removed outright (not replaced by a space), so "don't" becomes "dont".
    """
    return re.sub(r"[^a-zA-Z0-9\s]", "", text.lower())

# Apply cleaning.
# The <start>/<end> markers are attached after clean_text runs, because
# clean_text would otherwise delete their angle brackets.
questions = [clean_text(pair[0]) for pair in pairs]
answers = ["<start> " + clean_text(pair[1]) + " <end>" for pair in pairs]

# Tokenize text.
# Tokenizer's default `filters` contain "<" and ">", which would store the
# markers as plain "start"/"end". A later lookup of "<start>" in
# tokenizer.word_index would then fail, and the markers would be
# indistinguishable from the ordinary words. The text is already stripped of
# punctuation by clean_text, so only tabs and newlines still need to be treated
# as separators.
tokenizer = Tokenizer(filters="\t\n")
tokenizer.fit_on_texts(questions + answers)

# Turn every cleaned sentence into a list of integer word ids
questions_seq = tokenizer.texts_to_sequences(questions)
answers_seq = tokenizer.texts_to_sequences(answers)

# Pad both sides (at the end) to the length of the longest sequence overall
longest_question = max(map(len, questions_seq))
longest_answer = max(map(len, answers_seq))
max_len = max(longest_question, longest_answer)
questions_seq, answers_seq = (
    pad_sequences(seqs, maxlen=max_len, padding="post")
    for seqs in (questions_seq, answers_seq)
)

# Show the first padded question and answer
for label, sample in (
    ("\nTokenized Question Sample:", questions_seq[0]),
    ("Tokenized Answer Sample:", answers_seq[0]),
):
    print(label, sample)


In [None]:
# Vocabulary size shared by both embeddings and the output layer.
# Tokenizer word ids start at 1, so one extra slot is needed for id 0, the
# value pad_sequences writes as padding.
vocab_size = len(tokenizer.word_index) + 1

# Define encoder: embeds the question and returns the per-step outputs (used by
# attention) plus the final hidden/cell state (used to initialise the decoder).
encoder_inputs = Input(shape=(max_len,))
encoder_embedding = Embedding(input_dim=vocab_size, output_dim=256)(encoder_inputs)
encoder_lstm = LSTM(256, return_sequences=True, return_state=True)
encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embedding)

# Define decoder: embeds the answer tokens and starts from the encoder's final states.
decoder_inputs = Input(shape=(max_len,))
decoder_embedding = Embedding(input_dim=vocab_size, output_dim=256)(decoder_inputs)
decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[encoder_state_h, encoder_state_c])

# Define attention mechanism: decoder outputs are the query, encoder outputs the value.
attention = tf.keras.layers.Attention()([decoder_outputs, encoder_outputs])
# Join decoder output and attention context with a Keras layer rather than
# tf.concat: raw TensorFlow ops cannot take symbolic Keras tensors under Keras 3,
# whereas a Concatenate layer works in both Keras 2 and Keras 3.
decoder_concat_input = tf.keras.layers.Concatenate(axis=-1)([decoder_outputs, attention])
decoder_dense = Dense(vocab_size, activation="softmax")
decoder_outputs = decoder_dense(decoder_concat_input)

# Build the model.
# NOTE(review): the output is a softmax over the whole vocabulary at each of
# max_len steps; with the full corpus this may be very memory-hungry. Consider
# capping max_len / vocabulary size if training runs out of memory.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
model.summary()


In [None]:
# Build the decoder targets: each answer shifted one step to the left, so the
# target at position t is the decoder input token at t + 1. The last column
# stays 0.
answers_shifted = answers_seq[:, 1:]
answers_target = np.zeros_like(answers_seq)
answers_target[:, : answers_shifted.shape[1]] = answers_shifted

# Fit with teacher forcing: the decoder is fed the full answer sequence and
# learns to predict the shifted one. The last 20% of samples are held out for validation.
model.fit(
    x=[questions_seq, answers_seq],
    y=answers_target,
    batch_size=64,
    epochs=10,
    validation_split=0.2,
)

# Persist the trained model to disk
model.save("chatbot_model.h5")


In [None]:
# Restore the trained model from the file written by the training cell
model = tf.keras.models.load_model("chatbot_model.h5")

# Generate responses
def generate_response(input_text, model, tokenizer, max_len):
    """Greedily decode a reply to ``input_text``.

    Args:
        input_text: Raw user text; it is passed through ``clean_text`` first.
        model: Object whose ``predict([encoder_input, decoder_input])`` returns
            an array indexable as ``pred[0, step, :]`` holding per-word scores.
        tokenizer: Object exposing ``texts_to_sequences``, ``word_index``
            (word -> id) and ``index_word`` (id -> word).
        max_len: Padded sequence length; at most ``max_len - 1`` words are produced.

    Returns:
        The decoded words joined by single spaces ("" if nothing was decoded).

    Raises:
        KeyError: If the vocabulary contains no start marker.
    """
    word_index = tokenizer.word_index
    # Keras' Tokenizer removes "<" and ">" under its default filters, in which
    # case the markers are stored as "start"/"end". Accept either spelling.
    start_word = "<start>" if "<start>" in word_index else "start"
    end_word = "<end>" if "<end>" in word_index else "end"
    if start_word not in word_index:
        raise KeyError("<start>")

    input_seq = tokenizer.texts_to_sequences([clean_text(input_text)])
    input_seq = pad_sequences(input_seq, maxlen=max_len, padding="post")
    decoder_input = np.zeros((1, max_len))
    decoder_input[0, 0] = word_index[start_word]

    words = []
    for i in range(max_len - 1):
        # verbose=0: otherwise predict() prints a progress bar for every decoded token.
        pred = model.predict([input_seq, decoder_input], verbose=0)
        token = int(np.argmax(pred[0, i, :]))
        word = tokenizer.index_word.get(token)
        # An id with no vocabulary word (e.g. padding id 0) or the end marker
        # finishes the reply; continuing would only append empty words.
        if word is None or word == end_word:
            break
        words.append(word)
        # Feed the chosen token back in as the next decoder input step.
        decoder_input[0, i + 1] = token

    return " ".join(words)

# Smoke-test the chatbot with a single question
test_question = "How are you?"
response = generate_response(
    input_text=test_question,
    model=model,
    tokenizer=tokenizer,
    max_len=max_len,
)
print("\nChatbot Response:", response)
