In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import Model
import random

# Data Collection - a static snapshot of ITU facts is used here because web
# scraping might not be reliable. Every chatbot answer is rendered from this
# mapping.
itu_data = dict(
    # Degree level -> names of the programs offered at that level.
    programs=dict(
        BS=[
            "Computer Science",
            "Electrical Engineering",
            "Business & Technology",
            "Social Sciences",
            "Artificial Intelligence",
        ],
        MS=[
            "Computer Science",
            "Electrical Engineering",
            "Data Science",
            "Innovation & Entrepreneurship",
        ],
        PhD=[
            "Computer Science",
            "Electrical Engineering",
            "Information Technology",
        ],
    ),
    # Free-text admission facts; multi-line values use "\n" as the separator.
    admissions=dict(
        deadlines="Fall 2023: July 15, 2023 | Spring 2024: December 15, 2023",
        criteria=(
            "BS: 60% in intermediate\n"
            "MS: 2.5 CGPA in BS\n"
            "PhD: 3.0 CGPA in MS\n"
            "Admission test and interview required"
        ),
        test_dates="Fall: July 30, 2023 | Spring: January 5, 2024",
    ),
    fees=(
        "BS: ~PKR 80,000/semester\n"
        "MS: ~PKR 100,000/semester\n"
        "PhD: ~PKR 120,000/semester"
    ),
    scholarships=[
        "Merit scholarships (up to 100% tuition)",
        "Need-based scholarships",
        "HEC scholarships for eligible students",
    ],
)

# Create question-answer pairs.
# Each entry is [question, answer]. The answers are rendered from itu_data.
# The position of each entry matters: the variation answers further down pick
# seed answers out of this list by index.
_programs = itu_data["programs"]
_bs_names = ", ".join(_programs["BS"])
_ms_names = ", ".join(_programs["MS"])
_phd_names = ", ".join(_programs["PhD"])
_deadlines = itu_data["admissions"]["deadlines"]
_criteria = itu_data["admissions"]["criteria"]
_fees = itu_data["fees"]
_scholarship_bullets = "\n- ".join(itu_data["scholarships"])

pairs = [
    # Programs
    [
        "How many BS programs does ITU offer?",
        f"ITU currently offers {len(_programs['BS'])} BS programs: {_bs_names}.",
    ],
    [
        "How many MS programs does ITU offer?",
        f"ITU offers {len(_programs['MS'])} MS programs: {_ms_names}.",
    ],
    [
        "How many PhD programs does ITU offer?",
        f"ITU has {len(_programs['PhD'])} PhD programs: {_phd_names}.",
    ],
    [
        "What programs does ITU offer?",
        "ITU offers programs at BS, MS and PhD levels. "
        f"BS: {_bs_names}. MS: {_ms_names}. PhD: {_phd_names}.",
    ],

    # Admissions
    [
        "What is the admission application deadline?",
        f"Admission deadlines: {_deadlines}",
    ],
    [
        "What are the admission criteria?",
        f"Admission criteria:\n{_criteria}",
    ],
    [
        "Tell me about ITU admissions",
        f"ITU admissions information:\nDeadlines: {_deadlines}\nCriteria:\n{_criteria}",
    ],

    # Fees
    [
        "What is the fee structure?",
        f"Fee structure:\n{_fees}",
    ],
    [
        "How much does it cost to study at ITU?",
        f"ITU fees:\n{_fees}",
    ],

    # Scholarships (rendered as a "- " bulleted list, one item per line)
    [
        "What scholarships are available?",
        "Scholarships available at ITU:\n- " + _scholarship_bullets,
    ],
    [
        "Is there any financial aid?",
        "Yes, ITU offers several financial aid options:\n- " + _scholarship_bullets,
    ],

    # General
    ["hello", "Hello! Welcome to ITU Admissions Chatbot. How can I help you?"],
    ["hi", "Hi there! I'm the ITU Admissions Bot. What would you like to know?"],
    ["bye", "Goodbye! If you have more questions about ITU admissions, feel free to ask later."],
]

# Add more variations: short paraphrases, three per topic, that reuse the
# answers of the seed pairs defined above.
questions = [
    # BS programs
    "number of bs programs",
    "bs programs count",
    "undergraduate programs",
    # MS programs
    "ms programs number",
    "graduate programs count",
    "masters programs",
    # PhD programs
    "phd programs count",
    "doctoral programs",
    "phd offerings",
    # Deadlines
    "admission deadline",
    "last date to apply",
    "application due date",
    # Criteria
    "admission requirements",
    "eligibility criteria",
    "what do i need to apply",
    # Fees
    "tuition fees",
    "fee details",
    "cost of study",
    # Scholarships
    "financial aid",
    "scholarship options",
    "funding opportunities",
    # All programs
    "what can i study at itu",
    "programs offered",
    "courses available",
]

# One seed-pair index per topic, in the same topic order as `questions`;
# each seed answer is repeated for the three paraphrases of its topic.
answers = [
    pairs[seed_index][1]
    for seed_index in (0, 1, 2, 4, 5, 7, 9, 3)
    for _ in range(3)
]

# Combine the pairs
pairs.extend([question, answer] for question, answer in zip(questions, answers))

# Prepare data for the model.
# Special tokens wrapped around every decoder target.
start_token = "<START>"
end_token = "<END>"

# Lower-cased questions feed the encoder.
input_texts = [question.lower() for question, _ in pairs]

# Lower-cased answers feed the decoder, wrapped in the start/end tokens.
_answers_lower = [answer.lower() for _, answer in pairs]
target_texts = [f"{start_token} {text} {end_token}" for text in _answers_lower]

# Collect vocabulary: whitespace-separated words on each side. The target
# vocabulary is taken from the bare answers, so it excludes the two special
# tokens.
input_words = {word for text in input_texts for word in text.split()}
target_words = {word for text in _answers_lower for word in text.split()}

# Tokenization
# Fit the tokenizers first and derive every size from what they actually
# produce. With filters='' the Tokenizer splits on ' ' only, so its vocabulary
# can differ from a str.split() word count, and num_words=N keeps only word
# indices below N. Sizing from the word sets could therefore silently drop the
# rarest words; leaving num_words unset keeps the whole vocabulary.
tokenizer_inputs = Tokenizer(filters='')
tokenizer_inputs.fit_on_texts(input_texts)
input_sequences = tokenizer_inputs.texts_to_sequences(input_texts)

tokenizer_outputs = Tokenizer(filters='')
tokenizer_outputs.fit_on_texts(target_texts)
target_sequences = tokenizer_outputs.texts_to_sequences(target_texts)

# Word indices start at 1; index 0 is left for padding, hence the +1.
num_encoder_tokens = len(tokenizer_inputs.word_index) + 1
num_decoder_tokens = len(tokenizer_outputs.word_index) + 1
max_encoder_seq_length = max(len(seq) for seq in input_sequences)
max_decoder_seq_length = max(len(seq) for seq in target_sequences)

print("Number of samples:", len(input_texts))
print("Number of unique input tokens:", num_encoder_tokens)
print("Number of unique output tokens:", num_decoder_tokens)
print("Max sequence length for inputs:", max_encoder_seq_length)
print("Max sequence length for outputs:", max_decoder_seq_length)

encoder_input_data = pad_sequences(input_sequences, maxlen=max_encoder_seq_length, padding='post')

# Teacher forcing: the decoder is fed "<start> w1 ... wn" and must predict
# "w1 ... wn <end>", i.e. the target is the input shifted one step ahead.
# Feeding the same sequence as both input and target would only teach the
# model to copy its input, and it would never see <start> as an input token.
decoder_input_sequences = [seq[:-1] for seq in target_sequences]  # Drop end token
decoder_input_data = pad_sequences(decoder_input_sequences, maxlen=max_decoder_seq_length, padding='post')

target_sequences_next = [seq[1:] for seq in target_sequences]  # Drop start token
decoder_target_data = pad_sequences(target_sequences_next, maxlen=max_decoder_seq_length, padding='post')

# Convert to one-hot
encoder_input_data = tf.keras.utils.to_categorical(encoder_input_data, num_encoder_tokens)
decoder_input_data = tf.keras.utils.to_categorical(decoder_input_data, num_decoder_tokens)
decoder_target_data = tf.keras.utils.to_categorical(decoder_target_data, num_decoder_tokens)

# Encoder: consumes the one-hot question; only its final hidden and cell
# states are kept and handed to the decoder.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(units=256, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

# Decoder: starts from the encoder states and emits a softmax over the
# output vocabulary at every time step. The layers are kept in variables
# because the inference models reuse them.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(units=256, return_sequences=True, return_state=True)
decoder_dense = Dense(units=num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(
    decoder_lstm(decoder_inputs, initial_state=encoder_states)[0]
)

# Model: (question, decoder input) -> per-step word distribution.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

# Train the model
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
)

# Inference models: the trained layers are re-wired so decoding can run one
# step at a time.
# The encoder model maps a question to its final [h, c] states.
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)

# The decoder model takes the previous token plus the previous states and
# returns the next-token distribution together with the updated states.
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, *decoder_states = decoder_lstm(
    decoder_inputs,
    initial_state=decoder_states_inputs,
)
state_h, state_c = decoder_states
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[decoder_outputs, *decoder_states],
)

# Reverse lookup token index to decode sequences back to words
reverse_input_char_index = {
    index: word for word, index in tokenizer_inputs.word_index.items()
}
reverse_target_char_index = {
    index: word for word, index in tokenizer_outputs.word_index.items()
}

def decode_sequence(input_seq):
    """Greedily decode the chatbot's reply for one encoded question.

    Args:
        input_seq: One-hot encoded question batch accepted by
            ``encoder_model``. ``chat`` passes a single padded question of
            shape (1, max_encoder_seq_length, num_encoder_tokens).

    Returns:
        The generated words joined by single spaces, without the start and
        end markers. The result may be an empty string.
    """
    start_word = start_token.lower()
    end_word = end_token.lower()

    # Encode the input as state vectors. verbose=0 keeps the per-call
    # progress bar out of the chat transcript.
    states_value = list(encoder_model.predict(input_seq, verbose=0))

    # Seed the decoder with a length-1 sequence holding the start token.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    start_token_index = tokenizer_outputs.word_index.get(start_word, 0)
    target_seq[0, 0, start_token_index] = 1.

    decoded_words = []
    # Bound the loop by decoding steps rather than by emitted words. A
    # prediction of the padding index (0) has no word, so a word-count limit
    # would never be reached and the loop could spin forever.
    for _ in range(max_decoder_seq_length):
        output_tokens, h, c = decoder_model.predict(
            [target_seq, *states_value], verbose=0)

        # Greedy sampling: take the most probable next token.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = reverse_target_char_index.get(sampled_token_index, '')

        # Exit condition: the model signalled the end of the sentence.
        if sampled_word == end_word:
            break

        # Keep real words only; padding and a stray start marker are skipped.
        if sampled_word and sampled_word != start_word:
            decoded_words.append(sampled_word)

        # Feed the sampled token and the new states into the next step.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
        states_value = [h, c]

    return ' '.join(decoded_words).strip()

# Chatbot interface
def chat():
    """Run an interactive console session with the trained chatbot.

    Reads questions from standard input, encodes each one with
    ``tokenizer_inputs`` and prints the reply produced by
    ``decode_sequence``. The loop ends when the user types 'bye' (case and
    surrounding whitespace are ignored) or when input is closed or
    interrupted.
    """
    print("ITU Admissions Chatbot: Hi! I can answer questions about ITU admissions. Type 'bye' to exit.")
    while True:
        try:
            # Strip so that "bye " or " bye" still ends the session.
            user_input = input("You: ").strip().lower()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave cleanly instead of with a traceback.
            print("\nITU Admissions Chatbot: Goodbye! Have a great day.")
            break

        if user_input == 'bye':
            print("ITU Admissions Chatbot: Goodbye! Have a great day.")
            break

        # Preprocess user input. Words the tokenizer never saw are dropped,
        # so an empty or entirely unknown question yields an empty sequence.
        input_seq = tokenizer_inputs.texts_to_sequences([user_input])
        if not input_seq or not input_seq[0]:  # If no tokens found
            print("ITU Admissions Chatbot: I'm not sure I understand. Could you rephrase your question?")
            continue

        # Same padding and one-hot encoding as the training data.
        input_seq = pad_sequences(input_seq, maxlen=max_encoder_seq_length, padding='post')
        input_seq = tf.keras.utils.to_categorical(input_seq, num_encoder_tokens)

        # Get response
        decoded_sentence = decode_sequence(input_seq)
        print("ITU Admissions Chatbot:", decoded_sentence.capitalize())

# Start chatting. This call blocks on console input and only returns once the
# user types 'bye'.
chat()

Number of samples: 38
Number of unique input tokens: 71
Number of unique output tokens: 115
Max sequence length for inputs: 9
Max sequence length for outputs: 40
Epoch 1/50
1/1 ━━━━━━━━━━━━━━━━━━━━ 4s 4s/step - accuracy: 0.0042 - loss: 4.7379 - val_accuracy: 0.4094 - val_loss: 4.5625
Epoch 2/50
1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 709ms/step - accuracy: 0.6050 - loss: 4.4779 - val_accuracy: 0.4187 - val_loss: 4.2739
Epoch 3/50
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 313ms/step - accuracy: 0.6175 - loss: 4.0526 - val_accuracy: 0.4187 - val_loss: 3.3374
Epoch 4/50
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 242ms/step - accuracy: 0.6125 - loss: 2.6021 - val_accuracy: 0.4094 - val_loss: 4.1070
Epoch 5/50
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 336ms/step - accuracy: 0.6000 - loss: 2.3384 - val_accuracy: 0.4187 - val_loss: 3.2984
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━