# Statistical NLP Part B

In [2]:
# Importing necessary libraries
import json
import numpy as np
import random
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Downloading nltk data
# - 'punkt' / 'punkt_tab': sentence/word tokenizer models used by
#   nltk.word_tokenize. Recent NLTK releases load 'punkt_tab' instead of the
#   pickled 'punkt' model, so both are fetched to keep the notebook runnable
#   on a fresh install of either an old or a new NLTK.
# - 'wordnet' / 'omw-1.4': WordNet data for the lemmatizer.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Shared lemmatizer used by both training-data preprocessing and user input.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kanak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kanak\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\kanak\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# Load the data from the provided JSON file
# JSON text is UTF-8 by specification; the encoding is passed explicitly so the
# read does not depend on the platform's default code page (e.g. cp1252 on
# Windows), which could mis-decode or reject non-ASCII characters.
with open('GL Bot.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Explore the structure of the dataset
print(json.dumps(data, indent=4))

{
    "intents": [
        {
            "tag": "Intro",
            "patterns": [
                "hi",
                "how are you",
                "is anyone there",
                "hello",
                "whats up",
                "hey",
                "yo",
                "listen",
                "please help me",
                "i am learner from",
                "i belong to",
                "aiml batch",
                "aifl batch",
                "i am from",
                "my pm is",
                "blended",
                "online",
                "i am from",
                "hey ya",
                "talking to you for first time"
            ],
            "responses": [
                "Hello! how can i help you ?"
            ],
            "context_set": ""
        },
        {
            "tag": "Exit",
            "patterns": [
                "thank you",
                "thanks",
                "cya",
                "see you",
                "l

In [4]:
# Data Preprocessing
# Containers filled while walking the intents:
patterns = []    # one lower-cased, lemmatized token list per training phrase
tags = []        # intent tag of the phrase at the same position in `patterns`
responses = {}   # tag -> list of replies (the first intent seen for a tag wins)
classes = []     # distinct tags, in order of first appearance

for intent in data['intents']:
    tag = intent['tag']

    for phrase in intent['patterns']:
        # Tokenize the phrase, then normalise every token in the same pass
        tokens = nltk.word_tokenize(phrase)
        patterns.append([lemmatizer.lemmatize(token.lower()) for token in tokens])
        tags.append(tag)

    # Remember the replies for this tag unless an earlier intent already did
    if tag not in responses:
        responses[tag] = intent['responses']

    # Register the tag as a class label the first time it shows up
    if tag not in classes:
        classes.append(tag)

print("Patterns (tokenized and lemmatized):", patterns[:5])
print("Tags:", tags[:5])

Patterns (tokenized and lemmatized): [['hi'], ['how', 'are', 'you'], ['is', 'anyone', 'there'], ['hello'], ['whats', 'up']]
Tags: ['Intro', 'Intro', 'Intro', 'Intro', 'Intro']


In [5]:
# Create a Bag of Words Model
# `patterns` already holds lower-cased, lemmatized tokens, so the vocabulary is
# simply the set of distinct tokens. Lemmatizing them a second time is
# redundant and, because lemmatization is not guaranteed to be idempotent,
# could produce vocabulary entries that no token in `patterns` ever matches.
# Sorting makes the feature-column order explicit.
vocabulary = sorted({token for tokens in patterns for token in tokens})

# Convert the patterns into a bag of words (BOW) representation.
# The documents are pre-tokenized lists, so the tokenizer and preprocessor are
# identity functions and lowercasing is disabled. token_pattern=None avoids
# scikit-learn's warning that the default pattern is unused when a custom
# tokenizer is supplied.
vectorizer = CountVectorizer(
    vocabulary=vocabulary,
    lowercase=False,
    tokenizer=lambda tokens: tokens,
    preprocessor=lambda tokens: tokens,
    token_pattern=None,
)
X = vectorizer.fit_transform(patterns).toarray()

# Assign tag indices to the responses (target labels):
# each label is the position of the pattern's tag in `classes`.
Y = np.array([classes.index(tag) for tag in tags])

# Check the shape of X and Y
print("Shape of X (features):", X.shape)
print("Shape of Y (target labels):", Y.shape)

Shape of X (features): (128, 158)
Shape of Y (target labels): (128,)




In [6]:
# Train a Simple Classifier

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# One seed for every stochastic step in this cell so a re-run reproduces it.
SEED = 42

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=SEED)

# Train the SVM classifier.
# probability=True makes scikit-learn fit an internal cross-validated
# calibration that shuffles the data; without random_state the values returned
# by predict_proba differ from run to run.
clf = SVC(kernel='linear', probability=True, random_state=SEED)
clf.fit(X_train, Y_train)

# Test the classifier on the test set
Y_pred = clf.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)

print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 53.85%


In [7]:
# Create Functions for Chatbot Interaction
# Function to preprocess user input
def clean_input(user_input):
    """Turn raw user text into a bag-of-words array for the classifier.

    The text is tokenized with ``nltk.word_tokenize``; each token is
    lower-cased and lemmatized, and the resulting token list is passed to the
    fitted ``vectorizer``.

    Args:
        user_input: The message typed by the user.

    Returns:
        The dense array produced by ``vectorizer.transform`` for a batch
        containing this single message.
    """
    lemmas = []
    for token in nltk.word_tokenize(user_input):
        lemmas.append(lemmatizer.lemmatize(token.lower()))

    # transform() works on a batch of documents, hence the one-element list.
    return vectorizer.transform([lemmas]).toarray()

# Function to get response based on model prediction
def get_response(user_input):
    """Pick a reply for the intent the classifier predicts for ``user_input``.

    Args:
        user_input: The message typed by the user.

    Returns:
        One entry chosen at random from the replies stored in ``responses``
        for the predicted tag.
    """
    features = clean_input(user_input)

    # predict() returns the label itself, which is an index into `classes`.
    label = clf.predict(features)[0]
    candidates = responses[classes[label]]

    return random.choice(candidates)

In [8]:
# Build the Chatbot Loop
def chat():
    """Run an interactive console chat until the user asks to leave.

    Each line read with ``input()`` is answered via ``get_response``. The loop
    ends when the message is one of the exit keywords; that check ignores
    letter case and surrounding whitespace, so input such as " Bye " ends the
    session instead of being sent to the classifier.
    """
    exit_keywords = {"exit", "quit", "bye", "goodbye"}

    print("Bot: Hello! How can I help you today?")

    while True:
        # Take user input
        user_input = input("You: ")

        # If the user wants to exit (tolerate stray spaces around the keyword)
        if user_input.strip().lower() in exit_keywords:
            print("Bot: I hope I was able to assist you. Goodbye!")
            break

        # Get the chatbot's response
        bot_response = get_response(user_input)
        print(f"Bot: {bot_response}")

# Start the chatbot interaction
# Blocks on input() until the user types one of the exit keywords.
chat()

Bot: Hello! How can I help you today?


You:  Hi


Bot: Hello! how can i help you ?


You:  Tell me about Great Learning


Bot: I hope I was able to assist you, Good Bye


You:  Bye


Bot: I hope I was able to assist you. Goodbye!


In [9]:
# Improve User Interaction with More Responses and Handling Unknown Inputs
# Modify the get_response function to handle unknown inputs
def get_response(user_input, threshold=0.3):
    """Pick a reply for ``user_input``, or a fallback when the model is unsure.

    Args:
        user_input: The message typed by the user.
        threshold: Minimum probability the most likely class must reach;
            below it the fallback message is returned instead of a reply.

    Returns:
        A randomly chosen reply for the predicted tag, or the fallback
        message when the highest class probability is below ``threshold``.
    """
    input_vector = clean_input(user_input)
    predicted_proba = clf.predict_proba(input_vector)[0]

    best_column = int(np.argmax(predicted_proba))

    # If the highest probability is below the threshold, return a fallback response
    if predicted_proba[best_column] < threshold:
        return "I'm sorry, I didn't understand that. Could you please rephrase?"

    # predict_proba columns follow clf.classes_ (the labels seen during fit),
    # not positions in `classes`. If the train split misses a tag, the column
    # number and the tag index diverge, so map the column back to its label.
    predicted_tag_index = clf.classes_[best_column]
    predicted_tag = classes[predicted_tag_index]

    # Return a random response from the corresponding tag
    return random.choice(responses[predicted_tag])

# Re-run the chat loop to include this new functionality.
# chat() looks up get_response by name each time it runs, so this call uses the
# threshold-aware definition from this cell without chat() being redefined.
chat()

Bot: Hello! How can I help you today?


You:  Hi


Bot: Hello! how can i help you ?


You:  Bye


Bot: I hope I was able to assist you. Goodbye!


In [10]:
# Add Logging to Track Chat Conversations
# Function to log the conversation to a file
def log_conversation(user_input, bot_response):
    """Append one user/bot exchange to ``chat_log.txt``.

    The file is opened in append mode in the current working directory and is
    written as UTF-8, so messages containing characters outside the platform's
    default code page are logged instead of raising ``UnicodeEncodeError``.

    Args:
        user_input: The text the user typed.
        bot_response: The reply shown to the user.
    """
    with open("chat_log.txt", "a", encoding="utf-8") as log_file:
        log_file.write(f"User: {user_input}\n")
        # Blank line after the bot's reply separates consecutive exchanges.
        log_file.write(f"Bot: {bot_response}\n\n")

# Modify the chat function to include logging
def chat():
    """Run the interactive console chat and log every exchange.

    Each line read with ``input()`` is answered via ``get_response`` and the
    pair is recorded with ``log_conversation``. The loop ends when the message
    is one of the exit keywords; that check ignores letter case and
    surrounding whitespace, so input such as " Bye " ends the session instead
    of being sent to the classifier. The farewell exchange is logged as well.
    """
    exit_keywords = {"exit", "quit", "bye", "goodbye"}

    print("Bot: Hello! How can I help you today?")

    while True:
        # Take user input
        user_input = input("You: ")

        # If the user wants to exit (tolerate stray spaces around the keyword)
        if user_input.strip().lower() in exit_keywords:
            bot_response = "I hope I was able to assist you. Goodbye!"
            print(f"Bot: {bot_response}")
            # The text is logged exactly as typed, not the normalised keyword.
            log_conversation(user_input, bot_response)
            break

        # Get the chatbot's response
        bot_response = get_response(user_input)
        print(f"Bot: {bot_response}")

        # Log the conversation
        log_conversation(user_input, bot_response)

# Start the chatbot with logging
# Every exchange, including the farewell, is appended to chat_log.txt in the
# current working directory.
chat()

Bot: Hello! How can I help you today?


You:  HI


Bot: Hello! how can i help you ?


You:  Bye


Bot: I hope I was able to assist you. Goodbye!
