In [1]:
pip install nltk scikit-learn streamlit


Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
import json
import random
import string
import numpy as np
import streamlit as st
import pickle

# Fetch the NLTK resources needed by word_tokenize and the stop-word filter.
# Recent NLTK releases (3.8.2 and later) load the tokenizer model from
# 'punkt_tab' instead of the older 'punkt' package, so request both; a
# package that is unknown to the installed version is reported, not raised.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gunti\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gunti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Intent table: one row per intent as (intent name, example questions, canned responses).
_INTENT_ROWS = (
    (
        "greetings",
        ["Hi", "Hello", "Good morning", "Good evening"],
        ["Hello!", "Hi there!", "Greetings!", "Good day!"],
    ),
    (
        "farewells",
        ["Bye", "Goodbye", "See you later", "Take care"],
        ["Goodbye!", "See you later!", "Take care!", "Bye!"],
    ),
    (
        "thanks",
        ["Thank you", "Thanks", "Much appreciated"],
        ["You're welcome!", "No problem!", "Glad to help!"],
    ),
)

# Mapping of intent name -> {"questions": [...], "responses": [...]}, in declaration order.
data = {
    intent: {"questions": example_questions, "responses": canned_responses}
    for intent, example_questions, canned_responses in _INTENT_ROWS
}

In [4]:
def preprocess(text):
    """Normalise a sentence into a space-joined string of content words.

    The text is lowercased, every character in ``string.punctuation`` is
    deleted, the remainder is split with NLTK's ``word_tokenize`` and
    English stop words are dropped.

    Args:
        text: The raw sentence as a ``str``.

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        no token survives).
    """
    # Build the stop-word set once per call: evaluating
    # stopwords.words('english') inside the filter would rebuild the whole
    # list for every token and then scan it linearly.
    english_stop_words = set(stopwords.words('english'))

    normalised = text.lower().translate(str.maketrans('', '', string.punctuation))
    kept_tokens = [
        token
        for token in word_tokenize(normalised)
        if token not in english_stop_words
    ]
    return ' '.join(kept_tokens)

In [5]:
# Flatten the intent table into (preprocessed question, intent label) pairs,
# keeping the order in which intents and their questions are declared.
_training_pairs = [
    (preprocess(question), label)
    for label, content in data.items()
    for question in content['questions']
]

# scikit-learn is fed numpy arrays rather than plain Python lists.
questions = np.array([text for text, _ in _training_pairs])
labels = np.array([label for _, label in _training_pairs])

In [6]:
# Display the array of preprocessed training questions.
questions

array(['hi', 'hello', 'good morning', 'good evening', 'bye', 'goodbye',
       'see later', 'take care', 'thank', 'thanks', 'much appreciated'],
      dtype='<U16')

In [7]:
# Display the intent label paired with each training question.
labels

array(['greetings', 'greetings', 'greetings', 'greetings', 'farewells',
       'farewells', 'farewells', 'farewells', 'thanks', 'thanks',
       'thanks'], dtype='<U9')

In [8]:
# TF-IDF text features feeding a multinomial Naive Bayes classifier.
text_vectorizer = TfidfVectorizer()
intent_classifier = MultinomialNB()
model = make_pipeline(text_vectorizer, intent_classifier)

# Train on the preprocessed questions; fit() is the last expression, so the
# cell still displays the fitted pipeline.
model.fit(questions, labels)

In [9]:
# Serialise the fitted pipeline to chatbot_model.pkl in the working directory.
with open('chatbot_model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)