In [None]:
import nltk
import random
import json
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score

In [None]:
# Intent catalogue, one row per intent: (tag, example patterns, candidate responses).
# The rows are expanded below into the {"intents": [...]} structure used by the
# rest of the notebook; every intent carries an empty "context_set".
_intent_rows = [
    (
        "greeting",
        ["Hi", "Hello", "Hey", "Good day", "How are you?"],
        ["Hello!", "Good to see you!", "Hi there, how can I help?"],
    ),
    (
        "farewell",
        ["Goodbye", "Bye", "See you later", "Talk to you later"],
        ["Sad to see you go :(", "Goodbye!", "Come back soon!"],
    ),
    (
        "creator",
        ["Who created you?", "Who is your developer?", "Who made you?"],
        ["I was created by Lakshmi Kommuri."],
    ),
    (
        "identity",
        ["What is your name?", "What should I call you?", "Who are you?"],
        ["You can call me Mind Reader. I'm a Chatbot."],
    ),
    (
        "hours",
        ["What are the college timings?", "When is the college open?", "What are your hours of operation?"],
        ["The college is open from 8am to 5pm, Monday to Saturday."],
    ),
    (
        "contact",
        ["How can I contact the college?", "What is the college telephone number?", "Can I get your contact number?"],
        ["You can contact the college at 123456789."],
    ),
    (
        "income",
        ["What is the average income of CUST1000?", "What is the average income of cust1000?", "avg income of CUST1000  ?"],
        ["$59544"],
    ),
    (
        "avg income total",
        ["What is the average income of all customer?", "What is the average income?", "avg income of all?"],
        ["51021.65469"],
    ),
    (
        "age",
        ["How many customers are under 30?", "customers below 30?", "Number of customers below 30?"],
        ["141"],
    ),
    (
        "loan",
        ["Compare loan defaults by gender", "loan by gender", "comparision of loan by gender"],
        ["https://drive.google.com/file/d/1_QqD6WUwXbfU_rYTOk7rKWZ3XuP-CazX/view?usp=drive_link"],
    ),
    (
        "count",
        ["Show a bar chart of transaction count by job", "bar chart of transaction count by job", "bar chat of transaction"],
        ["https://drive.google.com/file/d/1G5WBTMlxJT_Ha5reFsmdSPPFSjhqpJqP/view?usp=sharing"],
    ),
]

intents = {
    "intents": [
        {"tag": tag, "patterns": patterns, "responses": responses, "context_set": ""}
        for tag, patterns, responses in _intent_rows
    ]
}

In [None]:
def synonym_replacement(tokens, limit):
    """Create augmented sentences by swapping one token at a time for a WordNet synonym.

    For every position in ``tokens`` the WordNet lemma names of that token are
    collected, and up to ``limit`` of them are sampled at random. Each sampled
    synonym yields one new sentence in which only that position is replaced.

    Args:
        tokens: Sequence of word strings that make up the sentence.
        limit: Maximum number of synonyms to sample *per token*. The total
            number of returned sentences can therefore be as large as
            ``len(tokens) * limit``.

    Returns:
        A list of space-joined sentences. The list is empty when ``tokens`` is
        empty, when ``limit`` is not positive, or when no token has a synonym.
    """
    # A non-positive limit means "no augmentation"; random.sample would raise
    # ValueError on a negative sample size.
    if limit <= 0 or not tokens:
        return []

    tokens = list(tokens)
    augmented_sentences = []
    for i, token in enumerate(tokens):
        # Seeding `seen` with the token itself stops us from "replacing" a word
        # with the same word, which would only duplicate the input sentence.
        seen = {token.lower()}
        synonyms = []
        for syn in wordnet.synsets(token):
            for lemma in syn.lemmas():
                # WordNet joins multi-word lemmas with underscores; turn them
                # back into ordinary words so they tokenise like user text.
                name = lemma.name().replace('_', ' ')
                key = name.lower()
                if key not in seen:
                    seen.add(key)
                    synonyms.append(name)
        if not synonyms:
            continue
        for synonym in random.sample(synonyms, min(limit, len(synonyms))):
            augmented_sentences.append(' '.join(tokens[:i] + [synonym] + tokens[i + 1:]))
    return augmented_sentences

In [None]:
# Build the training corpus: one cleaned sentence per pattern plus up to
# `limit_per_tag` synonym-augmented variants for each intent tag.

# NLTK resources needed by the tokenizer, the stopword list and WordNet.
for resource in ('stopwords', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

# Synonym sampling is random; seed it so the generated corpus (and therefore
# the trained model) is the same on every Restart & Run All.
random.seed(42)

text_data = []
labels = []
stopwords = set(nltk.corpus.stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

limit_per_tag = 40

for intent in intents['intents']:
    augmented_sentences_per_tag = 0
    for example in intent['patterns']:
        tokens = nltk.word_tokenize(example.lower())
        filtered_tokens = [
            lemmatizer.lemmatize(token)
            for token in tokens
            if token not in stopwords and token.isalpha()
        ]
        # Patterns made up only of stopwords/punctuation leave nothing to
        # learn from and are skipped.
        if not filtered_tokens:
            continue

        text_data.append(' '.join(filtered_tokens))
        labels.append(intent['tag'])

        # Only top up until the per-tag augmentation budget is used.
        remaining = limit_per_tag - augmented_sentences_per_tag
        if remaining <= 0:
            continue
        # synonym_replacement limits per token, so trim to the tag budget.
        augmented_sentences = synonym_replacement(filtered_tokens, remaining)[:remaining]
        text_data.extend(augmented_sentences)
        labels.extend([intent['tag']] * len(augmented_sentences))
        augmented_sentences_per_tag += len(augmented_sentences)

print(len(text_data))
print(len(labels))

In [None]:
# Targets are the intent tags collected alongside the sentences.
y = labels

# Learn the TF-IDF vocabulary and weights from the corpus, then encode that
# same corpus into the feature matrix used for training.
vectorizer = TfidfVectorizer()
vectorizer.fit(text_data)
X = vectorizer.transform(text_data)

In [None]:
def find_best_model(X, y, test_size=0.2):
    """Grid-search several classifiers and return the best one, refit on all data.

    The data is split once into train and test parts (``random_state=100``).
    Each candidate is tuned with a 3-fold ``GridSearchCV`` on the training
    part and scored by accuracy on the test part. The candidate with the
    highest test accuracy wins (the first one wins a tie), is configured with
    its best hyperparameters and is then fitted on the complete ``X``/``y``.

    Args:
        X: Feature matrix accepted by the scikit-learn estimators below.
        y: Target labels, one per row of ``X``.
        test_size: Fraction of the data held out for scoring the candidates.

    Returns:
        The winning estimator, fitted on all of ``X`` and ``y`` with the
        hyperparameters selected by its grid search.

    Note:
        The printed accuracies describe models trained on the training part
        only. The returned model has also seen the test rows, so those scores
        are not a held-out evaluation of the returned model.
    """
    seed = 100
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed
    )

    # Estimators with internal randomness are seeded so that repeated runs
    # print the same scores and pick the same winner.
    models = [
        ('Logistic Regression', LogisticRegression(random_state=seed), {
            'penalty': ['l2'],
            'C': [0.1, 1.0, 10.0],
            'solver': ['liblinear'],
            'max_iter': [100, 1000, 10000]
        }),
        ('Multinomial Naive Bayes', MultinomialNB(), {'alpha': [0.1, 0.5, 1.0]}),
        ('Linear SVC', LinearSVC(random_state=seed), {
            'penalty': ['l2'],
            'loss': ['hinge', 'squared_hinge'],
            'C': [0.1, 1, 10],
            'max_iter': [100, 1000, 10000]
        }),
        ('Decision Tree', DecisionTreeClassifier(random_state=seed), {
            'max_depth': [5, 10, 20, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'criterion': ['gini', 'entropy']
        }),
        ('Random Forest', RandomForestClassifier(random_state=seed), {
            'n_estimators': [100, 200, 300],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        })
    ]

    best_name = None
    best_estimator = None
    best_params = None
    best_score = float('-inf')

    # Run each grid search exactly once and remember the winner as we go.
    for name, model, param_grid in models:
        grid = GridSearchCV(model, param_grid, cv=3, n_jobs=-1)
        grid.fit(X_train, y_train)
        y_pred = grid.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        print(f'{name}: {score:.4f} (best parameters: {grid.best_params_})')

        # Strict '>' keeps the earliest candidate on ties.
        if score > best_score:
            best_name = name
            best_score = score
            best_estimator = model
            best_params = grid.best_params_

    print(f'\nBest model: {best_name}')

    # GridSearchCV works on clones, so `best_estimator` is still unfitted and
    # still has its default settings. Apply the tuned hyperparameters before
    # the final fit on the complete data set.
    best_estimator.set_params(**best_params)
    best_estimator.fit(X, y)

    return best_estimator

In [None]:
# Tune every candidate classifier on the TF-IDF features and keep the winner.
# This runs a grid search per candidate, so the cell can take a while.
best_model = find_best_model(X, y)

In [None]:
def chatbot_response(user_input):
    """Return a reply for ``user_input`` based on its predicted intent.

    The text is lower-cased, tokenised, stripped of stopwords and non-alphabetic
    tokens, and lemmatised, which is the same cleaning applied to the training
    patterns. It is then vectorised and classified. A random response of the
    predicted intent is returned.

    Args:
        user_input: Raw text typed by the user.

    Returns:
        One of the predicted intent's responses, or a fallback message when
        the input contains no word known to the vectorizer or the predicted
        tag has no entry in ``intents``.
    """
    fallback = "Sorry, I didn't understand that. Could you rephrase?"

    # Mirror the training-time preprocessing so that, for example, plural
    # forms typed by the user match the lemmatised training vocabulary.
    tokens = nltk.word_tokenize(user_input.lower())
    cleaned_text = ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stopwords and token.isalpha()
    )

    input_text = vectorizer.transform([cleaned_text])
    # An all-zero vector means no known word was found; a prediction on it
    # would be arbitrary, so answer with the fallback instead.
    if input_text.nnz == 0:
        return fallback

    predicted_intent = best_model.predict(input_text)[0]

    for intent in intents['intents']:
        if intent['tag'] == predicted_intent:
            return random.choice(intent['responses'])

    # Defensive: the predicted tag is missing from the intents list.
    return fallback

In [None]:
# Interactive console loop: read a line, answer it, stop on "quit".
print('Hello! I am a chatbot. How can I help you today? Type "quit" to exit.')
while True:
    try:
        # Strip so that stray spaces around "quit" still end the session.
        user_input = input('> ').strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or an interrupt ends the chat without a traceback.
        break
    if user_input.lower() == 'quit':
        break
    if not user_input:
        # Nothing was typed; prompt again instead of classifying empty text.
        continue
    print(chatbot_response(user_input))

Hello! I am a chatbot. How can I help you today? Type "quit" to exit.
Goodbye!
$59544
Goodbye!
Sad to see you go :(
Come back soon!


In [None]:
import os
import pickle

# Persist the trained artefacts so they can be reloaded without retraining.
# exist_ok=True creates the folders when missing and is a no-op otherwise,
# which keeps the cell safe to re-run and avoids a check-then-create race.
for directory in ('model', 'dataset'):
    os.makedirs(directory, exist_ok=True)

# Save the trained model.
# NOTE: pickle files can execute arbitrary code when loaded; only load these
# files back from a location you trust.
with open('model/chatbot_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# Save the vectorizer (needed to encode user input exactly as in training).
with open('model/vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

# Save the intents to the "dataset" folder.
with open('dataset/intents1.json', 'w', encoding='utf-8') as f:
    json.dump(intents, f)