In [1]:
import json
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
nltk.download('punkt')
nltk.download('stopwords')

# Load dataset
with open("intents.json", "r") as file:
    data = json.load(file)

# Prepare data
patterns, tags = [], []
for intent in data:
    for pattern in intent['patterns']:
        patterns.append(pattern)
        tags.append(intent['tag'])

# Create a DataFrame
df = pd.DataFrame({'Pattern': patterns, 'Tag': tags})

# Preprocessing function
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    return ' '.join([word for word in tokens if word.isalnum() and word not in stop_words])

# Apply preprocessing
df['Pattern'] = df['Pattern'].apply(preprocess_text)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Pattern'], df['Tag'], test_size=0.3, random_state=42)

# Vectorize text data
vectorizer = TfidfVectorizer(max_features=500)
X_train_vectorized = vectorizer.fit_transform(X_train).toarray()
X_test_vectorized = vectorizer.transform(X_test).toarray()


[nltk_data] Downloading package punkt to C:\Users\samar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from sklearn.linear_model import LogisticRegression

# Train Logistic Regression
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
logistic_model.fit(X_train_vectorized, y_train)

# Evaluate on test set
y_pred_logistic = logistic_model.predict(X_test_vectorized)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_logistic))
print("\nClassification Report:\n", classification_report(y_test, y_pred_logistic))


Logistic Regression Accuracy: 0.0398406374501992

Classification Report:
                                 precision    recall  f1-score   support

                alien_invasion       0.00      0.00      0.00         2
                arcane_alchemy       0.00      0.00      0.00         2
                           art       0.00      0.00      0.00         2
                art_and_crafts       0.00      0.00      0.00         2
       artificial_intelligence       0.00      0.00      0.00         2
  artificial_superintelligence       0.00      0.00      0.00         1
               astral_artistry       0.00      0.00      0.00         1
             augmented_reality       0.00      0.00      0.00         2
                        baking       0.00      0.00      0.00         1
        biomedical_engineering       0.00      0.00      0.00         1
                book_of_spells       0.00      0.00      0.00         0
                         books       1.00      0.67      0.80

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_vectorized, y_train)
y_pred_rf = rf_model.predict(X_test_vectorized)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))

# SVM
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_vectorized, y_train)
y_pred_svm = svm_model.predict(X_test_vectorized)
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))


Random Forest Accuracy: 0.11952191235059761

Classification Report:
                                           precision    recall  f1-score   support

                                   about       0.00      0.00      0.00         0
                          alien_invasion       1.00      0.50      0.67         2
                          arcane_alchemy       0.00      0.00      0.00         2
                                     art       0.00      0.00      0.00         2
                          art_and_crafts       0.00      0.00      0.00         2
                 artificial_intelligence       1.00      0.50      0.67         2
            artificial_superintelligence       0.00      0.00      0.00         1
                         astral_artistry       0.00      0.00      0.00         1
                       augmented_reality       1.00      0.50      0.67         2
                                  baking       0.00      0.00      0.00         1
                  biomedical

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


SVM Accuracy: 0.04780876494023904

Classification Report:
                                           precision    recall  f1-score   support

                                   about       0.00      0.00      0.00         0
                          alien_invasion       0.00      0.00      0.00         2
                         alien_languages       0.00      0.00      0.00         0
                          arcane_alchemy       0.00      0.00      0.00         2
                                     art       0.00      0.00      0.00         2
                          art_and_crafts       0.00      0.00      0.00         2
                 artificial_intelligence       0.00      0.00      0.00         2
            artificial_superintelligence       0.00      0.00      0.00         1
                         astral_artistry       0.00      0.00      0.00         1
                       augmented_reality       0.00      0.00      0.00         2
                                  baki

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [4]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train_vectorized, y_train)

# Best parameters and evaluation
best_rf_model = grid_search.best_estimator_
y_pred_rf_best = best_rf_model.predict(X_test_vectorized)
print("Best Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf_best))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf_best))




Best Random Forest Accuracy: 0.13545816733067728

Classification Report:
                                           precision    recall  f1-score   support

                                   about       0.00      0.00      0.00         0
                          alien_invasion       1.00      0.50      0.67         2
                          arcane_alchemy       0.00      0.00      0.00         2
                                     art       0.00      0.00      0.00         2
                          art_and_crafts       0.00      0.00      0.00         2
                 artificial_intelligence       1.00      0.50      0.67         2
            artificial_superintelligence       0.00      0.00      0.00         1
                         astral_artistry       0.00      0.00      0.00         1
                       augmented_reality       1.00      0.50      0.67         2
                                  baking       0.00      0.00      0.00         1
                  biome

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
import pickle

# Save the best model and vectorizer
with open('chatbot_model.pkl', 'wb') as model_file:
    pickle.dump(best_rf_model, model_file)

with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
