# Statistical NLP Part A (run on GL Bot.json)

In [2]:
# Import necessary libraries
import json
import random
import nltk
from nltk.stem import WordNetLemmatizer
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Download NLTK data (if not already downloaded)
nltk.download('punkt')
nltk.download('wordnet')

# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kanak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kanak\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
#1. Read and Analyse Dataset.
# Load the JSON file
file_path = 'GL Bot.json'  # File path

with open(file_path, 'r') as f:
    data = json.load(f)

# Inspecting the data structure
intents = data['intents']  # Accessing the intents from the JSON

# Printing the first intent to understand the structure
print(f"Number of intents: {len(intents)}")
print(f"First intent: {intents[0]}")

Number of intents: 8
First intent: {'tag': 'Intro', 'patterns': ['hi', 'how are you', 'is anyone there', 'hello', 'whats up', 'hey', 'yo', 'listen', 'please help me', 'i am learner from', 'i belong to', 'aiml batch', 'aifl batch', 'i am from', 'my pm is', 'blended', 'online', 'i am from', 'hey ya', 'talking to you for first time'], 'responses': ['Hello! how can i help you ?'], 'context_set': ''}


In [4]:
#2. Preprocess unstructured data to make it consumable for model training.

# Initialize lists to store patterns and tags
patterns = []
tags = []

# Loop through each intent
for intent in intents:
    for pattern in intent['patterns']:
        # Tokenize each pattern
        word_list = nltk.word_tokenize(pattern)
        # Lemmatize each word and convert to lowercase
        words = [lemmatizer.lemmatize(w.lower()) for w in word_list]
        patterns.append(" ".join(words))
        tags.append(intent['tag'])

# Print the first 5 preprocessed patterns and their tags
print("Preprocessed Patterns and Tags:")
for i in range(5):
    print(f"Pattern: {patterns[i]}, Tag: {tags[i]}")

Preprocessed Patterns and Tags:
Pattern: hi, Tag: Intro
Pattern: how are you, Tag: Intro
Pattern: is anyone there, Tag: Intro
Pattern: hello, Tag: Intro
Pattern: whats up, Tag: Intro


In [5]:
#3C. Vectorize data using any one vectorizer.

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the patterns into a vectorized form
X = vectorizer.fit_transform(patterns).toarray()

# Convert tags into numerical labels for classification
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(tags)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shape of the training and testing data
print(f"Training data shape: {X_train.shape}, Testing data shape: {X_test.shape}")

Training data shape: (102, 157), Testing data shape: (26, 157)


In [6]:
#3D. Build a base model for Supervised Learning

# Initialize the Naive Bayes classifier
model = MultinomialNB()

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

In [7]:
#3E. Clearly print Performance Metrics.

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Classification Report:
              precision    recall  f1-score   support

         Bot       1.00      0.50      0.67         2
        Exit       0.33      0.33      0.33         3
       Intro       0.50      0.40      0.44         5
          NN       0.60      0.50      0.55         6
     Olympus       0.67      0.67      0.67         3
     Profane       0.00      0.00      0.00         2
          SL       0.33      0.75      0.46         4
      Ticket       0.00      0.00      0.00         1

    accuracy                           0.46        26
   macro avg       0.43      0.39      0.39        26
weighted avg       0.48      0.46      0.45        26



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
#4. Improve Performance of model.
#4A. Experiment with other vectorisers.

from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the patterns into a TF-IDF matrix
X_tfidf = tfidf_vectorizer.fit_transform(patterns).toarray()

# Split the TF-IDF data into training and testing sets
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Train the Naive Bayes classifier on the TF-IDF features
model_tfidf = MultinomialNB()
model_tfidf.fit(X_train_tfidf, y_train_tfidf)

# Make predictions on the test data (TF-IDF)
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)

# Evaluate the model's performance
print("Classification Report (TF-IDF):")
print(classification_report(y_test_tfidf, y_pred_tfidf, target_names=label_encoder.classes_))

Classification Report (TF-IDF):
              precision    recall  f1-score   support

         Bot       1.00      0.50      0.67         2
        Exit       0.50      0.33      0.40         3
       Intro       0.67      0.40      0.50         5
          NN       1.00      0.50      0.67         6
     Olympus       1.00      0.33      0.50         3
     Profane       0.00      0.00      0.00         2
          SL       0.25      1.00      0.40         4
      Ticket       0.00      0.00      0.00         1

    accuracy                           0.46        26
   macro avg       0.55      0.38      0.39        26
weighted avg       0.65      0.46      0.47        26



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
#4B. Build classifier Models using other algorithms than base model.
# Using Support Vector Machine (SVM)

from sklearn.svm import SVC

# Initialize the SVM classifier
svm_model = SVC(kernel='linear')

# Train the SVM classifier on the TF-IDF features
svm_model.fit(X_train_tfidf, y_train_tfidf)

# Make predictions on the test data
y_pred_svm = svm_model.predict(X_test_tfidf)

# Evaluate the SVM model's performance
print("Classification Report (SVM with TF-IDF):")
print(classification_report(y_test_tfidf, y_pred_svm, target_names=label_encoder.classes_))

Classification Report (SVM with TF-IDF):
              precision    recall  f1-score   support

         Bot       1.00      0.50      0.67         2
        Exit       0.25      0.33      0.29         3
       Intro       0.40      0.40      0.40         5
          NN       0.75      0.50      0.60         6
     Olympus       1.00      0.67      0.80         3
     Profane       0.00      0.00      0.00         2
          SL       0.40      1.00      0.57         4
      Ticket       0.00      0.00      0.00         1

    accuracy                           0.50        26
   macro avg       0.47      0.42      0.42        26
weighted avg       0.53      0.50      0.48        26



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
#4B. Build classifier Models using other algorithms than base model.
# Using Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the Random Forest classifier on the TF-IDF features
rf_model.fit(X_train_tfidf, y_train_tfidf)

# Make predictions on the test data
y_pred_rf = rf_model.predict(X_test_tfidf)

# Evaluate the Random Forest model's performance
print("Classification Report (Random Forest with TF-IDF):")
print(classification_report(y_test_tfidf, y_pred_rf, target_names=label_encoder.classes_))

Classification Report (Random Forest with TF-IDF):
              precision    recall  f1-score   support

         Bot       1.00      0.50      0.67         2
        Exit       0.33      0.33      0.33         3
       Intro       0.40      0.40      0.40         5
          NN       0.50      0.67      0.57         6
     Olympus       1.00      0.67      0.80         3
     Profane       0.00      0.00      0.00         2
          SL       0.60      0.75      0.67         4
      Ticket       0.00      0.00      0.00         1

    accuracy                           0.50        26
   macro avg       0.48      0.41      0.43        26
weighted avg       0.52      0.50      0.49        26



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
#4C. Tune Parameters/Hyperparameters of the model/s.
# Hyperparameter Tuning for SVM using GridSearchCV

from sklearn.model_selection import GridSearchCV

# Define the parameter grid for SVM
svm_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto']
}

# Initialize GridSearchCV with SVM
svm_grid = GridSearchCV(SVC(), svm_param_grid, refit=True, verbose=3, cv=5)

# Fit the model
svm_grid.fit(X_train_tfidf, y_train_tfidf)

# Print the best parameters
print(f"Best parameters for SVM: {svm_grid.best_params_}")

# Make predictions with the best SVM model
y_pred_svm_tuned = svm_grid.predict(X_test_tfidf)

# Evaluate the tuned SVM model's performance
print("Classification Report (Tuned SVM with TF-IDF):")
print(classification_report(y_test_tfidf, y_pred_svm_tuned, target_names=label_encoder.classes_))

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.238 total time=   0.0s
[CV 2/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.238 total time=   0.0s
[CV 3/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.250 total time=   0.0s
[CV 4/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.250 total time=   0.0s
[CV 5/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.250 total time=   0.0s
[CV 1/5] END ....C=0.1, gamma=scale, kernel=rbf;, score=0.238 total time=   0.0s
[CV 2/5] END ....C=0.1, gamma=scale, kernel=rbf;, score=0.238 total time=   0.0s
[CV 3/5] END ....C=0.1, gamma=scale, kernel=rbf;, score=0.250 total time=   0.0s
[CV 4/5] END ....C=0.1, gamma=scale, kernel=rbf;, score=0.250 total time=   0.0s
[CV 5/5] END ....C=0.1, gamma=scale, kernel=rbf;, score=0.250 total time=   0.0s
[CV 1/5] END ...C=0.1, gamma=scale, kernel=poly;, score=0.238 total time=   0.0s
[CV 2/5] END ...C=0.1, gamma=scale, kernel=poly;

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
#4C. Tune Parameters/Hyperparameters of the model/s.
# Hyperparameter Tuning for Random Forest using GridSearchCV

# Define the parameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV with Random Forest
rf_grid = GridSearchCV(RandomForestClassifier(), rf_param_grid, refit=True, verbose=3, cv=5)

# Fit the model
rf_grid.fit(X_train_tfidf, y_train_tfidf)

# Print the best parameters
print(f"Best parameters for Random Forest: {rf_grid.best_params_}")

# Make predictions with the best Random Forest model
y_pred_rf_tuned = rf_grid.predict(X_test_tfidf)

# Evaluate the tuned Random Forest model's performance
print("Classification Report (Tuned Random Forest with TF-IDF):")
print(classification_report(y_test_tfidf, y_pred_rf_tuned, target_names=label_encoder.classes_))

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV 1/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.476 total time=   0.0s
[CV 2/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.476 total time=   0.0s
[CV 3/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.400 total time=   0.0s
[CV 4/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.650 total time=   0.0s
[CV 5/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.450 total time=   0.0s
[CV 1/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.381 total time=   0.1s
[CV 2/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.429 total time=   0.1s
[CV 3/5] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.300 total time=  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
#4D. Clearly print Performance Metrics.

# Summary of models and vectorizers
print("Model Performance Summary:")

# Naive Bayes with Count Vectorizer
print("\nNaive Bayes with Count Vectorizer:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Naive Bayes with TF-IDF
print("\nNaive Bayes with TF-IDF:")
print(classification_report(y_test_tfidf, y_pred_tfidf, target_names=label_encoder.classes_))

# SVM with TF-IDF (Before tuning)
print("\nSVM with TF-IDF (Before tuning):")
print(classification_report(y_test_tfidf, y_pred_svm, target_names=label_encoder.classes_))

# Tuned SVM with TF-IDF
print("\nTuned SVM with TF-IDF:")
print(classification_report(y_test_tfidf, y_pred_svm_tuned, target_names=label_encoder.classes_))

# Random Forest with TF-IDF (Before tuning)
print("\nRandom Forest with TF-IDF (Before tuning):")
print(classification_report(y_test_tfidf, y_pred_rf, target_names=label_encoder.classes_))

# Tuned Random Forest with TF-IDF
print("\nTuned Random Forest with TF-IDF:")
print(classification_report(y_test_tfidf, y_pred_rf_tuned, target_names=label_encoder.classes_))

Model Performance Summary:

Naive Bayes with Count Vectorizer:
              precision    recall  f1-score   support

         Bot       1.00      0.50      0.67         2
        Exit       0.33      0.33      0.33         3
       Intro       0.50      0.40      0.44         5
          NN       0.60      0.50      0.55         6
     Olympus       0.67      0.67      0.67         3
     Profane       0.00      0.00      0.00         2
          SL       0.33      0.75      0.46         4
      Ticket       0.00      0.00      0.00         1

    accuracy                           0.46        26
   macro avg       0.43      0.39      0.39        26
weighted avg       0.48      0.46      0.45        26


Naive Bayes with TF-IDF:
              precision    recall  f1-score   support

         Bot       1.00      0.50      0.67         2
        Exit       0.50      0.33      0.40         3
       Intro       0.67      0.40      0.50         5
          NN       1.00      0.50      0.67

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

### 5. Share insights on relative performance comparison.
### 5A. Which vectorizer performed better? Probable reason?



### 5B. Which model outperformed? Probable reason?

### 5C. Which parameter/hyperparameter significantly helped to improve performance?Probable reason?

### 5D. According to you, which performance metric should be given most importance, why?