In [11]:
# importing the necessary libraries
import nltk
# Download the NLTK resources this notebook relies on: 'punkt' (tokenizer models used by
# nltk.word_tokenize) and 'wordnet' (corpus used by WordNetLemmatizer).
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import PorterStemmer
# Shared stemmer instance, used by preprocess().
PS = PorterStemmer()
from nltk import WordNetLemmatizer
# Shared lemmatizer instance, used by preprocess().
lemmatizer = WordNetLemmatizer()
import time
import numpy as np
import pickle
import os
import json
from string import punctuation
# Rebind `punctuation` from the imported string constant to a list of single characters so
# that multi-character tokens can be appended to it below.
punctuation = list(punctuation)
# Contraction suffixes that should be dropped together with ordinary punctuation.
# NOTE(review): presumably these are the pieces the tokenizer splits off words such as
# "how's" -- the recorded output for "Hey, how's it going?" shows the 's removed.
punctuation.append("'s")
punctuation.append("'m")
punctuation.append("'d")
punctuation.append("'ve")
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import warnings
# Silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')
!pip install imbalanced-learn
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
# Oversampler used as the balancing step of the pipelines below.
# Seeded so the synthetic minority samples are identical on every run; 43 is the same
# random_state the notebook passes to train_test_split.
over = SMOTE(random_state=43)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
#converting the json file to a pandas dataframe
# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend on the
# platform's default locale.
with open('intents.json', encoding='utf-8') as file:
  data = json.load(file)

# Flatten the nested structure into parallel lists: one entry per prompt, each paired
# with the tag of the intent it belongs to.
prompts = [prompt for intent in data['intents'] for prompt in intent['prompts']]
labels = [intent['tag'] for intent in data['intents'] for _ in intent['prompts']]

df = pd.DataFrame({'label': labels,
                   'prompts': prompts})
# index=False keeps the row index out of the file, so it reloads with just the two columns.
df.to_csv('data.csv',index=False)

In [4]:
#Reading the csv file
# Reload the prompts from the CSV written by the previous cell, print the frame's
# (rows, columns) shape and display its first rows.
df = pd.read_csv('data.csv')
print(df.shape)
df.head()

(329, 2)


Unnamed: 0,label,prompts
0,greeting,Hello there!
1,greeting,Hiiii
2,greeting,"Hey, how's it going?"
3,greeting,Anyone there?
4,greeting,"Hey, stranger!"


In [33]:
# Preprocessing the text
# Tokenization -> lower-casing -> punctuation removal (cleaned text) -> stemming OR lemmatization
def preprocess(text, mode='stem'):
    """Normalise a raw prompt into a single space-separated string of tokens.

    The text is tokenized with ``nltk.word_tokenize``, lower-cased, and stripped of
    every token found in the module-level ``punctuation`` list. The surviving tokens
    are then optionally reduced, depending on ``mode``.

    Parameters
    ----------
    text : str
        Raw prompt text.
    mode : {'stem', 'lemma', 'clean'}, default 'stem'
        'stem'  - apply the Porter stemmer ``PS`` to each token (the original behaviour).
        'lemma' - apply ``lemmatizer`` to each token, treating it as a verb ('v').
        'clean' - return the cleaned tokens unchanged.

    Returns
    -------
    str
        The processed tokens joined by single spaces ('' when no token survives).

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported values.
    """
    # Validate first so a typo in `mode` fails loudly instead of silently picking a variant.
    if mode not in ('stem', 'lemma', 'clean'):
        raise ValueError(f"mode must be 'stem', 'lemma' or 'clean', got {mode!r}")

    tokens = [token.lower() for token in nltk.word_tokenize(text)]
    cleaned_words = [token for token in tokens if token not in punctuation]

    # Only the requested reduction is computed; previously the lemmatized list was built
    # on every call and then discarded.
    if mode == 'stem':
        cleaned_words = [PS.stem(token) for token in cleaned_words]
    elif mode == 'lemma':
        cleaned_words = [lemmatizer.lemmatize(token, 'v') for token in cleaned_words]
    return " ".join(cleaned_words)

In [19]:
# Build one text column per preprocessing variant. The model-selection and training cells
# read 'Stemmed', 'Cleaned' and 'Lemmatized' by name, so all three must exist after a
# fresh Restart & Run All.
# Tokenize, lower-case and drop punctuation once; the cleaned and lemmatized variants
# are derived from these shared tokens.
cleaned_tokens = df['prompts'].apply(
    lambda text: [word for word in map(str.lower, nltk.word_tokenize(text)) if word not in punctuation]
)
df['Lemmatized'] = cleaned_tokens.apply(
    lambda words: " ".join(lemmatizer.lemmatize(word, 'v') for word in words)
)
# 'Stemmed' goes through preprocess() itself so training text is produced by exactly the
# same function that is applied to prompts at prediction time.
df['Stemmed'] = df['prompts'].apply(preprocess)
df['Cleaned'] = cleaned_tokens.apply(" ".join)

In [20]:
# Display the last rows of the frame to eyeball the preprocessed text columns.
df.tail()

Unnamed: 0,label,prompts,Lemmatized,Stemmed,Cleaned
324,clubs,societies,societies,societi,societies
325,clubs,teams,team,team,teams
326,clubs,co-curricular clubs,co-curricular club,co-curricular club,co-curricular clubs
327,clubs,groups,group,group,groups
328,clubs,clubs at college,club at college,club at colleg,clubs at college


In [21]:
# How many prompts each intent label has (most frequent first)
print(df['label'].value_counts())

information    24
projects       22
course         22
greeting       21
goodbye        21
placements     21
hostel         21
fees           21
clubs          20
address        20
library        19
canteen        18
admission      18
events         17
scholarship    16
name           15
hours          13
Name: label, dtype: int64


In [22]:
# Give every distinct label an integer id, numbered in the order df.label.unique() returns them
n_labels = {label: index for index, label in enumerate(df.label.unique())}
# count ends up equal to the number of distinct labels
count = len(n_labels)
print(n_labels)

{'greeting': 0, 'name': 1, 'goodbye': 2, 'address': 3, 'course': 4, 'information': 5, 'library': 6, 'fees': 7, 'hours': 8, 'canteen': 9, 'scholarship': 10, 'projects': 11, 'admission': 12, 'hostel': 13, 'events': 14, 'placements': 15, 'clubs': 16}


In [24]:
# Inverse lookup of n_labels: integer id -> label name
labelled_data = {number: name for name, number in n_labels.items()}
print(labelled_data)

{0: 'greeting', 1: 'name', 2: 'goodbye', 3: 'address', 4: 'course', 5: 'information', 6: 'library', 7: 'fees', 8: 'hours', 9: 'canteen', 10: 'scholarship', 11: 'projects', 12: 'admission', 13: 'hostel', 14: 'events', 15: 'placements', 16: 'clubs'}


In [25]:
# Numeric target column: each row's label translated through the n_labels dictionary
df['n_labels'] = df['label'].map(n_labels)
df.head()

Unnamed: 0,label,prompts,Lemmatized,Stemmed,Cleaned,n_labels
0,greeting,Hello there!,hello there,hello there,hello there,0
1,greeting,Hiiii,hiiii,hiiii,hiiii,0
2,greeting,"Hey, how's it going?",hey how it go,hey how it go,hey how it going,0
3,greeting,Anyone there?,anyone there,anyon there,anyone there,0
4,greeting,"Hey, stranger!",hey stranger,hey stranger,hey stranger,0


In [27]:
# Model selection: score every (vectorizer, classifier, text column) combination on a
# held-out split and remember the most accurate fitted pipeline.
from sklearn.base import clone

# Initializing a dictionary to map each combo to its score
scores = {}
max_combo, max_score = '',0
# Different preprocess algos to consider
preprocess_algos = ['Stemmed','Cleaned','Lemmatized']
# Different machine learning algos
# NOTE(review): RandomForestClassifier and SGDClassifier are created without a
# random_state, so their scores can differ from run to run.
algorithms = [RandomForestClassifier(),MultinomialNB(),KNeighborsClassifier(),SGDClassifier()]
# Different word embedding techniques
word_emb = [TfidfVectorizer(), CountVectorizer()]
best_model = ''

# Dividing the data into train and test.
# The split depends only on the text column (random_state is fixed), so it is computed
# once per column instead of once per combination.
splits = {
  preprocessor: train_test_split(df[preprocessor], df.n_labels, test_size=0.2, random_state=43, stratify=df.n_labels)
  for preprocessor in preprocess_algos
}

for word_em in word_emb:
  for model_algo in algorithms:
    for preprocessor in preprocess_algos:
      X_train, X_test, y_train, y_test = splits[preprocessor]
      # Developing a pipeline
      # Word Embeddings -> Smote (To balance the data) -> Machine Learning
      # Every pipeline gets its own unfitted copies of the steps. The entries of
      # word_emb / algorithms (and `over`) are shared across iterations; fitting them
      # in place would overwrite the fitted state of the pipeline already stored in
      # best_model, leaving it with a vectorizer and classifier from later combinations.
      clf = Pipeline([
        ('wb', clone(word_em)),
        ('sm', clone(over)),
        ('clf', clone(model_algo))
      ])
      clf.fit(X_train,y_train)
      y_pred = clf.predict(X_test)
      # Developing the report
      report = classification_report(y_test,y_pred,output_dict=True)
      combo = f"{preprocessor} + {model_algo} + {word_em}"
      scores[combo] = report['accuracy']
      # Storing the best model (strict '>' keeps the earliest combination on ties)
      if report['accuracy'] > max_score:
        max_score = report['accuracy']
        max_combo = combo
        best_model = clf



print(max_combo, max_score)
print(scores)


Stemmed + MultinomialNB() + TfidfVectorizer() 0.9090909090909091
{'Stemmed + RandomForestClassifier() + TfidfVectorizer()': 0.7424242424242424, 'Cleaned + RandomForestClassifier() + TfidfVectorizer()': 0.7121212121212122, 'Lemmatized + RandomForestClassifier() + TfidfVectorizer()': 0.7272727272727273, 'Stemmed + MultinomialNB() + TfidfVectorizer()': 0.9090909090909091, 'Cleaned + MultinomialNB() + TfidfVectorizer()': 0.8333333333333334, 'Lemmatized + MultinomialNB() + TfidfVectorizer()': 0.8333333333333334, 'Stemmed + KNeighborsClassifier() + TfidfVectorizer()': 0.7878787878787878, 'Cleaned + KNeighborsClassifier() + TfidfVectorizer()': 0.6818181818181818, 'Lemmatized + KNeighborsClassifier() + TfidfVectorizer()': 0.7121212121212122, 'Stemmed + SGDClassifier() + TfidfVectorizer()': 0.9090909090909091, 'Cleaned + SGDClassifier() + TfidfVectorizer()': 0.8484848484848485, 'Lemmatized + SGDClassifier() + TfidfVectorizer()': 0.8787878787878788, 'Stemmed + RandomForestClassifier() + CountVec

In [28]:
# Train the final classifier on the stemmed text: TF-IDF features, SMOTE balancing,
# then multinomial naive Bayes. Uses the same 80/20 stratified split as the search cell.
X_train, X_test, y_train, y_test = train_test_split(
  df['Stemmed'],
  df.n_labels,
  test_size=0.2,
  random_state=43,
  stratify=df.n_labels,
)

clf = Pipeline(steps=[('wb', TfidfVectorizer()), ('sm', over), ('clf', MultinomialNB())])
clf.fit(X_train, y_train)

# Per-class precision / recall / f1 on the held-out 20%
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.50      0.75      0.60         4
           1       0.60      1.00      0.75         3
           2       1.00      0.75      0.86         4
           3       1.00      1.00      1.00         4
           4       1.00      0.80      0.89         5
           5       1.00      1.00      1.00         5
           6       1.00      1.00      1.00         4
           7       1.00      1.00      1.00         4
           8       1.00      0.67      0.80         3
           9       1.00      0.75      0.86         4
          10       0.75      1.00      0.86         3
          11       1.00      1.00      1.00         4
          12       1.00      1.00      1.00         4
          13       1.00      0.75      0.86         4
          14       1.00      1.00      1.00         3
          15       1.00      1.00      1.00         4
          16       1.00      1.00      1.00         4

    accuracy              

In [34]:
# Sanity check on a sample prompt: class probabilities, index of the most likely
# class, and that index translated back to its label.
p = clf.predict_proba([preprocess('Hello who are you')])
print(p, np.argmax(p), labelled_data[np.argmax(p)], sep='\n')

[[0.0957032  0.26092109 0.05077774 0.0338293  0.05836082 0.04993282
  0.04466297 0.03272523 0.05198019 0.03430989 0.03899306 0.03382504
  0.04957859 0.0423742  0.0324946  0.03433114 0.05520012]]
1
name


In [35]:
# Sanity check on a short keyword-style prompt: probabilities, winning index, label.
p = clf.predict_proba([preprocess('library timings')])
print(p, np.argmax(p), labelled_data[np.argmax(p)], sep='\n')

[[0.03561409 0.03403605 0.0477227  0.03468138 0.03398498 0.03282164
  0.39524843 0.03386364 0.08407447 0.03503498 0.03454002 0.03467824
  0.03228528 0.03251909 0.03369185 0.03341149 0.03179168]]
6
library


In [39]:
# Sanity check on a prompt containing a misspelling ('scolarship' -- presumably
# deliberate, to see how the model copes): probabilities, winning index, label.
p = clf.predict_proba([preprocess('Help with the scolarship')])
print(p, np.argmax(p), labelled_data[np.argmax(p)], sep='\n')

[[0.04170691 0.04147458 0.06213583 0.04977953 0.05069507 0.06529785
  0.05623343 0.05932852 0.08854534 0.04859532 0.1011276  0.04600091
  0.05008224 0.05060061 0.04573244 0.06317254 0.07949129]]
10
scholarship


In [38]:
# Sanity check on gibberish input: probabilities, winning index, label.
# In the recorded run every class receives the same probability, so np.argmax falls
# back to index 0 and the prompt is reported as the first label.
p = clf.predict_proba([preprocess('dggdn')])
print(p, np.argmax(p), labelled_data[np.argmax(p)], sep='\n')

[[0.05882353 0.05882353 0.05882353 0.05882353 0.05882353 0.05882353
  0.05882353 0.05882353 0.05882353 0.05882353 0.05882353 0.05882353
  0.05882353 0.05882353 0.05882353 0.05882353 0.05882353]]
0
greeting


In [None]:
# Saving the model to a file
import pickle
filename = 'finalized_model.sav'
# The context manager closes (and flushes) the file handle even if pickling fails;
# the bare open() used before left the handle open.
# NOTE: pickle files can execute arbitrary code when loaded -- only load this file
# from a trusted location.
with open(filename, 'wb') as model_file:
  pickle.dump(clf, model_file)