### Pre-requisite installations

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Models Training

In [2]:
import json
import random
import numpy as np
import pandas as pd
from string import punctuation
from nltk import word_tokenize
from sklearn.model_selection import train_test_split

In [3]:
# For randomization and re-producability of results
random.seed(123)
np.random.seed(123)

In [None]:
run_results=pd.DataFrame(columns=['Classifier', 'Mean Fit Time(s)', 'Mean Test Time(s)', 
                'Mean Train Score', 'Mean CV Score', 'Best Train Score','Test Score','F1 Score'])

In [None]:
# Using Glove embeddings
embeddings_size=100
glove_path='/content/drive/MyDrive/Colab Notebooks/models/glove.6B.%dd.txt'%embeddings_size

In [None]:
embeddings_index = dict()
with open(glove_path) as gfile:
    for line in gfile:
        values = line.split()
        word, vectors = values[0], np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = vectors

In [None]:
file_path = '/content/drive/MyDrive/Colab Notebooks/VICCI/data/generated_train_data.json'
training_data = None
with open(file_path, 'r') as file:
    training_data = json.load(file)

In [None]:
queries, intents = [], []
for train_set in training_data:
    for query in train_set['query']:
        queries.append(query)
        intents.append(train_set['intent'])

In [None]:
# Training data shape
len(queries), len(intents)

(900, 900)

In [None]:
queries_train, queries_test, intents_train, intents_test = train_test_split( queries, 
                        intents, train_size=0.7, random_state=123, stratify=intents)

In [None]:
# Train and test set shape
len(queries_train), len(queries_test), len(intents_train), len(intents_test)

(630, 270, 630, 270)

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# We don't want to exclude stopwords as questions in chat are short and crisp and 
# words like "what" and "not" carry lot of weightage, but word_tokenizer treats the 
# sentence ending punctuations as separate tokens which have to be removed
tfidf = TfidfVectorizer(max_features=600, encoding='latin-1', sublinear_tf=True, lowercase=True,
                        tokenizer=word_tokenize, ngram_range=(1,2), 
                        stop_words=list(punctuation), token_pattern=None)

In [None]:
tfidf.fit(queries_train)

  'stop_words.' % sorted(inconsistent))


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='latin-1',
                input='content', lowercase=True, max_df=1.0, max_features=600,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words=['!', '"', '#', '$', '%', '&', "'", '(', ')', '*',
                            '+', ',', '-', '.', '/', ':', ';', '<', '=', '>',
                            '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', ...],
                strip_accents=None, sublinear_tf=True, token_pattern=None,
                tokenizer=<function word_tokenize at 0x7f0733ace200>,
                use_idf=True, vocabulary=None)

In [None]:
tfidf_dict = dict(zip(tfidf.get_feature_names(), list(tfidf.idf_)))
tfidf_feat = tfidf.get_feature_names()

In [None]:
# We have to calculate the tf-Idf weighted average of the glove embeddings
tfidf_weighted_glove_train = []
for query in queries_train:
    tokens = [tokn.lower() for tokn in word_tokenize(query) if tokn not in list(punctuation)]
    query_vec = np.zeros(embeddings_size)
    weight_sum = 0
    for tokn in tokens:
        if tokn in embeddings_index and tokn in tfidf_dict:
            vec = embeddings_index[tokn]
            # the tf-Idf score of a word in query is pumped up based on the ratio of its
            # count in the query to the total query length  
            score = tfidf_dict[tokn]*((tokens.count(tokn)/len(tokens))+1)
            query_vec += (vec * score)
            weight_sum += score
        else:
            # print(tokn)
            pass
    
    if weight_sum != 0:
        query_vec /= weight_sum
    tfidf_weighted_glove_train.append(query_vec)
tfidf_weighted_glove_train = np.array(tfidf_weighted_glove_train)

In [None]:
# Similar vectorization for the test data
tfidf_weighted_glove_test = []
for query in queries_test:
    tokens = [tokn.lower() for tokn in word_tokenize(query) if tokn not in list(punctuation)]
    query_vec = np.zeros(embeddings_size)
    weight_sum = 0
    for tokn in tokens:
        if tokn in embeddings_index and tokn in tfidf_dict:
            vec = embeddings_index[tokn]
            score = tfidf_dict[tokn]*((tokens.count(tokn)/len(tokens))+1)
            query_vec += (vec * score)
            weight_sum += score
        else:
            # print(tokn)
            pass
    
    if weight_sum != 0:
        query_vec /= weight_sum
    tfidf_weighted_glove_test.append(query_vec)
tfidf_weighted_glove_test = np.array(tfidf_weighted_glove_test)

In [None]:
# Total feature length after conactenating both Tf-Idf and Weighted Glove
len(tfidf_feat)+tfidf_weighted_glove_train.shape[1]

700

In [None]:
X_train = np.hstack((tfidf.transform(queries_train).todense(), tfidf_weighted_glove_train))
X_test = np.hstack((tfidf.transform(queries_test).todense(), tfidf_weighted_glove_test))

In [None]:
X_train.shape, X_test.shape

((630, 700), (270, 700))

In [None]:
lbencoder = LabelEncoder()
lbencoder.fit(intents_train)

LabelEncoder()

In [None]:
Y_train = lbencoder.transform(intents_train)
Y_test = lbencoder.transform(intents_test)

In [None]:
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, classification_report

In [None]:
def classifier_analyzer(classifier, params):
    ss = StratifiedShuffleSplit(n_splits=5, test_size=0.25, random_state=123)
    # we are explicitly passing StratifiedShuffleSplit because we want the CV data to
    # shuffles in each split which is not the default behaviour of GridSearchCV 
    gsCV = GridSearchCV(classifier, params, scoring='accuracy', n_jobs=-1, refit=True, 
                                    cv=ss, return_train_score=True)
    gscv_result = gsCV.fit(X_train, Y_train).cv_results_
    print("Mean fit time : %.3fs" % gscv_result['mean_fit_time'].mean())
    print("Mean test time : %.3fs" % gscv_result['mean_score_time'].mean())
    print("Mean train score : %.3f" % gscv_result['mean_train_score'].mean())
    print("Mean CV score : %.3f" % gscv_result['mean_test_score'].mean())
    
    # Get the train score on the best estimator
    print("Best Train Score : %.3f" % accuracy_score(Y_train, gsCV.predict(X_train)))

    # Get the test score on the best estimator
    Y_pred = gsCV.predict(X_test)
    print("Best Test Score  : %.3f" % accuracy_score(Y_test, Y_pred))
    
    print("Best params : ", gsCV.best_params_)
    return Y_pred

In [None]:
# Logistic Regression

lr_clf = LogisticRegression(random_state=123, n_jobs=-1)
# not all the combination of penalty and solver will be compatible so we define
# a list of params dict. First we fix the solver param, then go to fix C
lr_params = [{'penalty' : ['l2'], 
              'solver':['newton-cg', 'sag', 'lbfgs'] }, 
             {'penalty' : ['elasticnet'], 
              'solver':['saga'],
              'l1_ratio':[0, 0.25, 0.5, 0.75, 1]}]
Y_pred = classifier_analyzer(lr_clf, lr_params)

Mean fit time : 5.798s
Mean test time : 0.001s
Mean train score : 0.987
Mean CV score : 0.940
Best Train Score : 0.990
Best Test Score  : 0.978
Best params :  {'penalty': 'l2', 'solver': 'newton-cg'}


In [None]:
lr_clf = LogisticRegression(random_state=123, n_jobs=-1)

lr_params = [{'penalty' : ['l2'], 
              'solver':['newton-cg'],
              'C': [0.01, 0.1, 1, 10, 100, 500] } ]
              
Y_pred = classifier_analyzer(lr_clf, lr_params)

Mean fit time : 1.065s
Mean test time : 0.002s
Mean train score : 0.960
Mean CV score : 0.906
Best Train Score : 0.990
Best Test Score  : 0.978
Best params :  {'C': 1, 'penalty': 'l2', 'solver': 'newton-cg'}


In [None]:
print("Classification Report for the best params : ")
print(classification_report(Y_test, Y_pred, target_names=lbencoder.classes_))

Classification Report for the best params : 
                    precision    recall  f1-score   support

               bye       1.00      1.00      1.00        15
     covid_numbers       1.00      1.00      1.00        15
       covid_tests       1.00      1.00      1.00        15
   covid_treatment       0.88      1.00      0.94        15
     covid_vaccine       1.00      1.00      1.00        15
 definition_corona       1.00      1.00      1.00        15
  definition_covid       0.87      0.87      0.87        15
development_period       1.00      1.00      1.00        15
             greet       1.00      1.00      1.00        15
             intro       1.00      1.00      1.00        15
  longterm_effects       1.00      0.87      0.93        15
     post_symptoms       0.88      1.00      0.94        15
        protection       1.00      1.00      1.00        15
       risk_people       1.00      1.00      1.00        15
            spread       1.00      1.00      1.00     

In [None]:
run_results.loc[run_results.shape[0]]=['Logistic Reg', 1.065, 0.002, 0.960, 
                                       0.906, 0.990, 0.978, 0.98]

In [None]:
# KNN 

knn_clf = KNeighborsClassifier( n_jobs=-1)
knn_params = {'n_neighbors':[3,5,7,10,15], 
              'weights':['uniform','distance'], 
             'metric':['cosine','minkowski','euclidean']}

Y_pred = classifier_analyzer(knn_clf, knn_params)

Mean fit time : 0.019s
Mean test time : 0.141s
Mean train score : 0.944
Mean CV score : 0.882
Best Train Score : 0.994
Best Test Score  : 0.948
Best params :  {'metric': 'minkowski', 'n_neighbors': 3, 'weights': 'distance'}


In [None]:
print("Classification Report for the best params : ")
print(classification_report(Y_test, Y_pred, target_names=lbencoder.classes_))

Classification Report for the best params : 
                    precision    recall  f1-score   support

               bye       1.00      1.00      1.00        15
     covid_numbers       1.00      1.00      1.00        15
       covid_tests       1.00      0.93      0.97        15
   covid_treatment       0.82      0.93      0.87        15
     covid_vaccine       0.88      0.93      0.90        15
 definition_corona       1.00      1.00      1.00        15
  definition_covid       0.83      0.67      0.74        15
development_period       0.94      1.00      0.97        15
             greet       1.00      1.00      1.00        15
             intro       0.88      1.00      0.94        15
  longterm_effects       1.00      0.87      0.93        15
     post_symptoms       0.88      1.00      0.94        15
        protection       1.00      0.93      0.97        15
       risk_people       1.00      1.00      1.00        15
            spread       1.00      1.00      1.00     

In [None]:
run_results.loc[run_results.shape[0]]=['kNN', 0.019, 0.141, 0.944, 0.882, 
                                       0.994, 0.948, 0.95]

In [None]:
# SVM 

svm_clf = SVC(random_state=123)
svm_params = {'C':[0.001, 0.01, 0.1, 1, 10], 
              'kernel':['rbf', 'poly', 'sigmoid']}

Y_pred = classifier_analyzer(svm_clf, svm_params)

Mean fit time : 0.327s
Mean test time : 0.069s
Mean train score : 0.517
Mean CV score : 0.471
Best Train Score : 0.992
Best Test Score  : 0.974
Best params :  {'C': 10, 'kernel': 'rbf'}


In [None]:
print("Classification Report for the best params : ")
print(classification_report(Y_test, Y_pred, target_names=lbencoder.classes_))

Classification Report for the best params : 
                    precision    recall  f1-score   support

               bye       1.00      1.00      1.00        15
     covid_numbers       1.00      1.00      1.00        15
       covid_tests       1.00      1.00      1.00        15
   covid_treatment       0.83      1.00      0.91        15
     covid_vaccine       1.00      1.00      1.00        15
 definition_corona       1.00      1.00      1.00        15
  definition_covid       0.86      0.80      0.83        15
development_period       1.00      1.00      1.00        15
             greet       1.00      1.00      1.00        15
             intro       1.00      1.00      1.00        15
  longterm_effects       1.00      0.87      0.93        15
     post_symptoms       0.88      1.00      0.94        15
        protection       1.00      1.00      1.00        15
       risk_people       1.00      1.00      1.00        15
            spread       1.00      1.00      1.00     

In [None]:
run_results.loc[run_results.shape[0]]=['SVM', 0.327, 0.069, 0.517, 
                                       0.471, 0.992, 0.974, 0.97]

In [None]:
# SGD Classifier 

sgd_clf = SGDClassifier(early_stopping=False, n_jobs=-1, random_state=123)

sgd_params = {'loss': ['hinge', 'modified_huber'], 
              'penalty': ['l2', 'elasticnet'], 
              'max_iter': [100, 300, 500, 700],
              'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1], 
              'epsilon': [0.01, 0.05, 0.1]}

Y_pred = classifier_analyzer(sgd_clf, sgd_params)

Mean fit time : 0.385s
Mean test time : 0.001s
Mean train score : 0.936
Mean CV score : 0.882
Best Train Score : 0.989
Best Test Score  : 0.981
Best params :  {'alpha': 0.01, 'epsilon': 0.01, 'loss': 'modified_huber', 'max_iter': 100, 'penalty': 'l2'}


In [None]:
print("Classification Report for the best params : ")
print(classification_report(Y_test, Y_pred, target_names=lbencoder.classes_))

Classification Report for the best params : 
                    precision    recall  f1-score   support

               bye       1.00      1.00      1.00        15
     covid_numbers       0.94      1.00      0.97        15
       covid_tests       1.00      1.00      1.00        15
   covid_treatment       0.88      1.00      0.94        15
     covid_vaccine       1.00      1.00      1.00        15
 definition_corona       1.00      1.00      1.00        15
  definition_covid       0.93      0.87      0.90        15
development_period       1.00      1.00      1.00        15
             greet       1.00      1.00      1.00        15
             intro       1.00      1.00      1.00        15
  longterm_effects       1.00      0.87      0.93        15
     post_symptoms       1.00      0.93      0.97        15
        protection       1.00      1.00      1.00        15
       risk_people       1.00      1.00      1.00        15
            spread       0.94      1.00      0.97     

In [None]:
run_results.loc[run_results.shape[0]]=['SGD Classifier', 0.385, 0.001, 0.936, 
                                       0.882, 0.989, 0.981, 0.98]

In [None]:
# XGBoost 

xgb_clf = XGBClassifier(random_state=123, n_jobs=-1)

# First we fix the objective param then, others
xgb_params = [{'objective': ['binary:logistic', 'binary:hinge', 
                            'multi:softprob','multi:softmax']
              },{
                  'objective' : ['multi:softmax'],
                  'num_class' : [len(set(intents))]
              }]

Y_pred = classifier_analyzer(xgb_clf, xgb_params)

Mean fit time : 19.716s
Mean test time : 0.034s
Mean train score : 0.996
Mean CV score : 0.930
Best Train Score : 0.994
Best Test Score  : 0.959
Best params :  {'objective': 'binary:logistic'}


In [None]:
xgb_clf = XGBClassifier(objective='binary:logistic', random_state=123, n_jobs=-1)

# First we fix the objective param then, others
xgb_params = {
              'max_depth' : [3, 5, 7],
              'n_estimators':[5,10,20,35,60],
              'learning_rate' : [0.1, 0.2, 0.3, 0.5, 0.7]
            }
Y_pred = classifier_analyzer(xgb_clf, xgb_params)

Mean fit time : 5.342s
Mean test time : 0.017s
Mean train score : 0.995
Mean CV score : 0.917
Best Train Score : 0.994
Best Test Score  : 0.967
Best params :  {'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 60}
Mean fit time : 5.342s
Mean test time : 0.017s
Mean train score : 0.995
Mean CV score : 0.917
Best Train Score : 0.994
Best Test Score  : 0.967
Best params :  {'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 60}


In [None]:
print("Classification Report for the best params : ")
print(classification_report(Y_test, Y_pred, target_names=lbencoder.classes_))

Classification Report for the best params : 
                    precision    recall  f1-score   support

               bye       1.00      1.00      1.00        15
     covid_numbers       0.94      1.00      0.97        15
       covid_tests       1.00      1.00      1.00        15
   covid_treatment       0.83      1.00      0.91        15
     covid_vaccine       1.00      1.00      1.00        15
 definition_corona       1.00      1.00      1.00        15
  definition_covid       0.86      0.80      0.83        15
development_period       1.00      1.00      1.00        15
             greet       1.00      1.00      1.00        15
             intro       1.00      1.00      1.00        15
  longterm_effects       1.00      0.87      0.93        15
     post_symptoms       0.88      1.00      0.94        15
        protection       1.00      0.87      0.93        15
       risk_people       0.94      1.00      0.97        15
            spread       1.00      1.00      1.00     

In [None]:
run_results.loc[run_results.shape[0]]=['XGBoost', 5.342, 0.017, 0.995, 0.917, 
                                            0.994, 0.967, 0.97]

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# MultinomialNB cant take negative values 

scaler = MinMaxScaler()
scaler.fit(X_train)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [None]:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# MultiNomial naive bayes

mnb_clf = MultinomialNB()
mnb_params = {'alpha': [0.1, 0.3, 0.5, 0.7, 0.9, 1.0]}
Y_pred = classifier_analyzer(mnb_clf, mnb_params)

Mean fit time : 0.006s
Mean test time : 0.001s
Mean train score : 0.967
Mean CV score : 0.887
Best Train Score : 0.984
Best Test Score  : 0.930
Best params :  {'alpha': 0.1}


In [None]:
print("Classification Report for the best params : ")
print(classification_report(Y_test, Y_pred, target_names=lbencoder.classes_))

Classification Report for the best params : 
                    precision    recall  f1-score   support

               bye       1.00      1.00      1.00        15
     covid_numbers       1.00      1.00      1.00        15
       covid_tests       1.00      0.73      0.85        15
   covid_treatment       0.76      0.87      0.81        15
     covid_vaccine       0.93      0.93      0.93        15
 definition_corona       1.00      1.00      1.00        15
  definition_covid       0.82      0.60      0.69        15
development_period       0.88      1.00      0.94        15
             greet       1.00      1.00      1.00        15
             intro       1.00      1.00      1.00        15
  longterm_effects       0.93      0.87      0.90        15
     post_symptoms       0.94      1.00      0.97        15
        protection       1.00      0.93      0.97        15
       risk_people       0.93      0.93      0.93        15
            spread       0.88      1.00      0.94     

In [None]:
run_results.loc[run_results.shape[0]]=['MultiNomial NB', 0.006, 0.001, 0.967, 
                                                0.887, 0.984, 0.930, 0.93]

In [None]:
run_results.sort_values(by=['Test Score', 'F1 Score'])

Unnamed: 0,Classifier,Mean Fit Time(s),Mean Test Time(s),Mean Train Score,Mean CV Score,Best Train Score,Test Score,F1 Score
5,MultiNomial NB,0.006,0.001,0.967,0.887,0.984,0.93,0.93
1,kNN,0.019,0.141,0.944,0.882,0.994,0.948,0.95
4,XGBoost,5.342,0.017,0.995,0.917,0.994,0.967,0.97
2,SVM,0.327,0.069,0.517,0.471,0.992,0.974,0.97
0,Logistic Reg,1.065,0.002,0.96,0.906,0.99,0.978,0.98
3,SGD Classifier,0.385,0.001,0.936,0.882,0.989,0.981,0.98


In [None]:
inputs = ["what are the tests available for covid?", "bye", 
          "after how much time do I see the symptoms?", "That's great.",
          "how do i protect myself?", "what is covid-19?",
          "ok. what are the vaccines available?", 
          "i am looking for vaccination. i need help",
          "how many people have suffered?"]

In [None]:
sgd_clf = SGDClassifier(alpha=0.01, epsilon=0.01, loss='modified_huber', max_iter=100,
                        penalty='l2', early_stopping=False, n_jobs=-1, 
                        random_state=123)
sgd_clf.fit(X_train, Y_train)

SGDClassifier(alpha=0.01, average=False, class_weight=None,
              early_stopping=False, epsilon=0.01, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='modified_huber',
              max_iter=100, n_iter_no_change=5, n_jobs=-1, penalty='l2',
              power_t=0.5, random_state=123, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [None]:
for inp in inputs:
    tokens = [tokn.lower() for tokn in word_tokenize(inp) if tokn not in list(punctuation)]
    query_vec = np.zeros(embeddings_size)
    weight_sum = 0
    for tokn in tokens:
        if tokn in embeddings_index and tokn in tfidf_dict:
            vec = embeddings_index[tokn]
            score = tfidf_dict[tokn]*((tokens.count(tokn)/len(tokens))+1)
            query_vec += (vec * score)
            weight_sum += score
        else:
            # print(tokn)
            pass

    if weight_sum != 0:
        query_vec /= weight_sum

    pred = sgd_clf.predict_proba(np.hstack((tfidf.transform([inp]).todense(), 
                                           query_vec.reshape(1,-1))))
    tag = lbencoder.inverse_transform([pred.argmax()])[0]
    print(inp," - ",tag," - ",pred[0][pred.argmax()])

what are the tests available for covid?  -  covid_tests  -  0.9027125161518191
bye  -  bye  -  1.0
after how much time do I see the symptoms?  -  development_period  -  1.0
That's great.  -  thanks  -  1.0
how do i protect myself?  -  protection  -  1.0
what is covid-19?  -  definition_covid  -  0.5159454543520243
ok. what are the vaccines available?  -  covid_vaccine  -  1.0
i am looking for vaccination. i need help  -  vaccination_slot  -  1.0
how many people have suffered?  -  covid_numbers  -  0.36272658171682653
