<a href="https://colab.research.google.com/github/Dark-Sied/Intent_Classification/blob/master/Intent_classification_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.


In [286]:
def load_dataset(filename):
  df = pd.read_csv(filename, encoding = "latin1", names = ["Sentence", "Intent"])
  print(df.head())
  intent = df["Intent"]
  unique_intent = list(set(intent))
  sentences = list(df["Sentence"])
  
  return (intent, unique_intent, sentences)
  


In [287]:
intent, unique_intent, sentences = load_dataset("Dataset.csv")

                Sentence          Intent
0       Need help pleese  commonQ.assist
1              Need help  commonQ.assist
2       I need some info  commonQ.assist
3      Will you help me?  commonQ.assist
4  What else can you do?  commonQ.assist


In [288]:
print(sentences[:12])

['Need help pleese', 'Need help', 'I need some info', 'Will you help me?', 'What else can you do?', 'What do you do?', 'What can you help me with?', 'What can you do?', 'What are you good at?', 'Need help plz', 'How can you help me', 'Can you help me?']


In [289]:
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kushagramittal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kushagramittal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [290]:
#define stemmer
stemmer = LancasterStemmer()

In [291]:
def cleaning(sentences):
  words = []
  for s in sentences:
    clean = re.sub(r'[^ a-z A-Z 0-9]', " ", s)
    w = word_tokenize(clean)
    #stemming
    words.append([i.lower() for i in w])
    
  return words  

In [292]:
cleaned_words = cleaning(sentences)
print(len(cleaned_words))
print(cleaned_words[:3])  
  


1175
[['need', 'help', 'pleese'], ['need', 'help'], ['i', 'need', 'some', 'info']]


In [293]:
def create_tokenizer(words, filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
  token = Tokenizer(filters = filters)
  token.fit_on_texts(words)
  return token

In [294]:
def max_length(words):
  return(len(max(words, key = len)))
  

In [295]:
word_tokenizer = create_tokenizer(cleaned_words)
vocab_size = len(word_tokenizer.word_index) + 1
max_length = max_length(cleaned_words)

print("Vocab Size = %d and Maximum length = %d" % (vocab_size, max_length))

Vocab Size = 509 and Maximum length = 28


In [296]:
def encoding_doc(token, words):
  return(token.texts_to_sequences(words))

In [297]:
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

In [298]:
def padding_doc(encoded_doc, max_length):
  return(pad_sequences(encoded_doc, maxlen = max_length, padding = "post"))

In [299]:
padded_doc = padding_doc(encoded_doc, max_length)

In [300]:
padded_doc[:6]

array([[ 25,  80, 348,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [ 25,  80,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  1,  25, 210, 192,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [ 51,  12,  80,  15,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  8, 284,   4,  12,  31,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  8,  31,  12,  31,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0]], dtype=int

In [301]:
print("Shape of padded docs = ",padded_doc.shape)

Shape of padded docs =  (1175, 28)


In [302]:
#tokenizer with filter changed
output_tokenizer = create_tokenizer(unique_intent, filters = '!"#$%&()*+,-/:;<=>?@[\]^`{|}~')


In [303]:
output_tokenizer.word_index

{'faq.biz_new': 1,
 'faq.approval_time': 2,
 'faq.borrow_use': 3,
 'details.array': 4,
 'details.array.storage': 5,
 'commonq.how': 6,
 'faq.biz_simpler': 7,
 'faq.application_process': 8,
 'details.array.cache': 9,
 'faq.borrow_limit': 10,
 'commonq.wait': 11,
 'faq.aadhaar_missing': 12,
 'contact.contact': 13,
 'commonq.just_details': 14,
 'faq.biz_category_missing': 15,
 'commonq.bot': 16,
 'details.array.recommdation': 17,
 'faq.bad_service': 18,
 'commonq.query': 19,
 'faq.banking_option_missing': 20,
 'commonq.name': 21,
 'faq.address_proof': 22,
 'commonq.not_giving': 23,
 'commonq.assist': 24,
 'faq.apply_register': 25}

In [304]:
encoded_output = encoding_doc(output_tokenizer, intent)

In [305]:
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [306]:
encoded_output.shape

(1175, 1)

In [307]:
def one_hot(encode):
  o = OneHotEncoder(sparse = False)
  return(o.fit_transform(encode))

In [308]:
output_one_hot = one_hot(encoded_output)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [309]:
output_one_hot.shape

(1175, 25)

In [310]:
from sklearn.model_selection import train_test_split

In [311]:
train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2)


In [312]:
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

Shape of train_X = (940, 28) and train_Y = (940, 25)
Shape of val_X = (235, 28) and val_Y = (235, 25)


In [313]:
def create_model(vocab_size, max_length):
  model = Sequential()
  model.add(Embedding(vocab_size, 128, input_length = max_length, trainable = False))
  model.add(Bidirectional(LSTM(128)))
#   model.add(LSTM(128))
  model.add(Dense(32, activation = "relu"))
  model.add(Dropout(0.5))
  model.add(Dense(25, activation = "softmax"))
  
  return model

In [314]:
model = create_model(vocab_size, max_length)

model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 28, 128)           65152     
_________________________________________________________________
bidirectional_8 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_15 (Dense)             (None, 32)                8224      
_________________________________________________________________
dropout_8 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_16 (Dense)             (None, 25)                825       
Total params: 337,369
Trainable params: 272,217
Non-trainable params: 65,152
_________________________________________________________________


In [315]:
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

hist = model.fit(train_X, train_Y, epochs = 100, batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint])

Train on 940 samples, validate on 235 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 2.99269, saving model to model.h5
Epoch 2/100

Epoch 00002: val_loss improved from 2.99269 to 2.85012, saving model to model.h5
Epoch 3/100

Epoch 00003: val_loss improved from 2.85012 to 2.82728, saving model to model.h5
Epoch 4/100

Epoch 00004: val_loss improved from 2.82728 to 2.68699, saving model to model.h5
Epoch 5/100

Epoch 00005: val_loss improved from 2.68699 to 2.56874, saving model to model.h5
Epoch 6/100

Epoch 00006: val_loss improved from 2.56874 to 2.49247, saving model to model.h5
Epoch 7/100

Epoch 00007: val_loss improved from 2.49247 to 2.41796, saving model to model.h5
Epoch 8/100

Epoch 00008: val_loss improved from 2.41796 to 2.32276, saving model to model.h5
Epoch 9/100

Epoch 00009: val_loss did not improve from 2.32276
Epoch 10/100

Epoch 00010: val_loss improved from 2.32276 to 2.17415, saving model to model.h5
Epoch 11/100

Epoch 00011: val_loss improved fr


Epoch 00041: val_loss improved from 1.08021 to 1.07390, saving model to model.h5
Epoch 42/100

Epoch 00042: val_loss improved from 1.07390 to 1.07255, saving model to model.h5
Epoch 43/100

Epoch 00043: val_loss did not improve from 1.07255
Epoch 44/100

Epoch 00044: val_loss did not improve from 1.07255
Epoch 45/100

Epoch 00045: val_loss improved from 1.07255 to 1.04146, saving model to model.h5
Epoch 46/100

Epoch 00046: val_loss did not improve from 1.04146
Epoch 47/100

Epoch 00047: val_loss did not improve from 1.04146
Epoch 48/100

Epoch 00048: val_loss improved from 1.04146 to 1.01129, saving model to model.h5
Epoch 49/100

Epoch 00049: val_loss did not improve from 1.01129
Epoch 50/100

Epoch 00050: val_loss did not improve from 1.01129
Epoch 51/100

Epoch 00051: val_loss improved from 1.01129 to 0.94733, saving model to model.h5
Epoch 52/100

Epoch 00052: val_loss improved from 0.94733 to 0.92397, saving model to model.h5
Epoch 53/100

Epoch 00053: val_loss did not improve f


Epoch 00084: val_loss did not improve from 0.92397
Epoch 85/100

Epoch 00085: val_loss did not improve from 0.92397
Epoch 86/100

Epoch 00086: val_loss did not improve from 0.92397
Epoch 87/100

Epoch 00087: val_loss improved from 0.92397 to 0.85517, saving model to model.h5
Epoch 88/100

Epoch 00088: val_loss did not improve from 0.85517
Epoch 89/100

Epoch 00089: val_loss did not improve from 0.85517
Epoch 90/100

Epoch 00090: val_loss did not improve from 0.85517
Epoch 91/100

Epoch 00091: val_loss did not improve from 0.85517
Epoch 92/100

Epoch 00092: val_loss did not improve from 0.85517
Epoch 93/100

Epoch 00093: val_loss did not improve from 0.85517
Epoch 94/100

Epoch 00094: val_loss did not improve from 0.85517
Epoch 95/100

Epoch 00095: val_loss did not improve from 0.85517
Epoch 96/100

Epoch 00096: val_loss did not improve from 0.85517
Epoch 97/100

Epoch 00097: val_loss did not improve from 0.85517
Epoch 98/100

Epoch 00098: val_loss did not improve from 0.85517
Epoch 99

In [316]:
 model = load_model("model.h5") 

In [317]:
def predictions(text):
  clean = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
  test_word = word_tokenize(clean)
  test_word = [w.lower() for w in test_word]
  test_ls = word_tokenizer.texts_to_sequences(test_word)
  print(test_word)
  #Check for unknown words
  if [] in test_ls:
    test_ls = list(filter(None, test_ls))
    
  test_ls = np.array(test_ls).reshape(1, len(test_ls))
 
  x = padding_doc(test_ls, max_length)
  
  pred = model.predict_proba(x)
  
  
  return pred


  

In [318]:
def get_final_output(pred, classes):
  predictions = pred[0]
 
  classes = np.array(classes)
  ids = np.argsort(-predictions)
  classes = classes[ids]
  predictions = -np.sort(-predictions)
 
  for i in range(pred.shape[1]):
    print("%s has confidence = %s" % (classes[i], (predictions[i])))



In [320]:
text = "cache?"
pred = predictions(text)
print(get_final_output(pred, unique_intent))
print(pred)

['cache']
details.array.cache has confidence = 0.18447202
details.array has confidence = 0.13132222
commonQ.not_giving has confidence = 0.08407373
faq.aadhaar_missing has confidence = 0.07887857
commonQ.name has confidence = 0.067223
commonQ.assist has confidence = 0.049920447
commonQ.bot has confidence = 0.04466487
faq.biz_category_missing has confidence = 0.04363832
faq.apply_register has confidence = 0.041020535
faq.bad_service has confidence = 0.034653425
commonQ.just_details has confidence = 0.03195386
faq.borrow_limit has confidence = 0.025213074
commonQ.how has confidence = 0.024166733
details.array.recommdation has confidence = 0.022585457
details.array.storage has confidence = 0.022389323
commonQ.wait has confidence = 0.019807538
faq.address_proof has confidence = 0.018387036
faq.biz_new has confidence = 0.018269338
contact.contact has confidence = 0.014878928
faq.banking_option_missing has confidence = 0.013836922
commonQ.query has confidence = 0.012808132
faq.application_pro