In [90]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
from CustomTokenizer import CustomTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint


In [91]:
def load_dataset(filename):
    df = pd.read_csv(filename, encoding = "latin1", names = ["Pregunta", "Intencion"], sep='|')
    print(df.head())
    intent = df["Intencion"]
    unique_intent = list(set(intent))
    sentences = list(df["Pregunta"])

    return (intent, unique_intent, sentences)
  


In [92]:
intent, unique_intent, sentences = load_dataset("train.csv")

                                            Pregunta  Intencion
0                                           Pregunta  Intencion
1               como puedo trabajar en santander rio    Cat_102
2                pagar tarjeta visa querer reintegro    Cat_350
3                      pagar tarjeta naranja sistema    Cat_132
4  no se debitÃ³ la primera cuota del plan de bie...    Cat_129


In [93]:
print(sentences[:5])

['Pregunta', 'como puedo trabajar en santander rio', 'pagar tarjeta visa querer reintegro', 'pagar tarjeta naranja sistema', 'no se debitÃ³ la primera cuota del plan de bienes personales y quiero saber por que']


In [94]:
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to C:\Users\Jose
[nltk_data]     Ferrer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Jose
[nltk_data]     Ferrer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [95]:
#define stemmer
stemmer = LancasterStemmer()

In [96]:
def cleaning(sentences):
    words = []
    for s in sentences:
        clean = re.sub(r'[^ a-z A-Z 0-9]', " ", s)
        w = word_tokenize(clean)
        #stemming
        words.append([i.lower() for i in w])
    
    return words  

In [97]:
cleaned_words = cleaning(sentences)
print(len(cleaned_words))
print(cleaned_words[:2])  
  


20105
[['pregunta'], ['como', 'puedo', 'trabajar', 'en', 'santander', 'rio']]


In [98]:
def create_tokenizer(words, filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
    token = Tokenizer(filters = filters)
    token.fit_on_texts(words)
    return token

In [99]:
def max_length(words):
    return(len(max(words, key = len)))
  

In [100]:
word_tokenizer = create_tokenizer(cleaned_words)
vocab_size = len(word_tokenizer.word_index) + 1
max_length = max_length(cleaned_words)

print("Vocab Size = %d and Maximum length = %d" % (vocab_size, max_length))

Vocab Size = 6117 and Maximum length = 46


In [101]:
def encoding_doc(token, words):
    return(token.texts_to_sequences(words))

In [102]:
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)

In [103]:
def padding_doc(encoded_doc, max_length):
    return(pad_sequences(encoded_doc, maxlen = max_length, padding = "post"))

In [104]:
padded_doc = padding_doc(encoded_doc, max_length)

In [105]:
padded_doc[:5]

array([[1516,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [  14,   17,  949,    9,   43,  129,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [  36,    5,   38,   77,  467,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [  36,    5, 1041,  322,    0,    0,    0,

In [106]:
print("Shape of padded docs = ",padded_doc.shape)

Shape of padded docs =  (20105, 46)


In [107]:
#tokenizer with filter changed
output_tokenizer = create_tokenizer(unique_intent, filters = '!"#$%&()*+,-/:;<=>?@[\]^`{|}~')


In [108]:
output_tokenizer.word_index

{'cat_289': 1,
 'cat_335': 2,
 'cat_22': 3,
 'cat_118': 4,
 'cat_352': 5,
 'cat_225': 6,
 'cat_280': 7,
 'cat_72': 8,
 'cat_207': 9,
 'cat_41': 10,
 'cat_169': 11,
 'cat_92': 12,
 'cat_132': 13,
 'cat_127': 14,
 'cat_185': 15,
 'cat_348': 16,
 'cat_115': 17,
 'cat_143': 18,
 'cat_259': 19,
 'cat_208': 20,
 'cat_328': 21,
 'cat_141': 22,
 'cat_278': 23,
 'cat_290': 24,
 'cat_160': 25,
 'cat_347': 26,
 'cat_75': 27,
 'cat_34': 28,
 'cat_255': 29,
 'cat_71': 30,
 'cat_60': 31,
 'cat_183': 32,
 'cat_206': 33,
 'cat_139': 34,
 'cat_32': 35,
 'cat_247': 36,
 'cat_298': 37,
 'cat_288': 38,
 'cat_173': 39,
 'cat_336': 40,
 'cat_38': 41,
 'cat_220': 42,
 'cat_201': 43,
 'cat_197': 44,
 'cat_7': 45,
 'cat_309': 46,
 'cat_346': 47,
 'cat_216': 48,
 'cat_212': 49,
 'cat_334': 50,
 'cat_331': 51,
 'cat_159': 52,
 'cat_9': 53,
 'cat_339': 54,
 'cat_117': 55,
 'cat_230': 56,
 'cat_214': 57,
 'cat_241': 58,
 'cat_320': 59,
 'cat_359': 60,
 'cat_266': 61,
 'cat_49': 62,
 'cat_179': 63,
 'cat_191': 64,


In [109]:
encoded_output = encoding_doc(output_tokenizer, intent)

In [110]:
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)

In [111]:
encoded_output.shape

(20105, 1)

In [112]:
def one_hot(encode):
    o = OneHotEncoder(sparse = False)
    return(o.fit_transform(encode))

In [113]:
output_one_hot = one_hot(encoded_output)

In [114]:
output_one_hot.shape

(20105, 353)

In [115]:
from sklearn.model_selection import train_test_split

In [116]:
#train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2)
train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2)
stopwords = nltk.corpus.stopwords.words('spanish')
tokenizer = CustomTokenizer()
tfidf_vect = TfidfVectorizer(lowercase=True, 
                             stop_words=stopwords,                              
                             strip_accents='ascii', 
                             tokenizer=tokenizer,
                             ngram_range= (1,2),
                             sublinear_tf=True,
                             analyzer='word',
                             token_pattern="[\w']+")

In [117]:
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

Shape of train_X = (16084, 46) and train_Y = (16084, 353)
Shape of val_X = (4021, 46) and val_Y = (4021, 353)


In [118]:
def create_model(vocab_size, max_length):
    model = Sequential()
    model.add(Embedding(vocab_size, 128, input_length = max_length, trainable = False))
    model.add(Bidirectional(LSTM(128)))
#   model.add(LSTM(128))
    model.add(Dense(32, activation = "relu"))
    model.add(Dropout(0.5))
    model.add(Dense(353, activation = "softmax"))
  
    return model

In [119]:
model = create_model(vocab_size, max_length)

model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 46, 128)           782976    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_5 (Dense)              (None, 32)                8224      
_________________________________________________________________
dropout_3 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 353)               11649     
Total params: 1,066,017
Trainable params: 283,041
Non-trainable params: 782,976
_________________________________________________________________


In [120]:
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
#train_X = tfidf_vect.fit_transform(train_X)
#train_Y = tfidf_vect.transform(train_Y)
hist = model.fit(train_X, train_Y, epochs = 200, batch_size = 64, validation_data = (val_X, val_Y), callbacks = [checkpoint])

Train on 16084 samples, validate on 4021 samples
Epoch 1/200

Epoch 00001: val_loss improved from inf to 5.14964, saving model to model.h5
Epoch 2/200

Epoch 00002: val_loss improved from 5.14964 to 5.06956, saving model to model.h5
Epoch 3/200

Epoch 00003: val_loss improved from 5.06956 to 4.96534, saving model to model.h5
Epoch 4/200

Epoch 00004: val_loss improved from 4.96534 to 4.79121, saving model to model.h5
Epoch 5/200

Epoch 00005: val_loss improved from 4.79121 to 4.67326, saving model to model.h5
Epoch 6/200

Epoch 00006: val_loss improved from 4.67326 to 4.58099, saving model to model.h5
Epoch 7/200

Epoch 00007: val_loss improved from 4.58099 to 4.46934, saving model to model.h5
Epoch 8/200

Epoch 00008: val_loss improved from 4.46934 to 4.35696, saving model to model.h5
Epoch 9/200

Epoch 00009: val_loss improved from 4.35696 to 4.30182, saving model to model.h5
Epoch 10/200

Epoch 00010: val_loss improved from 4.30182 to 4.23635, saving model to model.h5
Epoch 11/200



KeyboardInterrupt: 

In [None]:
 model = load_model("model.h5")

In [None]:
def predictions(text):
    clean = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
    test_word = word_tokenize(clean)
    test_word = [w.lower() for w in test_word]
    test_ls = word_tokenizer.texts_to_sequences(test_word)
    print(test_word)
    #Check for unknown words
    if [] in test_ls:
        test_ls = list(filter(None, test_ls))
      
    test_ls = np.array(test_ls).reshape(1, len(test_ls))
    
    x = padding_doc(test_ls, max_length)
    
    pred = model.predict_proba(x)
    
    
    return pred


  

In [None]:
def get_final_output(pred, classes):
    predictions = pred[0]
    
    classes = np.array(classes)
    ids = np.argsort(-predictions)
    classes = classes[ids]
    predictions = -np.sort(-predictions)
    
    for i in range(pred.shape[1]):
        print("%s has confidence = %s" % (classes[i], (predictions[i])))



In [None]:
text = "Can you help me?"
pred = predictions(text)
get_final_output(pred, unique_intent)