## <b>Import libraries</b>

In [1]:
# Keras
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

## Plot
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
import matplotlib as plt

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Other
import re
import string
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import os

# Directory holding the training CSVs. It defaults to the original author's
# location; set the CES_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIR = os.environ.get('CES_DATA_DIR', 'c:/Users/I051796/Projects/CES/data')

# Both files are semicolon-separated.
XTrain = pd.read_csv(os.path.join(DATA_DIR, 'input_train.csv'), sep=';')
YTrain = pd.read_csv(os.path.join(DATA_DIR, 'label.csv'), sep=';')

# Width of the softmax output layer used by the models below.
num_classes = 51

In [3]:
import nltk
# Fetch the NLTK stop-word corpus; clean_text reads it through
# stopwords.words("french").
nltk.download('stopwords')
    
def clean_text(text):
    """Normalise one French question before tokenisation.

    Steps, in order: delete punctuation (apostrophes are handled last),
    lower-case, drop French stop words and tokens shorter than three
    characters, strip a letter directly followed by an apostrophe
    (elisions such as ``l'`` or ``d'``), remove any apostrophe left over,
    and Snowball-stem every remaining word.

    Parameters
    ----------
    text : str
        Raw question text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    ## Remove punctuation
    # str.translate needs a table built by str.maketrans. Passing
    # string.punctuation itself is treated as a lookup indexed by code point:
    # every punctuation mark is left in place and control characters are
    # rewritten instead (a newline becomes "+"). The apostrophe is kept at
    # this stage so the elision regex below can still see "l'", "d'", ...
    punctuation_table = str.maketrans("", "", string.punctuation.replace("'", ""))
    text = text.translate(punctuation_table)

    ## Convert words to lower case and split them
    words = text.lower().split()

    ## Remove stop words and tokens shorter than three characters
    stops = set(stopwords.words("french"))
    words = [w for w in words if w not in stops and len(w) >= 3]

    text = " ".join(words)

    # Clean the text: drop a letter immediately followed by an apostrophe,
    # then discard any apostrophe the pattern did not consume.
    text = re.sub(r"([A-Z]|[a-z])'", "", text)
    text = text.replace("'", "")

    ## Stem each remaining word
    stemmer = SnowballStemmer('french')
    stemmed_words = [stemmer.stem(word) for word in text.split()]

    return " ".join(stemmed_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\I051796\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Run clean_text over every question and store the result back in the same
# column; the function is passed directly, so no lambda wrapper is needed.
XTrain['question'] = XTrain['question'].map(clean_text)

In [7]:
# Hyper-parameters reused by the tokenizer, the padding step and the
# embedding layers of the models.
vocabulary_size, padding, embed_out_size = 10000, 100, 100

# Fit the word index on the cleaned questions.
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(XTrain['question'])

# Encode each question as a list of word indices, then bring every list to
# length `padding`.
sequences = tokenizer.texts_to_sequences(XTrain['question'])
XEncodedTrain = pad_sequences(sequences=sequences, maxlen=padding)

###  <b>Build neural network with LSTM</b>

In [19]:
import tensorflow as tf

# Install a TensorFlow session built from this config as the Keras backend session.
# NOTE(review): device_count caps the number of devices of a given type rather
# than the number of threads - confirm this is the intended setting.
config = tf.ConfigProto(device_count={"CPU": 32})
keras.backend.tensorflow_backend.set_session(tf.Session(config=config))

# Embedding -> LSTM (with input and recurrent dropout) -> softmax classifier.
model_lstm = Sequential([
    Embedding(vocabulary_size, embed_out_size, input_length=padding),
    LSTM(embed_out_size, dropout=0.2, recurrent_dropout=0.2),
    Dense(num_classes, activation='softmax'),
])
model_lstm.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model_lstm.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 51)                5151      
Total params: 1,085,551
Trainable params: 1,085,551
Non-trainable params: 0
_________________________________________________________________


### Train the network

In [20]:
# One-hot encode the intent labels. num_classes is passed explicitly so the
# matrix always has as many columns as the softmax layer has units, even when
# the highest class id does not occur in the data.
YOneHotEncodedTrain = keras.utils.to_categorical(YTrain['intention'], num_classes=num_classes)
print(YOneHotEncodedTrain)


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [21]:
import keras.utils

# Train the LSTM model for 5 epochs with a 20% validation split.
model_lstm.fit(
    x=XEncodedTrain,
    y=np.array(YOneHotEncodedTrain),
    epochs=5,
    validation_split=0.2,
)



Train on 6422 samples, validate on 1606 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x26a036362b0>

##  <b>Build neural network with LSTM and CNN</b>
The LSTM model worked well. However, it takes a very long time to train, even for just five epochs. One way to speed up training is to add a convolutional layer to the network. Convolutional Neural Networks (CNNs) come from image processing: they pass a “filter” over the data and compute a higher-level representation. They have been shown to work surprisingly well for text, even though they have none of the sequence-processing ability of LSTMs.

In [22]:
def create_conv_model():
    """Build and compile the convolution + LSTM classifier.

    Layer stack: Embedding -> Dropout(0.2) -> Conv1D(64 filters, kernel
    size 5, ReLU) -> MaxPooling1D(pool_size=4) -> LSTM(100) ->
    Dense(num_classes, softmax).

    Reads the notebook-level ``vocabulary_size``, ``embed_out_size``,
    ``padding`` and ``num_classes``.

    Returns
    -------
    Sequential
        The model, compiled with the Adam optimizer, categorical
        cross-entropy loss and the accuracy metric.
    """
    layers = [
        Embedding(vocabulary_size, embed_out_size, input_length=padding),
        Dropout(0.2),
        Conv1D(64, 5, activation='relu'),
        MaxPooling1D(pool_size=4),
        LSTM(100),
        Dense(num_classes, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [23]:
# Build the convolution + LSTM model and train it with the same settings as
# the plain LSTM model (5 epochs, 20% validation split).
model_conv = create_conv_model()
model_conv.fit(
    x=XEncodedTrain,
    y=np.array(YOneHotEncodedTrain),
    epochs=5,
    validation_split=0.2,
)

Train on 6422 samples, validate on 1606 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x26a11c57860>