## Deep Learning training with CNN and an extended FastText word embedding
We test here the combination of:
* an embedding layer initialized with a pretrained fastText word embedding, extended with a custom embedding model
* a CNN layer to capture word sequence representation
* a dense layer to learn classification

We leverage the pre-trained fastText word embedding model graciously provided by Facebook at [https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md](https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md)

This general-purpose word embedding model is extended with a custom model generated by the [fasttext_embedding_extension_builder.ipynb](fasttext_embedding_extension_builder.ipynb) notebook.

In [9]:
# Keras
import keras
from keras.preprocessing.text import Tokenizer
import numpy as np
import pandas as pd

# Cache mapping each word to its embedding vector; it starts empty and is
# filled by the cells that load the pretrained and the custom vectors.
embeddings_index = {}

In [5]:
from sklearn.model_selection import train_test_split

# Read the question texts and their labels (the two CSV files use different separators).
questions_frame = pd.read_csv('../../data/staging_data/mispelling_fixed_clean_input_train.csv', sep=',')
labels_frame = pd.read_csv('../../data/POSOS/label.csv', sep=';')

# Hold out 15% of the rows; the fixed seed keeps the split reproducible.
XTrain, XTest, YTrain, YTest = train_test_split(
    questions_frame,
    labels_frame,
    test_size=0.15,
    random_state=42,
)

# Number of target classes; used as the width of the softmax output layer.
num_classes = 51

In [6]:
from keras.preprocessing.sequence import pad_sequences

# Text-encoding settings.
vocabulary_size = 10000     # passed to the Tokenizer as `num_words`
sequence_length = 30        # every encoded question is padded/truncated to this length
embedding_out_dims = 300    # width of one embedding vector

# Learn the word -> index mapping from the training questions only.
train_questions = XTrain['question']
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(train_questions)

# Turn each question into a fixed-length sequence of word indices.
sequences = tokenizer.texts_to_sequences(train_questions)
XEncodedTrain = pad_sequences(sequences, maxlen=sequence_length)

In [10]:
# load the French fasttext model
import os, re, csv, math, codecs
from tqdm import tqdm

# Load the pretrained vectors only when the cache is still empty, so that
# re-running this cell does not repeat the (slow) parsing.
if len(embeddings_index) == 0:
    # `with` closes the file even if parsing raises. The built-in open() with
    # newline='\n' breaks lines on '\n' only, whereas iterating a codecs.open()
    # stream also cuts lines on other Unicode line boundaries (e.g. U+0085,
    # U+2028), which would produce truncated records.
    with open('../../pretrained_models/fasttext/wiki.fr.vec',
              encoding='utf-8', newline='\n') as vec_file:
        first_line = vec_file.readline()
        header = first_line.split()
        if len(header) == 2 and all(field.isdecimal() for field in header):
            # fastText .vec header: "<number of words> <vector dimension>".
            # It is metadata, not a word, so it must not end up in the index.
            vector_dims = int(header[1])
        else:
            # No header: infer the dimension from the first record and rewind
            # so that this record is parsed like every other one.
            vector_dims = len(first_line.rstrip().split(' ')) - 1
            vec_file.seek(0)

        for line in tqdm(vec_file):
            # Split from the right so the last `vector_dims` fields are always
            # the coefficients, even when the token itself contains a space.
            values = line.rstrip().rsplit(' ', vector_dims)
            if len(values) != vector_dims + 1:
                # Malformed or truncated record: skip it rather than store a
                # vector of the wrong length.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

1152466it [02:24, 7967.94it/s]


In [11]:
# %load ../utils/serializer.py

import csv

def saveEmbeddingVector(vectors, fileName):
    """Write a key -> numeric-sequence mapping to `fileName` as a two-column CSV.

    Each row holds the key followed by a single field containing the
    sequence's components joined with commas. An existing file is overwritten.
    """
    rows = (
        [word, ",".join(map(str, components))]
        for word, components in vectors.items()
    )
    # newline='' leaves line-ending handling to the csv writer.
    with open(fileName, 'w', newline='') as out_file:
        csv.writer(out_file).writerows(rows)

def loadEmbeddingVector(fileName):
    """Load a key -> list-of-floats mapping from a two-column CSV file.

    The expected layout is the one written by saveEmbeddingVector: each row
    holds a key and one field with the comma-joined vector components.

    Args:
        fileName: path of the CSV file to read.

    Returns:
        dict mapping each key to a list of floats. Blank rows are ignored and
        an empty vector field yields an empty list. When a key occurs several
        times, the last row wins.

    Raises:
        ValueError: if a non-blank row does not have exactly two fields, or a
            component cannot be parsed as a float.
    """
    vectors = {}
    # newline='' lets the csv module interpret line endings itself, as the csv
    # documentation recommends for files handed to csv.reader.
    with open(fileName, 'r', newline='') as csv_file:
        for row in csv.reader(csv_file):
            if not row:
                # csv.reader yields [] for a blank line; nothing to load.
                continue
            key, joined = row
            # ''.split(',') would give [''] and float('') raises, so an empty
            # field is mapped to an empty vector explicitly.
            vectors[key] = [float(item) for item in joined.split(',')] if joined else []
    return vectors

In [12]:
# Overlay the custom vectors (file produced by the embedding-extension builder
# notebook) on the pretrained ones; when both define the same word, the custom
# entry replaces the pretrained one.
customEmbedding = loadEmbeddingVector('../../pretrained_models/fasttext_embedding_extension.txt')
embeddings_index.update(customEmbedding)

In [13]:
# Highest word index that gets a row in the matrix.
nb_words = min(vocabulary_size, len(tokenizer.word_index))

# Row i holds the vector of the word whose tokenizer index is i. Rows that are
# never assigned (row 0 in particular, since Keras tokenizer indices start at 1)
# stay all-zero.
embedding_matrix = np.zeros((nb_words+1, embedding_out_dims))

for word, i in tokenizer.word_index.items():
    # The matrix has rows 0..nb_words, so index nb_words is still valid; only
    # strictly larger indices fall outside. (The previous `>=` test skipped the
    # word at index nb_words and left its row at zero.)
    if i > nb_words:
        continue

    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        embedding_matrix[i] = embedding_vector
    else:
        # A word without a usable vector aborts the cell instead of silently
        # keeping a zero row.
        raise Exception("unknown word=" + word)

### <b>Build neural network with CNN and FastText embedding</b>

In [42]:
import keras
import tensorflow as tf
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, Convolution1D, MaxPooling1D, Dropout, Flatten, Input, Concatenate
from keras.layers.embeddings import Embedding

# hyper-parameters
drop_ratio = 0.15                       # dropout rate used after the embedding and after the concatenation
num_filters = 4                         # filters per convolution branch
hidden_dims = 40                        # units of the hidden dense layer
kernel_sizes = [2, 4, 8 , 10, 15 , 20]  # one convolution branch per kernel size
batch_size = 12
num_epochs= 80
pool_size = 2
strides = 2                             # stride of every convolution

#config = tf.ConfigProto(device_count={"CPU": 32})
# tensor flow technical setting
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8, allow_growth = True)
config=tf.ConfigProto(gpu_options=gpu_options,allow_soft_placement=True)
keras.backend.tensorflow_backend.set_session(tf.Session(config=config))

# One sample = one padded sequence of word indices.
input_layer = Input(shape=(sequence_length, ))

# Frozen embedding initialised from the prepared weight matrix. input_dim is
# read from the matrix itself so that the layer and its weights always agree,
# including when the tokenizer vocabulary is larger than the matrix.
embedding_layer = Embedding(
                    input_dim=embedding_matrix.shape[0],
                    output_dim=embedding_out_dims,
                    weights=[embedding_matrix],
                    input_length=sequence_length,
                    trainable=False) (input_layer)

dropout_layer_1 = Dropout(drop_ratio) (embedding_layer)

# parallel convolution/pooling/flatten branches with different kernel size
conv_blocks = []
for kernel_size in kernel_sizes:
    conv_layer = Convolution1D(
                    filters=num_filters,
                    kernel_size=kernel_size,
                    strides=strides,
                    activation='relu') (dropout_layer_1)

    pooling_layer = MaxPooling1D(pool_size=pool_size) (conv_layer)

    flatten_layer = Flatten() (pooling_layer)

    conv_blocks.append(flatten_layer)

# Merge the flattened outputs of all branches into one feature vector.
concat_layer = Concatenate() (conv_blocks)

dropout_layer_2 = Dropout(drop_ratio) (concat_layer)

hidden_dense_layer = Dense(hidden_dims, activation="relu") (dropout_layer_2)

# One softmax unit per class.
class_dense_layer = Dense(num_classes, activation='softmax') (hidden_dense_layer)

model_conv = Model(input_layer, class_dense_layer)
model_conv.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model_conv.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           (None, 30)           0                                            
__________________________________________________________________________________________________
embedding_12 (Embedding)        (None, 30, 300)      2468400     input_12[0][0]                   
__________________________________________________________________________________________________
dropout_18 (Dropout)            (None, 30, 300)      0           embedding_12[0][0]               
__________________________________________________________________________________________________
conv1d_56 (Conv1D)              (None, 15, 4)        2404        dropout_18[0][0]                 
__________________________________________________________________________________________________
conv1d_57 

### Train the network

In [43]:
# One-hot encode the training labels. num_classes pins the encoding width to
# the size of the model's softmax layer; without it the width is inferred from
# the largest label present in this split and can come out too small.
YOneHotEncodedTrain = keras.utils.to_categorical(YTrain['intention'], num_classes=num_classes)

# Keras uses the last 15% of the training samples for validation at each epoch.
model_conv.fit(XEncodedTrain, YOneHotEncodedTrain, validation_split=0.15, epochs = num_epochs, batch_size=batch_size, verbose=2)

Train on 5799 samples, validate on 1024 samples
Epoch 1/80
 - 5s - loss: 3.1742 - acc: 0.2221 - val_loss: 2.9420 - val_acc: 0.2676
Epoch 2/80
 - 4s - loss: 2.7209 - acc: 0.3054 - val_loss: 2.6653 - val_acc: 0.3252
Epoch 3/80
 - 4s - loss: 2.4164 - acc: 0.3782 - val_loss: 2.4829 - val_acc: 0.3623
Epoch 4/80
 - 4s - loss: 2.1501 - acc: 0.4347 - val_loss: 2.3955 - val_acc: 0.3896
Epoch 5/80
 - 4s - loss: 1.9632 - acc: 0.4713 - val_loss: 2.3453 - val_acc: 0.3936
Epoch 6/80
 - 4s - loss: 1.8043 - acc: 0.5094 - val_loss: 2.3471 - val_acc: 0.4043
Epoch 7/80
 - 4s - loss: 1.6531 - acc: 0.5494 - val_loss: 2.3170 - val_acc: 0.4219
Epoch 8/80
 - 4s - loss: 1.5723 - acc: 0.5634 - val_loss: 2.3298 - val_acc: 0.4385
Epoch 9/80
 - 4s - loss: 1.4616 - acc: 0.5917 - val_loss: 2.3615 - val_acc: 0.4492
Epoch 10/80
 - 4s - loss: 1.3717 - acc: 0.6082 - val_loss: 2.3516 - val_acc: 0.4375
Epoch 11/80
 - 4s - loss: 1.3219 - acc: 0.6287 - val_loss: 2.3853 - val_acc: 0.4531
Epoch 12/80
 - 4s - loss: 1.2441 - ac

<keras.callbacks.History at 0x14d27462898>