## <b>Import libraries<b>

In [1]:
# Keras
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding

## Plot
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
import matplotlib as plt

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Other
import re
import string
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
XTrain = pd.read_csv('c:/Users/I051796/Projects/CES/data/input_train.csv', sep=';')
YTrain = pd.read_csv('c:/Users/I051796/Projects/CES/data/label.csv', sep=';')
num_classes = 51

In [3]:
import nltk
nltk.download('stopwords')
    
def clean_text(text):
    
    # Clean the text
    text = re.sub(r"\?\?", "?", text)
    text = re.sub("qu'", "que ", text)
    text = re.sub("n°", "numéro", text)
    
    #text = re.sub("..", ".", text)
    #text = re.sub("...", ".", text)
    text = re.sub(r"([^\s])\?", r"\1 ?", text) # stick question mark
    text = re.sub("-", " ", text)
    text = re.sub(r"([A-Za-zèéàê])('|’)", "", text)
    
    # mixed numbers+letters like: 6jours => jours (keep only the letters)
    text = re.sub(r"([0-9]+)([A-Za-zèéàêô]+)", r"\2", text)
    text = re.sub(r"([A-Za-zèéàêô]+)([0-9]+)", r"\2", text)
    
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("french"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    
    text = " ".join(text)    
    # dont stem as FastText can cope with morphological variability
    #text = text.split()
    #stemmer = SnowballStemmer('french')
    #stemmed_words = [stemmer.stem(word) for word in text]
    #text = " ".join(stemmed_words)

    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jacques\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
pd.set_option("display.max_colwidth",300)
XTrain['cleanQuestion'] = XTrain['question'].map(lambda x: clean_text(x))

In [5]:
XTrain.head(50)

Unnamed: 0,ID,question,cleanQuestion
0,0,"bonjour, je m suis trompé de forum pour ma question alors je la repose ici. je pris pour la première fois hier du paroxétine et ce matin c'est une catastrophe. picotement dasn tous le corps annonciateur de sueur froide très très massive et de vomissement. j'en suis à deux crises depuis 5 heure ...","bonjour, trompé forum question alors repose ici. pris première fois hier paroxétine matin catastrophe. picotement dasn tous corps annonciateur sueur froide très très massive vomissement. deux crises depuis heure mat. cela semble passer mains reste moites chaude estce normal première fois merci tous"
1,1,est ce que le motilium me soulagera contre les nausées?,motilium soulagera contre les nausées
2,2,"mon médecin m'a prescrit adenyl. au 2ème cachet des maux de tête terribles et au 3ème palpitations, sueurs froides, chaleur intense dans la tête, tremblements, fourmillements dans la lèvre supérieure, difficultés à respirer.. dès l'arrêt du médicament tous les symptômes ont disparu. cela est-il ...","médecin prescrit adenyl. ème cachet maux tête terribles ème palpitations, sueurs froides, chaleur intense tête, tremblements, fourmillements lèvre supérieure, difficultés respirer.. dès arrêt médicament tous les symptômes disparu. cela déjà arrivé quelque"
3,3,Est-ce qu'il existe une forme adaptée aux enfant de 5ans du Micropakine ?,existe forme adaptée enfant ans micropakine
4,4,mon medecin me soigne pour une rhino pharingite et m'a prescrit du amoxicilline comme anti biotique. Est-ce vraiment pour cette indication?,medecin soigne rhino pharingite prescrit amoxicilline comme anti biotique. vraiment cette indication
5,5,je viens d'apprendre que je suis enceinte..savez-vous si je peux poursuivre le rubozinc ?,viens apprendre enceinte..savez peux poursuivre rubozinc
6,6,atrax n'est-il pas dangereux au long terme ?,atrax dangereux long terme
7,7,je suis sous mercilon. J'ai des nausées et des saignements ?,sous mercilon. nausées saignements
8,8,"L'atenolol, c'est quoi ?","atenolol, quoi"
9,9,"je prend trinordiol et à la fin de ma première plaquette j'ai eu ma première fois ac mn copain. la pilule est donc bien efficace dès le debut ? j'ai des douleur dans la poitrine, j'ai l'impression qu'ils ont un peu grossi ossi de plus, j'ai mal o bas du ventr,e dois je minkiété ou c simplemen, l...","prend trinordiol fin première plaquette première fois copain. pilule donc bien efficace dès debut douleur poitrine, impression ils peu grossi ossi plus, mal bas ventr,e dois minkiété simplemen, periode dadaptation"


###  <b>Build neural network with LSTM and FastText embedding<b>

In [6]:
vocabulary_size = 12000
padding = 150 # TODO look at the length mean/median
embed_out_size = 300

tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(XTrain['cleanQuestion'])

sequences = tokenizer.texts_to_sequences(XTrain['cleanQuestion'])
XEncodedTrain = pad_sequences(sequences, maxlen=padding)

In [7]:
word_index = tokenizer.word_index
word_index

{'les': 1,
 'depuis': 2,
 'peut': 3,
 'prendre': 4,
 'pilule': 5,
 'jours': 6,
 'plus': 7,
 'effets': 8,
 'faire': 9,
 'mois': 10,
 'sous': 11,
 'fait': 12,
 'savoir': 13,
 'prends': 14,
 'temps': 15,
 'pris': 16,
 'vaccin': 17,
 'avoir': 18,
 'grossesse': 19,
 'prise': 20,
 'règles': 21,
 'prend': 22,
 'jour': 23,
 'plaquette': 24,
 'dois': 25,
 'combien': 26,
 'normal': 27,
 'contre': 28,
 'pendant': 29,
 'traitement': 30,
 'après': 31,
 'deroxat': 32,
 'quels': 33,
 'quel': 34,
 'enceinte': 35,
 'risque': 36,
 'secondaires': 37,
 'effet': 38,
 'quand': 39,
 'donc': 40,
 'pillule': 41,
 'quoi': 42,
 'comment': 43,
 'soir': 44,
 'quelle': 45,
 'comme': 46,
 'cela': 47,
 'puis': 48,
 'sans': 49,
 'semaine': 50,
 'être': 51,
 'prescrit': 52,
 'ans': 53,
 'quelque': 54,
 'tout': 55,
 'deux': 56,
 'comprimé': 57,
 'bien': 58,
 'mal': 59,
 'merci': 60,
 'bonjour': 61,
 'fois': 62,
 'matin': 63,
 'peu': 64,
 'cette': 65,
 'possible': 66,
 'regles': 67,
 'faut': 68,
 'car': 69,
 'avant': 70,

In [8]:
import os, re, csv, math, codecs
from tqdm import tqdm

embeddings_index = {}
f = codecs.open('../../WordEmbeddings/fasttext/wiki.fr.vec', encoding='utf-8')
for line in tqdm(f):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

1152466it [02:30, 7654.57it/s]


In [9]:
nb_words = min(vocabulary_size, len(word_index))
len(word_index)

9451

In [10]:
import re

misSpelledCount = 0
misSpelledMap = {}


isNumber = re.compile('^[0-9]+$')
numberCount = 0

isDuration = re.compile('^(h|mn|s)$')
durationCount = 0

isTime = re.compile('^[0-9][0-9](h)[0-9][0-9]$')
timeCount = 0

isWeight = re.compile('^(mg|kg|kgs|g)$')
weightCount = 0

isVolume = re.compile('^(ml|l|cl)$')
volumeCount = 0

drugs = pd.read_csv('c:/Users/I051796/Projects/ANSM/drugs.txt', header=None, index_col=None)
drugs = set(drugs[0].values)
drugCount = 0

def isDrug(word):
    w = re.sub(r"é", "e", word)
    w = re.sub(r"è", "e", w)
    w = re.sub(r"à", "a", w)
    w = w.upper()
    return w in drugs
    
words_not_found = []
embedding_matrix = np.zeros((nb_words+1, embed_out_size))

%timeit
for word, i in word_index.items():
    if i >= nb_words:
        continue
                
    embedding_vector = embeddings_index.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        # words not found in embedding index will be all-zeros. RANDOM?
        embedding_matrix[i] = embedding_vector        
    else:
        # number term => 'nombre'
        # 'XX(h|mn|s) => 'durée'        
        # 'XX(mg|g|kg|kgs)' => 'poids'        
        # fallback: '[0-9]+(YY)' => YY        
        # in drug_list => 'médicament'
        # misspelled?   
                
        '''    
        if isDuration.match(word):
            embedding_matrix[i] = embeddings_index.get('durée') 
            durationCount += 1
        elif isTime.match(word):
            embedding_matrix[i] = embeddings_index.get('temps') 
            timeCount += 1            
        elif isWeight.match(word):
            embedding_matrix[i] = embeddings_index.get('poids') 
            weightCount += 1
        elif isVolume.match(word):
            embedding_matrix[i] = embeddings_index.get('volume') 
            volumeCount += 1            
        '''    
        if isDrug(word):
            embedding_matrix[i] = embeddings_index.get('médicament') 
            drugCount += 1            
        elif isNumber.match(word):
            embedding_matrix[i] = embeddings_index.get('nombre') 
            numberCount += 1            
        else:
            words_not_found.append(word)
            
print('wordCount:{0}, unknownCount:{1}, numberCount:{2}, durationCount:{3}, weightCount:{4}, drugCount:{5}, timeCount:{6}, volumeCount:{7}'
      .format(len(word_index), len(words_not_found), numberCount, durationCount, weightCount, drugCount, timeCount, volumeCount))

wordCount:9451, unknownCount:1404, numberCount:114, durationCount:0, weightCount:0, drugCount:694, timeCount:0, volumeCount:0


In [11]:
pd.DataFrame(words_not_found).to_csv('C:/Users/I051796/Projects/CES/temp/unknown.txt', index=None, header=None)
words_not_found

['aujourhui',
 'qest',
 'sterilet',
 'cycléane',
 'gygy',
 'lumalia',
 'lyoc',
 'seroquel',
 'cycleane',
 'indesirables',
 'preservatif',
 'gelules',
 'holgyeme',
 'noroxine',
 'préscrit',
 'rubeole',
 'dexorat',
 'anxiete',
 'gelsenium',
 'pilulle',
 'apetit',
 'noctran',
 'tricilest',
 'implanon',
 'granions',
 'homeopathique',
 'depresseur',
 'calcibronate',
 'gellules',
 'seropam',
 'somnifere',
 'mepronizine',
 'nausees',
 'deprime',
 'stodal',
 'microdosée',
 'homeopathie',
 'androcure',
 'holgyème',
 'efflexor',
 'sertaline',
 'déremboursé',
 'spottings',
 'demangeaisons',
 'minidosée',
 'homeopathiques',
 'corticoide',
 'diarrhés',
 'bichromicum',
 'myolastan',
 'infarix',
 'grossese',
 'douleureuses',
 'lisanxia',
 'orgastoro',
 'remicad',
 'efexor',
 'temestat',
 'lysopaine',
 'colliques',
 'lexomyl',
 'qon',
 'sipralexa',
 'équanil',
 'seconadire',
 'indesirable',
 'anxiolitique',
 'prescit',
 'vacin',
 'trinordiole',
 'oestrogène',
 'luthényl',
 'hépatire',
 'phosphorica',


In [12]:

from keras import backend as K
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

In [13]:
import tensorflow as tf
from keras.layers import CuDNNLSTM
from keras.layers import Bidirectional

#config = tf.ConfigProto(device_count={"CPU": 32})

gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)
config=tf.ConfigProto(gpu_options=gpu_options,allow_soft_placement=True)

keras.backend.tensorflow_backend.set_session(tf.Session(config=config))

model_lstm = Sequential()

model_lstm.add(Embedding(len(word_index)+1, embed_out_size,
          weights=[embedding_matrix], input_length=padding, trainable=False))

model_lstm.add(Bidirectional(CuDNNLSTM(embed_out_size)))
model_lstm.add(Dense(num_classes, activation='softmax'))
model_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 300)          2835600   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 600)               1444800   
_________________________________________________________________
dense_1 (Dense)              (None, 51)                30651     
Total params: 4,311,051
Trainable params: 1,475,451
Non-trainable params: 2,835,600
_________________________________________________________________


import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
from keras.utils.vis_utils import plot_model
plot_model(model_lstm, to_file='c:/Users/I051796/Projects/CES/report/model.png', show_shapes=True)

#pd.DataFrame(XEncodedTrain).to_csv('c:/Users/I051796/Projects/CES/notebooks/XEncodedTrain.txt', index=False, header=False)
XEncodedTrain = pd.read_csv('c:/Users/I051796/Projects/CES/notebooks/XEncodedTrain.txt', header=None)
XEncodedTrain = XEncodedTrain.values
XEncodedTrain.shape

### Train the network

In [14]:
YOneHotEncodedTrain = keras.utils.to_categorical(YTrain['intention'])
print(YOneHotEncodedTrain)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [15]:
model_lstm.fit(XEncodedTrain, np.array(YOneHotEncodedTrain), validation_split=0.2, epochs=10, verbose=1)

Train on 6422 samples, validate on 1606 samples
Epoch 1/10


UnknownError: Fail to find the dnn implementation.
	 [[Node: bidirectional_1/CudnnRNN = CudnnRNN[T=DT_FLOAT, _class=["loc:@training/Adam/gradients/bidirectional_1/CudnnRNN_grad/CudnnRNNBackprop"], direction="unidirectional", dropout=0, input_mode="linear_input", is_training=true, rnn_mode="lstm", seed=87654321, seed2=0, _device="/job:localhost/replica:0/task:0/device:GPU:0"](bidirectional_1/transpose, bidirectional_1/ExpandDims_1, bidirectional_1/ExpandDims_1, bidirectional_1/concat)]]
	 [[Node: metrics/acc/Mean/_111 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_1274_metrics/acc/Mean", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

In [None]:
model_lstm.history.history

##  <b>Build neural network with LSTM and CNN <b>
The LSTM model worked well. However, it takes forever to train three epochs. One way to speed up the training time is to improve the network adding “Convolutional” layer. Convolutional Neural Networks (CNN) come from image processing. They pass a “filter” over the data and calculate a higher-level representation. They have been shown to work surprisingly well for text, even though they have none of the sequence processing ability of LSTMs.

In [None]:
def create_conv_model():
    model_conv = Sequential()
    model_conv.add(Embedding(vocabulary_size, embed_out_size, input_length=padding))
    model_conv.add(Dropout(0.2))
    model_conv.add(Conv1D(64, 5, activation='relu'))
    model_conv.add(MaxPooling1D(pool_size=4))
    model_conv.add(LSTM(100))
    model_conv.add(Dense(num_classes, activation='softmax'))
    model_conv.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model_conv 

In [None]:
model_conv = create_conv_model()
model_conv.fit(XEncodedTrain, np.array(YOneHotEncodedTrain), validation_split=0.2, epochs = 5)