In [1]:
import numpy as np 
import pandas as pd 
import time 
import gensim

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.models import Sequential, Model
from keras.layers import Convolution2D, MaxPool2D, Convolution1D, MaxPool1D
from keras.layers import Activation, Dropout, Input, Embedding, Flatten, Dense
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Reshape, Concatenate
from keras.regularizers import l1,l2
from keras.optimizers import Adam, Adadelta, SGD, RMSprop
from keras.losses import sparse_categorical_crossentropy

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Data source path 
data_source_path = "../../DATA/"

In [3]:
# Import data
X = pd.read_csv(data_source_path+'clean_data/cleaning_plus_lemmatizing_input_train.csv', sep=";", index_col=0)
y = pd.read_csv(data_source_path+'output_train.csv', sep=";", index_col=0)

not_null_indexes = X['question'].notnull()
X = X[not_null_indexes]
y = y[not_null_indexes]

features = X.columns
targets = y['intention'].unique()

In [4]:
fasttext_model = gensim.models.KeyedVectors.load_word2vec_format(fname =data_source_path+'trained_vectors/custom_word2vec/fr_lemmatizing_no_accent.bin', 
                                                                fvocab=data_source_path+'trained_vectors/custom_word2vec/fr_vocab_lemmatizing_no_accent.txt', 
                                                                binary=True)

In [5]:
vocabulary  = fasttext_model.vocab
word2idx = dict([(k, v.index) for k, v in vocabulary.items()])
#word2idx['unknown'] = len(vocabulary)

idx2word = dict([(v.index, k) for k, v in vocabulary.items()])
#word2idx[len(vocabulary)] = 'unknown'

In [6]:
MAX_NB_WORDS = len(word2idx)

In [7]:
print('Found %s unique tokens.' % len(idx2word))

Found 1306 unique tokens.


In [8]:
# Find max size of the sequences 
MAX_SEQUENCE_LENGTH = 0 
for sentence in X['question']:
    if MAX_SEQUENCE_LENGTH<len(sentence.split()):
        MAX_SEQUENCE_LENGTH = len(sentence.split())
MAX_SEQUENCE_LENGTH

383

In [51]:
fasttext_model.word_vec('malad')

TypeError: word_vec() missing 1 required positional argument: 'word'

In [49]:
# Preprocess text fo feed the net 
sequences = []
for sentence in X['question']:
    token_list = []
    for word in sentence.split():
        if word in word2idx.keys():
            token_list.append(fasttext_model.word_vec(word2idx[word]))
    sequences.append(token_list)
X_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

KeyError: "word '64' not in vocabulary"

In [10]:
X_train, X_test,y_train, y_test= train_test_split(X_sequences,y.values)

In [11]:
X_train.shape

(6020, 383)

In [12]:
INPUT_SHAPE = (MAX_SEQUENCE_LENGTH,)
EMBEDDING_DIM = 50

FILTER_SIZES = {3,4,5}
HIDDEN_DIMS = 50
NUMBER_FILTERS = 10 


In [18]:
#Regularization parameters
adam = Adam(lr=1e-2, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
adadelta = Adadelta(decay=1e-5)
DROPOUT_PERCENTS = [0.2, 0.5]
weights = fasttext_model.syn0
EPOCHS = 1000
BATCH_SIZE = 1000

  """


# TODO List :
- [  ] article sur le pooling : diminue le nb de params et 
- [  ] comment bien régulariser
- [  ] comment tuner son optimizer

In [31]:
# 2/3* sup layer + output size 
HIDDEN_DIMS = int(2/3*383+51)


In [40]:
X_train.shape

(6020, 383)

In [46]:
input_layer = Input(shape = (MAX_SEQUENCE_LENGTH,))
'''embedding_layer = Embedding(input_dim=len(idx2word), 
                            output_dim=weights.shape[1], 
                            weights=[weights],
                            trainable=True)(input_layer)
'''
drop_1 = Dropout(DROPOUT_PERCENTS[0])(input_layer)
conv_layer = Convolution1D(filters=NUMBER_FILTERS,
                     kernel_size=5,
                     padding="same",
                     activation="relu",
                     strides=3)(drop_1)
max_layer = MaxPool1D(pool_size=2)(conv_layer)
drop_2 = Dropout(DROPOUT_PERCENTS[1])(max_layer)
dense_layer = Dense(HIDDEN_DIMS, activation="relu")(input_layer)
dense_layer_2 = Dense(int(2/3*HIDDEN_DIMS+51), activation="relu")(dense_layer)
output_layer = Dense(51, activation="softmax")(dense_layer_2)

model_2 = Model(input_layer, output_layer)
print(model_2.summary())
model_2.compile(loss=sparse_categorical_crossentropy, optimizer=adam, metrics=['accuracy'])
early_stop = EarlyStopping(monitor='val_loss', patience=2, verbose=1,  mode='auto')
checkpoint_callback = ModelCheckpoint('DATA/CNN_weights/weights.{epoch:03d}-{val_acc:.4f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
model_2.fit(X_train.T, y_train,
            validation_data=(X_test.T, y_test), 
            epochs=EPOCHS,
            shuffle=True,
            batch_size=BATCH_SIZE,
            verbose=1)

ValueError: Input 0 is incompatible with layer conv1d_20: expected ndim=3, found ndim=2