In [12]:
import os
import sys
import re
import pickle
import numpy as np
import pandas as pd

In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Activation
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [6]:
import nltk # the natural langauage toolkit, open-source NLP
from nltk.corpus import stopwords  
from gensim import parsing # Help in preprocessing the data, very efficiently
import gensim
import os

In [8]:
DIR_DATA = os.path.abspath('..\Data')
MAX_SEQUENCE_LENGTH = 100
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 300
TEST_SPLIT = 0.1
VALIDATION_SPLIT = 0.1
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
label_dict = {}
labels=[]

In [9]:
def clean_str(string):
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) #remplacement
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
#string.strip(): Leading whitepsace are removed

In [10]:
def loadData(filename):
    df = pd.read_csv(DIR_DATA + filename,delimiter=';')
    selected = ['label', 'text']
    non_selected = list(set(df.columns) - set(selected))
    df = df.drop(non_selected, axis=1)
    df = df.dropna(axis=0, how='any', subset=selected)
    labels = sorted(list(set(df[selected[0]].tolist())))
    #labels=['negative', 'neutre', 'positive']
    for i in range(len(labels)):
        label_dict[labels[i]] = i
        #label_dict={'negative': 0, 'neutre': 1, 'positive': 2}
    x_train = df[selected[1]].apply(lambda x: clean_str(x)).tolist()
    y_train = df[selected[0]].apply(lambda y: label_dict[y]).tolist()
    y_train = to_categorical(np.asarray(y_train))
    #to_categorical: Converts a class vector (integers) to binary class matrix
    return x_train,y_train

In [7]:
def createVocabAndData(sentences):
    tokenizer.fit_on_texts(sentences)
    sequences = tokenizer.texts_to_sequences(sentences)
    vocab = tokenizer.word_index
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    return vocab,data

In [8]:
def lstmModel(embedding_matrix,epoch):
    model = Sequential() # configure the model for training
    n, embedding_dims = embedding_matrix.shape
    # n taille du vocabulaire du dataset et embedding_dims est la taille du vecteur de chaque mot(300) selon glove
    
    model.add(Embedding(n, embedding_dims, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
    model.add(LSTM(128, dropout=0.6, recurrent_dropout=0.6))
    model.add(Dense(2))
    model.add(Activation('softmax'))
    # add layers

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # loss='categorical_crossentropy' for a multi-class classification problem
    print(model.summary())
    
    model.fit(X_train, y_train, validation_split=VALIDATION_SPLIT, epochs=epoch, batch_size=128)
    model.save_weights('text_lstm_weights.h5')

    scores= model.evaluate(X_test, y_test, verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
    return model

In [13]:
sentences, labels = loadData('\service5.csv')

In [10]:
#nltk.download('punkt')
w =[nltk.word_tokenize(sent) for sent in sentences]

In [11]:
from gensim.models import word2vec
model = word2vec.Word2Vec(w, size=300, min_count = 1, window = 5)
# To make the model memory efficient
model.init_sims(replace=True)
pretrained_weights = model.wv.syn0

  """


In [12]:
vocab, data = createVocabAndData(sentences)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=TEST_SPLIT, random_state=42)
m=lstmModel(pretrained_weights,40)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 300)          586800    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
_________________________________________________________________
activation_1 (Activation)    (None, 2)                 0         
Total params: 806,706
Trainable params: 219,906
Non-trainable params: 586,800
_________________________________________________________________
None
Train on 1099 samples, validate on 123 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epo