In [1]:
import os
import sys
import re
import pickle
import numpy as np
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Activation
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# --- Configuration ---------------------------------------------------------
# Data directories, resolved against the current working directory.
# The paths are assembled with os.path.join instead of the literals '..\Glove'
# and '..\Data': '\G' and '\D' are invalid string escapes (a warning on current
# Python versions) and a hard-coded backslash only works on Windows.
DIR_GLOVE = os.path.abspath(os.path.join(os.pardir, 'Glove'))
DIR_DATA = os.path.abspath(os.path.join(os.pardir, 'Data'))
MAX_SEQUENCE_LENGTH = 100   # maxlen handed to pad_sequences
MAX_NB_WORDS = 20000        # num_words of the tokenizer / cap on embedding rows
EMBEDDING_DIM = 300         # width of each row of the embedding matrix
TEST_SPLIT = 0.2            # test_size handed to train_test_split
VALIDATION_SPLIT = 0.1      # validation_split handed to model.fit
# Single shared tokenizer: fitted in createVocabAndData, reused for prediction.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)

In [3]:
def clean_str(string):
    """Normalise a raw sentence into space-separated, lower-case tokens.

    Steps, applied in order:
      1. every character outside letters, digits, parentheses, comma,
         exclamation/question mark, apostrophe and backtick becomes a space;
      2. the clitics 's, 've, n't, 're, 'd and 'll are split off the word
         they are attached to;
      3. comma, exclamation mark, parentheses and question mark are
         surrounded by spaces so they become separate tokens;
      4. runs of whitespace collapse to one space, the result is stripped
         and lower-cased.

    The punctuation replacements are plain characters. The previous
    replacement strings carried a backslash before the parentheses and the
    question mark, which ``re.sub`` copied into the output verbatim.

    Args:
        string: the text to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    substitutions = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),
    )
    # Order matters: the character whitelist runs first, whitespace collapse last.
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)
    return string.strip().lower()

In [4]:
def gloveVec(filename):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each line is split on whitespace; the first field is the word and the
    remaining fields are parsed as float32 coefficients.

    Lines that cannot be parsed are skipped rather than aborting the load:
      * blank lines;
      * lines whose coefficient fields are not all numeric (for example
        when the token itself contains whitespace), which make
        ``np.asarray`` raise ``ValueError``.

    Args:
        filename: name of the file inside ``DIR_GLOVE``.

    Returns:
        dict mapping each word to a 1-D float32 ``numpy`` array.
    """
    embeddings = {}
    # The context manager closes the file even if reading fails part-way.
    with open(os.path.join(DIR_GLOVE, filename), encoding='utf-8') as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                continue
            try:
                # The vector is built before the key is assigned, so a
                # malformed line never leaves a partial entry behind.
                embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
            except ValueError:
                continue
    return embeddings

In [5]:
def loadData(filename):
    """Load the labelled corpus and return cleaned texts with one-hot labels.

    The CSV is read from ``DIR_DATA`` with ``;`` as delimiter. Only the
    ``label`` and ``text`` columns are kept, and rows missing either value
    are dropped. Distinct labels are sorted and numbered from 0 in that
    order; texts go through ``clean_str``.

    Args:
        filename: file name inside ``DIR_DATA``. A leading path separator
            (as in the existing call, which passes a name starting with a
            backslash) is accepted and ignored.

    Returns:
        (x_train, y_train): list of cleaned strings, and the
        ``to_categorical`` encoding of the label indices.
    """
    # os.path.join instead of string concatenation, so the separator is always
    # correct; leading separators are stripped so join cannot treat the name
    # as an absolute path.
    path = os.path.join(DIR_DATA, filename.lstrip('\\/'))
    df = pd.read_csv(path, delimiter=';')

    selected = ['label', 'text']
    df = df[selected].dropna(axis=0, how='any')

    # Deterministic label -> index mapping (sorted label order).
    labels = sorted(set(df['label'].tolist()))
    label_dict = {label: index for index, label in enumerate(labels)}

    x_train = df['text'].apply(clean_str).tolist()
    y_train = to_categorical(np.asarray([label_dict[y] for y in df['label']]))
    return x_train,y_train


In [6]:
def createVocabAndData(sentences):
    """Fit the shared tokenizer on ``sentences`` and encode them.

    Uses the module-level ``tokenizer``: it is fitted on the sentences, which
    are then converted to index sequences and passed to ``pad_sequences``
    with ``maxlen=MAX_SEQUENCE_LENGTH``.

    Args:
        sentences: iterable of texts.

    Returns:
        (vocab, data): the tokenizer's ``word_index`` and the padded sequences.
    """
    tokenizer.fit_on_texts(sentences)
    padded = pad_sequences(
        tokenizer.texts_to_sequences(sentences),
        maxlen=MAX_SEQUENCE_LENGTH,
    )
    return tokenizer.word_index, padded

In [7]:
def createEmbeddingMatrix(word_index,embeddings_index):
    """Build the weight matrix for the embedding layer.

    Row ``i`` holds the pre-trained vector of the word whose index is ``i``.
    Rows stay all-zero for index 0 and for words absent from
    ``embeddings_index``. Words with an index above ``MAX_NB_WORDS`` are
    ignored.

    Args:
        word_index: mapping word -> integer index.
        embeddings_index: mapping word -> vector of length ``EMBEDDING_DIM``.

    Returns:
        ``numpy`` array of shape
        ``(min(MAX_NB_WORDS, len(word_index)) + 1, EMBEDDING_DIM)``.
    """
    row_count = min(MAX_NB_WORDS, len(word_index)) + 1
    matrix = np.zeros((row_count, EMBEDDING_DIM))
    for token, idx in word_index.items():
        # Out-of-range indices are never looked up.
        vector = embeddings_index.get(token) if idx <= MAX_NB_WORDS else None
        if vector is not None:
            matrix[idx] = vector
    return matrix

In [8]:
def lstmModel(embedding_matrix,epoch):
    """Build, train, save and evaluate the LSTM classifier.

    Architecture: frozen embedding layer initialised from
    ``embedding_matrix`` -> LSTM(128) with dropout -> Dense(2) -> softmax.

    NOTE: training and evaluation data are not parameters. The function
    reads the notebook globals ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``, which must exist before it is called.

    Side effects: prints the model summary and the test accuracy, and
    writes the trained weights to ``text_lstm_weights.h5``.

    Args:
        embedding_matrix: 2-D array ``(vocabulary rows, embedding width)``.
        epoch: number of training epochs.

    Returns:
        The trained Keras model.
    """
    n, embedding_dims = embedding_matrix.shape

    model = Sequential()
    model.add(Embedding(n, embedding_dims, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
    model.add(LSTM(128, dropout=0.6, recurrent_dropout=0.6))
    # Two output units: one per class handled by prediction_classe.
    model.add(Dense(2))
    model.add(Activation('softmax'))

    # One-hot targets with a softmax output pair with categorical
    # cross-entropy; with exactly two softmax outputs this gives the same
    # loss value as the previous binary_crossentropy setting.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()

    model.fit(X_train, y_train, validation_split=VALIDATION_SPLIT, epochs=epoch, batch_size=128)
    model.save_weights('text_lstm_weights.h5')

    scores = model.evaluate(X_test, y_test, verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
    return model
# Load the pre-trained GloVe vectors into a {word: vector} dict (see gloveVec).
# NOTE(review): the output recorded for this cell is a KeyboardInterrupt, so
# `embeddings` may be undefined in a kernel restored from that run -- re-run
# this line to completion before executing the training cell below.
embeddings = gloveVec('glove.840B.300d.txt')

KeyboardInterrupt: 

In [None]:
if __name__ == "__main__":
    # End-to-end training run: load corpus -> encode -> embedding matrix ->
    # cache to disk -> train/test split -> train the LSTM.

    # The leading backslash is part of the name handed to loadData; it is
    # written as '\\' because '\p' is not a valid escape sequence.
    sentences, labels = loadData('\\pretraitement_service5.csv')

    vocab, data = createVocabAndData(sentences)
    embedding_mat = createEmbeddingMatrix(vocab, embeddings)
    # The context manager closes the pickle file (previously left open).
    with open('embedding_matrix.pkl', 'wb') as pkl_file:
        pickle.dump([data, labels, embedding_mat], pkl_file)
    print("Data created")

    print("Train Test split")
    # These four names are read as globals by lstmModel.
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=TEST_SPLIT, random_state=42)

    model = lstmModel(embedding_mat, 40)

In [None]:
# Hand-maintained set of lower-case English stop words. transformText drops
# every whitespace-separated token that appears in this set.
# The word "not" is absent from the set, so it is not removed by that filter.
stopword={"until","off","the","any","only","d","in","because","during","on","which","who","re","then","each","will","its","these","but","other","been","t","y","between","she's","she","have","yourselves","ve","ain","i","just","to","that","own","before","himself","themselves","a","is","my","ll","you","where","had","too","haven","won","yours","over","her","his","shan","if","after","under","such","doing","up","through","same","we","as","or","very","no","myself","they","me","your","are","what","against","theirs","ours","and","it's","of","did","once","am","yourself","again","why","above","here","into","them","at","both","s","hers","herself","so","by","from","when","this","that'll","about","him","it","was","you've","ma","there","ourselves","their","m","how","itself","an","with","down","more","can","some","than","our","those","do","most","be","o","were","all","out","for","has","further","while","whom","below","he","few","being"}

In [None]:
def replace_word(text):
    """Lower-case ``text`` and expand known contractions token by token.

    The text is split on whitespace; a token that exactly matches a key of
    the table below is replaced by its expansion, every other token is kept
    unchanged. Tokens are re-joined with single spaces, so runs of
    whitespace collapse as well.

    Args:
        text: input string.

    Returns:
        The lower-cased string with contractions expanded.
    """
    expansions = {
        "couldn't": "could not", "couldn": "could not",
        "won't": "will not", "won": "will not",
        "mustn't": "must not", "mustn": "must not",
        "that'll": "that will",
        "shouldn't": "should not", "shouldn": "should not",
        "should've": "should have",
        "haven't": "have not", "haven": "have not",
        "hadn't": "have not", "hadn": "have not",
        "hasn't": "have not", "hasn": "have not",
        "didn't": "do not", "didn": "do not",
        "doesn't": "do not", "doesn": "do not",
        "don't": "do not", "don": "do not",
        "isn't": "be not",
        "you'd": "you would",
        "you've": "you have",
        "you're": "you are",
        "you'll": "you will",
        "she's": "she is",
        "it's": "it is",
        "aren't": "are not", "aren": "are not",
        "weren't": "were not", "weren": "were not",
        "wouldn't": "would not", "wouldn": "would not",
        "needn't": "need not", "needn": "need not",
        "wasn't": "was not", "wasn": "was not",
        "mightn't": "might not", "mightn": "might not",
        "shan't": "shall not", "shan": "shall not",
        "can't": "can not",
        "i'm": "i am",
    }
    return " ".join(expansions.get(token, token) for token in text.lower().split())

In [None]:
import nltk
# Fetch the NLTK 'stopwords' corpus.
# NOTE(review): transformText filters with the hand-written `stopword` set and
# its only reference to nltk's stopwords is commented out -- confirm this
# download is still needed.
nltk.download('stopwords')
def transformText(text):
    """Run the full text pre-processing pipeline on one document.

    Steps, in order:
      1. ``split_alphanum`` (gensim) on the raw text;
      2. lower-casing and replacement of every non-ASCII character by a space;
      3. contraction expansion via ``replace_word``;
      4. NLTK tokenisation, then lemmatisation of each token as a verb and
         then as a noun;
      5. spell correction of every token with ``SpellChecker``;
      6. removal of tokens found in the ``stopword`` set and of short tokens
         (``remove_short`` with ``minsize=3``);
      7. punctuation stripping, whitespace normalisation and stemming
         (gensim ``strip_punctuation2`` / ``stem_text``).

    Relies on names created in other cells (``split_alphanum``,
    ``word_tokenize``, ``lemmatizer``, ``gensim``, ``SpellChecker``,
    ``stopword``, ``replace_word``); those cells must have been run first.

    Args:
        text: raw input string.

    Returns:
        The pre-processed string.
    """
    text = split_alphanum(text)

    # Lower-case, then blank out anything outside the ASCII range.
    text = text.lower()
    text = re.sub(r'[^\x00-\x7f]', r' ', text)

    # Expand contractions before tokenising.
    text = replace_word(text)

    # Lemmatise every token as a verb first, then as a noun.
    tokens = [
        lemmatizer.lemmatize(lemmatizer.lemmatize(token, pos='v'), pos='n')
        for token in word_tokenize(text)
    ]
    text = " ".join(tokens)
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)

    # Spell correction: keep the single most likely candidate per token.
    # correction() may return None when it has no candidate; fall back to the
    # original token so the join below cannot fail on a None entry.
    spell = SpellChecker()
    corrected = [spell.correction(word) or word for word in text.split()]
    text = " ".join(corrected)

    # Drop stop words, then tokens shorter than 3 characters.
    filtered_words = [word for word in text.split() if word not in stopword]
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=3)
    text = " ".join(filtered_words)

    # Remove punctuation and tidy the whitespace it leaves behind.
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)

    # Stemming
    return gensim.parsing.preprocessing.stem_text(text)

In [None]:
### Adapted from: https://www.kaggle.com/anurag3753/prediction-naive-bayes-preprocessing-with-gensim

# Libraries and shared objects used by transformText.
# NOTE: this cell appears after the cell that defines transformText, but it
# must be executed before transformText is called: split_alphanum,
# word_tokenize, SpellChecker, gensim and `lemmatizer` are all created here.
import nltk                       # the Natural Language Toolkit, open-source NLP
nltk.download('wordnet')          # WordNet data for the WordNetLemmatizer below
import pandas as pd               # pandas DataFrame
import re                         # regular expressions
from nltk.corpus import stopwords  
from gensim import parsing        # text pre-processing helpers
import gensim
import numpy as np
from spellchecker import SpellChecker
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.parsing.preprocessing import split_alphanum
# NOTE(review): WordNetLemmatizer is imported a second time here; the import
# from nltk.stem.wordnet above binds the same name.
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Single lemmatizer instance shared by every transformText call.
lemmatizer=WordNetLemmatizer()

In [None]:
def prediction(input_file,output_file):
    """Classify every text of a CSV file and write the predictions to disk.

    ``input_file`` is read with ``;`` as delimiter and must contain a
    ``text`` column. Each text is pre-processed with ``transformText``,
    encoded with the shared ``tokenizer``, padded to ``MAX_SEQUENCE_LENGTH``
    and passed to the global ``model``. The predicted class is the index of
    the highest model output, mapped to a label name by ``prediction_classe``.

    Fixes over the previous version:
      * the pre-processed text is what gets encoded (the raw text used to
        overwrite it, so ``transformText`` had no effect);
      * the model runs once per text instead of twice, and the class is
        taken as the argmax of that single prediction;
      * rows are collected in a list and turned into a DataFrame once,
        instead of calling ``DataFrame.append`` inside the loop.

    Args:
        input_file: path of the ``;``-separated CSV to classify.
        output_file: path of the ``;``-separated CSV to write.

    Returns:
        (data, df): the input DataFrame and a DataFrame with columns
        ``text`` (original, unprocessed text) and ``label``.
    """
    data = pd.read_csv(input_file, delimiter=';')

    records = []
    for raw_text in data.text.values:
        cleaned = transformText(raw_text)
        seqs = pad_sequences(tokenizer.texts_to_sequences([cleaned]), maxlen=MAX_SEQUENCE_LENGTH)
        # Index of the largest output along the last axis = predicted class.
        class_pred = np.argmax(model.predict(seqs), axis=-1)
        records.append({'text': raw_text, 'label': prediction_classe(class_pred)})

    # Explicit columns keep the header even when the input has no rows.
    df = pd.DataFrame(records, columns=['text', 'label'])
    df.to_csv(output_file, sep=';')
    return data,df

def prediction_classe(class_pred):
    """Translate a predicted class index into its label name.

    Only the first element of ``class_pred`` is inspected:
    0 -> ``'noturgent'``, 1 -> ``'urgent'``.

    Args:
        class_pred: indexable collection (list or array) of class indices.

    Returns:
        ``'noturgent'`` or ``'urgent'``.

    Raises:
        ValueError: if the first element is neither 0 nor 1. The previous
            version failed with an UnboundLocalError in that case.
    """
    predicted = class_pred[0]
    if predicted == 0:
        return 'noturgent'
    if predicted == 1:
        return 'urgent'
    raise ValueError("unknown class index: %r (expected 0 or 1)" % (predicted,))

In [None]:
import nltk # the natural langauage toolkit, open-source NLP
from nltk.corpus import stopwords  
from gensim import parsing # Help in preprocessing the data, very efficiently
import gensim

In [None]:
# Ad-hoc check: classify one hand-written sentence with the trained model.
# Requires `tokenizer` (fitted), `model`, transformText and prediction_classe
# from the cells above.
t="You must now repair my phone!! "
# Same pre-processing pipeline as used by prediction().
t=transformText(t)
t=np.array([t])
# NOTE(review): here the text is split into tokens with nltk before being
# passed to the tokenizer, whereas prediction() passes whole strings --
# confirm both paths encode the text identically.
text =[nltk.word_tokenize(sent) for sent in t]
seq= tokenizer.texts_to_sequences(text)
seqs = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
# Raw model output for the sentence.
yhat = model.predict(seqs)
print(yhat)
# Predicted class index, then its label name.
class_pred = model.predict_classes(seqs)
print(class_pred)
print(prediction_classe(class_pred))