In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.optimizers import Adam
import pickle
print("import done")

Using TensorFlow backend.


import done


In [2]:
#https://github.com/debadridtt/A-Review-of-Different-Word-Embeddings-for-Sentiment-Classification-using-Deep-Learning/blob/master/LSTM%20Experiment.ipynb

In [3]:
DATA_FILE = 'C:/Users/khmar/Desktop/ISSUE/dataset/CSV/data_ameliorate/data.csv'
df = pd.read_csv(DATA_FILE,delimiter=';',encoding='UTF-8')
print(df.head())

                                                text     label
0  I had ordered a data cable, got a very well fi...  NOTISSUE
1                                   Love This Phone.  NOTISSUE
2                I get a very well finished product.  NOTISSUE
3                            I could not be happier.  NOTISSUE
4  I was looking for this headset for a long time...  NOTISSUE


In [4]:
df['label'].value_counts() #imbalanced Dataset


NOTISSUE    2030
ISSUE       2025
Name: label, dtype: int64

### Preprocessing the Data

In [5]:
import nltk                      # the natural langauage toolkit, open-source NLP
import gensim
from nltk.corpus import stopwords  
from gensim import parsing
import re# Help in preprocessing the data, very efficiently
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khmar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
STOP_WORDS ={
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'front',
 'full',
 'further',
 'get',
 'give',
 'go',
 'had',
 'has',
 'have',
 'he',
 'hence',
 'her',
 'here',
 'hereafter',
 'hereby',
 'herein',
 'hereupon',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'however',
 'hundred',
 'i',
 'if',
 'in',
 'indeed',
 'into',
 'is',
 'it',
 'its',
 'itself',
 'just',
 'keep',
 'last',
 'latter',
 'latterly',
 'least',
 'less',
 'made',
 'make',
 'many',
 'may',
 'me',
 'meanwhile',
 'might',
 'mine',
 'more',
 'moreover',
 'most',
 'mostly',
 'move',
 'much',
 'must',
 'my',
 'myself',
 'name',
 'namely',
 'neither',
 'never',
 'nevertheless',
 'next',
 'nine',
 'nobody',
 'none',
 'noone',
 'nor',
 'nothing',
 'now',
 'nowhere',
 'of',
 'off',
 'often',
 'on',
 'once',
 'one',
 'only',
 'onto',
 'or',
 'other',
 'others',
 'otherwise',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'part',
 'per',
 'perhaps',
 'please',
 'put',
 'quite',
 'rather',
 're',
 'really',
 'regarding',
 'same',
 'say',
 'see',
 'seem',
 'seemed',
 'seeming',
 'seems',
 'serious',
 'several',
 'she',
 'should',
 'show',
 'side',
 'since',
 'six',
 'sixty',
 'so',
 'some',
 'somehow',
 'someone',
 'something',
 'sometime',
 'sometimes',
 'somewhere',
 'still',
 'such',
 'take',
 'ten',
 'than',
 'that',
 'the',
 'their',
 'them',
 'themselves',
 'then',
 'thence',
 'there',
 'thereafter',
 'thereby',
 'therefore',
 'therein',
 'thereupon',
 'these',
 'they',
 'third',
 'this',
 'those',
 'though',
 'three',
 'through',
 'throughout',
 'thru',
 'thus',
 'to',
 'together',
 'too',
 'top',
 'toward',
 'towards',
 'twelve',
 'twenty',
 'two',
 'under',
 'unless',
 'until',
 'up',
 'upon',
 'us',
 'used',
 'using',
 'various',
 'very',
 'via',
 'was',
 'we',
 'well',
 'were',
 'what',
 'whatever',
 'when',
 'whence',
 'whenever',
 'where',
 'whereafter',
 'whereas',
 'whereby',
 'wherein',
 'whereupon',
 'wherever',
 'whether',
 'which',
 'while',
 'whither',
 'who',
 'whoever',
 'whole',
 'whom',
 'whose',
 'why',
 'will',
 'with',
 'within',
 'without',
 'would',
 'yet',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves'}

In [7]:
def transform(text):
    #stops = set(stopwords.words("english"))
    # Convert text to lower
    text = text.lower()
    #print(text)
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    # Removing all the stopwords
    text = [word for word in text.split() if word not in STOP_WORDS]

    ##Fixing Word Lengthening
    #pattern = re.compile(r"(.)\1{2,}")
    #pattern.sub(r"\1\1", text)
    #print(text)
    
    # Preprocessed text after stop words removal
    text = " ".join(text)
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    
    # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    ##Lemmatisation
    from nltk.stem.wordnet import WordNetLemmatizer
    
    # Stemming
    text=gensim.parsing.preprocessing.stem_text(text)
    #Spellchecker 
    #correcteur
    # find those words that may be misspelled
    spell = SpellChecker()
    misspelled = text.split()
    for i in range(len(misspelled)):
    # Get the one `most likely` answer
        word = spell.correction(misspelled[i])
        misspelled[i]=word
        text = " ".join(misspelled)

    # Reduce words to their root form
    text = [WordNetLemmatizer().lemmatize(word) for word in text.split()]
        
    lemmatizer=WordNetLemmatizer()
    #text=word_tokenize(text)
    for word in text:
        word=lemmatizer.lemmatize(word,pos='v')
        word=lemmatizer.lemmatize(word,pos='n')
    
    return text

In [22]:
transform("this servic groop")

['service', 'group']

In [23]:
df['text'] = df['text'].map(transform)
texts= df['text']
tags= df['label']

In [24]:
df=df[texts,tags]

TypeError: '(0         [order, data, call, got, finish, work, product]
1                                           [love, phone]
2                                       [finish, product]
3                                          [not, happier]
4                        [look, headset, long, time, got]
5                      [headset, start, ring, no, reason]
6                    [display, excel, camera, good, year]
7                                  [battery, life, great]
8                              [worst, phone, had, month]
9       [not, good, item, work, start, have, problem, ...
10      [embarrass, ear, hurt, try, push, ear, plug, ear]
11                                 [protect, phone, side]
12      [average, phone, bad, battery, life, over, wea...
13      [clear, skye, call, long, battery, life, long,...
14                               [solo, technology, suck]
15                            [great, hand, free, device]
16      [self, portrait, outside, exterior, display, c...
17                                     [problem, mention]
18             [try, handsome, gadget, final, work, well]
19                                          [magic, help]
20         [worst, piece, crap, version, custom, service]
21                                 [poor, sound, quality]
22                                  [best, phone, market]
23                                           [work, well]
24         [company, ship, product, promptly, work, well]
25                                        [exactly, want]
26      [picture, resolute, far, compare, price, phone...
27                                          [great, deal]
28                     [excel, product, satisfy, purchas]
29            [highly, recommend, encourage, people, try]
                              ...                        
4025                     [flair, barend, absolute, amaze]
4026                [frozen, margarita, wai, sugar, last]
4027                                 [good, order, twice]
4028    [nutshell, restaurant, smell, like, combine, d...
4029                              [girlfriend, veal, bad]
4030                                [unfortun, not, good]
4031                            [pretty, satisfy, expert]
4032                  [join, club, awesome, offer, email]
4033    [perfect, me, like, beer, ic, cold, case, colder]
4034    [bland, flavourless, good, wai, describe, bare...
4035            [chain, no, fan, of, beat, place, easily]
4036                                        [macho, have]
4037                                    [not, come, back]
4038                  [nathan, word, place, pretty, well]
4039    [staff, super, nice, quick, crazy, crowd, down...
4040         [great, atmosphere, friendly, fast, service]
4041       [receive, pity, huge, lot, meat, thumb, there]
4042                                   [food, arrive, me]
4043    [pai, hot, dog, fri, look, like, came, kid, me...
4044              [classic, main, lobster, roll, fantasy]
4045    [brother, law, work, mall, at, dai, guess, sic...
4046                     [good, go, review, place, twice]
4047                    [chip, salsa, good, salsa, fresh]
4048                                       [place, great]
4049                                     [mediocre, food]
4050                             [inside, impress, place]
4051                           [service, super, friendly]
4052                       [sad, little, beget, overcook]
4053                              [place, nice, surprise]
4054                           [live, music, total, blow]
Name: text, Length: 4055, dtype: object, 0       NOTISSUE
1       NOTISSUE
2       NOTISSUE
3       NOTISSUE
4       NOTISSUE
5          ISSUE
6       NOTISSUE
7       NOTISSUE
8          ISSUE
9          ISSUE
10         ISSUE
11      NOTISSUE
12         ISSUE
13      NOTISSUE
14         ISSUE
15      NOTISSUE
16      NOTISSUE
17         ISSUE
18      NOTISSUE
19      NOTISSUE
20         ISSUE
21         ISSUE
22      NOTISSUE
23      NOTISSUE
24      NOTISSUE
25      NOTISSUE
26         ISSUE
27      NOTISSUE
28      NOTISSUE
29      NOTISSUE
          ...   
4025    NOTISSUE
4026       ISSUE
4027    NOTISSUE
4028       ISSUE
4029       ISSUE
4030       ISSUE
4031    NOTISSUE
4032    NOTISSUE
4033    NOTISSUE
4034       ISSUE
4035       ISSUE
4036    NOTISSUE
4037       ISSUE
4038    NOTISSUE
4039    NOTISSUE
4040    NOTISSUE
4041    NOTISSUE
4042       ISSUE
4043       ISSUE
4044    NOTISSUE
4045       ISSUE
4046    NOTISSUE
4047    NOTISSUE
4048    NOTISSUE
4049       ISSUE
4050    NOTISSUE
4051    NOTISSUE
4052       ISSUE
4053    NOTISSUE
4054    NOTISSUE
Name: label, Length: 4055, dtype: object)' is an invalid key

In [25]:
# dictionary of lists  
dict = {'text': texts, 'label': tags } 
     
df = pd.DataFrame(dict) 
  
# saving the dataframe 
df.to_csv('DATA_preprocessing.csv')

In [83]:
DATA_FILE = 'C:/Users/khmar/git_repo/IssueModelTraining/DATA/DATA_text_preprocessing.csv'
df = pd.read_csv(DATA_FILE,delimiter=',',encoding='UTF-8')
print(df.head())

   id                                     text     label
0   0  order data cabl got finish work product  NOTISSUE
1   1                               love phone  NOTISSUE
2   2                           finish product  NOTISSUE
3   3                              not happier  NOTISSUE
4   4               look headset long time got  NOTISSUE


###############################

In [8]:
DATA_FILE = 'DATA_preprocessing.csv'
df = pd.read_csv(DATA_FILE,delimiter=',',encoding='UTF-8')
print(df.head())

   Unnamed: 0                                               text     label
0           0  ['order', 'data', 'call', 'got', 'finish', 'wo...  NOTISSUE
1           1                                  ['love', 'phone']  NOTISSUE
2           2                              ['finish', 'product']  NOTISSUE
3           3                                 ['not', 'happier']  NOTISSUE
4           4         ['look', 'headset', 'long', 'time', 'got']  NOTISSUE


### 

In [9]:
msk = np.random.rand(len(df)) < 0.7 # Splitting into train(70%) and test(30%) randomly

In [10]:
train_df=df[msk]
test_df=df[~msk]

In [11]:
print(train_df.shape)
print(test_df.shape)

(2812, 3)
(1243, 3)


In [12]:
print('Checking target values for train data:\n')
print(train_df['label'].value_counts(),'\n')
print('Checking target values for test data:\n')
print(test_df['label'].value_counts())

Checking target values for train data:

ISSUE       1427
NOTISSUE    1385
Name: label, dtype: int64 

Checking target values for test data:

NOTISSUE    645
ISSUE       598
Name: label, dtype: int64


In [13]:
train_df['label'].value_counts()


ISSUE       1427
NOTISSUE    1385
Name: label, dtype: int64

In [14]:
x_train=train_df['text']
y_train=train_df['label']
x_test=test_df['text']
y_test=test_df['label']

In [15]:
texts_train = x_train.astype(str)
texts_test = x_test.astype(str)

In [16]:
x_train=train_df['text']
y_train=train_df['label']
x_test=test_df['text']
y_test=test_df['label']

### GLoVe Embedding 200

In [17]:
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH = 200
maxlen = 120  # cut texts after this number of words (among top max_features most common words)
batch_size = 32

In [18]:
texts_train = x_train.astype(str)
texts_test = x_test.astype(str)

In [19]:
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS, char_level=False)
tokenizer.fit_on_texts(texts_train)
sequences = tokenizer.texts_to_sequences(texts_train)
sequences_test = tokenizer.texts_to_sequences(texts_test)

In [20]:
file_tok = 'LSTM_token_GLOVE_200_DATA_with_text_processing.sav'
pickle.dump(tokenizer, open(file_tok, 'wb'))

In [21]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, 
                     padding="pre", truncating="post")
test_data = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH, padding="pre",
                          truncating="post")

In [22]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 2857 unique tokens.


In [23]:
MAX_SEQUENCE_LENGTH = 200
#pad sequences are used to bring all sentences to same size.
# pad sequences with 0s
x_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
x_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', x_train.shape)
print('Shape of data test tensor:', x_test.shape)

Shape of data tensor: (2812, 200)
Shape of data test tensor: (1243, 200)


In [24]:
y_train = y_train.map({"NOTISSUE": 1, "ISSUE" : 0 })
y_test = y_test.map({"NOTISSUE": 1, "ISSUE" : 0 })

In [25]:
# load the whole embedding into memory
embeddings_index = {}
f = open('C:/Users/khmar/Desktop/GLoVE/glove.6B.200d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [26]:
embedding_matrix = np.zeros((len(word_index) + 1, 200))
#Found 1489 unique tokens : word_index
#print(len(word_index) + 1) #### 1490 
#print(word_index.items()) ### unique tokens : words :word_index
#print('embedding_matrix',embedding_matrix)
out_of_vocab={}
for word, i in word_index.items():
    #print(i)
    embedding_vector = embeddings_index.get(word)
    #print('embedding_vector' ,embedding_vector )
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        #print('word: ', word)
        embedding_matrix[i] = embedding_vector
        #print('embedding_matrix :',embedding_matrix[i] )
        # words not found in embedding index will be all-zeros.
    if embedding_vector is  None:
        #print('word not found :',word)
        out_of_vocab[i] = word

In [27]:
#file_output= "C:/Users/khmar/Desktop/out_of_vocab_glove_200_without_text_processing.txt"
#with open(file_output, "w") as f1:
    #for word, i in out_of_vocab.items():
        #print(out_of_vocab.get(word))
        #f1.write(out_of_vocab.get(word))

In [28]:
f'There are {len(out_of_vocab)} out of vocab '

'There are 2856 out of vocab '

In [29]:
model_glove= Sequential()
model_glove.add(Embedding(len(word_index) + 1,
                            200,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False))
model_glove.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2,input_shape=(1,)))
model_glove.add(Dense(1, activation='sigmoid'))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [30]:
model_glove.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [31]:
%%time
model_glove.fit(data, y_train,
          batch_size=batch_size,
          epochs=2,
          validation_data=(test_data, y_test))

Instructions for updating:
Use tf.cast instead.
Train on 2812 samples, validate on 1243 samples
Epoch 1/2
Epoch 2/2
Wall time: 58.1 s


<keras.callbacks.History at 0x279ed6b0358>

In [36]:
scores = model_glove.evaluate(x_test, y_test,
                            batch_size=batch_size)
print("%s: %.2f%%" % (model_glove.metrics_names[0], scores[0] * 100))
print("%s: %.2f%%" % (model_glove.metrics_names[1], scores[1] * 100))

loss: 47.05%
acc: 76.15%


In [33]:
# Save the model
model_glove.save('LSTM_model_glove_200_DATA_with_text_processing.sav')