In [2]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.optimizers import Adam
print("import done")

Using TensorFlow backend.


import done


In [3]:
#https://github.com/debadridtt/A-Review-of-Different-Word-Embeddings-for-Sentiment-Classification-using-Deep-Learning/blob/master/LSTM%20Experiment.ipynb

In [4]:
DATA_FILE = 'C:/Users/khmar/Desktop/ISSUE/dataset/CSV/data_ameliorate/data.csv'
df = pd.read_csv(DATA_FILE,delimiter=';',encoding='UTF-8')
print(df.head())

                                                text     label
0  I had ordered a data cable, got a very well fi...  NOTISSUE
1                                   Love This Phone.  NOTISSUE
2                I get a very well finished product.  NOTISSUE
3                            I could not be happier.  NOTISSUE
4  I was looking for this headset for a long time...  NOTISSUE


In [5]:
df['label'].value_counts() #imbalanced Dataset


NOTISSUE    2030
ISSUE       2025
Name: label, dtype: int64

In [6]:
df.shape

(4055, 2)

### Preprocessing the Data

In [7]:
import nltk                      # the natural langauage toolkit, open-source NLP
import gensim
from nltk.corpus import stopwords  
from gensim import parsing
import re# Help in preprocessing the data, very efficiently
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khmar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
STOP_WORDS ={
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'front',
 'full',
 'further',
 'get',
 'give',
 'go',
 'had',
 'has',
 'have',
 'he',
 'hence',
 'her',
 'here',
 'hereafter',
 'hereby',
 'herein',
 'hereupon',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'however',
 'hundred',
 'i',
 'if',
 'in',
 'indeed',
 'into',
 'is',
 'it',
 'its',
 'itself',
 'just',
 'keep',
 'last',
 'latter',
 'latterly',
 'least',
 'less',
 'made',
 'make',
 'many',
 'may',
 'me',
 'meanwhile',
 'might',
 'mine',
 'more',
 'moreover',
 'most',
 'mostly',
 'move',
 'much',
 'must',
 'my',
 'myself',
 'name',
 'namely',
 'neither',
 'never',
 'nevertheless',
 'next',
 'nine',
 'nobody',
 'none',
 'noone',
 'nor',
 'nothing',
 'now',
 'nowhere',
 'of',
 'off',
 'often',
 'on',
 'once',
 'one',
 'only',
 'onto',
 'or',
 'other',
 'others',
 'otherwise',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'part',
 'per',
 'perhaps',
 'please',
 'put',
 'quite',
 'rather',
 're',
 'really',
 'regarding',
 'same',
 'say',
 'see',
 'seem',
 'seemed',
 'seeming',
 'seems',
 'serious',
 'several',
 'she',
 'should',
 'show',
 'side',
 'since',
 'six',
 'sixty',
 'so',
 'some',
 'somehow',
 'someone',
 'something',
 'sometime',
 'sometimes',
 'somewhere',
 'still',
 'such',
 'take',
 'ten',
 'than',
 'that',
 'the',
 'their',
 'them',
 'themselves',
 'then',
 'thence',
 'there',
 'thereafter',
 'thereby',
 'therefore',
 'therein',
 'thereupon',
 'these',
 'they',
 'third',
 'this',
 'those',
 'though',
 'three',
 'through',
 'throughout',
 'thru',
 'thus',
 'to',
 'together',
 'too',
 'top',
 'toward',
 'towards',
 'twelve',
 'twenty',
 'two',
 'under',
 'unless',
 'until',
 'up',
 'upon',
 'us',
 'used',
 'using',
 'various',
 'very',
 'via',
 'was',
 'we',
 'well',
 'were',
 'what',
 'whatever',
 'when',
 'whence',
 'whenever',
 'where',
 'whereafter',
 'whereas',
 'whereby',
 'wherein',
 'whereupon',
 'wherever',
 'whether',
 'which',
 'while',
 'whither',
 'who',
 'whoever',
 'whole',
 'whom',
 'whose',
 'why',
 'will',
 'with',
 'within',
 'without',
 'would',
 'yet',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves'}

In [9]:
def transform(text):
    #stops = set(stopwords.words("english"))
    # Convert text to lower
    text = text.lower()
    #print(text)
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    # Removing all the stopwords
    text = [word for word in text.split() if word not in STOP_WORDS]

    ##Fixing Word Lengthening
    #pattern = re.compile(r"(.)\1{2,}")
    #pattern.sub(r"\1\1", text)
    #print(text)
    
    # Preprocessed text after stop words removal
    text = " ".join(text)
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    
    # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    ##Lemmatisation
    from nltk.stem.wordnet import WordNetLemmatizer
    
    # Stemming
    text=gensim.parsing.preprocessing.stem_text(text)
    #Spellchecker 
    #correcteur
    # find those words that may be misspelled
    spell = SpellChecker()
    misspelled = text.split()
    for i in range(len(misspelled)):
    # Get the one `most likely` answer
        word = spell.correction(misspelled[i])
        misspelled[i]=word
        text = " ".join(misspelled)

    # Reduce words to their root form
    text = [WordNetLemmatizer().lemmatize(word) for word in text.split()]
        
    lemmatizer=WordNetLemmatizer()
    #text=word_tokenize(text)
    for word in text:
        word=lemmatizer.lemmatize(word,pos='v')
        word=lemmatizer.lemmatize(word,pos='n')
    
    return text

In [16]:
df['text'] = df['text'].map(transformText)
texts= df['text']
tags= df['label']


In [17]:
df=df[texts,tags]

TypeError: '(0                 order data cabl got finish work product
1                                              love phone
2                                          finish product
3                                             not happier
4                              look headset long time got
5                               headset start ring reason
6                          displai excel camera good year
7                                      batteri life great
8                                   worst phone had month
9       not good item work start have problem auto rev...
10                embarass ear hurt try push ear plug ear
11                                     protect phone side
12        averag phone bad batteri life oper weak network
13           clear skype call long batteri life long rang
14                                    soyo technolog suck
15                                  great hand free devic
16             self portrait outsid exterior displai cool
17                                        problem mention
18                    try handsfre gadget final work well
19                                             magic help
20                  worst piec crap verizon custom servic
21                                     poor sound qualiti
22                                      best phone market
23                                              work well
24                compani ship product promptli work well
25                                           exactli want
26      pictur resolut far compar price phone offer todai
27                                             great deal
28                          excel product satisfi purchas
29                    highli recommend encourag peopl try
                              ...                        
4025                           flair bartend absolut amaz
4026                     frozen margarita wai sugari tast
4027                                     good order twice
4028    nutshel restaraunt smell like combin dirti fis...
4029                                  girlfriend veal bad
4030                                    unfortun not good
4031                                 pretti satifi experi
4032                         join club awesom offer email
4033             perfect me like beer ic cold case colder
4034    bland flavorless good wai describ bare tepid meat
4035                       chain fan of beat place easili
4036                                           nacho have
4037                                        not come back
4038                        nothav word place pretti well
4039    staff super nice quick crazi crowd downtown ju...
4040                 great atmospher friendli fast servic
4041                receiv pita huge lot meat thumb there
4042                                       food arriv meh
4043    pai 7 85 hot dog fri look like came kid meal w...
4044                    classic main lobster roll fantast
4045    brother law work mall at dai guess sick night too
4046                           good go review place twice
4047                          chip salsa good salsa fresh
4048                                          place great
4049                                         mediocr food
4050                                  insid impress place
4051                                servic super friendli
4052                             sad littl veget overcook
4053                                   place nice surpris
4054                                live music total blow
Name: text, Length: 4055, dtype: object, 0       NOTISSUE
1       NOTISSUE
2       NOTISSUE
3       NOTISSUE
4       NOTISSUE
5          ISSUE
6       NOTISSUE
7       NOTISSUE
8          ISSUE
9          ISSUE
10         ISSUE
11      NOTISSUE
12         ISSUE
13      NOTISSUE
14         ISSUE
15      NOTISSUE
16      NOTISSUE
17         ISSUE
18      NOTISSUE
19      NOTISSUE
20         ISSUE
21         ISSUE
22      NOTISSUE
23      NOTISSUE
24      NOTISSUE
25      NOTISSUE
26         ISSUE
27      NOTISSUE
28      NOTISSUE
29      NOTISSUE
          ...   
4025    NOTISSUE
4026       ISSUE
4027    NOTISSUE
4028       ISSUE
4029       ISSUE
4030       ISSUE
4031    NOTISSUE
4032    NOTISSUE
4033    NOTISSUE
4034       ISSUE
4035       ISSUE
4036    NOTISSUE
4037       ISSUE
4038    NOTISSUE
4039    NOTISSUE
4040    NOTISSUE
4041    NOTISSUE
4042       ISSUE
4043       ISSUE
4044    NOTISSUE
4045       ISSUE
4046    NOTISSUE
4047    NOTISSUE
4048    NOTISSUE
4049       ISSUE
4050    NOTISSUE
4051    NOTISSUE
4052       ISSUE
4053    NOTISSUE
4054    NOTISSUE
Name: label, Length: 4055, dtype: object)' is an invalid key

In [18]:
# dictionary of lists  
dict = {'text': texts, 'label': tags } 
     
df = pd.DataFrame(dict) 
  
# saving the dataframe 
df.to_csv('C:/Users/khmar/Desktop/ISSUE/dataset/CSV/data_ameliorate/DATA_preprocessing.csv')

In [10]:
## read data saved

In [13]:
DATA_FILE = 'C:/Users/khmar/git_repo/IssueModelTraining/DATA/DATA_text_preprocessing.csv'
df = pd.read_csv(DATA_FILE,delimiter=',',encoding='UTF-8')

In [14]:
msk = np.random.rand(len(df)) < 0.7 # Splitting into train(70%) and test(30%) randomly

In [15]:
train_df=df[msk]
test_df=df[~msk]

In [16]:
print(train_df.shape)
print(test_df.shape)

(2848, 3)
(1207, 3)


In [17]:
print('Checking target values for train data:\n')
print(train_df['label'].value_counts(),'\n')
print('Checking target values for test data:\n')
print(test_df['label'].value_counts())

Checking target values for train data:

ISSUE       1451
NOTISSUE    1397
Name: label, dtype: int64 

Checking target values for test data:

NOTISSUE    633
ISSUE       574
Name: label, dtype: int64


In [18]:
train_df['label'].value_counts()


ISSUE       1451
NOTISSUE    1397
Name: label, dtype: int64

In [19]:
x_train=train_df['text']
y_train=train_df['label']
x_test=test_df['text']
y_test=test_df['label']

In [20]:
texts_train = x_train.astype(str)
texts_test = x_test.astype(str)

In [21]:
MAX_NB_WORDS = 20000
maxlen = 120  # cut texts after this number of words (among top max_features most common words)
batch_size = 32

### Word2Vec Embedding

In [22]:
df=train_df.append(test_df)

In [23]:
from gensim.models import Word2Vec
import nltk
w =[nltk.word_tokenize(sent) for sent in df['text']]
word_model = Word2Vec(w, size=200, min_count = 1, window = 5,sg=0, negative=5)

TypeError: expected string or bytes-like object

In [24]:
word_vectors = word_model.wv

NameError: name 'word_model' is not defined

In [30]:
#print(word_vectors.vocab )

In [31]:
print("Number of word vectors: {}".format(len(word_vectors.vocab)))

Number of word vectors: 3725


In [32]:
MAX_NB_WORDS = len(word_vectors.vocab)
MAX_SEQUENCE_LENGTH = 200

In [33]:
x_train=df["text"][:train_df.shape[0]]
y_train=df["label"][:train_df.shape[0]]
x_test=df["text"][train_df.shape[0]:]
y_test=df["label"][train_df.shape[0]:]

In [34]:
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS, char_level=False)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)
sequences_test = tokenizer.texts_to_sequences(x_test)

In [35]:
#tok save
import pickle
tok_file = 'token_LSTM_Word2Vec_Methode_DATA.sav'
pickle.dump(tokenizer, open(tok_file, 'wb'))

In [36]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, 
                     padding="pre", truncating="post")
test_data = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH, padding="pre",
                          truncating="post")

In [37]:
print('Shape of data tensor:', data.shape)
print('Shape of test_data tensor:', test_data.shape)

Shape of data tensor: (2846, 200)
Shape of test_data tensor: (1209, 200)


In [38]:
word_index = tokenizer.word_index

In [39]:

WV_DIM = 100
nb_words = min(MAX_NB_WORDS, len(word_vectors.vocab))
# we initialize the matrix with random numbers
wv_matrix = (np.random.rand(nb_words, WV_DIM) - 0.5) / 5.0
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    try:
        embedding_vector = word_vectors[word]
        # words not found in embedding index will be all-zeros.
        wv_matrix[i] = embedding_vector
    except:
        pass

In [40]:
y_train = y_train.map({"NOTISSUE": 1, "ISSUE" : 0 })
y_test = y_test.map({"NOTISSUE": 1, "ISSUE" : 0 })

In [41]:
model_word2vec = Sequential()
model_word2vec.add(Embedding(MAX_NB_WORDS, 128))
model_word2vec.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2,input_shape=(1,)))
model_word2vec.add(Dense(1, activation='sigmoid'))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [42]:
model_word2vec.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [43]:
model_word2vec.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 128)         476800    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 608,513
Trainable params: 608,513
Non-trainable params: 0
_________________________________________________________________


In [44]:
%%time
model_word2vec.fit(data, y_train,
          batch_size=batch_size,
          epochs=2,
          validation_data=(test_data, y_test))

Instructions for updating:
Use tf.cast instead.
Train on 2846 samples, validate on 1209 samples
Epoch 1/2
Epoch 2/2
Wall time: 50.1 s


<keras.callbacks.History at 0x1f8a15f5b70>

In [1]:
scores= model_word2vec.evaluate(x_test, y_test,
                           batch_size=batch_size)
print("%s: %.2f%%" % (model_word2vec.metrics_names[0], scores[0] * 100))
print("%s: %.2f%%" % (model_word2vec.metrics_names[1], scores[1] * 100))

NameError: name 'model_word2vec' is not defined

In [48]:
#model_word2vec.save('LSTM_Word2Vec_DATA.sav')

In [49]:
#file_tok = 'C:/Users/khmar/model_file/LSTM_token_word2vec_DATA.sav'
#pickle.dump(tokenizer, open(file_tok, 'wb'))

In [50]:
#file_model = 'C:/Users/khmar/model_file/LSTM_model_word2vec_DATA.sav'
#pickle.dump(model_word2vec, open(file_model, 'wb'))