In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.optimizers import Adam
print("import done")

Using TensorFlow backend.


import done


In [2]:
#https://github.com/debadridtt/A-Review-of-Different-Word-Embeddings-for-Sentiment-Classification-using-Deep-Learning/blob/master/LSTM%20Experiment.ipynb

In [3]:
DATA_FILE = 'C:/Users/khmar/git_repo/IssueModelTraining/DATA/DATA.csv'
df = pd.read_csv(DATA_FILE,delimiter=';',encoding='UTF-8')
print(df.head())

                                                text     label
0  I had ordered a data cable, got a very well fi...  NOTISSUE
1                                   Love This Phone.  NOTISSUE
2                I get a very well finished product.  NOTISSUE
3                            I could not be happier.  NOTISSUE
4  I was looking for this headset for a long time...  NOTISSUE


In [4]:
df['label'].value_counts() #imbalanced Dataset


NOTISSUE    2030
ISSUE       2025
Name: label, dtype: int64

In [5]:
df.shape

(4055, 2)

### Preprocessing the Data 

In [6]:
import nltk                      # the natural langauage toolkit, open-source NLP
import gensim
from nltk.corpus import stopwords  
from gensim import parsing
import re# Help in preprocessing the data, very efficiently
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khmar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
STOP_WORDS ={
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'front',
 'full',
 'further',
 'get',
 'give',
 'go',
 'had',
 'has',
 'have',
 'he',
 'hence',
 'her',
 'here',
 'hereafter',
 'hereby',
 'herein',
 'hereupon',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'however',
 'hundred',
 'i',
 'if',
 'in',
 'indeed',
 'into',
 'is',
 'it',
 'its',
 'itself',
 'just',
 'keep',
 'last',
 'latter',
 'latterly',
 'least',
 'less',
 'made',
 'make',
 'many',
 'may',
 'me',
 'meanwhile',
 'might',
 'mine',
 'more',
 'moreover',
 'most',
 'mostly',
 'move',
 'much',
 'must',
 'my',
 'myself',
 'name',
 'namely',
 'neither',
 'never',
 'nevertheless',
 'next',
 'nine',
 'nobody',
 'none',
 'noone',
 'nor',
 'nothing',
 'now',
 'nowhere',
 'of',
 'off',
 'often',
 'on',
 'once',
 'one',
 'only',
 'onto',
 'or',
 'other',
 'others',
 'otherwise',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'part',
 'per',
 'perhaps',
 'please',
 'put',
 'quite',
 'rather',
 're',
 'really',
 'regarding',
 'same',
 'say',
 'see',
 'seem',
 'seemed',
 'seeming',
 'seems',
 'serious',
 'several',
 'she',
 'should',
 'show',
 'side',
 'since',
 'six',
 'sixty',
 'so',
 'some',
 'somehow',
 'someone',
 'something',
 'sometime',
 'sometimes',
 'somewhere',
 'still',
 'such',
 'take',
 'ten',
 'than',
 'that',
 'the',
 'their',
 'them',
 'themselves',
 'then',
 'thence',
 'there',
 'thereafter',
 'thereby',
 'therefore',
 'therein',
 'thereupon',
 'these',
 'they',
 'third',
 'this',
 'those',
 'though',
 'three',
 'through',
 'throughout',
 'thru',
 'thus',
 'to',
 'together',
 'too',
 'top',
 'toward',
 'towards',
 'twelve',
 'twenty',
 'two',
 'under',
 'unless',
 'until',
 'up',
 'upon',
 'us',
 'used',
 'using',
 'various',
 'very',
 'via',
 'was',
 'we',
 'well',
 'were',
 'what',
 'whatever',
 'when',
 'whence',
 'whenever',
 'where',
 'whereafter',
 'whereas',
 'whereby',
 'wherein',
 'whereupon',
 'wherever',
 'whether',
 'which',
 'while',
 'whither',
 'who',
 'whoever',
 'whole',
 'whom',
 'whose',
 'why',
 'will',
 'with',
 'within',
 'without',
 'would',
 'yet',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves'}

In [9]:
def transformText(text):
    #stops = set(stopwords.words("english"))
    # Convert text to lower
    text = text.lower()
    #print(text)
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    # Removing all the stopwords
    text = [word for word in text.split() if word not in STOP_WORDS]

    ##Fixing Word Lengthening
    #pattern = re.compile(r"(.)\1{2,}")
    #pattern.sub(r"\1\1", text)
    #print(text)
    
    # Preprocessed text after stop words removal
    text = " ".join(text)
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    
    # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    ##Lemmatisation
    from nltk.stem.wordnet import WordNetLemmatizer
    
    # Stemming
    text=gensim.parsing.preprocessing.stem_text(text)
    #Spellchecker 
    #correcteur
    # find those words that may be misspelled
    spell = SpellChecker()
    misspelled = text.split()
    for i in range(len(misspelled)):
    # Get the one `most likely` answer
        word = spell.correction(misspelled[i])
        misspelled[i]=word
        text = " ".join(misspelled)

    # Reduce words to their root form
    text = [WordNetLemmatizer().lemmatize(word) for word in text.split()]
        
    lemmatizer=WordNetLemmatizer()
    #text=word_tokenize(text)
    for word in text:
        word=lemmatizer.lemmatize(word,pos='v')
        word=lemmatizer.lemmatize(word,pos='n')
    
    return text

In [10]:
transformText("goud servic groop")

['good', 'service', 'group']

In [None]:
df['text'] = df['text'].map(transformText)
texts= df['text']
tags= df['label']

In [None]:
df=df[texts,tags]

In [None]:
# dictionary of lists  
dict = {'text': texts, 'label': tags } 
     
df = pd.DataFrame(dict) 
  
# saving the dataframe 
df.to_csv('C:/Users/khmar/git_repo/IssueModelTraining/DATA/DATA_text_preprocessing.csv')

In [11]:
## open data preprocessing saved 

In [12]:
DATA_FILE = 'C:/Users/khmar/git_repo/IssueModelTraining/DATA/DATA_text_preprocessing.csv'
df = pd.read_csv(DATA_FILE,delimiter=',',encoding='UTF-8')

In [13]:
msk = np.random.rand(len(df)) < 0.7 # Splitting into train(70%) and test(30%) randomly

In [14]:
train_df=df[msk]
test_df=df[~msk]

In [15]:
print(train_df.shape)
print(test_df.shape)

(2814, 3)
(1241, 3)


In [16]:
print('Checking target values for train data:\n')
print(train_df['label'].value_counts(),'\n')
print('Checking target values for test data:\n')
print(test_df['label'].value_counts())

Checking target values for train data:

NOTISSUE    1424
ISSUE       1390
Name: label, dtype: int64 

Checking target values for test data:

ISSUE       635
NOTISSUE    606
Name: label, dtype: int64


In [17]:
train_df['label'].value_counts()


NOTISSUE    1424
ISSUE       1390
Name: label, dtype: int64

In [18]:
x_train=train_df['text']
y_train=train_df['label']
x_test=test_df['text']
y_test=test_df['label']

In [19]:
texts_train = x_train.astype(str)
texts_test = x_test.astype(str)

In [20]:
MAX_NB_WORDS = 20000
maxlen = 120  # cut texts after this number of words (among top max_features most common words)
batch_size = 32

### Normal Embedding Method

In [21]:
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS, char_level=False)
tokenizer.fit_on_texts(texts_train)
sequences = tokenizer.texts_to_sequences(texts_train)
sequences_test = tokenizer.texts_to_sequences(texts_test)

In [22]:
#tok save
import pickle
tok_file = 'token_LSTM_Normal_Embedding_Methode_DATA.sav'
pickle.dump(tokenizer, open(tok_file, 'wb'))

In [23]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 3135 unique tokens.


In [24]:
MAX_SEQUENCE_LENGTH = 200
#pad sequences are used to bring all sentences to same size.
# pad sequences with 0s
x_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
x_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', x_train.shape)
print('Shape of data test tensor:', x_test.shape)

Shape of data tensor: (2814, 200)
Shape of data test tensor: (1241, 200)


In [25]:
y_train = y_train.map({"NOTISSUE": 1, "ISSUE" : 0 })
y_test = y_test.map({"NOTISSUE": 1, "ISSUE" : 0 })

In [26]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2,input_shape=(1,)))
model.add(Dense(1, activation='sigmoid'))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [27]:
%%time
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=2,
          validation_data=(x_test, y_test))

Instructions for updating:
Use tf.cast instead.
Train on 2814 samples, validate on 1241 samples
Epoch 1/2
Epoch 2/2
Wall time: 57.8 s


In [28]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 128)         2560000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 2,691,713
Trainable params: 2,691,713
Non-trainable params: 0
_________________________________________________________________


In [29]:
scores= model.evaluate(x_test, y_test, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[0], scores[0] * 100))
print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))

loss: 40.04%
acc: 83.24%


In [30]:
model.save("LSTM_normal_Embeddings_DATA_with_text_processing.sav")

### Prediction

In [31]:
#import pickle
#model_file='lstm_model_glove_100_DATA_without_text_processing.sav'
#token_file='C:/Users/khmar/model_file/LSTM_token_glove_300d_DATA.sav'
##model = pickle.load(open(model_file,'rb'))
#token = pickle.load(open(token_file,'rb'))
#from keras.models import load_model # To save and load model

## Load the model
##my_model = load_model(model_file)
#my_model= model_glove

In [79]:
def prediction(input_file,output_file):
    df= pd.DataFrame(columns=['label', 'text'])
    data = pd.read_csv(input_file,delimiter=',')
    inputt=data.text
    x_input = inputt.values
    for i in x_input:
        seq = tokenizer.texts_to_sequences(i)
        seqs = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
        yhat = model.predict(seqs)
        class_pred = model.predict_classes(seqs)
        print(class_pred[0][0])
        classe1 = prediction_classe1(class_pred[0][0])
        df=df.append({'label':classe1,'text':i},ignore_index=True)
    df.to_csv(output_file,sep=';')
    return data,df

In [85]:
def prediction_classe1(class_pred):
    x=""

    if class_pred == 1:
        x = 'ISSUE'
    if class_pred== 0:
        x = 'NOTISSUE'
    return x

In [86]:
def prediction_classe(prob):
    x=""
    if prob > 0.6:
          x = 'NOTISSUE'
    if prob <= 0.6:
          x = 'ISSUE'
    return x

In [87]:
## prediction of one word

In [88]:
seq= tokenizer.texts_to_sequences("bad service")
seqs = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
y = model.predict(seqs)
print(prediction_classe1(y[0][0]))





In [71]:
### prediction of all data
input_file ='C:/Users/khmar/git_repo/IssueModelTraining/LSTM/with_text_processing/DATA_preprocessing.csv'
output_file ='C:/Users/khmar/git_repo/IssueModelTraining/DATA/DATA_predict_text_preprocessing.csv'
#prediction(input_file,output_file,model,tokenizer)
prediction(input_file,output_file)

(      Unnamed: 0                                               text     label
 0              0  ['order', 'data', 'call', 'got', 'finish', 'wo...  NOTISSUE
 1              1                                  ['love', 'phone']  NOTISSUE
 2              2                              ['finish', 'product']  NOTISSUE
 3              3                                 ['not', 'happier']  NOTISSUE
 4              4         ['look', 'headset', 'long', 'time', 'got']  NOTISSUE
 5              5       ['headset', 'start', 'ring', 'no', 'reason']     ISSUE
 6              6     ['display', 'excel', 'camera', 'good', 'year']  NOTISSUE
 7              7                       ['battery', 'life', 'great']  NOTISSUE
 8              8                 ['worst', 'phone', 'had', 'month']     ISSUE
 9              9  ['not', 'good', 'item', 'work', 'start', 'have...     ISSUE
 10            10  ['embarrass', 'ear', 'hurt', 'try', 'push', 'e...     ISSUE
 11            11                       ['protect', 

In [72]:
data_predict= pd.read_csv(output_file ,delimiter=';',encoding='latin-1')
print(data_predict.head())

   Unnamed: 0     label                                               text
0           0  NOTISSUE  ['order', 'data', 'call', 'got', 'finish', 'wo...
1           1  NOTISSUE                                  ['love', 'phone']
2           2  NOTISSUE                              ['finish', 'product']
3           3  NOTISSUE                                 ['not', 'happier']
4           4  NOTISSUE         ['look', 'headset', 'long', 'time', 'got']


In [73]:
data_predict['label'].value_counts() #imbalanced Dataset

NOTISSUE    4055
Name: label, dtype: int64

In [None]:
### confessin matrix

In [54]:
#DATA_FILE_PREDICT = ''
#data_predict= pd.read_csv(DATA_FILE_PREDICT ,delimiter=';',encoding='latin-1')
#print(df.head())

In [55]:
data_predict['label'].value_counts() #imbalanced Dataset

ISSUE    4055
Name: label, dtype: int64

In [56]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(df.label,data_predict.label)
print(cm)

[[2025    0]
 [2030    0]]


In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import itertools
# This utility function is from the sklearn docs: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """

    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=30)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, fontsize=22)
    plt.yticks(tick_marks, classes, fontsize=22)

    fmt = '.2f'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label', fontsize=25)
    plt.xlabel('Predicted label', fontsize=25)

In [None]:
cnf_matrix = confusion_matrix(df.label, data_predict.label)
plt.figure(figsize=(24,20))
plot_confusion_matrix(cnf_matrix, classes=['ISSUE','NOTISSUE'], title="Confusion matrix")
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
plt.figure(figsize=(20,10))
ax = sns.heatmap(cf, annot=True, fmt="d",linewidths=.5, center = 90, vmax = 200)

In [None]:
from sklearn.metrics import classification_report, accuracy_score

print('accuracy %s' % accuracy_score(data_predict.label, df.label))
print(classification_report(df.label, data_predict.label,target_names=['ISSUE','NOTISSUE']))

In [None]:
def false(input_file,output_file,falses_file):
    diff= pd.DataFrame(columns=['Label1','Label2', 'Text'])
    data = pd.read_csv(input_file,delimiter=';')
    data_pred = pd.read_csv(output_file,delimiter=';')
    input1 = data.label
    input2 = data_pred.label
    x=data.text.values
    x_input1 = input1.values
    x_input2 = input2.values
    for i in range(len(x_input1)):
        if x_input1[i]!=x_input2[i]:
            diff = diff.append({'Label1':x_input1[i],'Label2':x_input2[i], 'Text':x[i]},ignore_index=True)
    diff.to_csv(falses_file,sep=';')
    return diff

In [None]:
falses_file='C:/Users/khmar/Desktop/ISSUE/dataset/CSV/DATA_pred_normal_embeddings.csv'
diff=false(input_file,output_file,falses_file)
print(diff)