In [1]:
import sys
import os
import numpy as np
import pandas as pd
import re
import itertools
import tensorflow as tf
import string
from io import BytesIO
from tensorflow.contrib import learn
from collections import Counter
from time import time
import datetime
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Read the input dataset 
d = pd.read_csv("./consumer_complaints.csv", 
                usecols=('product','consumer_complaint_narrative'),
                dtype={'consumer_complaint_narrative': object})
# Only interested in data with consumer complaints
d=d[d['consumer_complaint_narrative'].notnull()]
d=d[d['product'].notnull()]
d.reset_index(drop=True,inplace=True)

In [3]:
# Let's see what's in the data 
print ("Data dimensions:", d.shape)
print (d.head())

# Let's see a table of how many examples we have of each product
print ("\nList of Products       Occurrences\n")
print (d["product"].value_counts())

Data dimensions: (66806, 2)
           product                       consumer_complaint_narrative
0  Debt collection  XXXX has claimed I owe them {$27.00} for XXXX ...
1    Consumer Loan  Due to inconsistencies in the amount owed that...
2         Mortgage  In XX/XX/XXXX my wages that I earned at my job...
3         Mortgage  I have an open and current mortgage with Chase...
4         Mortgage  XXXX was submitted XX/XX/XXXX. At the time I s...

List of Products       Occurrences

Debt collection            17552
Mortgage                   14919
Credit reporting           12526
Credit card                 7929
Bank account or service     5711
Consumer Loan               3678
Student loan                2128
Prepaid card                 861
Payday loan                  726
Money transfers              666
Other financial service      110
Name: product, dtype: int64


In [4]:
def clean_str(string):
    """
    Tokenization/string cleaning (partially modified)
    """
    string = re.sub(r"[^A-Za-z0-9()!?\'\`%$]", " ", string) # keep also %$ but removed comma
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\$", " $ ", string) #yes, isolate $
    string = re.sub(r"\%", " % ", string) #yes, isolate %
    string = re.sub(r"\s{2,}", " ", string)
    
    # fixing XXX and xxx like as word
    string = re.sub(r'\S*(x{2,}|X{2,})\S*',"xxx",string)
    # removing non ascii
    string = re.sub(r'[^\x00-\x7F]+', "", string) 
    
    return string.strip().lower()

In [5]:
word_data=[]
t0 = time()

for message in d['consumer_complaint_narrative']:
    word_data.append(clean_str(message))

# With a MacBook Pro (Late 2011)
# 2.4 GHz Intel Core i5, 4 GB 1333 MHz DDR3
print ("\nCleaning time: mine = 41.8 s, here =", round(time()-t0, 1), "s")


Cleaning time: mine = 41.8 s, here = 36.1 s


In [6]:
print(word_data[0])

xxx has claimed i owe them $ 27 00 for xxx years despite the proof of payment i sent them canceled check and their ownpaid invoice for $ 27 00 ! they continue to insist i owe them and collection agencies are after me how can i stop this harassment for a bill i already paid four years ago ?


In [15]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

MAX_SEQUENCE_LENGTH = 80
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 100 

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(word_data)
sequencestr1 = tokenizer.texts_to_sequences(word_data)

#print(tokenizer)
#print(sequences)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

datatr1 = pad_sequences(sequencestr1, maxlen=MAX_SEQUENCE_LENGTH)


print('Shape of data tensor:', datatr1.shape)
print(datatr1[0])


Found 52942 unique tokens.
Shape of data tensor: (66806, 80)
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     1    43   760     3   213    29  2699    27    16     1   107   621
     2   256     8    35     3    82    29  1328   124     5    55 26521
  1645    16  2699    27    11   335     4  2421     3   213    29     5
   108   352    36    66    18   145    77     3   304    17   984    16
     6   162     3   296    83  1306   107   295]


In [18]:
import os

embeddings_index = {}
f = open(os.path.join(r'C:\Users\HuaSheng\Desktop\reddragonai\dl_dev_course-master\redaicse\LSTMProject', 'glove6B100d.txt'), encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
print(embedding_matrix.shape)

from keras.layers import Embedding

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

Found 400000 word vectors.
(52943, 100)


In [32]:
tgtDict = {u:v for v,u in enumerate(d["product"].unique())}
print(tgtDict)
trainLabel = np.array([ tgtDict[i] for i in d["product"] ])

{'Debt collection': 0, 'Mortgage': 2, 'Consumer Loan': 1, 'Bank account or service': 6, 'Payday loan': 7, 'Credit reporting': 4, 'Money transfers': 8, 'Other financial service': 9, 'Credit card': 3, 'Student loan': 5, 'Prepaid card': 10}


In [33]:
from keras.utils import to_categorical
print(pdtClass)
trainLabel = to_categorical(trainLabel,11)
trainLabel[:10]

[0 1 2 ..., 7 2 2]


array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [None]:
from keras.models import load_model,Model
from keras.layers import *
import keras

biLSTM1 = Bidirectional(LSTM(50,return_sequences=True))
biLSTM2 = Bidirectional(LSTM(50))
dense1 =  Dense(128, activation='relu')
dense2 =  Dense(128, activation='relu')
def processBlk(sequenceInp):
    embedded_sequences = embedding_layer(sequenceInp)
    x = biLSTM1(embedded_sequences)
    x = Dropout(0.2)(x)
    x = biLSTM2(x)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)
    x = dense1(x)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)
    x = dense2(x)
    return(x)

sequence_inputtr1 = Input(shape=(MAX_SEQUENCE_LENGTH,),name = 'tr1Inp', dtype='int32')
x1 = processBlk(sequence_inputtr1)


preds = Dense(11, activation='softmax')(x1)


model = Model(sequence_inputtr1, preds)
model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])
model.fit(datatr1, trainLabel, validation_split=0.1,shuffle=True,
          epochs=5, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
tr1Inp (InputLayer)          (None, 80)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 80, 100)           5294300   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 80, 100)           60400     
_________________________________________________________________
dropout_7 (Dropout)          (None, 80, 100)           0         
_________________________________________________________________
bidirectional_6 (Bidirection (None, 100)               60400     
_________________________________________________________________
batch_normalization_5 (Batch (None, 100)               400       
_________________________________________________________________
dropout_8 (Dropout)          (None, 100)               0         
__________