# Word embeddings and RNNs

## 1. The data

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.models import Sequential 
from keras.layers import Flatten, Dense
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding, Dense, Dropout
from keras import preprocessing

# Load the complaints data set; the relative path means the CSV must sit in
# the notebook's working directory.
df = pd.read_csv('Bank_complaints.csv')
# Preview the first rows (rendered as this cell's output).
df.head()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Unnamed: 0,Product,Consumer complaint narrative
0,Student loan,In XX/XX/XXXX I filled out the Fedlaon applica...
1,Student loan,I am being contacted by a debt collector for p...
2,Student loan,I cosigned XXXX student loans at SallieMae for...
3,Student loan,Navient has sytematically and illegally failed...
4,Student loan,My wife became eligible for XXXX Loan Forgiven...


In [2]:
import random

# Seed Python's own RNG for any stdlib `random` use. pandas does not consult
# this seed, so the draw below is pinned explicitly with `random_state`;
# without it every fresh run would select a different 2000 rows.
random.seed(123)
df = df.sample(2000, random_state=123)

In [3]:
# Renumber the shuffled rows 0..n-1. Using reset_index avoids repeating the
# sample size (2000) here, which would raise a length-mismatch error as soon
# as the number of sampled rows is changed.
df = df.reset_index(drop=True)

In [4]:
# Column used as the class label; it is label-encoded further down.
product = df["Product"]

There are 7 types of complaints

In [5]:
product.value_counts()

Student loan                   403
Credit card                    340
Consumer Loan                  320
Mortgage                       265
Bank account or service        241
Credit reporting               226
Checking or savings account    205
Name: Product, dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
Product                         2000 non-null object
Consumer complaint narrative    2000 non-null object
dtypes: object(2)
memory usage: 31.3+ KB


In [7]:
# Free-text column; this is what gets tokenized into model input below.
complaints = df["Consumer complaint narrative"]

## 2. Classifying bank complaints using word embeddings

### 2.1 Training your own embedding layer

We're going to create our own embedding. Let's start with importing an embedding layer. 
- `input_dim`: The size of the vocabulary; we set `n_words` to 1000.
- `output_dim`: The size of the vector space of the embedding vector (100).
- `input_length`: This is the length of each input sequence, so it is equal to `max_sentence_length` in our example.

In [8]:
# Hyper-parameters shared by the embedding models in this section.
n_words = 1000             # vocabulary size (tokenizer cap and Embedding input_dim)
output_dim = 100           # length of each learned word vector
max_sentence_length = 200  # every complaint is padded/truncated to this many tokens

# Pass the named constant rather than repeating the literal 100, so that
# editing `output_dim` above actually changes the layer.
embedding_layer = Embedding(input_dim=n_words, output_dim=output_dim, input_length=max_sentence_length)

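Keep only the most common words (`num_words = n_words = 1000`) and encode each complaint as a sequence of integer word indices. The labels, not the words, are one-hot encoded further below.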

In [9]:
# `num_words` caps the vocabulary that is kept when texts are converted to sequences.
tokenizer = Tokenizer(num_words = n_words)

In [10]:
# Build the tokenizer's word index from the complaint texts.
tokenizer.fit_on_texts(complaints)

In [11]:
# Turn every complaint into a list of integer word indices.
complaints_encoded = tokenizer.texts_to_sequences(complaints)

In [12]:
# Pad (or truncate) every encoded complaint to exactly `max_sentence_length`
# entries so the result is one rectangular 2-D array; see the shape check
# further down, which reports (2000, 200).
seq_padded = preprocessing.sequence.pad_sequences(complaints_encoded, 
                                               maxlen=max_sentence_length)

In [13]:
# Import the encoder class directly. `from sklearn import preprocessing` would
# rebind the name `preprocessing`, which this notebook already uses for
# `keras.preprocessing`, and would break the padding cell on any re-run.
from sklearn.preprocessing import LabelEncoder

# Map product names to integer class ids, then one-hot encode them to match
# the softmax / categorical_crossentropy output used by the models below.
# (The bare `list(le.classes_)` expression was dropped: it sat mid-cell, so it
# displayed nothing and had no effect.)
le = LabelEncoder()
product_cat = le.fit_transform(product)
product_onehot = to_categorical(product_cat)

In [14]:
np.shape(product_onehot)

(2000, 7)

In [15]:
product_onehot

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [16]:
np.shape(seq_padded)

(2000, 200)

In [17]:
# Hold out 10% of the padded sequences (with their one-hot labels) for testing;
# the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    seq_padded,
    product_onehot,
    test_size=0.10,
    random_state=1234,
)

In [18]:
np.shape(x_train)

(1800, 200)

In [19]:
np.shape(x_test)

(200, 200)

In [20]:
np.shape(y_train)

(1800, 7)

In [21]:
np.shape(y_test)

(200, 7)

In [22]:
product_onehot.shape

(2000, 7)

In [23]:
from keras import regularizers

# Baseline classifier: trainable embedding -> flatten -> two dense layers ->
# 7-way softmax (one unit per product class).
model = Sequential([
    Embedding(n_words, output_dim, input_length=max_sentence_length),
    Flatten(),
    Dense(50, activation='relu'),
    Dense(25, activation='relu'),
    Dense(7, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 200, 100)          100000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 20000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                1000050   
_________________________________________________________________
dense_2 (Dense)              (None, 25)                1275      
_________________________________________________________________
dense_3 (Dense)              (None, 7)                 182       
Total params: 1,101,507
Trainable params: 1,101,507
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Keras carves 20% of the training rows off for per-epoch validation.
history = model.fit(
    x_train,
    y_train,
    epochs=50,
    batch_size=256,
    validation_split=0.2,
)

Train on 1440 samples, validate on 360 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [25]:
# Score the current `model` on the held-out split: the compiled loss first,
# then the 'acc' metric. verbose=0 suppresses the progress bar.
loss, accuracy = model.evaluate(x_test, y_test, verbose=0)

In [26]:
loss

3.0200108528137206

In [27]:
accuracy

0.385

### 2.2 Using dropout regularization

In [28]:
from keras import regularizers

# Same architecture as the baseline, with 30% dropout after each hidden dense
# layer to counter over-fitting.
model = Sequential([
    Embedding(n_words, output_dim, input_length=max_sentence_length),
    Flatten(),
    Dense(50, activation='relu'),
    Dropout(0.3),
    Dense(25, activation='relu'),
    Dropout(0.3),
    Dense(7, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)
model.summary()
history = model.fit(
    x_train,
    y_train,
    epochs=50,
    batch_size=256,
    validation_split=0.2,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 200, 100)          100000    
_________________________________________________________________
flatten_2 (Flatten)          (None, 20000)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 50)                1000050   
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 25)                1275      
_________________________________________________________________
dropout_2 (Dropout)          (None, 25)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 7)                 182       
Total para

In [29]:
# Score the current `model` on the held-out split: the compiled loss first,
# then the 'acc' metric.
loss, accuracy = model.evaluate(x_test, y_test, verbose=0)

In [30]:
loss

2.653766975402832

In [31]:
accuracy

0.41

### 2.3 A pre-trained embedding layer with a deeper network

In [32]:
# Parse the GloVe text file into {word: float32 vector}. Each line holds a
# token followed by its whitespace-separated vector components.
embed_index = {}
# Open with an explicit UTF-8 encoding, since the platform default codec may
# fail on non-ASCII tokens, and use a context manager so the handle is closed
# even if parsing raises part-way through.
with open('glove.6B.100d.txt', encoding='utf-8') as embedding_file:
    for line in embedding_file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embed_index[word] = coefs
len(embed_index)

400000

In [33]:
# Re-fit a tokenizer on the same texts, this time without a `num_words` cap.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(complaints)
# NOTE: this rebinds `n_words` (1000 up to here) to the full vocabulary size.
# The +1 presumably leaves room for index 0, which is used for padding —
# confirm against the Tokenizer documentation.
n_words = len(tokenizer.word_index) + 1

In [34]:
# Re-encode the complaints with the tokenizer currently bound to `tokenizer`;
# this overwrites the earlier `complaints_encoded`.
complaints_encoded = tokenizer.texts_to_sequences(complaints)

In [35]:
# Weight matrix for the Embedding layer: row i receives the GloVe vector of
# the word whose tokenizer index is i. Words that GloVe does not contain keep
# their all-zero row.
embedding_matrix = np.zeros((n_words, 100))
for token, row in tokenizer.word_index.items():
    glove_vector = embed_index.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row] = glove_vector

In [36]:
embedding_matrix.shape

(10555, 100)

In [37]:
max_sentence_length = 200
from keras.preprocessing.sequence import pad_sequences

# Pad/truncate the re-encoded complaints to `max_sentence_length` entries.
# NOTE(review): the fit/evaluate calls that follow in this notebook consume
# x_train / x_test (split from `seq_padded`), not `padded_complaints` —
# confirm whether this array was meant to be split and used instead.
padded_complaints= pad_sequences(complaints_encoded, maxlen=max_sentence_length)

In [38]:
# Dense classifier whose embedding layer is initialised from the GloVe weight
# matrix and frozen (trainable=False), so only the dense layers are learned.
model = Sequential()
# Take the layer dimensions from the objects they must agree with instead of
# repeating the literals 100 and 200.
model.add(Embedding(n_words, embedding_matrix.shape[1], weights=[embedding_matrix],
                    input_length=max_sentence_length, trainable=False))
model.add(Flatten())
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(25, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(7, activation='softmax'))
# compile the model
model.compile(optimizer='RMSprop', loss='categorical_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()
# NOTE(review): x_train was encoded with the earlier tokenizer capped at 1000
# words, so only embedding rows below index 1000 are ever looked up here —
# confirm that is intended.
history = model.fit(x_train, y_train, epochs=100, batch_size=256, validation_split=0.2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 200, 100)          1055500   
_________________________________________________________________
flatten_3 (Flatten)          (None, 20000)             0         
_________________________________________________________________
dense_7 (Dense)              (None, 50)                1000050   
_________________________________________________________________
dropout_3 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 25)                1275      
_________________________________________________________________
dropout_4 (Dropout)          (None, 25)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 7)                 182       
Total para

Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [39]:
# Evaluate the current `model` on the held-out split: the compiled loss first,
# then the 'acc' metric.
loss, accuracy = model.evaluate(x_test, y_test, verbose=0)

In [40]:
loss

2.8106768894195557

In [41]:
accuracy

0.31

## 3. Classifying bank complaints using word embeddings and an RNN

### 3.1 A naive recurrent neural network

In [49]:
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN

# Reset the hyper-parameters: `n_words` was rebound to the full vocabulary
# size in the GloVe section, while x_train only contains indices below 1000.
n_words = 1000
output_dim = 100
max_sentence_length = 200 

# Trainable embedding followed by a single recurrent layer; the classifier
# head is appended in the next cell.
model = Sequential()
# Pass `output_dim` rather than repeating the literal 100, so the constant
# defined above is not silently ignored.
model.add(Embedding(input_dim=n_words, output_dim=output_dim, input_length=max_sentence_length))
model.add(SimpleRNN(100))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 200, 100)          100000    
_________________________________________________________________
simple_rnn_3 (SimpleRNN)     (None, 100)               20100     
Total params: 120,100
Trainable params: 120,100
Non-trainable params: 0
_________________________________________________________________


In [50]:
# Append the 7-way softmax head to the model built in the previous cell, then
# compile it.
# NOTE: this cell mutates `model` in place; running it a second time stacks
# another Dense layer, so rebuild the model before re-running.
model.add(Dense(7, activation='softmax'))
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 200, 100)          100000    
_________________________________________________________________
simple_rnn_3 (SimpleRNN)     (None, 100)               20100     
_________________________________________________________________
dense_12 (Dense)             (None, 7)                 707       
Total params: 120,807
Trainable params: 120,807
Non-trainable params: 0
_________________________________________________________________


In [52]:
# Train the recurrent model; 20% of the training rows are used for validation.
history = model.fit(
    x_train,
    y_train,
    epochs=100,
    batch_size=128,
    validation_split=0.2,
)

Train on 1440 samples, validate on 360 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100


Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [55]:
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN

n_words = 1000
output_dim = 100
max_sentence_length = 200 

# Three stacked recurrent layers. Every layer except the last returns the
# full sequence (return_sequences=True) so the next one receives one vector
# per time step.
model = Sequential()
# Pass `output_dim` rather than repeating the literal 100, so the constant
# defined above is not silently ignored.
model.add(Embedding(input_dim=n_words, output_dim=output_dim, input_length=max_sentence_length))
model.add(SimpleRNN(100, return_sequences=True))
model.add(SimpleRNN(100, return_sequences=True))
model.add(SimpleRNN(100))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 200, 100)          100000    
_________________________________________________________________
simple_rnn_8 (SimpleRNN)     (None, 200, 100)          20100     
_________________________________________________________________
simple_rnn_9 (SimpleRNN)     (None, 200, 100)          20100     
_________________________________________________________________
simple_rnn_10 (SimpleRNN)    (None, 100)               20100     
Total params: 160,300
Trainable params: 160,300
Non-trainable params: 0
_________________________________________________________________


In [56]:
# Append the 7-way softmax head to the stacked model built in the previous
# cell, then compile it.
# NOTE: this cell mutates `model` in place; running it a second time stacks
# another Dense layer, so rebuild the model before re-running.
model.add(Dense(7, activation='softmax'))
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 200, 100)          100000    
_________________________________________________________________
simple_rnn_8 (SimpleRNN)     (None, 200, 100)          20100     
_________________________________________________________________
simple_rnn_9 (SimpleRNN)     (None, 200, 100)          20100     
_________________________________________________________________
simple_rnn_10 (SimpleRNN)    (None, 100)               20100     
_________________________________________________________________
dense_13 (Dense)             (None, 7)                 707       
Total params: 161,007
Trainable params: 161,007
Non-trainable params: 0
_________________________________________________________________


In [58]:
# Train the stacked recurrent model; 20% of the training rows are used for validation.
history = model.fit(
    x_train,
    y_train,
    epochs=30,
    batch_size=128,
    validation_split=0.2,
)

Train on 1440 samples, validate on 360 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


Very poor result! A random classifier would perform similarly!

### 3.2 LSTM

In [61]:
from keras.layers import LSTM

# Trainable embedding -> one LSTM layer -> 7-way softmax head.
model = Sequential([
    Embedding(input_dim=n_words, output_dim=100, input_length=max_sentence_length),
    LSTM(100),
    Dense(7, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)
history = model.fit(
    x_train,
    y_train,
    epochs=80,
    batch_size=128,
    validation_split=0.2,
)

Train on 1440 samples, validate on 360 samples
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80


Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


In [62]:
# Score the current `model` on the held-out split: the compiled loss first,
# then the 'acc' metric.
loss, accuracy = model.evaluate(x_test, y_test, verbose=0)

In [63]:
loss

2.17295569896698

In [64]:
accuracy

0.535

### 3.3 Stacked LSTM and GRU

In [76]:
from keras.layers import LSTM

# Three stacked LSTM layers; every layer except the last returns the full
# sequence so the next one receives one vector per time step.
model = Sequential([
    Embedding(input_dim=n_words, output_dim=100, input_length=max_sentence_length),
    LSTM(100, return_sequences=True),
    LSTM(100, return_sequences=True),
    LSTM(100),
    Dense(7, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)
history = model.fit(
    x_train,
    y_train,
    epochs=40,
    batch_size=128,
    validation_split=0.2,
)

Train on 1440 samples, validate on 360 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [68]:
# Score the current `model` on the held-out split: the compiled loss first,
# then the 'acc' metric.
# NOTE(review): this cell's execution count (68) is lower than that of the
# training cell above it (76), so the saved numbers may come from an earlier
# run — re-run top to bottom to confirm.
loss, accuracy = model.evaluate(x_test, y_test, verbose=0)

In [69]:
loss

3.1009898948669434

In [70]:
accuracy

0.46

In [72]:
from keras.layers import GRU

# Trainable embedding -> one GRU layer -> 7-way softmax head.
model = Sequential([
    Embedding(input_dim=n_words, output_dim=100, input_length=max_sentence_length),
    GRU(100),
    Dense(7, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)
history = model.fit(
    x_train,
    y_train,
    epochs=50,
    batch_size=128,
    validation_split=0.2,
)

Train on 1440 samples, validate on 360 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [73]:
# Score the current `model` on the held-out split: the compiled loss first,
# then the 'acc' metric.
loss, accuracy = model.evaluate(x_test, y_test, verbose=0)

In [74]:
loss

2.942488350868225

In [75]:
accuracy

0.455

# SOURCES

https://github.com/susanli2016/Machine-Learning-with-Python/blob/master/Consumer_complaints.ipynb

https://catalog.data.gov/dataset/consumer-complaint-database