In [5]:
import pandas as pd
import re
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Bidirectional, Embedding, SpatialDropout1D
import matplotlib.pyplot as plt
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lavanyaseetharaman/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
complaints = pd.read_csv('complaints.csv', index_col=False, dtype='unicode')
complaints = complaints[['Consumer complaint narrative', 'Product']]
complaints.dropna(inplace=True)

In [10]:
complaints.head(20)

Unnamed: 0,Consumer complaint narrative,Product
3,Wells Fargo closed my credit card account with...,Credit card or prepaid card
51,My account is set up to limit communications t...,Checking or savings account
69,. I received a text stating a fraudulent charg...,"Money transfer, virtual currency, or money ser..."
70,I mailed a check to my granddaughter from Flor...,Checking or savings account
113,PayPal Credit in coordination with Synchrony B...,Credit card or prepaid card
134,Hard inquiries are on my credit report that I ...,"Credit reporting, credit repair services, or o..."
172,My minimum payment for my Fortiva Retail Credi...,Credit card or prepaid card
292,I have filed numerous complaints with the 3 cr...,"Credit reporting, credit repair services, or o..."
310,I noticed a negative report added to my credit...,Debt collection
347,I want to inform the Credit Bureaus AGAIN that...,"Credit reporting, credit repair services, or o..."


In [11]:
complaints

Unnamed: 0,Consumer complaint narrative,Product
3,Wells Fargo closed my credit card account with...,Credit card or prepaid card
51,My account is set up to limit communications t...,Checking or savings account
69,. I received a text stating a fraudulent charg...,"Money transfer, virtual currency, or money ser..."
70,I mailed a check to my granddaughter from Flor...,Checking or savings account
113,PayPal Credit in coordination with Synchrony B...,Credit card or prepaid card
...,...,...
3586722,Can you let me know why after i paid my $ XXXX...,Mortgage
3586766,"On XX/XX/XXXX Tuesday, after I switched my cho...",Mortgage
3586794,"After falling behind on our mortgage in 2011, ...",Mortgage
3586795,"Hello, I opened a CitiGold checking account wi...",Bank account or service


In [12]:
symbols_regex = re.compile('[/(){}\[\]\|@,;]')
bad_symbols_regex = re.compile('[^0-9a-z #+_]')

def clean_text(text):
    text = text.replace('\d+', '')
    text = text.lower()
    text = symbols_regex.sub(' ', text)
    text = bad_symbols_regex.sub('', text)
    text = text.replace('x', '')
    return text

complaints['Consumer complaint narrative'] = complaints['Consumer complaint narrative'].apply(clean_text)

In [13]:
complaints['Product'].value_counts().sort_values(ascending=False)

Credit reporting, credit repair services, or other personal consumer reports    642735
Debt collection                                                                 205604
Mortgage                                                                        103889
Credit card or prepaid card                                                      93321
Checking or savings account                                                      66387
Student loan                                                                     34748
Credit reporting                                                                 31587
Money transfer, virtual currency, or money service                               30269
Vehicle loan or lease                                                            23114
Credit card                                                                      18838
Payday loan, title loan, or personal loan                                        15217
Bank account or service                    

In [32]:
vectorize_layer = tf.keras.layers.TextVectorization(standardize='lower_and_strip_punctuation',max_tokens=5000,output_mode='int',output_sequence_length=512)

vectorize_layer.adapt(complaints['Consumer complaint narrative'], batch_size=None)

X_train_padded = vectorize_layer(complaints['Consumer complaint narrative'])
X_train_padded = X_train_padded.numpy()
X= X_train_padded

In [30]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [27]:
le = LabelEncoder()
complaints['Product']= le.fit_transform(complaints['Product'])
y = complaints['Product']

In [33]:
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size = 0.3 , random_state = 42)

Create LSTM Network

In [34]:
classifier = Sequential()
classifier.add(Embedding(5000, 100, input_length=X.shape[1]))
classifier.add(SpatialDropout1D(0.2))
classifier.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
classifier.add(Dense(17, activation='softmax'))

In [36]:
classifier.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 512, 100)          500000    
                                                                 
 spatial_dropout1d (SpatialD  (None, 512, 100)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 17)                1717      
                                                                 
Total params: 582,117
Trainable params: 582,117
Non-trainable params: 0
_________________________________________________________________


In [39]:
lr_schedule = keras.callbacks.LearningRateScheduler(lambda epoch:1e-7 * 10**(epoch/20))
opt = tf.keras.optimizers.Adam(learning_rate=1e-7)
classifier.compile(optimizer=opt, loss='mse', metrics=['mae', 'mape'])
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', mode='min', patience=20)
mc = tf.keras.callbacks.ModelCheckpoint('best_lstm_mode.h5', monitor='loss', mode='min', verbose=0 , save_best_only=True)
hist = classifier.fit(X_train, y_train, epochs=1, batch_size=32, callbacks=[mc, lr_schedule, early_stopping])



In [40]:
prediction = classifier.predict(X_train)



In [41]:
prediction

array([[0.05888884, 0.05884019, 0.05888471, ..., 0.05884019, 0.05882064,
        0.05887364],
       [0.05888884, 0.05884019, 0.05888471, ..., 0.05884019, 0.05882064,
        0.05887364],
       [0.05888884, 0.05884019, 0.05888471, ..., 0.05884019, 0.05882064,
        0.05887364],
       ...,
       [0.05773179, 0.05920929, 0.05907447, ..., 0.0589638 , 0.05794427,
        0.05898903],
       [0.05888884, 0.05884019, 0.05888471, ..., 0.05884019, 0.05882064,
        0.05887364],
       [0.05888884, 0.05884019, 0.05888471, ..., 0.05884019, 0.05882064,
        0.05887364]], dtype=float32)

In [42]:
print(hist.history.keys())

dict_keys(['loss', 'mae', 'mape', 'lr'])


In [44]:
classifier.evaluate(X_test,y_test)



[49.47797775268555, 6.42350959777832, 676049.4375]