In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn import preprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Input, Concatenate
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
from keras.utils import to_categorical
import re
import nltk
from nltk import RegexpTokenizer
from nltk.corpus import stopwords

In [5]:
# Fetch the NLTK stop-word corpus, then keep the English entries in a set
# so the per-word membership checks in preprocess() are cheap.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
# Separator characters that are replaced by a single space.
# Raw strings keep `\[`, `\]` and `\|` from being parsed as (invalid) string escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Everything except digits, lowercase ASCII letters, space, '#', '+' and '_' is deleted.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Dedicated name: the notebook later binds `tokenizer` to a Keras Tokenizer,
# which would break preprocess() if it looked up that shared name at call time.
WORD_TOKENIZER = RegexpTokenizer(r'\w+')


def preprocess(text):
  """Normalise one complaint narrative into a space-separated string of word tokens.

  Steps, in order:
    1. drop the literal '{html}' marker,
    2. lowercase,
    3. turn separator punctuation into spaces and delete other symbols,
    4. remove English stop words,
    5. keep only runs of word characters.

  Args:
    text: the raw narrative as a ``str``.

  Returns:
    The cleaned text with tokens joined by single spaces.
  """
  text = text.replace('{html}', "")  # strips the literal '{html}' placeholder only
  text = text.lower()
  text = REPLACE_BY_SPACE_RE.sub(' ', text)
  text = BAD_SYMBOLS_RE.sub('', text)
  # Stop words are matched after symbol removal, so e.g. "don't" has already become "dont".
  text = ' '.join(word for word in text.split() if word not in stop_words)
  tokens = WORD_TOKENIZER.tokenize(text)

  return " ".join(tokens)

In [None]:
# Load the complaints export; `raw_df` is kept untouched so this cell can be re-run safely.
raw_df = pd.read_csv('complaints.csv')

# NOTE(review): the public CFPB export appears to spell this header 'Sub-issue'
# (capital S) — confirm against your CSV. The exact name 'sub-issue' is preferred
# when present; otherwise a case-insensitive match is used. If neither exists the
# lookup below raises the same KeyError as before.
_columns_by_lower = {str(name).lower(): name for name in raw_df.columns}
if 'sub-issue' in raw_df.columns:
  sub_issue_col = 'sub-issue'
else:
  sub_issue_col = _columns_by_lower.get('sub-issue', 'sub-issue')

# Keep only rows where all three text fields are present (no in-place mutation).
df = raw_df.dropna(subset=['Consumer complaint narrative', 'Issue', sub_issue_col]).copy()

# input1: cleaned free-text narrative; input2 / input3: the issue and sub-issue text as-is.
df['input1'] = df['Consumer complaint narrative'].map(preprocess)
df['input2'] = df['Issue']
df['input3'] = df[sub_issue_col]
df.head()

In [None]:
# Assign an integer code to every distinct product label and build
# lookup tables in both directions (name -> id and id -> name).
product_codes = df['Product'].factorize()[0]
df['product_id'] = product_codes

product_id_df = (
    df[['Product', 'product_id']]
    .drop_duplicates()
    .sort_values('product_id')
)

name_then_id = product_id_df.values
id_then_name = product_id_df[['product_id', 'Product']].values
product_to_id = dict(name_then_id)
id_to_product = dict(id_then_name)

In [None]:
# Features: the three text columns. Target: the product label.
X = df[['input1', 'input2', 'input3']]
Y = df['Product']

# Map product names to consecutive integer class ids.
label_encoder = preprocessing.LabelEncoder()

# Encode the `Y` series (the lowercase name `y` does not exist before this line).
y = label_encoder.fit_transform(Y)

# 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode with an explicit width so the train and test matrices have the
# same number of columns even if one split happens to lack the highest class id.
num_classes = len(label_encoder.classes_)
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

In [None]:
MAX_NB_WORDS = 5000         # the Keras tokenizer only emits ids below this cap
MAX_SEQUENCE_LENGTH = 1000  # every sequence is padded/truncated to this length
EMBEDDING_DIM = 100


# One shared vocabulary for all three text inputs.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
# Join the columns with a space so the last word of one field is not fused
# with the first word of the next.
tokenizer.fit_on_texts(X['input1'] + ' ' + X['input2'] + ' ' + X['input3'])


def _encode_texts(texts):
  """Convert an iterable of strings to a padded 2-D array of word ids.

  Uses the module-level `tokenizer` and pads/truncates to MAX_SEQUENCE_LENGTH.
  """
  return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_SEQUENCE_LENGTH)


X1_train = _encode_texts(X_train['input1'])
X1_test = _encode_texts(X_test['input1'])

X2_train = _encode_texts(X_train['input2'])
X2_test = _encode_texts(X_test['input2'])

X3_train = _encode_texts(X_train['input3'])
X3_test = _encode_texts(X_test['input3'])

# All three inputs share the same tokenizer, hence the same word index.
word_index1 = word_index2 = word_index3 = tokenizer.word_index

# Ids produced by texts_to_sequences are < num_words, so the embedding table
# never needs more than MAX_NB_WORDS rows (id 0 is the padding value).
vocab_size1 = vocab_size2 = vocab_size3 = min(MAX_NB_WORDS, len(tokenizer.word_index) + 1)

# One integer-sequence input per text field.
input_1 = Input(shape=(MAX_SEQUENCE_LENGTH, ))
input_2 = Input(shape=(MAX_SEQUENCE_LENGTH, ))
input_3 = Input(shape=(MAX_SEQUENCE_LENGTH, ))

In [None]:
def _text_branch(sequence_input, vocab_size):
  """Build one Embedding -> SpatialDropout1D -> LSTM branch for a padded id sequence.

  Args:
    sequence_input: Keras Input tensor holding word ids.
    vocab_size: number of rows in the embedding table.

  Returns:
    The LSTM output tensor (100 units) for this branch.
  """
  embedded = Embedding(vocab_size, EMBEDDING_DIM, trainable=True)(sequence_input)
  dropped = SpatialDropout1D(0.2)(embedded)
  return LSTM(100, dropout=0.2, recurrent_dropout=0.2)(dropped)


# Three structurally identical branches, one per text input.
LSTM_layer1 = _text_branch(input_1, vocab_size1)
LSTM_layer2 = _text_branch(input_2, vocab_size2)
LSTM_layer3 = _text_branch(input_3, vocab_size3)

concat_layer = Concatenate()([LSTM_layer1, LSTM_layer2, LSTM_layer3])
dense_layer3 = Dense(50, activation='relu')(concat_layer)
dense_layer4 = Dense(10, activation='relu')(dense_layer3)

# The softmax width must equal the number of one-hot target columns,
# otherwise categorical_crossentropy fails on a shape mismatch.
num_classes = y_train.shape[1]
output = Dense(num_classes, activation='softmax')(dense_layer4)

model = Model(inputs=[input_1, input_2, input_3], outputs=output)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints by itself; wrapping it in print() would add a stray "None".
model.summary()

In [None]:
# Train for up to 5 epochs with 10% of the training data held out for validation;
# stop early once val_loss has failed to improve by at least 0.0001 for 3 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)
train_inputs = [X1_train, X2_train, X3_train]

history = model.fit(
    x=train_inputs,
    y=y_train,
    batch_size=32,
    epochs=5,
    verbose=1,
    validation_split=0.1,
    callbacks=[early_stopping],
)

In [None]:
# Evaluate on the held-out test split; `score` holds the loss followed by the
# compiled metric (accuracy).
test_inputs = [X1_test, X2_test, X3_test]
score = model.evaluate(x=test_inputs, y=y_test, verbose=1)

test_loss, test_accuracy = score[0], score[1]
print("Test Score:", test_loss)
print("Test Accuracy:", test_accuracy)

In [None]:
def _plot_history_metric(history, metric, title, ylabel):
  """Plot one training metric and its validation counterpart per epoch.

  Args:
    history: the History object returned by ``model.fit``.
    metric: key in ``history.history`` (its validation twin is ``'val_' + metric``).
    title: figure title.
    ylabel: y-axis label.
  """
  fig, ax = plt.subplots()
  ax.plot(history.history[metric])
  ax.plot(history.history['val_' + metric])
  ax.set_title(title)
  ax.set_ylabel(ylabel)
  ax.set_xlabel('epoch')
  # The second curve comes from fit()'s validation_split, not from the test set.
  ax.legend(['Train', 'Validation'], loc='upper left')
  plt.show()


# `plt` is already imported in the notebook's import cell.
_plot_history_metric(history, 'accuracy', 'Model Accuracy', 'Accuracy')
_plot_history_metric(history, 'loss', 'Model Loss', 'Loss')