# Text classification using deep learning models

Dataset: https://www.kaggle.com/vikram92/multiclass-complaints-classification-using-bi-lstm

This notebook studies text classification with the following deep learning models:
<ul>
<li>CNN model</li>
<li>LSTM model</li>
<li>Bidirectional LSTM model</li>
</ul>
We then compare their performance.

In [1]:
import pandas as pd
import numpy as np
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, f1_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout, Embedding
from tensorflow.keras.layers import Conv1D, LSTM, Bidirectional
from tensorflow.keras.layers import MaxPooling1D, BatchNormalization
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt

Load the data from the CSV file

In [2]:
# Read the complaints CSV, using its first column as the DataFrame index.
df = pd.read_csv('complaints_processed.csv', index_col=0)

In [3]:
# Preview the first rows (columns: product, narrative).
df.head()

Unnamed: 0,product,narrative
0,credit_card,purchase order day shipping amount receive pro...
1,credit_card,forwarded message date tue subject please inve...
2,retail_banking,forwarded message cc sent friday pdt subject f...
3,credit_reporting,payment history missing credit report speciali...
4,credit_reporting,payment history missing credit report made mis...


Encode the class labels as integers

In [4]:
# Replace the product names with integer class ids; `le` is kept so the
# ids can be mapped back to names later.
le = LabelEncoder()
df['product'] = le.fit_transform(df['product'])

In [5]:
# Counted after label encoding; used as the width of the softmax output layer.
num_class = len(df['product'].unique()) # Number of classes

Clean the text

In [6]:
def clean_doc(data):
    """Normalise a pandas Series of raw text before tokenization.

    Steps, in order: lower-case the text, replace every digit with a
    space, remove ASCII punctuation, and turn missing values into ''.

    Parameters
    ----------
    data : pandas.Series
        Text column to clean; missing entries (NaN/None) are allowed.

    Returns
    -------
    pandas.Series
        Cleaned text with the same index as ``data``.
    """
    import re  # local import: only needed to escape the punctuation class

    # Escape the punctuation so characters such as ']' and '^' are matched
    # literally inside the character class.
    punctuation_class = '[{}]'.format(re.escape(string.punctuation))

    data = data.str.lower()
    # regex=True is spelled out: pandas >= 2.0 defaults to literal matching,
    # which would silently leave digits and punctuation untouched.
    data = data.str.replace(r'\d', ' ', regex=True)
    data = data.str.replace(punctuation_class, '', regex=True)
    # Missing values propagate through the .str calls; blank them last.
    return data.fillna('')

In [7]:
# Overwrite the narratives with their cleaned form (see clean_doc).
df['narrative'] = clean_doc(df['narrative'])

In [8]:
# Parameters
num_words = 10000           # `num_words` passed to the Tokenizer
max_features = num_words    # input dim for the Embedding layer
max_length = 200            # `maxlen` passed to pad_sequences / Embedding input length
embedding_dim = 64          # output dim for the Embedding layer; also reused as Conv1D filter count and LSTM/Dense width

In [9]:
X = df['narrative'].tolist() # cleaned texts as a plain Python list
y = df['product'].values     # integer class ids produced by the LabelEncoder

Tokenize the documents and pad them into fixed-length sequences

In [10]:
# NOTE(review): the tokenizer is fitted on the full corpus before the
# train/test split below, so the vocabulary also reflects the test texts.
# lower=False presumably because clean_doc already lower-cases the text.
tokenizer = Tokenizer(num_words=num_words, lower=False) # tokenize the texts
tokenizer.fit_on_texts(X)
docs = tokenizer.texts_to_sequences(X) # texts converted to integer sequences

# X is rebound from the list of texts to the padded integer matrix.
X = pad_sequences(docs, maxlen=max_length, padding='post') # padding the sequences

Split the data into training and test sets

In [11]:
# Hold out 20% of the samples for testing; the fixed seed makes the split
# reproducible across runs.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

In [12]:
# Keep the integer class ids: the sklearn metrics are computed on them.
train_label = train_y
test_label = test_y
# One-hot encode for the categorical cross-entropy loss. The width is fixed
# to the known class count so both splits share the same shape even if one
# of them happens to lack the highest-numbered class.
train_y = to_categorical(train_y, num_classes=num_class)
test_y = to_categorical(test_y, num_classes=num_class)

<b>Deep learning models</b>

In [13]:
class CNN_model():
    '''
      1-D convolutional text classifier.

      Parameters
      ----------
      max_features : int
          Input dimension of the Embedding layer.
      max_length : int
          Length of each input sequence.
      embedding_dim : int
          Embedding output dimension; also used as the number of Conv1D
          filters and the width of the hidden Dense layer.
      num_class : int
          Number of output classes (units of the softmax layer).
    '''
    def __init__(self, max_features, max_length, embedding_dim, num_class):
        self.max_features = max_features
        self.max_length = max_length
        self.embedding_dim = embedding_dim
        self.num_class = num_class

    def model(self):
        '''Build and return the uncompiled Sequential network.'''
        # Hyper-parameters are read from the instance rather than from
        # module-level globals, so the constructor arguments take effect.
        model = Sequential()
        model.add(Embedding(self.max_features, self.embedding_dim, input_length=self.max_length))
        model.add(Conv1D(filters=self.embedding_dim, kernel_size=8, padding="valid", activation='relu', strides=3))
        model.add(MaxPooling1D(pool_size=8))
        model.add(BatchNormalization())
        model.add(Dropout(0.1))
        model.add(Flatten())
        model.add(Dense(self.embedding_dim, activation='relu'))
        model.add(Dropout(0.1))
        model.add(Dense(self.num_class, activation='softmax'))
        return model

In [14]:
class LSTM_model(CNN_model):
    '''
      Two stacked LSTM layers followed by a Dense classifier head.

      Takes no constructor arguments: the hyper-parameters come from the
      notebook-level settings and are stored on the instance by the
      parent constructor.
    '''

    def __init__(self):
        super().__init__(max_features, max_length, embedding_dim, num_class)

    def model(self):
        '''Build and return the uncompiled Sequential network.'''
        # Use the values stored on the instance so the network always
        # matches what the constructor recorded.
        model = Sequential()
        model.add(Embedding(self.max_features, self.embedding_dim, input_length=self.max_length))
        # The first LSTM returns the full sequence so the second can consume it.
        model.add(LSTM(self.embedding_dim, return_sequences=True))
        model.add(LSTM(self.embedding_dim))
        model.add(Dense(self.embedding_dim, activation='relu'))
        model.add(Dense(self.num_class, activation='softmax'))
        return model

In [15]:
class BidirectLSTM_model(LSTM_model):
    '''
      Two stacked bidirectional LSTM layers followed by a Dense classifier head.

      The argument-free constructor is inherited from LSTM_model.
    '''

    def model(self):
        '''Build and return the uncompiled Sequential network.'''
        # Use the values stored on the instance rather than module globals.
        model = Sequential()
        model.add(Embedding(self.max_features, self.embedding_dim, input_length=self.max_length))
        # Each direction has a fixed 64 units, independent of embedding_dim.
        model.add(Bidirectional(LSTM(64, return_sequences=True)))
        model.add(Bidirectional(LSTM(64)))
        model.add(Dense(self.embedding_dim, activation='relu'))
        model.add(Dense(self.num_class, activation='softmax'))
        return model

# CNN model

In [16]:
# Build the (not yet compiled) CNN from the notebook-level hyper-parameters.
cnn = CNN_model(max_features, max_length, embedding_dim, num_class).model()

In [17]:
# Print the layer-by-layer output shapes and parameter counts.
cnn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 64)           640000    
_________________________________________________________________
conv1d (Conv1D)              (None, 65, 64)            32832     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 8, 64)             0         
_________________________________________________________________
batch_normalization (BatchNo (None, 8, 64)             256       
_________________________________________________________________
dropout (Dropout)            (None, 8, 64)             0         
_________________________________________________________________
flatten (Flatten)            (None, 512)               0         
_________________________________________________________________
dense (Dense)                (None, 64)                3

In [18]:
# Categorical cross-entropy matches the one-hot labels; Adam is used with
# its default settings and accuracy is tracked during training.
cnn.compile(loss=CategoricalCrossentropy(), optimizer=Adam(), metrics=['accuracy'])

In [19]:
# Train for 6 epochs, holding out 10% of the training data for validation.
cnn.fit(train_X, train_y, epochs=6, validation_split=0.1, verbose=1)

Train on 116942 samples, validate on 12994 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<tensorflow.python.keras.callbacks.History at 0x130002db5c8>

<b>Performance</b>

In [20]:
def performance_metrics(truedata, prediction, avg='micro'):
    """Print and return four classification scores.

    Parameters
    ----------
    truedata : ground-truth class labels.
    prediction : predicted class labels.
    avg : averaging mode forwarded as ``average`` to the F1, precision
        and recall scorers (balanced accuracy does not take it).

    Returns
    -------
    tuple
        ``(balanced_accuracy, f1, precision, recall)``, unrounded.
    """
    # The header is printed before any score is computed.
    print('Metrics')
    print('====================================')

    scores = (
        balanced_accuracy_score(truedata, prediction),
        f1_score(truedata, prediction, average=avg),
        precision_score(truedata, prediction, average=avg),
        recall_score(truedata, prediction, average=avg),
    )
    captions = (
        'Balanced accuracy: ',
        'F1 score : ',
        'Precision score : ',
        'Recall score : ',
    )
    # Only the printed values are rounded; the returned ones are not.
    for caption, score in zip(captions, scores):
        print(caption, round(score, 3))
    return scores

In [21]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6. The arg-max of
# the softmax output yields the same integer class ids on every version.
pred = np.argmax(cnn.predict(test_X), axis=-1)

In [22]:
# Start the results table: each model name maps to its tuple of metrics.
performance = {
    'CNN': performance_metrics(test_label, pred, avg='micro'),
}

Metrics
Balanced accuracy:  0.807
F1 score :  0.868
Precision score :  0.868
Recall score :  0.868


# LSTM model

In [23]:
# Build the (not yet compiled) stacked-LSTM network.
lstm = LSTM_model().model()

In [24]:
# Print the layer-by-layer output shapes and parameter counts.
lstm.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 64)           640000    
_________________________________________________________________
lstm (LSTM)                  (None, 200, 64)           33024     
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 325       
Total params: 710,533
Trainable params: 710,533
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Same loss, optimizer and metric as the CNN so the models are comparable.
lstm.compile(loss=CategoricalCrossentropy(), optimizer=Adam(), metrics=['accuracy'])

In [26]:
# Train for 6 epochs, holding out 10% of the training data for validation.
lstm.fit(train_X, train_y, epochs=6, validation_split=0.1, verbose=1)

Train on 116942 samples, validate on 12994 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<tensorflow.python.keras.callbacks.History at 0x13002521588>

In [27]:
# Score the held-out test set; the result is [loss, accuracy].
lstm.evaluate(test_X, test_y)



[0.3823408680156215, 0.8702786]

<b>Performance</b>

In [28]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6. The arg-max of
# the softmax output yields the same integer class ids on every version.
pred = np.argmax(lstm.predict(test_X), axis=-1)
performance['LSTM'] = performance_metrics(test_label, pred, avg='micro')

Metrics
Balanced accuracy:  0.814
F1 score :  0.87
Precision score :  0.87
Recall score :  0.87


# Bidirectional LSTM

In [29]:
# Build the (not yet compiled) bidirectional-LSTM network.
bilstm = BidirectLSTM_model().model()

In [39]:
# Print the layer-by-layer output shapes and parameter counts.
bilstm.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 200, 64)           640000    
_________________________________________________________________
bidirectional (Bidirectional (None, 200, 128)          66048     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_4 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 325       
Total params: 813,445
Trainable params: 813,445
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Same loss, optimizer and metric as the other models so they are comparable.
bilstm.compile(loss=CategoricalCrossentropy(), optimizer=Adam(), metrics=['accuracy'])

In [31]:
# Train for 6 epochs, holding out 10% of the training data for validation.
bilstm.fit(train_X, train_y, epochs=6, validation_split=0.1, verbose=1)

Train on 116942 samples, validate on 12994 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<tensorflow.python.keras.callbacks.History at 0x130107c6908>

<b>Performance</b>

In [32]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6. The arg-max of
# the softmax output yields the same integer class ids on every version.
pred = np.argmax(bilstm.predict(test_X), axis=-1)
performance['BidirectLSTM'] = performance_metrics(test_label, pred, avg='micro')

Metrics
Balanced accuracy:  0.839
F1 score :  0.878
Precision score :  0.878
Recall score :  0.878


# Comparison of performance among the models

In [33]:
# Turn the {model name: metric tuple} dict into a table with one column per
# model. Note that `performance` is rebound from a dict to a DataFrame here.
performance = pd.DataFrame.from_dict(performance)
# Row labels follow the order of the tuple returned by performance_metrics.
performance.index = ['Balanced Acc.', 'F1', 'Precision', 'Recall']

In [38]:
# Transpose so each model is a row, and round to three decimals for display.
performance.T.round(3)

Unnamed: 0,Balanced Acc.,F1,Precision,Recall
CNN,0.807,0.868,0.868,0.868
LSTM,0.814,0.87,0.87,0.87
BidirectLSTM,0.839,0.878,0.878,0.878


The bidirectional LSTM provides the best classification performance; however, its training time is the longest. In addition, the deep learning models could give better performance than a classical machine learning model such as Naive Bayes.