# Santander Questions Classification Using:
### A Long Short-Term Memory (LSTM) and Gated Recurrent Unit (GRU) Model
> Author: Jefferson Licet

> Email: jeffersonlicet@gmail.com

In [1]:
# Size of each learned word-embedding vector; the recurrent layers use twice this many units.
EMBEDDINGS_DIMENSION = 120
# Length passed as `maxlen` to pad_sequences for both the train and the test id sequences.
MAX_LEN_WORD = 30
# Minimum number of occurrences a stem needs in order to be kept in the vocabulary.
MIN_WORD_FQ = 1
# Mini-batch size passed to model.fit.
BATCH_SIZE = 32

In [2]:
import os
import gc
import random as rn
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt

# Define constants

SEED_NUMBER = 44 # Arbitrary fixed seed shared by the random number generators below

# Seed the Python, NumPy and TensorFlow random number generators
# (author's note: only works with CPU/GPU)
rn.seed(SEED_NUMBER)
np.random.seed(SEED_NUMBER)
tf.random.set_seed(SEED_NUMBER)
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting it here
# presumably does not change hashing in the already-running kernel — confirm.
os.environ['PYTHONHASHSEED'] = '0'

In [3]:
import re
import nltk
import unicodedata
from nltk.stem import SnowballStemmer
from nltk.tokenize import WordPunctTokenizer
# Snowball stemmer configured for Spanish
spanish_snow = SnowballStemmer('spanish')

# Tokenizer applied to the cleaned question text
tokenizer = WordPunctTokenizer()
remove_puntuaction = r'[^\w\s]' # Any character that is neither a word character nor whitespace
numbers_regex = r'(\b)[0-9]+(\b)' # Runs of digits delimited by word boundaries

def normailze_question(q):
  """Turn a raw question string into a list of stemmed tokens.

  Steps, in order: lowercase the text, strip accents and any other
  non-ASCII characters, replace punctuation with spaces, replace
  standalone numbers with the placeholder " DIGITO ", tokenize, drop
  one-character tokens and stem what remains with the Spanish stemmer.

  Args:
    q: the question text.

  Returns:
    A list with one stem per kept token.
  """
  lowered = q.lower()

  # NFKD splits accented letters into base letter + combining mark; encoding
  # to ASCII with 'ignore' then drops everything that is not plain ASCII.
  decomposed = unicodedata.normalize('NFKD', lowered)
  ascii_text = decomposed.encode('ASCII', 'ignore').decode('utf8')

  without_punctuation = re.sub(remove_puntuaction, " ", ascii_text)
  with_placeholders = re.sub(numbers_regex, " DIGITO ", without_punctuation)

  stems = []
  for token in tokenizer.tokenize(with_placeholders):
    # Single-character tokens are discarded
    if len(token) > 1:
      stems.append(spanish_snow.stem(token))
  return stems

In [4]:
# Descargarmos el dataset desde el drive de la competencia
!wget -q -O train.csv https://drive.google.com/u/0/uc?id=1SvVbsYUpKphC3NuU4y7JDYsDJxYT61Yl&export=download
!wget -q -O test_santander.csv https://drive.google.com/u/0/uc?id=1bsV_URfRHy8LNLA1SKJ24hv0lRNVXMV4&export=download

In [5]:
# Read the train and test CSV files and integer-encode the target column

DATA_PATH = ''
TRAIN_CSV_DIR = os.path.join(DATA_PATH, 'train.csv')
TEST_CSV_DIR = os.path.join(DATA_PATH, 'test_santander.csv')

# The training file is pipe-separated
train_data = pd.read_csv(TRAIN_CSV_DIR, sep='|')

# Extra hand-written samples, all labelled Cat_104
# (the author describes it as a less populated category)
appendQuestions = [
 "correo electrónico inválido",
 "correo electrónico incorrecto",
 "el correo electronico es incorrecto",
 "el correo electronico no es correcto",
]

appendCategories = ["Cat_104"] * len(appendQuestions)

df_concat = pd.DataFrame({'Pregunta': appendQuestions, 'Intencion': appendCategories })
train_data = pd.concat([train_data, df_concat], ignore_index=True)

# The appended rows should show up at the end of the frame
print(train_data.tail(10))

test_data = pd.read_csv(TEST_CSV_DIR)

# `labels` holds the distinct category names; the new column holds their integer codes
train_data['labels'], labels = pd.factorize(train_data['Intencion'])

# Show the first training rows
print(train_data.head())

# Show the first test rows
print(test_data.head())

# Every distinct category name must have received exactly one code
assert len(labels) == len(np.unique(train_data['Intencion']))

                                                Pregunta Intencion
20098            estoy necesitando una tarjeta de debito    Cat_39
20099   el monto del prestamo depende de los ingresos???   Cat_251
20100       quiero cancelar una compra puntual el cuotas   Cat_339
20101                               necesito pagar deuda   Cat_192
20102  teniendo otro hipotecario es posible aplicar p...   Cat_218
20103                               comisión descubierto    Cat_56
20104                        correo electrónico inválido   Cat_104
20105                      correo electrónico incorrecto   Cat_104
20106                el correo electronico es incorrecto   Cat_104
20107               el correo electronico no es correcto   Cat_104
                                            Pregunta Intencion  labels
0               como puedo trabajar en santander rio   Cat_102       0
1                pagar tarjeta visa querer reintegro   Cat_350       1
2                      pagar tarjeta naranja siste

In [6]:
def normalize_row(row):
  """Replace the row's `Pregunta` text with its list of normalized stems.

  The row is modified and then returned, which lets the function be used
  with a row-wise `apply(..., axis=1)`.
  """
  stems = normailze_question(row.Pregunta)
  row.Pregunta = stems
  return row

# Make `progress_apply` available on pandas objects
tqdm.pandas()
# Normalize every training question, one row at a time, with a progress bar
processed = train_data.progress_apply(normalize_row, axis=1)
processed.head()

HBox(children=(FloatProgress(value=0.0, max=20108.0), HTML(value='')))




Unnamed: 0,Pregunta,Intencion,labels
0,"[com, pued, trabaj, en, santand, rio]",Cat_102,0
1,"[pag, tarjet, vis, quer, reintegr]",Cat_350,1
2,"[pag, tarjet, naranj, sistem]",Cat_132,2
3,"[no, se, debit, la, primer, cuot, del, plan, d...",Cat_129,3
4,"[abon, tarjet, credit]",Cat_342,4


In [7]:
# Build the vocabulary by counting stems over all training questions

from collections import Counter
dictionary = Counter()

categories = processed.labels
questions = processed.Pregunta

# Accumulate the number of occurrences of every stem
for tokens in tqdm(questions):
  dictionary.update(tokens)

print("The Dictionary has been generated")

print("Mean by word")
mean = np.mean(list(dictionary.values()))
print(mean)

print("Total words found:")
print(len(dictionary))

# Keep only the stems seen at least MIN_WORD_FQ times
dictionary_list = [word for word, count in dictionary.items() if count >= MIN_WORD_FQ]

print("Total words in dictionary")
print(len(dictionary_list))

# From here on `dictionary` is the array of kept stems, no longer the Counter
dictionary = np.array(dictionary_list)

HBox(children=(FloatProgress(value=0.0, max=20108.0), HTML(value='')))


The Dictionary has been generated
Mean by word
47.49407933688573
Total words found:
3378
Total words in dictionary
3378


In [8]:
# Map words to dictionary index

from tensorflow.keras.preprocessing.sequence import pad_sequences

# Average number of tokens per training question
mean = np.mean([len(tokens) for tokens in questions])
print('mean by items ', mean)

# Lookup table from each vocabulary word to its zero-based position
vocabulary_size = len(dictionary)
dic_list = list(dictionary)
dictOfWords = dict(zip(dic_list, range(vocabulary_size)))

def _hash(arr):
  """Convert each token list in `arr` into a list of vocabulary ids.

  Ids are the word's position in `dictOfWords` plus one, so every id is
  at least 1. Tokens that are not in `dictOfWords` are skipped.

  Args:
    arr: iterable of token lists.

  Returns:
    A list with one list of integer ids per item of `arr`.
  """
  encoded = []
  for item in tqdm(arr):
    positions = (dictOfWords.get(token) for token in item)
    # `get` yields None for unknown tokens; those are left out
    encoded.append([position + 1 for position in positions if position is not None])
  return encoded

# Encode every training question as a list of vocabulary ids
hashed_list = _hash(questions)
# Report completion only after the encoding has actually run
print('All items processed')
print(hashed_list[0])

# Sanity check: decoding the first encoded question gives back its tokens
assert questions[0] == [dic_list[k-1] for k in hashed_list[0]]

mean by items  7.978665207877461
All items processed


HBox(children=(FloatProgress(value=0.0, max=20108.0), HTML(value='')))


[1, 2, 3, 4, 5, 6]


In [9]:
# NOTE(review): this module-level `x` does not appear to be read by any other cell shown here
x = hashed_list

# Re-read the test CSV, this time with an explicit comma separator
test_data = pd.read_csv(TEST_CSV_DIR, sep=',')
# NOTE(review): these two expressions are not the last statement of the cell,
# so their results are not displayed
test_data.head()
test_data.sample(10)

# Process test data
processed_test = test_data.progress_apply(normalize_row, axis=1)

# Hash test data
processed_test_questions = _hash(processed_test.Pregunta)
# NOTE(review): `_hash` skips tokens that are missing from the training vocabulary,
# so this check only holds when every token of the first test question is known
assert processed_test.Pregunta.values[0] == [dic_list[k-1] for k in processed_test_questions[0]]
# Bring every id sequence to length MAX_LEN_WORD
processed_test_questions = pad_sequences(processed_test_questions, maxlen=MAX_LEN_WORD)

print(processed_test_questions[0])

HBox(children=(FloatProgress(value=0.0, max=6702.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6702.0), HTML(value='')))


[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0  10  26   8 397 189]


In [10]:
# Combine the predictions of several models by summing them
def ensemble_pred(models, testX, apply_argmax):
  """Sum the `predict` outputs of all `models` on `testX`.

  Args:
    models: iterable of objects exposing `predict(testX)`.
    testX: input handed unchanged to every model.
    apply_argmax: when truthy, return the argmax over axis 1 of the
      summed predictions instead of the summed predictions themselves.

  Returns:
    The element-wise sum of all model outputs, or its argmax along axis 1.
  """
  per_model = np.array([member.predict(testX) for member in models])
  combined = np.sum(per_model, axis=0)
  return np.argmax(combined, axis=1) if apply_argmax else combined

In [11]:
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model

def _save_history_plot(history, metric, title, ylabel, filename):
  """Plot the train and validation curves of `metric` and save them to `filename`."""
  plt.figure(1)
  plt.plot(history.history[metric])
  plt.plot(history.history['val_' + metric])
  plt.title(title)
  plt.ylabel(ylabel)
  plt.xlabel('epoch')
  plt.legend(['train', 'val'], loc='upper left')
  plt.savefig(filename)
  # Clear the shared figure so the next plot starts from an empty canvas
  plt.clf()


def generateModel(x, y, x_test, y_test, class_weights, categories, epochs, use_gru=False, iteration=0):
  """Build, train and return one bidirectional recurrent classifier.

  The network is: embedding -> spatial dropout -> bidirectional LSTM or GRU
  (returning sequences) -> global max pooling -> dropout -> dense(512, relu)
  -> batch normalization -> softmax over `len(categories)` classes.

  Args:
    x: training inputs; `x.shape[1]` is used as the sequence length.
    y: training targets for the sparse categorical cross-entropy loss.
    x_test: validation inputs passed to `fit` and monitored by early stopping.
    y_test: validation targets.
    class_weights: sequence of weights; position i is used as the weight of class i.
    categories: collection whose length sets the number of output units.
    epochs: maximum number of training epochs.
    use_gru: use a GRU layer when true, an LSTM layer otherwise.
    iteration: fold index; it prefixes the saved plot file names, and the
      model summary is printed only when it is 0.

  Returns:
    The trained Keras model.
  """
  inputs = Input(shape=(x.shape[1],), dtype='int32')

  # Word ids produced by `_hash` start at 1, hence the extra embedding row
  embedded = Embedding(
    trainable=True,
    input_length=x.shape[1],
    output_dim=EMBEDDINGS_DIMENSION,
    input_dim=(dictionary.shape[0]+1)
  )(inputs)

  embedded = SpatialDropout1D(0.5)(embedded)
  if use_gru:
    hidden = Bidirectional(GRU(EMBEDDINGS_DIMENSION*2, return_sequences=True), name="BGRU")(embedded)
  else:
    hidden = Bidirectional(LSTM(EMBEDDINGS_DIMENSION*2, return_sequences=True), name="BLSTM")(embedded)

  hidden = GlobalMaxPooling1D()(hidden)
  hidden = Dropout(0.5)(hidden)
  hidden = Dense(512, activation='relu')(hidden)
  hidden = BatchNormalization()(hidden)
  output = Dense(len(categories), activation="softmax")(hidden)
  model = Model(inputs=inputs, outputs=output)
  model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=Adam(),
    metrics=["acc"]
  )

  if iteration == 0:
    # summary() prints by itself and returns None, so it must not be wrapped in print()
    model.summary()

  es = EarlyStopping(
        monitor='val_acc',
        mode='max',
        verbose=1,
        patience=15,
        restore_best_weights=True)

  callbacks_list = [es]

  history = model.fit(x,
            y,
            epochs=epochs,
            batch_size=BATCH_SIZE,
            validation_data=(x_test, y_test),
            class_weight=dict(enumerate(class_weights)),
            callbacks=callbacks_list,
            verbose=2)

  prefix = 'gru' if use_gru else 'lstm'

  # One accuracy plot and one loss plot per fold, named after the fold and layer type
  _save_history_plot(
    history, 'acc', 'model accuracy', 'accuracy',
    str(iteration)+'_acc_'+prefix+'.png')
  _save_history_plot(
    history, 'loss', 'model loss', 'loss',
    str(iteration)+'_loss_'+prefix+'.png')
  return model

In [12]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.utils import class_weight
from sklearn.metrics import balanced_accuracy_score

def trainModel(gru=False):
  """Train one model per stratified fold and report balanced accuracy.

  Reads the module-level `categories` (targets) and `hashed_list` (id
  sequences), pads the sequences to `MAX_LEN_WORD`, computes balanced class
  weights and trains one model per fold with `generateModel`.

  Args:
    gru: forwarded to `generateModel` as `use_gru`.

  Returns:
    A tuple `(models, scores)`: the trained models and the balanced accuracy
    each one obtained on its held-out fold, in fold order.
  """
  N_FOLDS = 10 # 10
  EPOCHS = 200 # 200

  y = np.array(categories)
  x = pad_sequences(hashed_list, maxlen=MAX_LEN_WORD)

  class_weights = class_weight.compute_class_weight(
      'balanced',
      classes=np.unique(y),
      y=y)

  # shuffle/random_state are keyword-only in recent scikit-learn releases;
  # passing them positionally raises a TypeError there.
  kfold = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=1)
  models = []
  scores = []

  for iteration, (train_ix, test_ix) in enumerate(kfold.split(x, y)):
      print('🚀 Starting kfold iteration: ' + str(iteration) + '/' + str(N_FOLDS-1))
      trainX, trainy = x[train_ix], y[train_ix]
      testX, testy = x[test_ix], y[test_ix]

      # NOTE(review): the held-out fold is also the validation data that drives
      # early stopping inside generateModel, so the score below is measured on
      # data that influenced when training stopped.
      model = generateModel(
          trainX,
          trainy,
          testX,
          testy,
          class_weights,
          np.unique(categories),
          EPOCHS,
          gru,
          iteration)

      y_pred = model.predict(testX, verbose=0)
      y_pred_max = np.argmax(y_pred, axis=1).tolist()

      models.append(model)
      bacc = balanced_accuracy_score(testy, y_pred_max)
      print ("\n########## Balanced Acc: %0.8f ##########\n" % bacc )
      scores.append(bacc)

  # Average of the per-fold balanced accuracies
  summed = np.sum(scores, axis=0)
  print ("\n########## Global Balanced Acc: %0.8f ##########\n" % (summed/len(models)) )
  return models, scores

In [13]:
# Train the LSTM ensemble and store its summed test-set predictions
lstm_models, lstm_scores = trainModel(gru=False)
np.save(
  'lstm.npy',
  ensemble_pred(lstm_models, processed_test_questions, apply_argmax=False),
)

# Drop the reference to the trained LSTM models
del lstm_models

🚀 Starting kfold iteration: 0/9




Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 30)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 30, 120)           405480    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 30, 120)           0         
_________________________________________________________________
BLSTM (Bidirectional)        (None, 30, 480)           693120    
_________________________________________________________________
global_max_pooling1d (Global (None, 480)               0         
_________________________________________________________________
dropout (Dropout)            (None, 480)               0         
_________________________________________________________________
dense (Dense)                (None, 512)               246272




########## Balanced Acc: 0.81118978 ##########

🚀 Starting kfold iteration: 1/9
Epoch 1/200
566/566 - 7s - loss: 5.5253 - acc: 0.0372 - val_loss: 4.3953 - val_acc: 0.2168
Epoch 2/200
566/566 - 6s - loss: 3.4034 - acc: 0.2427 - val_loss: 2.5191 - val_acc: 0.4142
Epoch 3/200
566/566 - 6s - loss: 2.1399 - acc: 0.3907 - val_loss: 1.9330 - val_acc: 0.5137
Epoch 4/200
566/566 - 6s - loss: 1.5501 - acc: 0.4888 - val_loss: 1.6340 - val_acc: 0.5738
Epoch 5/200
566/566 - 6s - loss: 1.2935 - acc: 0.5437 - val_loss: 1.6157 - val_acc: 0.5922
Epoch 6/200
566/566 - 6s - loss: 1.0675 - acc: 0.5809 - val_loss: 1.3637 - val_acc: 0.6574
Epoch 7/200
566/566 - 6s - loss: 0.9311 - acc: 0.6221 - val_loss: 1.3658 - val_acc: 0.6584
Epoch 8/200
566/566 - 6s - loss: 0.8856 - acc: 0.6364 - val_loss: 1.3306 - val_acc: 0.6703
Epoch 9/200
566/566 - 6s - loss: 0.7942 - acc: 0.6605 - val_loss: 1.2762 - val_acc: 0.6897
Epoch 10/200
566/566 - 6s - loss: 0.7519 - acc: 0.6683 - val_loss: 1.1939 - val_acc: 0.7126
Epoch 11

<Figure size 432x288 with 0 Axes>

In [14]:
# Train the GRU ensemble and store its summed test-set predictions
gru_models, gru_scores = trainModel(gru=True)
np.save('gru.npy', ensemble_pred(gru_models, processed_test_questions, False))

# The LSTM ensemble is trained in the preceding cell, so the message no longer
# announces an LSTM run that does not follow.
print('\n Done training GRU model')



🚀 Starting kfold iteration: 0/9
Model: "model_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        [(None, 30)]              0         
_________________________________________________________________
embedding_10 (Embedding)     (None, 30, 120)           405480    
_________________________________________________________________
spatial_dropout1d_10 (Spatia (None, 30, 120)           0         
_________________________________________________________________
BGRU (Bidirectional)         (None, 30, 480)           521280    
_________________________________________________________________
global_max_pooling1d_10 (Glo (None, 480)               0         
_________________________________________________________________
dropout_10 (Dropout)         (None, 480)               0         
_________________________________________________________________
dense_20 (Dense)          




########## Balanced Acc: 0.80808831 ##########

🚀 Starting kfold iteration: 1/9
Epoch 1/200
566/566 - 6s - loss: 5.3656 - acc: 0.0544 - val_loss: 3.8824 - val_acc: 0.2546
Epoch 2/200
566/566 - 6s - loss: 3.1676 - acc: 0.2759 - val_loss: 2.1954 - val_acc: 0.4684
Epoch 3/200
566/566 - 6s - loss: 1.9654 - acc: 0.4191 - val_loss: 1.7817 - val_acc: 0.5356
Epoch 4/200
566/566 - 6s - loss: 1.4594 - acc: 0.5021 - val_loss: 1.5319 - val_acc: 0.6022
Epoch 5/200
566/566 - 6s - loss: 1.1972 - acc: 0.5568 - val_loss: 1.5671 - val_acc: 0.6111
Epoch 6/200
566/566 - 6s - loss: 1.0332 - acc: 0.5960 - val_loss: 1.3414 - val_acc: 0.6569
Epoch 7/200
566/566 - 6s - loss: 0.9070 - acc: 0.6299 - val_loss: 1.2604 - val_acc: 0.6803
Epoch 8/200
566/566 - 6s - loss: 0.8416 - acc: 0.6480 - val_loss: 1.3792 - val_acc: 0.6713
Epoch 9/200
566/566 - 6s - loss: 0.7822 - acc: 0.6587 - val_loss: 1.2363 - val_acc: 0.6957
Epoch 10/200
566/566 - 6s - loss: 0.7069 - acc: 0.6812 - val_loss: 1.1758 - val_acc: 0.7215
Epoch 11

<Figure size 432x288 with 0 Axes>

In [15]:
# Download the two saved prediction files (relies on the `google.colab` module)

from google.colab import files
files.download('gru.npy')  # GRU Predictions
files.download('lstm.npy')  # LSTM Predictions

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>