# Multi Class Text Classification - CNN & RNN

In [0]:
import tensorflow as tf
from tensorflow import keras

import keras as k
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout, LSTM
from keras.layers import Conv1D, MaxPooling1D, Embedding, Bidirectional
from keras.models import Model
from keras.initializers import Constant

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SpanishStemmer

import re
import string

!pip install stop_words
from stop_words import get_stop_words

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import seaborn as sns


print(tf.__version__)

Load data from a local file

In [0]:
# Read the raw dataset from the 'Hoja1' sheet of the Excel workbook and report its shape.
df = pd.read_excel(io='path/to/data.xlsx', sheet_name='Hoja1')
print('Data size: ', df.shape)

## Explore and preprocess data

In [0]:
# Discard rows with any missing value, then rebuild a contiguous 0..n-1 index.
df = df.dropna().reset_index(drop=True)

df.info()

Labels to integers

In [0]:
# Encode every distinct TARGET value as an integer id.
df['TARGET_id'] = pd.factorize(df['TARGET'])[0]

# One row per (TARGET, TARGET_id) pair, ordered by id; used later for tick labels.
category_id_df = (
    df[['TARGET', 'TARGET_id']]
    .drop_duplicates()
    .sort_values('TARGET_id')
)

# Lookup tables in both directions, built from the rows of category_id_df.
category_to_id = {name: idx for name, idx in category_id_df.values}
id_to_category = {idx: name for name, idx in category_id_df.values}

df.head()

Data cleaning: remove numbers and punctuation, and convert to lowercase

In [0]:
def remove_numbers(text):
    """Return ``text`` with every character for which ``str.isdigit()`` is true removed."""
    return ''.join(filter(lambda char: not char.isdigit(), text))
 
def remove_punctuation(text):
    """Replace each character of ``string.punctuation`` in ``str(text)`` with a space.

    The argument is converted with ``str()`` first, so non-string values are accepted.
    """
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    return re.sub(punctuation_class, ' ', str(text))

def clean_text(text):
    """Normalise a raw document for tokenisation.

    Steps, in order: replace ASCII punctuation with spaces, drop digit
    characters, lowercase, replace any remaining non-word character with a
    space, collapse whitespace runs to a single space, and strip leading and
    trailing spaces.

    Returns the cleaned string.
    """
    text = remove_punctuation(text)
    text = remove_numbers(text)

    text = text.lower()
    # Raw strings: '\W' and '\s' are not valid Python string escapes, so the
    # non-raw form triggers an invalid-escape warning on current interpreters.
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip(' ')
    return text

In [0]:
# Snowball stemmer for Spanish (instantiated here; not referenced by the cleaning code shown in this notebook).
stemmer = SpanishStemmer()

# Spanish stop words from two sources: NLTK and the stop_words package.
stopwords_es = set(stopwords.words('spanish'))
stopwords_es_sw = set(get_stop_words('spanish'))

# Merge both sources into a single list.
stopSpanish = list(stopwords_es | stopwords_es_sw)
stopSpanish += ['tra', 'd', 'desc']   # extra tokens to drop (presumably dataset-specific abbreviations)
stopSpanish.remove('no')              # 'no' is taken out of the stop list, so it stays in the text

print(type(stopSpanish), len(stopSpanish))

def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated tokens found in ``stopSpanish``.

    The surviving tokens are re-joined with single spaces.
    """
    return ' '.join(token for token in text.split() if token not in stopSpanish)

In [0]:
# Normalise every document in place: clean the raw text, then drop Spanish stop words.
df['TEXT'] = df['TEXT'].apply(lambda com: remove_stopwords(clean_text(com)))

#### From Text to Numbers

In [0]:
# Inputs for the tokenizer: the cleaned documents and their integer class ids.
texts = df['TEXT']
labels = df['TARGET_id']

# num_words=10000 limits the vocabulary the tokenizer uses when encoding texts
# (see the Keras Tokenizer documentation for the exact cut-off semantics).
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)


# word_index is the word -> integer index mapping learned by fit_on_texts;
# the embedding matrix built further down is sized from it.
word_index = tokenizer.word_index
print('Number of Unique Tokens',len(word_index))

Number of Unique Tokens 15749


In [0]:
# Uniform text length: every sequence is brought to 415 entries, with
# padding='post' requesting that padding be added after the tokens.
data = pad_sequences(sequences,
                     padding='post',
                     maxlen= 415) # 128 appears to be a previously tried value

# Replace the integer class ids with the categorical matrix used as the
# target of the softmax / categorical_crossentropy model below.
labels = to_categorical(np.asarray(labels))

In [0]:
# Load the pre-trained word vectors into a {token: float32 vector} dictionary.
embeddings_index = {}
# The .vec file is UTF-8 text. Passing the encoding explicitly avoids relying
# on the platform default, and newline='\n' keeps stray '\r' characters inside
# tokens from being treated as line breaks.
with open('/path/to/word_embedding/cc.es.300.vec',
          encoding='utf-8', newline='\n') as f:
    next(f)  # skip the first line (fastText .vec files start with a "<count> <dim>" header)
    for line in f:
        # Split on the literal space separator only, so tokens containing
        # other Unicode whitespace are not broken into several fields.
        values = line.rstrip().split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 2000000 word vectors.


In [0]:
# Embedding matrix: row i holds the pre-trained vector of the word whose
# tokenizer index is i. Rows for words without a pre-trained vector (and
# row 0, which no word index maps to here) stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [0]:
# Hold out 20% of the samples as the test set. The split is seeded
# (random_state=42) and stratified on `labels`, which at this point is the
# categorical matrix produced by to_categorical above.
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42, stratify=labels)


## Build the model

#### ConvNet

In [0]:
# Width of each word vector; matches the 300 columns of embedding_matrix.
EMBEDDING_DIM = 300
# Length of every padded input sequence; matches maxlen passed to pad_sequences.
MAX_SEQUENCE_LENGTH = 415
# One row per word index plus one extra row, same as embedding_matrix.
num_words = len(word_index) + 1
# Embedding layer initialised from the pre-trained matrix and frozen
# (trainable=False), so its weights are not updated during training.
embedding_layer = Embedding(num_words, 
                            EMBEDDING_DIM, 
                            embeddings_initializer=Constant(embedding_matrix), 
                            input_length=MAX_SEQUENCE_LENGTH, 
                            trainable=False)

In [0]:
# 1st model architecture: a single Conv1D layer over the frozen embeddings,
# global max pooling, one dense hidden layer, and a softmax output with one
# unit per distinct TARGET_id.
# NOTE: when the notebook is run top to bottom, `model` is reassigned by the
# Bidirectional LSTM cell further down, so that later model is the one trained.

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(256, 3, activation='relu')(embedded_sequences)
x = GlobalMaxPooling1D()(x)
x = Dense(256, activation='relu')(x)
preds = Dense(len(set(df['TARGET_id'])), activation='softmax')(x)

model = Model(sequence_input, preds)

In [0]:
# 2nd Model architecture

# sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# embedded_sequences = embedding_layer(sequence_input)
# x = Conv1D(300, 3, activation='relu')(embedded_sequences)
# x = GlobalMaxPooling1D()(x)
# x = Dropout(0.5)(x)
# x = Dense(300, activation='relu')(x)
# x = Dropout(0.8)(x)
# preds = Dense(len(set(df['TARGET_id'])), activation='softmax')(x)

# model = Model(sequence_input, preds)

#### Bidirectional LSTM

In [0]:
# 1st model architecture

#sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
#embedded_sequences = embedding_layer(sequence_input)
#x = LSTM(10, activation='tanh', return_sequences=False)(embedded_sequences)
#x = Dropout(0.25)(x)
#x = LSTM(10, activation='tanh', return_sequences=False)(x)
#x = Dropout(0.25)(x)
#x = Dense(10, activation='relu')(x)
#preds = Dense(len(set(df['TARGET_id'])), activation='softmax')(x)

#model = Model(sequence_input, preds)
#adam=k.optimizers.Adam()
#model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])


# 2nd model architecture: one Bidirectional LSTM (100 units per direction)
# over the shared, frozen embedding layer, followed by a softmax output with
# one unit per distinct TARGET_id.
# This rebinds sequence_input, embedded_sequences, preds and model, replacing
# any model defined in an earlier cell.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
l_lstm = Bidirectional(LSTM(100))(embedded_sequences)
preds = Dense(len(set(df['TARGET_id'])), activation='softmax')(l_lstm)

model = Model(sequence_input, preds)

print("Bidirectional LSTM")
model.summary()

#### Train

In [0]:
# Compile with the Adam optimizer from the same `keras` package that provides
# the layers and Model used above (learning rate 0.001).
# tf.train.AdamOptimizer is a TensorFlow 1.x-only API and mixes a TF-native
# optimizer into a standalone-Keras model; the Keras optimizer avoids both.
model.compile(optimizer=k.optimizers.Adam(0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])   # other learning rates noted by the author: 0.0001 , 0.0005

In [0]:
# Carve a validation set (20% of the training data) out of X_train/y_train,
# seeded and stratified on the training labels. The test set is left untouched.
X_train_, X_val, y_train_, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

In [0]:
# Train for 30 epochs in mini-batches of 8 on the reduced training set,
# evaluating on the validation set along the way. The returned `history`
# object is what the plotting cells below read from.
history = model.fit(X_train_,
                    y_train_,
                    epochs=30,
                    batch_size=8,
                    validation_data=(X_val, y_val),
                    verbose=1)

Evaluate the model

In [0]:
# Score the trained model on the held-out test set.
# Presumably `results` is [loss, accuracy], given the loss and the single
# metric passed to model.compile. Confirm against the printed output.
results = model.evaluate(X_test, y_test)

print(results)

### Results Visualization

In [0]:
# Per-epoch values recorded during training, keyed by metric name.
# Displaying the keys shows which names are available for plotting.
history_dict = history.history
history_dict.keys()

In [0]:
# Older Keras releases log accuracy under 'acc'/'val_acc', while Keras >= 2.3
# uses the name passed to compile ('accuracy'/'val_accuracy'). Pick whichever
# key is present so this cell does not raise KeyError on either.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
acc = history.history[acc_key]
val_acc = history.history['val_' + acc_key]
loss = history.history['loss']
val_loss = history.history['val_loss']

# Epoch numbers for the x axis, starting at 1.
epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [0]:
plt.clf()   # clear figure
# Accept both accuracy key spellings: 'acc' (older Keras) and 'accuracy' (Keras >= 2.3).
acc_key = 'acc' if 'acc' in history_dict else 'accuracy'
acc_values = history_dict[acc_key]
val_acc_values = history_dict['val_' + acc_key]

# Derive the x axis from the values plotted here, so this cell does not
# depend on variables left over from the loss-plot cell.
epochs = range(1, len(acc_values) + 1)

plt.plot(epochs, acc_values, 'bo', label='Training acc')
plt.plot(epochs, val_acc_values, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()

In [0]:
# Model outputs for the test set: one row per sample, taken from the
# softmax output layer (one value per class).
y_pred = model.predict(X_test)

In [0]:
# Spot-check the test sample at index 100: predicted scores and their argmax,
# then the true label row and its argmax.
print(y_pred[100])
print(np.argmax(y_pred[100]))
print(y_test[100])
print(np.argmax(y_test[100]))

print(X_test.shape, y_test.shape)

# Collapse each row to the index of its largest entry (the class id).
y_test_ = [np.argmax(row) for row in y_test]
y_pred_ = [np.argmax(row) for row in y_pred]


Model evaluation

In [0]:
# Confusion matrix of true vs. predicted class ids, drawn as an annotated heatmap.
# Tick labels come from category_id_df, which is sorted by TARGET_id.
conf_mat = confusion_matrix(y_test_, y_pred_)
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    xticklabels=category_id_df['TARGET'].values,
    yticklabels=category_id_df['TARGET'].values,
    ax=ax,
)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [0]:
# Per-class precision/recall/F1 on the test set.
# target_names relies on unique() listing the categories in the same order
# that factorize() used when assigning the ids.
print(
    metrics.classification_report(
        y_true=y_test_,
        y_pred=y_pred_,
        target_names=df['TARGET'].unique(),
    )
)