Dataset downloaded from https://www.kaggle.com/amananandrai/ag-news-classification-dataset

In [32]:
import os
import pandas as pd
import json
import re
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras import Model
from keras.layers import concatenate

In [5]:
df = pd.read_csv('train.csv')

The dataframe has two text columns. We could concatenate them together as a single entry, but common sense tells us they are qualitatively different fields that have their own separate word distributions. Let's build a concetenation model to treat both fields as separate features.

In [None]:
df.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


We'll keep track of hyperparameters and other useful details related to this run in a text log. Let's define these values as constants here.

In [None]:
NOTES = ''
LEN = len(df)
SPLIT = 0.9
LAYERS = 2
DROPOUT = 0.5
BATCH = 100
EPOCHS = 5
SMOOTHING = 0.1
NODES_1 = 100
NODES_2 = 32
NODES_3 = 0

In [20]:
log = pd.read_csv('text_and_title.csv')

X = df['Description']
Y = pd.get_dummies(df['Class Index'])

X2 = df['Title']

In [21]:
encoder = tf.keras.layers.TextVectorization(
    max_tokens=None)

encoder.adapt(X)

encoder2 = tf.keras.layers.TextVectorization(
    max_tokens=None)

encoder2.adapt(X2)

In [22]:
mod = tf.keras.Sequential()

mod.add(tf.keras.Input(shape=(1,), dtype=tf.string))
mod.add(encoder)
mod.add(tf.keras.layers.Embedding(len(encoder.get_vocabulary()), NODES_1, mask_zero=True))
mod.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(NODES_1,  return_sequences=True)))
mod.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(NODES_2)))

In [23]:
mod2 = tf.keras.Sequential()

mod2.add(tf.keras.Input(shape=(1,), dtype=tf.string))
mod2.add(encoder2)
mod2.add(tf.keras.layers.Embedding(len(encoder.get_vocabulary()), NODES_1, mask_zero=True))
mod2.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(NODES_1,  return_sequences=True)))
mod2.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(NODES_2)))

In [24]:
model_concat = concatenate([mod.output,mod2.output])
model_concat = tf.keras.layers.Dense(NODES_2, activation='relu')(model_concat)
model_concat = tf.keras.layers.Dropout(DROPOUT)(model_concat)
model_concat = tf.keras.layers.Dense(Y.shape[1], activation='softmax')(model_concat)

model = Model(inputs=[mod.input, mod2.input], outputs=model_concat)

model.compile(loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=SMOOTHING, from_logits=False),
            optimizer=tf.keras.optimizers.Adam(1e-4),
            metrics=([tf.keras.metrics.CategoricalCrossentropy(name='CatCrossentropy'), 
            tf.keras.metrics.CategoricalAccuracy(name='CatAccuracy'),
            tf.keras.metrics.Recall(name='Recall')]))

In [25]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 text_vectorization (TextVector  (None, None)        0           ['input_1[0][0]']                
 ization)                                                                                         
                                                                                                  
 text_vectorization_1 (TextVect  (None, None)        0           ['input_2[0][0]']            

In [26]:
history = model.fit(x=[X.values, X2.values], y=Y.values, batch_size = BATCH, epochs=EPOCHS,
    validation_split=SPLIT, verbose=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [10]:
pred = model.predict([df['Description'], df['Title']])
pred = pd.Series(np.argmax(pred, axis=1))
pred += 1
df['pred'] = pred


ACC = len(df[df['pred']==df['Class Index']])/len(df)
print(ACC)

In [None]:
row = {'accuracy': ACC, 'notes': NOTES, 'len': LEN,
    'split': SPLIT, 'layers': LAYERS, 'dropout': DROPOUT,
    'batch': BATCH, 'epochs': EPOCHS, 'smoothing': SMOOTHING,
    'nodes1': NODES_1, 'nodes2': NODES_2, 'nodes3': NODES_3}

print(row)

log = log.append(row, ignore_index=True)
log.to_csv('text_and_title.csv', index=False)

{'accuracy': 0.8335416666666666, 'notes': '', 'len': 120000, 'split': 0.9, 'layers': 2, 'dropout': 0.5, 'batch': 100, 'epochs': 5, 'smoothing': 0.1, 'nodes1': 100, 'nodes2': 32, 'nodes3': 0}


This model gets a max accuracy of .83, versus 0.80 for the description-only model. Not extraordinary, but it's not nothing!

In [34]:
import pydot
tf.keras.utils.plot_model(model, show_shapes=True)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')
