### Import required packages

In [1]:
import re

import mxnet as mx
from mxnet import gluon, nd
import gluonnlp as nlp
import pandas as pd

# Load Training and Testing Data

In [2]:
# Read the AG News test and train splits into DataFrames.
test_df = pd.read_csv('data/ag_news/test.csv')
train_df = pd.read_csv('data/ag_news/train.csv')

# Distinct values of the 'class' column in the training split.
News_Article_classes = train_df['class'].unique()
print("Categories: ", News_Article_classes)

# Display five training rows, sampled with a fixed seed so the preview is repeatable.
train_df.sample(n=5, random_state=2)

Categories:  ['Business' 'Sci/Tech' 'Sports' 'World']


Unnamed: 0,title,description,class
36389,Crash Gordon Put on Probation,"Robby Gordon, who admitted he intentionally ca...",Sports
112727,Salvation Army bell ringers hushed,Maybe you #39;ve noticed already this holiday ...,Business
68864,EU set to decide on Oracle next week,European Commission is set to announce whether...,Sci/Tech
88182,Quechua 1.0: Microsoft to launch Windows in th...,AFP - Microsoft will translate its blockbuster...,Sci/Tech
18233,Cricket-Vaughan upbeat about one-day form,LONDON (AFP) - England captain Michael Vaughan...,Sports


### Define Tokenizer

In [3]:
# Tokenizer shared by vocabulary construction and by the per-example preprocessing helpers.
tokenizer = nlp.data.SpacyTokenizer()

# Build Vocab & Choose Pretrained or Trained Embeddings

We can either use pretrained word embeddings (such as fastText, word2vec or GloVe) <br>
_OR_ <br>
train our own word embeddings from scratch on the training data. <br>

In [4]:
# Flag to indicate use of Pretrained Embeddings:
#   True  -> load pretrained fasttext ('wiki.simple') vectors; the embedding layer is frozen before training
#   False -> build the vocabulary from the training descriptions and learn 16-dimensional embeddings
pretrained = False

In [5]:
if not pretrained:
    # Train-from-scratch path: count lower-cased tokens over every training
    # description and build the vocabulary from those counts.
    tokenized_data = (tokenizer(text) for text in train_df['description'].values)
    counter = nlp.data.count_tokens(nlp.data.concat_sequence(tokenized_data), to_lower=True)
    vocab = nlp.Vocab(counter)

    token_to_index = vocab.token_to_idx
    vocab_size = len(token_to_index)
    embed_size = 16

else:
    # Pretrained path: build the vocabulary from the tokens known to the
    # fasttext embedding and attach the pretrained vectors to it.
    embedding = nlp.embedding.create('fasttext', source='wiki.simple')
    vocab = nlp.Vocab(nlp.data.Counter(embedding.idx_to_token))
    vocab.set_embedding(embedding)

    token_to_index = vocab.token_to_idx
    # Both sizes come straight from the shape of the embedding matrix.
    vocab_size, embed_size = vocab.embedding.idx_to_vec.shape
  

In [6]:
# Give every class name a consecutive integer id, in the order returned by unique().
label_to_index = dict((label, num) for num, label in enumerate(train_df['class'].unique()))
print("Label -> Index mapping : ", label_to_index)

# Inverse lookup, used to turn predicted ids back into class names.
index_to_label = dict(zip(label_to_index.values(), label_to_index.keys()))
print("Index -> Label mapping : ", index_to_label)

Label -> Index mapping :  {'Business': 0, 'Sci/Tech': 1, 'Sports': 2, 'World': 3}
Index -> Label mapping :  {0: 'Business', 1: 'Sci/Tech', 2: 'Sports', 3: 'World'}


# Preprocessing Methods

Define the helper functions used to create and preprocess the train and test datasets.

In [7]:
def clean_text(string):
    """
    Tokenization/string cleaning
    adapted from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. replace every character outside ``A-Za-z0-9(),!?'`` and the backtick with a space
      2. split the clitics ``'s 've n't 're 'd 'll`` off the preceding word
      3. surround ``, ! ( ) ?`` with spaces so each becomes its own token
      4. collapse runs of whitespace, strip the ends and lower-case

    :param string: raw text
    :return: cleaned, lower-cased text with space-separated tokens
    """
    # Anything that is not alphanumeric or one of the kept punctuation marks becomes a space.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # Detach clitics from the word they follow ("isn't" -> "is n't").
    for clitic in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(clitic, " " + clitic)

    # Pad punctuation with spaces. The replacement is the plain character:
    # the previous regex templates " \( ", " \) " and " \? " left a literal
    # backslash in the output, producing tokens such as "\(".
    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")

    # Collapse the extra whitespace introduced above.
    string = re.sub(r"\s{2,}", " ", string)

    return string.strip().lower()

def tokenize_text(text):
    """
    Tokenize a news article description with the module-level `tokenizer`.

    :param text: text to tokenize (callers in this notebook pass a single
                 description string)
    :return: the tokenizer's output for `text`
    """
    return tokenizer(text)

def map_words_to_index(tokenized_data):
    """
    Look up the vocabulary index of every token.

    :param tokenized_data: iterable of tokens
    :return: list with one index per token; a token missing from
             `token_to_index` is mapped to 0
    """
    indices = []
    for word in tokenized_data:
        indices.append(token_to_index.get(word, 0))
    return indices

def transform_dataset(text, label):
    """
    Turn one (description, class name) pair into (token indices, label index).

    :param text: description text; tokenized and then mapped to vocabulary indices
    :param label: class name, looked up in `label_to_index`
    :return: tuple of (list of token indices, label index); the label index
             is None when `label` is not a key of `label_to_index`
    """
    token_ids = map_words_to_index(tokenize_text(text))
    return token_ids, label_to_index.get(label)

#### Test Preprocessing

In [8]:
# Sanity check: push a single (text, label) pair through the preprocessing pipeline.
transform_dataset('we are testing how preprocessing transforms this demo_text', 'Sci/Tech')

([568, 44, 1723, 332, 0, 28679, 53, 0], 1)

In [9]:
from mxnet import nd, autograd, gluon
from mxnet.gluon.data import ArrayDataset
from mxnet.gluon.data import DataLoader

# Pair each description with its class name, then apply transform_dataset
# to every (text, label) sample.
train_dataset = ArrayDataset(train_df['description'].values, train_df['class'].values)
train_dataset = train_dataset.transform(transform_dataset)

test_dataset = ArrayDataset(test_df['description'].values, test_df['class'].values)
test_dataset = test_dataset.transform(transform_dataset)

# Number of token indices in each sample; the bucketing sampler groups samples by these lengths.
train_data_lengths = [len(train_dataset[idx][0]) for idx in range(train_df.shape[0])]
test_data_lengths = [len(test_dataset[idx][0]) for idx in range(test_df.shape[0])]

# Defining Bucketing Sampler and Data Loader

In [13]:
import multiprocessing

batch_size = 128
bucket_scheme = nlp.data.ExpWidthBucket(bucket_len_step=1.6)

# Bucket Sampler

# Settings shared by both samplers; only the list of sequence lengths differs.
_bucket_sampler_kwargs = dict(bucket_scheme=bucket_scheme,
                              num_buckets=10,
                              batch_size=batch_size,
                              use_average_length=True,
                              shuffle=True)

train_batch_sampler = nlp.data.sampler.FixedBucketSampler(train_data_lengths,
                                                          **_bucket_sampler_kwargs)

test_batch_sampler = nlp.data.sampler.FixedBucketSampler(test_data_lengths,
                                                         **_bucket_sampler_kwargs)

# Batchify: pad the token-index sequences with the '<pad>' index and stack the labels.
pad_value = token_to_index.get('<pad>')
batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Pad(axis=0, pad_val=pad_value),
    nlp.data.batchify.Stack(),
)

# Leave a few cores free, but always use at least one worker.
num_workers = max(multiprocessing.cpu_count() - 3, 1)

# Data Loader

train_data_loader = gluon.data.DataLoader(train_dataset,
                                          batch_sampler=train_batch_sampler,
                                          batchify_fn=batchify_fn,
                                          num_workers=num_workers)

test_data_loader = gluon.data.DataLoader(test_dataset,
                                         batch_sampler=test_batch_sampler,
                                         batchify_fn=batchify_fn)

In [12]:
# Print the training sampler's summary: sample/batch counts, bucket keys,
# per-bucket sample counts and per-bucket batch sizes.
print(train_batch_sampler.stats())

FixedBucketSampler:
  sample_num=120000, batch_num=49471
  key=[4, 6, 9, 14, 21, 33, 53, 83, 133, 212]
  cnt=[2, 28, 136, 963, 6649, 45503, 59706, 6232, 684, 97]
  batch_size=[42, 20, 14, 9, 6, 4, 2, 1, 1, 1]


# Build the Network Architecture

In [14]:
class ConvMaxPool(gluon.nn.HybridBlock):
    """
    One convolution branch: 1D convolution with relu activation, then
    global max pooling, flattened to one feature vector per sample.

    :param channels: number of convolution output channels
    :param kernel_size: width of the convolution kernel
    :param padding: padding passed to the convolution
    """

    def __init__(self, channels, kernel_size, padding):
        super().__init__()
        self.conv = gluon.nn.Conv1D(channels=channels,
                                    kernel_size=kernel_size,
                                    strides=1,
                                    padding=padding,
                                    activation='relu')
        self.maxpool = gluon.nn.GlobalMaxPool1D()

    def hybrid_forward(self, F, x, *args, **kwargs):
        # Swap the last two axes before convolving
        # (presumably (batch, seq, embed) -> (batch, embed, seq)).
        swapped = x.transpose(axes=(0, 2, 1))
        pooled = self.maxpool(self.conv(swapped))
        return pooled.flatten()


class CNNTextClassifier(gluon.nn.HybridBlock):
    """
    Convolutional text classifier: Embed -> parallel Conv/MaxPool branches
    -> concatenate -> Dropout -> fully connected output layer.

    :param vocab_size: number of rows of the embedding table
    :param embed_size: dimension of each embedding vector
    :param dropout: dropout rate applied to the concatenated features
    :param num_label: number of output units
    :param filters: iterable of kernel widths, one convolution branch each
    :param num_filter: number of channels per convolution branch
    """

    def __init__(self, vocab_size, embed_size, dropout, num_label, filters, num_filter):
        super().__init__()
        self.embed = gluon.nn.Embedding(input_dim=vocab_size, output_dim=embed_size)
        self.stacked_conv = []
        for width in filters:
            branch = ConvMaxPool(channels=num_filter, kernel_size=width, padding=width // 2)
            # Also attach the branch as an attribute under its own name; a
            # block held only in a plain list is not registered as a child.
            setattr(self, branch.name, branch)
            self.stacked_conv.append(branch)
        self.drop = gluon.nn.Dropout(rate=dropout)
        self.out = gluon.nn.Dense(units=num_label)

    def hybrid_forward(self, F, x, *args, **kwargs):
        embedded = self.embed(x)
        pooled = []
        for branch in self.stacked_conv:
            pooled.append(branch(embedded))
        # Join the per-branch feature vectors along the feature axis.
        features = F.concat(*pooled, dim=1)
        return self.out(self.drop(features))

      
# Build the classifier: three convolution branches (kernel widths 3, 4, 5)
# with 100 channels each, and one output unit per class.
net = CNNTextClassifier(vocab_size=vocab_size,
                        embed_size=embed_size,
                        dropout=0.85,
                        num_label=len(label_to_index),
                        filters=[3, 4, 5],
                        num_filter=100)

net.hybridize()

print(net)


CNNTextClassifier(
  (embed): Embedding(83087 -> 16, float32)
  (convmaxpool0): ConvMaxPool(
    (conv): Conv1D(None -> 100, kernel_size=(3,), stride=(1,), padding=(1,))
    (maxpool): GlobalMaxPool1D(size=(1,), stride=(1,), padding=(0,), ceil_mode=True)
  )
  (convmaxpool1): ConvMaxPool(
    (conv): Conv1D(None -> 100, kernel_size=(4,), stride=(1,), padding=(2,))
    (maxpool): GlobalMaxPool1D(size=(1,), stride=(1,), padding=(0,), ceil_mode=True)
  )
  (convmaxpool2): ConvMaxPool(
    (conv): Conv1D(None -> 100, kernel_size=(5,), stride=(1,), padding=(2,))
    (maxpool): GlobalMaxPool1D(size=(1,), stride=(1,), padding=(0,), ceil_mode=True)
  )
  (drop): Dropout(p = 0.85, axes=())
  (out): Dense(None -> 4, linear)
)


# Train the Model

In [15]:
# !export MXNET_CUDNN_AUTOTUNE_DEFAULT=1 # used when ctx = mx.gpu()
# NOTE(review): `!export` runs in a throw-away subshell and would not change the
# kernel's environment; `%env MXNET_CUDNN_AUTOTUNE_DEFAULT=1` is probably what is
# wanted when switching to GPU - confirm.
# Device context onto which parameters and batches are placed.
ctx = mx.cpu()
print(ctx)

cpu(0)


##### Initialize weights with Xavier Initialization

In [16]:
# Initialize every parameter of the network on `ctx` with Xavier initialization.
# force_reinit=True re-initializes parameters even if they already hold values,
# so re-running this cell resets the weights.
net.collect_params().initialize(mx.init.Xavier(magnitude=2.34), ctx=ctx, force_reinit=True)

In [17]:
def evaluate_accuracy(data_loader, net):
    """
    Measure classification accuracy of `net` over every batch of `data_loader`.

    Batches are moved to the module-level `ctx` before the forward pass; the
    predicted class of each sample is the argmax over axis 1 of the output.

    :param data_loader: data loader object yielding (data, label) batches
    :param net: network block
    :return: accuracy of network on given data
    """
    metric = mx.metric.Accuracy()
    for batch, targets in data_loader:
        scores = net(batch.as_in_context(ctx))
        metric.update(preds=nd.argmax(scores, axis=1),
                      labels=targets.as_in_context(ctx))
    # get() returns (metric name, value); only the value is wanted.
    _, accuracy = metric.get()
    return accuracy

In [18]:
import time

if pretrained:
    # Copy the pretrained vectors into the embedding layer and freeze them
    # by disabling gradient computation for its parameters.
    net.embed.weight.set_data(vocab.embedding.idx_to_vec.as_in_context(ctx))
    net.embed.collect_params().setattr('grad_req', 'null')

sm_loss = gluon.loss.SoftmaxCrossEntropyLoss()

trainer = gluon.Trainer(params=net.collect_params(),
                        optimizer='adam',
                        optimizer_params={'learning_rate': 0.001})

epochs = 3
for e in range(epochs):
    start = time.time()
    epoch_loss, weight_updates = 0, 0

    for data, label in train_data_loader:
        data, label = data.as_in_context(ctx), label.as_in_context(ctx)

        # Forward pass under autograd so gradients can be computed.
        with autograd.record():
            pred = net(data)
            loss = sm_loss(pred, label)
        loss.backward()

        # Step size is normalized by the number of samples in this batch.
        trainer.step(data.shape[0])

        epoch_loss += nd.mean(loss).asscalar()
        weight_updates += 1

    train_metrics = evaluate_accuracy(train_data_loader, net)

    # Without any test rows there is nothing to validate on; report 0.0.
    test_metrics = evaluate_accuracy(test_data_loader, net) if test_df.shape[0] != 0 else 0.0

    # The reported epoch time includes both accuracy evaluations above.
    print("Epoch{}: Average Train Loss: {:.4} Train Acc: {:.4} Validation Acc: {:.4} Epoch Time: {}".format(
        e, epoch_loss / weight_updates, train_metrics, test_metrics, time.time() - start))

Epoch0: Average Train Loss: 0.6318 Train Acc: 0.8847 Validation Acc: 0.865 Epoch Time: 132.61098194122314
Epoch1: Average Train Loss: 0.3939 Train Acc: 0.9111 Validation Acc: 0.8713 Epoch Time: 139.58497524261475
Epoch2: Average Train Loss: 0.3311 Train Acc: 0.9251 Validation Acc: 0.8736 Epoch Time: 127.21225786209106


# Testing Predictions

In [27]:
from IPython.display import display, HTML
import random
import numpy as np
import re

# Number of random test articles to classify and display.
total_test_examples = 8

for i in range(total_test_examples):

    # Picking random sample to test.
    # randrange(n) draws from 0..n-1, so the first row is eligible and the
    # position is always valid (randint(1, n) could return n and never 0).
    random_sample_index = random.randrange(len(test_df))

    # Positional access, so the lookup does not depend on the frame's index labels.
    description = test_df['description'].iloc[random_sample_index]
    label = test_df['class'].iloc[random_sample_index]

    print(description)
    display(HTML('<h5> True Category: {} </h5>'.format(label)))

    # Apply Preprocessing before feeding for prediction : clean_text -> tokenize -> map_words
    # NOTE(review): transform_dataset (used for training) does not call
    # clean_text, so prediction-time preprocessing differs from training - confirm intent.
    preprocessed = nd.array([map_words_to_index(tokenize_text(clean_text(description)))], ctx=ctx)

    # Predict
    output = net(preprocessed)

    # Index of the highest-scoring class, as a plain int for the dict lookup.
    predicted = int(np.argmax(output[0].asnumpy()))
    predicted_label = index_to_label.get(predicted)

    display(HTML('<h5> Predicted Category: {}\n </h5>'.format(predicted_label)))

    # Evaluate
    if predicted_label == label:
        display(HTML('<h4> ✅ Correct </h4>'))
    else:
        display(HTML('<h4> ❌ Incorrect : {}</h4>'.format(predicted_label)))

    display(HTML('<hr>'))

Canada has defended its decision to buy second-hand submarines after a crewman died from injuries sustained on one of the vessels that had broken down.


Ralf Schumacher is adamant memories of his horror crash at Indianapolis three months ago will not hamper his comeback in this weekends Chinese Grand Prix.


OKLAHOMA CITY - Former child star Macaulay Culkin was arrested on drug charges Friday during a traffic stop, authorities said. The 24-year-old actor, best known for his role in the "Home Alone" movies, was taken into custody on complaints of possession of a controlled dangerous substance without a valid prescription and possession of marijuana, according to the Oklahoma County Sheriff's office...


Federal prosecutors announced on Wednesday that they had cracked a global cartel that had illegally fixed prices of memory chips in personal computers and servers for 


Halfway around the world, standing virtually in the middle of the Pacific Ocean, the incomparable Timmy Chang is just days away from throwing his first pass of the season. From my tattered sofa, I will be watching him. I want you to watch him, too.


Premier Yu Shyi-Kun hopes economic ties to the mainland will guarantee peace. If not,  quot;Taiwan has to have to ability to defend itself quot;.


Schools across Portugal turn away pupils because of a teachers' assignment mix up on the first day of classes.


Southwest Airlines said on Friday it will bid at least USD\$100 million for assets of bankrupt ATA Airlines, including taking over six of ATA #39;s 14 gates at Chicago #39;s Midway Airport and selling tickets on some of each other #39;s flights.
