# Multi-Layer Perceptron for Text Classification

In [1]:
# load data
import pandas as pd
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
df.sample(frac=1) # shuffle data rows

Unnamed: 0,content,target,target_names
2574,From: bdm@cs.rit.edu (Brendan D McKay)\nSubjec...,17,talk.politics.mideast
7242,Subject: Re: Who's next? Mormons and Jews?\nFr...,19,talk.religion.misc
4437,From: msf@skaro.as.arizona.edu (Michael Fulbri...,12,sci.electronics
2242,From: hernlem@chess.ncsu.edu (Brad Hernlem)\nS...,17,talk.politics.mideast
3870,From: umturne4@ccu.umanitoba.ca (Daryl Turner)...,10,rec.sport.hockey
...,...,...,...
7827,From: cdkaupan@eos.ncsu.edu (CARL DAVID KAUPAN...,10,rec.sport.hockey
3438,From: mathew <mathew@mantis.co.uk>\nSubject: A...,0,alt.atheism
8452,From: huub@cwi.nl (Huub Bakker)\nSubject: wait...,5,comp.windows.x
4700,From: mkramer@world.std.com (Mark W Kramer)\nS...,17,talk.politics.mideast


In [2]:
wanted_doc_num = 5000 #the dataset is quite large; hence for illustration purpose I only use a small par of it; feel free to use more data points
raw_labels = df.target_names.values.tolist()[:wanted_doc_num]
docs = df.content.values.tolist()[:wanted_doc_num]

assert len(docs) == len(raw_labels)
label_list = list(set(raw_labels))
labels = [label_list.index(rl) for rl in raw_labels] # transfer raw labels (strings) to integer numbers
print('total data size: {}, label type num: {}'.format(len(docs), len(label_list)))
print('labels:', label_list)

total data size: 5000, label type num: 20
labels: ['rec.motorcycles', 'soc.religion.christian', 'comp.os.ms-windows.misc', 'comp.sys.mac.hardware', 'comp.sys.ibm.pc.hardware', 'misc.forsale', 'comp.graphics', 'rec.sport.baseball', 'talk.politics.misc', 'sci.electronics', 'sci.med', 'sci.crypt', 'sci.space', 'comp.windows.x', 'talk.religion.misc', 'talk.politics.mideast', 'rec.sport.hockey', 'alt.atheism', 'talk.politics.guns', 'rec.autos']


In [3]:
# take a look at some documents in the dataset
print(docs[19])


From: abarden@tybse1.uucp (Ann Marie Barden)
Subject: X-Terminal Config. file question
Organization: Tybrin Corporation, Shalimar, FL
Distribution: usa
Lines: 19

  QUESTION:
  What is the EXACT entry (parameter and syntax please), in the X-Terminal
configuration file (loaded when the X-Terminal boots), to add another system 
to the TCP/IP access control list?   

  BACKGROUND:
  I have two unix systems, 1. an AT&T 3B2 running X11R3 and MIT's X11R4 and 
2. a Sun SS10 without any X.  
  I want to have a window to the Sun and the 3B2 on the NCD X-Terminal at the
same time.  I can do this if I manually set the Network Parameter TCP/IP
Access Control List to off, then login to my telnet session. Not Great!  
  I've tried to get "xhost" to work and failed.  Either my syntax is wrong
or the X11R3 implementation is bogus.  
  I am trying to edit the NCD configuration file that is loaded when the 
NCD boots.  No matter what entry I add or edit, the NCD still boots with
the TCP/IP Access Contro

In [4]:
train_ratio, dev_ratio, test_ratio = 0.6, 0.2, 0.2
train_docs = docs[:int(len(docs)*train_ratio)]
train_labels = labels[:int(len(docs)*train_ratio)]

dev_docs = docs[int(len(docs)*train_ratio):int(len(docs)*(train_ratio+dev_ratio))]
dev_labels = labels[int(len(docs)*train_ratio):int(len(docs)*(train_ratio+dev_ratio))]

test_docs = docs[-int(len(docs)*(test_ratio)):]
test_labels = labels[-int(len(docs)*(test_ratio)):]

print('train size {}, dev size {}, test size {}'.format(len(train_labels), len(dev_labels), len(test_labels)))

train size 3000, dev size 1000, test size 1000


In [5]:
# create vector representations; 
# TODO: consider to apply necessary text cleaning/normalization techniques, e.g. remove all emails, remove the section headers (QUESTION, BACKGROUND) and meta-information (From, Lines)
from sklearn.feature_extraction.text import TfidfVectorizer
vec_dim = 2000 # feel free to use longer vecs
tfidf_vectorizer = TfidfVectorizer(max_features=vec_dim)
train_vecs = tfidf_vectorizer.fit_transform(train_docs)
dev_vecs = tfidf_vectorizer.transform(dev_docs)

print('train vec size', train_vecs.shape)
print('dev vec size', dev_vecs.shape)

train vec size (3000, 2000)
dev vec size (1000, 2000)


In [6]:
# define the neural model, an MLP
import torch
import torch.nn as nn

class MLP(nn.Module):
    """ A perceptron has only two linear layers and an additional softmax layer on top"""
    def __init__(self, input_dim, out_dim, dp_rate):
        super(MLP, self).__init__()
        self.hidden_layer = nn.Linear(input_dim, input_dim) # we let the hidden layer width be the same as the input layer
        # nn.Linear initialize weight using Glorot initialization, see https://discuss.pytorch.org/t/how-are-layer-weights-and-biases-initialized-by-default/13073 
        self.output_layer = nn.Linear(input_dim, out_dim)
        # self.softmax = nn.Softmax(dim=1) # if you use nn.CrossEntropyLoss as loss, it includes softmax computation and so you don't need softmax layer in your net
        self.dropout = nn.Dropout(dp_rate)
       
    def forward(self, x_in, act_fnc):
        z1 = self.dropout(x_in) # output of the input layer, after dropout
        z2 = act_fnc(self.hidden_layer(z1)) # output of the hidden layer
        logits = self.output_layer(z2)
        # probs = self.softmax(logits) # if you use nn.CrossEntropyLoss as loss, it includes softmax computation and so you don't need softmax layer in your net
        return logits

In [7]:
# build model
relu = torch.nn.ReLU() # we use ReLU as activation function; feel free to try others
dropout_rate = 0.5 # dropout rate
model = MLP(vec_dim,len(label_list),dropout_rate) # since this is a classification problem, the output dimension of the MLP should be the number of classes
loss_fnc = torch.nn.CrossEntropyLoss() # cross entropy loss

# hyper parameters
n_epochs = 30 # number of epoch (i.e. number of iterations)
batch_size = 64 # usually set to the powers of 2, e.g. 2,4,8,32,64,128. 
lr = 0.001 # initial learning rate

# init optimizer and scheduler (lr adjustor)
import torch.optim as optim
optimizer = optim.Adam(params=model.parameters(), lr=lr) # use Adam as the optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1) # decays the learning rate of each parameter group by gamma every step_size epochs.

In [8]:
best_acc = -1.
best_model = None
import copy
import numpy as np
from sklearn.metrics import accuracy_score

for epoch_i in range(n_epochs):
    # the inner loop is over the batches in the dataset
    model.train() # let pytorch know that gradients should be computed, so as to update the model
    for idx in range(0,train_vecs.shape[0],batch_size):
        # Step 0: Get the data
        x_data = torch.tensor(train_vecs[idx:idx+batch_size].todense(), dtype=torch.float)
        if x_data.shape[0] == 0: continue
        y_target = torch.tensor(train_labels[idx:idx+batch_size], dtype=torch.int64)

        # Step 1: Clear the gradients 
        optimizer.zero_grad()

        # Step 2: Compute the forward pass of the model
        y_pred = model(x_data, relu)

        # Step 3: Compute the loss value that we wish to optimize
        loss = loss_fnc(y_pred, y_target)

        # Step 4: Propagate the loss signal backward
        loss.backward()

        # Step 5: Trigger the optimizer to perform one update
        optimizer.step()
    
    # after each epoch, we can test the model's performance on the dev set
    with torch.no_grad(): # let pytorch know that no gradient should be computed
        model.eval() # let the model know that it in test mode, i.e. no gradient and no dropout
        dev_data = torch.tensor(dev_vecs.todense(), dtype=torch.float)
        dev_target = torch.tensor(dev_labels, dtype=torch.int64)
        dev_prediction = model(dev_data, relu)
        pred_labels = [np.argmax(dp.numpy()) for dp in dev_prediction]
        acc = accuracy_score(dev_target, pred_labels)
        print('\n---> after epoch {} the accuracy on dev set is {}'.format(epoch_i, acc))
        for param_group in optimizer.param_groups:
            print('learning rate', param_group['lr'])
        
        # save the best model
        if acc > best_acc:
            best_acc = acc
            best_model = copy.deepcopy(model.state_dict())
            print('best model updated; new best acc',acc)
            
    # (optional) adjust learning rate according to the scheduler
    scheduler.step()
    


---> after epoch 0 the accuracy on dev set is 0.542
learning rate 0.001
best model updated; new best acc 0.542

---> after epoch 1 the accuracy on dev set is 0.702
learning rate 0.001
best model updated; new best acc 0.702

---> after epoch 2 the accuracy on dev set is 0.737
learning rate 0.001
best model updated; new best acc 0.737

---> after epoch 3 the accuracy on dev set is 0.759
learning rate 0.001
best model updated; new best acc 0.759

---> after epoch 4 the accuracy on dev set is 0.764
learning rate 0.001
best model updated; new best acc 0.764

---> after epoch 5 the accuracy on dev set is 0.764
learning rate 0.001

---> after epoch 6 the accuracy on dev set is 0.756
learning rate 0.001

---> after epoch 7 the accuracy on dev set is 0.753
learning rate 0.001

---> after epoch 8 the accuracy on dev set is 0.755
learning rate 0.001

---> after epoch 9 the accuracy on dev set is 0.764
learning rate 0.001

---> after epoch 10 the accuracy on dev set is 0.772
learning rate 0.0001


In [9]:
# test on the test set

# load the best model weights
model.load_state_dict(best_model) 

with torch.no_grad(): 
    model.eval()
    test_vecs = tfidf_vectorizer.transform(test_docs)
    test_data = torch.tensor(test_vecs.todense(), dtype=torch.float)
    test_target = torch.tensor(test_labels, dtype=torch.int64)
    test_prediction = model(test_data, relu)
    pred_labels = [np.argmax(dp.numpy()) for dp in test_prediction]
    acc = accuracy_score(test_target, pred_labels)
    print('acc on test data', acc)

acc on test data 0.795


### TODO:
* Print the dimension of each layer inside MLP to make sure you understand the how it works (e.g. print(z1.shape)). 
* Try different neural architectures (e.g. different dimension of the hidden layer, different activation functions, different number of hidden layers, different dropout rate) and see their influence on the performance
* Try different hyper parameters (e.g. num of epoch, learning rate, etc.) and see their influence on the performance
* Try to implement *early stoping* by following the instructions [here](https://github.com/pytorch/ignite/issues/560)
* Try different weight initialization strategies by following the top answer [here](https://stackoverflow.com/questions/49433936/how-to-initialize-weights-in-pytorch)
* Try different length of vector dimension and see its influence on the performance

## References
* Pytorch basics tutorial (**highly recommended if you have no deep learning development experiences before**): https://pytorch.org/tutorials/beginner/deep_learning_60min_blitz.html 
* learning rate scheduler in pytorch: https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html 
* loss functions in pytorch: https://pytorch.org/docs/stable/nn.html#loss-functions 