# Problem 2: Newsgroups

The code for this single-layer perceptron can be found in `newsgroups.py`. The module `utils.py` contains helper functions to load the dataset, display progress bar, plot graphs, etc.

In [1]:
import sys
sys.path.append('../src/')
from newsgroups_v2 import *

---
## Building the Model

We initialize the parameters of the single-layer MLP.

In [2]:
# Model parameters
batch_size = 64
layers = [61188, 100, 20]
learning_rate = 1e-4
momentum = 0.9
eps = 1e-5
train_filename = "../data/newsgroups/matlab/train"
test_filename = "../data/newsgroups/matlab/test"
saved = "../data/newsgroups/saved/"
train_size = 11269
test_size = 7505

Next, we define a learning rate grid search for our model.

In [3]:
lr_search = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]
nb_epochs = 20

We can now load different datasets for each preprocessing methods. Since loading all three datasets at once requires a huge amount of memory, we load them sequentially by clearing out the memory inbetween each procedure.

### No Preprocessing

In [4]:
# Load dataset without any preprocessing (count vector)
train_data, test_data = load_newsgroups(train_filename, test_filename, 
                                layers[0], train_size, test_size, "count")
train_loader = DataLoader(train_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

In [5]:
# Compile and train model
train_acc = np.zeros((len(lr_search), nb_epochs))
train_acc = np.zeros((len(lr_search), nb_epochs))

# Learning rate grid search
for i, lr in enumerate(lr_search):
    print("{}\nLearning rate = {:.4f}\n{}".format("="*30, lr, "-"*30))
    mlp_n = Newsgroups(layers, lr, momentum)
    _, train_acc[i], test_acc[i] = mlp_n.train(nb_epochs, train_loader, test_loader)

Learning rate = 0.1000
------------------------------
Epoch 1/20
Avg loss: 13.2354 -- Train acc: 0.0415 -- Test acc: 0.0414
Epoch 2/20
Avg loss: 55.6112 -- Train acc: 0.0434 -- Test acc: 0.0429
Epoch 3/20
Training [■■■■■■              ] 29.5% 

KeyboardInterrupt: 

In [None]:
# Find best learning rate
best_lr_idx = test_acc.max(1)[0].max(0)[1][0]
best_lr = lr_search[best_lr_idx]

# Plot accuracy per epoch for this learning rate
plots_per_epoch([train_acc[best_lr_idx], test_acc[best_lr_idx]], 
    ["Train", "Test"], "Accuracy", "Best learning rate = {}".format(best_lr))

### TF-IDF

In [None]:
# Load dataset with tf-idf preprocessing (tf-idf vector)
train_data, test_data = load_newsgroups(train_filename, test_filename, 
                                layers[0], train_size, test_size, "tfidf")
train_loader = DataLoader(train_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

In [None]:
# Compile and train model
train_acc = np.zeros((len(lr_search), nb_epochs))
train_acc = np.zeros((len(lr_search), nb_epochs))

# Learning rate grid search
for i, lr in enumerate(lr_search):
    print("{}\nLearning rate = {:.4f}\n{}".format("="*30, lr, "-"*30))
    mlp_t = Newsgroups(layers, lr, momentum)
    _, train_acc[i], test_acc[i] = mlp_t.train(nb_epochs, train_loader, test_loader)

In [None]:
# Find best learning rate
best_lr_idx = test_acc.max(1)[0].max(0)[1][0]
best_lr = lr_search[best_lr_idx]

# Plot accuracy per epoch for this learning rate
plots_per_epoch([train_acc[best_lr_idx], test_acc[best_lr_idx]], 
    ["Train", "Test"], "Accuracy", "Best learning rate = {}".format(best_lr))

### Standardization

In [None]:
# Load dataset with standardization preprocessing, given epsilon
train_data, test_data = load_newsgroups(train_filename, test_filename, 
                                layers[0], train_size, test_size, "stand", eps)
train_loader = DataLoader(train_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

In [None]:
# Compile and train model
train_acc = np.zeros((len(lr_search), nb_epochs))
train_acc = np.zeros((len(lr_search), nb_epochs))

# Learning rate grid search
for i, lr in enumerate(lr_search):
    print("{}\nLearning rate = {:.4f}\n{}".format("="*30, lr, "-"*30))
    mlp_s = Newsgroups(layers, lr, momentum)
    _, train_acc[i], test_acc[i] = mlp_s.train(nb_epochs, train_loader, test_loader)

In [None]:
# Find best learning rate
best_lr_idx = test_acc.max(1)[0].max(0)[1][0]
best_lr = lr_search[best_lr_idx]

# Plot accuracy per epoch for this learning rate
plots_per_epoch([train_acc[best_lr_idx], test_acc[best_lr_idx]], 
    ["Train", "Test"], "Accuracy", "Best learning rate = {}".format(best_lr))

---
## Variance in training

In [4]:
# Load tfidf dataset
train_data, test_data = load_newsgroups(train_filename, test_filename, 
                                layers[0], train_size, test_size, "tfidf")
learning_rate = 0.2

In [6]:
batch_size = 1
train_loader = DataLoader(train_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

In [7]:
mlp_t = Newsgroups(layers, learning_rate, momentum)
train_loss, train_acc, test_acc = mlp_t.train(1, train_loader, test_loader, 5000)

Epoch 1/1
Update    0/5000 -- Cur Loss: 1.9969
Update  100/5000 -- Cur Loss: 0.0000
Update  200/5000 -- Cur Loss: 0.0000
Update  300/5000 -- Cur Loss: 0.0000
Update  400/5000 -- Cur Loss: 0.0000
Update  500/5000 -- Cur Loss: 0.0000
Update  600/5000 -- Cur Loss: 0.0000


KeyboardInterrupt: 

In [7]:
print(train_loss)

NameError: name 'train_loss' is not defined

In [8]:
batch_size = 100
train_loader = DataLoader(train_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

In [10]:
train_loss, train_acc, test_acc = mlp_t.train(100, train_loader, test_loader, 5000)

Epoch 1/100
Update    0/5000 -- Cur Loss: 5.8199
Update  100/5000 -- Cur Loss: 6.6925
Epoch 2/100
Update  200/5000 -- Cur Loss: 6.3561
Epoch 3/100
Update  300/5000 -- Cur Loss: 4.9169


KeyboardInterrupt: 

In [None]:
train_loss

In [None]:
plot_per_epoch(train_loss, "Loss", "Vochier")