# Project 3: Text Classification in PyTorch

# Task 3: Let your creativity flow!

As discussed earlier, you are free to come up with anything in task 3. Think and try to model unique (not too complex!) neural architecture on your own. Remember that this model has to be novel as much as possible, so try not to copy other people's existing work. Using the same data, train the new model, and report the accuracy scores. How much better/worse is this model than the previous two models? Why do you think this is better/worse?

Brief description and analysis is in the report

## Model weighted Conv & GRU

Brief description and analysis is in the report

    outperform validation and test set
    not outperform training set

Epoch: 1  | time in 53 minutes, 44 seconds

	Loss: 0.0393(train)	|	Acc: 74.6%(train)
	Loss: 0.0212(valid)	|	Acc: 88.3%(valid)
Epoch: 2  | time in 53 minutes, 57 seconds

	Loss: 0.0189(train)	|	Acc: 89.5%(train)
	Loss: 0.0187(valid)	|	Acc: 89.7%(valid)
Epoch: 3  | time in 48 minutes, 56 seconds

	Loss: 0.0148(train)	|	Acc: 91.8%(train)
	Loss: 0.0183(valid)	|	Acc: 90.1%(valid)
Epoch: 4  | time in 103 minutes, 40 seconds

	Loss: 0.0116(train)	|	Acc: 93.5%(train)
	Loss: 0.0178(valid)	|	Acc: 90.5%(valid)
Epoch: 5  | time in 123 minutes, 16 seconds

	Loss: 0.0087(train)	|	Acc: 95.2%(train)
	Loss: 0.0229(valid)	|	Acc: 89.0%(valid)

Performance on test set

	Loss: 0.0218(test)	|	Acc: 89.9%(test)

In [1]:
import torch
import torchtext
from torchtext.datasets import text_classification
import os

NGRAMS = 2

if not os.path.isdir('./.data'):
    os.mkdir('./.data')

train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
    root='./.data', ngrams=NGRAMS, vocab=None)

BATCH_SIZE = 16
VOCAB_SIZE = 32

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

vocab_size = len(train_dataset.get_vocab())
classes = train_dataset.get_labels()

from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence
from torch.utils.data import DataLoader
from torch import nn

VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 32
NUM_CLASS = len(classes)

120000lines [00:17, 7026.61lines/s]
120000lines [00:30, 3944.83lines/s]
7600lines [00:01, 4705.45lines/s]


In [6]:
def generate_batch(batch):
    
    label = torch.tensor([i[0] for i in batch])
    text = [i[1] for i in batch]
    offsets = [0] + [len(entry) for entry in text]

    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    
    return text, offsets, label

def train(train_data):
    
    train_loss = 0
    train_acc = 0

    data = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
    
    for i, (text, offsets, cls) in enumerate(data):
        
        optimizer.zero_grad()
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        output = model(text, offsets)
        
        loss = criterion(output, cls)
        train_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        
        train_acc += (output.argmax(1) == cls).sum().item()
        
#         if i % 500 == 0: print("Batch: ", i, loss)
    scheduler.step()
    
    return train_loss/len(train_data), train_acc/len(train_data)

def test(test_data):
    
    test_loss = 0
    acc = 0
    
    data = DataLoader(test_data, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    
    for text, offsets, cls in data:
        
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text, offsets)
            loss = criterion(output, cls)
            test_loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return test_loss / len(test_data), acc / len(test_data)

In [7]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.bag = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim, embed_dim//4, bidirectional=True)
        
        self.conv = nn.Conv2d(1, 8, (3, embed_dim), stride=1, bias=True)
        self.conv1 = nn.Conv2d(1, 8, (4, embed_dim), stride=1, bias=True)
        self.conv2 = nn.Conv2d(1, 8, (5, embed_dim), stride=1, bias=True)
        
        self.fc = nn.Linear(8, 4)
        
        self.init_weights()

    def init_weights(self):
        self.bag.weight.data.uniform_(-0.5, 0.5)
        self.fc.weight.data.uniform_(-0.5, 0.5)
        self.fc.bias.data.zero_()
        self.weight_gru = torch.nn.Parameter(torch.Tensor(1, 8).uniform_(-0.5, 0.5))
        self.weight_cnn = torch.nn.Parameter(torch.Tensor(1, 8).uniform_(-0.5, 0.5))

    def forward(self, text, offsets):
        
        text_offsets = []
        for i in range(1, len(offsets)+1):
            if i == len(offsets):
                text_offsets.append(text[offsets[i-1]:])
            else:
                text_offsets.append(text[offsets[i-1]:offsets[i]])
        text_len = [len(x) for x in text_offsets]

        pad_text = nn.utils.rnn.pad_sequence(text_offsets)
        
        x = self.bag(pad_text)
        x_packed = pack_padded_sequence(x, text_len, enforce_sorted=False)

        gru_packed, hn = self.gru(x_packed)
        
        output, input_sizes = pad_packed_sequence(gru_packed)
        gru_output = torch.mean(hn, dim=0).squeeze(0)
        
        # CONV
        x_conv = x.transpose(0, 1)
        x_conv = x_conv.resize(x_conv.size(0), 1, x_conv.size(1), x_conv.size(2))
        out_conv1 = self.conv(x_conv)
        out_conv1, _ = torch.max(out_conv1, dim=2)
        
        out_conv2 = self.conv(x_conv)
        out_conv2, _ = torch.max(out_conv2, dim=2)
        
        out_conv3 = self.conv(x_conv)
        out_conv3, _ = torch.max(out_conv3, dim=2)
        
        out_conv = torch.cat([out_conv1, out_conv2, out_conv3], dim=2)
        out_conv = torch.mean(out_conv, dim=2)
        
        out_both = gru_output*self.weight_gru + out_conv*self.weight_cnn
        
        out = self.fc(out_both)
        
        return out
    
VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 32
NUM_CLASS = len(classes)

model = TextClassifier(VOCAB_SIZE, EMBED_DIM, NUM_CLASS).to(device)

In [8]:
import time
from torch.utils.data.dataset import random_split

N_EPOCHS = 5
TRAIN_RATIO = 0.9

validation_loss = float('inf')
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters())
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

train_n = int(len(train_dataset) * TRAIN_RATIO)
training_data, valid_data = random_split(train_dataset, [train_n, len(train_dataset) - train_n])

for epoch in range(5):

    start_time = time.time()
    train_loss, train_acc = train(training_data)
    valid_loss, valid_acc = test(valid_data)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

print("Evaluating the test data:")
test_loss, test_acc = test(test_dataset)
print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')



Epoch: 1  | time in 53 minutes, 44 seconds
	Loss: 0.0393(train)	|	Acc: 74.6%(train)
	Loss: 0.0212(valid)	|	Acc: 88.3%(valid)
Epoch: 2  | time in 53 minutes, 57 seconds
	Loss: 0.0189(train)	|	Acc: 89.5%(train)
	Loss: 0.0187(valid)	|	Acc: 89.7%(valid)
Epoch: 3  | time in 48 minutes, 56 seconds
	Loss: 0.0148(train)	|	Acc: 91.8%(train)
	Loss: 0.0183(valid)	|	Acc: 90.1%(valid)
Epoch: 4  | time in 103 minutes, 40 seconds
	Loss: 0.0116(train)	|	Acc: 93.5%(train)
	Loss: 0.0178(valid)	|	Acc: 90.5%(valid)
Epoch: 5  | time in 123 minutes, 16 seconds
	Loss: 0.0087(train)	|	Acc: 95.2%(train)
	Loss: 0.0229(valid)	|	Acc: 89.0%(valid)
Checking the results of test dataset...
	Loss:  0.0218(test)	|	Acc: 89.9%(test)

## Model weighted Conv & GRU (with GloVe)

Brief description and analysis is in the report

TLDR;
    
    outperform validation and test set
    not outperform (but almost) training set

Epoch: 1  | time in 41 minutes, 5 seconds

	Loss: 0.0167(train)	|	Acc: 90.9%(train)
	Loss: 0.0139(valid)	|	Acc: 92.3%(valid)
    
Epoch: 2  | time in 39 minutes, 24 seconds

	Loss: 0.0100(train)	|	Acc: 94.5%(train)
	Loss: 0.0128(valid)	|	Acc: 93.3%(valid)
    
Epoch: 3  | time in 39 minutes, 39 seconds

	Loss: 0.0065(train)	|	Acc: 96.5%(train)
	Loss: 0.0138(valid)	|	Acc: 93.1%(valid)
    
Epoch: 4  | time in 37 minutes, 17 seconds

	Loss: 0.0040(train)	|	Acc: 97.8%(train)
	Loss: 0.0158(valid)	|	Acc: 92.5%(valid)
    
Epoch: 5  | time in 20 minutes, 35 seconds

	Loss: 0.0023(train)	|	Acc: 98.8%(train)
	Loss: 0.0195(valid)	|	Acc: 92.3%(valid)
    
Checking the results of test dataset...

	Loss:  0.0210(test)	|	Acc: 91.8%(test)

In [9]:
"""
Load the AG_NEWS dataset in bi-gram features format.
"""

import torch
import torchtext
from torchtext.datasets import text_classification
import os

from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence
from torch import nn

NGRAMS = 1

if not os.path.isdir('./.data'):
    os.mkdir('./.data')

train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
    root='./.data', ngrams=NGRAMS, vocab=None)

BATCH_SIZE = 16

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

120000lines [00:09, 12824.06lines/s]
120000lines [00:17, 7008.21lines/s]
7600lines [00:01, 4535.13lines/s]


In [10]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class, pretrained_vectors):
        super().__init__()
        self.bag = nn.Embedding(vocab_size, embed_dim)
        self.bag.load_state_dict({'weight': pretrained_vectors})
        self.gru = nn.GRU(embed_dim, embed_dim//2, bidirectional=True)
        
        self.conv = nn.Conv2d(1, 25, (3, embed_dim), stride=1, bias=True)
        self.conv1 = nn.Conv2d(1, 25, (4, embed_dim), stride=1, bias=True)
        self.conv2 = nn.Conv2d(1, 25, (5, embed_dim), stride=1, bias=True)
        
        self.fc = nn.Linear(25, 4)
        
        self.init_weights()

    def init_weights(self):
        self.fc.weight.data.uniform_(-0.5, 0.5)
        self.fc.bias.data.zero_()
        self.weight_gru = torch.nn.Parameter(torch.Tensor(1, 25).uniform_(-0.5, 0.5))
        self.weight_cnn = torch.nn.Parameter(torch.Tensor(1, 25).uniform_(-0.5, 0.5))
    
    def forward(self, text, offsets):
        
        text_offsets = []
        for i in range(1, len(offsets)+1):
            if i == len(offsets):
                text_offsets.append(text[offsets[i-1]:])
            else:
                text_offsets.append(text[offsets[i-1]:offsets[i]])
        text_len = [len(x) for x in text_offsets]

        pad_text = nn.utils.rnn.pad_sequence(text_offsets)
        
        x = self.bag(pad_text)
        x_packed = pack_padded_sequence(x, text_len, enforce_sorted=False)

        gru_packed, hn = self.gru(x_packed)
        
        output, input_sizes = pad_packed_sequence(gru_packed)
        gru_output = torch.mean(hn, dim=0).squeeze(0)
        
        # CONV
        x_conv = x.transpose(0, 1)
        x_conv = x_conv.resize(x_conv.size(0), 1, x_conv.size(1), x_conv.size(2))
        out_conv1 = self.conv(x_conv)
        out_conv1, _ = torch.max(out_conv1, dim=2)
        
        out_conv2 = self.conv(x_conv)
        out_conv2, _ = torch.max(out_conv2, dim=2)
        
        out_conv3 = self.conv(x_conv)
        out_conv3, _ = torch.max(out_conv3, dim=2)
        
        out_conv = torch.cat([out_conv1, out_conv2, out_conv3], dim=2)
        out_conv = torch.mean(out_conv, dim=2)
        
        out_both = gru_output*self.weight_gru + out_conv*self.weight_cnn
        out = self.fc(out_both)

        return out

VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 50
NUM_CLASS = len(classes)

#load glove 
vectors = torchtext.vocab.GloVe(name='6B', dim=50)

train_vocab = train_dataset.get_vocab()
weights_matrix = torch.zeros((VOCAB_SIZE, EMBED_DIM))

for i, word in enumerate(train_vocab.itos):
    word_vector = torch.sum(torch.abs(vectors.get_vecs_by_tokens(word)))
    if word_vector.item() == 0:
        weights_matrix[i] = torch.FloatTensor(50).uniform_(-0.1, 0.1)
    else:
        weights_matrix[i] = vectors.get_vecs_by_tokens(word)

In [11]:
N_EPOCHS = 5
TRAIN_RATIO = 0.9

model = TextClassifier(VOCAB_SIZE, EMBED_DIM, NUM_CLASS, weights_matrix).to(device)
validation_loss = float('inf')
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters())
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)
train_n = int(len(train_dataset) * TRAIN_RATIO)
training_data, valid_data = random_split(train_dataset, [train_n, len(train_dataset) - train_n])

for epoch in range(5):

    start_time = time.time()
    train_loss, train_acc = train(training_data)
    valid_loss, valid_acc = test(valid_data)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')
    
print("Evaluating the test data:")
test_loss, test_acc = test(test_dataset)
print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')

Epoch: 1  | time in 41 minutes, 5 seconds
	Loss: 0.0167(train)	|	Acc: 90.9%(train)
	Loss: 0.0139(valid)	|	Acc: 92.3%(valid)
Epoch: 2  | time in 39 minutes, 24 seconds
	Loss: 0.0100(train)	|	Acc: 94.5%(train)
	Loss: 0.0128(valid)	|	Acc: 93.3%(valid)
Epoch: 3  | time in 39 minutes, 39 seconds
	Loss: 0.0065(train)	|	Acc: 96.5%(train)
	Loss: 0.0138(valid)	|	Acc: 93.1%(valid)
Epoch: 4  | time in 37 minutes, 17 seconds
	Loss: 0.0040(train)	|	Acc: 97.8%(train)
	Loss: 0.0158(valid)	|	Acc: 92.5%(valid)
Epoch: 5  | time in 20 minutes, 35 seconds
	Loss: 0.0023(train)	|	Acc: 98.8%(train)
	Loss: 0.0195(valid)	|	Acc: 92.3%(valid)
Checking the results of test dataset...
	Loss:  0.0210(test)	|	Acc: 91.8%(test)