# Project 3: Text Classification in PyTorch

# Task 2: Try the better option that you proposed

In Question 4, you have proposed some alternate solution that you think will be able to somehow improve your model. Following one of the options below, try to build and train a new model, and report the new loss and accuracy scores. Is it better than your initial classifier model for the same data?

For your reference, here are some neural models using which researchers have tried to classify text before:

* Recurrent Neural Networks (RNNs)
* Long-Short Term Memory (LSTM)
* Bi-directional LSTM (BiLSTM)
* Gated Recurrent Units (GRUs)

In [1]:
import torch
import torchtext
from torchtext.datasets import text_classification
import os

NGRAMS = 2

if not os.path.isdir('./.data'):
    os.mkdir('./.data')

train_dataset, test_dataset = text_classification.DATASETS['AG_NEWS'](
    root='./.data', ngrams=NGRAMS, vocab=None)

BATCH_SIZE = 16
VOCAB_SIZE = 32

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

vocab_size = len(train_dataset.get_vocab())
classes = train_dataset.get_labels()

from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence
from torch.utils.data import DataLoader
from torch import nn

VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 32
NUM_CLASS = len(classes)

120000lines [00:16, 7257.64lines/s]
120000lines [00:29, 4040.97lines/s]
7600lines [00:01, 4139.34lines/s]


In [2]:
def generate_batch(batch):
    
    label = torch.tensor([i[0] for i in batch])
    text = [i[1] for i in batch]
    offsets = [0] + [len(entry) for entry in text]

    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text = torch.cat(text)
    
    return text, offsets, label

def train(train_data):
    
    train_loss = 0
    train_acc = 0

    data = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
    
    
    for i, (text, offsets, cls) in enumerate(data):
        
        optimizer.zero_grad()
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        output = model(text, offsets)
        
        loss = criterion(output, cls)
        train_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        
        train_acc += (output.argmax(1) == cls).sum().item()
        
    scheduler.step()
    

    return train_loss/len(train_data), train_acc/len(train_data)

def test(test_data):
    
    test_loss = 0
    acc = 0
    
    data = DataLoader(test_data, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    
    for text, offsets, cls in data:
        
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text, offsets)
            loss = criterion(output, cls)
            test_loss += loss.item()
            acc += (output.argmax(1) == cls).sum().item()

    return test_loss / len(test_data), acc / len(test_data)

## Model Wang 

#### Brief description and analysis is in the report

TLDR;

    out perform on test accuracy
    not out perform on training and validation accuracy

Epoch: 1

	Loss: 0.0697(train)	|	Acc: 60.3%(train)
	Loss: 0.0558(valid)	|	Acc: 84.9%(valid)

Epoch: 2

	Loss: 0.0551(train)	|	Acc: 86.1%(train)
	Loss: 0.0541(valid)	|	Acc: 87.7%(valid)

Epoch: 3

	Loss: 0.0533(train)	|	Acc: 89.0%(train)
	Loss: 0.0537(valid)	|	Acc: 88.2%(valid)

Epoch: 4

	Loss: 0.0527(train)	|	Acc: 90.0%(train)
	Loss: 0.0535(valid)	|	Acc: 88.6%(valid)

Epoch: 5

	Loss: 0.0521(train)	|	Acc: 90.8%(train)
	Loss: 0.0532(valid)	|	Acc: 89.2%(valid)

Performance on test dataset

    Loss: 0.0532(test)	|	Acc: 89.0%(test)

In [3]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.bag = nn.Embedding(vocab_size, embed_dim, sparse=True)
        self.lstm = nn.LSTM(embed_dim, embed_dim)
        self.fc = nn.Linear(embed_dim, num_class)
        self.sf = nn.Softmax()
        self.init_weights()

    def init_weights(self):
        self.weight = torch.nn.Parameter(torch.Tensor(32, 1).uniform_(-0.0001, 0.0001) * torch.sqrt(torch.tensor(6./32+1)))
        self.bag.weight.data.uniform_(-0.5, 0.5)
        self.fc.weight.data.uniform_(-0.5, 0.5)
        self.fc.bias.data.zero_()
    
    def forward(self, text, offsets):
        
        text_offsets = []
        for i in range(1, len(offsets)+1):
            if i == len(offsets):
                text_offsets.append(text[offsets[i-1]:])
            else:
                text_offsets.append(text[offsets[i-1]:offsets[i]])
        text_len = [len(x) for x in text_offsets]
          
        sorted_text_len = sorted(text_len)
        max_length = sorted_text_len[13]
        
        text_len = []
        for x in text_offsets:
            tmp = len(x) if len(x) < max_length else max_length
            text_len.append(tmp)

        pad_text = nn.utils.rnn.pad_sequence(text_offsets)
        pad_text = pad_text[:max_length, :]
        x = self.bag(pad_text)
    
        x_packed = pack_padded_sequence(x, text_len, enforce_sorted=False)
        lstm_packed, (hn, cn) = self.lstm(x_packed)
        lstm_output, input_sizes = pad_packed_sequence(lstm_packed)

        lstm_output = lstm_output.resize(max_length, 16, 32, 1)
        lstm_output = lstm_output * self.weight
        lstm_output = lstm_output.resize(max_length, 16, 32)
        
        sum_output = torch.sum(lstm_output, dim=0)
        out = self.fc(sum_output)
        out = self.sf(out)
        return out

model = TextClassifier(VOCAB_SIZE, EMBED_DIM, NUM_CLASS).to(device)

In [4]:
import time
from torch.utils.data.dataset import random_split

N_EPOCHS = 5
LEARNING_RATE = 0.6
TRAIN_RATIO = 0.9

validation_loss = float('inf')
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

train_n = int(len(train_dataset) * TRAIN_RATIO)
training_data, valid_data = random_split(train_dataset, [train_n, len(train_dataset) - train_n])

for epoch in range(5):

    start_time = time.time()
    train_loss, train_acc = train(training_data)
    valid_loss, valid_acc = test(valid_data)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')
    
print('Checking the results of test dataset...')
test_loss, test_acc = test(test_dataset)
print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')



Epoch: 1  | time in 21 minutes, 8 seconds
	Loss: 0.0697(train)	|	Acc: 60.3%(train)
	Loss: 0.0558(valid)	|	Acc: 84.9%(valid)
Epoch: 2  | time in 23 minutes, 9 seconds
	Loss: 0.0551(train)	|	Acc: 86.1%(train)
	Loss: 0.0541(valid)	|	Acc: 87.7%(valid)
Epoch: 3  | time in 24 minutes, 58 seconds
	Loss: 0.0533(train)	|	Acc: 89.0%(train)
	Loss: 0.0537(valid)	|	Acc: 88.2%(valid)
Epoch: 4  | time in 26 minutes, 7 seconds
	Loss: 0.0527(train)	|	Acc: 90.0%(train)
	Loss: 0.0535(valid)	|	Acc: 88.6%(valid)
Epoch: 5  | time in 30 minutes, 12 seconds
	Loss: 0.0521(train)	|	Acc: 90.8%(train)
	Loss: 0.0532(valid)	|	Acc: 89.2%(valid)
Checking the results of test dataset...
	Loss: 0.0532(test)	|	Acc: 89.0%(test)


## Model Li

#### Brief description and analysis is in the report

TLDR;

    out perform on test accuracy
    relatively same on validation accuracy
    not out perform on training accuracy

Epoch: 1

	Loss: 0.0737(train)	|	Acc: 41.6%(train)
	Loss: 0.0500(valid)	|	Acc: 62.4%(valid)
    
Epoch: 2

	Loss: 0.0329(train)	|	Acc: 80.2%(train)
	Loss: 0.0242(valid)	|	Acc: 87.7%(valid)

Epoch: 3

	Loss: 0.0208(train)	|	Acc: 88.9%(train)
	Loss: 0.0191(valid)	|	Acc: 89.9%(valid)
    
Epoch: 4

	Loss: 0.0168(train)	|	Acc: 91.1%(train)
	Loss: 0.0183(valid)	|	Acc: 90.2%(valid)
    
Epoch: 5 

	Loss: 0.0139(train)	|	Acc: 92.6%(train)
	Loss: 0.0187(valid)	|	Acc: 90.9%(valid)
    
    
Performance on test dataset

	Loss: 0.0205(test)	|	Acc: 89.0%(test)

In [5]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.bag = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim, embed_dim//2, bidirectional=True)
        
        self.conv = nn.Conv2d(1, 16, (3, embed_dim//2), stride=1, bias=True)
        self.conv1 = nn.Conv2d(1, 16, (4, embed_dim//2), stride=1, bias=True)
        self.conv2 = nn.Conv2d(1, 16, (5, embed_dim//2), stride=1, bias=True)
        
        self.fc = nn.Linear(48, 4)
    
        self.relu = nn.ReLU()
        
        self.init_weights()

    def init_weights(self):
        self.bag.weight.data.uniform_(-0.5, 0.5)
        self.fc.weight.data.uniform_(-0.5, 0.5)
        self.fc.bias.data.zero_()
    
    def forward(self, text, offsets):
        
        text_offsets = []
        for i in range(1, len(offsets)+1):
            if i == len(offsets):
                text_offsets.append(text[offsets[i-1]:])
            else:
                text_offsets.append(text[offsets[i-1]:offsets[i]])
        text_len = [len(x) for x in text_offsets]

        pad_text = nn.utils.rnn.pad_sequence(text_offsets)
        
        x = self.bag(pad_text)
        x_packed = pack_padded_sequence(x, text_len, enforce_sorted=False)
        
        # GRU
        gru_packed, hn = self.gru(x_packed)
        
        output, input_sizes = pad_packed_sequence(gru_packed)
        
        oz = output.size()
        output = output.resize(oz[0], oz[1], oz[2], 1)
        output = output.transpose(1, 3)
        output = output.transpose(2, 3)
        output = output.transpose(0, 2)
        
        output_1 = output[:, :, :, :16]
        output_2 = output[:, :, :, 16:]
        output = output_1.add(output_2)
        
        # CONV
        out_conv1 = self.conv(output)
        out_conv1, _ = torch.max(out_conv1, dim=2)
        
        out_conv2 = self.conv(output)
        out_conv2, _ = torch.max(out_conv2, dim=2)
        
        out_conv3 = self.conv(output)
        out_conv3, _ = torch.max(out_conv3, dim=2)
    
        out_conv = torch.cat([out_conv1, out_conv2, out_conv3], dim=1)
        out = self.fc(out_conv.squeeze(2))
        return out

model = TextClassifier(VOCAB_SIZE, EMBED_DIM, NUM_CLASS).to(device)

In [6]:
import time
from torch.utils.data.dataset import random_split

N_EPOCHS = 5
LEARNING_RATE = 0.6
TRAIN_RATIO = 0.9

validation_loss = float('inf')
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

train_n = int(len(train_dataset) * TRAIN_RATIO)
training_data, valid_data = random_split(train_dataset, [train_n, len(train_dataset) - train_n])

for epoch in range(10):

    start_time = time.time()
    train_loss, train_acc = train(training_data)
    valid_loss, valid_acc = test(valid_data)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 101 minutes, 45 seconds
	Loss: 0.0737(train)	|	Acc: 41.6%(train)
	Loss: 0.0500(valid)	|	Acc: 62.4%(valid)
Epoch: 2  | time in 102 minutes, 31 seconds
	Loss: 0.0329(train)	|	Acc: 80.2%(train)
	Loss: 0.0242(valid)	|	Acc: 87.7%(valid)
Epoch: 3  | time in 102 minutes, 43 seconds
	Loss: 0.0208(train)	|	Acc: 88.9%(train)
	Loss: 0.0191(valid)	|	Acc: 89.9%(valid)
Epoch: 4  | time in 103 minutes, 7 seconds
	Loss: 0.0168(train)	|	Acc: 91.1%(train)
	Loss: 0.0183(valid)	|	Acc: 90.2%(valid)
Epoch: 5  | time in 93 minutes, 32 seconds
	Loss: 0.0139(train)	|	Acc: 92.6%(train)
	Loss: 0.0187(valid)	|	Acc: 90.9%(valid)
Checking the results of test dataset...
	Loss: 0.0205(test)	|	Acc: 89.0%(test)


## Model Modified Li

#### Brief description and analysis is in the report

TLDR;
   
    out perform on test accuracy & validation accuracy
    not out perform on training accuracy

Epoch: 1  | time in 68 minutes, 57 seconds

	Loss: 0.0366(train)	|	Acc: 76.0%(train)
	Loss: 0.0201(valid)	|	Acc: 86.6%(valid)

Epoch: 2  | time in 42 minutes, 21 seconds

	Loss: 0.0170(train)	|	Acc: 90.4%(train)
	Loss: 0.0173(valid)	|	Acc: 90.2%(valid)
    
Epoch: 3  | time in 40 minutes, 8 seconds

	Loss: 0.0131(train)	|	Acc: 92.7%(train)
	Loss: 0.0163(valid)	|	Acc: 91.1%(valid)

Epoch: 4  | time in 31 minutes, 47 seconds

	Loss: 0.0103(train)	|	Acc: 94.2%(train)
	Loss: 0.0168(valid)	|	Acc: 91.0%(valid)
    
Epoch: 5  | time in 35 minutes, 12 seconds

	Loss: 0.0077(train)	|	Acc: 95.7%(train)
	Loss: 0.0183(valid)	|	Acc: 90.8%(valid)
    
Performance on test dataset

	Loss: 0.0200(test)	|	Acc: 90.1%(test)


In [7]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.bag = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim, embed_dim//2, bidirectional=True)

        self.conv1 = nn.Conv1d(2, 8, 3, stride=2, bias=True)
        self.conv2 = nn.Conv1d(2, 8, 4, stride=2, bias=True)
        self.fc = nn.Linear(7, 4)
        
        self.init_weights()

    def init_weights(self):
        self.bag.weight.data.uniform_(-0.5, 0.5)
        self.fc.weight.data.uniform_(-0.5, 0.5)
        self.fc.bias.data.zero_()
    
    def forward(self, text, offsets):
        
        text_offsets = []
        for i in range(1, len(offsets)+1):
            if i == len(offsets):
                text_offsets.append(text[offsets[i-1]:])
            else:
                text_offsets.append(text[offsets[i-1]:offsets[i]])
        text_len = [len(x) for x in text_offsets]

        pad_text = nn.utils.rnn.pad_sequence(text_offsets)
        
        x = self.bag(pad_text)
        x_packed = pack_padded_sequence(x, text_len, enforce_sorted=False)
        
        # GRU
        gru_packed, hn = self.gru(x_packed)
        hn = hn.transpose(0, 1)
        
        # CONV
        out_conv = self.conv1(hn)
        out_conv = torch.mean(out_conv, dim=1)
        out = self.fc(out_conv)
        return out

'''
Paramters and model instance creation.
'''
VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 32
NUM_CLASS = len(classes)

model = TextClassifier(VOCAB_SIZE, EMBED_DIM, NUM_CLASS).to(device)

In [8]:
import time
from torch.utils.data.dataset import random_split

N_EPOCHS = 5
LEARNING_RATE = 0.615
TRAIN_RATIO = 0.9

validation_loss = float('inf')
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, weight_decay=0.00001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.95)

train_n = int(len(train_dataset) * TRAIN_RATIO)
training_data, valid_data = random_split(train_dataset, [train_n, len(train_dataset) - train_n])

for epoch in range(5):

    start_time = time.time()
    train_loss, train_acc = train(training_data)
    valid_loss, valid_acc = test(valid_data)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')
    
print('Checking the results of test dataset...')
test_loss, test_acc = test(test_dataset)
print(f'\tLoss: {test_loss:.4f}(test)\t|\tAcc: {test_acc * 100:.1f}%(test)')

Epoch: 1  | time in 68 minutes, 57 seconds
	Loss: 0.0366(train)	|	Acc: 76.0%(train)
	Loss: 0.0201(valid)	|	Acc: 88.6%(valid)
Epoch: 2  | time in 42 minutes, 21 seconds
	Loss: 0.0170(train)	|	Acc: 90.4%(train)
	Loss: 0.0173(valid)	|	Acc: 90.2%(valid)
Epoch: 3  | time in 40 minutes, 8 seconds
	Loss: 0.0131(train)	|	Acc: 92.7%(train)
	Loss: 0.0163(valid)	|	Acc: 91.1%(valid)
Epoch: 4  | time in 31 minutes, 47 seconds
	Loss: 0.0103(train)	|	Acc: 94.2%(train)
	Loss: 0.0168(valid)	|	Acc: 91.0%(valid)
Epoch: 5  | time in 35 minutes, 12 seconds
	Loss: 0.0077(train)	|	Acc: 95.7%(train)
	Loss: 0.0183(valid)	|	Acc: 90.8%(valid)
Checking the results of test dataset...
	Loss: 0.0200(test)	|	Acc: 90.1%(test)

## Model Li with GloVe

#### Brief description and analysis is in the report

TLDR;

    out perform on test accuracy & validation accuracy
    not out perform on training accuracy

Epoch: 1  | time in 51 minutes, 59 seconds

	Loss: 0.0176(train)	|	Acc: 90.3%(train)
	Loss: 0.0143(valid)	|	Acc: 92.1%(valid)
    
Epoch: 2  | time in 42 minutes, 0 seconds

	Loss: 0.0109(train)	|	Acc: 94.0%(train)
	Loss: 0.0129(valid)	|	Acc: 92.9%(valid)
    
Epoch: 3  | time in 40 minutes, 38 seconds

	Loss: 0.0074(train)	|	Acc: 96.0%(train)
	Loss: 0.0135(valid)	|	Acc: 92.9%(valid)
    
Epoch: 4  | time in 39 minutes, 4 seconds

	Loss: 0.0048(train)	|	Acc: 97.4%(train)
	Loss: 0.0156(valid)	|	Acc: 92.4%(valid)
    
Epoch: 5  | time in 39 minutes, 8 seconds
	
    Loss: 0.0029(train)	|	Acc: 98.4%(train)
	Loss: 0.0202(valid)	|	Acc: 91.7%(valid)

Performance on test dataset

	Loss: 0.0214(test)	|	Acc: 91.5%(test)


In [9]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class, pretrained_vectors):
        super().__init__()
        self.bag = nn.Embedding(vocab_size, embed_dim)
        self.bag.load_state_dict({'weight': pretrained_vectors})
        
        self.rnn = nn.GRU(embed_dim, embed_dim//2, bidirectional=True)
        
        self.conv = nn.Conv2d(1, 8, (3, embed_dim//2), stride=1, bias=True)
        self.conv1 = nn.Conv2d(1, 8, (4, embed_dim//2), stride=1, bias=True)
        self.conv2 = nn.Conv2d(1, 8, (5, embed_dim//2), stride=1, bias=True)
        
        self.fc = nn.Linear(24, 4)
        
        self.init_weights()

    def init_weights(self):
        self.fc.weight.data.uniform_(-0.5, 0.5)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        
        text_offsets = []
        for i in range(1, len(offsets)+1):
            if i == len(offsets):
                text_offsets.append(text[offsets[i-1]:])
            else:
                text_offsets.append(text[offsets[i-1]:offsets[i]])
        text_len = [len(x) for x in text_offsets]

        pad_text = nn.utils.rnn.pad_sequence(text_offsets)
        
        x = self.bag(pad_text)
        x_packed = pack_padded_sequence(x, text_len, enforce_sorted=False)

        rnn_packed, hn = self.rnn(x_packed)
        
        output, input_sizes = pad_packed_sequence(rnn_packed)
        
        oz = output.size()
        output = output.resize(oz[0], oz[1], oz[2], 1)
        output = output.transpose(1, 3)
        output = output.transpose(2, 3)
        output = output.transpose(0, 2)
        
        output_1 = output[:, :, :, :25]
        output_2 = output[:, :, :, 25:]
        output = output_1.add(output_2)
        
        # CONV
        out_conv1 = self.conv(output)
        out_conv1, _ = torch.max(out_conv1, dim=2)
        
        out_conv2 = self.conv(output)
        out_conv2, _ = torch.max(out_conv2, dim=2)
        
        out_conv3 = self.conv(output)
        out_conv3, _ = torch.max(out_conv3, dim=2)
        
        out_conv = torch.cat([out_conv1, out_conv2, out_conv3], dim=1)
        out = self.fc(out_conv.squeeze(2))
        

        return out

In [10]:
'''
Paramters and model instance creation.
'''
VOCAB_SIZE = len(train_dataset.get_vocab())
EMBED_DIM = 50
NUM_CLASS = len(classes)

#load glove 
vectors = torchtext.vocab.GloVe(name='6B', dim=50)

train_vocab = train_dataset.get_vocab()
weights_matrix = torch.zeros((VOCAB_SIZE, EMBED_DIM))

for i, word in enumerate(train_vocab.itos):
    word_vector = torch.sum(torch.abs(vectors.get_vecs_by_tokens(word)))
    if word_vector.item() == 0:
        weights_matrix[i] = torch.FloatTensor(50).uniform_(-0.1, 0.1)
    else:
        weights_matrix[i] = vectors.get_vecs_by_tokens(word)

In [11]:
N_EPOCHS = 5
TRAIN_RATIO = 0.9

model = TextClassifier(VOCAB_SIZE, EMBED_DIM, NUM_CLASS, weights_matrix).to(device)
validation_loss = float('inf')
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters())
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)
train_n = int(len(train_dataset) * TRAIN_RATIO)
training_data, valid_data = random_split(train_dataset, [train_n, len(train_dataset) - train_n])

for epoch in range(5):

    start_time = time.time()
    train_loss, train_acc = train(training_data)
    valid_loss, valid_acc = test(valid_data)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 51 minutes, 59 seconds
	Loss: 0.0176(train)	|	Acc: 90.3%(train)
	Loss: 0.0143(valid)	|	Acc: 92.1%(valid)
Epoch: 2  | time in 42 minutes, 0 seconds
	Loss: 0.0109(train)	|	Acc: 94.0%(train)
	Loss: 0.0129(valid)	|	Acc: 92.9%(valid)
Epoch: 3  | time in 40 minutes, 38 seconds
	Loss: 0.0074(train)	|	Acc: 96.0%(train)
	Loss: 0.0135(valid)	|	Acc: 92.9%(valid)
Epoch: 4  | time in 39 minutes, 4 seconds
	Loss: 0.0048(train)	|	Acc: 97.4%(train)
	Loss: 0.0156(valid)	|	Acc: 92.4%(valid)
Epoch: 5  | time in 39 minutes, 8 seconds
	Loss: 0.0029(train)	|	Acc: 98.4%(train)
	Loss: 0.0202(valid)	|	Acc: 91.7%(valid)
Checking the results of test dataset...
	Loss: 0.0214(test)	|	Acc: 91.5%(test)