## Import Modules

In [0]:
import time
import pandas as pd

from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.data import BucketIterator, Iterator
import random

import torch
import torch.nn as nn
import torch.optim as optim

import nltk
from nltk.corpus import stopwords

In [62]:
# Fetch the NLTK 'stopwords' corpus so that stopwords.words('english') can be read later.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
SEED = 479

# Seed both Python's and PyTorch's random number generators, and ask cuDNN
# to use deterministic algorithms, so repeated runs are comparable.
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Use the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# !CUDA_LAUNCH_BLOCKING=0

In [0]:
# English stop words as a set, for fast membership checks when filtering tokens.
STOPWORDS = set(stopwords.words('english'))

## Preparing Data

In [65]:
# Load the tagged-lyrics CSV into a DataFrame for a quick look.
# The lyrics column is forced to str so it is never parsed as a number.
path = '4_19_lyrics_with_tags.csv'
ly = pd.read_csv(path, sep=",", dtype={'lyrics': str})
ly.head(3)

Unnamed: 0,artist,title,language,lyrics,track_id,tags,category
0,Lena Philipsson,006,1.0,I had come in the name of love\nWith a mission...,TRMMMKI128F931D80D,pop,2
1,Shawn Colvin,The Heart Of Saturday,1.0,Well you gassed her up\nBehind the wheel\nWith...,TRQFODA128F93319C3,pop,2
2,Dying Fetus,Ethos of Coercion,1.0,"Castigation of the offenders, no punishment ou...",TRLXQQL128F4291A8F,rock,1


In [0]:
# Field declarations: how each used CSV column is turned into tensors.

def tokenize(x):
    """Split a lyric string into tokens on runs of whitespace."""
    return x.split()


# Lyrics: whitespace-tokenized, lower-cased, English stop words removed.
TEXT = Field(
    sequential=True,
    tokenize=tokenize,
    lower=True,
    stop_words=STOPWORDS,
)

# Genre tag: a single non-sequential label with its own vocabulary.
LABEL = Field(sequential=False, use_vocab=True, is_target=True)

In [0]:
# Build the full dataset from the CSV.
# Each entry pairs a column name with the Field that processes it;
# a Field of None means the column is read but discarded.
trn_vld_tst_fields = [
    (column, {"lyrics": TEXT, "tags": LABEL}.get(column))
    for column in ("artist", "title", "language", "lyrics", "track_id", "tags")
]

trn_vld_tst_dataset = TabularDataset(
    path="4_19_lyrics_with_tags.csv",
    format='csv',
    skip_header=True,
    fields=trn_vld_tst_fields,
)

In [68]:
# Split the dataset into train / validation / test.
# torchtext reads split_ratio as [train, test, valid] and returns the
# datasets in (train, valid, test) order, so this is
# 60% train, 30% test, 10% validation.
#
# random.seed() returns None, so passing its result as random_state only
# worked through the seeding side effect. Seed explicitly and hand the
# resulting generator state to split() instead.
random.seed(SEED)

trn, vld, tst = trn_vld_tst_dataset.split(
    split_ratio = [0.6, 0.3, 0.1],
    random_state = random.getstate())

print('Number of training examples:', len(trn), sep = ' ')
print('Number of validation examples:', len(vld), sep = ' ')
print('Number of testing examples:', len(tst), sep = ' ')

Number of training examples: 46731
Number of validation examples: 7788
Number of testing examples: 23366


In [69]:
# setup dictionary on train dataset
# Vocabularies are built from the training split only, so validation and
# test data do not influence which tokens get an index.
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(trn, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(trn)

# NOTE(review): the reported TEXT size can exceed MAX_VOCAB_SIZE by a couple of
# entries; presumably these are torchtext's special tokens -- confirm.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 7


In [0]:
# Batch iterators for the three splits.
BATCH_SIZE = 64

# Train / validation: bucketed batches, using the lyric token count as sort key.
trn_iter, vld_iter = BucketIterator.splits(
    (trn, vld),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.lyrics),
    device=device,
)

# Test: a plain single-pass iterator with no sorting at all.
tst_iter = Iterator(
    tst,
    batch_size=BATCH_SIZE,
    device=device,
    repeat=False,
    sort=False,
    sort_within_batch=False,
)

## Build the Model

In [0]:
class RNN(nn.Module):
    """Embedding -> single-layer vanilla RNN -> linear classifier.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the RNN hidden state.
        output_dim: number of values produced per example.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        # Sub-modules are created in this order so seeded initialisation
        # stays the same from run to run.
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Map token indices of shape [sent len, batch size] to [batch size, output_dim]."""
        # token_vectors: [sent len, batch size, emb dim]
        token_vectors = self.embedding(text)

        # all_steps:  [sent len, batch size, hid dim]
        # last_state: [1, batch size, hid dim]
        all_steps, last_state = self.rnn(token_vectors)
        final_hidden = last_state.squeeze(0)

        # Sanity check: the final hidden state is the output of the last time step.
        assert torch.equal(all_steps[-1], final_hidden)

        return self.fc(final_hidden)

* input_dim = the dimension of the one-hot vectors = vocabulary size

* embedding_dim = size of the dense word vectors = 50~250, depending on input_dim

* hidden_dim = size of the hidden states = 100~500, depending on input_dim, embedding_dim, complexity of the task

* output_dim = number of classes

In [0]:
# Model hyperparameters.
# Both vocabulary-dependent sizes are derived from the built vocabularies
# rather than hard-coded, so they stay correct if the data or tag set changes.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 120
HIDDEN_DIM = 256
OUTPUT_DIM = len(LABEL.vocab)  # one output per entry in the label vocabulary

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [73]:
def count_trnable_para(model):
    """Return the total number of scalar parameters in `model` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the parameter count with thousands separators.
print(f'The model has {count_trnable_para(model):,} trainable parameters')

The model has 3,098,807 trainable parameters


## Train the Model

In [0]:
# Move the model to the target device before building the optimizer:
# PyTorch recommends constructing optimizers only after the parameters
# are on their final device.
model = model.to(device)

# define optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# define loss function (expects raw logits and integer class targets)
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

In [0]:
def multiclass_accuracy(model_output, y):
    """Fraction of rows whose highest-scoring class equals the target.

    Args:
        model_output: tensor of per-class scores, one row per example
            (classes along dim 1).
        y: tensor of target class indices, one per example.

    Returns:
        A scalar float tensor: number of correct predictions / number of examples.
    """
    # Take the argmax on the raw scores. Passing them through sigmoid first
    # does not change the ordering in exact arithmetic, but in float32 large
    # scores all saturate to exactly 1.0 (and very negative ones to 0.0),
    # which creates ties and makes the wrong index win.
    model_preds = model_output.argmax(dim=1)
    correct = (model_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

In [0]:
# define train function

def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    Each batch must expose `lyrics` (model input) and `tags` (targets).

    Returns:
        (mean loss, mean accuracy), each the average of the per-batch
        values over len(iterator).
    """
    # Training mode: layers such as dropout and batch norm use their
    # training-time behaviour.
    model.train()

    loss_total = 0
    acc_total = 0

    for batch in iterator:
        # Gradients accumulate across backward() calls, so reset them first.
        optimizer.zero_grad()

        logits = model(batch.lyrics).squeeze(1)
        batch_loss = criterion(logits, batch.tags)
        batch_acc = multiclass_accuracy(logits, batch.tags)

        # Back-propagate and apply one parameter update.
        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [0]:
# define evaluate function
# similar to train, but don't want update parameters

def evaluate(model, iterator, criterion):
    """Score `model` on `iterator` without updating any parameters.

    Each batch must expose `lyrics` (model input) and `tags` (targets).

    Returns:
        (mean loss, mean accuracy), each the average of the per-batch
        values over len(iterator).
    """
    # Evaluation mode: layers such as dropout and batch norm use their
    # inference-time behaviour.
    model.eval()

    loss_total = 0
    acc_total = 0

    # No gradients are needed for scoring, so skip building the autograd graph.
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.lyrics).squeeze(1)

            loss_total += criterion(logits, batch.tags).item()
            acc_total += multiclass_accuracy(logits, batch.tags).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [0]:
# tell us how long an epoch takes
# to compare training times between models.

def epoch_time(start_time, end_time):
    """Convert a (start, end) pair of timestamps in seconds to whole (minutes, seconds)."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [0]:
# Number of full passes over the training iterator.
N_EPOCHS = 50

# Lowest validation loss seen so far; starting at +inf means any finite
# first-epoch loss counts as an improvement.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    # One training pass, then score the updated model on the validation split.
    trn_loss, trn_acc = train(model, trn_iter, optimizer, criterion)
    vld_loss, vld_acc = evaluate(model, vld_iter, criterion)
    
    end_time = time.time()

    # The measured time covers both the training and the validation pass.
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint only when validation loss improves, so the saved file always
    # holds the weights of the best epoch so far.
    if vld_loss < best_valid_loss:
        best_valid_loss = vld_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {trn_loss:.3f} | Train Acc: {trn_acc*100:.2f}%')
    print(f'\t Val. Loss: {vld_loss:.3f} |  Val. Acc: {vld_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 27s
	Train Loss: 0.936 | Train Acc: 74.31%
	 Val. Loss: 0.972 |  Val. Acc: 73.74%
Epoch: 02 | Epoch Time: 0m 27s
	Train Loss: 0.929 | Train Acc: 74.41%
	 Val. Loss: 0.995 |  Val. Acc: 73.14%
Epoch: 03 | Epoch Time: 0m 27s
	Train Loss: 0.928 | Train Acc: 74.44%
	 Val. Loss: 0.944 |  Val. Acc: 74.30%
Epoch: 04 | Epoch Time: 0m 27s
	Train Loss: 0.929 | Train Acc: 74.44%
	 Val. Loss: 0.989 |  Val. Acc: 73.11%
Epoch: 05 | Epoch Time: 0m 27s
	Train Loss: 0.942 | Train Acc: 74.07%
	 Val. Loss: 1.471 |  Val. Acc: 49.86%
Epoch: 06 | Epoch Time: 0m 27s
	Train Loss: 0.943 | Train Acc: 73.94%
	 Val. Loss: 1.397 |  Val. Acc: 52.36%
Epoch: 07 | Epoch Time: 0m 27s
	Train Loss: 0.938 | Train Acc: 74.19%
	 Val. Loss: 1.236 |  Val. Acc: 67.11%
Epoch: 08 | Epoch Time: 0m 27s
	Train Loss: 0.935 | Train Acc: 74.39%
	 Val. Loss: 1.121 |  Val. Acc: 72.73%
Epoch: 09 | Epoch Time: 0m 27s
	Train Loss: 0.932 | Train Acc: 74.40%
	 Val. Loss: 1.035 |  Val. Acc: 73.53%
Epoch: 10 | Epoch T

In [0]:
# Restore the checkpoint with the lowest validation loss before scoring on
# the held-out test split. map_location remaps the saved tensors onto the
# current device, so a checkpoint written on a GPU still loads on a
# CPU-only runtime.
model.load_state_dict(torch.load('tut1-model.pt', map_location=device))

tst_loss, tst_acc = evaluate(model, tst_iter, criterion)

print(f'Test Loss: {tst_loss:.3f} | Test Acc: {tst_acc*100:.2f}%')