In [1]:
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla P100-PCIE-16GB


In [3]:
import string
import regex as re
import numpy as np
import pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tokenizers import BertWordPieceTokenizer
from transformers import AutoTokenizer


def preprocess(text):
    # Remove integers
    text = re.sub(r'\d+', '', text)

    # remove newlines as \r and \n
    text = re.sub(r'\r', '', text)

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation marks
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)

    return text


def encode_text_and_labels(df, max_num_words, pre_or_post='post', subword=False):
    # create a tokenizer
    if subword == 'subword':
        t = BertWordPieceTokenizer(
            clean_text=True,
            handle_chinese_chars=False,
            strip_accents=False,
            lowercase=True
        )

        t.train_from_iterator(df['text'])
        vocab_size = t.get_vocab_size()
        # integer encode the documents
        encoded_list = t.encode_batch(df['text'])
        encoded_docs = [x.ids for x in encoded_list]
        # pad documents to be as long as the longest sequence in the dataset
        max_length = max([len(x) for x in encoded_docs])
        padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding=pre_or_post)
    elif subword == 'word':
        t = Tokenizer(num_words=max_num_words, oov_token='<unk>')
        t.fit_on_texts(df['text'])
        vocab_size = len(t.word_index) + 1
        # integer encode the documents
        encoded_docs = t.texts_to_sequences(df['text'])
        # pad documents to be as long as the longest sequence in the dataset
        max_length = df['text'].apply(lambda x: len(x.split(' '))).max()
        padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding=pre_or_post)
    else:
        t = AutoTokenizer.from_pretrained('bert-base-uncased')
        padded_docs = t(df['text'].tolist(), padding=True, truncation=True, return_tensors='np')




    # integer encode
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(df['artist'])
    # binary encode
    #onehot_encoded = to_categorical(integer_encoded)
    return padded_docs, integer_encoded, t, label_encoder


def load_and_preprocess_data(path, max_num_words=None, pre_or_post='post', subword=False):
    """
    Load the data and preprocess it
    :param path: path to the data
    :return: preprocessed data in the form of a pandas dataframe. The first item returned is the data,
    the second is the labels, the third is the vocabulary size, and the fourth is the maximum length of a sequence
    """
    df = pd.read_csv(path)

    df = df.groupby('artist').filter(lambda x: len(x) > 100)

    df['text'] = df['text'].apply(preprocess)

    # Identify the rows that contain duplicated text in the 'song' column
    no_covers = ~df['song'].duplicated()

    # Filter the DataFrame to include only the rows with unique text
    df = df[no_covers]

    # prepare text data for a recurrent network
    return encode_text_and_labels(df, max_num_words, pre_or_post, subword)

2023-01-24 11:43:07.131952: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [4]:
path = "/kaggle/input/spotify-million-song-dataset/spotify_millsongdata.csv"
padded_docs, onehot_encoded, token, label = load_and_preprocess_data(path)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [5]:
from sklearn.model_selection import train_test_split

input_ids = padded_docs['input_ids']
attention_masks = padded_docs['attention_mask']

NUM_LABELS = max(onehot_encoded) + 1

X_train, X_test, y_train, y_test, mask_train, mask_test = train_test_split(
    input_ids, onehot_encoded, attention_masks, test_size=0.2, 
    random_state=42, stratify=onehot_encoded)

# get validation set, which is 8% of entire data set
X_train, X_val, y_train, y_val, mask_train, mask_val = train_test_split(
    X_train, y_train, mask_train, stratify=y_train,
    test_size=0.1, random_state=42)

X_train = torch.tensor(X_train)
mask_train = torch.tensor(mask_train)
y_train = torch.tensor(y_train, dtype=torch.long)

X_val = torch.tensor(X_val)
mask_val = torch.tensor(mask_val)
y_val = torch.tensor(y_val, dtype=torch.long)

X_test = torch.tensor(X_test)
mask_test = torch.tensor(mask_test)
y_test = torch.tensor(y_test, dtype=torch.long)


In [6]:
print(NUM_LABELS)
print(y_train)

268
tensor([  2, 134, 206,  ..., 243, 108, 155])


In [7]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 32

# Create the DataLoader for our training set
train_data = TensorDataset(X_train, mask_train, y_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
val_data = TensorDataset(X_val, mask_val, y_val)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Create the DataLoader for our test set
test_data = TensorDataset(X_test, mask_test, y_test)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [8]:
print(len(train_dataloader))

703


In [9]:
%%time
import torch.nn as nn
from transformers import BertModel

# Create the BertClassfier class
class BertClassifier(nn.Module):
    """Bert Model for Classification Tasks.
    """
    def __init__(self, num_labels, freeze_bert=False):
        """
        @param    bert: a BertModel object
        @param    classifier: a torch.nn.Module classifier
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(BertClassifier, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, num_labels, num_labels
        
        # Instantiate BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Instantiate an one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            #nn.Dropout(0.5),
            nn.Linear(H, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        
        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

CPU times: user 4.44 ms, sys: 42 µs, total: 4.49 ms
Wall time: 5.69 ms


In [10]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(lr, eps, epochs=4, freeze_bert=False):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(NUM_LABELS, freeze_bert)
    
    #print(torch.cuda.memory_summary(device=None, abbreviated=False))
    # Tell PyTorch to run the model on GPU
    report_gpu()
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=lr,    # Default learning rate
                      eps=eps    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [11]:
import random
import time

# Specify loss function
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, optimizer, scheduler, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model.
    """
    # Start training loop
    print("Start training...\n")
    
    if evaluation:
        outputs = {'train_loss': [],'train_acc': [], 'val_loss': [], 'val_acc': [] }
    else:
        outputs = {'train_loss': [],'train_acc': []}
    
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Train accuracy':^14} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)
        
        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts, total_acc, batch_acc = 0, 0, 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            # Load batch to GPU
            report_gpu()
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()
            
            report_gpu()
            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()
            preds = torch.argmax(logits, dim=1).flatten()
                
            cur_acc = (preds == b_labels).cpu().numpy().mean() * 100
            batch_acc += cur_acc
            total_acc += cur_acc

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {total_acc / step:^14.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts, batch_acc = 0, 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)
        avg_train_acc = total_acc / len(train_dataloader)
        
        outputs['train_loss'].append(avg_train_loss)
        outputs['train_acc'].append(avg_train_acc)
        

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)
            outputs['val_loss'].append(val_loss)
            outputs['val_acc'].append(val_accuracy)

            
            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f}  | {avg_train_acc:^14.6f}| {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")
    
    print("Training complete!")
    torch.save(model, '/kaggle/working/BERT-model')
    return outputs


def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy


In [12]:
import gc
def report_gpu():
   #print(torch.cuda.list_gpu_processes())
   gc.collect()
   torch.cuda.empty_cache()

In [13]:
set_seed(42)    # Set seed for reproducibility
EPOCHS = 50
lr = 1e-4
eps = 1e-6
bert_classifier, optimizer, scheduler = initialize_model(lr, eps, epochs=EPOCHS, freeze_bert=True)
history = train(bert_classifier, optimizer, scheduler, train_dataloader, val_dataloader, 
                epochs=EPOCHS, evaluation=True)

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Start training...

 Epoch  |  Batch  |  Train Loss  | Train accuracy |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   5.594500   |    0.468750    |     -      |     -     |   20.02  
   1    |   40    |   5.589546   |    0.468750    |     -      |     -     |   18.99  
   1    |   60    |   5.607351   |    0.364583    |     -      |     -     |   19.02  
   1    |   80    |   5.582361   |    0.429688    |     -      |     -     |   18.86  
   1    |   100   |   5.589366   |    0.468750    |     -      |     -     |   18.93  
   1    |   120   |   5.580254   |    0.625000    |     -      |     -     |   18.91  
   1    |   140   |   5.583315   |    0.580357    |     -      |     -     |   18.86  
   1    |   160   |   5.590578   |    0.546875    |     -      |     -     |   18.84  


KeyboardInterrupt: 

In [None]:
print(EPOCHS)