In [1]:
import sys
import os
import torch
import csv
import argparse
from functools import partial
import itertools
import uuid

from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer('basic_english')
from torchtext.vocab import Vocab, build_vocab_from_iterator
from torchtext.vocab import GloVe
from torch.utils.data import DataLoader
from torch import nn
import torch.nn.functional as F
from torch.utils.data.dataset import random_split
from torch.nn.utils.rnn import pad_sequence
import time
import importlib
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# !pip install selenium
# !pip install webdriver-manager
# !pip install pickle5
# !pip install datasets transformers

In [12]:
# Set to True when running on Google Colab; enables the Drive mount, the
# working-directory change and CUDA detection below.
COLAB = False

# Stays False unless COLAB is True, so outside Colab main() never moves the
# model to the GPU even when one is available.
USE_CUDA = False
if COLAB:
    from google.colab import drive 
    drive.mount('/content/gdrive')
    # NOTE(review): PATH and DEVICE are only defined inside this branch; any
    # code that reads them raises NameError when COLAB is False.
    PATH = 'gdrive/MyDrive/nlp22/project/'
    # Make the project modules imported below resolvable from the Drive folder.
    sys.path.append('gdrive/MyDrive/nlp22/project')

    USE_CUDA = torch.cuda.is_available()

    if USE_CUDA:
        DEVICE = torch.device('cuda')
        print("Using cuda.")
    else:
        DEVICE = torch.device('cpu')
        print("Using cpu.")

    # Relative paths used later (e.g. the albums pickle) resolve against the
    # project folder on Drive from here on.
    os.chdir(os.path.join(os.getcwd(),'gdrive/MyDrive/nlp22/project'))

# Project-local modules; these must be imported after the sys.path / chdir
# changes above when running on Colab.
# NOTE(review): the star import hides which names come from album_loader.
from album_loader import *
import lyric_loader
import nlpmodel
# Pick up edits to nlpmodel without restarting the kernel.
importlib.reload(nlpmodel)

# VECTORS_CACHE_DIR = './.vector_cache'

# Presumably vocabulary indices for the special tokens (unknown, padding,
# line-break start/end, section-break start/end, part marker) -- TODO confirm
# against the vocabulary code; they are not referenced in the cells shown here.
UNK, PAD, LBS, LBE, SBS, SBE, PART = 0, 1, 2, 3, 4, 5, 6
FIRST_TOKENS = 5000
# Human-readable label built from FIRST_TOKENS.
STRATEGY = f'FIRST {FIRST_TOKENS} - Embeddings On'
EMBEDDING_DIMENSIONS = 300

# Which rating is used as the regression target: getData() treats 'c_rate'
# as the critic rating and anything else as the user rating.
RATE_TYPE = 'c_rate'

In [4]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 20 not upgraded.


In [3]:
from huggingface_hub import notebook_login
import transformers
print(transformers.__version__)

ModuleNotFoundError: No module named 'huggingface_hub'

In [14]:
def init_albums(path, file, standardize_parts, see_lbs, u_rate_min):
    """
    Build the set of albums used for regression from a stored albums file.

    kwargs:
    path -- directory that is joined with `file` to locate the albums data
    file -- file containing Albums info (albums_f.pickle)
    standardize_parts -- signals whether to standardize parts in lyrics
    see_lbs -- signals whether to see line breaks in lyrics
    u_rate_min -- forwarded unchanged to RegAlbums as `u_rate_min`

    Returns the result of RegAlbums.reg_full_album_text().
    """
    loader_kwargs = {
        'album_path': os.path.join(path, file),
        'standardize_parts': standardize_parts,
        'see_line_breaks': see_lbs,
        'u_rate_min': u_rate_min,
    }
    return lyric_loader.RegAlbums(**loader_kwargs).reg_full_album_text()

def getData(data_list, rating_type):
    """
    Separate album records into model inputs and integer targets.

    kwargs:
    data_list -- dataset containing regression albums; each record is
                 indexed positionally (1: critic rating, 2: user rating,
                 3: text)
    rating_type -- 'c_rate' selects the critic rating; any other value
                   selects the user rating multiplied by 10

    Returns a tuple (texts, ratings) of two lists in record order.
    """
    def _target(record):
        # Critic ratings are used as-is; user ratings are scaled by 10
        # before truncation to int.
        if rating_type == 'c_rate':
            return int(record[1])
        return int(10 * record[2])

    texts = [record[3] for record in data_list]
    ratings = list(map(_target, data_list))
    return texts, ratings
  
def split_datasets(reg_albums, rating_type):
    """
    Splits data into train, valid, and test sets

    The data is first split 80/20 into train+valid and test, then the
    train+valid portion is split 90/10 into train and valid.

    kwargs:
    reg_albums -- unprocessed regression albums to be split up
    rating_type -- critic ('c_rate') or user rating; forwarded to getData
                   to choose the target column

    Returns x_train, y_train, x_valid, y_valid, x_test, y_test.
    """
    # NOTE(review): random_split draws from torch's global RNG, so the split
    # changes between runs unless a seed is set before calling this.
    num_train_valid = int(len(reg_albums) * 0.8)
    num_test = len(reg_albums) - num_train_valid
    train_valid_data, test_data = random_split(reg_albums, [num_train_valid, num_test])

    num_train = int(num_train_valid * 0.90)
    num_valid = num_train_valid - num_train
    train_data, valid_data = random_split(train_valid_data, [num_train, num_valid])

    # Use the caller's rating_type; previously 'c_rate' was hard-coded here,
    # which silently ignored the argument.
    x_train, y_train = getData(train_data, rating_type)
    x_valid, y_valid = getData(valid_data, rating_type)
    x_test, y_test = getData(test_data, rating_type)

    return x_train, y_train, x_valid, y_valid, x_test, y_test

def create_encodings(tokenizer, text, **kwargs):
    """
    Tokenize text into encodings

    kwargs:
    tokenizer -- callable invoked as tokenizer(text, truncation=..., ...)
    text -- text (or list of texts) to encode
    max_length -- optional keyword; truncation length passed to the
                  tokenizer (None when omitted)
    add_special_tokens -- optional keyword; passed to the tokenizer,
                          defaults to True when omitted

    Returns whatever the tokenizer returns.
    """
    max_length = kwargs.get('max_length')
    # Previously this name was left unbound when the caller omitted it,
    # raising UnboundLocalError. True mirrors the HF tokenizer default.
    add_special_tokens = kwargs.get('add_special_tokens', True)

    encodings = tokenizer(text, truncation=True, padding=True,
                          max_length=max_length,
                          add_special_tokens=add_special_tokens)
    return encodings

class MakeTorchData(torch.utils.data.Dataset):
    """
    Dataset wrapper around tokenizer encodings, intended to be handed to
    the HF Trainer (which will conduct training).

    kwargs:
    encodings -- mapping of field name to a per-example sequence or tensor
    labels -- response variables, one per example
    """
    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice tensors directly; convert anything else to a tensor.
        item = {
            name: (values[idx] if torch.is_tensor(values) else torch.tensor(values[idx]))
            for name, values in self.encodings.items()
        }
        # The label goes through a one-element tensor and comes back as a
        # plain Python float.
        item["labels"] = float(torch.tensor([self.labels[idx]]))
        return item


In [15]:
""" Unused :("""

def chunker(item, chunksize):
    """
    Split one encoded example into fixed-size chunks.

    The example's 'input_ids' and 'attention_mask' are split into pieces of
    at most `chunksize - 2` tokens. Each piece is wrapped with token ids 101
    and 102 (attention 1 for both) and right-padded with zeros up to
    `chunksize`. Every chunk carries the example's 'labels' value.

    kwargs:
    item -- dict with 1-D tensors 'input_ids' and 'attention_mask' and a
            'labels' entry
    chunksize -- length of every emitted chunk, including the two added
                 tokens

    Returns a list of dicts with keys 'input_ids', 'attention_mask', 'labels'.
    """
    newObs = []
    input_id_chunks = item['input_ids'].split(chunksize - 2)
    mask_chunks = item['attention_mask'].split(chunksize - 2)
    for ids, mask in zip(input_id_chunks, mask_chunks):
        # Stop at the first chunk that is empty or ends in padding (id 0).
        # NOTE(review): this also discards real tokens at the start of a
        # partially padded chunk -- confirm that is intended.
        if ids.numel() == 0 or ids[-1].item() == 0:
            break
        ids = torch.cat([torch.tensor([101]), ids, torch.tensor([102])])
        # add attention tokens to attention mask
        mask = torch.cat([torch.tensor([1]), mask, torch.tensor([1])])
        # get required padding length
        pad_len = chunksize - ids.shape[0]
        if pad_len > 0:
            # Pad with zeros of the chunk's own dtype. torch.Tensor([0] * n)
            # is float32 and would turn the ids and mask into float tensors.
            ids = torch.cat([ids, torch.zeros(pad_len, dtype=ids.dtype)])
            mask = torch.cat([mask, torch.zeros(pad_len, dtype=mask.dtype)])
        newObs.append({
            'input_ids': ids,
            'attention_mask': mask,
            'labels': item['labels'],
        })
    return newObs

def collate_batch_into_chunks(features):
    """
    Collate a batch by expanding every example into 512-token chunks.

    Each feature is passed through chunker(item, 512); all resulting chunks
    are flattened into one list, in order, and handed to
    transformers.default_data_collator.
    """
    chunked = [piece for item in features for piece in chunker(item, 512)]
    return transformers.default_data_collator(chunked)

In [16]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def compute_metrics_for_regression(eval_pred):
    """
    Compute custom regression metrics for the HF Trainer.

    The Trainer calls this automatically and supplies the input.

    kwargs:
    eval_pred -- pair (logits, labels) for the evaluated dataset

    Returns a dict with:
    mse -- mean squared error between labels and logits
    var -- variance of the labels
    r2 -- coefficient of determination
    accuracy -- fraction of predictions whose squared error is below 0.25
    """
    print("I am computing the metrics for regression...")
    print(f"Here is the type of eval_pred input to this function {type(eval_pred)}")
    print(f"Now, here is the actual value of eval_pred {eval_pred}")
    logits, labels = eval_pred
    print(f"Here is the length of logits: {len(logits)}")

    # Labels become a column so they line up with the logits element-wise.
    labels = labels.reshape(-1, 1)

    print("Logits:", logits[:5])
    print("Labels:", labels[:5])

    metrics = {
        "mse": mean_squared_error(labels, logits),
        "var": np.var(labels),
        "r2": r2_score(labels, logits),
    }

    squared_errors = ((logits - labels).flatten() ** 2).tolist()
    close_enough = 0
    for err in squared_errors:
        if err < 0.25:
            close_enough += 1
    metrics["accuracy"] = close_enough / len(squared_errors)

    return metrics


In [17]:
def main(methodology, 
          methodologies, 
          tokenizer,
          save_label,
          max_length,
          evaluation_strategy = 'epoch',
          save_strategy = 'epoch',
          save_total_limit = 1,
          learning_rate = 5e-5,
          per_device_train_batch_size = 16,
          per_device_eval_batch_size = 16,
          num_train_epochs = 20,
          weight_decay = 0,
          load_best_model_at_end = True,
          metric_for_best_model = 'r2',
          compute_metrics_for_regression = compute_metrics_for_regression,
          collate_batch_into_chunks = collate_batch_into_chunks
         ):
    
    """
    Runs Transformer from end-to-end, from initializing
        Albums to supply to regression to training and 
        evaluation

    kwargs:
    methodology -- key into `methodologies` selecting the run configuration
    methodologies -- dict mapping a key to a 7-tuple:
        (name, model checkpoint, create-encodings function,
         make-dataset function, standardize parts, see line breaks, chunk)
    tokenizer -- accepted for interface compatibility; it is replaced by
        the AutoTokenizer loaded for the selected model checkpoint
    save_label -- output directory name (relative to the working directory)
    max_length -- tokenizer truncation length; forced to 512 when
        `methodology` is 0 and to 40000 when chunking is enabled
    evaluation_strategy, save_strategy, save_total_limit, learning_rate,
    per_device_train_batch_size, per_device_eval_batch_size,
    num_train_epochs, weight_decay, load_best_model_at_end,
    metric_for_best_model -- forwarded to TrainingArguments
        (the eval batch size is forced to 1 when chunking is enabled)
    compute_metrics_for_regression -- metrics callback given to the Trainer
    collate_batch_into_chunks -- data collator used when chunking is
        enabled; replaced by None otherwise

    Returns (a, b): the Trainer.evaluate() results on the validation set
    and on the test set.
    """
    spec = methodologies[methodology]
    methodology_name = spec[0]
    model_name = spec[1]
    # The tokenizer always comes from the selected checkpoint; the lyric
    # markup tokens are added so they are not split into word pieces.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.add_tokens(['<lb>', '</lb>', '<sb>', '</sb>', '[part]'])

    create_encodings_fx = spec[2]
    make_datasets_fx = spec[3]
    standardize_parts = spec[4]
    see_lbs = spec[5]
    chunk = spec[6]

    if methodology == 0:
        max_length = 512
    if chunk:
        print("Creating chunks so...")
        max_length = 40000
        per_device_eval_batch_size = 1
    else:
        # Without chunking the Trainer falls back to its default collator.
        collate_batch_into_chunks = None
    
    print(f"Max length is now {max_length}")
    print(f"Collate function is now {collate_batch_into_chunks}")
    
    print(f"Running following model: {methodology_name}")
    print(f"Methodology: {methodology_name}")
    print(f"Create encodings function: {create_encodings_fx}")
    print(f"Standardize parts: {standardize_parts}")
    print(f"See line breaks: {see_lbs}")
    print(f"Chunking: {chunk}")

    reg_albums = init_albums(path = '', 
                              file = 'albums_f.pickle', 
                              standardize_parts = standardize_parts, 
                              see_lbs = see_lbs, 
                              u_rate_min = 10)
    
    print(f"Working with {len(reg_albums)} albums in total for reg_albums...")

    x_train, y_train, x_valid, y_valid, x_test, y_test = split_datasets(reg_albums, 'c_rate')

    print("Creating train encodings...")
    train_encodings = create_encodings_fx(tokenizer, x_train, max_length = max_length, add_special_tokens = False)
    print("Creating valid encodings...")
    valid_encodings = create_encodings_fx(tokenizer, x_valid, max_length = max_length, add_special_tokens = False)
    print("Creating test encodings...")
    test_encodings = create_encodings_fx(tokenizer, x_test, max_length = max_length, add_special_tokens = False)
    
    train_dataset = make_datasets_fx(train_encodings, y_train)
    valid_dataset = make_datasets_fx(valid_encodings, y_valid)
    test_dataset =  make_datasets_fx(test_encodings, y_test)
    
    print(train_dataset[0])
    print(len(train_dataset[0]['input_ids']))
    print(f"After creating the datasets, the length of the training set is: {len(train_dataset)}")
    print(f"The length of the validation set is: {len(valid_dataset)}")
    print(f"The length of the test set is: {len(test_dataset)}")
        
    print("Finalized dataset creation, moving on to model...")
    # num_labels = 1 requests a single output (regression);
    # ignore_mismatched_sizes lets a checkpoint with a different head load.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 1, ignore_mismatched_sizes = True) # np.log(1000)
    # Grow the embedding matrix to cover the tokens added above.
    model.resize_token_embeddings(len(tokenizer))
    
    if USE_CUDA:
        model = model.cuda()

    args = TrainingArguments(
        f"./{save_label}",
        evaluation_strategy = evaluation_strategy,
        save_strategy = save_strategy,
        save_total_limit = save_total_limit,
        learning_rate = learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        load_best_model_at_end=load_best_model_at_end,
        metric_for_best_model=metric_for_best_model
    )
    
    print(collate_batch_into_chunks)
    print("Instantiating the Trainer...")
    # # Call the Trainer
    trainer = Trainer(
        model=model,                         # the instantiated Transformers model to be trained
        args=args,                  # training arguments, defined above
        train_dataset=train_dataset,         # training dataset
        eval_dataset=valid_dataset,          # evaluation dataset
        data_collator = collate_batch_into_chunks,
        compute_metrics=compute_metrics_for_regression    # the callback that computes metrics of interest
    )

    print("Training the model...")
    # # # Train the model
    trainer.train()

    print("Evaluating the model using evaluation dataset...")
    # # Call the summary
    a = trainer.evaluate()
    print("Returned from evaluate on evaluation set...")
    print(type(a))
    print(a)

    print("Evaluating the model using test dataset...")
    b = trainer.evaluate(test_dataset)
    print("Returned from evaluate on test set...")
    print(type(b))
    # Print the test metrics (this used to print the builtin `bin`).
    print(b)
    return a, b

In [18]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

ModuleNotFoundError: No module named 'transformers'

In [19]:
"""
Run this cell to run main() function and fine-tune a BERT-based regressor
"""

                      #   Name           /Create encodings /Make dataset/Std Pts/Line breaks/Chunk
methodologies = {0 : ('BERT First 512',"distilbert-base-uncased-finetuned-sst-2-english",
                      create_encodings, MakeTorchData, True, False, False),
               #   1 : ('BERT Longformer 4096', "allenai/longformer-base-4096",
               #        create_encodings, MakeTorchData, True, False, False)
                    }

train_batch_size = 16
test_batch_size = 16
save_label = "BERT First 512"
max_length = 512

# Training Arguments
evaluation_strategy = 'epoch'
save_strategy = 'epoch'
save_total_limit = 1
learning_rate = 5e-5
per_device_train_batch_size = train_batch_size
per_device_eval_batch_size = test_batch_size
num_train_epochs = 10
weight_decay = 0
load_best_model_at_end = True
metric_for_best_model = 'r2'

main_args= {'methodologies' : methodologies,
            'tokenizer' : tokenizer,
            'save_label': save_label,
            'max_length' : max_length,
            'evaluation_strategy': evaluation_strategy,
            'save_strategy' : save_strategy,
            'save_total_limit' : save_total_limit,
            'learning_rate' : learning_rate,
            'per_device_train_batch_size' : per_device_train_batch_size,
            'per_device_eval_batch_size' : per_device_eval_batch_size,
            'num_train_epochs' : num_train_epochs,
            'weight_decay': weight_decay,
            'load_best_model_at_end' : load_best_model_at_end,
            'metric_for_best_model' : metric_for_best_model}

a, b = main(0, **main_args)

"""
Scroll down to cell below all this output to view 
performance metrics on validation and test datasets
(a and b, respectively)
"""    

NameError: name 'AutoTokenizer' is not defined

In [12]:

# a: metrics from trainer.evaluate() on the validation set (returned by main()).
print(a)
# b: metrics from trainer.evaluate(test_dataset) on the test set.
print(b)

{'eval_loss': 73.21768951416016, 'eval_mse': 73.21768951416016, 'eval_var': 84.99159240722656, 'eval_r2': 0.13853022414011906, 'eval_accuracy': 0.06986899563318777, 'eval_runtime': 3.7872, 'eval_samples_per_second': 60.466, 'eval_steps_per_second': 3.961, 'epoch': 10.0}
{'eval_loss': 76.2323989868164, 'eval_mse': 76.23239135742188, 'eval_var': 84.88729095458984, 'eval_r2': 0.1019574448215369, 'eval_accuracy': 0.07517482517482517, 'eval_runtime': 9.4004, 'eval_samples_per_second': 60.849, 'eval_steps_per_second': 3.83, 'epoch': 10.0}
