In [1]:
import sys
import os
import torch
import csv
import argparse
from functools import partial
import itertools
import uuid

from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer('basic_english')
from torchtext.vocab import Vocab, build_vocab_from_iterator
from torchtext.vocab import GloVe
from torch.utils.data import DataLoader
from torch import nn
import torch.nn.functional as F
from torch.utils.data.dataset import random_split
from torch.nn.utils.rnn import pad_sequence
import time
import importlib
import numpy as np

In [2]:
# !pip install selenium
# !pip install webdriver-manager
# !pip install pickle5
# !pip install datasets transformers

In [3]:
# Set to False when running outside Google Colab (skips the Drive mount and chdir).
COLAB = True

# Absolute location of the project folder on the mounted Drive. An absolute path
# keeps this cell re-runnable: a cwd-relative chdir fails on the second run
# because the working directory has already moved into the project folder.
COLAB_PROJECT_DIR = '/content/gdrive/MyDrive/nlp22/project'

# Defaults so that USE_CUDA, PATH and DEVICE exist even when COLAB is False.
USE_CUDA = False
PATH = ''
if COLAB:
    from google.colab import drive
    drive.mount('/content/gdrive')
    PATH = 'gdrive/MyDrive/nlp22/project/'
    # Make the project-local modules importable; guarded so re-running the cell
    # does not keep appending the same entry.
    if COLAB_PROJECT_DIR not in sys.path:
        sys.path.append(COLAB_PROJECT_DIR)

    USE_CUDA = torch.cuda.is_available()

    os.chdir(COLAB_PROJECT_DIR)

if USE_CUDA:
    DEVICE = torch.device('cuda')
    print("Using cuda.")
else:
    DEVICE = torch.device('cpu')
    print("Using cpu.")

# NOTE(review): the star import hides which names this notebook takes from
# album_loader; prefer explicit imports once the required names are known.
from album_loader import *
import lyric_loader
import nlpmodel
# Pick up edits to nlpmodel.py without restarting the kernel.
importlib.reload(nlpmodel)

# VECTORS_CACHE_DIR = './.vector_cache'

# Integer ids for the special tokens (unknown, padding, line/section break markers, part).
UNK, PAD, LBS, LBE, SBS, SBE, PART = 0, 1, 2, 3, 4, 5, 6
FIRST_TOKENS = 5000
STRATEGY = f'FIRST {FIRST_TOKENS} - Embeddings On'
EMBEDDING_DIMENSIONS = 300

# Which rating column to model; getData treats 'c_rate' specially.
RATE_TYPE = 'c_rate'

Mounted at /content/gdrive
Using cuda.


In [4]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 20 not upgraded.


In [5]:
from huggingface_hub import notebook_login
import transformers
print(transformers.__version__)

4.21.1


In [6]:
def init_albums(path, file, standardize_parts, see_lbs, u_rate_min = None):
    """Load the pickled album collection and return its regularised full-album texts.

    Args:
        path: Directory containing the album file ('' for the current directory).
        file: File name of the album pickle.
        standardize_parts: Forwarded to ``lyric_loader.RegAlbums`` as ``standardize_parts``.
        see_lbs: Forwarded to ``lyric_loader.RegAlbums`` as ``see_line_breaks``.
        u_rate_min: Optional value forwarded to ``lyric_loader.RegAlbums`` as
            ``u_rate_min``. ``create_train_test`` passes this keyword, which the
            previous signature rejected with a TypeError.
            NOTE(review): assumes RegAlbums accepts a ``u_rate_min`` keyword -
            confirm against lyric_loader.

    Returns:
        Whatever ``RegAlbums.reg_full_album_text()`` returns.
    """
    albums_data = os.path.join(path, file)
    # Only forward u_rate_min when the caller supplied it, so existing calls
    # reach RegAlbums with exactly the same arguments as before.
    optional_kwargs = {} if u_rate_min is None else {'u_rate_min': u_rate_min}
    albums_pre = lyric_loader.RegAlbums(album_path = albums_data,
                                        standardize_parts = standardize_parts,
                                        see_line_breaks = see_lbs,
                                        **optional_kwargs)
    return albums_pre.reg_full_album_text()

def getData(data_list, rating_type):
    """Separate album records into model inputs and integer targets.

    Each record is indexed positionally: position 3 is used as the input,
    position 1 as the rating when ``rating_type == 'c_rate'``, and position 2
    (multiplied by 10) for any other rating type.

    Returns:
        A ``(x, y)`` pair of lists with one entry per record.
    """
    def _c_rating(record):
        return int(record[1])

    def _scaled_rating(record):
        # Scaled by 10 before truncating to an integer.
        return int(10 * record[2])

    to_target = _c_rating if rating_type == 'c_rate' else _scaled_rating

    inputs = list(map(lambda record: record[3], data_list))
    targets = list(map(to_target, data_list))
    return inputs, targets
  
def split_datasets(reg_albums, rating_type):
    """Randomly split albums into train / validation / test inputs and targets.

    20% of the albums are held out as the test set; the remaining 80% are split
    90/10 into training and validation sets. The split uses ``random_split``
    without an explicit generator, so it differs between runs unless the global
    torch seed is fixed.

    Args:
        reg_albums: Sized, indexable collection of album records.
        rating_type: Rating selector handed to ``getData``. Previously this
            argument was ignored and 'c_rate' was always used.

    Returns:
        ``(x_train, y_train, x_valid, y_valid, x_test, y_test)``.
    """
    num_train_valid = int(len(reg_albums) * 0.8)
    num_test = len(reg_albums) - num_train_valid
    train_valid_data, test_data = random_split(reg_albums, [num_train_valid, num_test])

    num_train = int(num_train_valid * 0.90)
    num_valid = num_train_valid - num_train
    train_data, valid_data = random_split(train_valid_data, [num_train, num_valid])

    x_train, y_train = getData(train_data, rating_type)
    x_valid, y_valid = getData(valid_data, rating_type)
    x_test, y_test = getData(test_data, rating_type)

    return x_train, y_train, x_valid, y_valid, x_test, y_test

def create_encodings(tokenizer, text, **kwargs):
    """Tokenize ``text`` with truncation and padding enabled.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding=True, max_length=..., add_special_tokens=...)``.
        text: The text (or list of texts) handed to the tokenizer.
        **kwargs: Optional ``max_length`` (default ``None``) and
            ``add_special_tokens`` (default ``True``). Previously, omitting
            ``add_special_tokens`` raised UnboundLocalError.

    Returns:
        Whatever the tokenizer call returns.
    """
    max_length = kwargs.get('max_length')
    add_special_tokens = kwargs.get('add_special_tokens', True)

    return tokenizer(text, truncation=True, padding=True,
                     max_length=max_length, add_special_tokens=add_special_tokens)

class MakeTorchData(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with one float label per example."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> per-example values (tensor or indexable)
        # labels: indexable collection with one label per example
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        def select(column):
            # Tensor columns are indexed directly; anything else is wrapped in a tensor.
            if torch.is_tensor(column):
                return column[idx]
            return torch.tensor(column[idx])

        item = {name: select(column) for name, column in self.encodings.items()}
        # The label goes through a one-element tensor and comes back as a Python float.
        item["labels"] = float(torch.tensor([self.labels[idx]]))
        return item


In [7]:
def chunker(item, chunksize, cls_id = 101, sep_id = 102, pad_id = 0):
    """Split one encoded example into fixed-size chunks that share its label.

    Trailing padding is removed first, then the remaining tokens are cut into
    pieces of ``chunksize - 2`` tokens. Each piece is wrapped as
    ``[cls_id] + piece + [sep_id]`` and right-padded with ``pad_id`` up to
    ``chunksize``.

    Fixes over the previous version:
      * Padding is created with the dtype of the ids/mask. ``torch.Tensor([0] * n)``
        is float32, which turned ``input_ids`` into a float tensor.
      * A final chunk that is only partly filled with real tokens is kept. The
        old check (``last token == 0 -> break``) discarded it, and discarded
        short examples entirely.

    Args:
        item: Mapping with 1-D tensors ``'input_ids'`` and ``'attention_mask'``
            and a ``'labels'`` entry.
            NOTE(review): assumes right-padding, i.e. the attention mask is a
            run of ones followed by zeros - confirm for the tokenizer in use.
        chunksize: Length of every produced chunk, including the two special tokens.
        cls_id, sep_id, pad_id: Token ids for the start, end and padding tokens.
            The defaults are the ids the previous version hard-coded.

    Returns:
        A list of dicts with keys ``'input_ids'``, ``'attention_mask'`` and
        ``'labels'``; empty when the example contains no unmasked tokens.

    Raises:
        ValueError: If ``chunksize`` leaves no room for at least one real token.
    """
    body_len = chunksize - 2
    if body_len < 1:
        raise ValueError(f"chunksize must be at least 3, got {chunksize}")

    input_ids = item['input_ids']
    attention_mask = item['attention_mask']

    # Number of real (unmasked) tokens; everything after that is padding.
    real_len = int(attention_mask.sum().item())
    if real_len == 0:
        return []

    newObs = []
    for body in input_ids[:real_len].split(body_len):
        ids = torch.cat([body.new_tensor([cls_id]), body, body.new_tensor([sep_id])])
        # Every position so far (special tokens included) should be attended to.
        mask = attention_mask.new_ones(ids.shape[0])

        pad_len = chunksize - ids.shape[0]
        if pad_len > 0:
            ids = torch.cat([ids, ids.new_full((pad_len,), pad_id)])
            mask = torch.cat([mask, mask.new_zeros(pad_len)])

        newObs.append({'input_ids': ids,
                       'attention_mask': mask,
                       'labels': item['labels']})
    return newObs

def collate_batch_into_chunks(features):
    """Collate a batch after expanding every example into 512-token chunks.

    Each feature is passed through ``chunker(item, 512)``; all resulting chunks
    are flattened into one list and handed to
    ``transformers.default_data_collator``.
    """
    chunked_features = [
        piece
        for item in features
        for piece in chunker(item, 512)
    ]
    return transformers.default_data_collator(chunked_features)

In [8]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def compute_metrics_for_regression(eval_pred):
    """Compute regression metrics for a Trainer evaluation pass.

    Args:
        eval_pred: Unpackable as ``(predictions, labels)``. ``predictions`` may
            be an array or a tuple whose first entry is the array of
            predictions.

    Returns:
        Dict with ``"mse"``, ``"var"`` (variance of the labels), ``"r2"`` and
        ``"accuracy"`` - the fraction of predictions whose squared error is
        below 0.25, i.e. whose absolute error is below 0.5.

    Raises:
        ValueError: If predictions and labels do not contain the same number of
            values.
    """
    print("I am computing the metrics for regression...")
    print(f"Here is the type of eval_pred input to this function {type(eval_pred)}")
    print(f"Now, here is the actual value of eval_pred {eval_pred}")
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Keep only the first entry when the predictions arrive as a tuple.
        logits = logits[0]
    print(f"Here is the length of logits: {len(logits)}")

    labels = np.asarray(labels).reshape(-1, 1)
    # Force predictions into the same column shape as the labels. Without this,
    # (N,) predictions minus (N, 1) labels silently broadcast to an N x N matrix.
    logits = np.asarray(logits).reshape(labels.shape)

    print("Logits:", logits[0:5])
    print("Labels:", labels[0:5])

    mse = mean_squared_error(labels, logits)
    var = np.var(labels)
    r2 = r2_score(labels, logits)

    single_squared_errors = ((logits - labels).flatten()**2).tolist()
    accuracy = sum(1 for e in single_squared_errors if e < 0.25) / len(single_squared_errors)

    return {"mse": mse, "var": var, "r2": r2, "accuracy" : accuracy}


In [None]:
def create_train_test(standardize_parts, see_lbs):
    """Build a train+validation / test split of the albums and cache it on disk.

    A quarter of the albums (integer division) is held out as the test set via
    ``random_split``; the rest forms the train+validation pool. The pair is
    pickled to ``train_val_test_{sp}_{sl}.pickle`` in the working directory,
    where ``sp`` / ``sl`` are 1 or 0 for the two flags.

    Args:
        standardize_parts: Forwarded to ``init_albums``.
        see_lbs: Forwarded to ``init_albums``.

    Returns:
        ``(train_val, test)`` as produced by ``random_split``.
    """
    # Local import: none of the notebook's visible import cells import pickle.
    import pickle

    print("Create train test was called...")
    sp = 1 if standardize_parts else 0
    sl = 1 if see_lbs else 0
    u_rate_min = 10
    # NOTE(review): u_rate_min is passed to init_albums as a keyword; verify
    # that init_albums accepts it.
    reg_albums = init_albums(path = '', file = 'albums_f.pickle',
                standardize_parts = standardize_parts, see_lbs = see_lbs, u_rate_min = u_rate_min)

    fourth = len(reg_albums) // 4
    train_val, test = random_split(reg_albums, [len(reg_albums) - fourth, fourth])

    comb = (train_val, test)
    with open(f'train_val_test_{sp}_{sl}.pickle', 'wb') as handle:
        pickle.dump(comb, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return comb

def load_train_test(standardize_parts, see_lbs):
    """Load the cached train+validation / test split, creating it when missing.

    The cache file is ``train_val_test_{sp}_{sl}.pickle`` in the working
    directory, where ``sp`` / ``sl`` are 1 or 0 for the two flags. If the file
    does not exist or cannot be unpickled, ``create_train_test`` is called to
    rebuild it.

    Only cache-related failures trigger a rebuild. The previous bare ``except:``
    also swallowed unrelated errors (NameError, KeyboardInterrupt, ...).

    Returns:
        ``(train, test)`` as stored in the cache or freshly created.
    """
    # Local import: none of the notebook's visible import cells import pickle.
    import pickle

    sp = 1 if standardize_parts else 0
    sl = 1 if see_lbs else 0
    try:
        with open(f'train_val_test_{sp}_{sl}.pickle', 'rb') as handle:
            # SECURITY: pickle.load can execute arbitrary code; only load cache
            # files produced by create_train_test.
            comb = pickle.load(handle)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        print(f"Creating train/val/test sets for standardize_parts: {standardize_parts}, see_lbs: {see_lbs}")
        comb = create_train_test(standardize_parts, see_lbs)
    train, test = comb
    return train, test

In [9]:
def main(methodology,
          methodologies,
          tokenizer,
          save_label,
          max_length,
          evaluation_strategy = 'epoch',
          save_strategy = 'epoch',
          save_total_limit = 1,
          learning_rate = 5e-5,
          per_device_train_batch_size = 16,
          per_device_eval_batch_size = 16,
          num_train_epochs = 20,
          weight_decay = 0,
          load_best_model_at_end = True,
          metric_for_best_model = 'r2',
          compute_metrics_for_regression = compute_metrics_for_regression,
          collate_batch_into_chunks = collate_batch_into_chunks
         ):
    """Fine-tune a pretrained sequence model as a single-output regressor and evaluate it.

    Args:
        methodology: Key into ``methodologies`` selecting the experiment.
        methodologies: Mapping of key -> tuple
            ``(name, pretrained model name, create-encodings function,
            dataset class, standardize_parts, see_lbs, chunk)``.
        tokenizer: Accepted for interface compatibility only; it is replaced by
            ``AutoTokenizer.from_pretrained(model_name)``.
        save_label: Output directory name for the Trainer (``./{save_label}``).
        max_length: Tokenizer max length; overridden to 512 for methodology 0
            and to 40000 when chunking is enabled.
        evaluation_strategy, save_strategy, save_total_limit, learning_rate,
        per_device_train_batch_size, per_device_eval_batch_size,
        num_train_epochs, weight_decay, load_best_model_at_end,
        metric_for_best_model: Passed to ``TrainingArguments``.
        compute_metrics_for_regression: Metrics callback given to the Trainer.
        collate_batch_into_chunks: Data collator used only when the methodology
            enables chunking; otherwise the Trainer's default collator is used.

    Returns:
        ``(a, b)``: metric dicts from ``trainer.evaluate()`` on the validation
        set and on the test set.
    """
    (methodology_name, model_name, create_encodings_fx, make_datasets_fx,
     standardize_parts, see_lbs, chunk) = methodologies[methodology][:7]

    # The tokenizer is always rebuilt from the selected checkpoint and extended
    # with the five structural marker tokens.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.add_tokens(['<lb>', '</lb>', '<sb>', '</sb>', '[part]'])

    if methodology == 0:
        max_length = 512
    if chunk:
        print("Creating chunks so...")
        max_length = 40000
        per_device_eval_batch_size = 1
    else:
        # None makes the Trainer fall back to its default collator.
        collate_batch_into_chunks = None

    print(f"Max length is now {max_length}")
    print(f"Collate function is now {collate_batch_into_chunks}")

    print(f"Running following model: {methodology_name}")
    print(f"Methodology: {methodology_name}")
    print(f"Create encodings function: {create_encodings_fx}")
    print(f"Standardize parts: {standardize_parts}")
    print(f"See line breaks: {see_lbs}")
    print(f"Chunking: {chunk}")

    # NOTE(review): test_reg_albums (the held-out split from load_train_test) is
    # never used below; the "test" set evaluated here is carved out of
    # reg_albums by split_datasets. Confirm this is intended.
    reg_albums, test_reg_albums = load_train_test(standardize_parts, see_lbs)
    print(f"Working with {len(reg_albums)} albums in total for reg_albums...")

    x_train, y_train, x_valid, y_valid, x_test, y_test = split_datasets(reg_albums, 'c_rate')

    print("Creating train encodings...")
    train_encodings = create_encodings_fx(tokenizer, x_train, max_length = max_length, add_special_tokens = False)
    print("Creating valid encodings...")
    valid_encodings = create_encodings_fx(tokenizer, x_valid, max_length = max_length, add_special_tokens = False)
    print("Creating test encodings...")
    test_encodings = create_encodings_fx(tokenizer, x_test, max_length = max_length, add_special_tokens = False)

    train_dataset = make_datasets_fx(train_encodings, y_train)
    valid_dataset = make_datasets_fx(valid_encodings, y_valid)
    test_dataset =  make_datasets_fx(test_encodings, y_test)

    print(train_dataset[0])
    print(len(train_dataset[0]['input_ids']))
    print(f"After creating the datasets, the length of the training set is: {len(train_dataset)}")
    print(f"The length of the validation set is: {len(valid_dataset)}")
    print(f"The length of the test set is: {len(test_dataset)}")

    print("Finalized dataset creation, moving on to model...")
    # num_labels = 1 requests a single-output head; ignore_mismatched_sizes lets
    # the checkpoint's differently-shaped classifier be re-initialised.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 1, ignore_mismatched_sizes = True) # np.log(1000)
    # Account for the tokens added to the tokenizer above.
    model.resize_token_embeddings(len(tokenizer))

    if USE_CUDA:
        model = model.cuda()

    args = TrainingArguments(
        f"./{save_label}",
        evaluation_strategy = evaluation_strategy,
        save_strategy = save_strategy,
        save_total_limit = save_total_limit,
        learning_rate = learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        load_best_model_at_end=load_best_model_at_end,
        metric_for_best_model=metric_for_best_model
    )

    print(collate_batch_into_chunks)
    print("Instantiating the Trainer...")
    trainer = Trainer(
        model=model,                         # the instantiated Transformers model to be trained
        args=args,                           # training arguments, defined above
        train_dataset=train_dataset,         # training dataset
        eval_dataset=valid_dataset,          # evaluation dataset
        data_collator = collate_batch_into_chunks,
        compute_metrics=compute_metrics_for_regression    # the callback that computes metrics of interest
    )

    print("Training the model...")
    trainer.train()

    print("Evaluating the model using evaluation dataset...")
    a = trainer.evaluate()
    print("Returned from evaluate on evaluation set...")
    print(type(a))
    print(a)

    print("Evaluating the model using test dataset...")
    b = trainer.evaluate(test_dataset)
    print("Returned from evaluate on test set...")
    print(type(b))
    # Previously printed the builtin `bin` instead of the test metrics.
    print(b)
    return a, b

In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [11]:
                      #   Name           /Create encodings /Make dataset/Std Pts/Line breaks/Chunk
# Experiment registry. Each entry is laid out as main() indexes it:
#   (name, pretrained model, create-encodings fn, dataset class,
#    standardize parts, see line breaks, chunk)
methodologies = {
    0: ('BERT First 512', "distilbert-base-uncased-finetuned-sst-2-english",
        create_encodings, MakeTorchData, True, False, False),
    1: ('BERT Longformer 4096', "allenai/longformer-base-4096",
        create_encodings, MakeTorchData, True, False, False),
}

# Run-level settings
train_batch_size = 16
test_batch_size = 16
save_label = "BERT First 512"
max_length = 512

# Training Arguments
evaluation_strategy = 'epoch'
save_strategy = 'epoch'
save_total_limit = 1
learning_rate = 5e-5
per_device_train_batch_size = train_batch_size
per_device_eval_batch_size = test_batch_size
num_train_epochs = 10
weight_decay = 0
load_best_model_at_end = True
metric_for_best_model = 'r2'

# Keyword arguments for main(); the methodology key itself is passed positionally.
main_args = dict(
    methodologies = methodologies,
    tokenizer = tokenizer,
    save_label = save_label,
    max_length = max_length,
    evaluation_strategy = evaluation_strategy,
    save_strategy = save_strategy,
    save_total_limit = save_total_limit,
    learning_rate = learning_rate,
    per_device_train_batch_size = per_device_train_batch_size,
    per_device_eval_batch_size = per_device_eval_batch_size,
    num_train_epochs = num_train_epochs,
    weight_decay = weight_decay,
    load_best_model_at_end = load_best_model_at_end,
    metric_for_best_model = metric_for_best_model,
)

a, b = main(0, **main_args)
    

Max length is now 512
Collate function is now None
Running following model: BERT First 512
Methodology: BERT First 512
Create encodings function: <function create_encodings at 0x7ff743716a70>
Standardize parts: True
See line breaks: False
Chunking: False
After making limitations, working with 2856 albums in total...
2814/2856 (98.5%) albums, have length >200 and are retained.
Creating train encodings...
Creating valid encodings...
Creating test encodings...
{'input_ids': tensor([30524,  4873,  2058,  1996, 10098,  1010,  2126,  2039,  2152,  2045,
         1005,  1055,  1037,  2455,  2008,  1045,  2657,  1997,  2320,  1999,
         1037, 29149,  4873,  2058,  2008, 10098,  1010, 15717,  2024,  2630,
         1998,  1996,  5544,  2008,  2017,  8108,  2000,  3959,  2428,  2079,
         2272,  2995, 13834,  1045,  1005,  2222,  4299,  2588,  1037,  2732,
         1998,  5256,  2039,  2073,  1996,  8044,  2024,  2521,  2369,  2033,
         2073, 13460, 14899,  2066, 14380,  9010,  2152,

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([1, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([1]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
***** Running training *****
  Num examples = 2055
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1290


None
Instantiating the Trainer...
Training the model...


Epoch,Training Loss,Validation Loss,Mse,Var,R2,Accuracy
1,No log,2857.026123,2857.025635,84.991592,-32.615396,0.0
2,No log,895.021301,895.021301,84.991592,-9.530704,0.0
3,No log,86.924179,86.924179,84.991592,-0.022739,0.034934
4,1705.524600,87.243263,87.243263,84.991592,-0.026493,0.039301
5,1705.524600,99.849922,99.849922,84.991592,-0.174821,0.039301
6,1705.524600,84.505058,84.505066,84.991592,0.005724,0.061135
7,1705.524600,73.21769,73.21769,84.991592,0.13853,0.069869
8,65.864300,77.713051,77.713066,84.991592,0.085638,0.078603
9,65.864300,84.941544,84.941544,84.991592,0.000589,0.056769
10,65.864300,86.963882,86.963882,84.991592,-0.023206,0.048035


***** Running Evaluation *****
  Num examples = 229
  Batch size = 16


I am computing the metrics for regression...
Here is the type of eval_pred input to this function <class 'transformers.trainer_utils.EvalPrediction'>
Now, here is the actual value of eval_pred <transformers.trainer_utils.EvalPrediction object at 0x7ff6ac5cc750>
Here is the length of logits: 229
Logits: [[21.256742]
 [21.223179]
 [21.253834]
 [21.254168]
 [21.260906]]
Labels: [[85.]
 [61.]
 [85.]
 [79.]
 [76.]]


Saving model checkpoint to ./BERT First 512/checkpoint-129
Configuration saved in ./BERT First 512/checkpoint-129/config.json
Model weights saved in ./BERT First 512/checkpoint-129/pytorch_model.bin
Deleting older checkpoint [BERT First 512/checkpoint-1161] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 229
  Batch size = 16


I am computing the metrics for regression...
Here is the type of eval_pred input to this function <class 'transformers.trainer_utils.EvalPrediction'>
Now, here is the actual value of eval_pred <transformers.trainer_utils.EvalPrediction object at 0x7ff6ac8e9c10>
Here is the length of logits: 229
Logits: [[45.45079 ]
 [45.42547 ]
 [45.443764]
 [45.443398]
 [45.44888 ]]
Labels: [[85.]
 [61.]
 [85.]
 [79.]
 [76.]]


Saving model checkpoint to ./BERT First 512/checkpoint-258
Configuration saved in ./BERT First 512/checkpoint-258/config.json
Model weights saved in ./BERT First 512/checkpoint-258/pytorch_model.bin
Deleting older checkpoint [BERT First 512/checkpoint-1290] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 229
  Batch size = 16


I am computing the metrics for regression...
Here is the type of eval_pred input to this function <class 'transformers.trainer_utils.EvalPrediction'>
Now, here is the actual value of eval_pred <transformers.trainer_utils.EvalPrediction object at 0x7ff6acb08850>
Here is the length of logits: 229
Logits: [[72.51901 ]
 [72.5089  ]
 [72.5209  ]
 [72.51377 ]
 [72.516205]]
Labels: [[85.]
 [61.]
 [85.]
 [79.]
 [76.]]


Saving model checkpoint to ./BERT First 512/checkpoint-387
Configuration saved in ./BERT First 512/checkpoint-387/config.json
Model weights saved in ./BERT First 512/checkpoint-387/pytorch_model.bin
Deleting older checkpoint [BERT First 512/checkpoint-129] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 229
  Batch size = 16


I am computing the metrics for regression...
Here is the type of eval_pred input to this function <class 'transformers.trainer_utils.EvalPrediction'>
Now, here is the actual value of eval_pred <transformers.trainer_utils.EvalPrediction object at 0x7ff6c4070650>
Here is the length of logits: 229
Logits: [[75.42979]
 [75.28631]
 [75.44517]
 [75.41584]
 [75.42215]]
Labels: [[85.]
 [61.]
 [85.]
 [79.]
 [76.]]


Saving model checkpoint to ./BERT First 512/checkpoint-516
Configuration saved in ./BERT First 512/checkpoint-516/config.json
Model weights saved in ./BERT First 512/checkpoint-516/pytorch_model.bin
Deleting older checkpoint [BERT First 512/checkpoint-258] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 229
  Batch size = 16


I am computing the metrics for regression...
Here is the type of eval_pred input to this function <class 'transformers.trainer_utils.EvalPrediction'>
Now, here is the actual value of eval_pred <transformers.trainer_utils.EvalPrediction object at 0x7ff6ac5cc650>
Here is the length of logits: 229
Logits: [[78.222435]
 [78.190125]
 [78.20078 ]
 [77.93885 ]
 [78.206024]]
Labels: [[85.]
 [61.]
 [85.]
 [79.]
 [76.]]


Saving model checkpoint to ./BERT First 512/checkpoint-645
Configuration saved in ./BERT First 512/checkpoint-645/config.json
Model weights saved in ./BERT First 512/checkpoint-645/pytorch_model.bin
Deleting older checkpoint [BERT First 512/checkpoint-516] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 229
  Batch size = 16


I am computing the metrics for regression...
Here is the type of eval_pred input to this function <class 'transformers.trainer_utils.EvalPrediction'>
Now, here is the actual value of eval_pred <transformers.trainer_utils.EvalPrediction object at 0x7ff6acb44190>
Here is the length of logits: 229
Logits: [[78.97354 ]
 [78.958046]
 [78.988106]
 [74.543045]
 [78.65009 ]]
Labels: [[85.]
 [61.]
 [85.]
 [79.]
 [76.]]


Saving model checkpoint to ./BERT First 512/checkpoint-774
Configuration saved in ./BERT First 512/checkpoint-774/config.json
Model weights saved in ./BERT First 512/checkpoint-774/pytorch_model.bin
Deleting older checkpoint [BERT First 512/checkpoint-387] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 229
  Batch size = 16


I am computing the metrics for regression...
Here is the type of eval_pred input to this function <class 'transformers.trainer_utils.EvalPrediction'>
Now, here is the actual value of eval_pred <transformers.trainer_utils.EvalPrediction object at 0x7ff6ac486150>
Here is the length of logits: 229
Logits: [[79.31385 ]
 [78.992775]
 [77.894295]
 [69.9058  ]
 [77.29882 ]]
Labels: [[85.]
 [61.]
 [85.]
 [79.]
 [76.]]


Saving model checkpoint to ./BERT First 512/checkpoint-903
Configuration saved in ./BERT First 512/checkpoint-903/config.json
Model weights saved in ./BERT First 512/checkpoint-903/pytorch_model.bin
Deleting older checkpoint [BERT First 512/checkpoint-645] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 229
  Batch size = 16


I am computing the metrics for regression...
Here is the type of eval_pred input to this function <class 'transformers.trainer_utils.EvalPrediction'>
Now, here is the actual value of eval_pred <transformers.trainer_utils.EvalPrediction object at 0x7ff6ac6cd350>
Here is the length of logits: 229
Logits: [[80.841286]
 [81.27626 ]
 [79.87078 ]
 [75.67984 ]
 [77.749405]]
Labels: [[85.]
 [61.]
 [85.]
 [79.]
 [76.]]


Saving model checkpoint to ./BERT First 512/checkpoint-1032
Configuration saved in ./BERT First 512/checkpoint-1032/config.json
Model weights saved in ./BERT First 512/checkpoint-1032/pytorch_model.bin
Deleting older checkpoint [BERT First 512/checkpoint-774] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 229
  Batch size = 16


I am computing the metrics for regression...
Here is the type of eval_pred input to this function <class 'transformers.trainer_utils.EvalPrediction'>
Now, here is the actual value of eval_pred <transformers.trainer_utils.EvalPrediction object at 0x7ff6acad5d90>
Here is the length of logits: 229
Logits: [[82.16741]
 [82.14816]
 [80.70864]
 [75.42825]
 [78.80461]]
Labels: [[85.]
 [61.]
 [85.]
 [79.]
 [76.]]


Saving model checkpoint to ./BERT First 512/checkpoint-1161
Configuration saved in ./BERT First 512/checkpoint-1161/config.json
Model weights saved in ./BERT First 512/checkpoint-1161/pytorch_model.bin
Deleting older checkpoint [BERT First 512/checkpoint-1032] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 229
  Batch size = 16


I am computing the metrics for regression...
Here is the type of eval_pred input to this function <class 'transformers.trainer_utils.EvalPrediction'>
Now, here is the actual value of eval_pred <transformers.trainer_utils.EvalPrediction object at 0x7ff6acbc9550>
Here is the length of logits: 229
Logits: [[82.438194]
 [82.37202 ]
 [80.2937  ]
 [75.64771 ]
 [78.10065 ]]
Labels: [[85.]
 [61.]
 [85.]
 [79.]
 [76.]]


Saving model checkpoint to ./BERT First 512/checkpoint-1290
Configuration saved in ./BERT First 512/checkpoint-1290/config.json
Model weights saved in ./BERT First 512/checkpoint-1290/pytorch_model.bin
Deleting older checkpoint [BERT First 512/checkpoint-1161] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./BERT First 512/checkpoint-903 (score: 0.13853022414011906).
***** Running Evaluation *****
  Num examples = 229
  Batch size = 16


Evaluating the model using evaluation dataset...


***** Running Evaluation *****
  Num examples = 572
  Batch size = 16


I am computing the metrics for regression...
Here is the type of eval_pred input to this function <class 'transformers.trainer_utils.EvalPrediction'>
Now, here is the actual value of eval_pred <transformers.trainer_utils.EvalPrediction object at 0x7ff6acb249d0>
Here is the length of logits: 229
Logits: [[79.31385 ]
 [78.992775]
 [77.894295]
 [69.9058  ]
 [77.29882 ]]
Labels: [[85.]
 [61.]
 [85.]
 [79.]
 [76.]]
Returned from evaluate on evaluation set...
<class 'dict'>
{'eval_loss': 73.21768951416016, 'eval_mse': 73.21768951416016, 'eval_var': 84.99159240722656, 'eval_r2': 0.13853022414011906, 'eval_accuracy': 0.06986899563318777, 'eval_runtime': 3.7872, 'eval_samples_per_second': 60.466, 'eval_steps_per_second': 3.961, 'epoch': 10.0}
Evaluating the model using test dataset...
I am computing the metrics for regression...
Here is the type of eval_pred input to this function <class 'transformers.trainer_utils.EvalPrediction'>
Now, here is the actual value of eval_pred <transformers.traine

In [12]:
# Metric dictionaries returned by main(): `a` comes from trainer.evaluate() on the
# validation set, `b` from trainer.evaluate(test_dataset).
print(a)
print(b)

{'eval_loss': 73.21768951416016, 'eval_mse': 73.21768951416016, 'eval_var': 84.99159240722656, 'eval_r2': 0.13853022414011906, 'eval_accuracy': 0.06986899563318777, 'eval_runtime': 3.7872, 'eval_samples_per_second': 60.466, 'eval_steps_per_second': 3.961, 'epoch': 10.0}
{'eval_loss': 76.2323989868164, 'eval_mse': 76.23239135742188, 'eval_var': 84.88729095458984, 'eval_r2': 0.1019574448215369, 'eval_accuracy': 0.07517482517482517, 'eval_runtime': 9.4004, 'eval_samples_per_second': 60.849, 'eval_steps_per_second': 3.83, 'epoch': 10.0}


Graveyard

In [None]:

# Scratch check: show the kernel's current working directory.
os.getcwd()

'/content/gdrive/MyDrive/nlp22/project'

In [None]:
# def chunker(input_ids, attention_mask, curr_label, chunksize):
#     input_id_chunks = list(torch.tensor(input_ids).split(chunksize - 2))
#     mask_chunks = list(torch.tensor(attention_mask).split(chunksize - 2))
#     label_chunks = [curr_label] * len(input_id_chunks)
#     for i in range(len(input_id_chunks)):
#       # add CLS and SEP tokens to input IDs
#       input_id_chunks[i] = torch.cat([torch.tensor([101]), input_id_chunks[i], torch.tensor([102])])
#       # add attention tokens to attention mask
#       mask_chunks[i] = torch.cat([torch.tensor([1]), mask_chunks[i], torch.tensor([1])])
#       # get required padding length
#       pad_len = chunksize - input_id_chunks[i].shape[0]
#       # check if tensor length satisfies required chunk size
#       if pad_len > 0:
#           # if padding length is more than 0, we must add padding
#           input_id_chunks[i] = torch.cat([input_id_chunks[i], torch.Tensor([0] * pad_len)])
#           mask_chunks[i] = torch.cat([mask_chunks[i], torch.Tensor([0] * pad_len)])
#     return input_id_chunks, mask_chunks, label_chunks
# # chunker(a, b, 512)

# def chunk_encodings(encodings, labels):
#     encodings_dict = {'input_ids' : [],
#                       'attention_mask' : []}
#     y_train_big = []
#     for i in range(len(encodings['input_ids'])):
#         curr_input_ids = encodings['input_ids'][i]
#         curr_mask = encodings['attention_mask'][i] # train_encodings, test_encodings
#         curr_label = labels[i] # y_train

#         input_id_chunks, mask_chunks, label_chunks = chunker(curr_input_ids, curr_mask, curr_label, 512)

#         encodings_dict['input_ids'].extend(input_id_chunks)
#         encodings_dict['attention_mask'].extend(mask_chunks)
#         y_train_big.extend(label_chunks)
#     return encodings_dict, y_train_big

**Test Section**

In [None]:
# Scratch configuration for manual experiments.
# NOTE(review): running this cell overwrites the notebook-level `methodologies`,
# `tokenizer`, `max_length` and `main_args` used by the main run cell. The tuples
# here have no model-name entry, whereas main() reads the model name from index 1
# of each tuple. main() also has no `model_name` parameter and requires
# `save_label` / `max_length`, so this `main_args` does not match main()'s
# signature as defined in this notebook.
methodologies = {0 : ('BERT First 512', create_encodings, MakeTorchData, True, False, False),
                 1 : ('BERT Chunk 512', create_encodings, MakeTorchData, True, False, True)}


# Checkpoint used for the tokenizer below; the five marker tokens are added to its vocabulary.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_tokens(['<lb>', '</lb>', '<sb>', '</sb>', '[part]'])
max_length = 50000
batch_size = 16

# Training Arguments
evaluation_strategy = 'epoch'
save_strategy = 'epoch'
learning_rate = 5e-5
per_device_train_batch_size = batch_size
per_device_eval_batch_size = batch_size
num_train_epochs = 15
weight_decay = 0
load_best_model_at_end = True
metric_for_best_model = 'r2'

# Bundle of keyword arguments; see the NOTE(review) at the top of this cell.
main_args= {'methodologies' : methodologies,
            'model_name' : model_name,
            'tokenizer' : tokenizer,
            'evaluation_strategy': evaluation_strategy,
            'save_strategy' : save_strategy,
            'learning_rate' : learning_rate,
            'per_device_train_batch_size' : per_device_train_batch_size,
            'per_device_eval_batch_size' : per_device_eval_batch_size,
            'num_train_epochs' : num_train_epochs,
            'weight_decay': weight_decay,
            'load_best_model_at_end' : load_best_model_at_end,
            'metric_for_best_model' : metric_for_best_model}


Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [None]:
# methodology = 1
# methodology_name = methodologies[methodology][0]
# create_encodings_fx = methodologies[methodology][1]
# make_datasets_fx = methodologies[methodology][2]
# standardize_parts = methodologies[methodology][3]
# see_lbs = methodologies[methodology][4]
# chunk = methodologies[methodology][5]

# print(f"Running following model: {methodology_name}")
# print(f"Methodology: {methodology_name}")
# print(f"Create encodings function: {create_encodings_fx}")
# print(f"Standardize parts: {standardize_parts}")
# print(f"See line breaks: {see_lbs}")
# print(f"Chunking: {chunk}")

# train_dataset, valid_dataset = create_datasets(tokenizer, create_encodings_fx, make_datasets_fx, standardize_parts, see_lbs)

# Take the first three examples as a small batch for trying out the collate function.
# NOTE(review): `train_dataset` is not defined at notebook scope in the visible
# cells (it is a local inside main(), and the line that created it here is
# commented out), so this fails on a fresh kernel.
my_batch = [train_dataset[i] for i in range(3)]

def chunker(item, chunksize, cls_id = 101, sep_id = 102, pad_id = 0):
    """Split one encoded example into fixed-size chunks that share its label.

    NOTE: this redefines the ``chunker`` from an earlier cell; running this cell
    replaces that definition for the rest of the session.

    Trailing padding is removed first, then the remaining tokens are cut into
    pieces of ``chunksize - 2`` tokens. Each piece is wrapped as
    ``[cls_id] + piece + [sep_id]`` and right-padded with ``pad_id`` up to
    ``chunksize``.

    Fixes over the previous version:
      * Padding is created with the dtype of the ids/mask. ``torch.Tensor([0] * n)``
        is float32, which turned ``input_ids`` into a float tensor.
      * A final chunk that is only partly filled with real tokens is kept. The
        old check (``last token == 0 -> break``) discarded it, and discarded
        short examples entirely.

    Args:
        item: Mapping with 1-D tensors ``'input_ids'`` and ``'attention_mask'``
            and a ``'labels'`` entry.
            NOTE(review): assumes right-padding, i.e. the attention mask is a
            run of ones followed by zeros - confirm for the tokenizer in use.
        chunksize: Length of every produced chunk, including the two special tokens.
        cls_id, sep_id, pad_id: Token ids for the start, end and padding tokens.
            The defaults are the ids the previous version hard-coded.

    Returns:
        A list of dicts with keys ``'input_ids'``, ``'attention_mask'`` and
        ``'labels'``; empty when the example contains no unmasked tokens.

    Raises:
        ValueError: If ``chunksize`` leaves no room for at least one real token.
    """
    body_len = chunksize - 2
    if body_len < 1:
        raise ValueError(f"chunksize must be at least 3, got {chunksize}")

    input_ids = item['input_ids']
    attention_mask = item['attention_mask']

    # Number of real (unmasked) tokens; everything after that is padding.
    real_len = int(attention_mask.sum().item())
    if real_len == 0:
        return []

    newObs = []
    for body in input_ids[:real_len].split(body_len):
        ids = torch.cat([body.new_tensor([cls_id]), body, body.new_tensor([sep_id])])
        # Every position so far (special tokens included) should be attended to.
        mask = attention_mask.new_ones(ids.shape[0])

        pad_len = chunksize - ids.shape[0]
        if pad_len > 0:
            ids = torch.cat([ids, ids.new_full((pad_len,), pad_id)])
            mask = torch.cat([mask, mask.new_zeros(pad_len)])

        newObs.append({'input_ids': ids,
                       'attention_mask': mask,
                       'labels': item['labels']})
    return newObs

def collate_batch_into_chunks(features):
    """Flatten a batch of observations into the chunks produced by `chunker`.

    Every element of `features` is passed to `chunker(item, 512)` and the
    resulting lists are concatenated in batch order. The total number of
    chunks is printed before the flat list is returned.
    """
    flattened = [piece for observation in features for piece in chunker(observation, 512)]
    print(len(flattened))
    return flattened

# collate_batch_into_chunks(my_batch)

# print("Finalized dataset creation, moving on to model...")
# model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 1, ignore_mismatched_sizes = True) # np.log(1000)
# model.resize_token_embeddings(len(tokenizer))

# if USE_CUDA:
#     model = model.cuda()

# args = TrainingArguments(
#     f"test-{model_name}-finetuned",
#     evaluation_strategy = evaluation_strategy,
#     save_strategy = save_strategy,
#     learning_rate = learning_rate,
#     per_device_train_batch_size=per_device_train_batch_size,
#     per_device_eval_batch_size=per_device_eval_batch_size,
#     num_train_epochs=num_train_epochs,
#     weight_decay=weight_decay,
#     load_best_model_at_end=load_best_model_at_end,
#     metric_for_best_model=metric_for_best_model,
# )
# info = [model_name, learning_rate, per_device_train_batch_size, per_device_eval_batch_size, num_train_epochs]

# print("Instantiating the Trainer...")
# # # Call the Trainer
# trainer = Trainer(
#     model=model,                         # the instantiated Transformers model to be trained
#     args=args,                  # training arguments, defined above
#     train_dataset=train_dataset,         # training dataset
#     eval_dataset=valid_dataset,          # evaluation dataset

#     compute_metrics=compute_metrics_for_regression    # the callback that computes metrics of interest
# )

# print("Training the model...")
# # # # Train the model
# trainer.train()

12


[{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [None]:
# Registry of experiment set-ups keyed by an integer. Each entry is
# (display name, encoding function, dataset class, standardize_parts, see_lbs).
methodologies = {
    0: ('First try', create_train_test_encodings, MakeTorchData, True, False),
}
# NOTE(review): this rebinds the built-in `id` for the rest of the kernel session.
id = uuid.uuid4()
methodology = 0

(methodology_name,
 create_encodings_fx,
 make_datasets_fx,
 standardize_parts,
 see_lbs) = methodologies[methodology]

print(f"Running following model: {methodology_name}")
print(f"Methodology: {methodology_name}")

# The statements below replay, at cell level, the steps of the (currently disabled) call
# train_dataset, valid_dataset = create_datasets(tokenizer, create_encodings_fx, make_datasets_fx, standardize_parts, see_lbs)

# Step 1 (mirrors create_datasets): load the albums and split them into train/test.
chunk = True
max_length = 512
reg_albums = init_albums(
    path='',
    file='albums_f.pickle',
    standardize_parts=standardize_parts,
    see_lbs=see_lbs,
)

x_train, y_train, x_test, y_test = split_datasets(reg_albums, 'c_rate')

# Special tokens are requested from the tokenizer only when chunking is off.
add_special_tokens = not chunk

# Step 2 (mirrors create_train_test_encodings): tokenize both splits.
train_encodings, test_encodings = create_encodings_fx(
    tokenizer,
    x_train,
    x_test,
    max_length=max_length,
    add_special_tokens=add_special_tokens,
)

# First training example, kept around for manual inspection.
a = train_encodings['input_ids'][0]
b = train_encodings['attention_mask'][0]

# print(c[0])
# print(train_dataset[0])
# print(a)
# print(b)
# print(type(train_encodings))

# print(train_encodings['input_ids'][0:3])
# print(train_encodings['input_ids'][0:1])




# print("Creating train encodings")
# train_encodings = tokenizer(x_train, truncation=True, padding=True, max_length=max_length)

# len(train_encodings['input_ids'])
# chunksize = max_length - 2

# input_id_chunks = list(torch.tensor(train_encodings['input_ids'][0]).split(chunksize - 2))
# mask_chunks = list(torch.tensor(train_encodings['attention_mask'][0]).split(chunksize - 2))

# for i in range(len(input_id_chunks)):
#     # add CLS and SEP tokens to input IDs
#     input_id_chunks[i] = torch.cat([torch.tensor([101]), input_id_chunks[i], torch.tensor([102])])
#     # add attention tokens to attention mask
#     mask_chunks[i] = torch.cat([torch.tensor([1]), mask_chunks[i], torch.tensor([1])])
#     # get required padding length
#     pad_len = chunksize - input_id_chunks[i].shape[0]
#     # check if tensor length satisfies required chunk size
#     if pad_len > 0:
#         # if padding length is more than 0, we must add padding
#         input_id_chunks[i] = torch.cat([input_id_chunks[i], torch.Tensor([0] * pad_len)])
#         mask_chunks[i] = torch.cat([mask_chunks[i], torch.Tensor([0] * pad_len)])

# # check length of each tensor
# for chunk in input_id_chunks:
#     print(len(chunk))
# # print final chunk so we can see 101, 102, and 0 (PAD) tokens are all correctly placed
# chunk
# [print(i) for i in x_train[:10]]
# print(train_encodings)


# print("Finalized dataset creation, moving on to model...")
# model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 1, ignore_mismatched_sizes = True) # np.log(1000)
# model.resize_token_embeddings(len(tokenizer))

# if USE_CUDA:
#     model = model.cuda()

# args = TrainingArguments(
#     f"test-{model_name}-finetuned",
#     evaluation_strategy = evaluation_strategy,
#     save_strategy = save_strategy,
#     learning_rate = learning_rate,
#     per_device_train_batch_size=per_device_train_batch_size,
#     per_device_eval_batch_size=per_device_eval_batch_size,
#     num_train_epochs=num_train_epochs,
#     weight_decay=weight_decay,
#     load_best_model_at_end=load_best_model_at_end,
#     metric_for_best_model=metric_for_best_model,
# )
# info = [model_name, learning_rate, per_device_train_batch_size, per_device_eval_batch_size, num_train_epochs]

# print("Instantiating the Trainer...")
# # # Call the Trainer
# trainer = Trainer(
#     model=model,                         # the instantiated Transformers model to be trained
#     args=args,                  # training arguments, defined above
#     train_dataset=train_dataset,         # training dataset
#     eval_dataset=valid_dataset,          # evaluation dataset
#     compute_metrics=compute_metrics_for_regression    # the callback that computes metrics of interest
# )

# print("Training the model...")
# # # # Train the model
# trainer.train()

# print("Evaluating the model...")
# # # # Call the summary
# # trainer.evaluate()

Running following model: First try
Methodology: First try
After making limitations, working with 2856 albums in total...
2814/2856 (98.5%) albums, have length >200 and are retained.
512
False
Creating train encodings
Creating valid encodings


In [None]:
def split_datasets(reg_albums, rating_type, seed=None):
    """Randomly split albums 80/20 and pull out texts and integer ratings.

    Each album is indexed positionally: element 3 is used as the text,
    element 1 as the rating for ``'c_rate'`` and element 2 (multiplied by 10)
    as the rating for ``'u_rate'``. Ratings are truncated to ``int``.

    Args:
        reg_albums: indexable, sized collection of album records.
        rating_type: ``'c_rate'`` or ``'u_rate'``.
        seed: optional integer. When given, the split uses a dedicated
            ``torch.Generator`` seeded with it, so the same seed yields the
            same split. When ``None`` the global torch RNG is used.

    Returns:
        ``(x_train, y_train, x_test, y_test)`` as four lists.

    Raises:
        ValueError: if ``rating_type`` is not one of the supported names.
    """
    label_readers = {
        'c_rate': lambda album: int(album[1]),
        'u_rate': lambda album: int(10 * album[2]),
    }
    # Validate before splitting: an unknown name used to fall through both
    # branches and fail at the return statement with an unbound local.
    if rating_type not in label_readers:
        raise ValueError(
            f"Unknown rating_type {rating_type!r}; expected one of {sorted(label_readers)}"
        )
    read_label = label_readers[rating_type]

    num_train = int(len(reg_albums) * 0.8)
    num_test = len(reg_albums) - num_train
    split_kwargs = {}
    if seed is not None:
        split_kwargs['generator'] = torch.Generator().manual_seed(seed)
    train, test = random_split(reg_albums, [num_train, num_test], **split_kwargs)

    x_train = [album[3] for album in train]
    x_test = [album[3] for album in test]
    y_train = [read_label(album) for album in train]
    y_test = [read_label(album) for album in test]

    return x_train, y_train, x_test, y_test

def create_train_test_encodings(tokenizer, train_text, test_text, **kwargs):
    """Tokenize the train and test texts with identical tokenizer settings.

    Both calls use ``truncation=True`` and ``padding=True``. The two settings
    below are read from ``kwargs`` and echoed to stdout before tokenizing.

    Args:
        tokenizer: callable accepting the texts plus the keyword arguments
            ``truncation``, ``padding``, ``max_length`` and
            ``add_special_tokens``.
        train_text: texts of the training split.
        test_text: texts of the test split.
        **kwargs: optional ``max_length`` (default ``None``) and
            ``add_special_tokens`` (default ``True``).

    Returns:
        ``(train_encodings, test_encodings)`` exactly as returned by the
        tokenizer.
    """
    max_length = kwargs.get('max_length')
    print(max_length)

    # Previously unbound when the caller omitted it, which crashed on the print
    # below; True matches the Hugging Face tokenizer default.
    add_special_tokens = kwargs.get('add_special_tokens', True)
    print(add_special_tokens)

    tokenizer_options = {
        'truncation': True,
        'padding': True,
        'max_length': max_length,
        'add_special_tokens': add_special_tokens,
    }

    print("Creating train encodings")
    train_encodings = tokenizer(train_text, **tokenizer_options)
    print("Creating valid encodings")
    test_encodings = tokenizer(test_text, **tokenizer_options)
    return train_encodings, test_encodings

class MakeTorchData(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with one label per row.

    ``encodings`` is a mapping from field name to a per-row container (a
    tensor, or a sequence whose elements are lists or tensors). ``labels`` is
    a sequence with one entry per row.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return row ``idx`` as a dict of tensors plus a float ``'labels'`` entry."""
        item = {}
        for key, values in self.encodings.items():
            row = values[idx]
            # Rows that are already tensors are passed through unchanged;
            # anything else (e.g. a list of ids) is converted.
            item[key] = row if torch.is_tensor(row) else torch.tensor(row)
        # The label is returned as a plain Python float.
        item["labels"] = float(self.labels[idx])
        return item

    def __len__(self):
        """Number of rows, taken from the label sequence."""
        return len(self.labels)


def chunker(input_ids, attention_mask, curr_label, chunksize):
    """Cut one encoded sequence into chunks of exactly ``chunksize`` ids.

    The ids and mask are split into pieces of ``chunksize - 2`` positions.
    Each piece is wrapped with id 101 in front and id 102 behind (mask 1 for
    both) and right-padded with 0 (mask 0) up to ``chunksize``. All returned
    tensors are ``torch.long``.

    NOTE(review): positions that are already padding in ``input_ids`` are
    chunked like any other position; nothing is filtered by the mask here.
    NOTE(review): another cell of this notebook defines a two-argument
    ``chunker(item, chunksize)`` under the same name; whichever cell ran last
    is the one in effect.

    Args:
        input_ids: sequence (or tensor) of token ids for one observation.
        attention_mask: mask aligned with ``input_ids``.
        curr_label: label of the observation, repeated once per chunk.
        chunksize: final length of every chunk, wrapping ids included.

    Returns:
        ``(input_id_chunks, mask_chunks, label_chunks)``: three lists of
        equal length.
    """
    payload = chunksize - 2
    ids = torch.as_tensor(input_ids, dtype=torch.long)
    mask = torch.as_tensor(attention_mask, dtype=torch.long)

    input_id_chunks = list(ids.split(payload))
    mask_chunks = list(mask.split(payload))
    label_chunks = [curr_label] * len(input_id_chunks)

    cls_token = torch.tensor([101], dtype=torch.long)
    sep_token = torch.tensor([102], dtype=torch.long)
    one = torch.ones(1, dtype=torch.long)

    for i in range(len(input_id_chunks)):
        # Only the last piece can be shorter than the payload.
        pad_len = payload - input_id_chunks[i].shape[0]
        # Integer zeros: torch.Tensor([0] * n) is float32 and would turn the
        # whole concatenated chunk into floats.
        padding = torch.zeros(pad_len, dtype=torch.long)
        input_id_chunks[i] = torch.cat([cls_token, input_id_chunks[i], sep_token, padding])
        mask_chunks[i] = torch.cat([one, mask_chunks[i], one, padding])
    return input_id_chunks, mask_chunks, label_chunks
# chunker(a, b, 512)

def chunk_encodings(encodings, labels):
    """Run `chunker` over every encoded row and pool the resulting chunks.

    For row ``n`` the ids, the mask and ``labels[n]`` are handed to
    ``chunker(..., 512)``; the chunks of all rows are appended, in row order,
    to flat lists.

    Returns:
        ``(encodings_dict, chunk_labels)`` where ``encodings_dict`` has the
        keys ``'input_ids'`` and ``'attention_mask'`` and ``chunk_labels``
        holds one label per chunk.
    """
    pooled_ids = []
    pooled_masks = []
    pooled_labels = []
    for row, row_ids in enumerate(encodings['input_ids']):
        row_mask = encodings['attention_mask'][row]
        row_label = labels[row]

        id_pieces, mask_pieces, label_pieces = chunker(row_ids, row_mask, row_label, 512)

        pooled_ids.extend(id_pieces)
        pooled_masks.extend(mask_pieces)
        pooled_labels.extend(label_pieces)
    return {'input_ids': pooled_ids, 'attention_mask': pooled_masks}, pooled_labels

def create_datasets(tokenizer, create_encodings_fx, make_datasets_fx, standardize_parts = True, see_lbs = False, chunk = False, max_length = 512, rating_type = 'c_rate'):
    """Load the albums, split them, tokenize both splits and wrap them as datasets.

    Steps: ``init_albums`` -> ``split_datasets`` -> ``create_encodings_fx`` ->
    (optionally) ``chunk_encodings`` -> ``make_datasets_fx``.

    Args:
        tokenizer: tokenizer forwarded to ``create_encodings_fx``.
        create_encodings_fx: callable
            ``(tokenizer, x_train, x_test, max_length=..., add_special_tokens=...)``
            returning ``(train_encodings, test_encodings)``.
        make_datasets_fx: callable ``(encodings, labels)`` returning a dataset.
        standardize_parts: forwarded to ``init_albums``.
        see_lbs: forwarded to ``init_albums``.
        chunk: when true, the tokenizer is asked not to add special tokens and
            both splits are passed through ``chunk_encodings`` (labels are
            repeated per chunk) before the datasets are built.
        max_length: forwarded to ``create_encodings_fx``. This used to be read
            from a notebook-level global; the default matches the value the
            notebook assigns to that global.
        rating_type: forwarded to ``split_datasets`` (default ``'c_rate'``,
            the value that was previously hard-coded).

    Returns:
        ``(train_dataset, valid_dataset)``.
    """
    reg_albums = init_albums(path = '', file = 'albums_f.pickle',
            standardize_parts = standardize_parts, see_lbs = see_lbs)

    x_train, y_train, x_test, y_test = split_datasets(reg_albums, rating_type)

    # Special tokens are requested from the tokenizer only when not chunking.
    add_special_tokens = not chunk

    train_encodings, test_encodings = create_encodings_fx(tokenizer, x_train, x_test,
                                                          max_length = max_length,
                                                          add_special_tokens = add_special_tokens)

    # The dataset construction below used to read *_new names whose only
    # assignments were commented out, so every call ended in a NameError.
    if chunk:
        train_encodings, y_train = chunk_encodings(train_encodings, y_train)
        test_encodings, y_test = chunk_encodings(test_encodings, y_test)

    train_dataset = make_datasets_fx(train_encodings, y_train)
    valid_dataset = make_datasets_fx(test_encodings, y_test)
    return train_dataset, valid_dataset

In [None]:
# Display the first training example as the cell's output.
# NOTE(review): `train_dataset` is not created in this cell; it must already exist in the kernel.
train_dataset[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1

In [None]:

# Print the first training example as plain text.
# NOTE(review): `train_dataset` is not created in this cell; it must already exist in the kernel.
print(train_dataset[0])

{'input_ids': tensor([  101, 30524, 10506, 13106,  6170,  1997,  2115,  9135,  1999,  2023,
         5507,  8004,  6313,  9478,  1997, 17128,  8554, 12758,  1999,  2023,
         3690,  2302, 13734,  2302, 13734,  1010,  2302, 13734,  2057,  2562,
         8119, 15561,  1999,  2023,  3690,  1997, 15862,  2040,  2562, 11065,
        24501, 16781,  2013,  1996,  8616,  4355,  1997,  3628,  1010,  3628,
         2023, 14099, 27523, 23436,  1045,  1005,  1049,  2469,  2017,  2657,
         2009,  2077,  2612,  1997,  8119, 15561,  2191,  2033,  1037, 14412,
         7485,  2006,  2115,  2723,  2006,  2115,  2723,  1010,  2006,  2115,
         2723,  1012,  1012,  1012, 30525, 30524, 30525, 30524,  2017,  2020,
         1037,  9467,  3238,  2775,  2316,  6340,  2011, 10551,  2892, 14731,
         2505,  2021, 10256,  2748,  1998,  2053,  2074,  3432,  4694,  1005,
         1056,  8826,  2664,  2821,  2748,  1010,  2821,  2053,  2059,  2028,
         2154,  2017,  1005,  1040,  2018,  2009, 