<a href="https://colab.research.google.com/github/heinohen/tko_7095_i2hlt/blob/main/week3_exercise2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [67]:
!pip3 install -q transformers[torch] datasets evaluate optuna plotly

In [68]:
from pprint import pprint # Pretty print
import datasets
from google.colab import userdata
# Read the Colab secret named 'hf'.
# NOTE(review): the returned value is discarded, so this call alone does not hand a
# token to `datasets` -- confirm whether it is still needed.
userdata.get('hf') # Hugging Face secret
dset = datasets.load_dataset('imdb') # load the 'imdb' dataset; its splits are printed below
print(dset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [69]:
# Shuffle every split with a fixed seed so that Restart & Run All reproduces the same
# example order (an unseeded shuffle changes the data order on every run).
dset = dset.shuffle(seed=42)
# Drop the 'unsupervised' split, which the cells below never use. The membership
# check keeps this cell re-runnable: a bare `del` raises KeyError the second time.
if 'unsupervised' in dset:
  del dset['unsupervised']

In [70]:
import sklearn.feature_extraction

# Binary bag-of-words features restricted to a vocabulary of at most 20000 terms (as in the exercise)
vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    binary=True,
    max_features=20000,
)

# All review texts of the 'train' split, read straight from the 'text' column
texts = dset['train']['text']
# Learn the vocabulary from the training texts only
vectorizer.fit(texts)

In [71]:
# Example from course

def vectorize_example(ex) -> dict:
  """Map one example to the ids of the vocabulary features that occur in its text.

  The text in ex['text'] is run through the fitted `vectorizer`, which yields a
  one-row document-term matrix. `.nonzero()` on that matrix returns a pair
  (row indices, column indices); the column indices are the vocabulary features
  present in the text. Every index is shifted by +1 so that id 0 is never
  produced and stays free for a special meaning (padding).

  Returns a dict with the single key "input_ids".
  """
  doc_term = vectorizer.transform([ex['text']])
  _, feature_columns = doc_term.nonzero()
  return {"input_ids": feature_columns + 1}

# Try it out on the first training example
vectorized = vectorize_example(dset['train'][0])

In [72]:
# Example from course

# Invert the vocabulary mapping: feature index -> word
index_to_word = {index: word for word, index in vectorizer.vocabulary_.items()}
# input_ids were shifted by +1 during vectorization, so shift back before the lookup
words = [index_to_word[feature_id - 1] for feature_id in vectorized['input_ids']]

pprint(', '.join(words))

('about, acting, actors, all, american, and, are, arguing, at, between, both, '
 'but, by, character, classic, compare, complicated, crimes, english, '
 'enjoying, evaluate, fact, far, funny, good, hand, have, how, if, is, it, '
 'judge, just, language, like, looks, main, maybe, more, no, not, on, or, '
 'other, people, plot, plots, police, prefer, prime, quite, really, relation, '
 'series, similarities, simple, spirit, spot, superficial, suspect, than, '
 'that, the, there, they, thing, think, this, to, too, violent, way, we, weak, '
 'weirdo, writing')


In [73]:
# Vectorize every example of every split with .map().
# num_proc sets how many CPU processes map() uses to work through the data in parallel.
dset_tokenized = dset.map(function=vectorize_example, num_proc=4)
pprint(dset_tokenized["train"][0])

{'input_ids': [309,
               419,
               430,
               727,
               826,
               887,
               1115,
               1129,
               1300,
               1901,
               2248,
               2604,
               2625,
               3053,
               3337,
               3676,
               3726,
               4299,
               6087,
               6104,
               6294,
               6584,
               6663,
               7406,
               7801,
               8190,
               8322,
               8780,
               8929,
               9602,
               9630,
               9846,
               9890,
               10217,
               10475,
               10644,
               10871,
               11137,
               11681,
               12134,
               12202,
               12437,
               12504,
               12564,
               13028,
               13359,
               13362,
     

In [74]:
import torch

def collator(list_of_examples):
  """Build one padded batch from a list of examples.

  Each example is a mapping with an integer 'label' and a sequence of feature
  ids under 'input_ids'. The id sequences are right-padded with 0 to the length
  of the longest one in the batch and stacked row-wise.

  Returns a dict with:
    'labels'    -- tensor with one label per example
    'input_ids' -- integer tensor of shape (batch, longest sequence)
  """
  batch = {'labels': torch.tensor([ex['label'] for ex in list_of_examples])}
  # Force an integer dtype: an example with no features would otherwise become an
  # empty float tensor, and stacking it would silently promote the whole batch to
  # float, which the embedding lookup cannot consume.
  rows = [torch.tensor(ex['input_ids'], dtype=torch.long) for ex in list_of_examples]
  max_len = max(row.shape[0] for row in rows) # length of the longest input in the batch
  # pad(input, (left, right)): add zeros on the right only
  # https://pytorch.org/docs/stable/generated/torch.nn.functional.pad.html
  padded = [torch.nn.functional.pad(row, (0, max_len - row.shape[0])) for row in rows]
  # https://pytorch.org/docs/stable/generated/torch.vstack.html
  batch['input_ids'] = torch.vstack(padded) # one row per example
  return batch

In [75]:
import torch
import transformers

# Every model needs a config class; inheriting from the base class for
# pretrained configs is all that is required here.
class MLPConfig(transformers.PretrainedConfig):
    """Configuration for the MLP model; adds nothing on top of the base config class."""

# This is the model
class MLP(transformers.PreTrainedModel):
    """Bag-of-features classifier: summed feature embeddings followed by one linear layer.

    The config must provide `vocab_size`, `hidden_size` and `nlabels`.
    Feature id 0 is reserved for padding and is backed by an all-zero embedding
    row, so padded positions do not contribute to the summed representation.
    """

    config_class=MLPConfig

    def __init__(self,config):
        """Create the trainable layers: the embedding matrix and the output layer."""
        super().__init__(config)
        self.vocab_size=config.vocab_size #embedding matrix row count (without the padding row)
        # Embedding of (vocab size + 1) x hidden size; the +1 is the padding index 0
        self.embedding=torch.nn.Embedding(num_embeddings=self.vocab_size+1,embedding_dim=config.hidden_size,padding_idx=0)
        # Initialize the embeddings with small random values
        torch.nn.init.uniform_(self.embedding.weight.data,-0.001,0.001)
        # uniform_ fills the WHOLE matrix, including the padding row that Embedding
        # had zeroed, so put the zeros back. padding_idx keeps that row out of the
        # gradient updates, hence it stays zero during training.
        self.embedding.weight.data[self.embedding.padding_idx].zero_()
        # Output layer: hidden size x number of labels
        self.output=torch.nn.Linear(in_features=config.hidden_size,out_features=config.nlabels)

    def forward(self,input_ids,labels=None):
        """Run the model on one batch.

        input_ids -- (batch, ids) tensor of feature ids, padded with 0
        labels    -- optional tensor with the correct class per example

        Returns (loss, logits) when `labels` is given, otherwise (logits,).
        """
        #1) look up the embeddings: (batch,ids) -> (batch,ids,embedding_dim)
        embedded=self.embedding(input_ids)
        # and sum them across the id dimension: -> (batch,embedding_dim)
        # Padding positions map to the all-zero row and add nothing to the sum.
        embedded_summed=torch.sum(embedded,dim=1)

        #2) EXERCISE 5: the tanh non-linearity was deliberately removed here,
        # so the output layer works directly on the summed embeddings.

        #3) output layer: (batch,embedding_dim) -> (batch, number of classes)
        logits=self.output(embedded_summed)

        if labels is None:
            # No labels, so just return the logits
            return (logits,)
        # Labels are available, so compute the classification loss as well
        loss_fn=torch.nn.CrossEntropyLoss()
        return (loss_fn(logits,labels),logits)

# Model configuration; MLP.__init__() reads each of these values
mlp_config = MLPConfig(
    vocab_size=len(vectorizer.vocabulary_), # one embedding row per vocabulary entry
    hidden_size=20,                         # width of the embeddings
    nlabels=2,                              # number of output classes
)


### Model


In [76]:
# Build a model and push a tiny two-example batch through it as a smoke test
mlp = MLP(mlp_config)
first_two = [dset_tokenized["train"][idx] for idx in (0, 1)]
fake_batch = collator(first_two)
mlp(**fake_batch) # ** unpacks the batch dict into the input_ids= and labels= arguments of the call

(tensor(0.7865, grad_fn=<NllLossBackward0>),
 tensor([[0.1887, 0.0079],
         [0.1886, 0.0119]], grad_fn=<AddmmBackward0>))

### Training arguments

In [95]:
# Reference for all options:
# https://huggingface.co/docs/transformers/en/main_classes/trainer#transformers.TrainingArguments

trainer_args = transformers.TrainingArguments(
    output_dir="mlp_checkpoints",    # checkpoints are saved here
    # --- optimisation ---
    learning_rate=1e-4,              # initial learning rate (library default: 5e-5)
    per_device_train_batch_size=64,
    max_steps=20000,                 # total number of training steps; overrides num_train_epochs,
                                     # the data is re-iterated until this many steps are done
    #num_train_epochs=5.0,
    # --- evaluation and logging ---
    evaluation_strategy="steps",     # evaluate (and log) every eval_steps
    eval_steps=500,                  # update steps between two evaluations
    logging_strategy="steps",        # log every logging_steps
    logging_steps=500,               # update steps between two logs
    # --- model selection ---
    load_best_model_at_end=True,     # reload the best checkpoint once training ends;
                                     # the best checkpoint is then always saved
)

pprint(trainer_args)

TrainingArguments(
_n_gpu=0,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=500,
evaluation_strategy=steps,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_l

### Compute accuracy


In [78]:
import numpy as np
import evaluate

# `evaluate` standardizes how model performance is measured and reported.
# https://pypi.org/project/evaluate/
accuracy = evaluate.load('accuracy')

def compute_accuracy(outputs_and_labels):
  """Compute accuracy from a (model outputs, correct labels) pair.

  The predicted class of each example is the position of its largest output
  value along the last axis.
  https://numpy.org/doc/stable/reference/generated/numpy.argmax.html
  """
  logits, gold = outputs_and_labels
  predicted = np.argmax(logits, axis=-1)
  return accuracy.compute(predictions=predicted, references=gold)

### Make a new model


In [96]:
# Seed the random number generators BEFORE the model is created, so that the
# random weight initialization is identical on every run and the runs for
# different hyperparameters can be compared fairly.
transformers.set_seed(trainer_args.seed)
mlp = MLP(mlp_config)

# Early stopping: training is stopped once the evaluation metric (the evaluation
# loss here) has failed to improve for this many consecutive evaluation calls.
# ( early_stopping_patience: int = 1, early_stopping_threshold: Optional = 0.0 )
early_stopping = transformers.EarlyStoppingCallback(early_stopping_patience=5)

trainer = transformers.Trainer(
    model = mlp,
    args = trainer_args,
    train_dataset = dset_tokenized['train'],
    eval_dataset = dset_tokenized['test'].select(range(1000)), # evaluate on a smaller subset to keep evaluation fast
    compute_metrics = compute_accuracy,
    data_collator = collator,
    callbacks = [early_stopping]
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Accuracy
500,0.5226,0.414814,0.852
1000,0.3167,0.326775,0.877
1500,0.2394,0.299593,0.888
2000,0.1951,0.292867,0.888
2500,0.1672,0.294402,0.882
3000,0.1408,0.300209,0.879
3500,0.1271,0.310201,0.877
4000,0.107,0.323284,0.873
4500,0.0952,0.336304,0.866


Checkpoint destination directory mlp_checkpoints/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory mlp_checkpoints/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory mlp_checkpoints/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory mlp_checkpoints/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory mlp_checkpoints/checkpoint-2500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory mlp_checkpoints/checkpoint-3000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory mlp_checkpoints/checkpoint-3500 already exists and is no

TrainOutput(global_step=4500, training_loss=0.2123383526272244, metrics={'train_runtime': 79.0895, 'train_samples_per_second': 16184.204, 'train_steps_per_second': 252.878, 'total_flos': 28613549664.0, 'train_loss': 0.2123383526272244, 'epoch': 11.51})

Reporting blog 😸

**LEARNING RATE**

TrainingArguments(

learning_rate=0.0001,
# All other unchanged
)

TrainOutput(global_step=4500, training_loss=0.18218419308132597, metrics={'train_runtime': 133.83, 'train_samples_per_second': 19128.74, 'train_steps_per_second': 149.443, 'total_flos': 61814021472.0, 'train_loss': 0.18218419308132597, 'epoch': 22.96})


TrainingArguments(

learning_rate=1e-05,
# All other unchanged
)



TrainOutput(global_step=20000, training_loss=0.33597693824768066, metrics={'train_runtime': 625.5423, 'train_samples_per_second': 4092.449, 'train_steps_per_second': 31.972, 'total_flos': 274545274752.0, 'train_loss': 0.33597693824768066, 'epoch': 102.04})



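TrainingArguments(

learning_rate=1e-05,  # NOTE: same value as the previous entry but a different result -- one of the two values is probably mistyped, confirm
# All other unchanged
)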

TrainOutput(global_step=20000, training_loss=0.6240427291870118, metrics={'train_runtime': 616.7456, 'train_samples_per_second': 4150.82, 'train_steps_per_second': 32.428, 'total_flos': 274545274752.0, 'train_loss': 0.6240427291870118, 'epoch': 102.04})

**NUM_TRAIN_EPOCHS**

An epoch in machine learning means one complete pass of the training dataset through the algorithm.

TrainingArguments(

num_train_epochs = 1,
# All other unchanged
)

TrainOutput(global_step=196, training_loss=0.6293549051090163, metrics={'train_runtime': 5.3816, 'train_samples_per_second': 4645.454, 'train_steps_per_second': 36.42, 'total_flos': 2702724192.0, 'train_loss': 0.6293549051090163, 'epoch': 1.0})

TrainingArguments(

num_train_epochs = 3,
# All other unchanged
)
TrainOutput(global_step=588, training_loss=0.5115975944363341, metrics={'train_runtime': 17.3097, 'train_samples_per_second': 4332.835, 'train_steps_per_second': 33.969, 'total_flos': 8088105312.0, 'train_loss': 0.5115975944363341, 'epoch': 3.0})

TrainingArguments(

num_train_epochs = 5,
# All other unchanged
)

TrainOutput(global_step=980, training_loss=0.43474208286830357, metrics={'train_runtime': 29.7027, 'train_samples_per_second': 4208.372, 'train_steps_per_second': 32.994, 'total_flos': 13483141056.0, 'train_loss': 0.43474208286830357, 'epoch': 5.0})



**PER_DEVICE_TRAIN_BATCH_SIZE**




TrainingArguments(

per_device_train_batch_size = 8,
# All other unchanged

)

TrainOutput(global_step=8500, training_loss=0.26540270457548254, metrics={'train_runtime': 58.3357, 'train_samples_per_second': 2742.744, 'train_steps_per_second': 342.843, 'total_flos': 4506209568.0, 'train_loss': 0.26540270457548254, 'epoch': 2.72})

TrainingArguments(

per_device_train_batch_size = 32,
# All other unchanged

)

TrainOutput(global_step=5000, training_loss=0.23753613052368164, metrics={'train_runtime': 59.7578, 'train_samples_per_second': 10709.907, 'train_steps_per_second': 334.685, 'total_flos': 14289180192.0, 'train_loss': 0.23753613052368164, 'epoch': 6.39})

TrainingArguments(

per_device_train_batch_size = 64,
# All other unchanged

)


TrainOutput(global_step=4500, training_loss=0.2123383526272244, metrics={'train_runtime': 79.0895, 'train_samples_per_second': 16184.204, 'train_steps_per_second': 252.878, 'total_flos': 28613549664.0, 'train_loss': 0.2123383526272244, 'epoch': 11.51})