# 1. Imports


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import time
import glob
import torch
import torch.nn as nn
from pathlib import Path

import re
import string
import math
from collections import defaultdict
import random
import warnings
from tqdm import tqdm
from nltk.corpus import stopwords
from prettytable import PrettyTable

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc,accuracy_score,auc,accuracy_score,f1_score
from sklearn.preprocessing import LabelEncoder

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
from transformers import AdamW
from transformers import TrainingArguments, EarlyStoppingCallback
from transformers import Trainer
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer


import torch.nn.utils.prune as prune
from torch.utils.tensorboard import SummaryWriter
from torch.nn import Linear, Module

import wandb
import optuna
#from prettytable import PrettyTable
import copy
from datasets import Dataset,DatasetDict
from datasets import load_dataset, load_metric ,concatenate_datasets


In [2]:
data_path = 'data/'
tb_dir = 'tbs_HW2/'
results = 'results/'
models_trained_params = 'models_trained_params/'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 
home_dir = os.getcwd()

# 2. Preparing the data

Fetching the data

In [3]:
def get_clean_data(name):
    data = pd.read_csv(Path(home_dir, name)).iloc[:,1:].rename({'filtered_text_2':'filtered_text'},axis=1)
#     clean_data=clean_data(data) #replace with the real funcion name
    return data


data=get_clean_data('preprocessed_df.csv')


Encoding the labels

In [4]:
features=data['filtered_text']
labels=data['main_category'].values


# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Fit the encoder on the labels and transform the labels
encoded_labels = label_encoder.fit_transform(labels)

# Replace the labels column in the dataframe with the encoded labels
data['encoded_main_category'] = encoded_labels

Splitting the data

In [5]:
def split_data(x, y, test_ratio=0.2, random_state=42 ,batch_size=32):
    train_size=int((1-test_ratio)*(x.shape[0]))
    test_size=int((x.shape[0])-train_size)

    train_end_idx = train_size
    test_end_idx = train_size + test_size
    # Make one list for all the reviews
    headlines = x.tolist()

    # Mini sample of reviews for train and test
    train_data = headlines[:train_end_idx]
    test_data = headlines[train_end_idx:test_end_idx]
    # Take mini sample of the labels and preprocess them such that we can use them in the model training loop
    labels = encoded_labels.tolist()
    train_labels = labels[:train_end_idx]
    test_labels = labels[train_end_idx:test_end_idx]

    # Use data set class in order to build train and test datasets
    train_df = pd.DataFrame({'text':train_data, 'labels':train_labels})#.dropna().drop_duplicates()
    test_df = pd.DataFrame({'text':test_data, 'labels':test_labels})#.dropna().drop_duplicates()
    train_df.to_csv('train_df.csv', index = False)
    test_df.to_csv('test_df.csv', index = False)
    
    data_files = {
    'train':'train_df.csv',
    'test':'test_df.csv'
    }
    
    raw_datasets = load_dataset("csv", data_files=data_files)
    

    return raw_datasets


In [6]:
dataset = split_data(features, labels)

Downloading and preparing dataset csv/default to C:/Users/liyag/.cache/huggingface/datasets/csv/default-2e78edd390e366fa/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to C:/Users/liyag/.cache/huggingface/datasets/csv/default-2e78edd390e366fa/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 388761
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 97191
    })
})

dataset

# Knowladge distilation 

In [8]:
def knowladge_distilation_training(student_model, teacher_model, train_dataset):
    epochs = 10
    optimizer = torch.optim.AdamW(student_model.parameters(), lr=1e-5)
    softmax = torch.nn.Softmax()
    relu = torch.nn.ReLU()
    # Knowledge distillation training loop
    for epoch in range(epochs):
        for batch in train_dataset:
            # Forward pass with the teacher model to generate soft targets
            input_ids = batch['input_ids']
            input_ids = torch.tensor(input_ids).unsqueeze(0).to(device)  # Add batch dimension
            attention_mask = batch['attention_mask']
            attention_mask = torch.tensor(attention_mask).unsqueeze(0).to(device)  # Add batch dimension

            with torch.no_grad():
                teacher_outputs = teacher_model(input_ids, attention_mask=attention_mask)
                teacher_logits = teacher_outputs.logits

            # Forward pass with the student model
            student_outputs = student_model(input_ids, attention_mask=attention_mask)
            student_logits = student_outputs.logits

            # Compute the distillation loss
            temperature = 2  # Temperature parameter for softening the logits
            soft_teacher_logits = softmax(teacher_logits / temperature)
            soft_student_logits = softmax(student_logits / temperature)
            distillation_loss = torch.nn.KLDivLoss()(torch.log_softmax(student_logits, dim=-1),
                                                     soft_teacher_logits.detach())

            # Optionally, add other losses such as cross-entropy or task-specific losses

            # Backpropagation and optimization
            optimizer.zero_grad()
            total_loss = distillation_loss  # You can add other losses here
            total_loss.backward()
            optimizer.step()

    # Optionally, you can fine-tune the student model on labeled data using traditional supervised learning techniques
    return student_model

# 3. Creating the model class

In [17]:
class OurAwesomeModel():
    def __init__(self,model_name,dataset):
        self.model_name=model_name
        self.model=AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=6,return_dict=True).to(device)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.dataset=dataset
        self.token_args={"max_length": 32, "truncation": True, "padding": "max_length"}
        
        self.tokenized_datasets = self.tokenization()
        
    # function that tokenize the data and returns train and test tokenized dataset in torch format
    def tokenization(self):
        tokenized_dataset=self.dataset.map(self.tokenizer, input_columns='text', fn_kwargs=self.token_args)
        tokenized_dataset.set_format('torch')
        return tokenized_dataset
    
    # function that defines the Trainer object and train the model
    def train(self,train_args):
        train_args = TrainingArguments(**train_args)
        trainer = Trainer(
            model=self.model,
            args=train_args,
            train_dataset=self.tokenized_datasets['train'],
            eval_dataset=self.tokenized_datasets['test'],
            compute_metrics=self.metric_fn)
        trainer.train()

    
    # definition of the metric we want to optimize
    def metric_fn(self, predictions):
        preds = predictions.predictions.argmax(axis=1)
        labels = predictions.label_ids
        return {'f1': f1_score(labels, preds, average='weighted')
               ,'accuracy':accuracy_score(labels,preds)}
    
    # help function for hyperparameter finetuning
    def model_init(self):
        return self.model
    
    # function that prefurms hyperparameter finetuning with optuna as backend. 
    # The function gets the train arguments from the function 'my_hp_space'.
    def hpm_search(self,train_args_dict):
        wandb.init(project="model1", name = self.model_name)
        train_args = TrainingArguments(**train_args_dict)
        trainer = Trainer(
            model=self.model,
            args=train_args,
            train_dataset=self.tokenized_datasets['train'],
            eval_dataset=self.tokenized_datasets['test'],
            model_init=self.model_init,
            compute_metrics=self.metric_fn)
        
        best_run=trainer.hyperparameter_search(direction="maximize", hp_space=self.my_hp_space,n_trials=10)
        best_hyperparameters = best_run.hyperparameters
        print(best_hyperparameters)
        train_args_dict.update(best_hyperparameters)
        print(train_args_dict)
        self.train(TrainingArguments(**train_args_dict))
        self.save_model()
        wandb.finish()
    
    # function that is called in every optuna trial, and return a combination of hyperparameters to try. 
    def my_hp_space(self,trial):
        return {"learning_rate": trial.suggest_float("learning_rate", 1e-6, 5e-5, log=True),
                "num_train_epochs": trial.suggest_categorical("num_train_epochs", [10]),
                "seed": trial.suggest_categorical("seed", [0]),
                "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16,32]),
                "gradient_accumulation_steps":trial.suggest_int("gradient_accumulation_steps",1,6),
                "warmup_steps":trial.suggest_int("warmup_steps",0,500),
                "weight_decay":trial.suggest_float("weight_decay",1e-4,1e-2),
                "per_device_eval_batch_size":trial.suggest_categorical("per_device_eval_batch_size",[16,32])}
    
    
    def evaluate(self, train_args_dict):
        train_args = TrainingArguments(**train_args_dict)
        trainer = Trainer(
            model=self.model,
            args=train_args,
            train_dataset=self.tokenized_datasets['train'],
            eval_dataset=self.tokenized_datasets['test'],
            compute_metrics=self.metric_fn)
        
        predictions = trainer.predict(self.tokenized_datasets['test'])
        result_dict = self.metric_fn(predictions)
        for k,v in result_dict.items():
            print(f'{k} value: {v}')
    
    # First contraction function
    # pruns the amount% of the smallest weights
    def pruning(self,amount):           
        for module_name, module in self.model.named_modules():
            if isinstance(module, Linear):# and 'ff' in module_name:
                print(f'\n{module_name}:\nold_total_weights = {module.weight.sum()}')
                prune.l1_unstructured(module, name='weight', amount=amount)
                prune.remove(module, name='weight')
                print(f'new_total_weights = {module.weight.sum()}')

    # second contraction function
    # saves the weights in half of the space 
    def half(self):
        self.model = self.model.half()
    
    # function that prints the models' weights. 
    # The weights are presented in their absolute value such that negative and positive weights wont offset
    def show_parameters(self):
        table = PrettyTable(["Modules", "Parameters","Sum of Tensor"])
        total_params = 0
        total_sum = 0
        for name, parameter in self.model.named_parameters():
            if not parameter.requires_grad: 
                continue
            params = parameter.numel()
            total_sum += parameter.abs().sum()
            total=float(str(parameter.abs().sum()).split(',')[0][7:])
            table.add_row([name, params,total])
            total_params+=params
        table.add_row(["Total Trainable Params",str(total_params),str(total_sum.item())])
        print(table)
        return total_params

    def save_model(self, model_type = None):
        if model_type == None:
            torch.save(self.model.state_dict(), f'{self.model_name}.pt')
        else:
            torch.save(self.model.state_dict(), f'{model_type}_{self.model_name}.pt')
        print('model was saved')
        
    def load_trained_model(self, model_type = None):
        if model_type == None:
            self.model.load_state_dict(torch.load(f'{self.model_name}.pt'))
        else:
            self.model.load_state_dict(torch.load(f'{model_type}_{self.model_name}.pt'))
        print('model was loaded')

# 4 Model initialization

Intializing params

In [10]:
wandb.finish()

In [11]:
wandb.init(project="model1")

OUT_PATH=Path(home_dir,'results')

args = {'output_dir':OUT_PATH,
 'overwrite_output_dir':True,
 'greater_is_better':True,
 'evaluation_strategy':'steps',
 'do_train':True,
 'logging_strategy':'epoch',
 'save_strategy':'epoch',
 'report_to':'wandb'}

[34m[1mwandb[0m: Currently logged in as: [33mliyag[0m ([33mdelta_lxr[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [13]:
model_names = ["xlnet-base-cased","bert-base-uncased"]
for model_name in model_names:
    #Initializing the model
    print('==========================================================================')
    print(f'============================== {model_name} ==============================')
    print('==========================================================================')
    this_model=OurAwesomeModel(model_name,dataset)
    this_model.show_parameters()
    this_model.evaluate(args)
    #finding the best hyperparameters and retraining the model
    print(f'------------------------------ regular {model_name} ------------------------------')
    this_model.hpm_search(args)
    this_model.show_parameters()
    this_model.evaluate(args)



Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Map:   0%|          | 0/388761 [00:00<?, ? examples/s]

Map:   0%|          | 0/97191 [00:00<?, ? examples/s]

+-------------------------------------------------+------------+---------------+
|                     Modules                     | Parameters | Sum of Tensor |
+-------------------------------------------------+------------+---------------+
|               transformer.mask_emb              |    768     |     7.254     |
|        transformer.word_embedding.weight        |  24576000  |   1074791.5   |
|          transformer.layer.0.rel_attn.q         |   589824   |   44567.0234  |
|          transformer.layer.0.rel_attn.k         |   589824   |   94692.3594  |
|          transformer.layer.0.rel_attn.v         |   589824   |   15121.5391  |
|          transformer.layer.0.rel_attn.o         |   589824   |   16153.6406  |
|          transformer.layer.0.rel_attn.r         |   589824   |   10870.3428  |
|      transformer.layer.0.rel_attn.r_r_bias      |    768     |    119.4765   |
|      transformer.layer.0.rel_attn.r_s_bias      |    768     |    135.992    |
|      transformer.layer.0.r

f1 value: 0.07812441269462927
accuracy value: 0.15471597164346493
------------------------------ regular xlnet-base-cased ------------------------------


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01693333333338766, max=1.0)…

[32m[I 2023-06-09 15:52:15,815][0m A new study created in memory with name: no-name-18028011-b32e-452c-89b7-c0134b0d2149[0m


Step,Training Loss,Validation Loss,F1,Accuracy
500,No log,1.120128,0.598907,0.602031
1000,No log,1.006096,0.637887,0.639679
1500,No log,0.968242,0.652609,0.654268
2000,No log,0.940844,0.661977,0.664249
2500,No log,0.921573,0.66565,0.66929
3000,No log,0.906607,0.674113,0.675371
3500,No log,0.890034,0.67862,0.680773
4000,No log,0.891456,0.678572,0.680763


[32m[I 2023-06-09 16:30:01,035][0m Trial 0 finished with value: 1.3593341776112122 and parameters: {'learning_rate': 1.373489595987342e-05, 'num_train_epochs': 1, 'seed': 0, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 3, 'warmup_steps': 43, 'weight_decay': 0.0074106703716325575, 'per_device_eval_batch_size': 16}. Best is trial 0 with value: 1.3593341776112122.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▄▆▇▇███
eval/f1,▁▄▆▇▇███
eval/loss,█▅▃▃▂▂▁▁
eval/runtime,▆▃▂▁█▄▅▁
eval/samples_per_second,▃▆▇█▁▅▄█
eval/steps_per_second,▃▆▇█▁▅▄█
train/epoch,▁▂▃▄▅▆▇███
train/global_step,▁▂▃▄▅▆▇███
train/learning_rate,▁
train/loss,▁

0,1
eval/accuracy,0.68076
eval/f1,0.67857
eval/loss,0.89146
eval/runtime,157.0817
eval/samples_per_second,618.729
eval/steps_per_second,38.674
train/epoch,1.0
train/global_step,4049.0
train/learning_rate,0.0
train/loss,1.0345


Step,Training Loss,Validation Loss,F1,Accuracy
500,No log,0.904572,0.676291,0.678108
1000,No log,0.927829,0.676025,0.678417
1500,No log,0.94313,0.681107,0.681966
2000,No log,0.959677,0.67678,0.678725
2500,No log,0.933554,0.683145,0.684415
3000,No log,0.9384,0.681046,0.683088
3500,No log,0.920184,0.68531,0.687183
4000,No log,0.90951,0.686306,0.687718
4500,No log,0.896039,0.687725,0.68885
5000,No log,0.884864,0.689374,0.691391


[32m[I 2023-06-09 17:04:31,214][0m Trial 1 finished with value: 1.3846022626670083 and parameters: {'learning_rate': 8.0042071943296e-06, 'num_train_epochs': 1, 'seed': 0, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 2, 'warmup_steps': 226, 'weight_decay': 0.009505582380919406, 'per_device_eval_batch_size': 32}. Best is trial 1 with value: 1.3846022626670083.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▁▃▁▄▃▅▅▆▇██
eval/f1,▁▁▃▁▄▃▅▆▆▇██
eval/loss,▄▆▇█▆▆▅▄▃▂▁▁
eval/runtime,█▅▄▁▆▄▁▅▇▄▂▆
eval/samples_per_second,▁▄▄█▃▅█▄▂▅▇▃
eval/steps_per_second,▁▄▄█▃▅█▄▂▅▇▃
train/epoch,▁▂▂▃▄▄▅▅▆▇▇███
train/global_step,▁▂▂▃▄▄▅▅▆▇▇███
train/learning_rate,▁
train/loss,▁

0,1
eval/accuracy,0.69337
eval/f1,0.69124
eval/loss,0.86642
eval/runtime,83.0871
eval/samples_per_second,1169.749
eval/steps_per_second,36.564
train/epoch,1.0
train/global_step,6074.0
train/learning_rate,0.0
train/loss,0.8006


Step,Training Loss,Validation Loss,F1,Accuracy
500,No log,0.957291,0.685401,0.686247
1000,No log,0.979467,0.682548,0.683397
1500,No log,1.061554,0.680159,0.680989
2000,No log,1.078669,0.678191,0.679621
2500,No log,1.121005,0.67239,0.67534
3000,No log,1.145086,0.677387,0.677326
3500,No log,1.148397,0.674955,0.677779
4000,No log,1.126086,0.67687,0.676822
4500,No log,1.122853,0.676472,0.676997
5000,No log,1.10128,0.680873,0.682121


[32m[I 2023-06-09 17:59:00,790][0m Trial 2 finished with value: 1.3893931968850302 and parameters: {'learning_rate': 9.72585136490878e-06, 'num_train_epochs': 1, 'seed': 0, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 1, 'warmup_steps': 450, 'weight_decay': 0.002903679962944256, 'per_device_eval_batch_size': 32}. Best is trial 2 with value: 1.3893931968850302.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▅▄▃▂▁▂▂▂▂▃▄▃▄▅▄▄▅▆▇▇████
eval/f1,▅▄▄▃▁▃▂▂▂▄▄▃▄▅▄▅▅▆▇▇████
eval/loss,▃▄▆▆▇██▇▇▇▆▆▆▅▄▄▄▃▂▂▂▁▁▁
eval/runtime,▃▁▁▁▂▂▂▂▂▂▁▂▁▂▁▂▂▂▄▆▇█▃▆
eval/samples_per_second,▆███▇▇▇▇▇▇█▇█▇█▇▇▇▅▃▂▁▆▃
eval/steps_per_second,▆███▇▇▇▇▇▇█▇█▇█▇▇▇▅▃▂▁▆▃
train/epoch,▁▁▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▆▇▇▇████
train/global_step,▁▁▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▆▇▇▇████
train/learning_rate,▁
train/loss,▁

0,1
eval/accuracy,0.69575
eval/f1,0.69364
eval/loss,0.86803
eval/runtime,85.3835
eval/samples_per_second,1138.288
eval/steps_per_second,35.581
train/epoch,1.0
train/global_step,12149.0
train/learning_rate,0.0
train/loss,0.6406


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

Step,Training Loss,Validation Loss,F1,Accuracy
500,No log,0.873198,0.692193,0.694324
1000,No log,0.86828,0.695727,0.69739
1500,No log,0.855643,0.697153,0.698336
2000,No log,0.85887,0.698641,0.70024
2500,No log,0.850856,0.700345,0.701433
3000,No log,0.851248,0.699806,0.701598
3500,No log,0.842568,0.702038,0.703923
4000,No log,0.841271,0.701056,0.702472
4500,No log,0.83796,0.702868,0.704108
5000,No log,0.8315,0.703496,0.705117


[32m[I 2023-06-09 18:47:50,758][0m Trial 3 finished with value: 1.408663555451863 and parameters: {'learning_rate': 4.716373095627379e-06, 'num_train_epochs': 1, 'seed': 0, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 4, 'warmup_steps': 332, 'weight_decay': 0.00376190193606706, 'per_device_eval_batch_size': 32}. Best is trial 3 with value: 1.408663555451863.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▃▃▅▅▆▇▆▇███
eval/f1,▁▃▄▅▆▅▇▆▇███
eval/loss,█▇▅▆▅▅▃▃▃▂▁▁
eval/runtime,▂▂▅▃█▁▂▃▃▃▃▃
eval/samples_per_second,▇▇▄▆▁█▇▆▆▆▆▆
eval/steps_per_second,▇▇▄▆▁█▇▆▆▆▆▆
train/epoch,▁▂▂▃▄▄▅▅▆▇▇███
train/global_step,▁▂▂▃▄▄▅▅▆▇▇███
train/learning_rate,▁
train/loss,▁

0,1
eval/accuracy,0.70531
eval/f1,0.70335
eval/loss,0.82764
eval/runtime,84.0005
eval/samples_per_second,1157.029
eval/steps_per_second,36.166
train/epoch,1.0
train/global_step,6074.0
train/learning_rate,0.0
train/loss,0.8372


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016933333332902596, max=1.0…

Step,Training Loss,Validation Loss,F1,Accuracy
500,No log,0.963914,0.69041,0.691525
1000,No log,0.978342,0.691109,0.692317
1500,No log,0.951713,0.690907,0.6924
2000,No log,0.911814,0.69414,0.695003
2500,No log,0.95398,0.688653,0.691319
3000,No log,0.889982,0.697904,0.698233
3500,No log,0.908312,0.693674,0.695486
4000,No log,0.881859,0.699578,0.702195
4500,No log,0.859502,0.700771,0.702616
5000,No log,0.837281,0.705313,0.707679


[32m[I 2023-06-09 19:43:17,073][0m Trial 4 finished with value: 1.4262618535121545 and parameters: {'learning_rate': 1.5478032577602657e-05, 'num_train_epochs': 1, 'seed': 0, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 3, 'warmup_steps': 322, 'weight_decay': 0.008013760485701626, 'per_device_eval_batch_size': 32}. Best is trial 4 with value: 1.4262618535121545.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▁▁▂▁▃▂▄▄▆▆▇▇███
eval/f1,▂▂▂▃▁▄▂▄▅▆▆▇▇███
eval/loss,▇█▇▅▇▄▅▄▃▂▂▂▁▁▁▁
eval/runtime,▅▃▁▂▁▅▅▄▆▃▄▃█▆▄▄
eval/samples_per_second,▄▆▇▇█▄▄▅▃▆▅▆▁▃▅▅
eval/steps_per_second,▄▆█▇█▄▄▅▃▆▅▆▁▃▅▅
train/epoch,▁▁▂▂▃▃▄▄▅▅▆▆▇▇▇███
train/global_step,▁▁▂▂▃▃▄▄▅▅▆▆▇▇▇███
train/learning_rate,▁
train/loss,▁

0,1
eval/accuracy,0.71412
eval/f1,0.71214
eval/loss,0.80755
eval/runtime,84.2675
eval/samples_per_second,1153.363
eval/steps_per_second,36.052
train/epoch,1.0
train/global_step,8099.0
train/learning_rate,0.0
train/loss,0.7543


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

Step,Training Loss,Validation Loss,F1,Accuracy
500,No log,0.994766,0.70804,0.708944
1000,No log,1.036872,0.704104,0.705034
1500,No log,1.038139,0.705081,0.706598
2000,No log,1.028348,0.705412,0.707082
2500,No log,1.003674,0.705823,0.707205
3000,No log,0.968645,0.705709,0.706784
3500,No log,0.923456,0.706456,0.707679
4000,No log,0.899002,0.706659,0.708018


[32m[I 2023-06-09 20:11:24,967][0m Trial 5 finished with value: 1.4146772834597436 and parameters: {'learning_rate': 1.478613898874922e-06, 'num_train_epochs': 1, 'seed': 0, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 3, 'warmup_steps': 171, 'weight_decay': 0.009318396304951801, 'per_device_eval_batch_size': 32}. Best is trial 4 with value: 1.4262618535121545.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,█▁▄▅▅▄▆▆
eval/f1,█▁▃▃▄▄▅▆
eval/loss,▆███▆▅▂▁
eval/runtime,▆▃▂▁▄▄█▇
eval/samples_per_second,▃▆▇█▅▅▁▂
eval/steps_per_second,▃▆▇█▅▅▁▂
train/epoch,▁▂▃▄▅▆▇███
train/global_step,▁▂▃▄▅▆▇███
train/learning_rate,▁
train/loss,▁

0,1
eval/accuracy,0.70802
eval/f1,0.70666
eval/loss,0.899
eval/runtime,84.0882
eval/samples_per_second,1155.823
eval/steps_per_second,36.129
train/epoch,1.0
train/global_step,4049.0
train/learning_rate,0.0
train/loss,0.5153


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016933333332902596, max=1.0…

Step,Training Loss,Validation Loss,F1,Accuracy
500,No log,1.291916,0.682407,0.683633


[32m[I 2023-06-09 20:18:10,912][0m Trial 6 pruned. [0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁
train/global_step,▁

0,1
eval/accuracy,0.68363
eval/f1,0.68241
eval/loss,1.29192
eval/runtime,160.5545
eval/samples_per_second,605.346
eval/steps_per_second,37.838
train/epoch,0.12
train/global_step,500.0


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01693333333338766, max=1.0)…

Step,Training Loss,Validation Loss,F1,Accuracy
500,No log,1.288374,0.69246,0.693572
1000,No log,1.241664,0.691721,0.69241
1500,No log,1.122425,0.694665,0.696011
2000,No log,0.944313,0.699396,0.701114


[32m[I 2023-06-09 20:39:55,841][0m Trial 7 finished with value: 1.400510697336472 and parameters: {'learning_rate': 1.7031152456350014e-05, 'num_train_epochs': 1, 'seed': 0, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 5, 'warmup_steps': 366, 'weight_decay': 0.002920701632894948, 'per_device_eval_batch_size': 32}. Best is trial 4 with value: 1.4262618535121545.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▂▁▄█
eval/f1,▂▁▄█
eval/loss,█▇▅▁
eval/runtime,▂▃█▁
eval/samples_per_second,▇▆▁█
eval/steps_per_second,▇▆▁█
train/epoch,▁▃▅▆██
train/global_step,▁▃▅▆██
train/learning_rate,▁
train/loss,▁

0,1
eval/accuracy,0.70111
eval/f1,0.6994
eval/loss,0.94431
eval/runtime,83.8781
eval/samples_per_second,1158.718
eval/steps_per_second,36.219
train/epoch,1.0
train/global_step,2429.0
train/learning_rate,0.0
train/loss,0.4721


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.017183333333135428, max=1.0…

Step,Training Loss,Validation Loss,F1,Accuracy
500,No log,1.4983,0.685156,0.686133


[32m[I 2023-06-09 20:44:18,714][0m Trial 8 pruned. [0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁
train/global_step,▁

0,1
eval/accuracy,0.68613
eval/f1,0.68516
eval/loss,1.4983
eval/runtime,160.1391
eval/samples_per_second,606.916
eval/steps_per_second,37.936
train/epoch,0.08
train/global_step,500.0


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

Step,Training Loss,Validation Loss,F1,Accuracy
500,No log,1.856807,0.688943,0.689611


[32m[I 2023-06-09 20:48:39,150][0m Trial 9 pruned. [0m


{'learning_rate': 1.5478032577602657e-05, 'num_train_epochs': 1, 'seed': 0, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 3, 'warmup_steps': 322, 'weight_decay': 0.008013760485701626, 'per_device_eval_batch_size': 32}
{'output_dir': WindowsPath('C:/Users/liyag/OneDrive - mail.tau.ac.il/Desktop/NLP/results'), 'overwrite_output_dir': True, 'greater_is_better': True, 'evaluation_strategy': 'steps', 'do_train': True, 'logging_strategy': 'epoch', 'save_strategy': 'epoch', 'report_to': 'wandb', 'learning_rate': 1.5478032577602657e-05, 'num_train_epochs': 1, 'seed': 0, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 3, 'warmup_steps': 322, 'weight_decay': 0.008013760485701626, 'per_device_eval_batch_size': 32}


Step,Training Loss,Validation Loss,F1,Accuracy
500,No log,1.872986,0.689693,0.689498
1000,No log,1.76089,0.674682,0.673962
1500,No log,1.395288,0.67709,0.678417
2000,No log,1.317701,0.68248,0.684271
2500,No log,1.315684,0.685224,0.687461
3000,No log,1.224269,0.682763,0.682625
3500,No log,1.198329,0.685566,0.686092
4000,No log,1.207603,0.689087,0.690753
4500,No log,1.084959,0.691835,0.692544
5000,No log,1.00105,0.69569,0.697441


model was saved


0,1
eval/accuracy,▄▄▁▂▃▄▃▃▄▅▆▆▇▇███
eval/f1,▄▄▁▂▃▃▃▃▄▅▅▆▇▇███
eval/loss,██▇▅▄▄▄▃▄▃▂▂▂▁▁▁▁
eval/runtime,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/samples_per_second,▁████████████████
eval/steps_per_second,█▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▂
train/epoch,▁▁▁▂▂▃▃▄▄▅▅▆▆▇▇▇███
train/global_step,▁▁▁▂▂▃▃▄▄▅▅▆▆▇▇▇███
train/learning_rate,▁
train/loss,▁

0,1
eval/accuracy,0.70988
eval/f1,0.70779
eval/loss,0.83153
eval/runtime,83.3805
eval/samples_per_second,1165.632
eval/steps_per_second,36.435
train/epoch,1.0
train/global_step,8099.0
train/learning_rate,0.0
train/loss,0.4835


+-------------------------------------------------+------------+---------------+
|                     Modules                     | Parameters | Sum of Tensor |
+-------------------------------------------------+------------+---------------+
|               transformer.mask_emb              |    768     |     7.254     |
|        transformer.word_embedding.weight        |  24576000  |   1073323.0   |
|          transformer.layer.0.rel_attn.q         |   589824   |   44516.6953  |
|          transformer.layer.0.rel_attn.k         |   589824   |    94528.0    |
|          transformer.layer.0.rel_attn.v         |   589824   |   15092.6914  |
|          transformer.layer.0.rel_attn.o         |   589824   |   16119.0615  |
|          transformer.layer.0.rel_attn.r         |   589824   |   10848.5449  |
|      transformer.layer.0.rel_attn.r_r_bias      |    768     |    117.9462   |
|      transformer.layer.0.rel_attn.r_s_bias      |    768     |    136.0362   |
|      transformer.layer.0.r

f1 value: 0.7080168983557641
accuracy value: 0.7100554578098796


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Map:   0%|          | 0/388761 [00:00<?, ? examples/s]

Map:   0%|          | 0/97191 [00:00<?, ? examples/s]

+---------------------------------------------------------+------------+---------------+
|                         Modules                         | Parameters | Sum of Tensor |
+---------------------------------------------------------+------------+---------------+
|          bert.embeddings.word_embeddings.weight         |  23440896  |   956970.75   |
|        bert.embeddings.position_embeddings.weight       |   393216   |   4574.8262   |
|       bert.embeddings.token_type_embeddings.weight      |    1536    |    13.3915    |
|             bert.embeddings.LayerNorm.weight            |    768     |    652.2689   |
|              bert.embeddings.LayerNorm.bias             |    768     |     31.436    |
|     bert.encoder.layer.0.attention.self.query.weight    |   589824   |   19902.5742  |
|      bert.encoder.layer.0.attention.self.query.bias     |    768     |    167.2101   |
|      bert.encoder.layer.0.attention.self.key.weight     |   589824   |   19553.2656  |
|       bert.encoder.

f1 value: 0.04756803369114388
accuracy value: 0.1665689209906267
------------------------------ regular bert-base-uncased ------------------------------


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

[32m[I 2023-06-09 21:47:34,226][0m A new study created in memory with name: no-name-3dfd12f6-d3da-4867-9da1-75eeb7b62d97[0m


VBox(children=(Label(value='0.000 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016933333332902596, max=1.0…

Step,Training Loss,Validation Loss,F1,Accuracy
500,No log,1.002143,0.644742,0.646233
1000,No log,0.938153,0.662803,0.666564
1500,No log,0.908089,0.67785,0.678674
2000,No log,0.884724,0.684254,0.684786
2500,No log,0.8837,0.682898,0.685825
3000,No log,0.835247,0.701074,0.700939
3500,No log,0.834478,0.701091,0.703347
4000,No log,0.827585,0.704114,0.703892
4500,No log,0.80897,0.712631,0.713111
5000,No log,0.798826,0.713769,0.715272


[32m[I 2023-06-09 22:42:40,696][0m Trial 0 finished with value: 1.4758326496018417 and parameters: {'learning_rate': 4.2512816728948675e-05, 'num_train_epochs': 1, 'seed': 0, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 1, 'warmup_steps': 344, 'weight_decay': 0.006283907150985664, 'per_device_eval_batch_size': 16}. Best is trial 0 with value: 1.4758326496018417.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁▃▃▄▄▅▅▅▆▆▇▆▇▇▇▇▇███████
eval/f1,▁▂▃▄▄▅▅▅▆▆▆▆▇▇▇▇▇███████
eval/loss,█▆▆▅▅▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
eval/runtime,▄▄▅▄▆▁▇▄▅▆▃▇▁█▄▆▅▅▇▇▇▃▆▅
eval/samples_per_second,▅▅▃▅▃█▂▅▄▃▆▂█▁▅▃▄▄▂▂▂▆▃▄
eval/steps_per_second,▅▅▃▅▃█▂▅▄▃▆▂█▁▅▃▄▄▂▂▂▆▃▄
train/epoch,▁▁▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▆▇▇▇████
train/global_step,▁▁▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▆▇▇▇████
train/learning_rate,▁
train/loss,▁

0,1
eval/accuracy,0.73864
eval/f1,0.73719
eval/loss,0.73138
eval/runtime,99.0215
eval/samples_per_second,981.514
eval/steps_per_second,61.35
train/epoch,1.0
train/global_step,12149.0
train/learning_rate,0.0
train/loss,0.8321


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01693333333338766, max=1.0)…

Step,Training Loss,Validation Loss,F1,Accuracy
500,No log,0.819083,0.7316,0.732311
1000,No log,0.849077,0.729343,0.730109
1500,No log,0.807408,0.732253,0.732918
2000,No log,0.748402,0.735984,0.73726


[32m[I 2023-06-09 22:59:49,880][0m Trial 1 finished with value: 1.473243629480157 and parameters: {'learning_rate': 2.6888595823894276e-05, 'num_train_epochs': 1, 'seed': 0, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 6, 'warmup_steps': 14, 'weight_decay': 0.006626619115607247, 'per_device_eval_batch_size': 16}. Best is trial 0 with value: 1.4758326496018417.[0m


0,1
eval/accuracy,▃▁▄█
eval/f1,▃▁▄█
eval/loss,▆█▅▁
eval/runtime,█▂▆▁
eval/samples_per_second,▁▇▃█
eval/steps_per_second,▁▇▃█
train/epoch,▁▃▆███
train/global_step,▁▃▆███
train/learning_rate,▁
train/loss,▁

0,1
eval/accuracy,0.73726
eval/f1,0.73598
eval/loss,0.7484
eval/runtime,99.2493
eval/samples_per_second,979.261
eval/steps_per_second,61.209
train/epoch,1.0
train/global_step,2024.0
train/learning_rate,0.0
train/loss,0.4976


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016916666666414434, max=1.0…

Step,Training Loss,Validation Loss,F1,Accuracy
500,No log,0.791558,0.727387,0.729337
1000,No log,0.858375,0.720278,0.719182
1500,No log,0.894733,0.712331,0.710518
2000,No log,0.841431,0.714847,0.715303
2500,No log,0.872561,0.711455,0.71234
3000,No log,0.877086,0.716177,0.716692
3500,No log,0.856135,0.71493,0.71732
4000,No log,0.85679,0.71668,0.716177
4500,No log,0.864885,0.715608,0.715766
5000,No log,0.868332,0.710892,0.71234


[32m[I 2023-06-10 00:49:33,466][0m Trial 2 finished with value: 1.4861161532429588 and parameters: {'learning_rate': 3.190830237536043e-05, 'num_train_epochs': 1, 'seed': 0, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 1, 'warmup_steps': 240, 'weight_decay': 0.0014452414420046068, 'per_device_eval_batch_size': 16}. Best is trial 2 with value: 1.4861161532429588.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▅▃▁▂▁▂▂▂▁▄▃▃▃▂▃▃▄▄▅▄▄▅▅▆▆▆▅▆▆▆▇▇▇▇▇█████
eval/f1,▅▃▁▂▁▂▂▂▁▃▃▃▃▃▃▃▄▃▅▄▄▄▅▆▆▆▅▆▆▆▇▇▇▇▇█████
eval/loss,▄▇█▆▇▆▆▇▇▆▆▅▇▇▆▅▅▅▅▄▅▄▄▃▃▃▃▃▃▃▂▂▂▂▂▁▁▁▁▁
eval/runtime,▅▄▆▂▆▇▄▅▇▂▂▇▄▅▆▇▁▇▄▅▅▆▃▇▄▅▄█▂█▆▅▅▆▄▃▇▅▅▃
eval/samples_per_second,▄▅▃▇▃▂▅▄▂▇▇▂▄▄▃▂█▂▅▄▄▃▆▂▅▄▅▁▇▁▃▄▄▃▅▆▂▄▄▆
eval/steps_per_second,▄▅▃▇▃▂▅▄▂▇▇▂▄▄▃▂█▂▅▄▄▃▆▂▅▄▅▁▇▁▃▄▄▃▅▆▂▄▄▆
train/epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,▁
train/loss,▁

0,1
eval/accuracy,0.74376
eval/f1,0.74235
eval/loss,0.72347
eval/runtime,98.4426
eval/samples_per_second,987.286
eval/steps_per_second,61.711
train/epoch,1.0
train/global_step,24298.0
train/learning_rate,0.0
train/loss,0.6405


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01718333333362049, max=1.0)…

Step,Training Loss,Validation Loss,F1,Accuracy
500,No log,0.889665,0.737773,0.738258
1000,No log,0.928884,0.734948,0.735243
1500,No log,0.956085,0.7344,0.734996
2000,No log,0.960743,0.732673,0.733082
2500,No log,0.949342,0.732524,0.732949
3000,No log,0.929264,0.732641,0.732856
3500,No log,0.891245,0.733513,0.733936
4000,No log,0.863092,0.734274,0.734862


[32m[I 2023-06-10 01:17:00,191][0m Trial 3 finished with value: 1.4691362907112078 and parameters: {'learning_rate': 2.57245317971525e-06, 'num_train_epochs': 1, 'seed': 0, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 6, 'warmup_steps': 76, 'weight_decay': 0.0077149454026886844, 'per_device_eval_batch_size': 32}. Best is trial 2 with value: 1.4861161532429588.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,█▄▄▁▁▁▂▄
eval/f1,█▄▄▁▁▁▂▃
eval/loss,▃▆██▇▆▃▁
eval/runtime,▂▁█▆▆▅▁▁
eval/samples_per_second,▇█▁▃▃▄██
eval/steps_per_second,▇█▁▃▃▄██
train/epoch,▁▂▃▄▅▆▇███
train/global_step,▁▂▃▄▅▆▇███
train/learning_rate,▁
train/loss,▁

0,1
eval/accuracy,0.73486
eval/f1,0.73427
eval/loss,0.86309
eval/runtime,51.6278
eval/samples_per_second,1882.532
eval/steps_per_second,58.844
train/epoch,1.0
train/global_step,4049.0
train/learning_rate,0.0
train/loss,0.3267


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016916666666899498, max=1.0…

Step,Training Loss,Validation Loss,F1,Accuracy
500,No log,1.011748,0.730944,0.730757
1000,No log,1.032112,0.727625,0.727547
1500,No log,1.066266,0.728502,0.729131
2000,No log,1.03579,0.728037,0.728483
2500,No log,0.988677,0.730336,0.730819
3000,No log,0.939157,0.729653,0.729707
3500,No log,0.858958,0.731917,0.732475
4000,No log,0.81089,0.733184,0.734008


[32m[I 2023-06-10 01:41:57,055][0m Trial 4 finished with value: 1.467192258080892 and parameters: {'learning_rate': 6.381587463469347e-06, 'num_train_epochs': 1, 'seed': 0, 'per_device_train_batch_size': 32, 'gradient_accumulation_steps': 3, 'warmup_steps': 204, 'weight_decay': 0.008165108889678311, 'per_device_eval_batch_size': 16}. Best is trial 2 with value: 1.4861161532429588.[0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▄▁▃▂▅▃▆█
eval/f1,▅▁▂▂▄▄▆█
eval/loss,▇▇█▇▆▅▂▁
eval/runtime,▅▄▃▁▄▅█▆
eval/samples_per_second,▄▅▆█▅▄▁▃
eval/steps_per_second,▄▅▆█▅▄▁▃
train/epoch,▁▂▃▄▅▆▇███
train/global_step,▁▂▃▄▅▆▇███
train/learning_rate,▁
train/loss,▁

0,1
eval/accuracy,0.73401
eval/f1,0.73318
eval/loss,0.81089
eval/runtime,101.1945
eval/samples_per_second,960.438
eval/steps_per_second,60.033
train/epoch,1.0
train/global_step,4049.0
train/learning_rate,0.0
train/loss,0.3341


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016916666666414434, max=1.0…

Step,Training Loss,Validation Loss,F1,Accuracy
500,No log,1.25293,0.718447,0.718709


[32m[I 2023-06-10 01:45:15,454][0m Trial 5 pruned. [0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁
train/global_step,▁

0,1
eval/accuracy,0.71871
eval/f1,0.71845
eval/loss,1.25293
eval/runtime,98.5007
eval/samples_per_second,986.704
eval/steps_per_second,61.675
train/epoch,0.06
train/global_step,500.0


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

Step,Training Loss,Validation Loss,F1,Accuracy
500,No log,1.518942,0.714587,0.713595


[32m[I 2023-06-10 01:48:30,658][0m Trial 6 pruned. [0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁
train/global_step,▁

0,1
eval/accuracy,0.71359
eval/f1,0.71459
eval/loss,1.51894
eval/runtime,98.6977
eval/samples_per_second,984.734
eval/steps_per_second,61.552
train/epoch,0.06
train/global_step,500.0


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01693333333338766, max=1.0)…

Step,Training Loss,Validation Loss,F1,Accuracy
500,No log,1.548297,0.716773,0.716311


[32m[I 2023-06-10 01:50:58,656][0m Trial 7 pruned. [0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁
train/global_step,▁

0,1
eval/accuracy,0.71631
eval/f1,0.71677
eval/loss,1.5483
eval/runtime,51.8206
eval/samples_per_second,1875.53
eval/steps_per_second,58.625
train/epoch,0.06
train/global_step,500.0


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01693333333338766, max=1.0)…

Step,Training Loss,Validation Loss,F1,Accuracy
500,No log,2.097112,0.706798,0.70772


[32m[I 2023-06-10 01:53:30,386][0m Trial 8 pruned. [0m


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁
train/global_step,▁

0,1
eval/accuracy,0.70772
eval/f1,0.7068
eval/loss,2.09711
eval/runtime,100.0294
eval/samples_per_second,971.624
eval/steps_per_second,60.732
train/epoch,0.02
train/global_step,500.0


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

Step,Training Loss,Validation Loss,F1,Accuracy
500,No log,1.893293,0.712568,0.712298


[32m[I 2023-06-10 01:56:25,360][0m Trial 9 pruned. [0m


{'learning_rate': 3.190830237536043e-05, 'num_train_epochs': 1, 'seed': 0, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 1, 'warmup_steps': 240, 'weight_decay': 0.0014452414420046068, 'per_device_eval_batch_size': 16}
{'output_dir': WindowsPath('C:/Users/liyag/OneDrive - mail.tau.ac.il/Desktop/NLP/results'), 'overwrite_output_dir': True, 'greater_is_better': True, 'evaluation_strategy': 'steps', 'do_train': True, 'logging_strategy': 'epoch', 'save_strategy': 'epoch', 'report_to': 'wandb', 'learning_rate': 3.190830237536043e-05, 'num_train_epochs': 1, 'seed': 0, 'per_device_train_batch_size': 16, 'gradient_accumulation_steps': 1, 'warmup_steps': 240, 'weight_decay': 0.0014452414420046068, 'per_device_eval_batch_size': 16}




Step,Training Loss,Validation Loss,F1,Accuracy
500,No log,2.14605,0.712637,0.714336
1000,No log,2.507567,0.707754,0.707267
1500,No log,2.476681,0.707443,0.709027
2000,No log,1.485119,0.697412,0.698377
2500,No log,1.436344,0.70262,0.704942
3000,No log,1.480121,0.712002,0.711949
3500,No log,1.337037,0.699879,0.698871
4000,No log,1.42841,0.702242,0.701372
4500,No log,1.372612,0.705264,0.705281
5000,No log,1.395945,0.702004,0.702503


model was saved


0,1
eval/accuracy,▄▄▃▃▁▄▁▂▂▃▂▃▃▃▃▄▃▃▄▄▃▄▄▅▄▄▄▅▅▅▅▆▆▆▇▇▇███
eval/f1,▄▄▃▃▁▄▁▂▃▃▂▂▃▃▃▄▃▃▄▄▃▄▄▅▄▄▄▅▆▅▅▆▆▆▇▇████
eval/loss,▆▇██▄▄▃▄▃▃▄▃▃▃▃▃▃▃▃▃▃▃▃▂▃▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁
eval/runtime,▄▆▄▇▂▃▅▅▃▁▇▁▇▄▆▁▇▂▄▅▆▃▂▇▃▇▄▇▂▇▃▅▅█▂▂█▆▆▃
eval/samples_per_second,▅▃▅▂▇▆▄▄▆█▂█▂▅▃█▂▇▅▄▃▆▇▂▆▂▅▂▇▂▆▄▄▁▇▇▁▃▃▆
eval/steps_per_second,▅▃▅▂▇▆▄▄▆█▂█▂▅▃█▂▇▅▄▃▆▇▂▆▂▅▂▇▂▆▄▄▁▇▇▁▃▃▆
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,▁
train/loss,▁

0,1
eval/accuracy,0.73475
eval/f1,0.73344
eval/loss,0.76583
eval/runtime,98.3466
eval/samples_per_second,988.249
eval/steps_per_second,61.771
train/epoch,1.0
train/global_step,24298.0
train/learning_rate,0.0
train/loss,0.3683


+---------------------------------------------------------+------------+---------------+
|                         Modules                         | Parameters | Sum of Tensor |
+---------------------------------------------------------+------------+---------------+
|          bert.embeddings.word_embeddings.weight         |  23440896  |  955872.1875  |
|        bert.embeddings.position_embeddings.weight       |   393216   |    4577.082   |
|       bert.embeddings.token_type_embeddings.weight      |    1536    |    13.9256    |
|             bert.embeddings.LayerNorm.weight            |    768     |    652.7019   |
|              bert.embeddings.LayerNorm.bias             |    768     |    31.7363    |
|     bert.encoder.layer.0.attention.self.query.weight    |   589824   |   20028.5293  |
|      bert.encoder.layer.0.attention.self.query.bias     |    768     |    167.1584   |
|      bert.encoder.layer.0.attention.self.key.weight     |   589824   |   19688.5469  |
|       bert.encoder.

f1 value: 0.733591514935646
accuracy value: 0.734841703449908


Training loop

In [43]:
model_names = ["xlnet-base-cased","bert-base-uncased"]
models={}

for model_name in model_names:
    #Initializing the model
    print('=============================================================================')
    print(f'============================== {model_name} ==============================')
    print('=============================================================================')
    this_model=OurAwesomeModel(model_name,dataset)
    print(f'------------------------------ regular {model_name} ------------------------------')
    # load the best parameters of the model
    this_model.load_trained_model()
    print(f'------------------------------ {model_name} weights ------------------------------')
    this_model.show_parameters()
    print(f'------------------------------ {model_name} metrics ------------------------------')
    this_model.evaluate(args)
    print('')
    
    
    #pruning the model
    print(f'------------------------------ pruned {model_name} ------------------------------')
    pruning_model=copy.deepcopy(this_model)
    pruning_model.pruning(0.5)
    pruning_model.train({'output_dir':OUT_PATH, 'num_train_epochs':1})
    pruning_model.save_model('pruned')
    print(f'------------------------------ pruned {model_name} weights ------------------------------')
    pruning_model.show_parameters()
    print(f'------------------------------ pruned {model_name} metrics ------------------------------')
    pruning_model.evaluate(args)
    print('')
    
    
    # quantize of the model
    print(f'------------------------------ half {model_name} ------------------------------')
    half_model=copy.deepcopy(this_model)
    half_model.half()
    half_model.save_model('half')
    print(f'------------------------------ half {model_name} weights ------------------------------')
    half_model.show_parameters()
    print(f'------------------------------ half {model_name} metrics ------------------------------')
    half_model.evaluate(args)
    print('')
    
    
#     print(f'------------------------------ distiled {model_name} ------------------------------')
#     teacher_model = copy.deepcopy(this_model)
#     teacher_model.model.to(device).eval()
#     student_model = OurAwesomeModel('distilbert-base-uncased',dataset)    
#     train_dataset = teacher_model.dataset['train']
#     tokenizer = student_model.tokenizer
#     # Tokenize and encode the text data
#     train_dataset = train_dataset.map(lambda example: tokenizer(example['text'], truncation=True, padding='max_length'), batched=True)
#     # Convert the labels to integers
#     train_dataset = train_dataset.map(lambda example: {'labels': example['labels']}, batched=True)
#     student_model.model = knowladge_distilation_training(student_model.model.to(device), teacher_model.model.to(device), train_dataset)
#     student_model.save_model('student')
#     print(f'------------------------------ student {model_name} weights ------------------------------')
#     student_model.show_parameters()
#     print(f'------------------------------ student {model_name} metrics ------------------------------')
#     student_model.evaluate(args)
    
    
        
    models[model_name]= {'trained model': this_model, 'pruned_model':pruning_model,'half_model':half_model, 'student_model':student_model} 




Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

------------------------------ regular xlnet-base-cased ------------------------------
model was loaded
------------------------------ xlnet-base-cased weights ------------------------------
+-------------------------------------------------+------------+---------------+
|                     Modules                     | Parameters | Sum of Tensor |
+-------------------------------------------------+------------+---------------+
|               transformer.mask_emb              |    768     |     7.254     |
|        transformer.word_embedding.weight        |  24576000  |   1073323.0   |
|          transformer.layer.0.rel_attn.q         |   589824   |   44516.6953  |
|          transformer.layer.0.rel_attn.k         |   589824   |    94528.0    |
|          transformer.layer.0.rel_attn.v         |   589824   |   15092.6914  |
|          transformer.layer.0.rel_attn.o         |   589824   |   16119.0615  |
|          transformer.layer.0.rel_attn.r         |   589824   |   10848.5449  |

f1 value: 0.7080168983557641
accuracy value: 0.7100554578098796

------------------------------ pruned xlnet-base-cased ------------------------------

transformer.layer.0.ff.layer_1:
old_total_weights = -103.30775451660156
new_total_weights = -91.01393127441406

transformer.layer.0.ff.layer_2:
old_total_weights = -84.21345520019531
new_total_weights = -85.44734191894531

transformer.layer.1.ff.layer_1:
old_total_weights = -572.9107666015625
new_total_weights = -556.9412841796875

transformer.layer.1.ff.layer_2:
old_total_weights = 48.340579986572266
new_total_weights = 62.087646484375

transformer.layer.2.ff.layer_1:
old_total_weights = -391.30279541015625
new_total_weights = -354.6177978515625

transformer.layer.2.ff.layer_2:
old_total_weights = -18.800872802734375
new_total_weights = -9.930244445800781

transformer.layer.3.ff.layer_1:
old_total_weights = 245.47509765625
new_total_weights = 186.15032958984375

transformer.layer.3.ff.layer_2:
old_total_weights = -102.96053314208984
ne



Step,Training Loss
500,1.1588
1000,1.1151
1500,1.1495
2000,1.1048
2500,1.0924
3000,1.0977
3500,1.1026
4000,1.071
4500,1.0325
5000,1.0379


model was saved
------------------------------ pruned xlnet-base-cased weights ------------------------------
+-------------------------------------------------+------------+---------------+
|                     Modules                     | Parameters | Sum of Tensor |
+-------------------------------------------------+------------+---------------+
|               transformer.mask_emb              |    768     |     7.254     |
|        transformer.word_embedding.weight        |  24576000  |  1074652.125  |
|          transformer.layer.0.rel_attn.q         |   589824   |   44642.8477  |
|          transformer.layer.0.rel_attn.k         |   589824   |   94493.9531  |
|          transformer.layer.0.rel_attn.v         |   589824   |   15240.1621  |
|          transformer.layer.0.rel_attn.o         |   589824   |   16221.5801  |
|          transformer.layer.0.rel_attn.r         |   589824   |   11345.1953  |
|      transformer.layer.0.rel_attn.r_r_bias      |    768     |    114.6121   |

f1 value: 0.7110512444419043
accuracy value: 0.712349908942186

------------------------------ half xlnet-base-cased ------------------------------
model was saved
------------------------------ half xlnet-base-cased weights ------------------------------
+-------------------------------------------------+------------+---------------+
|                     Modules                     | Parameters | Sum of Tensor |
+-------------------------------------------------+------------+---------------+
|               transformer.mask_emb              |    768     |     7.2539    |
|        transformer.word_embedding.weight        |  24576000  |      inf      |
|          transformer.layer.0.rel_attn.q         |   589824   |    44512.0    |
|          transformer.layer.0.rel_attn.k         |   589824   |      inf      |
|          transformer.layer.0.rel_attn.v         |   589824   |    15096.0    |
|          transformer.layer.0.rel_attn.o         |   589824   |    16120.0    |
|          tran

f1 value: 0.7080762324706387
accuracy value: 0.7101171919210626



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

------------------------------ regular bert-base-uncased ------------------------------
model was loaded
------------------------------ bert-base-uncased weights ------------------------------
+---------------------------------------------------------+------------+---------------+
|                         Modules                         | Parameters | Sum of Tensor |
+---------------------------------------------------------+------------+---------------+
|          bert.embeddings.word_embeddings.weight         |  23440896  |  955872.1875  |
|        bert.embeddings.position_embeddings.weight       |   393216   |    4577.082   |
|       bert.embeddings.token_type_embeddings.weight      |    1536    |    13.9256    |
|             bert.embeddings.LayerNorm.weight            |    768     |    652.7019   |
|              bert.embeddings.LayerNorm.bias             |    768     |    31.7363    |
|     bert.encoder.layer.0.attention.self.query.weight    |   589824   |   20028.5293  |
|     

f1 value: 0.733591514935646
accuracy value: 0.734841703449908

------------------------------ pruned bert-base-uncased ------------------------------

bert.encoder.layer.0.attention.self.query:
old_total_weights = 23.72532844543457
new_total_weights = 31.292236328125

bert.encoder.layer.0.attention.self.key:
old_total_weights = 5.339641571044922
new_total_weights = 4.593292236328125

bert.encoder.layer.0.attention.self.value:
old_total_weights = -19.7689151763916
new_total_weights = -14.732784271240234

bert.encoder.layer.0.attention.output.dense:
old_total_weights = -11.822534561157227
new_total_weights = -5.044295787811279

bert.encoder.layer.0.intermediate.dense:
old_total_weights = -78.56645202636719
new_total_weights = -75.34112548828125

bert.encoder.layer.0.output.dense:
old_total_weights = -199.65719604492188
new_total_weights = -181.06861877441406

bert.encoder.layer.1.attention.self.query:
old_total_weights = -172.47718811035156
new_total_weights = -147.8240203857422

bert.en



Step,Training Loss
500,0.7383
1000,0.7519
1500,0.8124
2000,0.7752
2500,0.7881
3000,0.7911
3500,0.8183
4000,0.7992
4500,0.7903
5000,0.7871


model was saved
------------------------------ pruned bert-base-uncased weights ------------------------------
+---------------------------------------------------------+------------+---------------+
|                         Modules                         | Parameters | Sum of Tensor |
+---------------------------------------------------------+------------+---------------+
|          bert.embeddings.word_embeddings.weight         |  23440896  |   957280.625  |
|        bert.embeddings.position_embeddings.weight       |   393216   |   4589.5972   |
|       bert.embeddings.token_type_embeddings.weight      |    1536    |    14.4349    |
|             bert.embeddings.LayerNorm.weight            |    768     |    652.8584   |
|              bert.embeddings.LayerNorm.bias             |    768     |    31.8757    |
|      bert.encoder.layer.0.attention.self.query.bias     |    768     |    167.1329   |
|     bert.encoder.layer.0.attention.self.query.weight    |   589824   |   17511.2344  |

f1 value: 0.7393312876568089
accuracy value: 0.7400582358448827

------------------------------ half bert-base-uncased ------------------------------
model was saved
------------------------------ half bert-base-uncased weights ------------------------------
+---------------------------------------------------------+------------+---------------+
|                         Modules                         | Parameters | Sum of Tensor |
+---------------------------------------------------------+------------+---------------+
|          bert.embeddings.word_embeddings.weight         |  23440896  |      inf      |
|        bert.embeddings.position_embeddings.weight       |   393216   |     4576.0    |
|       bert.embeddings.token_type_embeddings.weight      |    1536    |    13.9219    |
|             bert.embeddings.LayerNorm.weight            |    768     |     652.5     |
|              bert.embeddings.LayerNorm.bias             |    768     |    31.7344    |
|     bert.encoder.layer.0.at

f1 value: 0.733601302532763
accuracy value: 0.7348519924684385

