# Contrastive Learning From Scratch - DistilBERT

An attempt to build a contrastive learning model from scratch. Parts include:

- Loading and preparing Wiki-1M data for model input
- Contrastive learning model
  - Forward pass using a pre-trained model
  - Contrastive layer
  - Calculate loss
- Training procedure
  - Default trainer optimizer
  - Default trainer hyper-parameters

In [1]:
import os

# Set Project home
# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# the repository is checked out at this exact location.
PROJECT_HOME = os.path.join('/',
                            'notebooks',
                            'contrastive-learning-in-distilled-models')
# Make the project root the kernel's working directory so the relative paths
# used later ('data/...', 'output', 'logs/...') resolve against it.
%cd {PROJECT_HOME}

# Load project code
# autoreload mode 2: re-import edited modules before each cell execution.
%reload_ext autoreload
%autoreload 2

import sys
# Put the project's src/ directory first on the import path so the project
# package imported right after this can be found (presumably `distilface`
# lives there -- confirm).
sys.path.insert(0, f'{PROJECT_HOME}/src')

import distilface

/notebooks/contrastive-learning-in-distilled-models


In [2]:
def seed_everything(seed: int):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and every CUDA device), exports ``PYTHONHASHSEED`` and switches
    cuDNN to deterministic behaviour.

    Args:
        seed: Seed value applied to all generators.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    # Python reads PYTHONHASHSEED at interpreter start-up, so this only
    # influences processes spawned after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not just the current one; silently ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the deterministic setting above; keep it off.
    torch.backends.cudnn.benchmark = False


seed_everything(42)

In [3]:
# Hyperparameters

pooling_method = 'avg_last4'  # default `pooler_type` of DistilBertCLModel (handed to Pooler)
batch_size = 128              # per-device train and eval batch size
learning_rate = 1e-5          # passed to TrainingArguments
epochs = 1                    # passed as num_train_epochs; the Trainer log reports that max_steps overrides it
max_steps = 30_000            # NOTE(review): every training cell reassigns this before reading it
temperature = 0.05            # default `temp` of DistilBertCLModel (handed to Similarity)
max_len = 32                  # tokenizer max_length; longer sentences are truncated
fp16 = False                  # passed to TrainingArguments(fp16=...)

## 1. Loading and Preparing Wiki-1M data

Use the Hugging Face `datasets` library to load the training data from a local text file.

In [4]:
import numpy as np

from datasets import load_dataset

# Local plain-text corpus exposed as a single 'train' split
# (presumably one sentence per line -- confirm against the file).
data_files = {'train': 'data/training/wiki1m_for_simcse.txt'}
# Alternative corpus, currently disabled (presumably a small subset for quick runs):
# data_files = {'train': 'data/training/wiki5k.txt'}
# NOTE(review): the variable shares its name with the `datasets` library,
# which is easy to misread further down.
datasets = load_dataset('text', data_files=data_files, cache_dir='./data/')

Using custom data configuration default-235a9b97f9ebd10e
Reusing dataset text (./data/text/default-235a9b97f9ebd10e/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8)


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# Unsupervised / Self-supervised dataset

column_names = datasets["train"].column_names
# Both members of each pair are read from the same (first) column, i.e. every
# sentence is paired with itself.
# NOTE(review): presumably the two copies differ only through dropout noise in
# the encoder (SimCSE-style) -- confirm this is the intended setup.
sent0_cname = column_names[0]
sent1_cname = column_names[0]

In [6]:
from transformers import AutoTokenizer

# Tokenizer used by prepare_features to encode the training sentences.
# NOTE(review): the model cell later rebinds `tokenizer` from the same
# checkpoint name, so one of the two loads is redundant.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [7]:
def prepare_features(examples):
    """Tokenize a batch of sentence pairs for contrastive training.

    Args:
        examples: Batch mapping column name -> list of values, as supplied by
            ``Dataset.map(..., batched=True)``. The columns named by
            ``sent0_cname`` and ``sent1_cname`` are read; the batch itself is
            not modified.

    Returns:
        dict mapping each tokenizer output key (e.g. ``input_ids``) to a list
        with one entry per example, each entry being
        ``[sent0_features, sent1_features]``.
    """
    # Replace missing sentences with a single space so the tokenizer never
    # receives None; build new lists instead of mutating the incoming batch.
    first = [s if s is not None else " " for s in examples[sent0_cname]]
    second = [s if s is not None else " " for s in examples[sent1_cname]]
    total = len(first)

    # Tokenize both halves in one call: positions [0, total) hold sent0 and
    # positions [total, 2 * total) hold sent1.
    sent_features = tokenizer(
        first + second,
        max_length=max_len,
        truncation=True,
        # Pad to the fixed max_len rather than to the longest sentence of this
        # particular map batch: otherwise rows produced by different map
        # batches may have different lengths and cannot be stacked into one
        # tensor by default_data_collator.
        padding='max_length',
    )

    # Re-pair the flat tokenizer output: example i -> [sent0_i, sent1_i].
    return {
        key: [[values[i], values[i + total]] for i in range(total)]
        for key, values in sent_features.items()
    }

In [8]:
# Tokenize the whole corpus in batches and drop the raw text column(s) so that
# only the tokenizer outputs remain. Multi-process mapping (num_proc) is
# currently left disabled.
train_dataset = datasets["train"].map(prepare_features,
                                      batched=True,
                                    #   num_proc=24,
                                      remove_columns=column_names)

Loading cached processed dataset at ./data/text/default-235a9b97f9ebd10e/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-305eab4ce3c7d88c.arrow


In [9]:
# Sanity check: number of examples left after preprocessing.
train_dataset.num_rows

1000000

## 2. Contrastive Learning Model

In [10]:
import torch
import torch.nn as nn

from transformers import AutoTokenizer, DistilBertModel, DistilBertPreTrainedModel, AutoConfig
from transformers.modeling_outputs import SequenceClassifierOutput, BaseModelOutputWithPooling

from distilface.modules.pooler import Pooler
from distilface.modules.similarity import Similarity


class DistilBertCLModel(DistilBertPreTrainedModel):
    """DistilBERT encoder with a contrastive-learning training objective.

    In training mode ``forward`` expects paired inputs and returns the
    contrastive loss; in eval mode it returns pooled sentence embeddings.

    Args:
        config: DistilBERT configuration used to build the encoder.
        pooler_type: Pooling strategy name handed to ``Pooler``.
        temp: Temperature handed to ``Similarity``.
    """

    def __init__(self, config, pooler_type=pooling_method, temp=temperature):
        super().__init__(config)

        self.config = config
        self.pooler_type = pooler_type
        # Record the value actually passed in (the original stored the
        # notebook-level `temperature` global and ignored the argument).
        self.temp = temp

        self.distilbert = DistilBertModel(config)
        self.pooler = Pooler(pooler_type)
        self.sim = Similarity(temp=temp)

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None):
        """Dispatch on the module's mode.

        Returns the contrastive training output while ``self.training`` is
        set, and sentence embeddings otherwise.
        """
        if self.training:
            return self.cl_forward(self.distilbert, input_ids, attention_mask)
        else:
            return self.sent_emb(self.distilbert, input_ids, attention_mask)

    def cl_forward(self, encoder, input_ids=None, attention_mask=None):
        """Compute the contrastive loss for a batch of sentence pairs.

        Args:
            encoder: Transformer encoder to run (``self.distilbert`` when
                called from ``forward``).
            input_ids: Token ids of shape (bs, num_sent, len), num_sent == 2.
            attention_mask: Attention mask with the same shape as
                ``input_ids``.

        Returns:
            SequenceClassifierOutput carrying the loss, the similarity matrix
            as ``logits`` and the encoder's hidden states / attentions.
        """
        batch_size = input_ids.size(0)
        num_sent = input_ids.size(1)  # Number of sentences in one instance: 2 sentences

        # Flatten all input tensors so both sentences of every pair go through
        # the encoder in a single pass.
        input_ids = input_ids.view((-1, input_ids.size(-1)))  # (bs * num_sent, len)
        attention_mask = attention_mask.view((-1, attention_mask.size(-1)))  # (bs * num_sent, len)

        # Pre-trained Model Encoder
        outputs = encoder(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )

        # Pooling
        pooler_output = self.pooler(attention_mask, outputs)
        pooler_output = pooler_output.view((batch_size, num_sent, pooler_output.size(-1)))  # (bs, num_sent, hidden)

        # Separate representation
        z1, z2 = pooler_output[:, 0], pooler_output[:, 1]

        # Similarity of every first sentence against every second sentence
        # (presumably temperature-scaled cosine similarity of shape (bs, bs)
        # -- confirm in distilface.modules.similarity).
        cos_sim = self.sim(z1.unsqueeze(1), z2.unsqueeze(0))

        # Calculate contrastive loss: row i must pick column i (its own
        # partner); all other columns of the batch act as negatives.
        criterion = nn.CrossEntropyLoss()
        # Create the labels directly on the logits' device instead of building
        # them on the CPU and copying them over.
        labels = torch.arange(cos_sim.size(0), dtype=torch.long, device=cos_sim.device)
        loss = criterion(cos_sim, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=cos_sim,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def sent_emb(self, encoder, input_ids=None, attention_mask=None):
        """Encode sentences and return their pooled embeddings.

        Args:
            encoder: Transformer encoder to run.
            input_ids: Token ids passed straight to the encoder.
            attention_mask: Attention mask matching ``input_ids``.

        Returns:
            BaseModelOutputWithPooling whose ``pooler_output`` holds the
            pooled sentence representation.
        """
        outputs = encoder(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )
        pooler_output = self.pooler(attention_mask, outputs)

        return BaseModelOutputWithPooling(
            pooler_output=pooler_output,
            last_hidden_state=outputs.last_hidden_state,
            hidden_states=outputs.hidden_states,
        )


pretrained_model_name = 'distilbert-base-uncased'

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config = AutoConfig.from_pretrained(pretrained_model_name)

# Load the pre-trained checkpoint into the contrastive wrapper and move it to
# the selected device.
model = DistilBertCLModel.from_pretrained(pretrained_model_name, config=config).to(device)
# NOTE(review): rebinds the `tokenizer` created in an earlier cell from the
# same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

# Start in eval mode, where forward() returns sentence embeddings; the
# training cells switch to train mode. The trailing `;` hides the cell output.
model.eval();

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertCLModel: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertCLModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertCLModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### 2.1 Initial DistilBERT embeddings performance

In [11]:
import senteval


def prepare(params, samples):
    """No-op preparation hook handed to the SentEval engine; nothing to set up."""
    return None

def batcher(params, batch):
    """Turn one SentEval batch into pooled sentence embeddings.

    Args:
        params: SentEval parameters (not used here).
        batch: Iterable of sentences, each given as a sequence of string
            tokens.

    Returns:
        The model's ``pooler_output`` for the batch, moved to the CPU.
    """
    # The sentences arrive pre-split; glue the tokens back into plain strings.
    texts = []
    for tokens in batch:
        texts.append(" ".join(tokens))

    encoded = tokenizer.batch_encode_plus(texts, return_tensors="pt", padding=True)
    # Move every input tensor to the notebook-level `device`.
    model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**model_inputs)

    return outputs.pooler_output.cpu()


def evaluate_model():
    """Run the SentEval STS tasks on the current model and print Spearman scores.

    Returns:
        The raw results dict produced by ``senteval.engine.SE.eval``.
    """
    data_root = "./data"
    engine_params = {"task_path": data_root, "usepytorch": True, "kfold": 10}
    task_names = ["STSBenchmark", 'STS12', 'STS13', 'STS14', 'STS15']

    results = senteval.engine.SE(engine_params, batcher, prepare).eval(task_names)

    # STS12-STS15 report the aggregated Spearman score over all sub-tasks.
    for name in task_names[1:]:
        print(f'{name}: ', results[name]["all"]["spearman"]["all"])
    # STS Benchmark reports the Spearman correlation on its test split.
    print('STSB: ', results["STSBenchmark"]["test"]["spearman"][0])

    return results

## 3. Trainer

In [12]:
# Running total of optimization steps requested across the training cells.
# NOTE(review): re-running this cell resets the counter, and re-running a
# training cell counts its steps twice.
total_max_steps = 0

In [13]:
# import mlflow
import pandas as pd
from transformers import Trainer, TrainingArguments
from transformers import default_data_collator

# Training stage: fine-tune for `max_steps` optimizer steps, then evaluate on STS.
# NOTE(review): this cell is repeated further down for every stage with only
# `max_steps` changed; a helper taking `max_steps` would remove the duplication.
max_steps = 20_000
training_args = TrainingArguments(
    output_dir='output',
    overwrite_output_dir=True,
    learning_rate=learning_rate,
    num_train_epochs=epochs,  # overridden by max_steps (see the Trainer log output)
    max_steps=max_steps,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_strategy='steps',
    save_steps=max_steps,  # a single checkpoint, written at the final step
    fp16=fp16,
)

# Train mode makes forward() take the contrastive-loss path.
model.train()

# A new Trainer is built for this stage around the notebook's `model` object.
# NOTE(review): presumably the optimizer and LR schedule start afresh with
# each new Trainer -- confirm that is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=default_data_collator
)

train_result = trainer.train()

# Accumulate the steps requested so far; the total tags the log file name.
# NOTE(review): the file name always ends in `_fp16`, whatever the `fp16` flag is.
total_max_steps += max_steps
pd.DataFrame(trainer.state.log_history).to_csv(f'logs/cl-distilled_tuning_v1_ms{total_max_steps}_fp16.csv', index=False)
print("total max steps", total_max_steps)

# Back to eval mode so forward() returns sentence embeddings for SentEval.
model.eval()

results = evaluate_model()
results

max_steps is given, it will override any value given in num_train_epochs
***** Running training *****
  Num examples = 1000000
  Num Epochs = 3
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 20000


Step,Training Loss
500,0.0319
1000,0.0004
1500,0.0002
2000,0.0002
2500,0.0002
3000,0.0002
3500,0.0001
4000,0.0002
4500,0.0002
5000,0.0002


Saving model checkpoint to output/checkpoint-20000
Configuration saved in output/checkpoint-20000/config.json
Model weights saved in output/checkpoint-20000/pytorch_model.bin
tokenizer config file saved in output/checkpoint-20000/tokenizer_config.json
Special tokens file saved in output/checkpoint-20000/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




total max steps 20000




STS12:  0.637212533218761
STS13:  0.7529444307360512
STS14:  0.6948705973694814
STS15:  0.7822498620872882
STSB:  0.7379671965388565


{'STSBenchmark': {'train': {'pearson': (0.7678508672906414, 0.0),
   'spearman': SpearmanrResult(correlation=0.7492511484181752, pvalue=0.0),
   'nsamples': 5749},
  'dev': {'pearson': (0.775584840004512, 1.3155644919051673e-301),
   'spearman': SpearmanrResult(correlation=0.7807088690424137, pvalue=3.469885327224e-308),
   'nsamples': 1500},
  'test': {'pearson': (0.7427095835894062, 4.182162470396313e-242),
   'spearman': SpearmanrResult(correlation=0.7379671965388565, pvalue=1.864630832097609e-237),
   'nsamples': 1379},
  'all': {'pearson': {'all': 0.76159330329887,
    'mean': 0.7620484302948531,
    'wmean': 0.7651771455529042},
   'spearman': {'all': 0.7558693547098442,
    'mean': 0.7559757379998152,
    'wmean': 0.7529166573767724}}},
 'STS12': {'MSRpar': {'pearson': (0.6310103604410828, 1.5347949579921226e-84),
   'spearman': SpearmanrResult(correlation=0.6302719731616713, pvalue=2.7397613594588785e-84),
   'nsamples': 750},
  'MSRvid': {'pearson': (0.8242715697241659, 5.8464

In [14]:
# Training stage: continue fine-tuning the same `model` for another
# `max_steps` optimizer steps, then evaluate on STS.
# NOTE(review): copy of the previous training cell with only `max_steps` changed.
max_steps = 30_000
training_args = TrainingArguments(
    output_dir='output',
    overwrite_output_dir=True,
    learning_rate=learning_rate,
    num_train_epochs=epochs,  # overridden by max_steps (see the Trainer log output)
    max_steps=max_steps,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_strategy='steps',
    save_steps=max_steps,  # a single checkpoint, written at the final step
    fp16=fp16,
)

# Train mode makes forward() take the contrastive-loss path.
model.train()

# A new Trainer wraps the same `model` object, so the weights carry over from
# the previous stage.
# NOTE(review): presumably the optimizer and LR schedule start afresh with
# each new Trainer -- confirm that is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=default_data_collator
)

train_result = trainer.train()

# Accumulate the steps requested so far; the total tags the log file name.
# NOTE(review): the file name always ends in `_fp16`, whatever the `fp16` flag is.
total_max_steps += max_steps
pd.DataFrame(trainer.state.log_history).to_csv(f'logs/cl-distilled_tuning_v1_ms{total_max_steps}_fp16.csv', index=False)
print("total max steps", total_max_steps)

# Back to eval mode so forward() returns sentence embeddings for SentEval.
model.eval()

results = evaluate_model()
results

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
max_steps is given, it will override any value given in num_train_epochs
***** Running training *****
  Num examples = 1000000
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 30000


Step,Training Loss
500,0.0001
1000,0.0002
1500,0.0001
2000,0.0001
2500,0.0001
3000,0.0001
3500,0.0001
4000,0.0001
4500,0.0001
5000,0.0001


Saving model checkpoint to output/checkpoint-30000
Configuration saved in output/checkpoint-30000/config.json
Model weights saved in output/checkpoint-30000/pytorch_model.bin
tokenizer config file saved in output/checkpoint-30000/tokenizer_config.json
Special tokens file saved in output/checkpoint-30000/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




total max steps 50000




STS12:  0.6260459277914104
STS13:  0.7471598657856633
STS14:  0.6921465422817232
STS15:  0.7734346729911253
STSB:  0.7325411235402595


{'STSBenchmark': {'train': {'pearson': (0.7597768653216086, 0.0),
   'spearman': SpearmanrResult(correlation=0.7433067533104418, pvalue=0.0),
   'nsamples': 5749},
  'dev': {'pearson': (0.7651189205870126, 1.069200511524517e-288),
   'spearman': SpearmanrResult(correlation=0.7708146867517126, pvalue=1.2261069638036015e-295),
   'nsamples': 1500},
  'test': {'pearson': (0.7377013499431754, 3.374561396902202e-237),
   'spearman': SpearmanrResult(correlation=0.7325411235402595, pvalue=2.9334018529269246e-232),
   'nsamples': 1379},
  'all': {'pearson': {'all': 0.7533916606101079,
    'mean': 0.7541990452839321,
    'wmean': 0.7571772996275019},
   'spearman': {'all': 0.74941510975312,
    'mean': 0.7488875212008047,
    'wmean': 0.7463684242317242}}},
 'STS12': {'MSRpar': {'pearson': (0.6325861887023838, 4.4327937715219186e-85),
   'spearman': SpearmanrResult(correlation=0.6332742743951332, pvalue=2.57144928382386e-85),
   'nsamples': 750},
  'MSRvid': {'pearson': (0.8088841079831236, 1.0

In [15]:
# Training stage: continue fine-tuning the same `model` for another
# `max_steps` optimizer steps, then evaluate on STS.
# NOTE(review): copy of the previous training cell with only `max_steps` changed.
max_steps = 50_000
training_args = TrainingArguments(
    output_dir='output',
    overwrite_output_dir=True,
    learning_rate=learning_rate,
    num_train_epochs=epochs,  # overridden by max_steps (see the Trainer log output)
    max_steps=max_steps,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_strategy='steps',
    save_steps=max_steps,  # a single checkpoint, written at the final step
    fp16=fp16,
)

# Train mode makes forward() take the contrastive-loss path.
model.train()

# A new Trainer wraps the same `model` object, so the weights carry over from
# the previous stage.
# NOTE(review): presumably the optimizer and LR schedule start afresh with
# each new Trainer -- confirm that is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=default_data_collator
)

train_result = trainer.train()

# Accumulate the steps requested so far; the total tags the log file name.
# NOTE(review): the file name always ends in `_fp16`, whatever the `fp16` flag is.
total_max_steps += max_steps
pd.DataFrame(trainer.state.log_history).to_csv(f'logs/cl-distilled_tuning_v1_ms{total_max_steps}_fp16.csv', index=False)
print("total max steps", total_max_steps)

# Back to eval mode so forward() returns sentence embeddings for SentEval.
model.eval()

results = evaluate_model()
results

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
max_steps is given, it will override any value given in num_train_epochs
***** Running training *****
  Num examples = 1000000
  Num Epochs = 7
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 50000


Step,Training Loss
500,0.0
1000,0.0001
1500,0.0001
2000,0.0
2500,0.0001
3000,0.0
3500,0.0001
4000,0.0
4500,0.0001
5000,0.0001


Saving model checkpoint to output/checkpoint-50000
Configuration saved in output/checkpoint-50000/config.json
Model weights saved in output/checkpoint-50000/pytorch_model.bin
tokenizer config file saved in output/checkpoint-50000/tokenizer_config.json
Special tokens file saved in output/checkpoint-50000/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




total max steps 100000




STS12:  0.614372104178948
STS13:  0.7505327524875732
STS14:  0.6939203341472341
STS15:  0.7706971269812799
STSB:  0.7267087721527442


{'STSBenchmark': {'train': {'pearson': (0.7546207956142396, 0.0),
   'spearman': SpearmanrResult(correlation=0.7402684027330102, pvalue=0.0),
   'nsamples': 5749},
  'dev': {'pearson': (0.7543053436876465, 4.759019317691326e-276),
   'spearman': SpearmanrResult(correlation=0.7610751302082767, pvalue=6.884404346841527e-284),
   'nsamples': 1500},
  'test': {'pearson': (0.7304971106970108, 2.4664327970295807e-230),
   'spearman': SpearmanrResult(correlation=0.7267087721527442, pvalue=8.182553779852748e-227),
   'nsamples': 1379},
  'all': {'pearson': {'all': 0.7464215366701639,
    'mean': 0.7464744166662989,
    'wmean': 0.7507103019435455},
   'spearman': {'all': 0.7449506834692334,
    'mean': 0.7426841016980105,
    'wmean': 0.7417184908928054}}},
 'STS12': {'MSRpar': {'pearson': (0.6360962884618206, 2.7160149581685924e-86),
   'spearman': SpearmanrResult(correlation=0.6317024602046954, pvalue=8.903153079443301e-85),
   'nsamples': 750},
  'MSRvid': {'pearson': (0.800651285125258, 1.

In [16]:
# Training stage: continue fine-tuning the same `model` for another
# `max_steps` optimizer steps, then evaluate on STS.
# NOTE(review): copy of the previous training cell with only `max_steps` changed.
max_steps = 100_000
training_args = TrainingArguments(
    output_dir='output',
    overwrite_output_dir=True,
    learning_rate=learning_rate,
    num_train_epochs=epochs,  # overridden by max_steps (see the Trainer log output)
    max_steps=max_steps,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_strategy='steps',
    save_steps=max_steps,  # a single checkpoint, written at the final step
    fp16=fp16,
)

# Train mode makes forward() take the contrastive-loss path.
model.train()

# A new Trainer wraps the same `model` object, so the weights carry over from
# the previous stage.
# NOTE(review): presumably the optimizer and LR schedule start afresh with
# each new Trainer -- confirm that is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=default_data_collator
)

train_result = trainer.train()

# Accumulate the steps requested so far; the total tags the log file name.
# NOTE(review): the file name always ends in `_fp16`, whatever the `fp16` flag is.
total_max_steps += max_steps
pd.DataFrame(trainer.state.log_history).to_csv(f'logs/cl-distilled_tuning_v1_ms{total_max_steps}_fp16.csv', index=False)
print("total max steps", total_max_steps)

# Back to eval mode so forward() returns sentence embeddings for SentEval.
model.eval()

results = evaluate_model()
results

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
max_steps is given, it will override any value given in num_train_epochs
***** Running training *****
  Num examples = 1000000
  Num Epochs = 13
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 100000


Step,Training Loss
500,0.0001
1000,0.0001
1500,0.0001
2000,0.0001
2500,0.0001
3000,0.0
3500,0.0001
4000,0.0
4500,0.0001
5000,0.0001


Saving model checkpoint to output/checkpoint-100000
Configuration saved in output/checkpoint-100000/config.json
Model weights saved in output/checkpoint-100000/pytorch_model.bin
tokenizer config file saved in output/checkpoint-100000/tokenizer_config.json
Special tokens file saved in output/checkpoint-100000/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




total max steps 200000




STS12:  0.6069538714967191
STS13:  0.735732133116423
STS14:  0.6837063208271212
STS15:  0.7595820749185737
STSB:  0.7067137124736896


{'STSBenchmark': {'train': {'pearson': (0.7391279125284309, 0.0),
   'spearman': SpearmanrResult(correlation=0.7266625058019277, pvalue=0.0),
   'nsamples': 5749},
  'dev': {'pearson': (0.7341071135717661, 4.1812006539743096e-254),
   'spearman': SpearmanrResult(correlation=0.7452231239442958, pvalue=6.33589630222159e-266),
   'nsamples': 1500},
  'test': {'pearson': (0.7100977627151148, 4.7411887633193927e-212),
   'spearman': SpearmanrResult(correlation=0.7067137124736896, pvalue=3.5961504835061294e-209),
   'nsamples': 1379},
  'all': {'pearson': {'all': 0.7283480725963465,
    'mean': 0.7277775962717706,
    'wmean': 0.7336151894144346},
   'spearman': {'all': 0.7306278223802867,
    'mean': 0.726199780739971,
    'wmean': 0.7267009319973278}}},
 'STS12': {'MSRpar': {'pearson': (0.640095434362673, 1.0786759437182156e-87),
   'spearman': SpearmanrResult(correlation=0.6270819809327377, pvalue=3.289724721683588e-83),
   'nsamples': 750},
  'MSRvid': {'pearson': (0.7656451162171457, 1.

In [17]:
# Training stage: continue fine-tuning the same `model` for another
# `max_steps` optimizer steps, then evaluate on STS.
# NOTE(review): copy of the previous training cell with only `max_steps` changed.
# NOTE(review): the saved outputs show the STS Benchmark test score dropping
# from stage to stage, so longer training does not appear to help here.
max_steps = 300_000
training_args = TrainingArguments(
    output_dir='output',
    overwrite_output_dir=True,
    learning_rate=learning_rate,
    num_train_epochs=epochs,  # overridden by max_steps (see the Trainer log output)
    max_steps=max_steps,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_strategy='steps',
    save_steps=max_steps,  # a single checkpoint, written at the final step
    fp16=fp16,
)

# Train mode makes forward() take the contrastive-loss path.
model.train()

# A new Trainer wraps the same `model` object, so the weights carry over from
# the previous stage.
# NOTE(review): presumably the optimizer and LR schedule start afresh with
# each new Trainer -- confirm that is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=default_data_collator
)

train_result = trainer.train()

# Accumulate the steps requested so far; the total tags the log file name.
# NOTE(review): the file name always ends in `_fp16`, whatever the `fp16` flag is.
total_max_steps += max_steps
pd.DataFrame(trainer.state.log_history).to_csv(f'logs/cl-distilled_tuning_v1_ms{total_max_steps}_fp16.csv', index=False)
print("total max steps", total_max_steps)

# Back to eval mode so forward() returns sentence embeddings for SentEval.
model.eval()

results = evaluate_model()
results

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
max_steps is given, it will override any value given in num_train_epochs
***** Running training *****
  Num examples = 1000000
  Num Epochs = 39
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 300000


Step,Training Loss
500,0.0001
1000,0.0001
1500,0.0001
2000,0.0
2500,0.0
3000,0.0
3500,0.0001
4000,0.0
4500,0.0
5000,0.0


Saving model checkpoint to output/checkpoint-300000
Configuration saved in output/checkpoint-300000/config.json
Model weights saved in output/checkpoint-300000/pytorch_model.bin
tokenizer config file saved in output/checkpoint-300000/tokenizer_config.json
Special tokens file saved in output/checkpoint-300000/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




total max steps 500000




STS12:  0.6112652270539377
STS13:  0.7080050714833392
STS14:  0.6693391541151468
STS15:  0.7697527028965817
STSB:  0.7026194465206369


{'STSBenchmark': {'train': {'pearson': (0.735939060476371, 0.0),
   'spearman': SpearmanrResult(correlation=0.7181388507748124, pvalue=0.0),
   'nsamples': 5749},
  'dev': {'pearson': (0.7356142119514139, 1.132407978585887e-255),
   'spearman': SpearmanrResult(correlation=0.7415878249449327, pvalue=5.422727242420206e-262),
   'nsamples': 1500},
  'test': {'pearson': (0.7094473764591789, 1.7084951366985236e-211),
   'spearman': SpearmanrResult(correlation=0.7026194465206369, pvalue=9.671932538309588e-206),
   'nsamples': 1379},
  'all': {'pearson': {'all': 0.7265588466540083,
    'mean': 0.7270002162956546,
    'wmean': 0.7316484595205129},
   'spearman': {'all': 0.7234595172546394,
    'mean': 0.7207820407467941,
    'wmean': 0.719735072702104}}},
 'STS12': {'MSRpar': {'pearson': (0.6688717310619451, 2.0117595776782192e-98),
   'spearman': SpearmanrResult(correlation=0.6392453570529878, pvalue=2.1500369868459414e-87),
   'nsamples': 750},
  'MSRvid': {'pearson': (0.7528498271128174, 5.

In [None]:
# Training stage: continue fine-tuning the same `model` for another
# `max_steps` optimizer steps, then evaluate on STS.
# NOTE(review): copy of the previous training cell with only `max_steps` changed.
# NOTE(review): the saved output of this cell stops during training (no
# checkpoint message, no scores), so this stage appears not to have completed.
max_steps = 500_000
training_args = TrainingArguments(
    output_dir='output',
    overwrite_output_dir=True,
    learning_rate=learning_rate,
    num_train_epochs=epochs,  # overridden by max_steps (see the Trainer log output)
    max_steps=max_steps,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_strategy='steps',
    save_steps=max_steps,  # checkpoint only at the final step
    fp16=fp16,
)

# Train mode makes forward() take the contrastive-loss path.
model.train()

# A new Trainer wraps the same `model` object, so the weights carry over from
# the previous stage.
# NOTE(review): presumably the optimizer and LR schedule start afresh with
# each new Trainer -- confirm that is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=default_data_collator
)

train_result = trainer.train()

# Accumulate the steps requested so far; the total tags the log file name.
# NOTE(review): the file name always ends in `_fp16`, whatever the `fp16` flag is.
total_max_steps += max_steps
pd.DataFrame(trainer.state.log_history).to_csv(f'logs/cl-distilled_tuning_v1_ms{total_max_steps}_fp16.csv', index=False)
print("total max steps", total_max_steps)

# Back to eval mode so forward() returns sentence embeddings for SentEval.
model.eval()

results = evaluate_model()
results

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
max_steps is given, it will override any value given in num_train_epochs
***** Running training *****
  Num examples = 1000000
  Num Epochs = 64
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 500000


Step,Training Loss
500,0.0001
1000,0.0001
1500,0.0001
2000,0.0001
2500,0.0001
3000,0.0001
3500,0.0001
4000,0.0001
4500,0.0001
5000,0.0001


In [None]:
# Re-run the STS evaluation on the model currently held in memory.
# NOTE(review): no output is saved for this cell, and it does not switch the
# model to eval mode itself -- it relies on the state left by earlier cells.
results = evaluate_model()
results