# Contrastive Learning From Scratch - DistilBERT

An attempt to build contrastive learning model from scratch. Parts include:

- Loading and preparing Wiki-1M data for model input
- Contrastive learning model
  - Forward passing using pre-trained model
  - Constrastive layer
  - Calculate loss
- Training procedure
  - Default trainer optimizer
  - Default trainer hyper-parameters

In [1]:
import os

# Set Project home
PROJECT_HOME = os.path.join('/',
                            'Users',
                            'ng-ka',
                            'OMSCS',
                            'DL',
                            'DLProject',
                            'contrastive-learning-in-distilled-models')
%cd {PROJECT_HOME}

# Load project code
%reload_ext autoreload
%autoreload 2

import sys
sys.path.insert(0, './src')

#import distilface
import src.distilface as distilface

C:\Users\ng-ka\OMSCS\DL\DLProject\contrastive-learning-in-distilled-models


## 1. Loading and Preparing Wiki-1M data

Use huggingface `datasets` library to load local file data.

In [2]:
import numpy as np

from datasets import load_dataset

data_files = {'train': 'data/training/wiki1m_for_simcse.txt'}
# data_files = {'train': 'data/training/wiki5k.txt'}
datasets = load_dataset('text', data_files=data_files)

  from .autonotebook import tqdm as notebook_tqdm
Using custom data configuration default-84caea1147087fa9
Reusing dataset text (C:\Users\ng-ka\.cache\huggingface\datasets\text\default-84caea1147087fa9\0.0.0\4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8)
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 66.67it/s]


In [3]:
# Unsupervised / Self-supervised dataset

column_names = datasets["train"].column_names
sent0_cname = column_names[0]
sent1_cname = column_names[0]

print('column_names:', column_names)
print('sent0_cname:', sent0_cname, '| sent1_cname:', sent1_cname)

column_names: ['text']
sent0_cname: text | sent1_cname: text


In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [5]:
def prepare_features(examples):
    total = len(examples[sent0_cname])

    # Avoid "None" fields 
    for idx in range(total):
        if examples[sent0_cname][idx] is None:
            examples[sent0_cname][idx] = " "
        if examples[sent1_cname][idx] is None:
            examples[sent1_cname][idx] = " "
    
    sentences = examples[sent0_cname] + examples[sent1_cname]

    sent_features = tokenizer(
        sentences,
        max_length=32,
        truncation=True,
        padding=True,
    )

    features = {}
    for key in sent_features:
        features[key] = [[sent_features[key][i], sent_features[key][i+total]] for i in range(total)]

    return features

In [6]:
train_dataset = datasets["train"].map(prepare_features,
                                      batched=True,
                                    #   num_proc=24,
                                      remove_columns=column_names)

Loading cached processed dataset at C:\Users\ng-ka\.cache\huggingface\datasets\text\default-84caea1147087fa9\0.0.0\4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8\cache-1293cb42b0393e2c.arrow


In [7]:
train_dataset.num_rows

1000000

In [8]:
str(train_dataset['input_ids'][0][0])

'[101, 26866, 1999, 2148, 2660, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]'

In [9]:
str(train_dataset['input_ids'][0][1])

'[101, 26866, 1999, 2148, 2660, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]'

In [10]:
train_dataset.features.keys()

dict_keys(['attention_mask', 'input_ids'])

Sentence 1 and Sentence 2 are the same sentence

## 2. Contrastive Learning Model

In [41]:
import torch
import torch.nn as nn

from transformers import AutoTokenizer, DistilBertModel, DistilBertPreTrainedModel, AutoConfig
from transformers.modeling_outputs import SequenceClassifierOutput, BaseModelOutputWithPooling

from src.distilface.modules.pooler import Pooler
from src.distilface.modules.similarity import Similarity

#from torch.cuda.amp import autocast 

#scaler = GradScaler()

class DistilBertCLModel(DistilBertPreTrainedModel):
    def __init__(self, config, pooler_type='avg_first_last', temp=0.001):
        super().__init__(config)

        self.config = config
        self.pooler_type = pooler_type
        self.temp = 0.001

        self.distilbert = DistilBertModel(config)
        self.pooler = Pooler(pooler_type)
        self.sim = Similarity(temp=temp)

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None):
        if self.training:
            return self.cl_forward(self.distilbert, input_ids, attention_mask)
        else:
            return self.sent_emb(self.distilbert, input_ids, attention_mask)

    def cl_forward(self, encoder, input_ids=None, attention_mask=None):
        batch_size = input_ids.size(0)#64#input_ids.size(0)
        num_sent = input_ids.size(1)  # Number of sentences in one instance: 2 sentences

        # Flatten all input tensors
        input_ids = input_ids.view((-1, input_ids.size(-1))) # (bs * num_sent, len)
        attention_mask = attention_mask.view((-1, attention_mask.size(-1))) # (bs * num_sent len)

        # Pre-trained Model Encoder
        outputs = encoder(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )

        # Pooling
        pooler_output = self.pooler(attention_mask, outputs)
        pooler_output = pooler_output.view((batch_size, num_sent, pooler_output.size(-1)))  # (bs, num_sent, hidden)

        # Separate representation
        z1, z2 = pooler_output[:, 0], pooler_output[:, 1]

        # Cosine similarity
        cos_sim = self.sim(z1.unsqueeze(1), z2.unsqueeze(0))

        # Calculate contrastive loss
        criterion = nn.CrossEntropyLoss()
        labels = torch.arange(cos_sim.size(0)).long().to(self.device)
        loss = criterion(cos_sim, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=cos_sim,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def sent_emb(self, encoder, input_ids=None, attention_mask=None):
        outputs = encoder(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )
        pooler_output = self.pooler(attention_mask, outputs)

        return BaseModelOutputWithPooling(
            pooler_output=pooler_output,
            last_hidden_state=outputs.last_hidden_state,
            hidden_states=outputs.hidden_states,
        )


pretrained_model_name = 'distilbert-base-uncased'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config = AutoConfig.from_pretrained(pretrained_model_name)

model = DistilBertCLModel.from_pretrained(pretrained_model_name, config=config).to(device)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

#model.eval();


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\ng-ka/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at C:\Users\ng-ka/.cache\huggingface\transfor

### 2.1 Initial DistilBERT embeddings performance

In [42]:
import senteval
#import SentEval.senteval as senteval
#import SentEval_simcse.senteval as senteval
#import SentEval_simcse.senteval.engine as se_engine


def prepare(params, samples):
    return

def batcher(params, batch):
    sentences = [" ".join(s) for s in batch]
    batch = tokenizer.batch_encode_plus(
        sentences,
        return_tensors="pt",
        padding=True,
    )

    for k in batch:
        batch[k] = batch[k].to(device)

    with torch.no_grad():
        outputs = model(**batch)

    pooled_result = outputs.pooler_output.cpu()

    return pooled_result


def evaluate_model():
    PATH_TO_DATA = "./data"

    params = {"task_path": PATH_TO_DATA, "usepytorch": True, "kfold": 10}
    tasks = ["STSBenchmark", 'STS12', 'STS13', 'STS14', 'STS15']

    se = senteval.engine.SE(params, batcher, prepare)
    #se = se_engine.SE(params, batcher, prepare)
    results = se.eval(tasks)

    print('STS12: ', results["STS12"]["all"]["spearman"]["all"])
    print('STS13: ', results["STS13"]["all"]["spearman"]["all"])
    print('STS14: ', results["STS14"]["all"]["spearman"]["all"])
    print('STS15: ', results["STS15"]["all"]["spearman"]["all"])
    print('STSB: ', results["STSBenchmark"]["test"]["spearman"][0])

    return results

In [17]:
results = evaluate_model()
results

  sent1 = np.array([s.split() for s in sent1])[not_empty_idx]
  sent2 = np.array([s.split() for s in sent2])[not_empty_idx]


STS12:  0.47596608898755144
STS13:  0.618674507613396
STS14:  0.5294342415078909
STS15:  0.6969415778069489
STSB:  0.5905314299135291


{'STSBenchmark': {'train': {'pearson': (0.607072923019217, 0.0),
   'spearman': SpearmanrResult(correlation=0.5955188195020807, pvalue=0.0),
   'nsamples': 5749},
  'dev': {'pearson': (0.6714570536745258, 3.2552287443463876e-197),
   'spearman': SpearmanrResult(correlation=0.6842681833563105, pvalue=1.122076247507796e-207),
   'nsamples': 1500},
  'test': {'pearson': (0.5632499403360769, 2.9511246107835275e-116),
   'spearman': SpearmanrResult(correlation=0.5905314299135291, pvalue=2.169060496705334e-130),
   'nsamples': 1379},
  'all': {'pearson': {'all': 0.6130684398138361,
    'mean': 0.6139266390099399,
    'wmean': 0.6112621097209918},
   'spearman': {'all': 0.6145998749710233,
    'mean': 0.6234394775906401,
    'wmean': 0.6101509979372606}}},
 'STS12': {'MSRpar': {'pearson': (0.42118590394249555, 1.3145639639932755e-33),
   'spearman': SpearmanrResult(correlation=0.45991561737043646, pvalue=1.5821923016995031e-40),
   'nsamples': 750},
  'MSRvid': {'pearson': (0.6352879593428346

## 3. Trainer

In [43]:
import mlflow

from transformers import Trainer, TrainingArguments
from transformers import default_data_collator

training_args = TrainingArguments(
    output_dir='output',
    overwrite_output_dir=True,
    learning_rate=5e-05,
    per_device_train_batch_size= 64,
    per_device_eval_batch_size = 64,
    weight_decay=0.0,
    num_train_epochs=2,
    max_steps= 20000,
    logging_steps=5000,
    save_steps=5000,
    fp16=True
)

model.train()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=default_data_collator
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
max_steps is given, it will override any value given in num_train_epochs
Using amp half precision backend


In [21]:
os.getcwd()

'C:\\Users\\ng-ka\\OMSCS\\DL\\DLProject\\contrastive-learning-in-distilled-models'

In [44]:
model_path = 'trained_model/distilbert_cl'

#train_result = trainer.train(model_path=model_path)
train_result = trainer.train()
torch.save(model, './0_001temp_model.pth')

***** Running training *****
  Num examples = 1000000
  Num Epochs = 2
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 20000


Step,Training Loss
5000,0.0011
10000,0.0001
15000,0.0002
20000,0.0001


Saving model checkpoint to output\checkpoint-5000
Configuration saved in output\checkpoint-5000\config.json
Model weights saved in output\checkpoint-5000\pytorch_model.bin
tokenizer config file saved in output\checkpoint-5000\tokenizer_config.json
Special tokens file saved in output\checkpoint-5000\special_tokens_map.json
Saving model checkpoint to output\checkpoint-10000
Configuration saved in output\checkpoint-10000\config.json
Model weights saved in output\checkpoint-10000\pytorch_model.bin
tokenizer config file saved in output\checkpoint-10000\tokenizer_config.json
Special tokens file saved in output\checkpoint-10000\special_tokens_map.json
Saving model checkpoint to output\checkpoint-15000
Configuration saved in output\checkpoint-15000\config.json
Model weights saved in output\checkpoint-15000\pytorch_model.bin
tokenizer config file saved in output\checkpoint-15000\tokenizer_config.json
Special tokens file saved in output\checkpoint-15000\special_tokens_map.json
Saving model check

## 4. Evaluate DistilBert CL Model performance

In [45]:
model.eval()

results = evaluate_model()
results

  sent1 = np.array([s.split() for s in sent1])[not_empty_idx]
  sent2 = np.array([s.split() for s in sent2])[not_empty_idx]


STS12:  0.44181732871886
STS13:  0.5550227020734463
STS14:  0.4927035606920931
STS15:  0.6338052684486225
STSB:  0.5381977129616206


{'STSBenchmark': {'train': {'pearson': (0.5633161573597276, 0.0),
   'spearman': SpearmanrResult(correlation=0.555619143532377, pvalue=0.0),
   'nsamples': 5749},
  'dev': {'pearson': (0.6278288747780895, 3.140816697763277e-165),
   'spearman': SpearmanrResult(correlation=0.6500256591318583, pvalue=7.663657392517889e-181),
   'nsamples': 1500},
  'test': {'pearson': (0.5100715292980883, 3.311410990404457e-92),
   'spearman': SpearmanrResult(correlation=0.5381977129616206, pvalue=2.170536918669308e-104),
   'nsamples': 1379},
  'all': {'pearson': {'all': 0.5651793669725098,
    'mean': 0.5670721871453018,
    'wmean': 0.5660218520781493},
   'spearman': {'all': 0.5716460142352061,
    'mean': 0.5812808385419519,
    'wmean': 0.569247518664754}}},
 'STS12': {'MSRpar': {'pearson': (0.3636008874177728, 7.385776535127104e-25),
   'spearman': SpearmanrResult(correlation=0.4087784845408527, pvalue=1.4199592757653228e-31),
   'nsamples': 750},
  'MSRvid': {'pearson': (0.5955179687482708, 3.584

In [15]:
mlflow.end_run()

# temp 0.01

In [16]:
# import torch
# import torch.nn as nn

# from transformers import AutoTokenizer, DistilBertModel, DistilBertPreTrainedModel, AutoConfig
# from transformers.modeling_outputs import SequenceClassifierOutput, BaseModelOutputWithPooling

# from src.distilface.modules.pooler import Pooler
# from src.distilface.modules.similarity import Similarity

#from torch.cuda.amp import autocast 

#scaler = GradScaler()

class DistilBertCLModel(DistilBertPreTrainedModel):
    def __init__(self, config, pooler_type='avg_first_last', temp=0.01):
        super().__init__(config)

        self.config = config
        self.pooler_type = pooler_type
        self.temp = 0.01

        self.distilbert = DistilBertModel(config)
        self.pooler = Pooler(pooler_type)
        self.sim = Similarity(temp=temp)

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None):
        if self.training:
            return self.cl_forward(self.distilbert, input_ids, attention_mask)
        else:
            return self.sent_emb(self.distilbert, input_ids, attention_mask)

    def cl_forward(self, encoder, input_ids=None, attention_mask=None):
        batch_size = input_ids.size(0)#64#input_ids.size(0)
        num_sent = input_ids.size(1)  # Number of sentences in one instance: 2 sentences

        # Flatten all input tensors
        input_ids = input_ids.view((-1, input_ids.size(-1))) # (bs * num_sent, len)
        attention_mask = attention_mask.view((-1, attention_mask.size(-1))) # (bs * num_sent len)

        # Pre-trained Model Encoder
        outputs = encoder(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )

        # Pooling
        pooler_output = self.pooler(attention_mask, outputs)
        pooler_output = pooler_output.view((batch_size, num_sent, pooler_output.size(-1)))  # (bs, num_sent, hidden)

        # Separate representation
        z1, z2 = pooler_output[:, 0], pooler_output[:, 1]

        # Cosine similarity
        cos_sim = self.sim(z1.unsqueeze(1), z2.unsqueeze(0))

        # Calculate contrastive loss
        criterion = nn.CrossEntropyLoss()
        labels = torch.arange(cos_sim.size(0)).long().to(self.device)
        loss = criterion(cos_sim, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=cos_sim,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def sent_emb(self, encoder, input_ids=None, attention_mask=None):
        outputs = encoder(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )
        pooler_output = self.pooler(attention_mask, outputs)

        return BaseModelOutputWithPooling(
            pooler_output=pooler_output,
            last_hidden_state=outputs.last_hidden_state,
            hidden_states=outputs.hidden_states,
        )


pretrained_model_name = 'distilbert-base-uncased'

config = AutoConfig.from_pretrained(pretrained_model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model2 = DistilBertCLModel.from_pretrained(pretrained_model_name, config=config).to(device)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\ng-ka/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at C:\Users\ng-ka/.cache\huggingface\transfor

In [17]:
training_args2 = TrainingArguments(
    output_dir='output',
    overwrite_output_dir=False,
    learning_rate=5e-05,
    per_device_train_batch_size= 64,
    per_device_eval_batch_size = 64,
    weight_decay=0.0,
    num_train_epochs=2,
    max_steps= 20000,
    logging_steps=5000,
    save_steps=5000,
    fp16=True
)

model2.train()

trainer2 = Trainer(
    model=model2,
    args=training_args2,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=default_data_collator
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
max_steps is given, it will override any value given in num_train_epochs
Using amp half precision backend


In [18]:
model_path = 'trained_model/distilbert_cl'

#train_result = trainer.train(model_path=model_path)
train_result2 = trainer2.train()
torch.save(model2, './0_01temp_model.pth')

***** Running training *****
  Num examples = 1000000
  Num Epochs = 2
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 20000


Step,Training Loss
5000,0.0002
10000,0.0001
15000,0.0001
20000,0.0001


Saving model checkpoint to output\checkpoint-5000
Configuration saved in output\checkpoint-5000\config.json
Model weights saved in output\checkpoint-5000\pytorch_model.bin
tokenizer config file saved in output\checkpoint-5000\tokenizer_config.json
Special tokens file saved in output\checkpoint-5000\special_tokens_map.json
Saving model checkpoint to output\checkpoint-10000
Configuration saved in output\checkpoint-10000\config.json
Model weights saved in output\checkpoint-10000\pytorch_model.bin
tokenizer config file saved in output\checkpoint-10000\tokenizer_config.json
Special tokens file saved in output\checkpoint-10000\special_tokens_map.json
Saving model checkpoint to output\checkpoint-15000
Configuration saved in output\checkpoint-15000\config.json
Model weights saved in output\checkpoint-15000\pytorch_model.bin
tokenizer config file saved in output\checkpoint-15000\tokenizer_config.json
Special tokens file saved in output\checkpoint-15000\special_tokens_map.json
Saving model check

In [19]:
#import senteval
#import SentEval.senteval as senteval
#import SentEval_simcse.senteval as senteval
#import SentEval_simcse.senteval.engine as se_engine


def prepare(params, samples):
    return

def batcher(params, batch):
    sentences = [" ".join(s) for s in batch]
    batch = tokenizer.batch_encode_plus(
        sentences,
        return_tensors="pt",
        padding=True,
    )

    for k in batch:
        batch[k] = batch[k].to(device)

    with torch.no_grad():
        outputs = model2(**batch)

    pooled_result = outputs.pooler_output.cpu()

    return pooled_result


def evaluate_model():
    PATH_TO_DATA = "./data"

    params = {"task_path": PATH_TO_DATA, "usepytorch": True, "kfold": 10}
    tasks = ["STSBenchmark", 'STS12', 'STS13', 'STS14', 'STS15']

    se = senteval.engine.SE(params, batcher, prepare)
    #se = se_engine.SE(params, batcher, prepare)
    results = se.eval(tasks)

    print('STS12: ', results["STS12"]["all"]["spearman"]["all"])
    print('STS13: ', results["STS13"]["all"]["spearman"]["all"])
    print('STS14: ', results["STS14"]["all"]["spearman"]["all"])
    print('STS15: ', results["STS15"]["all"]["spearman"]["all"])
    print('STSB: ', results["STSBenchmark"]["test"]["spearman"][0])

    return results

In [20]:
model2.eval()

results2 = evaluate_model()
results2

  sent1 = np.array([s.split() for s in sent1])[not_empty_idx]
  sent2 = np.array([s.split() for s in sent2])[not_empty_idx]


STS12:  0.5018383531025148
STS13:  0.6031743482177943
STS14:  0.5420846725534382
STS15:  0.709963833540791
STSB:  0.6169321318040776


{'STSBenchmark': {'train': {'pearson': (0.6391098590596108, 0.0),
   'spearman': SpearmanrResult(correlation=0.626583628661332, pvalue=0.0),
   'nsamples': 5749},
  'dev': {'pearson': (0.6898940758840367, 1.9177910221092845e-212),
   'spearman': SpearmanrResult(correlation=0.699399105438588, pvalue=9.441469501593988e-221),
   'nsamples': 1500},
  'test': {'pearson': (0.5972605440525323, 4.3553688664292627e-134),
   'spearman': SpearmanrResult(correlation=0.6169321318040776, pvalue=2.0477681368590674e-145),
   'nsamples': 1379},
  'all': {'pearson': {'all': 0.6424793053926677,
    'mean': 0.6420881596653932,
    'wmean': 0.641250114025058},
   'spearman': {'all': 0.6412911964593353,
    'mean': 0.6476382886346658,
    'wmean': 0.6377002027224968}}},
 'STS12': {'MSRpar': {'pearson': (0.487659215981259, 4.6368531953735704e-46),
   'spearman': SpearmanrResult(correlation=0.5018279806741894, pvalue=4.349175756398707e-49),
   'nsamples': 750},
  'MSRvid': {'pearson': (0.6963944884125152, 7.0

In [21]:
mlflow.end_run()

# 0.05 temp

In [22]:
# import torch
# import torch.nn as nn

# from transformers import AutoTokenizer, DistilBertModel, DistilBertPreTrainedModel, AutoConfig
# from transformers.modeling_outputs import SequenceClassifierOutput, BaseModelOutputWithPooling

# from src.distilface.modules.pooler import Pooler
# from src.distilface.modules.similarity import Similarity

#from torch.cuda.amp import autocast 

#scaler = GradScaler()

class DistilBertCLModel(DistilBertPreTrainedModel):
    def __init__(self, config, pooler_type='avg_first_last', temp=0.05):
        super().__init__(config)

        self.config = config
        self.pooler_type = pooler_type
        self.temp = 0.05

        self.distilbert = DistilBertModel(config)
        self.pooler = Pooler(pooler_type)
        self.sim = Similarity(temp=temp)

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None):
        if self.training:
            return self.cl_forward(self.distilbert, input_ids, attention_mask)
        else:
            return self.sent_emb(self.distilbert, input_ids, attention_mask)

    def cl_forward(self, encoder, input_ids=None, attention_mask=None):
        batch_size = input_ids.size(0)#64#input_ids.size(0)
        num_sent = input_ids.size(1)  # Number of sentences in one instance: 2 sentences

        # Flatten all input tensors
        input_ids = input_ids.view((-1, input_ids.size(-1))) # (bs * num_sent, len)
        attention_mask = attention_mask.view((-1, attention_mask.size(-1))) # (bs * num_sent len)

        # Pre-trained Model Encoder
        outputs = encoder(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )

        # Pooling
        pooler_output = self.pooler(attention_mask, outputs)
        pooler_output = pooler_output.view((batch_size, num_sent, pooler_output.size(-1)))  # (bs, num_sent, hidden)

        # Separate representation
        z1, z2 = pooler_output[:, 0], pooler_output[:, 1]

        # Cosine similarity
        cos_sim = self.sim(z1.unsqueeze(1), z2.unsqueeze(0))

        # Calculate contrastive loss
        criterion = nn.CrossEntropyLoss()
        labels = torch.arange(cos_sim.size(0)).long().to(self.device)
        loss = criterion(cos_sim, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=cos_sim,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def sent_emb(self, encoder, input_ids=None, attention_mask=None):
        outputs = encoder(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )
        pooler_output = self.pooler(attention_mask, outputs)

        return BaseModelOutputWithPooling(
            pooler_output=pooler_output,
            last_hidden_state=outputs.last_hidden_state,
            hidden_states=outputs.hidden_states,
        )


pretrained_model_name = 'distilbert-base-uncased'

config = AutoConfig.from_pretrained(pretrained_model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model3 = DistilBertCLModel.from_pretrained(pretrained_model_name, config=config).to(device)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\ng-ka/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at C:\Users\ng-ka/.cache\huggingface\transfor

In [23]:
# import mlflow

# from transformers import Trainer, TrainingArguments
# from transformers import default_data_collator

#model3 = DistilBertCLModel.from_pretrained(pretrained_model_name, config=config).to(device)

training_args3 = TrainingArguments(
    output_dir='output',
    overwrite_output_dir=False,
    learning_rate=5e-05,
    per_device_train_batch_size= 64,
    per_device_eval_batch_size = 64,
    weight_decay=0.0,
    num_train_epochs=2,
    max_steps= 20000,
    logging_steps=5000,
    save_steps=5000,
    fp16=True
)

model3.train()

trainer3 = Trainer(
    model=model3,
    args=training_args3,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=default_data_collator
)

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at C:\Users\ng-ka/.cache\huggingface\transformers\9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9ab5ee.126183e36667471617ae2f0835fab707baa54b731f991507ebbb55ea85adb12a
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertCLModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertCLModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertCLModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequ

In [24]:
model_path = 'trained_model/distilbert_cl'

#train_result = trainer.train(model_path=model_path)
train_result3 = trainer3.train()
torch.save(model3, './0_05temp_model.pth')

***** Running training *****
  Num examples = 1000000
  Num Epochs = 2
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 20000


Step,Training Loss
5000,0.0001
10000,0.0001
15000,0.0001
20000,0.0001


Saving model checkpoint to output\checkpoint-5000
Configuration saved in output\checkpoint-5000\config.json
Model weights saved in output\checkpoint-5000\pytorch_model.bin
tokenizer config file saved in output\checkpoint-5000\tokenizer_config.json
Special tokens file saved in output\checkpoint-5000\special_tokens_map.json
Saving model checkpoint to output\checkpoint-10000
Configuration saved in output\checkpoint-10000\config.json
Model weights saved in output\checkpoint-10000\pytorch_model.bin
tokenizer config file saved in output\checkpoint-10000\tokenizer_config.json
Special tokens file saved in output\checkpoint-10000\special_tokens_map.json
Saving model checkpoint to output\checkpoint-15000
Configuration saved in output\checkpoint-15000\config.json
Model weights saved in output\checkpoint-15000\pytorch_model.bin
tokenizer config file saved in output\checkpoint-15000\tokenizer_config.json
Special tokens file saved in output\checkpoint-15000\special_tokens_map.json
Saving model check

In [25]:
#import senteval
#import SentEval.senteval as senteval
#import SentEval_simcse.senteval as senteval
#import SentEval_simcse.senteval.engine as se_engine


def prepare(params, samples):
    return

def batcher(params, batch):
    sentences = [" ".join(s) for s in batch]
    batch = tokenizer.batch_encode_plus(
        sentences,
        return_tensors="pt",
        padding=True,
    )

    for k in batch:
        batch[k] = batch[k].to(device)

    with torch.no_grad():
        outputs = model3(**batch)

    pooled_result = outputs.pooler_output.cpu()

    return pooled_result


def evaluate_model():
    PATH_TO_DATA = "./data"

    params = {"task_path": PATH_TO_DATA, "usepytorch": True, "kfold": 10}
    tasks = ["STSBenchmark", 'STS12', 'STS13', 'STS14', 'STS15']

    se = senteval.engine.SE(params, batcher, prepare)
    #se = se_engine.SE(params, batcher, prepare)
    results = se.eval(tasks)

    print('STS12: ', results["STS12"]["all"]["spearman"]["all"])
    print('STS13: ', results["STS13"]["all"]["spearman"]["all"])
    print('STS14: ', results["STS14"]["all"]["spearman"]["all"])
    print('STS15: ', results["STS15"]["all"]["spearman"]["all"])
    print('STSB: ', results["STSBenchmark"]["test"]["spearman"][0])

    return results

In [26]:
model3.eval()

results3 = evaluate_model()
results3

  sent1 = np.array([s.split() for s in sent1])[not_empty_idx]
  sent2 = np.array([s.split() for s in sent2])[not_empty_idx]


STS12:  0.6056512503604072
STS13:  0.7664013170536642
STS14:  0.6853466562577095
STS15:  0.7770547451580875
STSB:  0.7409549122778031


{'STSBenchmark': {'train': {'pearson': (0.7636742615728462, 0.0),
   'spearman': SpearmanrResult(correlation=0.7448408409102731, pvalue=0.0),
   'nsamples': 5749},
  'dev': {'pearson': (0.7790914556746467, 4.3252980564806194e-306),
   'spearman': SpearmanrResult(correlation=0.7829357887267215, pvalue=4.2210875529375e-311),
   'nsamples': 1500},
  'test': {'pearson': (0.7422504324127708, 1.1912635357145677e-241),
   'spearman': SpearmanrResult(correlation=0.7409549122778031, pvalue=2.2567862805118097e-240),
   'nsamples': 1379},
  'all': {'pearson': {'all': 0.7610125124249868,
    'mean': 0.7616720498867545,
    'wmean': 0.7629304426972037},
   'spearman': {'all': 0.7549624254583777,
    'mean': 0.7562438473049325,
    'wmean': 0.7508426635969324}}},
 'STS12': {'MSRpar': {'pearson': (0.6241236099371741, 3.214061362436737e-82),
   'spearman': SpearmanrResult(correlation=0.6155870786598757, pvalue=2.0165685333515305e-79),
   'nsamples': 750},
  'MSRvid': {'pearson': (0.8292242043157813, 3

In [27]:
mlflow.end_run()

# 0.1 temp

In [30]:
class DistilBertCLModel(DistilBertPreTrainedModel):
    def __init__(self, config, pooler_type='avg_first_last', temp=0.1):
        super().__init__(config)

        self.config = config
        self.pooler_type = pooler_type
        self.temp = 0.1

        self.distilbert = DistilBertModel(config)
        self.pooler = Pooler(pooler_type)
        self.sim = Similarity(temp=temp)

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None):
        if self.training:
            return self.cl_forward(self.distilbert, input_ids, attention_mask)
        else:
            return self.sent_emb(self.distilbert, input_ids, attention_mask)

    def cl_forward(self, encoder, input_ids=None, attention_mask=None):
        batch_size = input_ids.size(0)#64#input_ids.size(0)
        num_sent = input_ids.size(1)  # Number of sentences in one instance: 2 sentences

        # Flatten all input tensors
        input_ids = input_ids.view((-1, input_ids.size(-1))) # (bs * num_sent, len)
        attention_mask = attention_mask.view((-1, attention_mask.size(-1))) # (bs * num_sent len)

        # Pre-trained Model Encoder
        outputs = encoder(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )

        # Pooling
        pooler_output = self.pooler(attention_mask, outputs)
        pooler_output = pooler_output.view((batch_size, num_sent, pooler_output.size(-1)))  # (bs, num_sent, hidden)

        # Separate representation
        z1, z2 = pooler_output[:, 0], pooler_output[:, 1]

        # Cosine similarity
        cos_sim = self.sim(z1.unsqueeze(1), z2.unsqueeze(0))

        # Calculate contrastive loss
        criterion = nn.CrossEntropyLoss()
        labels = torch.arange(cos_sim.size(0)).long().to(self.device)
        loss = criterion(cos_sim, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=cos_sim,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def sent_emb(self, encoder, input_ids=None, attention_mask=None):
        outputs = encoder(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )
        pooler_output = self.pooler(attention_mask, outputs)

        return BaseModelOutputWithPooling(
            pooler_output=pooler_output,
            last_hidden_state=outputs.last_hidden_state,
            hidden_states=outputs.hidden_states,
        )
    
config = AutoConfig.from_pretrained(pretrained_model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model4 = DistilBertCLModel.from_pretrained(pretrained_model_name, config=config).to(device)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\ng-ka/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at C:\Users\ng-ka/.cache\huggingface\transfor

In [31]:
# import mlflow

# from transformers import Trainer, TrainingArguments
# from transformers import default_data_collator

#model3 = DistilBertCLModel.from_pretrained(pretrained_model_name, config=config).to(device)

training_args4 = TrainingArguments(
    output_dir='output',
    overwrite_output_dir=False,
    learning_rate=5e-05,
    per_device_train_batch_size= 64,
    per_device_eval_batch_size = 64,
    weight_decay=0.0,
    num_train_epochs=2,
    max_steps= 20000,
    logging_steps=5000,
    save_steps=5000,
    fp16=True
)

model4.train()

trainer4 = Trainer(
    model=model4,
    args=training_args4,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=default_data_collator
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
max_steps is given, it will override any value given in num_train_epochs
Using amp half precision backend


In [32]:
model_path = 'trained_model/distilbert_cl'

#train_result = trainer.train(model_path=model_path)
train_result4 = trainer4.train()
torch.save(model4, './0_1temp_model.pth')

***** Running training *****
  Num examples = 1000000
  Num Epochs = 2
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 20000


Step,Training Loss
5000,0.006
10000,0.0044
15000,0.0043
20000,0.0041


Saving model checkpoint to output\checkpoint-5000
Configuration saved in output\checkpoint-5000\config.json
Model weights saved in output\checkpoint-5000\pytorch_model.bin
tokenizer config file saved in output\checkpoint-5000\tokenizer_config.json
Special tokens file saved in output\checkpoint-5000\special_tokens_map.json
Saving model checkpoint to output\checkpoint-10000
Configuration saved in output\checkpoint-10000\config.json
Model weights saved in output\checkpoint-10000\pytorch_model.bin
tokenizer config file saved in output\checkpoint-10000\tokenizer_config.json
Special tokens file saved in output\checkpoint-10000\special_tokens_map.json
Saving model checkpoint to output\checkpoint-15000
Configuration saved in output\checkpoint-15000\config.json
Model weights saved in output\checkpoint-15000\pytorch_model.bin
tokenizer config file saved in output\checkpoint-15000\tokenizer_config.json
Special tokens file saved in output\checkpoint-15000\special_tokens_map.json
Saving model check

In [33]:
#import senteval
#import SentEval.senteval as senteval
#import SentEval_simcse.senteval as senteval
#import SentEval_simcse.senteval.engine as se_engine


def prepare(params, samples):
    return

def batcher(params, batch):
    sentences = [" ".join(s) for s in batch]
    batch = tokenizer.batch_encode_plus(
        sentences,
        return_tensors="pt",
        padding=True,
    )

    for k in batch:
        batch[k] = batch[k].to(device)

    with torch.no_grad():
        outputs = model4(**batch)

    pooled_result = outputs.pooler_output.cpu()

    return pooled_result


def evaluate_model():
    PATH_TO_DATA = "./data"

    params = {"task_path": PATH_TO_DATA, "usepytorch": True, "kfold": 10}
    tasks = ["STSBenchmark", 'STS12', 'STS13', 'STS14', 'STS15']

    se = senteval.engine.SE(params, batcher, prepare)
    #se = se_engine.SE(params, batcher, prepare)
    results = se.eval(tasks)

    print('STS12: ', results["STS12"]["all"]["spearman"]["all"])
    print('STS13: ', results["STS13"]["all"]["spearman"]["all"])
    print('STS14: ', results["STS14"]["all"]["spearman"]["all"])
    print('STS15: ', results["STS15"]["all"]["spearman"]["all"])
    print('STSB: ', results["STSBenchmark"]["test"]["spearman"][0])

    return results

In [34]:
model4.eval()

results4 = evaluate_model()
results4

  sent1 = np.array([s.split() for s in sent1])[not_empty_idx]
  sent2 = np.array([s.split() for s in sent2])[not_empty_idx]


STS12:  0.5038198940159697
STS13:  0.6404188064258012
STS14:  0.5539911624009456
STS15:  0.6540062512499364
STSB:  0.5631252826099555


{'STSBenchmark': {'train': {'pearson': (0.6219086558842281, 0.0),
   'spearman': SpearmanrResult(correlation=0.5975192167388444, pvalue=0.0),
   'nsamples': 5749},
  'dev': {'pearson': (0.6220616320913329, 2.2314321216343802e-161),
   'spearman': SpearmanrResult(correlation=0.6349512169816413, pvalue=4.235121978076463e-170),
   'nsamples': 1500},
  'test': {'pearson': (0.5617961659198661, 1.5365510632076718e-115),
   'spearman': SpearmanrResult(correlation=0.5631252826099555, pvalue=3.400716345614139e-116),
   'nsamples': 1379},
  'all': {'pearson': {'all': 0.6104371686656989,
    'mean': 0.6019221512984757,
    'wmean': 0.6123275641653827},
   'spearman': {'all': 0.6049080443331466,
    'mean': 0.5985319054434804,
    'wmean': 0.5985297365812711}}},
 'STS12': {'MSRpar': {'pearson': (0.47395765375148974, 2.8991109682396544e-43),
   'spearman': SpearmanrResult(correlation=0.4672447933970129, pvalue=6.11751567793586e-42),
   'nsamples': 750},
  'MSRvid': {'pearson': (0.6692804822163766, 

In [35]:
mlflow.end_run()

# 1 temp

In [36]:
class DistilBertCLModel(DistilBertPreTrainedModel):
    def __init__(self, config, pooler_type='avg_first_last', temp=1):
        super().__init__(config)

        self.config = config
        self.pooler_type = pooler_type
        self.temp = 1

        self.distilbert = DistilBertModel(config)
        self.pooler = Pooler(pooler_type)
        self.sim = Similarity(temp=temp)

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None):
        if self.training:
            return self.cl_forward(self.distilbert, input_ids, attention_mask)
        else:
            return self.sent_emb(self.distilbert, input_ids, attention_mask)

    def cl_forward(self, encoder, input_ids=None, attention_mask=None):
        batch_size = input_ids.size(0)#64#input_ids.size(0)
        num_sent = input_ids.size(1)  # Number of sentences in one instance: 2 sentences

        # Flatten all input tensors
        input_ids = input_ids.view((-1, input_ids.size(-1))) # (bs * num_sent, len)
        attention_mask = attention_mask.view((-1, attention_mask.size(-1))) # (bs * num_sent len)

        # Pre-trained Model Encoder
        outputs = encoder(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )

        # Pooling
        pooler_output = self.pooler(attention_mask, outputs)
        pooler_output = pooler_output.view((batch_size, num_sent, pooler_output.size(-1)))  # (bs, num_sent, hidden)

        # Separate representation
        z1, z2 = pooler_output[:, 0], pooler_output[:, 1]

        # Cosine similarity
        cos_sim = self.sim(z1.unsqueeze(1), z2.unsqueeze(0))

        # Calculate contrastive loss
        criterion = nn.CrossEntropyLoss()
        labels = torch.arange(cos_sim.size(0)).long().to(self.device)
        loss = criterion(cos_sim, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=cos_sim,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def sent_emb(self, encoder, input_ids=None, attention_mask=None):
        outputs = encoder(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )
        pooler_output = self.pooler(attention_mask, outputs)

        return BaseModelOutputWithPooling(
            pooler_output=pooler_output,
            last_hidden_state=outputs.last_hidden_state,
            hidden_states=outputs.hidden_states,
        )
    
config = AutoConfig.from_pretrained(pretrained_model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model5 = DistilBertCLModel.from_pretrained(pretrained_model_name, config=config).to(device)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\ng-ka/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at C:\Users\ng-ka/.cache\huggingface\transfor

In [37]:
# import mlflow

# from transformers import Trainer, TrainingArguments
# from transformers import default_data_collator

training_args5 = TrainingArguments(
    output_dir='output',
    overwrite_output_dir=False,
    learning_rate=5e-05,
    per_device_train_batch_size= 64,
    per_device_eval_batch_size = 64,
    weight_decay=0.0,
    num_train_epochs=2,
    max_steps= 20000,
    logging_steps=5000,
    save_steps=5000,
    fp16=True
)

model5.train()

trainer5 = Trainer(
    model=model5,
    args=training_args4,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=default_data_collator
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
max_steps is given, it will override any value given in num_train_epochs
Using amp half precision backend


In [38]:
model_path = 'trained_model/distilbert_cl'

#train_result = trainer.train(model_path=model_path)
train_result5 = trainer5.train()
torch.save(model5, './1temp_model.pth')

***** Running training *****
  Num examples = 1000000
  Num Epochs = 2
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 20000


Step,Training Loss
5000,3.2067
10000,3.1995
15000,3.1973
20000,3.1961


Saving model checkpoint to output\checkpoint-5000
Configuration saved in output\checkpoint-5000\config.json
Model weights saved in output\checkpoint-5000\pytorch_model.bin
tokenizer config file saved in output\checkpoint-5000\tokenizer_config.json
Special tokens file saved in output\checkpoint-5000\special_tokens_map.json
Saving model checkpoint to output\checkpoint-10000
Configuration saved in output\checkpoint-10000\config.json
Model weights saved in output\checkpoint-10000\pytorch_model.bin
tokenizer config file saved in output\checkpoint-10000\tokenizer_config.json
Special tokens file saved in output\checkpoint-10000\special_tokens_map.json
Saving model checkpoint to output\checkpoint-15000
Configuration saved in output\checkpoint-15000\config.json
Model weights saved in output\checkpoint-15000\pytorch_model.bin
tokenizer config file saved in output\checkpoint-15000\tokenizer_config.json
Special tokens file saved in output\checkpoint-15000\special_tokens_map.json
Saving model check

In [39]:
#import senteval
#import SentEval.senteval as senteval
#import SentEval_simcse.senteval as senteval
#import SentEval_simcse.senteval.engine as se_engine


def prepare(params, samples):
    return

def batcher(params, batch):
    sentences = [" ".join(s) for s in batch]
    batch = tokenizer.batch_encode_plus(
        sentences,
        return_tensors="pt",
        padding=True,
    )

    for k in batch:
        batch[k] = batch[k].to(device)

    with torch.no_grad():
        outputs = model5(**batch)

    pooled_result = outputs.pooler_output.cpu()

    return pooled_result


def evaluate_model():
    PATH_TO_DATA = "./data"

    params = {"task_path": PATH_TO_DATA, "usepytorch": True, "kfold": 10}
    tasks = ["STSBenchmark", 'STS12', 'STS13', 'STS14', 'STS15']

    se = senteval.engine.SE(params, batcher, prepare)
    #se = se_engine.SE(params, batcher, prepare)
    results = se.eval(tasks)

    print('STS12: ', results["STS12"]["all"]["spearman"]["all"])
    print('STS13: ', results["STS13"]["all"]["spearman"]["all"])
    print('STS14: ', results["STS14"]["all"]["spearman"]["all"])
    print('STS15: ', results["STS15"]["all"]["spearman"]["all"])
    print('STSB: ', results["STSBenchmark"]["test"]["spearman"][0])

    return results

In [40]:
model5.eval()

results5 = evaluate_model()
results5

  sent1 = np.array([s.split() for s in sent1])[not_empty_idx]
  sent2 = np.array([s.split() for s in sent2])[not_empty_idx]


STS12:  0.43052863516877926
STS13:  0.4545311337853271
STS14:  0.3925715299069684
STS15:  0.4786150299588344
STSB:  0.3236416143176774


{'STSBenchmark': {'train': {'pearson': (0.4615603362668978,
    2.403907071103749e-301),
   'spearman': SpearmanrResult(correlation=0.44040623851761684, pvalue=1.9830712984173826e-271),
   'nsamples': 5749},
  'dev': {'pearson': (0.4775079329207626, 2.8424217770169393e-86),
   'spearman': SpearmanrResult(correlation=0.5119757905872877, pvalue=5.3176257500031175e-101),
   'nsamples': 1500},
  'test': {'pearson': (0.3237255679553281, 5.213444696553968e-35),
   'spearman': SpearmanrResult(correlation=0.3236416143176774, pvalue=5.437368047298008e-35),
   'nsamples': 1379},
  'all': {'pearson': {'all': 0.44654659850642947,
    'mean': 0.42093127904766287,
    'wmean': 0.4423029474721763},
   'spearman': {'all': 0.4400559581707548,
    'mean': 0.425341214474194,
    'wmean': 0.4341864785886402}}},
 'STS12': {'MSRpar': {'pearson': (0.2238989258812755, 5.632657830685157e-10),
   'spearman': SpearmanrResult(correlation=0.24478479926802538, pvalue=1.0742750796582411e-11),
   'nsamples': 750},
  