# Contrastive Learning From Scratch

An attempt to build a contrastive learning model from scratch. The notebook covers:

- Loading and preparing Wiki-1M data for model input
- Contrastive learning model
  - Forward passing using pre-trained model
  - Contrastive layer
  - Calculate loss
- Training procedure
  - Default trainer optimizer
  - Default trainer hyper-parameters

In [1]:
import os

# Set Project home
PROJECT_HOME = os.path.join('/',
                            'workspace',
                            'gatech',
                            'cs7643-deep-learning',
                            'contrastive-learning-in-distilled-models')
%cd {PROJECT_HOME}

# Load project code.
# autoreload (mode 2) lets edits to project modules be picked up without
# restarting the kernel.
%reload_ext autoreload
%autoreload 2

import sys
# './src' is relative to the working directory, so this relies on the %cd into
# PROJECT_HOME having run first.
sys.path.insert(0, './src')

import distilface

/workspace/gatech/cs7643-deep-learning/contrastive-learning-in-distilled-models


## 1. Loading and Preparing Wiki-1M data

Use the Hugging Face `datasets` library to load the training sentences from a local text file.

In [2]:
import numpy as np

from datasets import load_dataset

# Training corpus, loaded through the 'text' builder.
# Alternative corpora that can be swapped in:
#   data_files = {'train': 'data/training/wiki1m_for_simcse.txt'}
#   data_files = {'train': 'data/training/wiki5k.txt'}
data_files = dict(train='data/training/wiki5n.txt')
datasets = load_dataset(
    'text',
    data_files=data_files,
    cache_dir='./data/',
)

Using custom data configuration default-6f9f73136c15fb9f
Reusing dataset text (./data/text/default-6f9f73136c15fb9f/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8)


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
# Unsupervised / self-supervised dataset: the first column supplies both
# sentences of every pair, so each sentence is paired with itself.
column_names = datasets["train"].column_names
sent0_cname = sent1_cname = column_names[0]

print('column_names:', column_names)
print('sent0_cname:', sent0_cname, '| sent1_cname:', sent1_cname)

column_names: ['text']
sent0_cname: text | sent1_cname: text


In [4]:
from transformers import AutoTokenizer

# Tokenizer for the 'bert-base-uncased' checkpoint; prepare_features reads it
# as a global when tokenizing the sentences.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [5]:
def prepare_features(examples, max_length=32):
    """Tokenize a batch of sentences into paired contrastive-learning inputs.

    Reads the global `tokenizer`, `sent0_cname` and `sent1_cname`.

    Args:
        examples: Mapping from column name to a list of values for one batch,
            as passed by `Dataset.map(..., batched=True)`.
        max_length: Number of tokens every encoded sentence is truncated or
            padded to. Defaults to 32, the value previously hard-coded.

    Returns:
        A dict with one entry per tokenizer output key. Each entry is a list
        with one item per example, and each item is a two-element list
        `[first_view, second_view]` of token-level features.
    """
    # Replace missing sentences with a single space. Cleaned copies are built
    # instead of writing into `examples`, so the caller's batch is not mutated.
    first_views = [" " if s is None else s for s in examples[sent0_cname]]
    second_views = [" " if s is None else s for s in examples[sent1_cname]]
    total = len(first_views)

    # Pad to a fixed length rather than to the longest sentence of this batch:
    # rows from different `map` batches must share one length, because the
    # Trainer's default_data_collator stacks rows without padding them.
    sent_features = tokenizer(
        first_views + second_views,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

    # The first `total` encodings are the first views and the next `total` are
    # the second views; regroup them so each example holds both.
    return {
        key: [[values[i], values[i + total]] for i in range(total)]
        for key, values in sent_features.items()
    }

In [6]:
# Tokenize the whole training split in batches and drop the raw text column,
# leaving only the model-input features.
train_dataset = datasets["train"].map(
    prepare_features,
    remove_columns=column_names,
    batched=True,
    # num_proc=24,
)

Loading cached processed dataset at ./data/text/default-6f9f73136c15fb9f/0.0.0/4b86d314f7236db91f0a0f5cda32d4375445e64c5eda2692655dd99c2dac68e8/cache-3de2cf246b8c327b.arrow


In [7]:
# Number of sentence pairs produced by the tokenization step.
train_dataset.num_rows

5

In [8]:
# Token ids of the first view of the first training pair.
str(train_dataset['input_ids'][0][0])

'[101, 26866, 1999, 2148, 2660, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]'

In [9]:
# Token ids of the second view of the first training pair.
str(train_dataset['input_ids'][0][1])

'[101, 26866, 1999, 2148, 2660, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]'

In [10]:
# Feature columns remaining after the original columns were removed by map().
train_dataset.features.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

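Sentence 1 and Sentence 2 are the same sentence: in this unsupervised setup each sentence is paired with itself, so the two views shown above have identical token ids.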

## 2. Contrastive Learning Model

In [13]:
import torch
from transformers import AutoTokenizer, BertModel, BertPreTrainedModel, AutoConfig
from src.distilface.models.cl_bert import BertCLModel

pretrained_model_name = 'bert-base-uncased'

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Tokenizer, configuration and weights all come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
config = AutoConfig.from_pretrained(pretrained_model_name)

model = BertCLModel.from_pretrained(pretrained_model_name, config=config)
model = model.to(device)

# Start in eval mode; the trailing semicolon suppresses the cell's output.
model.eval();

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertCLModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertCLModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertCLModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## 3. Trainer

In [14]:
# import mlflow

from transformers import Trainer, TrainingArguments
from transformers import default_data_collator

# Short smoke-test configuration: when max_steps is set it overrides
# num_train_epochs, so this run stops after 5 optimization steps.
training_args = TrainingArguments(
    output_dir='output',
    overwrite_output_dir=True,
    num_train_epochs=1,
    max_steps=5,
    eval_steps=100,
    learning_rate=5e-05,
    weight_decay=0.0,
)

# Trainer over the paired, pre-tokenized dataset.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    train_dataset=train_dataset,
)

max_steps is given, it will override any value given in num_train_epochs


In [16]:
# The model was put in eval mode right after loading, so switch it back to
# training mode before launching the Trainer run.
model.train()
train_result = trainer.train()

***** Running training *****
  Num examples = 5
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5
Before input shape torch.Size([5, 2, 32])
After input shape torch.Size([10, 32])
pooler_output before: torch.Size([10, 768])
pooler_output after: torch.Size([5, 2, 768])
z1 torch.Size([5, 768])
z2 torch.Size([5, 768])



z1.unsqueeze(1): torch.Size([5, 1, 768])
z2.unsqueeze(0): torch.Size([1, 5, 768])


Step,Training Loss


Before input shape torch.Size([5, 2, 32])
After input shape torch.Size([10, 32])
pooler_output before: torch.Size([10, 768])
pooler_output after: torch.Size([5, 2, 768])
z1 torch.Size([5, 768])
z2 torch.Size([5, 768])



z1.unsqueeze(1): torch.Size([5, 1, 768])
z2.unsqueeze(0): torch.Size([1, 5, 768])
Before input shape torch.Size([5, 2, 32])
After input shape torch.Size([10, 32])
pooler_output before: torch.Size([10, 768])
pooler_output after: torch.Size([5, 2, 768])
z1 torch.Size([5, 768])
z2 torch.Size([5, 768])



z1.unsqueeze(1): torch.Size([5, 1, 768])
z2.unsqueeze(0): torch.Size([1, 5, 768])
Before input shape torch.Size([5, 2, 32])
After input shape torch.Size([10, 32])
pooler_output before: torch.Size([10, 768])
pooler_output after: torch.Size([5, 2, 768])
z1 torch.Size([5, 768])
z2 torch.Size([5, 768])



z1.unsqueeze(1): torch.Size([5, 1, 768])
z2.unsqueeze(0): torch.Size([1, 5, 768])
Before input shape torch.Size([5, 2, 32])
After input shape torch.Size([10, 32])
pooler_out

In [56]:
# `nn` is not imported by any earlier cell, so import it here to keep this cell
# runnable on a fresh kernel.
from torch import nn

# Cosine similarity computed along the last dimension of its two inputs.
cos_sim = nn.CosineSimilarity(dim=-1)

## 4. Evaluate Bert CL Model performance

In [55]:
import senteval


def prepare(params, samples):
    """SentEval preparation hook; does nothing and returns None."""
    return None

def batcher(params, batch):
    """Encode a SentEval batch of sentences into sentence embeddings.

    Reads the global `tokenizer`, `model` and `device`.

    Args:
        params: SentEval parameters (unused).
        batch: List of sentences, each given as a list of word strings.

    Returns:
        The model's `pooler_output` for the batch, moved to the CPU.
    """
    sentences = [" ".join(s) for s in batch]
    encoded = tokenizer.batch_encode_plus(
        sentences,
        return_tensors="pt",
        padding=True,
        # Clip over-long sentences to the tokenizer's model maximum length so
        # they cannot exceed what the encoder accepts.
        truncation=True,
    )
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Run inference in eval mode, then restore whichever mode the model was in.
    # Unconditionally switching back to train mode would undo a caller's
    # earlier model.eval().
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**encoded)
    finally:
        model.train(was_training)

    return outputs.pooler_output.cpu()


def evaluate_model():
    """Run the SentEval STS tasks with `batcher`/`prepare` and print Spearman scores.

    Returns:
        The result dictionary produced by `senteval.engine.SE.eval`.
    """
    PATH_TO_DATA = "./data"

    engine = senteval.engine.SE(
        {"task_path": PATH_TO_DATA, "usepytorch": True, "kfold": 10},
        batcher,
        prepare,
    )
    results = engine.eval(["STSBenchmark", 'STS12', 'STS13', 'STS14', 'STS15'])

    # STS12-STS15: print the aggregate Spearman score stored under "all".
    yearly_tasks = (
        ('STS12: ', "STS12"),
        ('STS13: ', "STS13"),
        ('STS14: ', "STS14"),
        ('STS15: ', "STS15"),
    )
    for label, task in yearly_tasks:
        print(label, results[task]["all"]["spearman"]["all"])

    # STS-B: print the test-split Spearman result; index 0 is the correlation.
    print('STSB: ', results["STSBenchmark"]["test"]["spearman"][0])

    return results

In [17]:
# Put the model in eval mode, run the STS evaluation, and display the full
# result dictionary as the cell output.
model.eval()

results = evaluate_model()
results

STS12:  0.5675054534807098
STS13:  0.7281554777556237
STS14:  0.6335945639499094
STS15:  0.7600618289222154
STSB:  0.7113600318308351


{'STSBenchmark': {'train': {'pearson': (0.7414994287646051, 0.0),
   'spearman': SpearmanrResult(correlation=0.7102657603877065, pvalue=0.0),
   'nsamples': 5749},
  'dev': {'pearson': (0.7561389569108177, 3.798134217661319e-278),
   'spearman': SpearmanrResult(correlation=0.7619754084096394, pvalue=5.964770119842861e-285),
   'nsamples': 1500},
  'test': {'pearson': (0.7272632396921386, 2.519373640229751e-227),
   'spearman': SpearmanrResult(correlation=0.7113600318308351, pvalue=3.899147457192333e-213),
   'nsamples': 1379},
  'all': {'pearson': {'all': 0.7409343456431441,
    'mean': 0.7416338751225204,
    'wmean': 0.7417692001471257},
   'spearman': {'all': 0.7242768878083036,
    'mean': 0.7278670668760604,
    'wmean': 0.7194305114717322}}},
 'STS12': {'MSRpar': {'pearson': (0.4222923718591043, 8.57578903746429e-34),
   'spearman': SpearmanrResult(correlation=0.44805503021367254, pvalue=2.5934246235314064e-38),
   'nsamples': 750},
  'MSRvid': {'pearson': (0.8216320722327597, 9.