In [1]:
# Move the kernel's working directory two levels up, so that relative paths
# used later in the notebook (e.g. "./data") resolve from there.
%cd ../..

/workspace/gatech/cs7643-deep-learning/contrastive-learning-in-distilled-models


In [1]:
# Print the directory seen by a shell subprocess, as a sanity check.
! pwd

/workspace/gatech/cs7643-deep-learning/contrastive-learning-in-distilled-models/notebooks/evaluation


In [2]:
import transformers
import torch
import torch.nn as nn
import senteval

In [2]:
# batch = [
#     ['A man with a hard hat is dancing.', 'A man wearing a hard hat is dancing.'],
#     ['A young child is riding a horse.', 'A child is riding a horse.'],
# ]

In [4]:
# sentences = [' '.join(s) for s in batch]
# batch2 = tokenizer.batch_encode_plus(
#     sentences,
#     return_tensors='pt',
#     padding=True,
# )

## SentEval for BERT

Evaluate BERT on the SentEval STS Benchmark task. Each sentence embedding is the average of the first and last hidden states (first + last avg.), mean-pooled over the non-padding tokens.

In [5]:
from transformers import BertTokenizer, BertModel

# Run on the GPU when CUDA is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the tokenizer and the encoder from the same pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model = model.to(device)

# Switch the module to evaluation mode. As the last expression of the cell,
# the returned module is also what the cell displays.
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Total number of scalar parameters in the currently loaded `model`.
sum(weights.numel() for weights in model.parameters())

109482240

In [7]:
def prepare(params, samples):
    """SentEval ``prepare`` callback; deliberately does nothing.

    It is handed to ``senteval.engine.SE`` together with ``batcher``. No
    per-task setup is performed here, so both arguments are ignored.

    Args:
        params: SentEval parameter object (unused).
        samples: Task samples supplied by SentEval (unused).

    Returns:
        None.
    """
    return None


def batcher(params, batch):
    """Encode one SentEval batch into fixed-size BERT sentence embeddings.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        params: SentEval parameter object (unused).
        batch: Iterable of sentences, each a sequence of string tokens; the
            tokens of a sentence are joined with single spaces before
            tokenization.

    Returns:
        A CPU tensor with one row per sentence: the element-wise average of
        ``hidden_states[0]`` and ``hidden_states[-1]``, mean-pooled over the
        positions where ``attention_mask`` is 1.
    """
    sentences = [" ".join(tokens) for tokens in batch]

    # Calling the tokenizer directly replaces the deprecated
    # `batch_encode_plus`. Keeping the result in its own name avoids rebinding
    # the `batch` argument; `.to(device)` moves every encoded tensor at once.
    encoded = tokenizer(sentences, return_tensors="pt", padding=True).to(device)

    with torch.no_grad():
        outputs = model(**encoded, output_hidden_states=True, return_dict=True)

    # Pooler
    # NOTE(review): per the transformers docs, hidden_states[0] is the
    # embedding-layer output rather than the first transformer block. Confirm
    # this is the intended "first" layer before changing it, since doing so
    # would alter the reported scores.
    hidden_states = outputs.hidden_states
    first_hidden = hidden_states[0]
    last_hidden = hidden_states[-1]

    # Zero out padded positions, then divide by the number of real tokens so
    # padding does not dilute the mean.
    mask = encoded.attention_mask.unsqueeze(-1)
    summed = ((first_hidden + last_hidden) / 2.0 * mask).sum(1)
    pooled_result = summed / mask.sum(1)

    return pooled_result.cpu()

In [8]:
# SentEval configuration: where the task data lives and which task to run.
PATH_TO_DATA = "./data"
tasks = ["STSBenchmark"]
params = {
    "task_path": PATH_TO_DATA,
    "usepytorch": True,
    "kfold": 10,
}

# Build the evaluation engine around the callbacks defined above and run it.
se = senteval.engine.SE(params, batcher, prepare)
results = se.eval(tasks)

In [9]:
# Display the full SentEval result dictionary for inspection.
results

{'STSBenchmark': {'train': {'pearson': (0.5619505330308854, 0.0),
   'spearman': SpearmanrResult(correlation=0.5384285918923115, pvalue=0.0),
   'nsamples': 5749},
  'dev': {'pearson': (0.6321576404364262, 3.5670578268133463e-168),
   'spearman': SpearmanrResult(correlation=0.6371150398903414, pvalue=1.3235747770641853e-171),
   'nsamples': 1500},
  'test': {'pearson': (0.5327127002648916, 6.359296565156033e-102),
   'spearman': SpearmanrResult(correlation=0.5386682573723041, pvalue=1.3268371385626441e-104),
   'nsamples': 1379},
  'all': {'pearson': {'all': 0.5731794732386623,
    'mean': 0.5756069579107344,
    'wmean': 0.5694831813530928},
   'spearman': {'all': 0.5613396961884458,
    'mean': 0.5714039630516523,
    'wmean': 0.5556237901646753}}}}

In [10]:
# Headline metric: Spearman correlation on the STS-B test split. Index 0 of
# the Spearman result is the correlation; the p-value follows it.
test_scores = results["STSBenchmark"]["test"]
stsb_spearman = test_scores["spearman"][0]
stsb_spearman

0.5386682573723041

## SentEval for DistilBERT

Evaluate DistilBERT on the SentEval STS Benchmark task. Each sentence embedding is the average of the first and last hidden states (first + last avg.), mean-pooled over the non-padding tokens.

In [11]:
from transformers import DistilBertTokenizer, DistilBertModel

# Prefer CUDA when it is available; fall back to the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the encoder and its tokenizer from the same pretrained checkpoint.
model = DistilBertModel.from_pretrained("distilbert-base-uncased")
model = model.to(device)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Switch the module to evaluation mode. As the last expression of the cell,
# the returned module is also what the cell displays.
model.eval()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# Total number of scalar parameters in the currently loaded `model`.
sum(tensor.numel() for tensor in model.parameters())

66362880

In [13]:
def prepare(params, samples):
    """No-op SentEval ``prepare`` callback.

    Passed to ``senteval.engine.SE`` alongside ``batcher``. It performs no
    setup and ignores both arguments.

    Args:
        params: SentEval parameter object (unused).
        samples: Task samples supplied by SentEval (unused).

    Returns:
        None.
    """
    return None


def batcher(params, batch):
    """Encode one SentEval batch into fixed-size DistilBERT sentence embeddings.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        params: SentEval parameter object (unused).
        batch: Iterable of sentences, each a sequence of string tokens; the
            tokens of a sentence are joined with single spaces before
            tokenization.

    Returns:
        A CPU tensor with one row per sentence: the element-wise average of
        ``hidden_states[0]`` and ``hidden_states[-1]``, mean-pooled over the
        positions where ``attention_mask`` is 1.
    """
    sentences = [" ".join(tokens) for tokens in batch]

    # Calling the tokenizer directly replaces the deprecated
    # `batch_encode_plus`. Keeping the result in its own name avoids rebinding
    # the `batch` argument; `.to(device)` moves every encoded tensor at once.
    encoded = tokenizer(sentences, return_tensors="pt", padding=True).to(device)

    with torch.no_grad():
        outputs = model(**encoded, output_hidden_states=True, return_dict=True)

    # Pooler
    # NOTE(review): per the transformers docs, hidden_states[0] is the
    # embedding-layer output rather than the first transformer block. Confirm
    # this is the intended "first" layer before changing it, since doing so
    # would alter the reported scores.
    hidden_states = outputs.hidden_states
    first_hidden = hidden_states[0]
    last_hidden = hidden_states[-1]

    # Zero out padded positions, then divide by the number of real tokens so
    # padding does not dilute the mean.
    mask = encoded.attention_mask.unsqueeze(-1)
    summed = ((first_hidden + last_hidden) / 2.0 * mask).sum(1)
    pooled_result = summed / mask.sum(1)

    return pooled_result.cpu()

In [14]:
# SentEval configuration: where the task data lives and which task to run.
PATH_TO_DATA = "./data"
tasks = ["STSBenchmark"]
params = {
    "task_path": PATH_TO_DATA,
    "usepytorch": True,
    "kfold": 10,
}

# Build the evaluation engine around the callbacks defined above and run it.
se = senteval.engine.SE(params, batcher, prepare)
results = se.eval(tasks)

In [15]:
# Display the full SentEval result dictionary for inspection.
results

{'STSBenchmark': {'train': {'pearson': (0.6070898833059735, 0.0),
   'spearman': SpearmanrResult(correlation=0.595530733737618, pvalue=0.0),
   'nsamples': 5749},
  'dev': {'pearson': (0.6714695361612271, 3.181586253494327e-197),
   'spearman': SpearmanrResult(correlation=0.684281683692436, pvalue=1.0932311093348739e-207),
   'nsamples': 1500},
  'test': {'pearson': (0.5632638275139215, 2.904861571468197e-116),
   'spearman': SpearmanrResult(correlation=0.590522771009074, pvalue=2.1926696952431418e-130),
   'nsamples': 1379},
  'all': {'pearson': {'all': 0.6130842362054584,
    'mean': 0.6139410823270407,
    'wmean': 0.6112778003604056},
   'spearman': {'all': 0.6146094901418286,
    'mean': 0.6234450628130427,
    'wmean': 0.6101598997470715}}}}

In [16]:
# Headline metric: Spearman correlation on the STS-B test split. Index 0 of
# the Spearman result is the correlation; the p-value follows it.
test_scores = results["STSBenchmark"]["test"]
stsb_spearman = test_scores["spearman"][0]
stsb_spearman

0.590522771009074

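## SentEval for Data2Vec

Evaluate Data2Vec (text) on the SentEval STS Benchmark task. Each sentence embedding is the average of the first and last hidden states (first + last avg.), mean-pooled over the non-padding tokens.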

In [17]:
import torch
import transformers
import senteval

from transformers import RobertaTokenizer, Data2VecTextModel

# Run on the GPU when CUDA is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the tokenizer and the encoder from the same pretrained checkpoint.
tokenizer = RobertaTokenizer.from_pretrained("facebook/data2vec-text-base")
model = Data2VecTextModel.from_pretrained("facebook/data2vec-text-base")
model = model.to(device)

# Switch the module to evaluation mode. As the last expression of the cell,
# the returned module is also what the cell displays.
model.eval()

Some weights of the model checkpoint at facebook/data2vec-text-base were not used when initializing Data2VecTextModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing Data2VecTextModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Data2VecTextModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Data2VecTextModel were not initialized from the model checkpoint at facebook/data2vec-text-base and are newly initialized: ['data2vec_text.pooler.dense.weight', 'data2vec_text.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

In [18]:
# Total number of scalar parameters in the currently loaded `model`.
sum(weights.numel() for weights in model.parameters())

124645632

In [19]:
def prepare(params, samples):
    """SentEval ``prepare`` callback that intentionally performs no work.

    Supplied to ``senteval.engine.SE`` together with ``batcher``. Neither
    argument is read.

    Args:
        params: SentEval parameter object (unused).
        samples: Task samples supplied by SentEval (unused).

    Returns:
        None.
    """
    return None


def batcher(params, batch):
    """Encode one SentEval batch into fixed-size Data2Vec sentence embeddings.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        params: SentEval parameter object (unused).
        batch: Iterable of sentences, each a sequence of string tokens; the
            tokens of a sentence are joined with single spaces before
            tokenization.

    Returns:
        A CPU tensor with one row per sentence: the element-wise average of
        ``hidden_states[0]`` and ``hidden_states[-1]``, mean-pooled over the
        positions where ``attention_mask`` is 1.
    """
    sentences = [" ".join(tokens) for tokens in batch]

    # Calling the tokenizer directly replaces the deprecated
    # `batch_encode_plus`. Keeping the result in its own name avoids rebinding
    # the `batch` argument; `.to(device)` moves every encoded tensor at once.
    encoded = tokenizer(sentences, return_tensors="pt", padding=True).to(device)

    with torch.no_grad():
        outputs = model(**encoded, output_hidden_states=True, return_dict=True)

    # Pooler
    # NOTE(review): per the transformers docs, hidden_states[0] is the
    # embedding-layer output rather than the first transformer block. Confirm
    # this is the intended "first" layer before changing it, since doing so
    # would alter the reported scores.
    hidden_states = outputs.hidden_states
    first_hidden = hidden_states[0]
    last_hidden = hidden_states[-1]

    # Zero out padded positions, then divide by the number of real tokens so
    # padding does not dilute the mean.
    mask = encoded.attention_mask.unsqueeze(-1)
    summed = ((first_hidden + last_hidden) / 2.0 * mask).sum(1)
    pooled_result = summed / mask.sum(1)

    return pooled_result.cpu()

In [20]:
# SentEval configuration: where the task data lives and which task to run.
PATH_TO_DATA = "./data"
tasks = ["STSBenchmark"]
params = {
    "task_path": PATH_TO_DATA,
    "usepytorch": True,
    "kfold": 10,
}

# Build the evaluation engine around the callbacks defined above and run it.
se = senteval.engine.SE(params, batcher, prepare)
results = se.eval(tasks)

In [21]:
# Display the full SentEval result dictionary for inspection.
results

{'STSBenchmark': {'train': {'pearson': (0.4077605518815012,
    3.019824514080955e-229),
   'spearman': SpearmanrResult(correlation=0.43023015272142645, pvalue=8.902496901736272e-258),
   'nsamples': 5749},
  'dev': {'pearson': (0.4987420550448654, 3.808025041749306e-95),
   'spearman': SpearmanrResult(correlation=0.538374823495507, pvalue=1.7567365855034568e-113),
   'nsamples': 1500},
  'test': {'pearson': (0.38863728141271414, 6.03015552446043e-51),
   'spearman': SpearmanrResult(correlation=0.4227786076610967, pvalue=6.670376017457212e-61),
   'nsamples': 1379},
  'all': {'pearson': {'all': 0.4201148641848045,
    'mean': 0.43171329611302695,
    'wmean': 0.42052147732987727},
   'spearman': {'all': 0.45186988160019936,
    'mean': 0.4637945279593434,
    'wmean': 0.4478404129813855}}}}

In [22]:
# Headline metric: Spearman correlation on the STS-B test split. Index 0 of
# the Spearman result is the correlation; the p-value follows it.
test_scores = results["STSBenchmark"]["test"]
stsb_spearman = test_scores["spearman"][0]
stsb_spearman

0.4227786076610967