In [10]:
import os

PROJECT_HOME = os.path.join(
    '/',
    'workspace',
    'gatech',
    'cs7643-deep-learning',
    'contrastive-learning-in-distilled-models')

%cd {PROJECT_HOME}

/workspace/gatech/cs7643-deep-learning/contrastive-learning-in-distilled-models


## How STS evaluation works

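The cells below walk through the STS evaluation step by step, following the reference implementation in SentEval: https://github.com/facebookresearch/SentEval/blob/main/senteval/sts.py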

In [11]:
import io

In [12]:
def loadFile(fpath):
    """Read a tab-separated sentence-pair file into parallel lists.

    Every non-blank line is split on tabs. Field 4 is parsed as the float
    score and fields 5 and 6 are the two sentences, each split on whitespace
    into tokens. Blank or whitespace-only lines (such as a trailing newline
    at the end of the file) are skipped instead of raising ``IndexError``.

    Args:
        fpath: Path of the UTF-8 encoded file to read.

    Returns:
        dict with three equally long lists:
            'X_A': token lists of the first sentence of each pair,
            'X_B': token lists of the second sentence of each pair,
            'y':   the scores as floats.

    Raises:
        IndexError: if a non-blank line has fewer than seven tab-separated
            fields.
        ValueError: if a score field cannot be parsed as a float.
    """
    data = {'X_A': [], 'X_B': [], 'y': []}
    with io.open(fpath, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Nothing to parse on an empty line; indexing would fail.
                continue
            text = stripped.split('\t')
            data['X_A'].append(text[5].split())
            data['X_B'].append(text[6].split())
            data['y'].append(float(text[4]))

    return data

In [13]:
# Load the STS Benchmark dev split; the relative path resolves against the
# working directory set by the %cd cell at the top of the notebook.
data = loadFile('data/downstream/STS/STSBenchmark/sts-dev.csv')

In [15]:
# Sanity check of the first example: tokens of sentence A, tokens of
# sentence B, and the float score parsed for the pair.
print(data['X_A'][0])
print(data['X_B'][0])
print(data['y'][0])

['A', 'man', 'with', 'a', 'hard', 'hat', 'is', 'dancing.']
['A', 'man', 'wearing', 'a', 'hard', 'hat', 'is', 'dancing.']
5.0


In [16]:
# Order the (sentence A, sentence B, score) triples by token count of A, then
# token count of B, then score.
# NOTE(review): sorted_corpus is not referenced by any later cell visible
# here; the embedding loop iterates over `data` in file order — confirm
# whether this cell is still needed.
sorted_corpus = sorted(zip(data['X_A'],
                           data['X_B'],
                           data['y']),
                       key=lambda z: (len(z[0]), len(z[1]), z[2]))

In [30]:
import numpy as np
import transformers
import torch
import torch.nn as nn

from transformers import BertTokenizer, BertModel


# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Pretrained BERT encoder (moved to `device`) and its matching tokenizer.
model = BertModel.from_pretrained('bert-base-uncased').to(device)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Put the model in evaluation mode; the trailing ';' keeps the notebook from
# echoing the model repr as the cell output.
model.eval();

In [32]:
def batcher(params, batch):
    """Embed a batch of tokenized sentences with the notebook-level BERT model.

    The sentences are re-joined with spaces, tokenized with the module-level
    ``tokenizer`` (padded to the longest sentence in the batch) and passed
    through the module-level ``model`` without gradient tracking. The sentence
    embedding is the attention-mask-weighted mean over tokens of the average
    of ``hidden_states[0]`` and ``hidden_states[-1]``. Per the Hugging Face
    documentation, ``hidden_states[0]`` is the embedding-layer output and
    ``hidden_states[-1]`` the output of the last encoder layer.

    Args:
        params: Unused; kept so the signature matches the SentEval-style
            ``batcher(params, batch)`` callback.
        batch: Iterable of sentences, each an iterable of string tokens.

    Returns:
        A CPU ``torch.Tensor`` with one embedding row per input sentence.
    """
    sentences = [' '.join(s) for s in batch]
    # Calling the tokenizer directly is the supported replacement for the
    # deprecated ``batch_encode_plus``. A separate name is used so the
    # ``batch`` argument is not shadowed by the encoding.
    encoded = tokenizer(
        sentences,
        return_tensors='pt',
        padding=True,
    )

    # Move every input tensor to the device the model lives on.
    for k in encoded:
        encoded[k] = encoded[k].to(device)

    with torch.no_grad():
        outputs = model(**encoded, output_hidden_states=True, return_dict=True)

    # Pooler
    attention_mask = encoded.attention_mask
    hidden_states = outputs.hidden_states

    first_hidden = hidden_states[0]
    last_hidden = hidden_states[-1]
    # Zero out padded positions before summing, then divide by the number of
    # real tokens so padding does not dilute the mean.
    token_mask = attention_mask.unsqueeze(-1)
    pooled_result = ((first_hidden + last_hidden) / 2.0 * token_mask).sum(1) / attention_mask.sum(-1).unsqueeze(-1)

    return pooled_result.cpu()

In [31]:
# Embed both sides of every sentence pair in small batches, then stack the
# per-batch results into one matrix per side.
embed = {}
bsize = 5

for txt_type in ['X_A', 'X_B']:
    side_sentences = data[txt_type]
    # One batcher call per consecutive slice of `bsize` sentences.
    batch_embeddings = [
        batcher(None, side_sentences[start:start + bsize])
        for start in range(0, len(data['y']), bsize)
    ]
    embed[txt_type] = np.vstack(batch_embeddings)

# Scores as an array aligned row-for-row with the embedding matrices.
embed['y'] = np.array(data['y'])

In [37]:
# Shape of the stacked sentence-A embedding matrix.
print(embed['X_A'].shape)

(1500, 768)


In [38]:
# Shape of the stacked sentence-B embedding matrix.
print(embed['X_B'].shape)

(1500, 768)


In [36]:
# Shape of the score array (one score per sentence pair).
print(embed['y'].shape)

(1500,)


In [40]:
def encode_labels(labels, nclass=5):
    """
    Turn real-valued scores into sparse target rows over classes 1..nclass,
    using the label encoding from the Tree LSTM paper (Tai, Socher, Manning).

    For a score y with integer part f = floor(y), class f receives
    f - y + 1 and class f + 1 receives y - f; all other entries stay 0.
    Classes outside 1..nclass are ignored.

    Args:
        labels: sequence of numeric scores.
        nclass: number of classes (columns) in the output.

    Returns:
        float32 numpy array of shape (len(labels), nclass).
    """
    targets = np.zeros((len(labels), nclass)).astype('float32')
    # Each `row` is a view into `targets`, so writes land in the result array.
    for row, score in zip(targets, labels):
        lower = np.floor(score)
        for cls in range(1, nclass + 1):
            if cls == lower + 1:
                row[cls - 1] = score - lower
            if cls == lower:
                row[cls - 1] = lower - score + 1
    return targets

In [51]:
from senteval.utils import cosine

In [42]:
# Short aliases for the two embedding matrices compared in the similarity cell.
sentA = embed['X_A']
sentB = embed['X_B']
# Per-class target rows derived from the scores.
# NOTE(review): Y is not used by any later cell visible here — confirm
# whether it is still needed.
Y = encode_labels(embed['y'])

In [54]:
# Cosine similarity between the row-aligned embeddings of each sentence pair.
sim = np.array([
    cosine(sentA[row, :], sentB[row, :])
    for row in range(sentA.shape[0])
])

In [57]:
from scipy.stats import pearsonr

# Pearson correlation between the cosine similarities and the loaded scores;
# the cell displays the (correlation, p-value) result.
pearsonr(sim, embed['y'])

(0.6321576605053028, 3.5669448020499635e-168)