In [1]:
import sys
import torch
import logging
import warnings
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim

from tqdm import tqdm
from scipy.stats import pearsonr
from torch.utils.data import TensorDataset, DataLoader
from transformers import RobertaTokenizer, RobertaConfig, RobertaForSequenceClassification

logging.disable(logging.WARNING)
warnings.filterwarnings('ignore')
np.set_printoptions(threshold=sys.maxsize)

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
# Load all the data
trainpath = 'data/train-en-es.csv'
testpath = 'data/test-en-es.csv'
valpath = 'data/validation-en-es.csv'

traindata = pd.read_csv(trainpath)
testdata = pd.read_csv(testpath)
valdata = pd.read_csv(valpath)

traindata['score'] = traindata['similarity_score'].apply(lambda x: (x)/5.0)
testdata['score'] = testdata['similarity_score'].apply(lambda x: (x)/5.0)
valdata['score'] = valdata['similarity_score'].apply(lambda x: (x)/5.0)

In [4]:
def pearson_corr(y_true, y_pred):
    corr, _ = pearsonr(y_true, y_pred)
    return corr

In [5]:
# compute the sequence length using 95% samples logic
lengths = []
for _, row in traindata.iterrows():
    lengths.append(len(row['sentence1']))
    lengths.append(len(row['sentence2']))

lengths.sort()
MAX_LEN = lengths[int(0.95*len(lengths))]
print(MAX_LEN)

144


In [6]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def convert_sentences_to_features(sentences, tokenizer, max_len):
    input_ids = []
    attention_masks = []

    for i in range(0, len(sentences), 2):
        encoded_dict = tokenizer.encode_plus(sentences[i], sentences[i+1], add_special_tokens=True, max_length=max_len, truncation=True, padding='max_length', return_attention_mask=True, return_tensors='pt', truncation_strategy='longest_first')
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0).to(device)
    attention_masks = torch.cat(attention_masks, dim=0).to(device)

    return input_ids, attention_masks

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

### Fine-Tune BERT Model

In [7]:
# Hyperparameters
EPOCHS = 10
BATCH_SIZE = 32
LEARN_RATE = 1e-5

In [8]:
x_train = []
for _, row in traindata.iterrows():
    x_train.append(row['sentence1'])
    x_train.append(row['sentence2'])

input_ids, attention_masks = convert_sentences_to_features(x_train, tokenizer, MAX_LEN)
y_train = torch.tensor(traindata['score'].values, dtype=torch.float).unsqueeze(1).to(device)

trainset = TensorDataset(input_ids, attention_masks, y_train)
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)

config = RobertaConfig.from_pretrained(
    'roberta-base',
    num_labels=1,
    output_attentions=False,
    output_hidden_states=False,
)

model = RobertaForSequenceClassification.from_pretrained('roberta-base', config=config).to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARN_RATE, betas=(0.9, 0.999))
loss_fn = nn.MSELoss()

for epoch in tqdm(range(EPOCHS)):
    model.train()
    t_loss = 0
    for batch in trainloader:
        input_ids, attention_masks, labels = tuple(t.to(device) for t in batch)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        t_loss += loss.item()
    print(f'Epoch: {epoch}\tLoss: {t_loss / len(trainloader)}')

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

 10%|█         | 1/10 [01:14<11:12, 74.67s/it]

Epoch: 0	Loss: 0.1100216900308927


 20%|██        | 2/10 [02:29<09:57, 74.74s/it]

Epoch: 1	Loss: 0.09334876433842712


 30%|███       | 3/10 [03:44<08:43, 74.73s/it]

Epoch: 2	Loss: 0.0883062352736791


 40%|████      | 4/10 [04:58<07:28, 74.70s/it]

Epoch: 3	Loss: 0.08480852393226491


 50%|█████     | 5/10 [06:13<06:13, 74.69s/it]

Epoch: 4	Loss: 0.08200745754357841


 60%|██████    | 6/10 [07:28<04:58, 74.70s/it]

Epoch: 5	Loss: 0.07993226413511567


 70%|███████   | 7/10 [08:42<03:44, 74.72s/it]

Epoch: 6	Loss: 0.07663390522615778


 80%|████████  | 8/10 [09:57<02:29, 74.73s/it]

Epoch: 7	Loss: 0.07188544080903132


 90%|█████████ | 9/10 [11:12<01:14, 74.73s/it]

Epoch: 8	Loss: 0.06619576180560721


100%|██████████| 10/10 [12:27<00:00, 74.72s/it]

Epoch: 9	Loss: 0.06196797795386778





In [20]:
model_untrained = RobertaForSequenceClassification.from_pretrained('roberta-base', config=config).to(device)

x_train = []
for _, row in traindata.iterrows():
    x_train.append(row['sentence1'])
    x_train.append(row['sentence2'])

input_ids, attention_masks = convert_sentences_to_features(x_train, tokenizer, MAX_LEN)
y_train = torch.tensor(traindata['score'], dtype=torch.float).to(device)
trainset = TensorDataset(input_ids, attention_masks, y_train)
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=False)

y_true, y_pred = [], []
with torch.no_grad():
    for batch in trainloader:
        input_ids, attention_masks, labels = tuple(t.to(device) for t in batch)
        outputs = model_untrained(input_ids, attention_mask=attention_masks, labels=labels)
        y_true.extend(labels.tolist())
        y_pred.extend([row[0] for row in outputs[1].tolist()])

corr = pearson_corr(y_true, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: -0.02


In [16]:
x_train = []
for _, row in traindata.iterrows():
    x_train.append(row['sentence1'])
    x_train.append(row['sentence2'])

input_ids, attention_masks = convert_sentences_to_features(x_train, tokenizer, MAX_LEN)
y_train = torch.tensor(traindata['score'], dtype=torch.float).to(device)
trainset = TensorDataset(input_ids, attention_masks, y_train)
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=False)

y_true, y_pred = [], []
with torch.no_grad():
    for batch in trainloader:
        input_ids, attention_masks, labels = tuple(t.to(device) for t in batch)
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        y_true.extend(labels.tolist())
        y_pred.extend([row[0] for row in outputs[1].tolist()])

corr = pearson_corr(y_true, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.61


In [9]:
model_untrained = RobertaForSequenceClassification.from_pretrained('roberta-base', config=config).to(device)

x_val = []
for _, row in valdata.iterrows():
    x_val.append(row['sentence1'])
    x_val.append(row['sentence2'])

input_ids, attention_masks = convert_sentences_to_features(x_val, tokenizer, MAX_LEN)
y_val = torch.tensor(valdata['score'], dtype=torch.float).to(device)
valset = TensorDataset(input_ids, attention_masks, y_val)
valloader = DataLoader(valset, batch_size=BATCH_SIZE, shuffle=False)

y_true, y_pred = [], []
with torch.no_grad():
    for batch in valloader:
        input_ids, attention_masks, labels = tuple(t.to(device) for t in batch)
        outputs = model_untrained(input_ids, attention_mask=attention_masks, labels=labels)
        y_true.extend(labels.tolist())
        y_pred.extend([row[0] for row in outputs[1].tolist()])

corr = pearson_corr(y_true, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.03


In [10]:
x_val = []
for _, row in valdata.iterrows():
    x_val.append(row['sentence1'])
    x_val.append(row['sentence2'])

input_ids, attention_masks = convert_sentences_to_features(x_val, tokenizer, MAX_LEN)
y_val = torch.tensor(valdata['score'], dtype=torch.float).to(device)
valset = TensorDataset(input_ids, attention_masks, y_val)
valloader = DataLoader(valset, batch_size=BATCH_SIZE, shuffle=False)

y_true, y_pred = [], []
with torch.no_grad():
    for batch in valloader:
        input_ids, attention_masks, labels = tuple(t.to(device) for t in batch)
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        y_true.extend(labels.tolist())
        y_pred.extend([row[0] for row in outputs[1].tolist()])

corr = pearson_corr(y_true, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.12


In [11]:
model_untrained = RobertaForSequenceClassification.from_pretrained('roberta-base', config=config).to(device)

x_test = []
for _, row in testdata.iterrows():
    x_test.append(row['sentence1'])
    x_test.append(row['sentence2'])

input_ids, attention_masks = convert_sentences_to_features(x_test, tokenizer, MAX_LEN)
y_test = torch.tensor(testdata['score'], dtype=torch.float).to(device)

testset = TensorDataset(input_ids, attention_masks, y_test)
testloader = DataLoader(valset, batch_size=BATCH_SIZE, shuffle=False)

y_true, y_pred = [], []
with torch.no_grad():
    for batch in testloader:
        input_ids, attention_masks, labels = tuple(t.to(device) for t in batch)
        outputs = model_untrained(input_ids, attention_mask=attention_masks, labels=labels)
        y_true.extend(labels.tolist())
        y_pred.extend([row[0] for row in outputs[1].tolist()])

corr = pearson_corr(y_true, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.03


In [12]:
x_test = []
for _, row in testdata.iterrows():
    x_test.append(row['sentence1'])
    x_test.append(row['sentence2'])

input_ids, attention_masks = convert_sentences_to_features(x_test, tokenizer, MAX_LEN)
y_test = torch.tensor(testdata['score'], dtype=torch.float)

testset = TensorDataset(input_ids, attention_masks, y_test)
testloader = DataLoader(valset, batch_size=BATCH_SIZE, shuffle=False)

y_true, y_pred = [], []
with torch.no_grad():
    for batch in testloader:
        input_ids, attention_masks, labels = tuple(t for t in batch)
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        y_true.extend(labels.tolist())
        y_pred.extend([row[0] for row in outputs[1].tolist()])

corr = pearson_corr(y_true, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.11


In [None]:
tokenizerpath = 'models/cl-tokenizer.pt'
modelpath = 'models/cl-roberta.pt'
model.save_pretrained(modelpath)
tokenizer.save_pretrained(tokenizerpath)