In [1]:
import sys
import torch
import logging
import warnings
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from scipy.stats import pearsonr
from torch.utils.data import TensorDataset, DataLoader
from transformers import RobertaTokenizer, RobertaConfig, RobertaForSequenceClassification

logging.disable(logging.WARNING)
warnings.filterwarnings('ignore')
np.set_printoptions(threshold=sys.maxsize)

In [2]:
# Load all the data
trainpath = 'data/train.csv'
testpath = 'data/test.csv'
valpath = 'data/validation.csv'

traindata = pd.read_csv(trainpath)
testdata = pd.read_csv(testpath)
valdata = pd.read_csv(valpath)

traindata['score'] = traindata['score'].apply(lambda x: (x)/5.0)
testdata['score'] = testdata['score'].apply(lambda x: (x)/5.0)
valdata['score'] = valdata['score'].apply(lambda x: (x)/5.0)

In [3]:
def pearson_corr(y_true, y_pred):
    corr, _ = pearsonr(y_true, y_pred)
    return corr

In [4]:
# compute the sequence length using 95% samples logic
lengths = []
for _, row in traindata.iterrows():
    lengths.append(len(row['sentence1']))
    lengths.append(len(row['sentence2']))

lengths.sort()
MAX_LEN = lengths[int(0.95*len(lengths))]
print(MAX_LEN)

133


In [None]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def convert_sentences_to_features(sentences, tokenizer, max_len):
    input_ids = []
    attention_masks = []

    for i in range(0, len(sentences), 2):
        encoded_dict = tokenizer.encode_plus(sentences[i], sentences[i+1], add_special_tokens=True, max_length=max_len, truncation=True, padding='max_length', return_attention_mask=True, return_tensors='pt', truncation_strategy='longest_first')
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

### Fine-Tune BERT Model

In [6]:
# Hyperparameters
EPOCHS = 10
BATCH_SIZE = 32
LEARN_RATE = 1e-5

In [7]:
x_train = []
for _, row in traindata.iterrows():
    x_train.append(row['sentence1'])
    x_train.append(row['sentence2'])

input_ids, attention_masks = convert_sentences_to_features(x_train, tokenizer, MAX_LEN)
y_train = torch.tensor(traindata['score'].values, dtype=torch.float).unsqueeze(1)

trainset = TensorDataset(input_ids, attention_masks, y_train)
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)

config = RobertaConfig.from_pretrained(
    'roberta-base',
    num_labels=1,
    output_attentions=False,
    output_hidden_states=False,
)

model = RobertaForSequenceClassification.from_pretrained('roberta-base', config=config)
optimizer = optim.Adam(model.parameters(), lr=LEARN_RATE, betas=(0.9, 0.999))
loss_fn = nn.MSELoss()

for epoch in range(EPOCHS):
    model.train()
    t_loss = 0
    for batch in trainloader:
        input_ids, attention_masks, labels = tuple(t for t in batch)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        t_loss += loss.item()
    print(f'Epoch: {epoch}\tLoss: {t_loss / len(trainloader)}')

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Epoch: 0	Loss: 0.06735592926335003
Epoch: 1	Loss: 0.039815687605490285
Epoch: 2	Loss: 0.03456575179265605
Epoch: 3	Loss: 0.03075480734825962
Epoch: 4	Loss: 0.028129169774345226
Epoch: 5	Loss: 0.025685256652327047
Epoch: 6	Loss: 0.022824762264887492
Epoch: 7	Loss: 0.021502806158322427
Epoch: 8	Loss: 0.019411155214119288
Epoch: 9	Loss: 0.017693477325762312


In [8]:
model_untrained = RobertaForSequenceClassification.from_pretrained('roberta-base', config=config)

x_val = []
for _, row in valdata.iterrows():
    x_val.append(row['sentence1'])
    x_val.append(row['sentence2'])

input_ids, attention_masks = convert_sentences_to_features(x_val, tokenizer, MAX_LEN)
y_val = torch.tensor(valdata['score'], dtype=torch.float)
valset = TensorDataset(input_ids, attention_masks, y_val)
valloader = DataLoader(valset, batch_size=BATCH_SIZE, shuffle=False)

y_true, y_pred = [], []
with torch.no_grad():
    for batch in valloader:
        input_ids, attention_masks, labels = tuple(t for t in batch)
        outputs = model_untrained(input_ids, attention_mask=attention_masks, labels=labels)
        y_true.extend(labels.tolist())
        y_pred.extend([row[0] for row in outputs[1].tolist()])

corr = pearson_corr(y_true, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: -0.02


In [9]:
x_val = []
for _, row in valdata.iterrows():
    x_val.append(row['sentence1'])
    x_val.append(row['sentence2'])

input_ids, attention_masks = convert_sentences_to_features(x_val, tokenizer, MAX_LEN)
y_val = torch.tensor(valdata['score'], dtype=torch.float)
valset = TensorDataset(input_ids, attention_masks, y_val)
valloader = DataLoader(valset, batch_size=BATCH_SIZE, shuffle=False)

y_true, y_pred = [], []
with torch.no_grad():
    for batch in valloader:
        input_ids, attention_masks, labels = tuple(t for t in batch)
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        y_true.extend(labels.tolist())
        y_pred.extend([row[0] for row in outputs[1].tolist()])

corr = pearson_corr(y_true, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.91


In [10]:
model_untrained = RobertaForSequenceClassification.from_pretrained('roberta-base', config=config)

x_test = []
for _, row in testdata.iterrows():
    x_test.append(row['sentence1'])
    x_test.append(row['sentence2'])

input_ids, attention_masks = convert_sentences_to_features(x_test, tokenizer, MAX_LEN)
y_test = torch.tensor(testdata['score'], dtype=torch.float)

testset = TensorDataset(input_ids, attention_masks, y_test)
testloader = DataLoader(valset, batch_size=BATCH_SIZE, shuffle=False)

y_true, y_pred = [], []
with torch.no_grad():
    for batch in testloader:
        input_ids, attention_masks, labels = tuple(t for t in batch)
        outputs = model_untrained(input_ids, attention_mask=attention_masks, labels=labels)
        y_true.extend(labels.tolist())
        y_pred.extend([row[0] for row in outputs[1].tolist()])

corr = pearson_corr(y_true, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: -0.09


In [11]:
x_test = []
for _, row in testdata.iterrows():
    x_test.append(row['sentence1'])
    x_test.append(row['sentence2'])

input_ids, attention_masks = convert_sentences_to_features(x_test, tokenizer, MAX_LEN)
y_test = torch.tensor(testdata['score'], dtype=torch.float)

testset = TensorDataset(input_ids, attention_masks, y_test)
testloader = DataLoader(valset, batch_size=BATCH_SIZE, shuffle=False)

y_true, y_pred = [], []
with torch.no_grad():
    for batch in testloader:
        input_ids, attention_masks, labels = tuple(t for t in batch)
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        y_true.extend(labels.tolist())
        y_pred.extend([row[0] for row in outputs[1].tolist()])

corr = pearson_corr(y_true, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.91


In [None]:
tokenizerpath = 'models/tokenizer.pt'
modelpath = 'models/roberta.pt'
model.save_pretrained(modelpath)
tokenizer.save_pretrained(tokenizerpath)