In [1]:
!pip install transformers



In [2]:
import os
import time
import random
import numpy as np
import pandas as pd

from sklearn.metrics import roc_curve, auc

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import DistilBertForSequenceClassification
from transformers import AdamW
from transformers import DistilBertTokenizerFast

In [3]:
RANDOM_SEED = 2020
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

DATA_PATH = "/content/"

## Dataset

In [4]:
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [5]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_df = pd.read_csv(os.path.join(DATA_PATH, "sat_train.tsv"), sep="\t")
valid_df = pd.read_csv(os.path.join(DATA_PATH, "sat_valid.tsv"), sep="\t")
test_df = pd.read_csv(os.path.join(DATA_PATH, "sat_test.tsv"), sep="\t")

train_encodings = tokenizer(train_df["context"].values.tolist(), truncation=True, padding=True)
valid_encodings = tokenizer(valid_df["context"].values.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_df["context"].values.tolist(), truncation=True, padding=True)

train_dataset = CustomDataset(train_encodings, train_df["label"].values)
valid_dataset = CustomDataset(valid_encodings, valid_df["label"].values)
test_dataset = CustomDataset(test_encodings, test_df["label"].values)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

## Test function

In [6]:
def train(model: nn.Module, loader: DataLoader, optimizer: torch.optim.Optimizer, device: str):
    model.train()

    epoch_loss = 0

    for _, batch in enumerate(loader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()


    return epoch_loss / len(loader)


def evaluate(model: nn.Module, loader: DataLoader, device: str):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for _, batch in enumerate(loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            epoch_loss += loss.item()

    return epoch_loss / len(loader)


def test(model: nn.Module, loader: DataLoader, device: str):

    with torch.no_grad():
        y_real = []
        y_pred = []
        model.eval()

        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            output = model(input_ids, attention_mask=attention_mask)[0]
            y_pred += [output.cpu()]
            y_real += [batch["labels"]]
            
        y_real = torch.cat(y_real)
        y_pred = torch.cat(y_pred)[:,1]

    fpr, tpr, _ = roc_curve(y_real, y_pred)
    auroc = auc(fpr, tpr)

    return auroc


def epoch_time(start_time: int, end_time: int):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

## Before fine-tuning

In [7]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

before_tuning_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
_ = model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

## Fine tuning

In [8]:
N_EPOCHS = 20
optimizer = AdamW(model.parameters(), lr=5e-5)


for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_loader, optimizer, device)
    valid_loss = evaluate(model, valid_loader, device)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f"Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.5f}")
    print(f"\t Val. Loss: {valid_loss:.5f}")

test_auroc = test(model, test_loader, device)
print(f'| Test AUROC: {test_auroc:.5f}')

Epoch: 01 | Time: 0m 0s
	Train Loss: 0.53651
	 Val. Loss: 0.36603
Epoch: 02 | Time: 0m 0s
	Train Loss: 0.41014
	 Val. Loss: 0.34873
Epoch: 03 | Time: 0m 0s
	Train Loss: 0.32635
	 Val. Loss: 0.44384
Epoch: 04 | Time: 0m 0s
	Train Loss: 0.26509
	 Val. Loss: 0.56301
Epoch: 05 | Time: 0m 0s
	Train Loss: 0.11553
	 Val. Loss: 0.37966
Epoch: 06 | Time: 0m 0s
	Train Loss: 0.13106
	 Val. Loss: 0.35931
Epoch: 07 | Time: 0m 0s
	Train Loss: 0.09925
	 Val. Loss: 0.37978
Epoch: 08 | Time: 0m 0s
	Train Loss: 0.10690
	 Val. Loss: 0.53429
Epoch: 09 | Time: 0m 0s
	Train Loss: 0.08450
	 Val. Loss: 0.33815
Epoch: 10 | Time: 0m 0s
	Train Loss: 0.06087
	 Val. Loss: 0.39603
Epoch: 11 | Time: 0m 0s
	Train Loss: 0.04214
	 Val. Loss: 0.37322
Epoch: 12 | Time: 0m 0s
	Train Loss: 0.03168
	 Val. Loss: 0.39570
Epoch: 13 | Time: 0m 0s
	Train Loss: 0.02711
	 Val. Loss: 1.00184
Epoch: 14 | Time: 0m 0s
	Train Loss: 0.02792
	 Val. Loss: 1.09086
Epoch: 15 | Time: 0m 0s
	Train Loss: 0.02373
	 Val. Loss: 1.27300
Epoch: 16 

In [9]:
_ = model.cpu()

before_test_auroc = test(before_tuning_model, test_loader, "cpu")
test_auroc = test(model, test_loader, "cpu")

print(f'SAT Dataset Test AUROC: {before_test_auroc:.5f}')
print(f'SAT Dataset Test AUROC: {test_auroc:.5f}')

SAT Dataset Test AUROC: 0.76923
SAT Dataset Test AUROC: 0.84615


In [10]:
def predict_problem(model, problem, device):
    sat_encodings = [tokenizer(sentence) for sentence in problem]
    with torch.no_grad():
        outputs = []
        for sat_encoding in sat_encodings:
            input_ids = torch.LongTensor([sat_encoding["input_ids"]]).to(device)
            attention_mask = torch.LongTensor(sat_encoding["attention_mask"]).to(device)
            output = model(input_ids, attention_mask=attention_mask)
            outputs += [output[0]]
        output = torch.cat(outputs)[:,1]
    return output.tolist()


def predict_problem_with_models(model_list, problem):
    scores = {}
    for model_name, classifier in model_list:
        score = predict_problem(classifier, problem, "cpu")
        scores[model_name] = score

    score_df = pd.DataFrame(scores).T
    score_df.columns = [f"answer_{i}_score" for i in range(1,6)]
    selected_answer = pd.Series(np.argmin(score_df.values, 1)+1, index=score_df.index, name="selected_answer")
    return pd.concat([selected_answer, score_df], 1)

In [11]:
model_list = [
    ("before_tuning_BERT", before_tuning_model),
    ("after_tuning_BERT", model),
]

In [12]:
problem_1 = [ 
    "Competitive activities can be more than just performance showcases which the best is recognized and the rest are overlooked.",
    "The provision of timely, constructive feedback to participants on performance is an asset that some competitions and contests offer.",
    "The provision of that type of feedback can be interpreted as shifting the emphasis to demonstrating superior performance but not necessarily excellence.",
    "The emphasis on superiority is what we typically see as fostering a detrimental effect of competition.",
    "Information about performance can be very helpful, not only to the participant who does not win or place but also to those who do.",
]
problem_1_label = [0, 1, 1, 1, 1]

In [13]:
predict_problem_with_models(model_list, problem_1)

Unnamed: 0,selected_answer,answer_1_score,answer_2_score,answer_3_score,answer_4_score,answer_5_score
before_tuning_BERT,1,0.138504,0.169909,0.157586,0.144427,0.157007
after_tuning_BERT,1,0.199188,3.37938,3.4991,3.51126,3.327176


In [14]:
problem_2 = [ 
    "People from more individualistic cultural contexts tend to be motivated to maintain self-focused agency or control 1 as these serve as the basis of one’s self-worth.",
    "With this form of agency comes the belief that individual successes 2 depending primarily on one’s own abilities and actions, and thus, whether by influencing the environment or trying to accept one’s circumstances, the use of control ultimately centers on the individual.",
    "The independent self may be more 3 driven to cope by appealing to a sense of agency or control.",
    "Research has shown 4 that East Asians prefer to receive, but not seek, more social support rather than seek personal control in certain cases.",
    "Therefore, people 5 who hold a more interdependent self-construal may prefer to cope in a way that promotes harmony in relationships.",
]
problem_2_label = [1, 0, 1, 1, 1]

In [15]:
predict_problem_with_models(model_list, problem_2)

Unnamed: 0,selected_answer,answer_1_score,answer_2_score,answer_3_score,answer_4_score,answer_5_score
before_tuning_BERT,5,0.122632,0.116739,0.130244,0.120807,0.113969
after_tuning_BERT,3,3.455905,3.372821,2.742554,3.573965,3.516525
