In [1]:
# Imports

import codecs
import random
import re

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import BertTokenizer

In [2]:
# Switches controlling which extra examples are appended when the train /
# validation sets are assembled further down.
# ADD_UNEDITED: append the unedited training headlines (each scored 0).
ADD_UNEDITED = True
# ADD_TF: append the edited headlines loaded from train_funlines.csv.
ADD_TF = False
# ADD_TF_UNEDITED: append the unedited train_funlines.csv headlines (each scored 0).
ADD_TF_UNEDITED = False
# VAL_ADD_UNEDITED: append the unedited dev headlines (each scored 0) to validation.
VAL_ADD_UNEDITED = False

# Checkpoint name handed to both BertTokenizer and BertRegressor .from_pretrained().
PRETRAINED_WEIGHTS = 'bert-base-cased'

# Used as both the per-device train and per-device eval batch size.
BATCH_SIZE = 32
# Passed to TrainingArguments as num_train_epochs.
NUM_EPOCHS = 6
# Number of steps between log lines and between evaluation runs.
VAL_STEPS = 100

# Directory the fine-tuned model is saved to and later reloaded from.
MODEL_PATH = './models/funniness_bert_finetuned'

In [3]:
SEED = 1

# Seed every global RNG this notebook can reach, not only torch's: Python's
# `random` and NumPy's legacy global generator are separate sources of
# randomness and would otherwise differ from run to run.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Safe without a GPU: the call is silently ignored when CUDA is unavailable.
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and disable its auto-tuner, which may
# otherwise select different algorithms on different runs.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Use the first GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

In [4]:
# Load the four task-1 CSV splits. Each frame is later passed to
# extract_data(), which reads its 'original', 'edit' and 'meanGrade' columns.
train_df = pd.read_csv('data/task-1/train.csv')
# Additional "funlines" training data; only used when ADD_TF / ADD_TF_UNEDITED are set.
tf_df = pd.read_csv('data/task-1/train_funlines.csv')
# The dev split serves as the validation set.
dev_df = pd.read_csv('data/task-1/dev.csv')
test_df = pd.read_csv('data/task-1/test.csv')

In [5]:
def extract_data(df):
    """Split a task frame into unedited headlines, edited headlines and grades.

    The 'original' column marks the word to be replaced as ``<word/>``.

    Args:
        df: DataFrame with the columns 'original', 'edit' and 'meanGrade'.

    Returns:
        A tuple ``(original_data, edited_data, grade_data)``:
        * original_data: Series of headlines with the ``<``/``/>`` markers
          removed and the marked word kept.
        * edited_data: Series of headlines with the marked span replaced by
          the matching value from the 'edit' column.
        * grade_data: the 'meanGrade' column, returned unchanged.
    """
    # Match one marked span at a time. Excluding '<' and '>' from the span
    # keeps the match from starting at an unrelated earlier '<' in the headline.
    marked_span = re.compile(r'<([^<>]*)/>')

    raw_data = df['original']
    edit_data = df['edit']

    original_data = pd.Series([marked_span.sub(r'\1', s) for s in raw_data])
    # A callable replacement inserts the edit word literally; passing the
    # word as a string would make `re` interpret backslashes in it as
    # template escapes (e.g. '\1', '\n') or raise on unknown ones.
    edited_data = pd.Series([
        marked_span.sub(lambda _match, word=e: word, s)
        for s, e in zip(raw_data, edit_data)
    ])
    grade_data = df['meanGrade']
    return original_data, edited_data, grade_data

In [6]:
# Original training set
def _zero_scores_like(series):
    """Return a Series holding one integer 0 per element of `series`."""
    return pd.Series([0] * len(series))


# Training split: unedited headlines are assigned a score of 0.
train_unedited_data, train_edited_data, train_edited_score = extract_data(train_df)
train_unedited_score = _zero_scores_like(train_unedited_data)

# Funlines training split, handled the same way.
tf_unedited_data, tf_edited_data, tf_edited_score = extract_data(tf_df)
tf_unedited_score = _zero_scores_like(tf_unedited_data)

# Dev split, used for validation.
val_unedited_data, val_edited_data, val_edited_score = extract_data(dev_df)
val_unedited_score = _zero_scores_like(val_unedited_data)

# Test split: only the edited headlines and their grades are kept.
_, test_edited_data, test_edited_score = extract_data(test_df)

In [7]:
# Start from the edited headlines and their grades; the flags below decide
# which extra examples are appended.
full_train_data = train_edited_data
full_train_score = train_edited_score

full_val_data = val_edited_data
full_val_score = val_edited_score

# pd.concat replaces Series.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. With ignore_index=True the result gets a fresh
# 0..n-1 index, exactly as Series.append(..., ignore_index=True) produced.
if ADD_UNEDITED:
    full_train_data = pd.concat([full_train_data, train_unedited_data], ignore_index=True)
    full_train_score = pd.concat([full_train_score, train_unedited_score], ignore_index=True)

if ADD_TF:
    full_train_data = pd.concat([full_train_data, tf_edited_data], ignore_index=True)
    full_train_score = pd.concat([full_train_score, tf_edited_score], ignore_index=True)

if ADD_TF_UNEDITED:
    full_train_data = pd.concat([full_train_data, tf_unedited_data], ignore_index=True)
    full_train_score = pd.concat([full_train_score, tf_unedited_score], ignore_index=True)

if VAL_ADD_UNEDITED:
    full_val_data = pd.concat([full_val_data, val_unedited_data], ignore_index=True)
    full_val_score = pd.concat([full_val_score, val_unedited_score], ignore_index=True)

In [8]:
def _text_score_frame(texts, scores):
    """Place two Series side by side as a frame with columns 'text' and 'score'."""
    frame = pd.concat([texts, scores], axis=1)
    frame.columns = ['text', 'score']
    return frame


# One two-column frame per split; these are the inputs to Task1Dataset / evaluate().
full_train_dataset = _text_score_frame(full_train_data, full_train_score)
full_val_dataset = _text_score_frame(full_val_data, full_val_score)
full_test_dataset = _text_score_frame(test_edited_data, test_edited_score)

In [9]:
# Tokenizer for the same checkpoint the model is initialised from (PRETRAINED_WEIGHTS).
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_WEIGHTS)

In [10]:
class Task1Dataset(Dataset):
    """Dataset of (headline, score) pairs that tokenizes lazily, per batch.

    Individual items are raw text plus score; tokenization and padding are
    done in `collate_fn_padd`, which is meant to be used as the collate
    function for batches drawn from this dataset.

    Args:
        tokenizer: callable invoked as ``tokenizer(list_of_str, **options)``
            whose result supports item assignment.
        input_set: object indexable by 'text' and 'score', each giving a
            sequence that supports ``.iloc`` and ``len`` (e.g. a DataFrame).
    """

    def __init__(self, tokenizer, input_set):
        self.tokenizer = tokenizer
        # Despite the "_train" suffix these hold whichever split was passed in.
        self.x_train, self.y_train = input_set['text'], input_set['score']

    def __len__(self):
        """Number of examples, taken from the score column."""
        return len(self.y_train)

    def __getitem__(self, i):
        """Return the i-th example (positional) as ``{'text': ..., 'score': ...}``."""
        return {'text': self.x_train.iloc[i], 'score': self.y_train.iloc[i]}

    def collate_fn_padd(self, batch):
        """Tokenize a list of items together and attach their scores.

        Args:
            batch: list of dicts as produced by `__getitem__`.

        Returns:
            The tokenizer's output with an extra 'score' entry holding the
            batch scores as a tensor.
        """
        sentences, scores = [], []
        for sample in batch:
            sentences.append(sample['text'])
            scores.append(sample['score'])

        # Pad to the longest sentence in the batch; cut anything past 128 tokens.
        tokenizer_options = dict(
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=128,
        )
        encodings = self.tokenizer(sentences, **tokenizer_options)
        encodings['score'] = torch.tensor(scores)
        return encodings

In [11]:
train_dataset = Task1Dataset(tokenizer, full_train_dataset)
val_dataset = Task1Dataset(tokenizer, full_val_dataset)

print("Datasets created.")

Datasets created.


In [12]:
from transformers import BertModel, BertPreTrainedModel


class BertRegressor(BertPreTrainedModel):
    """BERT encoder followed by dropout and a one-unit linear layer.

    `forward` returns the raw output of that linear layer (one value per
    input sequence); no loss is computed inside the model.
    """

    def __init__(self, config):
        super().__init__(config)

        self.bert = BertModel(config)

        # Must stay an nn.Sequential stored under `linear`: the parameter
        # names (`linear.1.weight`, ...) are what checkpoints are keyed on.
        head_layers = [
            nn.Dropout(0.2),
            nn.Linear(config.hidden_size, 1),
        ]
        self.linear = nn.Sequential(*head_layers)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None):
        """Encode the inputs with BERT and map them to one score each.

        All arguments except `labels` are forwarded to the wrapped BertModel
        unchanged; `labels` is accepted but not used here.

        Returns:
            The output of the dropout + linear head applied to element 1 of
            the encoder's output.
        """
        encoder_kwargs = dict(
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        encoder_outputs = self.bert(input_ids, **encoder_kwargs)

        # Element 1 is presumably BertModel's pooled output — confirm against
        # the installed transformers version.
        pooled = encoder_outputs[1]
        return self.linear(pooled)

In [13]:
from transformers import Trainer, TrainingArguments

class BertRegressorTrainer(Trainer):
    """Trainer that scores the model's regression output against the 'score' entry."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the regression loss for one batch.

        Args:
            model: the model being trained or evaluated; called as ``model(**inputs)``.
            inputs: batch mapping; its 'score' entry is removed and used as the target.
            return_outputs: when true, also return the model outputs.
            num_items_in_batch: accepted for compatibility with transformers
                releases whose Trainer passes this argument; not used.

        Returns:
            The MSE loss when `return_outputs` is false, otherwise a tuple
            ``(sqrt(MSE), outputs)``.

        NOTE(review): the two branches report different quantities (MSE vs
        RMSE). That asymmetry is kept unchanged here; confirm it is
        intentional before comparing losses obtained through the two paths.
        """
        labels = inputs.pop('score')
        outputs = model(**inputs)

        predictions = outputs.view(-1)
        # Follow the predictions' device instead of a notebook-level global,
        # so the loss works wherever the Trainer placed the model.
        targets = labels.to(device=predictions.device, dtype=torch.float)
        loss = nn.MSELoss()(predictions, targets)

        return (loss.sqrt(), outputs) if return_outputs else loss

In [14]:
# Start from the pretrained checkpoint; the regression head has no pretrained
# weights there and is newly initialised (see the loading message below).
model = BertRegressor.from_pretrained(PRETRAINED_WEIGHTS)

# Log and evaluate every VAL_STEPS steps, and reload the checkpoint with the
# best 'eval_loss' once training finishes.
training_args = TrainingArguments(
    output_dir = './experiment/bert_regressor',
    overwrite_output_dir = True,
    learning_rate = 0.0001,
    logging_steps = VAL_STEPS,
    evaluation_strategy = 'steps',
    # 'score' is the key Task1Dataset.collate_fn_padd adds to each batch.
    label_names = ['score'],
    eval_steps = VAL_STEPS,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size = BATCH_SIZE,
    num_train_epochs = NUM_EPOCHS,
    load_best_model_at_end = True,
    metric_for_best_model = 'eval_loss'
)

# The train dataset's collate method is bound to the same tokenizer that the
# validation dataset was built with, so it is used for both splits.
trainer = BertRegressorTrainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    data_collator = train_dataset.collate_fn_padd
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertRegressor: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertRegressor from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertRegressor from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertRegressor were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['linear.1.weight', 'l

In [15]:
# Fine-tune; the table below reports training and validation loss every VAL_STEPS steps.
trainer.train()

Step,Training Loss,Validation Loss,Runtime,Samples Per Second
100,0.4219,0.642403,5.3098,455.577
200,0.3275,0.687756,5.3348,453.436
300,0.3311,0.63213,5.3312,453.742
400,0.3332,0.683219,5.3412,452.893
500,0.2936,0.628278,5.3399,453.008
600,0.3005,0.576745,5.3482,452.301
700,0.2326,0.610686,5.3527,451.918
800,0.2218,0.622573,5.3369,453.26
900,0.2167,0.629465,5.3432,452.726
1000,0.2335,0.688966,5.3758,449.98


TrainOutput(global_step=3624, training_loss=0.14549282224433574, metrics={'train_runtime': 1444.6403, 'train_samples_per_second': 2.509, 'total_flos': 2302834835745792, 'epoch': 6.0})

In [16]:
# Persist the trainer's current model to MODEL_PATH so it can be reloaded below.
trainer.save_model(MODEL_PATH)

In [17]:
# Rebind `model` to a fresh copy loaded from the saved directory; the cells
# below evaluate this reloaded copy.
model = BertRegressor.from_pretrained(MODEL_PATH)

In [18]:
def compute_metrics(target, output):
    """Return ``(mse, rmse)`` between `output` and `target` as Python floats.

    Args:
        target: tensor of reference values.
        output: tensor of predictions, broadcastable against `target`.
    """
    residual = output - target
    mse = (residual ** 2).mean()
    return mse.item(), mse.sqrt().item()


def predict_funniness(text, tokenizer, model, batch_size=None):
    """Run the model on one sentence or a list of sentences.

    The model is switched to eval mode. Gradient tracking is NOT disabled
    here; wrap the call in ``torch.no_grad()`` when gradients are not needed.

    Args:
        text: a single string or a sequence of strings.
        tokenizer: callable invoked as ``tokenizer(text, return_tensors='pt',
            padding=True, truncation=True, max_length=128)``; its result must
            expose ``.items()`` yielding tensors.
        model: torch module called with the tokenizer output as keyword arguments.
        batch_size: optional cap on sentences per forward pass. ``None``
            (the default) sends everything through in a single pass.

    Returns:
        The model output; with batching, the per-batch outputs concatenated
        along dimension 0. The result lives on the model's device.

    Raises:
        ValueError: if `batch_size` is given and smaller than 1.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size}')

    model.eval()

    # Inputs must sit on the same device as the weights; the tokenizer always
    # produces CPU tensors, which breaks when the model lives on a GPU.
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device

    def _forward(chunk):
        """Tokenize one chunk, move it next to the model and run a forward pass."""
        encodings = tokenizer(chunk, return_tensors='pt', padding=True, truncation=True, max_length=128)
        if target_device is not None:
            encodings = {name: tensor.to(target_device) for name, tensor in encodings.items()}
        return model(**encodings)

    if batch_size is None or isinstance(text, str) or len(text) <= batch_size:
        return _forward(text)

    # Bounded-size batches keep peak memory independent of the dataset size.
    sentences = list(text)
    outputs = [
        _forward(sentences[start:start + batch_size])
        for start in range(0, len(sentences), batch_size)
    ]
    return torch.cat(outputs, dim=0)


def evaluate(model, tokenizer, dataset, testing=False):
    """Print MSE and RMSE of the model's predictions on `dataset`.

    Args:
        model: model handed to `predict_funniness`.
        tokenizer: tokenizer handed to `predict_funniness`.
        dataset: object with 'text' and 'score' columns supporting ``.tolist()``.
        testing: selects the label in the printed line ("test set" vs "val set").

    Returns:
        None; the metrics are only printed.
    """
    sentences = dataset['text'].tolist()
    targets = torch.Tensor(dataset['score'].tolist())

    # Inference only: no gradients are needed for the forward pass or metrics.
    with torch.no_grad():
        predictions = predict_funniness(sentences, tokenizer, model)
        mse, rmse = compute_metrics(targets, predictions.view(-1))

    print(f'Eval on {"test set"if testing else "val set"} MSE:{mse} RMSE: {rmse}')

In [19]:
# Report MSE / RMSE of the reloaded model on the test split.
evaluate(model, tokenizer, full_test_dataset, testing=True)

Eval on test set MSE:0.3338930606842041 RMSE: 0.5778347849845886


In [20]:
# Run inference on the validation headlines and keep the raw predictions in
# `pred` for inspection. These are the same steps evaluate() performs, minus
# the metric computation; `score` is built here but not used in this cell.
with torch.no_grad():
    text = full_val_dataset['text']
    score = full_val_dataset['score'].tolist()
    score = torch.Tensor(score)

    pred = predict_funniness(text.tolist(), tokenizer, model)

In [21]:
# Mean predicted score over the validation headlines.
pred.mean()

tensor(0.8213)