# TEAM 4 Group Project: BRAINTEASER: A Novel Task Defying Common Sense

In [None]:
import numpy as np
import torch
import transformers
import datasets
from datasets import load_dataset, load_metric
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
from transformers import AutoTokenizer
import wandb

# Select the compute device once. Fall back to the CPU so that `device` is
# always defined, even on machines without a CUDA-capable GPU (previously the
# name was only bound when CUDA was available).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# The .npy dumps loaded below store Python dicts, which np.load only accepts
# with allow_pickle=True, so make that the default for this notebook.
# The patch is guarded by a marker attribute: wrapping the wrapper again on a
# cell re-run would pass allow_pickle twice and make the next np.load call
# raise TypeError, so the cell is now safe to execute any number of times.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
if not getattr(np.load, "_allow_pickle_default", False):
    np_load_old = np.load

    def _load_allowing_pickle(*a, _load=np_load_old, **k):
        """Call the original np.load with allow_pickle defaulting to True."""
        # setdefault (rather than a hard-coded keyword) lets callers still
        # pass allow_pickle explicitly without a duplicate-keyword error.
        k.setdefault("allow_pickle", True)
        return _load(*a, **k)

    _load_allowing_pickle._allow_pickle_default = True
    np.load = _load_allowing_pickle

## Data Preprocessing

In [50]:
# Locations of the sentence-puzzle (SP) and word-puzzle (WP) .npy dumps,
# all resolved relative to one data directory.
data_path = "./data/"
sp_data_path = f"{data_path}SP-train.npy"
wp_data_path = f"{data_path}WP-train.npy"
sp_test_path = f"{data_path}SP_eval_data_for_practice.npy"
wp_test_path = f"{data_path}WP_eval_data_for_practice.npy"

In [76]:
# Read the four .npy dumps (uses the pickle-enabled np.load configured in the
# first cell), then report how many examples each one holds, one per line.
sp_train, wp_train, sp_test, wp_test = (
    np.load(path)
    for path in (sp_data_path, wp_data_path, sp_test_path, wp_test_path)
)
print(len(sp_train), len(wp_train), len(sp_test), len(wp_test), sep="\n")

507
396
120
120


In [52]:
def combine_sp_wp(sp, wp):
    '''
    Merge the sp and wp collections into one new list.

    Every item of `sp` comes first, followed by every item of `wp`. The inputs
    are left untouched and the items themselves are not copied.
    '''
    return [*sp, *wp]

In [53]:
random.seed(0)
# Full SP + WP pool, used when training the final submission model.
train_data = combine_sp_wp(sp_train, wp_train)

# For our own accuracy estimate, hold out 10% of each puzzle type separately
# (same random_state for both) and merge the SP/WP pieces afterwards.
train_data_sp, val_data_sp = train_test_split(sp_train, random_state=17, test_size=0.1)
train_data_wp, val_data_wp = train_test_split(wp_train, random_state=17, test_size=0.1)
train_data_split, val_data_split = (
    combine_sp_wp(train_data_sp, train_data_wp),
    combine_sp_wp(val_data_sp, val_data_wp),
)

In [54]:
# Quick look at the combined pool: its size, then its first and last examples.
print(len(train_data), train_data[0], train_data[-1], sep="\n")

903
{'id': 'SP-0', 'question': 'Mr. and Mrs. Mustard have six daughters and each daughter has one brother. But there are only 9 people in the family, how is that possible?', 'answer': 'Each daughter shares the same brother.', 'distractor1': 'Some daughters get married and have their own family.', 'distractor2': 'Some brothers were not loved by family and moved away.', 'distractor(unsure)': 'None of above.', 'label': 1, 'choice_list': ['Some daughters get married and have their own family.', 'Each daughter shares the same brother.', 'Some brothers were not loved by family and moved away.', 'None of above.'], 'choice_order': [1, 0, 2, 3]}
{'id': 'WP-163_CR', 'question': "What kind of ice doesn't contain water?", 'answer': 'Dry ice.', 'distractor1': 'Flaked ice.', 'distractor2': 'Glacier ice.', 'distractor(unsure)': 'None of above.', 'label': 1, 'choice_list': ['Flaked ice.', 'Dry ice.', 'Glacier ice.', 'None of above.'], 'choice_order': [1, 0, 2, 3]}


In [55]:
# Checkpoint name used for the tokenizer here and for the model loaded later.
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

In [56]:
def preprocess_function(data):
    """Tokenize (question, choice) pairs for a multiple-choice model.

    Each example's question is paired with every entry of its ``choice_list``.
    All pairs are tokenized in a single call to the module-level ``tokenizer``
    (with truncation, and padding to the longest pair in ``data``) and the
    result is regrouped per example.

    Args:
        data: iterable of dicts, each with a ``"question"`` string and a
            ``"choice_list"`` sequence of candidate-answer strings.

    Returns:
        A dict mapping every key produced by the tokenizer to a list with one
        entry per example; each entry is the list of per-choice sequences for
        that example, in ``choice_list`` order.
    """
    questions = []
    choices = []
    group_sizes = []
    for example in data:
        options = example["choice_list"]
        # Repeat the question once per candidate answer. The count is taken
        # from the example itself instead of assuming exactly four choices,
        # so questions and choices can never get out of step.
        questions.extend([example["question"]] * len(options))
        choices.extend(options)
        group_sizes.append(len(options))

    # Tokenize the flat lists (built with extend, which avoids the quadratic
    # cost of flattening via sum(list_of_lists, [])).
    tokenized_qa = tokenizer(questions, choices, truncation=True, padding=True)

    # Un-flatten: slice boundaries are the running totals of the group sizes.
    bounds = [0]
    for size in group_sizes:
        bounds.append(bounds[-1] + size)
    return {
        k: [v[start:stop] for start, stop in zip(bounds, bounds[1:])]
        for k, v in tokenized_qa.items()
    }

In [57]:
# Sanity check: encode a handful of examples, then decode the four choices of
# one of them back to text to eyeball the "[CLS] question [SEP] choice [SEP]" layout.
examples = train_data[:5]
# Each of 'input_ids', 'token_type_ids', 'attention_mask' is nested as (#data, 4, #feature)
features = preprocess_function(examples)
idx = 3
[tokenizer.decode(token_ids) for token_ids in features["input_ids"][idx][:4]]

['[CLS] a woman shoots her husband. then she holds him underwater for over 5 minutes. finally, she hangs him. but 5 minutes later, they both go out and enjoy a wonderful dinner together. how can this be? [SEP] the woman gets arrested for murder after dinner. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] a woman shoots her husband. then she holds him underwater for over 5 minutes. finally, she hangs him. but 5 minutes later, they both go out and enjoy a wonderful dinner together. how can this be? [SEP] the woman gets a new partner. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] a woman shoots her husband. then she holds him underwater for over 5 minutes. finally, she hangs him. but 5 minutes later, they both go out and enjoy a wonderful dinner together. how can this be? [SEP] the woman was a photographer. she shot a picture of her husband, developed it, and 

## Load Model

In [58]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
# Per-device batch size reused by the TrainingArguments cells further down.
batch_size = 16
# Load the checkpoint with a multiple-choice head on top.
model = AutoModelForMultipleChoice.from_pretrained(model_checkpoint)
# Recreate the fast tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight'

## Data Collator

In [59]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.

    Every feature is a dict whose non-label values are lists with one entry
    per answer choice (e.g. ``input_ids`` is a list of ``num_choices`` token-id
    lists). Calling the collator flattens all choices of all features, pads
    them together with ``tokenizer.pad`` and reshapes every resulting tensor to
    ``(batch_size, num_choices, -1)``.

    The input feature dicts are never modified. Labels are read from the
    ``"label"`` key (or ``"labels"`` when ``"label"`` is absent) of each
    feature; when the first feature carries neither key, the returned batch
    simply has no ``"labels"`` entry, so unlabeled data can be collated too.
    """

    # Tokenizer whose ``pad`` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # The three options below are forwarded unchanged to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate ``features`` (a list of dicts) into a dict of padded tensors."""
        # Prefer "label", fall back to "labels"; None means unlabeled input.
        label_name = next(
            (name for name in ("label", "labels") if name in features[0].keys()),
            None,
        )
        # Read the labels instead of popping them so the caller's dicts stay
        # intact and can be collated again (e.g. on the next epoch).
        labels = None
        if label_name is not None:
            labels = [feature[label_name] for feature in features]

        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        # One dict per (feature, choice) pair, in feature-major order; built
        # with a single comprehension rather than sum(list_of_lists, []).
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        if labels is not None:
            batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [60]:
def compute_metrics(eval_predictions):
    """Return ``{"accuracy": ...}`` for a ``(predictions, label_ids)`` pair.

    The predicted choice of each row is the arg-max over axis 1 of
    ``predictions``; accuracy is the float32 mean of the matches with
    ``label_ids``, converted to a plain Python float.
    """
    scores, gold = eval_predictions
    hits = np.argmax(scores, axis=1) == gold
    accuracy = hits.astype(np.float32).mean()
    return {"accuracy": accuracy.item()}

## Train

### Train on train-val split

In [63]:
# Encode both halves of the split, then copy the accepted encoding fields
# onto each example dict (in place), matching encodings to examples by position.
accepted_keys = ["input_ids", "attention_mask"]
train_features_split = preprocess_function(train_data_split)
val_features_split = preprocess_function(val_data_split)
for split_examples, split_encoding in (
    (train_data_split, train_features_split),
    (val_data_split, val_features_split),
):
    for i, example in enumerate(split_examples):
        example.update({k: split_encoding[k][i] for k in accepted_keys})

In [64]:
# Output directory is "<checkpoint name>-finetuned"; evaluation runs once per epoch.
model_name = model_checkpoint.rsplit("/", 1)[-1]
args = TrainingArguments(
    output_dir=f"{model_name}-finetuned",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    report_to="wandb",
)
# Train on the 90% split and score on the held-out 10%.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_data_split,
    eval_dataset=val_data_split,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer),
    compute_metrics=compute_metrics,
)

In [65]:
# Fine-tune on the training split; with evaluation_strategy="epoch" the
# validation loss and accuracy are reported after every epoch.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.959879,0.615385
2,No log,0.720914,0.78022
3,No log,0.598198,0.824176
4,No log,0.543436,0.813187
5,No log,0.530709,0.813187


TrainOutput(global_step=65, training_loss=0.6553781949556791, metrics={'train_runtime': 31.6067, 'train_samples_per_second': 128.454, 'train_steps_per_second': 2.057, 'total_flos': 1460458799772000.0, 'train_loss': 0.6553781949556791, 'epoch': 5.0})

In [32]:
# Persist the model fine-tuned on the train/val split.
# NOTE(review): the whole-dataset section further down saves to this same
# directory, so that later save replaces what is written here.
trainer.save_model("./bert-base-uncased-finetuned/")

### Train on whole dataset for submission

In [66]:
# Encode the full SP + WP pool and copy the accepted encoding fields onto each
# example dict (in place), matching encodings to examples by position.
accepted_keys = ["input_ids", "attention_mask"]
train_features = preprocess_function(train_data)
for i, example in enumerate(train_data):
    example.update({k: train_features[k][i] for k in accepted_keys})

In [67]:
# Same hyper-parameters as the split run, but without evaluation: every
# labeled example is used for training.
# NOTE(review): `model` is the object already fine-tuned in the split section
# above, so this run continues from those weights rather than from the
# pretrained checkpoint -- confirm that this is intended.
model_name = model_checkpoint.rsplit("/", 1)[-1]
args = TrainingArguments(
    output_dir=f"{model_name}-finetuned",
    do_eval=False,
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    report_to="wandb",
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_data,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer),
    compute_metrics=compute_metrics,
)

In [68]:
# Train on the whole labeled dataset (no evaluation set is configured for this run).
trainer.train()



Step,Training Loss


TrainOutput(global_step=75, training_loss=0.14995562235514323, metrics={'train_runtime': 34.407, 'train_samples_per_second': 131.223, 'train_steps_per_second': 2.18, 'total_flos': 1624130906643000.0, 'train_loss': 0.14995562235514323, 'epoch': 5.0})

In [34]:
# Persist the submission model.
# NOTE(review): this is the same directory the split-run model was saved to
# earlier, so that earlier save is replaced.
trainer.save_model("./bert-base-uncased-finetuned/")

## Generate predictions for test data

In [89]:
# Encode the SP and WP practice-evaluation sets and copy the accepted encoding
# fields onto each example dict (in place), matching by position.
accepted_keys = ["input_ids", "attention_mask"]
sp_test_features = preprocess_function(sp_test)
wp_test_features = preprocess_function(wp_test)
for test_examples, test_encoding in (
    (sp_test, sp_test_features),
    (wp_test, wp_test_features),
):
    for i, example in enumerate(test_examples):
        example.update({k: test_encoding[k][i] for k in accepted_keys})

In [90]:
# Prediction-only configuration: no training, keep the last partial batch.
test_args = TrainingArguments(
    output_dir="sample-test",
    do_train=False,
    do_predict=True,
    per_device_eval_batch_size=batch_size,
    dataloader_drop_last=False,
)
# Fresh Trainer around the in-memory fine-tuned model.
# NOTE(review): no data_collator is passed here, unlike the training Trainers;
# presumably this works because each test set was padded to one common length
# during preprocessing -- confirm.
trainer = Trainer(
    model=model,
    args=test_args,
    compute_metrics=compute_metrics,
)
# Run inference on each puzzle type separately.
sp_test_results = trainer.predict(sp_test)
wp_test_results = trainer.predict(wp_test)

In [95]:
# Pick the highest-scoring choice per example (arg-max over axis 1) and turn
# each index into its string form for writing to disk.
sp_test_pred = [str(choice) for choice in sp_test_results.predictions.argmax(axis=1)]
wp_test_pred = [str(choice) for choice in wp_test_results.predictions.argmax(axis=1)]

In [97]:
# Write one predicted choice index per line (no trailing newline) for each task.
for pred_path, pred_labels in (
    ('sp_pred.txt', sp_test_pred),
    ('wp_pred.txt', wp_test_pred),
):
    with open(pred_path, 'w') as f:
        f.write('\n'.join(pred_labels))