In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import BertModel
from datasets import load_dataset

from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

from datasets import load_metric

class PlaceHolderBERT(nn.Module):
    """BERT encoder with a linear head on top of the first-token ([CLS]) hidden state.

    Args:
        num_out: Output width of the linear head.
        sigmoid: If True, ``forward`` applies a sigmoid to the head output.
        return_CLS_representation: If True, ``forward`` returns the first-token
            hidden state itself instead of the head output.
        model_name: Checkpoint name handed to ``BertModel.from_pretrained``.
            Defaults to the checkpoint this class always used.
    """
    def __init__(self, num_out=1, sigmoid=False, return_CLS_representation=False,
                 model_name='bert-base-cased'):
        super().__init__()
        # Tokenization happens outside the module; forward() receives the
        # already-tokenized inputs as a mapping.
        self.bert = BertModel.from_pretrained(model_name)
        # Size the head from the encoder's own config instead of hard-coding 768,
        # so checkpoints with a different hidden width also work.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_out)
        self.return_CLS_representation = return_CLS_representation
        self.sigmoid_bool = sigmoid
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Encode ``x`` and return either the CLS vector or the head prediction.

        Args:
            x: Mapping of keyword arguments for the BERT encoder; it is unpacked
                as ``self.bert(**x)``.

        Returns:
            The first-token hidden state when ``return_CLS_representation`` is
            set; otherwise the linear head output, passed through a sigmoid when
            ``sigmoid`` was requested.
        """
        representations = self.bert(**x).last_hidden_state
        # Position 0 along the sequence axis is the first token of each sequence.
        cls_representation = representations[:,0,:]
        if self.return_CLS_representation:
            # Return before touching the head: its output would be discarded.
            return cls_representation
        pred = self.linear(cls_representation)
        if self.sigmoid_bool:
            return self.sigmoid(pred)
        return pred
    
    
def train(model, dataloader, num_epochs=1): #can scrap keyword
    """Fine-tune ``model`` on ``dataloader`` with AdamW, a linear LR schedule and MSE loss.

    Args:
        model: Module called as ``model(features)``, where ``features`` is the
            batch dict without its ``'labels'`` entry.
        dataloader: Sized iterable of dict batches whose values support
            ``.to(device)`` and which contain a ``'labels'`` entry.
        num_epochs: Number of passes over ``dataloader``.

    Returns:
        None. ``model`` is updated in place and left in training mode on the
        selected device (CUDA when available, else CPU).
    """
    #optimizer as usual
    optimizer = AdamW(model.parameters(), lr=5e-5)
    loss_function = torch.nn.MSELoss()
    #learning rate scheduler: linear schedule over every optimizer step, no warm-up
    num_training_steps = num_epochs * len(dataloader)
    lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)

    #auto logging; progress bar
    progress_bar = tqdm(range(num_training_steps))

    #training loop
    model.train()
    for _ in range(num_epochs):
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            features = {k: v for k, v in batch.items() if k != 'labels'}
            preds = model(features)
            # Align the targets with the prediction shape before the loss. If
            # preds is (batch, 1) and labels is (batch,), MSELoss would broadcast
            # them to (batch, batch) and average over every prediction/label
            # pair instead of the matching ones.
            targets = batch['labels'].float().reshape(preds.shape)
            loss = loss_function(preds, targets)
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            

def evaluate(model, dataloader):
    """Return the accuracy (in percent) of ``model`` over ``dataloader``.

    Each prediction is binarised at 0.5 (values below 0.5 become 0, everything
    else 1) and compared with the batch ``'labels'`` reshaped to the prediction
    shape. Raises ``ZeroDivisionError`` when the dataloader yields no samples.
    """
    target_device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(target_device)
    model.eval()

    hits, seen = 0, 0
    with torch.no_grad():
        for raw_batch in dataloader:
            on_device = {name: tensor.to(target_device) for name, tensor in raw_batch.items()}
            model_inputs = {name: tensor for name, tensor in on_device.items() if name != 'labels'}
            decisions = torch.where(model(model_inputs) < .5, 0, 1)
            gold = on_device['labels'].reshape(decisions.shape)
            hits += int((decisions == gold).sum())
            seen += decisions.size(0)
    return float(hits)/float(seen)*100




# Load the IMDB and GLUE SST-2 datasets through `datasets.load_dataset`.
imdb = load_dataset('imdb')
sst2 = load_dataset('glue','sst2')

# Tokenizer for the same 'bert-base-cased' checkpoint that PlaceHolderBERT loads.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

#tokenize function
def tokenize_imdb(examples):
    """Tokenize the ``'text'`` field of ``examples`` with the module-level tokenizer.

    Sequences are truncated and padded with ``padding="max_length"``.
    """
    reviews = examples['text']
    encoded = tokenizer(reviews, padding="max_length", truncation=True)
    return encoded

def tokenize_sst2(examples):
    """Tokenize the ``'sentence'`` field of ``examples`` with the module-level tokenizer.

    Sequences are truncated and padded with ``padding="max_length"``.
    """
    sentences = examples['sentence']
    encoded = tokenizer(sentences, padding="max_length", truncation=True)
    return encoded

#pre-tokenize entire dataset
# batched=True hands each tokenize function a batch of examples per call.
tokenized_imdb = imdb.map(tokenize_imdb, batched=True)
tokenized_sst2 = sst2.map(tokenize_sst2, batched=True)

# Drop the raw string column and rename "label" -> "labels": train()/evaluate()
# read batch['labels'] and send every other column through .to(device) and on
# into the model, so only tensor-valued model inputs may remain.
tokenized_imdb = tokenized_imdb.remove_columns(["text"])
tokenized_imdb = tokenized_imdb.rename_column("label", "labels")
tokenized_imdb.set_format("torch")

# Same clean-up for SST-2, which additionally carries an "idx" column.
tokenized_sst2 = tokenized_sst2.remove_columns(["sentence","idx"])
tokenized_sst2 = tokenized_sst2.rename_column("label", "labels")
tokenized_sst2.set_format("torch")


### Only for practice: fixed-seed subsets (1000 train / 500 test examples) to keep runs short
imdb_small_train = tokenized_imdb['train'].shuffle(seed=42).select(range(1000))
imdb_small_test = tokenized_imdb['test'].shuffle(seed=42).select(range(500))
###
# NOTE(review): the test loader is shuffled too; the accuracy computed by
# evaluate() does not depend on batch order, so this looks unnecessary.
imdb_train_loader = DataLoader(imdb_small_train, shuffle=True, batch_size=8)
imdb_test_loader = DataLoader(imdb_small_test, shuffle=True, batch_size=8)

sst2_small_train = tokenized_sst2["train"].shuffle(seed=42).select(range(1000))
sst2_small_test = tokenized_sst2["validation"].shuffle(seed=42).select(range(500)) # validation split stands in for test; the official test split is unusable here (presumably its labels are hidden -- TODO confirm)

sst2_train_loader = DataLoader(sst2_small_train, shuffle=True, batch_size=8)
sst2_test_loader = DataLoader(sst2_small_test, shuffle=True, batch_size=8)

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import BertModel
from datasets import load_dataset

from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

The ReCoRD dataset is available here:
https://sheng-z.github.io/ReCoRD-explorer/

In [2]:
import json

# Directory holding the ReCoRD train/dev JSON files.
# NOTE(review): machine-specific absolute path -- adjust when running elsewhere.
data_path = '/home/ubuntu/NLP-brain-biased-robustness/data/record/'

# Context managers close each file even when json.load raises, and the explicit
# encoding keeps decoding independent of the machine's locale.
# The trailing numbers are presumably the example counts of each split -- TODO confirm.
with open(data_path+'train.json', encoding='utf-8') as f1: #65709
    train_set = json.load(f1)
with open(data_path+'dev.json', encoding='utf-8') as f2: #7481
    dev_set = json.load(f2)

Useful reference (dataset README): https://sheng-z.github.io/ReCoRD-explorer/dataset-readme.txt

In [3]:
def data_split():
    """Partition all examples of ``train_set`` and ``dev_set`` by their ``'source'`` field.

    Returns:
        ``(CNN, Daily_mail)``: two lists holding the examples whose source is
        ``'CNN'`` and ``'Daily mail'`` respectively, train examples first.

    Raises:
        AssertionError: if any example carries a source other than those two.
    """
    CNN, Daily_mail = [], []
    expected_total = 0
    for split in (train_set, dev_set):
        examples = split['data']
        expected_total += len(examples)
        for example in examples:
            origin = example['source']
            if origin == 'CNN':
                CNN.append(example)
            elif origin == 'Daily mail':
                Daily_mail.append(example)
    # Every example must have landed in exactly one of the two buckets.
    assert len(CNN) + len(Daily_mail) == expected_total
    return CNN, Daily_mail

# Build the per-source example lists once.
CNN, Daily_mail = data_split()
# Drop the raw split containers; the examples themselves stay referenced from
# CNN / Daily_mail. NOTE: data_split() reads these globals, so calling it again
# after this point raises NameError until the loading cell is re-run.
del train_set
del dev_set

In [5]:
# Display the first CNN example to inspect the record layout ('id', 'source', 'passage', ...).
CNN[0]

{'id': 'f15689cd256daa03fcfd8c357f1376a8a7017b64',
 'source': 'CNN',
 'passage': {'text': "Caracas, Venezuela (CNN) -- It's been more than 180 years since Venezuelans saw Simon Bolivar's face. But the revolutionary leader's thick sideburns, bushy eyebrows and steely gaze popped out from behind picture frames Tuesday in new 3-D images unveiled by President Hugo Chavez. Researchers used several software programs to reconstruct the face of the man who liberated Bolivia, Colombia, Ecuador, Panama, Peru and Venezuela from the Spanish crown. Scans of Bolivar's skeletal remains, which investigators exhumed two years ago, factored into their calculations. So did historical paintings, photos of restored uniforms Bolivar wore and images of middle-aged Venezuelans, officials said.\n@highlight\nResearchers use computer programs to reconstruct Simon Bolivar's face\n@highlight\nVenezuelan President Hugo Chavez unveils new, 3-D portraits of Bolivar\n@highlight\nResearchers use data from skeletal rema

In [None]:
"""
Official evaluation script for ReCoRD v1.0.
(Some functions are adopted from the SQuAD evaluation script.)
"""

from __future__ import print_function
from collections import Counter
import string
import re
import argparse
import json
import sys


def normalize_answer(s):
    """Canonicalise an answer string for comparison.

    Steps, in order: lowercase, strip ASCII punctuation, replace the standalone
    articles "a", "an" and "the" with a space, then collapse whitespace runs.
    """
    lowered = s.lower()
    punctuation = set(string.punctuation)
    without_punctuation = ''.join(ch for ch in lowered if ch not in punctuation)
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation)
    # split()/join collapses whitespace runs and trims both ends.
    return ' '.join(without_articles.split())


def f1_score(prediction, ground_truth):
    """Token-level F1 between a prediction and one ground-truth answer.

    Both strings are normalised with ``normalize_answer`` and split on
    whitespace; the overlap is counted as a multiset intersection. Returns 0
    when no token is shared.
    """
    predicted = normalize_answer(prediction).split()
    expected = normalize_answer(ground_truth).split()
    overlap = sum((Counter(predicted) & Counter(expected)).values())
    if not overlap:
        return 0
    precision = 1.0 * overlap / len(predicted)
    recall = 1.0 * overlap / len(expected)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    """Return True when both strings are equal after ``normalize_answer``."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Score ``prediction`` against every ground truth and return the best score.

    Raises ``ValueError`` (from ``max``) when ``ground_truths`` is empty.
    """
    return max([metric_fn(prediction, reference) for reference in ground_truths])


def evaluate(dataset, predictions):
    """Compute exact-match and F1 percentages for ``predictions`` over ``dataset``.

    NOTE: this reuses the name ``evaluate``; any earlier function of that name in
    the same namespace is replaced once this definition runs.

    Args:
        dataset: Iterable of passages, each with a ``'qas'`` list whose entries
            have an ``'id'`` and an ``'answers'`` list of dicts with ``'text'``.
        predictions: Mapping from question id to the predicted answer string.

    Returns:
        ``({'exact_match': ..., 'f1': ...}, correct_ids)`` where both metrics are
        percentages over all questions (unanswered ones score 0) and
        ``correct_ids`` lists the ids answered with an exact match.
    """
    em_sum = 0
    f1_sum = 0
    question_count = 0
    correct_ids = []
    for passage in dataset:
        for qa in passage['qas']:
            question_count += 1
            question_id = qa['id']
            if question_id not in predictions:
                # Unanswered questions still count towards the denominator.
                print('Unanswered question {} will receive score 0.'.format(question_id), file=sys.stderr)
                continue

            references = [answer['text'] for answer in qa['answers']]
            guess = predictions[question_id]

            matched = metric_max_over_ground_truths(exact_match_score, guess, references)
            if int(matched) == 1:
                correct_ids.append(question_id)
            em_sum += matched
            f1_sum += metric_max_over_ground_truths(f1_score, guess, references)

    exact_match = 100.0 * em_sum / question_count
    f1 = 100.0 * f1_sum / question_count
    print('* Exact_match: {}\n* F1: {}'.format(exact_match, f1))

    return {'exact_match': exact_match, 'f1': f1}, correct_ids


# Command-line driver: score a prediction file against a ReCoRD dataset file.
# NOTE(review): inside a notebook kernel this guard presumably evaluates True as
# well, so running the cell would call parse_args() on the kernel's own argv --
# confirm before executing it here.
if __name__ == '__main__':
    expected_version = '1.0'
    parser = argparse.ArgumentParser('Official evaluation script for ReCoRD v1.0.')
    parser.add_argument('data_file', help='The dataset file in JSON format.')
    parser.add_argument('pred_file', help='The model prediction file in JSON format.')
    parser.add_argument('--output_correct_ids', action='store_true',
                        help='Output the correctly answered query IDs.')
    args = parser.parse_args()

    with open(args.data_file) as data_file:
        dataset_json = json.load(data_file)
        # A version mismatch is only reported on stderr; evaluation continues.
        if dataset_json['version'] != expected_version:
            print('Evaluation expects v-{}, but got dataset with v-{}'.format(
                expected_version, dataset_json['version']), file=sys.stderr)
        dataset = dataset_json['data']

    # The prediction file is loaded as-is and handed to evaluate(), which looks
    # answers up by question id.
    with open(args.pred_file) as pred_file:
        predictions = json.load(pred_file)

    metrics, correct_ids = evaluate(dataset, predictions)

    # Optionally write the exactly-matched question ids to correct_ids.json in
    # the current working directory.
    if args.output_correct_ids:
        print('Output {} correctly answered question IDs.'.format(len(correct_ids)))
        with open('correct_ids.json', 'w') as f:
            json.dump(correct_ids, f)
