In [None]:
import os
import easydict
import torch
import json
import random
import numpy as np
import copy
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import EvalPrediction
from datasets import Dataset, DatasetDict
from transformers import TrainingArguments, AdapterTrainer, EvalPrediction, AdapterSetup
from transformers.adapters.composition import Fuse
from transformers import GPT2AdapterModel
from adapter_setting import TASK_ID_to_NAME, TASK_NAME_to_ID, TASK_DICT, DATA_ATTRS

In [None]:
# Experiment settings, readable as attributes (e.g. `args.tasks`, `args.max_len`).
args = easydict.EasyDict({
    'model_dir_root': './1.trained_model_single',
    'seed': 1234,
    'adapter_type': 'houlsby',
    'stage1_epoch': 5,
    'tasks': [0, 1, 2, 3, 4],
    'stage1_batch_size': 128,
    'token_weight': 5,
    'data_dir': './data',
    'lm_lambda': 0.25,
    'lm_gen_percentage': 0.2,
    'max_len': 128,
    'debug': False,
    'verbose': False,
})

# GPU selection: order devices by PCI bus id and expose only device "1".
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1",
})

In [None]:
def get_model_dir(task_name):
    """Return the path of ``task_name`` joined onto ``args.model_dir_root``."""
    model_root = args.model_dir_root
    return os.path.join(model_root, task_name)

In [None]:
def create_dataset(task_name, dataset_type, max_chars=128):
    """Build the classifier input strings for one split of one task.

    The file at ``TASK_DICT[task_name][dataset_type]`` is read as JSON. It must
    hold a top-level ``'data'`` list whose entries each have a ``'paragraphs'``
    list; the first paragraph must provide ``'context'`` and a ``'qas'`` list
    whose first item has a ``'question'``. Answers are not read, so entries
    without answers are accepted.

    Args:
        task_name: Key into ``TASK_DICT``.
        dataset_type: Split key inside ``TASK_DICT[task_name]``
            (the callers in this notebook use 'train', 'eval' and 'test').
        max_chars: Inputs longer than this many characters (not tokens) are
            dropped. ``None`` disables the filter. Defaults to 128.

    Returns:
        list[str]: One ``"<context> <question>"`` string per kept entry, in
        file order.

    Raises:
        IndexError: If an entry has no paragraphs or its first paragraph has
            no questions.
    """
    dataset_path = TASK_DICT[task_name][dataset_type]
    # JSON files are UTF-8; do not depend on the platform's default encoding.
    with open(dataset_path, 'r', encoding='utf-8') as f:
        entries = json.load(f)['data']

    qa_input_list = []
    for entry in entries:
        # Only the first paragraph of an entry, and its first question, are used.
        paragraph = entry['paragraphs'][0]
        qa_input = paragraph['context'] + ' ' + paragraph['qas'][0]['question']
        if max_chars is not None and len(qa_input) > max_chars:
            continue
        qa_input_list.append(qa_input)

    return qa_input_list

In [None]:
import pickle
# Load the pickled object stored in 'extra_data.pickle' (relative to the working directory).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
# NOTE(review): `extra_dataset_dict` is not referenced by the other visible
# cells; confirm it is still needed.
with open('extra_data.pickle', 'rb') as fr:
    extra_dataset_dict = pickle.load(fr)

In [None]:
def get_dataset():
    """Collect texts and task-id labels for every task in ``args.tasks``.

    For each task id, ``create_dataset`` is called for the 'train', 'eval' and
    'test' splits (in that order). Every returned text is labelled with the
    task id it came from.

    Returns:
        tuple[dict, dict, dict]: ``(train, valid, test)``; each dict has an
        ``'inputs'`` list of strings and a parallel ``'labels'`` list of task
        ids. The 'eval' split is returned as the second (valid) element.
    """
    split_names = ('train', 'eval', 'test')
    collected = {split: {'inputs': [], 'labels': []} for split in split_names}

    for task_id in args.tasks:
        task_name = TASK_ID_to_NAME[task_id]
        for split in split_names:
            texts = create_dataset(task_name, split)
            collected[split]['inputs'] += texts
            # One label per text: the id of the task the text belongs to.
            collected[split]['labels'] += [task_id] * len(texts)

    return collected['train'], collected['eval'], collected['test']

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint;
# tokenize_function below applies it to the "inputs" texts.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    """Tokenize a batch of examples with the module-level ``tokenizer``.

    Args:
        examples: Mapping with an ``"inputs"`` entry holding the text(s) to
            encode (the form in which ``Dataset.map`` passes a batch).

    Returns:
        The tokenizer's output for those texts, padded and truncated to
        ``args.max_len`` tokens.
    """
    # With padding='max_length' but no max_length, every example is padded to
    # the tokenizer's model maximum (512 for bert-base-uncased). The inputs kept
    # by create_dataset are at most 128 characters, so cap the length at the
    # configured args.max_len to avoid training on mostly-padding sequences.
    return tokenizer(
        examples["inputs"],
        padding='max_length',
        truncation=True,
        max_length=args.max_len,
    )

# Gather the text/label dictionaries of each split for all configured tasks.
train_dataset_dict, valid_dataset_dict, test_dataset_dict = get_dataset()

# Wrap each split in a Dataset and group them under 'train' / 'valid' / 'test'.
dataset_dict = DatasetDict({
    split_name: Dataset.from_dict(split_columns)
    for split_name, split_columns in (
        ('train', train_dataset_dict),
        ('valid', valid_dataset_dict),
        ('test', test_dataset_dict),
    )
})

In [None]:
# Tokenize every split; batched mode passes tokenize_function up to 64 examples per call.
tokenized_datasets = dataset_dict.map(
    tokenize_function,
    batched=True,
    batch_size=64,
)

In [None]:
# Shuffle the training split with the configured seed (args.seed, 1234) and keep
# at most 5000 examples. The range is capped at the dataset size because
# select(range(5000)) fails when fewer than 5000 training examples exist.
shuffled_train = tokenized_datasets['train'].shuffle(seed=args.seed)
shuffled_train = shuffled_train.select(range(min(5000, len(shuffled_train))))
# To train on the full shuffled split instead, remove the `.select(...)` line above.

In [None]:
# print(shuffled_train[0])

In [None]:
# Pretrained 'bert-base-uncased' loaded for sequence classification with 5 output labels.
# NOTE(review): 5 matches the five task ids in args.tasks, which get_dataset uses
# as class labels; keep num_labels in sync if the task list changes.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)

In [None]:
import evaluate
# Accuracy metric object; compute_metrics calls its .compute(...) method.
metric = evaluate.load('accuracy')

In [None]:
def compute_metrics(eval_pred):
    """Score predictions with the module-level ``metric``.

    Args:
        eval_pred: A two-item sequence ``(logits, labels)``.

    Returns:
        The value returned by ``metric.compute`` when the arg-max over the last
        axis of ``logits`` is compared against ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_classes)

In [None]:
from transformers import Trainer

# Fine-tuning configuration: output goes to ./9.bert_tokenizer and evaluation
# runs once per epoch.
training_args = TrainingArguments(
    output_dir='./9.bert_tokenizer',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    learning_rate=1e-5,
    evaluation_strategy='epoch',
    logging_steps=150,
)

# NOTE(review): the per-epoch evaluation uses the 'test' split; the 'valid'
# split built earlier is not used in the visible cells. Confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=shuffled_train,
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning loop with the trainer configured above.
trainer.train()