In [1]:
import os
import torch
import numpy as np
import pandas as pd
from datasets import load_from_disk, DatasetDict
from transformers import AdamW, GPT2Tokenizer, GPT2Model

from ppcm_models.pytorch_pretrained_bert.modeling_adapter import GPT2LMHeadModel, GPT2Config
from utils.helper import load_model_recursive

In [2]:
class DataArguments():
    """Settings that describe where the corpus lives and how it is preprocessed.

    Args:
        dataset_path: path handed to `load_from_disk` to load the chunked
            bookcorpusopen dataset. The default is a machine-specific absolute
            path; pass your own location when running elsewhere.
        bookcorpusopen_story_column_name: name of the column holding the text
            to tokenize.
        preprocessing_num_workers: value forwarded as `num_proc` to the
            dataset `map` calls.
    """

    def __init__(self,
                 dataset_path='/home/bryan/datasets/bookcorpusopen/bookcorpusopen_chunked.arrow',
                 bookcorpusopen_story_column_name='chunk',
                 preprocessing_num_workers=1):
        self.dataset_path = dataset_path
        self.bookcorpusopen_story_column_name = bookcorpusopen_story_column_name
        self.preprocessing_num_workers = preprocessing_num_workers
        
class ModelArguments():
    """Settings for the backbone model and the adapter optimizer.

    Args:
        model_size: size tag of the DialoGPT checkpoint; it selects the
            `ppcm_models/dialoGPT/<model_size>/` directory and the
            `<model_size>_ft.pkl` file name used when the model is loaded.
        lr: learning rate given to the optimizer.
        load_check_point_adapter: path of an adapter checkpoint to load; the
            empty string means "load the finetuned DialoGPT weights instead".
    """

    def __init__(self, model_size='medium', lr=2e-4, load_check_point_adapter=""):
        self.model_size = model_size
        self.lr = lr
        self.load_check_point_adapter = load_check_point_adapter
# Candidate arguments that are not used yet; kept (disabled) as a reference
# for the training code that is still to be written.
#         self.dataset_path = "" #"Path or url of the dataset. If empty download from S3."
#         self.dataset_cache = './dataset_cache' #, help="Path or url of the dataset cache")
#         self.model_checkpoint = "gpt2" #, help="Path, url or short name of the model")
#         self.num_candidates = 2 #, help="Number of candidates for training")
#         self.max_history = 15 #, help="Number of previous exchanges to keep in history")
#         self.max_seq_len = 200 #, help="Max number of tokens")
#         self.train_batch_size = 4 #, help="Batch size for training")
#         self.valid_batch_size = 4 #, help="Batch size for validation")
#         self.gradient_accumulation_steps = 8 #, help="Accumulate gradients on several steps")
#         self.max_norm = 1.0 #, help="Clipping gradient norm")
#         self.n_epochs = 5 #, help="Number of training epochs")
#         self.eval_before_start = 'store_true' #, help="If true start with a first evaluation before training")
#         self.device = 'cuda' if torch.cuda.is_available() else "cpu" #, help="Device (cuda or cpu)")
#         self.fp16 = "" #, help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
#         self.local_rank = -1 #, help="Local rank for distributed training (-1: not distributed)")
#         self.debug = 'store_true' #, help="debugging mode")
#         self.dataset = 'SENT' #, help="Choose between SENT|TOXI|EMO|QUEST|TOPI ")
#         self.label = 'very_negative' #, help="Choose between very_positive|very_negative|toxic|question")
#         self.kl_weight = 0 #, help="kl constraint for language model")
#         self.iter = 75 #, help="Load data from a certain iteration")
        
# Notebook-wide configuration objects; the later cells read both of them.
data_args = DataArguments()
model_args = ModelArguments()

In [3]:
# Directory that holds the DialoGPT config, tokenizer files and checkpoint.
model_args.model_path = f'ppcm_models/dialoGPT/{model_args.model_size}/'

config = GPT2Config.from_json_file(os.path.join(model_args.model_path, 'config.json'))
tokenizer = GPT2Tokenizer.from_pretrained(model_args.model_path)

## Load either Adapters' checkpoint, or just finetuned DialoGPT
if model_args.load_check_point_adapter != "":
    print("Loading ADAPTERS")
    checkpoint_path = model_args.load_check_point_adapter
else:
    checkpoint_path = os.path.join(model_args.model_path, f"{model_args.model_size}_ft.pkl")
model = load_model_recursive(GPT2LMHeadModel(config), checkpoint_path, model_args, verbose=True)

## Load GPT2 instead of DialoGPT

# Hugging Face names of the GPT-2 checkpoints matching each DialoGPT size tag;
# unknown tags fall back to the 'gpt2-<size>' naming scheme.
GPT2_NAME_BY_SIZE = {'small': 'gpt2', 'medium': 'gpt2-medium', 'large': 'gpt2-large'}
pt_gpt2_model = GPT2Model.from_pretrained(
    GPT2_NAME_BY_SIZE.get(model_args.model_size, f'gpt2-{model_args.model_size}')
)


def _copy_gpt2_params(dst_module, src_module):
    """Copy `weight` (and `bias`, when both modules have one) from src into dst.

    The copy is done in place, so the destination keeps its own Parameter
    objects instead of sharing tensors with the source model, and a shape
    mismatch raises instead of silently swapping in a differently sized tensor.
    """
    with torch.no_grad():
        dst_module.weight.copy_(src_module.weight)
        dst_bias = getattr(dst_module, 'bias', None)
        src_bias = getattr(src_module, 'bias', None)
        if dst_bias is not None and src_bias is not None:
            dst_bias.copy_(src_bias)


# NOTE(review): if the LM head shares its weight with `wte`, the in-place copy
# keeps that tie intact (re-assigning the Parameter would not) - confirm
# against ppcm_models' modeling_adapter.
_copy_gpt2_params(model.transformer.wte, pt_gpt2_model.wte)
_copy_gpt2_params(model.transformer.wpe, pt_gpt2_model.wpe)

for layer in range(len(pt_gpt2_model.h)):
    dst_block = model.transformer.h[layer]
    src_block = pt_gpt2_model.h[layer]
    # Weights AND biases are transplanted; copying only the weights would leave
    # the DialoGPT biases paired with GPT-2 weights.
    _copy_gpt2_params(dst_block.ln_1, src_block.ln_1)
    _copy_gpt2_params(dst_block.attn.c_attn, src_block.attn.c_attn)
    _copy_gpt2_params(dst_block.attn.c_proj, src_block.attn.c_proj)
    _copy_gpt2_params(dst_block.ln_2, src_block.ln_2)
    _copy_gpt2_params(dst_block.mlp.c_fc, src_block.mlp.c_fc)
    _copy_gpt2_params(dst_block.mlp.c_proj, src_block.mlp.c_proj)

# Final layer norm of the GPT-2 stack; guarded because the adapter model's
# module layout is defined outside this notebook.
if hasattr(model.transformer, 'ln_f'):
    _copy_gpt2_params(model.transformer.ln_f, pt_gpt2_model.ln_f)
# model.to(model_args.device)
print('GPT2 loaded instead DialoGPT')

# Freeze every parameter whose name does not contain "adapter" and collect the
# remaining ones for the optimizer.
parameters_to_update = []
for name, param in model.named_parameters():
    if "adapter" in name:
        parameters_to_update.append(param)
    else:
        param.requires_grad = False
optimizer = AdamW(parameters_to_update, lr=model_args.lr, correct_bias=True)
print('GPT2 param frozen, Adapter is trainable and initialized with AdamW')

Loading finetuned model from ppcm_models/dialoGPT/medium/medium_ft.pkl
GPT2 loaded instead DialoGPT
GPT2 param frozen, Adapter is trainable and initialized with AdamW




In [4]:
from torch.utils.data import Dataset

class BookcorpusGenreAdapterDataset(Dataset):
    """Torch dataset over one tokenized, genre-labelled bookcorpusopen split.

    On construction the split is loaded from disk, every row is given an
    `adapter_id` (index into the genre list, or -1 when no genre matches),
    rows are optionally filtered, and the story column is tokenized.

    Attributes set by `__init__`:
        tokenized_dataset: the tokenized dataset returned by
            `load_bookcorpusopen`.
        genres: the list of genres the adapter ids index into.
    """

    def __init__(self, data_args, split, tokenizer, genres=None, sample_row=100,
                 top_n_genres=6, genre_sample_range=100, max_seq_len=512,
                 exclude_non_adapter=True, truncate=True,
                 add_special_tokens=True,
                 *args, **kwargs):
        """Store the settings and build the tokenized dataset.

        Args:
            data_args: object exposing `dataset_path`,
                `bookcorpusopen_story_column_name` and
                `preprocessing_num_workers`.
            split: name of the split to use.
            tokenizer: callable tokenizer applied to the story column.
            genres, sample_row, top_n_genres, genre_sample_range,
            exclude_non_adapter: forwarded to `load_bookcorpusopen`.
            max_seq_len: `max_length` passed to the tokenizer.
            truncate: `truncation` flag passed to the tokenizer.
            add_special_tokens: `add_special_tokens` flag passed to the tokenizer.
            *args, **kwargs: forwarded to the parent constructor.
        """
        super(BookcorpusGenreAdapterDataset, self).__init__(*args, **kwargs)

        self.data_args = data_args
        self.tokenizer = tokenizer
        self.add_special_tokens = add_special_tokens
        self.truncate = truncate
        self.max_seq_len = max_seq_len
        self.preprocessing_num_workers = data_args.preprocessing_num_workers
        self.tokenized_dataset, self.genres = self.load_bookcorpusopen(
            split, genres, sample_row, top_n_genres, genre_sample_range,
            exclude_non_adapter)

    @staticmethod
    def _parse_genre_list(genre_list_string):
        """Split a stringified list such as "['Fiction', 'General']" into entries.

        The outer brackets and the first/last character (the quote) of every
        entry are stripped by position. An empty list string yields [''].
        """
        return [genre[1:-1] for genre in genre_list_string[1:-1].split(', ')]

    @staticmethod
    def _get_adapter_id(story_genre_list_string, adapter_genre_list):
        """Return the adapter id of a story, or -1 when no adapter genre matches.

        The story's genres are scanned in the order they are listed, and the
        first one that contains an adapter genre (case-insensitive) decides the
        result, i.e. the foremost listed genre wins. When several adapter genres
        match that same entry, the one with the lowest adapter id is returned.
        """
        parse = BookcorpusGenreAdapterDataset._parse_genre_list
        lowered_adapters = [genre.lower() for genre in adapter_genre_list]
        for story_genre in parse(story_genre_list_string):
            story_genre = story_genre.lower()
            for adapter_id, adapter_genre in enumerate(lowered_adapters):
                if adapter_genre in story_genre:
                    return adapter_id
        return -1

    @staticmethod
    def _most_frequent_genres(genre_list_strings, top_n_genres):
        """Return the most frequent genres found in the given genre strings.

        The `top_n_genres` most frequent entries are taken first (descending
        frequency, None keeps all) and the empty genre is dropped afterwards,
        so the result may hold one entry fewer than `top_n_genres`.
        """
        parse = BookcorpusGenreAdapterDataset._parse_genre_list
        all_genres = []
        for genre_list_string in genre_list_strings:
            all_genres.extend(parse(genre_list_string))
        if not all_genres:
            return []
        frequent = pd.Series(all_genres).value_counts().index[:top_n_genres].tolist()
        return [genre for genre in frequent if genre != '']

    def load_bookcorpusopen(self, split, genres=None, sample_row=None, top_n_genres=None,
                            genre_sample_range=None, exclude_non_adapter=True):
        """
        Load bookcorpusopen from pyarrow file.

        Args:
            split: string, {train, valid, test}
            genres: list of string, genres that we want the dataset to be labelled with,
                    according to the index, e.g. ['Fiction', 'General', ...]
            sample_row: int, number of leading rows of the split to use (clamped to the
                        split length), None means using all the samples available
            top_n_genres: int, if genres is None, we will extract the list of genres ourselves,
                          and top_n_genres dictates how many genres we want to take sorted on
                          the frequency, descending
            genre_sample_range: int, if genres is None, we will extract the list of genres
                                ourselves, and genre_sample_range dictates how many leading
                                rows are used to derive the list of genres (None uses all)
            exclude_non_adapter: bool, set to False to keep rows whose adapter_id is -1

        Returns:
            (tokenized_dataset, genres):
                tokenized_dataset: the selected split with `adapter_id` attached and the
                                   story column replaced by the tokenizer output; rows with
                                   adapter_id == -1 are removed when exclude_non_adapter
                genres: the list of genres the adapter ids refer to
        """
        # Bind everything the mapped functions need to locals, so they depend on
        # this instance's settings rather than on same-named notebook globals.
        data_args = self.data_args
        tokenizer = self.tokenizer
        story_column = data_args.bookcorpusopen_story_column_name
        truncate = self.truncate
        max_seq_len = self.max_seq_len
        add_special_tokens = self.add_special_tokens
        get_adapter_id = self._get_adapter_id  # plain function (staticmethod)

        def map_tokenization(batch):
            return tokenizer(batch[story_column],
                             truncation=truncate,
                             max_length=max_seq_len,
                             add_special_tokens=add_special_tokens,
                             return_tensors='pt')

        # load bookcorpusopen from arrow file
        print('Loading train, validation, test dataset...')
        datasets = load_from_disk(data_args.dataset_path)
        print('Loaded')
        split_dataset = datasets[split]
        n_rows = len(split_dataset)
        sample_row = n_rows if sample_row is None else min(sample_row, n_rows)

        print('Getting adapter_ids and use only the selected split')
        # if frequent genres are not given, derive them from the split itself
        if genres is None:
            print('Generating new list of frequent genres from', split, 'split')
            # One slice reads the needed rows once instead of re-reading the
            # whole 'genre' column for every sampled row.
            genres = self._most_frequent_genres(
                split_dataset[:genre_sample_range]['genre'], top_n_genres)

        dataset = split_dataset.select(range(sample_row)).map(
            lambda x: {'adapter_id': get_adapter_id(x['genre'], genres)},
            num_proc=self.preprocessing_num_workers)
        if exclude_non_adapter:
            dataset = dataset.filter(lambda x: x['adapter_id'] != -1)
        print('Derived adapter_ids and used only the', split, 'split')

        # Tokenize with huggingface datasets mapping function
        tokenized_dataset = dataset.map(
            map_tokenization,
            remove_columns=story_column,
            num_proc=self.preprocessing_num_workers,
            load_from_cache_file=True
        )

        return tokenized_dataset, genres

    def __getitem__(self, index):
        """Return the `input_ids`, `attention_mask` and `adapter_id` of one row."""
        row = self.tokenized_dataset[index]  # fetch the row once
        return {
            'input_ids': row['input_ids'],
            'attention_mask': row['attention_mask'],
            'adapter_id': row['adapter_id'],
        }

    def __len__(self):
        """Return the number of rows of the tokenized dataset."""
        return self.tokenized_dataset.num_rows

In [5]:
# Genres the adapters are labelled with; a row's adapter_id is an index into this list.
frequent_genres = [
    'Fiction',
    'General',
    'Fantasy',
    'Romance',
    'Adventure',
]

# Build the training split from the first 200 rows, labelled with the genres above.
train_dataset = BookcorpusGenreAdapterDataset(
    data_args=data_args,
    split='train',
    tokenizer=tokenizer,
    genres=frequent_genres,
    sample_row=200,
    top_n_genres=6,
    genre_sample_range=100,
    max_seq_len=512,
)

Loading train, validation, test dataset...
Loaded
Getting adapter_ids and use only the selected split


  0%|          | 0/200 [00:00<?, ?ex/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Derived adapter_ids and used only the train split


  0%|          | 0/157 [00:00<?, ?ex/s]

In [6]:
# Check run (disabled): one forward pass through `model` with an explicit task_id

# text = "Replace me by any text you'd like."
# encoded_input = tokenizer(text, return_tensors='pt')
# encoded_input = {'input_ids': encoded_input['input_ids']}
# output = model(**encoded_input, task_id=19)

### To do:

- Set up the training code to train the adapters
- Trial run the adapters