<a href="https://colab.research.google.com/github/jinisaweaklearner/ML-DL-papers-and-code/blob/master/src/RoBERTa_multi_class_yelp5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. IMPORT LIBRARY

In [1]:
# To install the package "pytorch-transformers"
! pip install pytorch-transformers pendulum

import pendulum
from fastai.text import *
from fastai.metrics import *
import torch
import torch.nn as nn
from pytorch_transformers import RobertaTokenizer
from pytorch_transformers import RobertaModel

# Garbage Collector
import gc 

# link colab with google drive
from google.colab import drive 
drive.mount('/content/drive')

Collecting pytorch-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/b7/d3d18008a67e0b968d1ab93ad444fc05699403fa662f634b2f2c318a508b/pytorch_transformers-1.2.0-py3-none-any.whl (176kB)
[K     |████████████████████████████████| 184kB 1.4MB/s 
[?25hCollecting pendulum
[?25l  Downloading https://files.pythonhosted.org/packages/fc/e8/2eb9f8a5ce6511f2f1d44f621171388765f34fe1d5fa74d50368aa620bbf/pendulum-2.1.0-cp36-cp36m-manylinux1_x86_64.whl (152kB)
[K     |████████████████████████████████| 153kB 35.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
[K     |████████████████████████████████| 870kB 41.5MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[

# 2. SETUP CONFIG

In [0]:
# Creating a config object to store task specific information (hyperparameters and path)
class Config(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)

    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)

'''
file_path: path of files
date: today YYYYMMDD
seed: diff seed can have diff train and validate dataset (how to split)
roberta_model_name: roberta-large/roberta-base
max_lr: maximum learning rate
epochs: number of epoch
bs: batch size, set 4 because of limitation of gpu memory
max_seq_len: the maximum length of tokens in a sentence
num_labels: number of categories
hidden_dropout_prob: the percentage of dropout 
hidden_size: 1024 for roberta-large and 768 for roberta-base
valida_pct: the percentage of validation dataset
start_tok: start of a sentence
end_tok: end of a sentence
model_path: path of model
pred_path: path of prediction
train_file_path: path of training dataset
test_file_path: path of test dataset
text_column_name: column name of text
target_column_name: column name of target/label
'''
config = Config(
    # change to your own path
    file_path = "/content/drive/Shared drives/NLP/Preprocessing/Training Dataset-20191006/",
    date = pendulum.now().strftime('%Y%m%d'),
    seed=18,  
    roberta_model_name='roberta-base',
    max_lr=1e-5,
    epochs=1,
    bs=16,  
    max_seq_len=200,
    num_labels=5,
    hidden_dropout_prob=.05,
    hidden_size=768,
    valid_pct=0.30,
    start_tok="<s>",
    end_tok="</s>",
    text_column_name='text',
    target_column_name='label'
)

config.model_path = f'{config.file_path}/models/{config.roberta_model_name}_Epoch_{config.epochs}_len_{config.max_seq_len}_{config.date}.pkl'
config.pred_path = f'{config.file_path}/pred/{config.roberta_model_name}_Epoch_{config.epochs}_len_{config.max_seq_len}_{config.date}.csv'
config.train_file_path=f'{config.file_path}/labeled_data.csv'
config.test_file_path=f'{config.file_path}/test_data.csv'

# 3. SET UP TOKENIZER

In [0]:
class FastAiRobertaTokenizer(BaseTokenizer):
    """Wrapper around RobertaTokenizer to be compatible with fastai"""
    def __init__(self, tokenizer: RobertaTokenizer, max_seq_len: int=128, **kwargs): 
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len 
    def __call__(self, *args, **kwargs): 
        return self 
    def tokenizer(self, t:str) -> List[str]: 
        """Adds Roberta bos and eos tokens and limits the maximum sequence length""" 
        return [config.start_tok] + self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2] + [config.end_tok]

# create fastai tokenizer for roberta
roberta_tok = RobertaTokenizer.from_pretrained(config.roberta_model_name)

fastai_tokenizer = Tokenizer(tok_func=FastAiRobertaTokenizer(roberta_tok, max_seq_len=config.max_seq_len), 
                             pre_rules=[], post_rules=[])

# create fastai vocabulary for roberta
path = Path()
roberta_tok.save_vocabulary(path)

with open('vocab.json', 'r') as f:
    roberta_vocab_dict = json.load(f)
    
fastai_roberta_vocab = Vocab(list(roberta_vocab_dict.keys()))

# Setting up pre-processors
class RobertaTokenizeProcessor(TokenizeProcessor):
    def __init__(self, tokenizer):
         super().__init__(tokenizer=tokenizer, include_bos=False, include_eos=False)

class RobertaNumericalizeProcessor(NumericalizeProcessor):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, vocab=fastai_roberta_vocab, **kwargs)


def get_roberta_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
    """
    Constructing preprocessors for Roberta
    We remove sos and eos tokens since we add that ourselves in the tokenizer.
    We also use a custom vocabulary to match the numericalization with the original Roberta model.
    """
    return [RobertaTokenizeProcessor(tokenizer=tokenizer), NumericalizeProcessor(vocab=vocab)]

# 4. SETUP DATABUNCH

In [8]:
# Creating a Roberta specific DataBunch class
class RobertaDataBunch(TextDataBunch):
    "Create a `TextDataBunch` suitable for training Roberta"
    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=64, val_bs:int=None, pad_idx=1,
               pad_first=True, device:torch.device=None, no_check:bool=False, backwards:bool=False, 
               dl_tfms:Optional[Collection[Callable]]=None, **dl_kwargs) -> DataBunch:
        "Function that transform the `datasets` in a `DataBunch` for classification. Passes `**dl_kwargs` on to `DataLoader()`"
        datasets = cls._init_ds(train_ds, valid_ds, test_ds)
        val_bs = ifnone(val_bs, bs)
        collate_fn = partial(pad_collate, pad_idx=pad_idx, pad_first=pad_first, backwards=backwards)
        train_sampler = SortishSampler(datasets[0].x, key=lambda t: len(datasets[0][t][0].data), bs=bs)
        train_dl = DataLoader(datasets[0], batch_size=bs, sampler=train_sampler, drop_last=True, **dl_kwargs)
        dataloaders = [train_dl]
        for ds in datasets[1:]:
            lengths = [len(t) for t in ds.x.items]
            sampler = SortSampler(ds.x, key=lengths.__getitem__)
            dataloaders.append(DataLoader(ds, batch_size=val_bs, sampler=sampler, **dl_kwargs))
        return cls(*dataloaders, path=path, device=device, dl_tfms=dl_tfms, collate_fn=collate_fn, no_check=no_check)

class RobertaTextList(TextList):
    _bunch = RobertaDataBunch
    _label_cls = TextList

# load dataset
train_df = pd.read_csv(config.train_file_path)
test_df = pd.read_csv(config.test_file_path)
feat_cols = config.text_column_name
label_cols = config.target_column_name

# loading the tokenizer and vocab processors
processor = get_roberta_processor(
    tokenizer=fastai_tokenizer, vocab=fastai_roberta_vocab)

'''
creating databunch
from_df: import the data
split_by_rand_pct: split the data between the training and the validation set
label_from_df: get label from dataset
add_test: get test dataset
databunch: get some config (batch size)
'''
data = RobertaTextList.from_df(train_df, ".", cols=feat_cols, processor=processor) \
    .split_by_rand_pct(valid_pct=config.valid_pct, seed=config.seed) \
    .label_from_df(cols=label_cols, label_cls=CategoryList) \
    .add_test(RobertaTextList.from_df(test_df, ".", cols=feat_cols, processor=processor)) \
    .databunch(bs=config.bs, pad_first=False, pad_idx=0)

# remove unnecessary files
del train_df  
del test_df 
gc.collect()  # clean the memory

0

# 5. TRAINING AND VALIDATION

In [0]:
# defining our model architecture
class CustomRobertaModel(nn.Module):
    def __init__(self, num_labels=5):
        super(CustomRobertaModel, self).__init__()
        self.num_labels = num_labels  # get number of labels
        self.roberta = RobertaModel.from_pretrained(
            config.roberta_model_name)  # get pre-trained model
        # set up percentage of drop
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # defining final output layer
        self.classifier = nn.Linear(config.hidden_size, num_labels)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        _, pooled_output = self.roberta(
            input_ids, token_type_ids, attention_mask)
        logits = self.classifier(pooled_output)
        return logits

roberta_model = CustomRobertaModel()
learn = Learner(data, roberta_model, metrics=[accuracy]) #  use acc as evaluation metrics
learn = learn.to_fp16()  # train using half precision (instead of float 32) which can help to speedup

gc.collect() # clean the memory before modelling
learn.model.roberta.train() # setting roberta to train as it is in eval mode by default
learn.fit_one_cycle(config.epochs, max_lr=config.max_lr) # train on one epoch
learn.export(config.model_path)  # store the model

# 6. PREDICTION

In [0]:
def get_preds_as_nparray(ds_type) -> np.ndarray:
    learn.model.roberta.eval() # evaludation
    preds = learn.get_preds(ds_type)[0].detach().cpu().numpy() # prediction
    sampler = [i for i in data.dl(ds_type).sampler]
    reverse_sampler = np.argsort(sampler) # return the indices that would sort an array
    ordered_preds = preds[reverse_sampler, :] # get predicitons by order
    pred_values = np.argmax(ordered_preds, axis=1) # get the index of highiest probability
    return ordered_preds, pred_values # return value and probability

test_preds = get_preds_as_nparray(DatasetType.Test) # predict on test dataset

# import the test dataset again to get test_id
test_df = pd.read_csv(config.test_file_path)

prediction = test_preds[1] + 1 # convert predictions from (0-4) to (1-5)
test_df[config.text_column_name] = prediction 
test_df.drop([config.target_column_name], axis=1, inplace=True) # drop the text

In [11]:
test_df

Unnamed: 0,test_id,text
0,test_1,3
1,test_2,4
2,test_3,2
3,test_4,5
4,test_5,4
...,...,...
49995,test_49996,3
49996,test_49997,2
49997,test_49998,2
49998,test_49999,5


In [0]:
test_df.to_csv(config.pred_path, index=False)  # save the prediction on google drive