<a href="https://colab.research.google.com/github/haythemtellili/Great-notebook/blob/master/Roberta_multiclass.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Create a directory called kaggle and copy the kaggle.json file in it
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

!kaggle competitions download -c sentiment-analysis-on-movie-reviews
!chmod 600 /root/.kaggle/kaggle.json
!unzip \*.zip;

Downloading test.tsv.zip to /content
  0% 0.00/494k [00:00<?, ?B/s]
100% 494k/494k [00:00<00:00, 68.8MB/s]
Downloading sampleSubmission.csv to /content
  0% 0.00/583k [00:00<?, ?B/s]
100% 583k/583k [00:00<00:00, 82.6MB/s]
Downloading train.tsv.zip to /content
  0% 0.00/1.28M [00:00<?, ?B/s]
100% 1.28M/1.28M [00:00<00:00, 171MB/s]
Archive:  test.tsv.zip
  inflating: test.tsv                

Archive:  train.tsv.zip
  inflating: train.tsv               

2 archives were successfully processed.


In [23]:
import pandas as pd
train=pd.read_csv('/content/train.tsv',sep='\t', usecols=["Phrase", "Sentiment"])
test_df=pd.read_csv('/content/test.tsv',sep='\t', usecols=["Phrase"])
feat_cols = "Phrase"
label_cols = "Sentiment"
train.head()

Unnamed: 0,Phrase,Sentiment
0,A series of escapades demonstrating the adage ...,1
1,A series of escapades demonstrating the adage ...,2
2,A series,2
3,A,2
4,series,2


In [8]:
test_df.head()

Unnamed: 0,Phrase
0,An intermittently pleasing but mostly routine ...
1,An intermittently pleasing but mostly routine ...
2,An
3,intermittently pleasing but mostly routine effort
4,intermittently pleasing but mostly routine


In [10]:
train.shape,test_df.shape

((156060, 2), (66292, 1))

In [9]:
! pip install pytorch-transformers

Collecting pytorch-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/b7/d3d18008a67e0b968d1ab93ad444fc05699403fa662f634b2f2c318a508b/pytorch_transformers-1.2.0-py3-none-any.whl (176kB)
[K     |█▉                              | 10kB 29.2MB/s eta 0:00:01[K     |███▊                            | 20kB 6.0MB/s eta 0:00:01[K     |█████▋                          | 30kB 7.1MB/s eta 0:00:01[K     |███████▍                        | 40kB 5.7MB/s eta 0:00:01[K     |█████████▎                      | 51kB 6.0MB/s eta 0:00:01[K     |███████████▏                    | 61kB 7.1MB/s eta 0:00:01[K     |█████████████                   | 71kB 7.7MB/s eta 0:00:01[K     |██████████████▉                 | 81kB 7.3MB/s eta 0:00:01[K     |████████████████▊               | 92kB 8.1MB/s eta 0:00:01[K     |██████████████████▋             | 102kB 8.5MB/s eta 0:00:01[K     |████████████████████▍           | 112kB 8.5MB/s eta 0:00:01[K     |██████████████████████▎     

In [0]:
from fastai.text import *
from fastai.metrics import *
from pytorch_transformers import RobertaTokenizer

In [0]:
# Creating a config object to store task specific information
class Config(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)
    
    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)
        
config = Config(
    testing=False,
    seed = 2019,
    roberta_model_name='roberta-base', # can also be exchnaged with roberta-large 
    max_lr=1e-5,
    epochs=1,
    use_fp16=False,
    bs=4, 
    max_seq_len=256, 
    num_labels = 5,
    hidden_dropout_prob=.05,
    hidden_size=768, # 1024 for roberta-large
    start_tok = "<s>",
    end_tok = "</s>",
)

In [13]:
train.dtypes

Phrase       object
Sentiment     int64
dtype: object

In [0]:
class FastAiRobertaTokenizer(BaseTokenizer):
    """Wrapper around RobertaTokenizer to be compatible with fastai"""
    def __init__(self, tokenizer: RobertaTokenizer, max_seq_len: int=128, **kwargs): 
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len 
    def __call__(self, *args, **kwargs): 
        return self 
    def tokenizer(self, t:str) -> List[str]: 
        """Adds Roberta bos and eos tokens and limits the maximum sequence length""" 
        return [config.start_tok] + self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2] + [config.end_tok]

In [15]:
# create fastai tokenizer for roberta
roberta_tok = RobertaTokenizer.from_pretrained("roberta-base")

fastai_tokenizer = Tokenizer(tok_func=FastAiRobertaTokenizer(roberta_tok, max_seq_len=config.max_seq_len), 
                             pre_rules=[], post_rules=[])

100%|██████████| 898823/898823 [00:00<00:00, 2081954.68B/s]
100%|██████████| 456318/456318 [00:00<00:00, 1311324.02B/s]


In [0]:
# create fastai vocabulary for roberta
path = Path()
roberta_tok.save_vocabulary(path)

with open('vocab.json', 'r') as f:
    roberta_vocab_dict = json.load(f)
    
fastai_roberta_vocab = Vocab(list(roberta_vocab_dict.keys()))

In [0]:
# Setting up pre-processors
class RobertaTokenizeProcessor(TokenizeProcessor):
    def __init__(self, tokenizer):
         super().__init__(tokenizer=tokenizer, include_bos=False, include_eos=False)

class RobertaNumericalizeProcessor(NumericalizeProcessor):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, vocab=fastai_roberta_vocab, **kwargs)


def get_roberta_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
    """
    Constructing preprocessors for Roberta
    We remove sos and eos tokens since we add that ourselves in the tokenizer.
    We also use a custom vocabulary to match the numericalization with the original Roberta model.
    """
    return [RobertaTokenizeProcessor(tokenizer=tokenizer), NumericalizeProcessor(vocab=vocab)]


In [0]:
# Creating a Roberta specific DataBunch class
class RobertaDataBunch(TextDataBunch):
    "Create a `TextDataBunch` suitable for training Roberta"
    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=64, val_bs:int=None, pad_idx=1,
               pad_first=True, device:torch.device=None, no_check:bool=False, backwards:bool=False, 
               dl_tfms:Optional[Collection[Callable]]=None, **dl_kwargs) -> DataBunch:
        "Function that transform the `datasets` in a `DataBunch` for classification. Passes `**dl_kwargs` on to `DataLoader()`"
        datasets = cls._init_ds(train_ds, valid_ds, test_ds)
        val_bs = ifnone(val_bs, bs)
        collate_fn = partial(pad_collate, pad_idx=pad_idx, pad_first=pad_first, backwards=backwards)
        train_sampler = SortishSampler(datasets[0].x, key=lambda t: len(datasets[0][t][0].data), bs=bs)
        train_dl = DataLoader(datasets[0], batch_size=bs, sampler=train_sampler, drop_last=True, **dl_kwargs)
        dataloaders = [train_dl]
        for ds in datasets[1:]:
            lengths = [len(t) for t in ds.x.items]
            sampler = SortSampler(ds.x, key=lengths.__getitem__)
            dataloaders.append(DataLoader(ds, batch_size=val_bs, sampler=sampler, **dl_kwargs))
        return cls(*dataloaders, path=path, device=device, dl_tfms=dl_tfms, collate_fn=collate_fn, no_check=no_check)


In [0]:
class RobertaTextList(TextList):
    _bunch = RobertaDataBunch
    _label_cls = TextList

In [20]:
test_df.shape

(66292, 1)

In [24]:
processor = get_roberta_processor(tokenizer=fastai_tokenizer, vocab=fastai_roberta_vocab)
data = RobertaTextList.from_df(train, ".", cols=feat_cols, processor=processor) \
    .split_by_rand_pct(seed=2019) \
    .label_from_df(cols=label_cols,label_cls=CategoryList) \
    .add_test(RobertaTextList.from_df(test_df, ".", cols=feat_cols, processor=processor)) \
    .databunch(bs=4, pad_first=False, pad_idx=0)

In [0]:
import torch
import torch.nn as nn
from pytorch_transformers import RobertaModel

# defining our model architecture 
class CustomRobertaModel(nn.Module):
    def __init__(self,num_labels=5):
        super(CustomRobertaModel,self).__init__()
        self.num_labels = num_labels
        self.roberta = RobertaModel.from_pretrained(config.roberta_model_name)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels) # defining final output layer
        
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        _ , pooled_output = self.roberta(input_ids, token_type_ids, attention_mask) # 
        logits = self.classifier(pooled_output)        
        return logits

In [26]:
roberta_model = CustomRobertaModel()

learn = Learner(data, roberta_model, metrics=[accuracy])

100%|██████████| 524/524 [00:00<00:00, 222744.03B/s]
100%|██████████| 501200538/501200538 [00:19<00:00, 25790201.01B/s]


In [27]:
learn.model.roberta.train() # setting roberta to train as it is in eval mode by default
learn.fit_one_cycle(config.epochs, max_lr=config.max_lr)

epoch,train_loss,valid_loss,accuracy,time
0,0.726807,0.708036,0.701589,32:39


In [0]:
def get_preds_as_nparray(ds_type) -> np.ndarray:
    learn.model.roberta.eval()
    preds = learn.get_preds(ds_type)[0].detach().cpu().numpy()
    sampler = [i for i in data.dl(ds_type).sampler]
    reverse_sampler = np.argsort(sampler)
    ordered_preds = preds[reverse_sampler, :]
    pred_values = np.argmax(ordered_preds, axis=1)
    return ordered_preds, pred_values




In [29]:
test_preds,preds = get_preds_as_nparray(DatasetType.Test)

In [0]:
sub=pd.read_csv('/content/sampleSubmission.csv')


In [38]:
test_preds

array([[4.716541e-03, 1.266189e-01, 4.538473e-01, 4.012272e-01, 1.358993e-02],
       [3.533341e-03, 1.107268e-01, 4.676867e-01, 4.060097e-01, 1.204348e-02],
       [2.698367e-04, 4.460934e-03, 9.546169e-01, 4.015342e-02, 4.988877e-04],
       [2.668839e-03, 9.832936e-02, 4.930915e-01, 3.962442e-01, 9.666134e-03],
       ...,
       [1.810178e-01, 6.735034e-01, 1.383285e-01, 6.497589e-03, 6.526521e-04],
       [3.356638e-02, 5.888578e-01, 3.644367e-01, 1.276485e-02, 3.742652e-04],
       [3.098268e-02, 5.943120e-01, 3.603474e-01, 1.395341e-02, 4.045825e-04],
       [1.565725e-02, 5.251684e-01, 4.431560e-01, 1.561542e-02, 4.030015e-04]], dtype=float32)

In [39]:
preds

array([2, 2, 2, 2, ..., 1, 1, 1, 1])

In [0]:
sub['Sentiment']=preds

In [42]:
sub.head()

Unnamed: 0,PhraseId,Sentiment
0,156061,2
1,156062,2
2,156063,2
3,156064,2
4,156065,2


In [0]:
sub.to_csv('sentimentsub.csv',index=False)

In [44]:
preds, pred_values = get_preds_as_nparray(DatasetType.Valid)
(pred_values == data.valid_ds.y.items).mean()

0.7015891323849801