
**Package Installation**

In [None]:
# Clone the devkosal/fastai_roberta repository into the working directory.
# NOTE(review): nothing in the visible cells imports from this checkout — confirm it is still needed.
!git clone https://github.com/devkosal/fastai_roberta.git

Cloning into 'fastai_roberta'...
remote: Enumerating objects: 180, done.[K
remote: Counting objects: 100% (180/180), done.[K
remote: Compressing objects: 100% (130/130), done.[K
remote: Total 180 (delta 96), reused 110 (delta 44), pack-reused 0[K
Receiving objects: 100% (180/180), 25.46 MiB | 24.76 MiB/s, done.
Resolving deltas: 100% (96/96), done.


In [None]:
# Pinned versions this notebook was written against. `%pip` (rather than `!pip`)
# installs into the environment of the running kernel; `-q` keeps the download
# progress bars out of the saved notebook.
%pip install -q fastai==1.0.60 transformers==2.3.0

Collecting fastai==1.0.60
[?25l  Downloading https://files.pythonhosted.org/packages/f5/e4/a7025bf28f303dbda0f862c09a7f957476fa92c9271643b4061a81bb595f/fastai-1.0.60-py3-none-any.whl (237kB)
[K     |█▍                              | 10kB 23.6MB/s eta 0:00:01[K     |██▊                             | 20kB 31.4MB/s eta 0:00:01[K     |████▏                           | 30kB 24.6MB/s eta 0:00:01[K     |█████▌                          | 40kB 28.1MB/s eta 0:00:01[K     |███████                         | 51kB 26.0MB/s eta 0:00:01[K     |████████▎                       | 61kB 28.7MB/s eta 0:00:01[K     |█████████▋                      | 71kB 18.8MB/s eta 0:00:01[K     |███████████                     | 81kB 19.9MB/s eta 0:00:01[K     |████████████▍                   | 92kB 18.8MB/s eta 0:00:01[K     |█████████████▉                  | 102kB 18.7MB/s eta 0:00:01[K     |███████████████▏                | 112kB 18.7MB/s eta 0:00:01[K     |████████████████▋               | 122

**Load And Set Configuration**

In [None]:
from fastai.text import *
from fastai.metrics import *


In [None]:
from transformers import RobertaTokenizer

In [None]:
# Lightweight settings container for task-specific options
class Config(dict):
    """Dictionary whose entries can also be read as attributes.

    Every keyword passed to the constructor, and every pair passed to
    `set`, is stored both as a dict item and as an instance attribute.
    """

    def __init__(self, **kwargs):
        super().__init__()
        for name, value in kwargs.items():
            # Go through the class so a key literally named "set" cannot
            # shadow the method while the remaining keys are being stored.
            Config.set(self, name, value)

    def set(self, key, val):
        """Store `val` under `key` as both a dict item and an attribute."""
        setattr(self, key, val)
        self[key] = val
        
# Run-wide settings; keys are available both as config["key"] and config.key
config = Config(
    testing=True,
    seed=2019,
    roberta_model_name="roberta-base",  # can also be exchanged with roberta-large
    max_lr=1e-5,
    epochs=1,
    use_fp16=False,
    bs=4,
    max_seq_len=256,
    num_labels=2,
    hidden_dropout_prob=0.05,
    hidden_size=768,  # 1024 for roberta-large
    start_tok="<s>",
    end_tok="</s>",
)

In [None]:
# Load the main dataset; `df` is the frame the training DataBunch is built from further down.
# NOTE(review): the absolute /content path presumably only resolves on Colab — consider a configurable data directory.
df = pd.read_csv("/content/dataset.csv")

In [None]:
# Drop the columns at positions 0, 3 and 4, then turn each comma-separated
# label string into a list of label names.
# The vectorised .str.split replaces a row-by-row `df['Labels Set'][i] = ...`
# loop: that form is chained-indexing assignment, which triggers
# SettingWithCopyWarning and is not guaranteed to write back into `df`.
df = df.drop(df.columns[[0, 3, 4]], axis=1)
df['Labels Set'] = df['Labels Set'].str.split(',')

In [None]:
df

Unnamed: 0,Post,Labels Set
0,मेरे देश के हिन्दु बहुत निराले है। कुछ तो पक्क...,"[hate, offensive]"
1,सरकार हमेशा से किसानों की कमाई को बढ़ाने के लि...,[non-hostile]
2,"सुशांत ने जो बिजनेस डील 9 जून को की थी, वो डील...",[non-hostile]
3,@prabhav218 साले जेएनयू छाप कमिने लोग हिन्दुओं...,"[defamation, offensive]"
4,#unlock4guidelines - अनलॉक-4 के लिए गाइडलाइन्स...,[non-hostile]
...,...,...
5723,उदितराज जी हिम्मत जुटा कर उस नेता के लिए कुछ ...,[hate]
5724,उप्र: पीएम रिपोर्ट में खुलासा: हार्टअटैक से हु...,[non-hostile]
5725,नौकरी गंवा चुके दोस्त की मदद: नगद के बजाए गिफ्...,[non-hostile]
5726,बंगाल में हिन्दू मरे हैं इसलिए मुझे कोई फर्क न...,[fake]


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# One 0/1 indicator column per label name; 'non-hostile' is shortened to 'nh'
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(df['Labels Set'])
train_result = (
    pd.DataFrame(y, columns=multilabel.classes_)
      .rename(columns={'non-hostile': 'nh'})
)
train_result

Unnamed: 0,defamation,fake,hate,nh,offensive
0,0,0,1,0,1
1,0,0,0,1,0
2,0,0,0,1,0
3,1,0,0,0,1
4,0,0,0,1,0
...,...,...,...,...,...
5723,0,0,1,0,0
5724,0,0,0,1,0
5725,0,0,0,1,0
5726,0,1,0,0,0


In [None]:
# Attach the indicator columns to the posts. Indicator columns left over from a
# previous run of this cell are dropped first, so re-running the cell does not
# append a second copy of every label column (on a first run the drop is a no-op).
df = pd.concat([df.drop(columns=train_result.columns, errors='ignore'), train_result], axis=1)

In [None]:
df.head()

Unnamed: 0,Post,Labels Set,defamation,fake,hate,nh,offensive
0,मेरे देश के हिन्दु बहुत निराले है। कुछ तो पक्क...,"[hate, offensive]",0,0,1,0,1
1,सरकार हमेशा से किसानों की कमाई को बढ़ाने के लि...,[non-hostile],0,0,0,1,0
2,"सुशांत ने जो बिजनेस डील 9 जून को की थी, वो डील...",[non-hostile],0,0,0,1,0
3,@prabhav218 साले जेएनयू छाप कमिने लोग हिन्दुओं...,"[defamation, offensive]",1,0,0,0,1
4,#unlock4guidelines - अनलॉक-4 के लिए गाइडलाइन्स...,[non-hostile],0,0,0,1,0


In [None]:
# Load the validation file and encode its labels the same way as the training frame.
test_df = pd.read_csv("/content/validation.csv")
test_df = test_df.drop(test_df.columns[[0]], axis=1)
# Vectorised split instead of the chained-indexing loop `test_df['Labels Set'][i] = ...`,
# which raises SettingWithCopyWarning and may not write back into the frame.
test_df['Labels Set'] = test_df['Labels Set'].str.split(',')
# Reuse the binarizer already fitted on the training labels (`multilabel`) rather
# than fitting a new one here: a fresh fit derives its columns from whatever labels
# happen to occur in this file, so the column set/order could silently differ from
# the training frame. The redundant re-import of MultiLabelBinarizer is dropped.
y = multilabel.transform(test_df['Labels Set'])
test_result = pd.DataFrame(y, columns=multilabel.classes_)
test_result = test_result.rename({'non-hostile': 'nh'}, axis=1)
test_result

Unnamed: 0,defamation,fake,hate,nh,offensive
0,0,0,0,1,0
1,1,0,0,0,0
2,0,0,0,1,0
3,0,0,0,1,0
4,0,0,0,1,0
...,...,...,...,...,...
806,0,0,1,0,0
807,1,0,0,0,0
808,0,0,0,1,0
809,0,0,0,1,0


In [None]:
# Attach the indicator columns to the validation posts. Leftover indicator columns
# from a previous run of this cell are dropped first so that re-running it does not
# duplicate them (on a first run the drop is a no-op).
test_df = pd.concat([test_df.drop(columns=test_result.columns, errors='ignore'), test_result], axis=1)
test_df.head()

Unnamed: 0,Post,Labels Set,defamation,fake,hate,nh,offensive
0,दृढ़ इच्छा शक्ति से परिपूर्ण प्रणबदा के लिए दे...,[non-hostile],0,0,0,1,0
1,भारतीय जनता पार्टी rss वाले इतने गिरे हुए हैं ...,[defamation],1,0,0,0,0
2,कोरोना से निपटने की तैयारी / दिल्ली में 10 हजा...,[non-hostile],0,0,0,1,0
3,गवर्नर कॉन्फ्रेंस में PM मोदी बोले- शिक्षा नीत...,[non-hostile],0,0,0,1,0
4,"यूपी: गाजीपुर में Toilet घोटाला, प्रधान व सचिव...",[non-hostile],0,0,0,1,0


In [None]:
# frames = [df, test_df]
# df = pd.concat(frames)


In [None]:
# Show the frame that goes into the DataBunch. Only the last expression of a cell
# is rendered, so the bare `df.head()` that used to precede this line had no
# effect and has been removed.
df

Unnamed: 0,Post,Labels Set,defamation,fake,hate,nh,offensive
0,मेरे देश के हिन्दु बहुत निराले है। कुछ तो पक्क...,"[hate, offensive]",0,0,1,0,1
1,सरकार हमेशा से किसानों की कमाई को बढ़ाने के लि...,[non-hostile],0,0,0,1,0
2,"सुशांत ने जो बिजनेस डील 9 जून को की थी, वो डील...",[non-hostile],0,0,0,1,0
3,@prabhav218 साले जेएनयू छाप कमिने लोग हिन्दुओं...,"[defamation, offensive]",1,0,0,0,1
4,#unlock4guidelines - अनलॉक-4 के लिए गाइडलाइन्स...,[non-hostile],0,0,0,1,0
...,...,...,...,...,...,...,...
5723,उदितराज जी हिम्मत जुटा कर उस नेता के लिए कुछ ...,[hate],0,0,1,0,0
5724,उप्र: पीएम रिपोर्ट में खुलासा: हार्टअटैक से हु...,[non-hostile],0,0,0,1,0
5725,नौकरी गंवा चुके दोस्त की मदद: नगद के बजाए गिफ्...,[non-hostile],0,0,0,1,0
5726,बंगाल में हिन्दू मरे हैं इसलिए मुझे कोई फर्क न...,[fake],0,1,0,0,0


In [None]:
# Name of the text column fed to the model (passed as `cols=` when the text list is built).
feat_cols = "Post"
# Name of the 0/1 target column; this run classifies the `fake` label only.
label_cols = "fake"


**Setting Up the Tokenizer**

In [None]:

class FastAiRobertaTokenizer(BaseTokenizer):
    """Adapter that exposes a `RobertaTokenizer` through fastai's `BaseTokenizer` interface"""

    def __init__(self, tokenizer: RobertaTokenizer, max_seq_len: int=128, **kwargs):
        self.max_seq_len = max_seq_len
        self._pretrained_tokenizer = tokenizer

    def __call__(self, *args, **kwargs):
        # Calling the instance hands back the instance itself, so an already-built
        # object can be passed where a tokenizer factory (`tok_func`) is expected.
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Tokenize `t`, keep at most `max_seq_len - 2` pieces and wrap them in the configured start/end tokens"""
        budget = self.max_seq_len - 2  # two positions are taken by the start and end tokens
        pieces = self._pretrained_tokenizer.tokenize(t)
        return [config.start_tok, *pieces[:budget], config.end_tok]

In [None]:
# create fastai tokenizer for roberta
# Load the tokenizer for the checkpoint named in `config` (instead of a hard-coded
# "roberta-base") so that changing `config.roberta_model_name` keeps the tokenizer
# and the model, which is loaded from the same setting, in step.
roberta_tok = RobertaTokenizer.from_pretrained(config.roberta_model_name)

# No fastai pre/post rules: the text is passed to the RoBERTa tokenizer untouched.
fastai_tokenizer = Tokenizer(
    tok_func=FastAiRobertaTokenizer(roberta_tok, max_seq_len=config.max_seq_len),
    pre_rules=[],
    post_rules=[],
)

In [None]:
# create fastai vocabulary for roberta
path = Path()
roberta_tok.save_vocabulary(path)

# The vocabulary contains non-ASCII symbols, so decode the file explicitly as UTF-8
# instead of relying on the platform's default encoding.
with open(path/'vocab.json', 'r', encoding='utf-8') as f:
    roberta_vocab_dict = json.load(f)

# vocab.json maps token -> id. Order the tokens by id so that position i in the
# fastai vocab is exactly token id i, rather than trusting the key order of the file.
fastai_roberta_vocab = Vocab(sorted(roberta_vocab_dict, key=roberta_vocab_dict.get))

In [None]:

# Pre-processors that plug the Roberta tokenizer and vocab into fastai
class RobertaTokenizeProcessor(TokenizeProcessor):
    """`TokenizeProcessor` configured with `include_bos=False` and `include_eos=False`"""

    def __init__(self, tokenizer):
        marker_flags = dict(include_bos=False, include_eos=False)
        super().__init__(tokenizer=tokenizer, **marker_flags)

class RobertaNumericalizeProcessor(NumericalizeProcessor):
    """`NumericalizeProcessor` under a Roberta-specific name; all arguments are forwarded unchanged"""

    def __init__(self, *args, **kwargs):
        NumericalizeProcessor.__init__(self, *args, **kwargs)


def get_roberta_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
    """
    Build the pre-processing pipeline for Roberta.

    Returns a two-item list: a tokenize step that adds no bos/eos tokens (the
    tokenizer wrapper adds them itself), followed by a numericalize step that uses
    the supplied `vocab` so ids line up with the original Roberta model.
    """
    tokenize_step = RobertaTokenizeProcessor(tokenizer=tokenizer)
    numericalize_step = RobertaNumericalizeProcessor(vocab=vocab)
    return [tokenize_step, numericalize_step]

**Setting up the DataBunch**

In [None]:
# DataBunch variant used for the Roberta text lists
class RobertaDataBunch(TextDataBunch):
    "`TextDataBunch` subclass that batches Roberta inputs with padding and length-keyed samplers"
    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=64, val_bs:int=None, pad_idx=1,
               pad_first=True, device:torch.device=None, no_check:bool=False, backwards:bool=False, 
               dl_tfms:Optional[Collection[Callable]]=None, **dl_kwargs) -> DataBunch:
        "Wrap the datasets in `DataLoader`s (padding done by the collate function) and return them as a `DataBunch`. `**dl_kwargs` is forwarded to every `DataLoader()`"
        datasets = cls._init_ds(train_ds, valid_ds, test_ds)
        if val_bs is None: val_bs = bs
        train_set, eval_sets = datasets[0], datasets[1:]

        # Training loader: sampler keyed on the length of each item's data; the last incomplete batch is dropped
        train_dl = DataLoader(
            train_set,
            batch_size=bs,
            sampler=SortishSampler(train_set.x, key=lambda t: len(train_set[t][0].data), bs=bs),
            drop_last=True,
            **dl_kwargs,
        )

        def _eval_loader(ds):
            # Each evaluation set gets its own length table; the bound method keeps a reference to it
            sizes = [len(item) for item in ds.x.items]
            return DataLoader(ds, batch_size=val_bs, sampler=SortSampler(ds.x, key=sizes.__getitem__), **dl_kwargs)

        dataloaders = [train_dl] + [_eval_loader(ds) for ds in eval_sets]
        collate_fn = partial(pad_collate, pad_idx=pad_idx, pad_first=pad_first, backwards=backwards)
        return cls(*dataloaders, path=path, device=device, dl_tfms=dl_tfms, collate_fn=collate_fn, no_check=no_check)

In [None]:
class RobertaTextList(TextList):
    "`TextList` subclass that points its `_bunch` attribute at `RobertaDataBunch`"
    # DataBunch class associated with this list type
    _bunch = RobertaDataBunch
    # Default label class; the cell that builds `data` passes label_cls=CategoryList explicitly
    _label_cls = TextList

In [None]:
# Build the [tokenize, numericalize] processor pair from the fastai-wrapped
# Roberta tokenizer and the Roberta vocabulary created in the cells above
processor = get_roberta_processor(tokenizer=fastai_tokenizer, vocab=fastai_roberta_vocab)

In [None]:
# creating our databunch
# Random train/validation split (seeded from config), single-column category labels,
# sequences padded on the right.
# Pad with the tokenizer's own <pad> id. The previous pad_idx=0 is the id of "<s>" in
# this vocabulary, so short sequences were being filled with start-of-sequence tokens.
data = RobertaTextList.from_df(df, ".", cols=feat_cols, processor=processor) \
    .split_by_rand_pct(seed=config.seed) \
    .label_from_df(cols=label_cols,label_cls=CategoryList) \
    .databunch(bs=config.bs, pad_first=False, pad_idx=roberta_tok.pad_token_id)

  return np.array(a, dtype=dtype, **kwargs)


  return array(a, dtype, copy=False, order=order)


In [None]:

data

RobertaDataBunch;

Train: LabelList (4583 items)
x: RobertaTextList
<s> à¤ ® à¥ ĩ à¤ ° à¥ ĩ Ġà¤ ¦ à¥ ĩ à¤ ¶ Ġà¤ ķ à¥ ĩ Ġà¤ ¹ à¤ ¿ à¤ ¨ à¥ į à¤ ¦ à¥ ģ Ġà¤ ¬ à¤ ¹ à¥ ģ à¤ ¤ Ġà¤ ¨ à¤ ¿ à¤ ° à¤¾ à¤ ² à¥ ĩ Ġà¤ ¹ à¥ Ī à¥ ¤ Ġà¤ ķ à¥ ģ à¤ Ľ Ġà¤ ¤ à¥ ĭ Ġà¤ ª à¤ ķ à¥ į à¤ ķ à¥ ĩ Ġà¤ ° à¤¾ à¤ ® Ġà¤ Ń à¤ ķ à¥ į à¤ ¤ Ġà¤ ¹ à¥ Ī Ġà¤ Ķ à¤ ° Ġà¤ ķ à¥ ģ à¤ Ľ Ġà¤ ¬ à¤¾ à¤ ¬ à¤ ° Ġà¤ ķ à¥ ĩ Ġà¤ ¸ à¤¾ à¤ ² à¥ ĩ Ġà¤ ¹ à¥ Ī Ġ Ċ Ċ ðŁ Ļ ı Ġà¤ ľ à¤ ¯ Ġà¤ ¶ à¥ į à¤ ° à¥ Ģ Ġà¤ ° à¤¾ à¤ ® ĠðŁ Ļ ı </s>,<s> à¤ ¸ à¤ ° à¤ ķ à¤¾ à¤ ° Ġà¤ ¹ à¤ ® à¥ ĩ à¤ ¶ à¤¾ Ġà¤ ¸ à¥ ĩ Ġà¤ ķ à¤ ¿ à¤ ¸ à¤¾ à¤ ¨ à¥ ĭ à¤ Ĥ Ġà¤ ķ à¥ Ģ Ġà¤ ķ à¤ ® à¤¾ à¤ Ī Ġà¤ ķ à¥ ĭ Ġà¤ ¬ à¤ ¢ à¤ ¼ à¤¾ à¤ ¨ à¥ ĩ Ġà¤ ķ à¥ ĩ Ġà¤ ² à¤ ¿ à¤ ı Ġà¤ ¨ à¤ Ī - à¤ ¨ à¤ Ī Ġà¤ ¸ à¥ į à¤ ķ à¥ Ģ à¤ ® à¥ ĩ à¤ Ĥ Ġà¤ ² à¤¾ à¤ ¤ à¥ Ģ Ġà¤ ° à¤ ¹ à¤ ¤ à¥ Ģ Ġà¤ ¹ à¥ Ī , Ġà¤ ¤ à¤¾ à¤ ķ à¤ ¿ Ġà¤ ī à¤ ¨ Ġà¤ ª à¤ ° Ġà¤ ľ à¥ į à¤ ¯ à¤¾ à¤ ¦ à¤¾ Ġà¤ Ĩ à¤ ° à¥ į à¤ ¥ à¤ ¿ à¤ ķ Ġà¤ ¬ à¥ ĭ à¤ Ŀ Ġà¤ ¨ Ġà¤ ª à¤ ¡ à¤ ¼ à¥ ĩ . Ċ Ċ https :// t . co / 8 iy 2 MJ SB As </s>,<s> @ pr ab h av 21

**Building the Model** 

In [None]:
import torch
import torch.nn as nn
from transformers import RobertaModel

# defining our model architecture
class CustomRobertaModel(nn.Module):
    """Pretrained Roberta encoder followed by dropout and a linear classification head.

    Args:
        num_labels: number of output classes (width of the returned logits).
    """
    def __init__(self,num_labels=2):
        super(CustomRobertaModel,self).__init__()
        self.num_labels = num_labels
        self.roberta = RobertaModel.from_pretrained(config.roberta_model_name)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Size the head from the loaded checkpoint's own config so it stays correct when
        # `config.roberta_model_name` changes (768 for roberta-base, 1024 for roberta-large).
        self.classifier = nn.Linear(self.roberta.config.hidden_size, num_labels) # defining final output layer

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Return classification logits for a batch of token ids.

        Args:
            input_ids: token id tensor passed to the encoder.
            token_type_ids: optional, forwarded to the encoder.
            attention_mask: optional, forwarded to the encoder.
            labels: accepted for interface compatibility; not used here.

        Returns:
            Output of the linear head applied to the encoder's pooled output.
        """
        # Pass the optional tensors by keyword: RobertaModel.forward takes attention_mask
        # before token_type_ids, so passing them positionally in this method's order
        # would swap the two.
        outputs = self.roberta(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]  # second element of the encoder output tuple
        # Apply the dropout layer built in __init__ (it was previously never used);
        # it is the identity in eval mode.
        logits = self.classifier(self.dropout(pooled_output))
        return logits

In [None]:

# Instantiate the classifier with the configured number of labels
roberta_model = CustomRobertaModel(num_labels=config.num_labels)

# Wrap the DataBunch and the model in a fastai Learner that reports accuracy.
# NOTE(review): no loss_func is passed here — presumably fastai derives it from `data`; confirm.
learn = Learner(data, roberta_model, metrics=[accuracy])

In [None]:
learn.model.roberta.train() # setting roberta to train as it is in eval mode by default
# Train with fastai's fit_one_cycle for config.epochs epoch(s), using config.max_lr as max_lr
learn.fit_one_cycle(config.epochs, max_lr=config.max_lr)

epoch,train_loss,valid_loss,accuracy,time
0,0.367236,0.337991,0.806114,04:09


  return array(a, dtype, copy=False, order=order)


In [None]:
def get_preds_as_nparray(ds_type) -> Tuple[np.ndarray, np.ndarray]:
    """Get predictions for `ds_type` in dataset order, together with the predicted class per row.

    Uses the notebook-level `learn` and `data` objects.

    Args:
        ds_type: dataset selector passed to `learn.get_preds` and `data.dl`
            (e.g. `DatasetType.Valid`).

    Returns:
        A pair `(ordered_preds, pred_values)`: the 2-D array of predictions returned by
        `learn.get_preds`, re-ordered to dataset order, and the argmax over axis 1 of
        each row.
    """
    learn.model.roberta.eval()
    preds = learn.get_preds(ds_type)[0].detach().cpu().numpy()
    # The data loader's sampler serves items in its own order, not dataset order.
    # Listing the sampler gives that permutation and argsort inverts it.
    # NOTE(review): assumes iterating the sampler again reproduces the order used
    # by get_preds — confirm for samplers that shuffle.
    sampler = list(data.dl(ds_type).sampler)
    reverse_sampler = np.argsort(sampler)
    ordered_preds = preds[reverse_sampler, :]
    pred_values = np.argmax(ordered_preds, axis=1)
    return ordered_preds, pred_values

In [None]:
# Predictions (in dataset order) and predicted class ids for the validation split
preds, pred_values = get_preds_as_nparray(DatasetType.Valid)


In [None]:
# accuracy on valid: fraction of validation items whose predicted class equals the stored label
(pred_values == data.valid_ds.y.items).mean()

0.8061135371179039

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the validation split.
# NOTE(review): in the saved output class 1 has precision and recall 0.00, i.e. the
# model never predicted it, and the 0.81 accuracy equals the share of class 0
# (923 of 1145) — accuracy alone is misleading for this run.
print(classification_report(data.valid_ds.y.items,pred_values))

              precision    recall  f1-score   support

           0       0.81      1.00      0.89       923
           1       0.00      0.00      0.00       222

    accuracy                           0.81      1145
   macro avg       0.40      0.50      0.45      1145
weighted avg       0.65      0.81      0.72      1145



  _warn_prf(average, modifier, msg_start, len(result))
