<a href="https://colab.research.google.com/github/jaydebsarker/toxicity-detector/blob/master/bertwithfai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:

import numpy as np
import pandas as pd

from pathlib import Path
from typing import *

import torch
import torch.optim as optim

from fastai import *
from fastai.vision import *
from fastai.text import *
from fastai.callbacks import *

from sklearn.metrics import classification_report

In [3]:
!pip install pytorch_pretrained_bert



In [4]:
class Config(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)
    
    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)

config = Config(
    testing=False,
    bert_model_name="bert-base-uncased",
    max_lr=3e-5,
    epochs=1,
    use_fp16=False,
    bs=4,
    discriminative=False,
    max_seq_len=128,
)

from pytorch_pretrained_bert import BertTokenizer
bert_tok = BertTokenizer.from_pretrained(
    config.bert_model_name,
)

In [5]:

class FastAiBertTokenizer(BaseTokenizer):
    """Wrapper around BertTokenizer to be compatible with fast.ai"""
    def __init__(self, tokenizer: BertTokenizer, max_seq_len: int=128, **kwargs):
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __call__(self, *args, **kwargs):
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Limits the maximum sequence length"""
        return ["[CLS]"] + self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2] + ["[SEP]"]

In [6]:

fastai_tokenizer = Tokenizer(tok_func=FastAiBertTokenizer(bert_tok, max_seq_len=config.max_seq_len), pre_rules=[], post_rules=[])
fastai_bert_vocab = Vocab(list(bert_tok.vocab.keys()))

In [7]:

import pandas as pd
from google.colab import files
uploaded = files.upload()

Saving code-review-dataset-full.csv to code-review-dataset-full (1).csv


In [8]:

import io
df2 = pd.read_csv(io.BytesIO(uploaded['code-review-dataset-full.csv']))

In [9]:
train=df2.sample(frac=0.5, replace=True, random_state=1)

In [10]:
train["is_toxic"].value_counts()

0    7900
1    1886
Name: is_toxic, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split
initial_train=train.fillna("fillna").values

train, val = train_test_split(train)
test=val
if config.testing:
    train = train.head(1024)
    val = val.head(1024)
    test = test.head(1024)

In [12]:
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import  precision_score
from sklearn.metrics import  f1_score


num_folds = 10

run_precision = []
run_recall = []
run_f1score = []
run_accuracy = []

In [13]:
fold_no = 1
import pandas as pd
from sklearn.model_selection import train_test_split
for train_index, test_index in KFold(10).split(initial_train):
    print("Starting Fold: ", fold_no)
     
    
    x_train, x_val = initial_train[train_index], initial_train[test_index]
   
    #xp_train, xp_test= train_test_split(x_train,  test_size=0.11115, random_state=125
    
    train = pd.DataFrame(data=x_train, index=[i for i in range(x_train.shape[0])], columns=[ "message", "is_toxic"])
     
    val = pd.DataFrame(data=x_val, index=[i for i in range(x_val.shape[0])], columns=[ "message", "is_toxic"])
    
    x_train, x_test= train_test_split(x_train,  test_size=0.11115, random_state=125)

    train = pd.DataFrame(data=x_train, index=[i for i in range(x_train.shape[0])], columns=[ "message", "is_toxic"])
    test = pd.DataFrame(data=x_test, index=[i for i in range(x_test.shape[0])], columns=[ "message", "is_toxic"])
    
    print(len(train))
    print(len(val))
    print(len(test))
    indexs=[train_index]
    val_indexes=[test_index]
     
    
    
    if config.testing:
        train = train.head(1024)
        val = val.head(1024)
        test = test.head(1024)
    
    test = test.fillna(0)

     
    
    
    label_cols = ["is_toxic","is_toxic"]
    
    print("validation counts", val["is_toxic"].value_counts())
    print("test counts", test["is_toxic"].value_counts())

    databunch = TextDataBunch.from_df(".", train, val,  test,
                  tokenizer=fastai_tokenizer,
                  vocab=fastai_bert_vocab,
                  include_bos=False,
                  include_eos=False,
                  text_cols="message",
                  label_cols=label_cols,
                  bs=config.bs,
                  collate_fn=partial(pad_collate, pad_first=False, pad_idx=0),)
    
    
   
    
    class BertTokenizeProcessor(TokenizeProcessor):
        def __init__(self, tokenizer):
            super().__init__(tokenizer=tokenizer, include_bos=False, include_eos=False)

    class BertNumericalizeProcessor(NumericalizeProcessor):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, vocab=Vocab(list(bert_tok.vocab.keys())), **kwargs)

        def get_bert_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
   
            return [BertTokenizeProcessor(tokenizer=tokenizer),
            NumericalizeProcessor(vocab=vocab)]
    
    
    
    from pytorch_pretrained_bert.modeling import BertConfig, BertForSequenceClassification
    bert_model = BertForSequenceClassification.from_pretrained(config.bert_model_name, num_labels=2)
    
    loss_func = nn.BCEWithLogitsLoss()
    from fastai.callbacks import *

    learner = Learner(
    databunch, bert_model,
    loss_func=loss_func,)
    
    if config.use_fp16: learner = learner.to_fp16()
    
    
    ##training algorithm 
    learner.lr_find()
    learner.fit_one_cycle(config.epochs, max_lr=config.max_lr)
    
    
    def get_preds_as_nparray(ds_type) -> np.ndarray:
        preds = learner.get_preds(ds_type)[0].detach().cpu().numpy()
        sampler = [i for i in databunch.dl(ds_type).sampler]
        reverse_sampler = np.argsort(sampler)
        return preds[reverse_sampler, :]
    

    #val_pred=get_preds_as_nparray(DatasetType.Valid)
    val_pred=get_preds_as_nparray(DatasetType.Test)
    val_pred=val_pred[:,0]
    val_pred = (val_pred >= 0.5)
    
    val_main=test['is_toxic']
    val_main=(val_main)
    
    precision = precision_score(val_main, val_pred, pos_label=1)
    recall = recall_score(val_main, val_pred, pos_label=1)
    f1score = f1_score(val_main, val_pred, pos_label=1)
    accuracy = accuracy_score(val_main, val_pred)

    print(classification_report(val_main, val_pred))


    run_accuracy.append(accuracy)
    run_f1score.append(f1score)
    run_precision.append(precision)
    run_recall.append(recall)
    
    fold_no=fold_no+1

Starting Fold:  1
7828
979
979
validation counts 0    799
1    180
Name: is_toxic, dtype: int64
test counts 0    800
1    179
Name: is_toxic, dtype: int64


epoch,train_loss,valid_loss,time


LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.


epoch,train_loss,valid_loss,time
0,0.129623,0.130621,1:30:47


              precision    recall  f1-score   support

           0       0.97      0.95      0.96       800
           1       0.80      0.87      0.83       179

    accuracy                           0.94       979
   macro avg       0.89      0.91      0.90       979
weighted avg       0.94      0.94      0.94       979

Starting Fold:  2
7828
979
979
validation counts 0    783
1    196
Name: is_toxic, dtype: int64
test counts 0    804
1    175
Name: is_toxic, dtype: int64


epoch,train_loss,valid_loss,time


LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.


epoch,train_loss,valid_loss,time


KeyboardInterrupt: ignored

In [2]:
from statistics import *

print(mean(run_precision),mean(run_recall),mean(run_f1score),mean(run_accuracy))


NameError: ignored