In [2]:
import numpy as np
import pandas as pd

from pathlib import Path
from typing import *

import torch
import torch.optim as optim

from fastai import *
#from fastai.vision import *
from fastai.text import *
from fastai.callbacks import *

from sklearn.metrics import classification_report

In [4]:
class Config(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)
    
    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)

config = Config(
    testing=False,
    bert_model_name="bert-base-uncased",
    max_lr=3e-5,
    epochs=1,
    use_fp16=False,
    bs=4,
    discriminative=False,
    max_seq_len=128,
)

from pytorch_pretrained_bert import BertTokenizer
bert_tok = BertTokenizer.from_pretrained(
    config.bert_model_name,
)

In [3]:
#!conda install fastai pytorch=1.4.0 -c fastai -c pytorch -c conda-forge


In [5]:

class FastAiBertTokenizer(BaseTokenizer):
    """Wrapper around BertTokenizer to be compatible with fast.ai"""
    def __init__(self, tokenizer: BertTokenizer, max_seq_len: int=128, **kwargs):
        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __call__(self, *args, **kwargs):
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Limits the maximum sequence length"""
        return ["[CLS]"] + self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2] + ["[SEP]"]


In [6]:

fastai_tokenizer = Tokenizer(tok_func=FastAiBertTokenizer(bert_tok, max_seq_len=config.max_seq_len), pre_rules=[], post_rules=[])
fastai_bert_vocab = Vocab(list(bert_tok.vocab.keys()))

In [7]:
import pandas as pd


In [8]:
train = pd.read_csv('../code-review-dataset-full.csv', sep=',', header=0)


In [9]:
from sklearn.model_selection import train_test_split
initial_train=train.fillna("fillna").values

train, val = train_test_split(train)
test=val
if config.testing:
    train = train.head(1024)
    val = val.head(1024)
    test = test.head(1024)

In [10]:
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import  precision_score
from sklearn.metrics import  f1_score


num_folds = 2

run_precision = []
run_recall = []
run_f1score = []
run_accuracy = []

In [11]:

fold_no = 1
import pandas as pd
from sklearn.model_selection import train_test_split
for train_index, test_index in KFold(10).split(initial_train):
    print("Starting Fold: ", fold_no)
     
    
    x_train, x_val = initial_train[train_index], initial_train[test_index]
   
    #xp_train, xp_test= train_test_split(x_train,  test_size=0.11115, random_state=125
    
    train = pd.DataFrame(data=x_train, index=[i for i in range(x_train.shape[0])], columns=[ "message", "is_toxic"])
     
    val = pd.DataFrame(data=x_val, index=[i for i in range(x_val.shape[0])], columns=[ "message", "is_toxic"])
    
    x_train, x_test= train_test_split(x_train,  test_size=0.11115, random_state=125)

    train = pd.DataFrame(data=x_train, index=[i for i in range(x_train.shape[0])], columns=[ "message", "is_toxic"])
    test = pd.DataFrame(data=x_test, index=[i for i in range(x_test.shape[0])], columns=[ "message", "is_toxic"])
    
    print(len(train))
    print(len(val))
    print(len(test))
    indexs=[train_index]
    val_indexes=[test_index]

Starting Fold:  1
15655
1958
1958
Starting Fold:  1
15656
1957
1958
Starting Fold:  1
15656
1957
1958
Starting Fold:  1
15656
1957
1958
Starting Fold:  1
15656
1957
1958
Starting Fold:  1
15656
1957
1958
Starting Fold:  1
15656
1957
1958
Starting Fold:  1
15656
1957
1958
Starting Fold:  1
15656
1957
1958
Starting Fold:  1
15656
1957
1958


In [12]:
fold_no = 1
import pandas as pd
from sklearn.model_selection import train_test_split
for train_index, test_index in KFold(10).split(initial_train):
    print("Starting Fold: ", fold_no)
     
    
    x_train, x_val = initial_train[train_index], initial_train[test_index]
   
    #xp_train, xp_test= train_test_split(x_train,  test_size=0.11115, random_state=125
    
    train = pd.DataFrame(data=x_train, index=[i for i in range(x_train.shape[0])], columns=[ "message", "is_toxic"])
     
    val = pd.DataFrame(data=x_val, index=[i for i in range(x_val.shape[0])], columns=[ "message", "is_toxic"])
    
    x_train, x_test= train_test_split(x_train,  test_size=0.11115, random_state=125)

    train = pd.DataFrame(data=x_train, index=[i for i in range(x_train.shape[0])], columns=[ "message", "is_toxic"])
    test = pd.DataFrame(data=x_test, index=[i for i in range(x_test.shape[0])], columns=[ "message", "is_toxic"])
    
    print(len(train))
    print(len(val))
    print(len(test))
    indexs=[train_index]
    val_indexes=[test_index]
     
    
    
    if config.testing:
        train = train.head(1024)
        val = val.head(1024)
        test = test.head(1024)
    
    test = test.fillna(0)

     
    
    label_cols = ["is_toxic","is_toxic"]
    
    
    databunch = TextDataBunch.from_df(".", train, val,  test,
                  tokenizer=fastai_tokenizer,
                  vocab=fastai_bert_vocab,
                  include_bos=False,
                  include_eos=False,
                  text_cols="message",
                  label_cols=label_cols,
                  bs=config.bs,
                  collate_fn=partial(pad_collate, pad_first=False, pad_idx=0),)
    
    
   
    
    class BertTokenizeProcessor(TokenizeProcessor):
        def __init__(self, tokenizer):
            super().__init__(tokenizer=tokenizer, include_bos=False, include_eos=False)

    class BertNumericalizeProcessor(NumericalizeProcessor):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, vocab=Vocab(list(bert_tok.vocab.keys())), **kwargs)

        def get_bert_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
   
            return [BertTokenizeProcessor(tokenizer=tokenizer),
            NumericalizeProcessor(vocab=vocab)]
    
    
    
    from pytorch_pretrained_bert.modeling import BertConfig, BertForSequenceClassification
    bert_model = BertForSequenceClassification.from_pretrained(config.bert_model_name, num_labels=2)
    
    loss_func = nn.BCEWithLogitsLoss()
    from fastai.callbacks import *

    learner = Learner(
    databunch, bert_model,
    loss_func=loss_func,)
    
    if config.use_fp16: learner = learner.to_fp16()
    
    
    ##training algorithm 
    learner.lr_find()
    learner.fit_one_cycle(config.epochs, max_lr=config.max_lr)
    
    
    def get_preds_as_nparray(ds_type) -> np.ndarray:
        preds = learner.get_preds(ds_type)[0].detach().cpu().numpy()
        sampler = [i for i in databunch.dl(ds_type).sampler]
        reverse_sampler = np.argsort(sampler)
        return preds[reverse_sampler, :]
    

    #val_pred=get_preds_as_nparray(DatasetType.Valid)
    val_pred=get_preds_as_nparray(DatasetType.Test)
    val_pred=val_pred[:,0]
    val_pred = (val_pred >= 0.5)
    
    val_main=test['is_toxic']
    val_main=(val_main)
    
    precision = precision_score(val_main, val_pred, pos_label=1)
    recall = recall_score(val_main, val_pred, pos_label=1)
    f1score = f1_score(val_main, val_pred, pos_label=1)
    accuracy = accuracy_score(val_main, val_pred)

    print(classification_report(val_main, val_pred))


    run_accuracy.append(accuracy)
    run_f1score.append(f1score)
    run_precision.append(precision)
    run_recall.append(recall)
    
    fold_no=fold_no+1

Starting Fold:  1
15655
1958
1958


epoch,train_loss,valid_loss,time


LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.


epoch,train_loss,valid_loss,time
0,0.149215,0.237552,1:30:23


              precision    recall  f1-score   support

           0       0.97      0.97      0.97      1591
           1       0.87      0.86      0.87       367

    accuracy                           0.95      1958
   macro avg       0.92      0.92      0.92      1958
weighted avg       0.95      0.95      0.95      1958

Starting Fold:  2
15656
1957
1958


epoch,train_loss,valid_loss,time


LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.


epoch,train_loss,valid_loss,time
0,0.126703,0.193595,1:25:06


              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1556
           1       0.93      0.84      0.88       402

    accuracy                           0.95      1958
   macro avg       0.94      0.91      0.93      1958
weighted avg       0.95      0.95      0.95      1958

Starting Fold:  3
15656
1957
1958


epoch,train_loss,valid_loss,time


LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.


epoch,train_loss,valid_loss,time
0,0.078476,0.232174,1:23:39


              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1567
           1       0.92      0.86      0.89       391

    accuracy                           0.96      1958
   macro avg       0.94      0.92      0.93      1958
weighted avg       0.96      0.96      0.96      1958

Starting Fold:  4
15656
1957
1958


epoch,train_loss,valid_loss,time


LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.


epoch,train_loss,valid_loss,time
0,0.115936,0.243936,1:23:46


              precision    recall  f1-score   support

           0       0.95      0.98      0.97      1613
           1       0.90      0.78      0.84       345

    accuracy                           0.95      1958
   macro avg       0.93      0.88      0.90      1958
weighted avg       0.94      0.95      0.94      1958

Starting Fold:  5
15656
1957
1958


epoch,train_loss,valid_loss,time


LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.


epoch,train_loss,valid_loss,time
0,0.113021,0.10438,1:23:03


              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1575
           1       0.92      0.82      0.86       383

    accuracy                           0.95      1958
   macro avg       0.94      0.90      0.92      1958
weighted avg       0.95      0.95      0.95      1958

Starting Fold:  6
15656
1957
1958


epoch,train_loss,valid_loss,time


LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.


epoch,train_loss,valid_loss,time
0,0.146808,0.071086,1:23:17


              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1551
           1       0.90      0.85      0.87       407

    accuracy                           0.95      1958
   macro avg       0.93      0.91      0.92      1958
weighted avg       0.95      0.95      0.95      1958

Starting Fold:  7
15656
1957
1958


epoch,train_loss,valid_loss,time


LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.


epoch,train_loss,valid_loss,time
0,0.105784,0.143862,1:22:17


              precision    recall  f1-score   support

           0       0.96      0.97      0.97      1583
           1       0.88      0.85      0.86       375

    accuracy                           0.95      1958
   macro avg       0.92      0.91      0.92      1958
weighted avg       0.95      0.95      0.95      1958

Starting Fold:  8
15656
1957
1958


epoch,train_loss,valid_loss,time


LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.


epoch,train_loss,valid_loss,time
0,0.1494,0.111635,1:22:48


              precision    recall  f1-score   support

           0       0.97      0.98      0.97      1589
           1       0.89      0.85      0.87       369

    accuracy                           0.95      1958
   macro avg       0.93      0.91      0.92      1958
weighted avg       0.95      0.95      0.95      1958

Starting Fold:  9
15656
1957
1958


epoch,train_loss,valid_loss,time


LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.


epoch,train_loss,valid_loss,time
0,0.140992,0.074247,1:27:01


              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1561
           1       0.94      0.82      0.87       397

    accuracy                           0.95      1958
   macro avg       0.95      0.90      0.92      1958
weighted avg       0.95      0.95      0.95      1958

Starting Fold:  10
15656
1957
1958


epoch,train_loss,valid_loss,time


LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.


epoch,train_loss,valid_loss,time
0,0.186239,0.043333,1:27:19


              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1540
           1       0.92      0.83      0.88       418

    accuracy                           0.95      1958
   macro avg       0.94      0.91      0.92      1958
weighted avg       0.95      0.95      0.95      1958



In [14]:
from statistics import mean
print("...........Score of BERT..................")
print("     Precision", "         Recall   ", "       F-score        ", "    Accuracy")
print(mean(run_precision),mean(run_recall),mean(run_f1score),mean(run_accuracy))

...........Score of BERT..................
     Precision          Recall           F-score             Accuracy
0.906454803241088 0.834609413409594 0.8686693400303963 0.950561797752809
