In [None]:
# Install the Python dependencies; pip's standard output is discarded to keep the cell quiet.
!pip install folium==0.2.1 torch neptune-client==0.15.2 tqdm > /dev/null 
!pip install transformers==2.1.1 pytorch-lightning==1.5.10  > /dev/null
# git-lfs is required for the `git lfs pull` of the CodeBERT repository below.
!apt install git git-lfs > /dev/null
# Fetch the project files and the pre-trained CodeBERT checkpoint (cloned into ./codebert).
!git clone https://github.com/karlfroldan/prototype.git
!git clone https://huggingface.co/microsoft/codebert-base codebert
# Move the project files into the working directory and drop the now-empty clone.
# NOTE(review): presumably this is where `prototype_dataloader` and `prototype.csv` come from — confirm.
!mv prototype/* . 
!rm -rf prototype 
# Download the LFS-tracked files inside the CodeBERT clone, then return to the working directory.
%cd codebert 
!git lfs install
!git lfs pull 
%cd ..

In [None]:
import pandas as pd

import random
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import DataLoader, SubsetRandomSampler

import numpy as np
import sklearn
from sklearn.model_selection import KFold
from sklearn.preprocessing import MultiLabelBinarizer
from pytorch_lightning.loggers.neptune import NeptuneLogger

import neptune.new as neptune

import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor

import transformers
from transformers import RobertaTokenizer, RobertaModel

from torchvision.ops import sigmoid_focal_loss

from prototype_dataloader import get_datasets

from sklearn.metrics import f1_score, hamming_loss
import warnings

from tqdm.notebook import tqdm

# Ask CUDA to launch kernels synchronously so a failure is reported at the call that caused it.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'using device: {device}')

def seed_everything(seed=42):
    """
    Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy, PyTorch (CPU and all CUDA
    devices) and PyTorch Lightning, exports ``PYTHONHASHSEED`` and configures
    cuDNN for deterministic behaviour.

    Args:
        seed (int): Seed value applied to every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN auto-tuning may pick different kernels from run to run; keep it
    # switched off so the deterministic flag above is not undermined.
    torch.backends.cudnn.benchmark = False
    pl.seed_everything(seed)

# Set the RNG: seed every generator once, before the datasets and the model are created below.
seed_everything(1729)

### Data Pre-processing

In [None]:
# Names of the target classes handed to the dataset builder; the model gets
# one output per entry (``len(labels)``).
labels = [
    "quicksort",
    "mergesort",
    "selectionsort",
    "insertionsort",
    "bubblesort",
    "linearsearch",
    "binarysearch",
    "linkedlist",
    "hashmap",
]
#['selectionsort', 'bubblesort', 'binarysearch']

# Tokenizer and encoder are both loaded from the local CodeBERT checkout.
codebert_dir = "./codebert"
tokenizer = RobertaTokenizer.from_pretrained(codebert_dir)
model = RobertaModel.from_pretrained(codebert_dir)

data_csv = pd.read_csv("prototype.csv")
train_set, test_set = get_datasets(
    data_csv,
    tokenizer,
    split=0.1,
    data_folder='./data/prototype',
    labels=labels,
)
len(train_set), len(test_set)

### Model

In [None]:
def init_weights(m):
    """
    Initialise an ``nn.Linear`` layer in place; leave any other module untouched.

    The weight matrix receives Xavier-uniform values and the bias, when the
    layer has one, is filled with the constant 0.01.

    Args:
        m (nn.Module): Module to initialise.
    """
    if not isinstance(m, nn.Linear):
        return
    torch.nn.init.xavier_uniform_(m.weight)
    # Layers built with ``bias=False`` have ``m.bias is None``; skip them
    # instead of failing with an AttributeError.
    if m.bias is not None:
        m.bias.data.fill_(0.01)

class OurModel(pl.LightningModule):
    """
    CodeBERT encoder followed by a feed-forward multi-label classification head.

    The transformer's embeddings and its first nine encoder layers are frozen;
    the remaining encoder layers and the head are trained. ``forward`` returns
    raw logits: the loss is expected to work on logits, and a sigmoid is
    applied explicitly wherever probabilities are needed.

    Args:
        codebert: Pre-trained transformer exposing ``embeddings`` and
            ``encoder.layer``.
        loss: Callable invoked as ``loss(logits, targets)``.
        input (int): Number of features of the flattened transformer output
            fed to the head. The default equals 768 * 512.
        hidden: Optional iterable with the widths of extra hidden layers
            placed after the first 420-unit layer.
        labels (int): Number of output labels.
        train_rate (float): Learning rate passed to Adam.
        device: Device the transformer and the head are moved to on
            construction.
    """

    def __init__(self, codebert, loss, input=393_216, hidden=None, labels=9, train_rate=1e-3, device='cuda'):
        super().__init__()

        self.transformer = codebert.to(device)

        # RoBERTa has 12 encoding layers. For this study, let's freeze the
        # embeddings and the first 9 layers and retrain the last 3.
        frozen_modules = [self.transformer.embeddings, *self.transformer.encoder.layer[:9]]
        for module in frozen_modules:
            for param in module.parameters():
                param.requires_grad = False

        self.loss = loss
        self.train_rate = train_rate
        self.hidden_is_none = hidden is None

        # First block of the head. `input` sizes the first Linear layer; its
        # default (393_216) is the 768 * 512 that used to be hard-coded here.
        last = 420
        layers = [nn.Dropout(p=0.1), nn.Linear(input, last), nn.BatchNorm1d(last), nn.ReLU()]

        if hidden is not None:
            for width in hidden:
                layers.append(nn.Dropout(p=0.1))
                layers.append(nn.Linear(last, width))
                # The batch norm must match the width produced by the Linear
                # layer right before it (a fixed 420 breaks any other width).
                layers.append(nn.BatchNorm1d(width))
                layers.append(nn.ReLU())
                last = width

        # Output layer: one logit per label, no activation.
        layers.append(nn.Linear(last, labels))

        # init_weights only touches nn.Linear modules.
        for layer in layers:
            init_weights(layer)

        self.ann = nn.Sequential(*layers).to(device)

    def get_preds(self, y):
        """Threshold a tensor of probabilities at 0.5 and return 0/1 integers."""
        return (y >= 0.5).long()

    def get_preds_numpy(self, y):
        """Threshold a NumPy array of probabilities at 0.5 and return 0/1 integers."""
        return (y >= 0.5).astype(int)

    def forward(self, x):
        """
        Compute one logit per label for a batch of token ids.

        Args:
            x: Batch of input ids passed straight to the transformer.

        Returns:
            Tensor of raw logits produced by the classification head.
        """
        # Only the first transformer output is used; any further outputs are
        # ignored. NOTE(review): presumably this is the per-token hidden
        # state of shape (batch, 512, 768) — confirm against the tokenizer.
        # NOTE(review): no attention mask is passed, so padding positions are
        # treated like real tokens — confirm this is intended.
        out = self.transformer(x)[0]

        # Flatten the transformer's output so we can plug it into the
        # simple feedforward neural network.
        out = torch.flatten(out, 1)

        # No sigmoid here: it is applied by the loss / by the caller.
        return self.ann(out)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with a small weight decay."""
        optimizer = torch.optim.Adam(self.parameters(), weight_decay=1e-6, lr=self.train_rate)
        return optimizer

    def training_step(self, train_batch, batch_idx):
        """Run one training batch, log the loss as 'train loss' and return it."""
        X, y = train_batch
        y_hat = self(X['input_ids'])

        loss = self.loss(y_hat, y)
        self.log('train loss', loss)
        return loss

    def validation_step(self, val_batch, batch_idx):
        """Log the loss, Hamming loss and micro/macro F1 for one validation batch."""
        X, y = val_batch
        logits = self(X['input_ids'])
        loss = self.loss(logits, y)
        self.log('validation loss', loss)

        probs = torch.sigmoid(logits)

        # Keep both arrays 2-D (rows = samples, columns = labels) before they
        # go to scikit-learn. A bare squeeze() would collapse a batch holding
        # a single sample to 1-D, which the metrics would then score as one
        # binary problem instead of a multi-label one.
        n_labels = logits.shape[-1]
        y_true = y.reshape(-1, n_labels).detach().cpu().numpy()
        y_pred = self.get_preds(probs).reshape(-1, n_labels).detach().cpu().numpy()

        hamming = hamming_loss(y_true, y_pred)
        f1_micro = f1_score(y_true, y_pred, average='micro', zero_division=1)
        f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=1)
        self.log('hamming loss', hamming)
        self.log('Micro F1', f1_micro)
        self.log('Macro F1', f1_macro)
        

### The Loss Function

The criterion this model can use is the **Focal Loss**, which is defined as an extension of the **cross-entropy loss**.

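We know that the (binary) cross-entropy loss for a single label is defined as

$$
\mathrm{CE}(p_t) = -\log(p_t), \qquad
p_t = \begin{cases} p & \text{if } y = 1 \\ 1 - p & \text{otherwise,} \end{cases}
$$

where $p$ is the predicted probability of the label and $y \in \{0, 1\}$ is its true value. The focal loss multiplies this term by a class weight $\alpha_t$ and a modulating factor that down-weights well-classified examples:

$$
\mathrm{FL}(p_t) = -\alpha_t \, (1 - p_t)^{\gamma} \, \log(p_t)
$$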

In [None]:
class FocalLoss(nn.Module):
    """
    Multi-label focal loss computed on raw logits.

    For every (sample, label) entry the loss is
    ``alpha_t * (1 - p_t) ** gamma * bce`` where ``bce`` is the element-wise
    binary cross-entropy with logits and ``p_t = exp(-bce)`` is the
    probability assigned to the true value. ``alpha_t`` is ``alpha`` where the
    target is 0 and ``1 - alpha`` where it is 1. The entries are averaged.

    Args:
        gamma: Exponent of the modulating factor ``(1 - p_t)``.
        alpha (float): Weight of negative targets; positives get ``1 - alpha``.
        device: Device the weight pair is placed on at construction. The
            weights are moved to the inputs' device again inside ``forward``.
        labels (int): Number of labels; stored on the instance.
    """

    def __init__(self, gamma=4, alpha=0.1, device='cuda', labels=9):
        super(FocalLoss, self).__init__()
        # Index 0 -> weight for target 0, index 1 -> weight for target 1.
        alpha_pair = torch.tensor([alpha, 1 - alpha])
        if device is not None:
            # Honour any device spec ('cuda', 'cuda:0', torch.device, 'cpu'),
            # not only the exact string 'cuda'.
            alpha_pair = alpha_pair.to(device)
        self.alpha = alpha_pair
        self.gamma = gamma
        self.labels = labels

        self.bce = nn.BCEWithLogitsLoss(reduction='none')

    def forward(self, y_hat, y):
        """
        Compute the mean focal loss.

        Args:
            y_hat: Raw logits.
            y: Targets made of 0/1 values with the same shape as ``y_hat``
                (integer or floating point).

        Returns:
            Scalar tensor cast to float16.
        """
        logits = y_hat.type(torch.float32)
        targets = y.type(torch.float32)

        bce = self.bce(logits, targets)

        # Look up the class weight of every entry. The targets are cast to
        # long so floating-point 0/1 targets work too, and the result keeps
        # the targets' own shape instead of relying on `self.labels`.
        alpha = self.alpha.to(bce.device)
        alpha_t = alpha[y.long()]

        # bce >= 0, hence p_t <= 1 and the modulating factor stays in [0, 1].
        # (Adding an epsilon inside the exponent could push p_t above 1 and
        # make the factor negative, or NaN for a fractional gamma.)
        p_t = torch.exp(-bce)

        focal = alpha_t * (1 - p_t) ** self.gamma * bce

        # NOTE(review): the float16 cast is kept for compatibility with
        # existing callers; very small losses lose precision in this dtype.
        return focal.mean().type(torch.float16)

class WeightedCrossEntropyLoss(nn.Module):
    """
    Thin wrapper around ``nn.BCEWithLogitsLoss`` with optional weighting.

    Inputs are cast to float32 before the loss is evaluated and the reduced
    result is returned as float16.

    Args:
        weight: Forwarded to ``nn.BCEWithLogitsLoss(weight=...)``.
        pos_weight: Forwarded to ``nn.BCEWithLogitsLoss(pos_weight=...)``.
    """

    def __init__(self, weight=None, pos_weight=None):
        super().__init__()
        self.bce = nn.BCEWithLogitsLoss(weight=weight, pos_weight=pos_weight)

    def forward(self, y_hat, y):
        """Return the weighted BCE-with-logits loss of ``y_hat`` against ``y``."""
        logits, targets = y_hat.float(), y.float()
        return self.bce(logits, targets).half()

### Training

In [None]:
# Rescaling weights handed to the weighted cross-entropy loss, one value per label.
# NOTE(review): presumably listed in the same order as `labels` — confirm.
weights = torch.tensor([
    0.9875727720555306,
    0.9703313927451859,
    0.7831392745185849,
    1.0,
    0.9658531124048365,
    0.7736229287953426,
    0.5107478728168383,
    0.38434841021047916,
    0.36990595611285265,
])

wce_loss = WeightedCrossEntropyLoss(weight=weights)
#focal_loss = FocalLoss(gamma=5, alpha=0.25, device='cuda', labels=len(labels))

In [None]:
# Training batches are drawn in shuffled order; the evaluation order is fixed.
train_dataloader = DataLoader(
    train_set,
    batch_size=32,
    shuffle=True,
    num_workers=2,
)
test_dataloader = DataLoader(
    test_set,
    batch_size=64,
    shuffle=False,
    num_workers=2,
)

In [None]:
# The Neptune API token is a credential and must not be stored in the
# notebook. Read it from the environment instead; any token that was
# previously committed here should be revoked and regenerated.
neptune_api_token = os.environ.get("NEPTUNE_API_TOKEN")
if not neptune_api_token:
    raise RuntimeError(
        "Set the NEPTUNE_API_TOKEN environment variable to your Neptune API token before running this cell."
    )

neptune_logger = NeptuneLogger(
    api_key=neptune_api_token,
    project="pancit-canton/augmented-runs" # "<WORKSPACE/PROJECT>"
)

# Train the weighted-cross-entropy variant of the model with 16-bit precision
# on one GPU; the test loader doubles as the validation loader.
m = OurModel(model, loss=wce_loss, train_rate=0.0001, hidden=None, device='cuda', labels=len(labels))
trainer = pl.Trainer(gpus=1, precision=16, max_epochs=6, log_every_n_steps=3, logger=neptune_logger, enable_checkpointing=False)
trainer.fit(m, train_dataloader, test_dataloader)

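The Hamming loss over $|N|$ samples and $|L|$ labels, where $\widehat{y}_{i,j}$ is the predicted and $y_{i,j}$ the true value of label $j$ for sample $i$ and $\oplus$ denotes exclusive or, is defined as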
$$
\frac{1}{|N|\cdot|L|}\sum_{i=1}^{|N|}\sum_{j=1}^{|L|}\left(\widehat{y}_{i,j}\oplus y_{i,j}\right)
$$

In [None]:
# Per-batch outputs are collected here and stacked after the loop.
prediction, real = [], []
subset_acc = 0


def get_preds(ys):
    """Threshold probabilities at 0.5 and return 0/1 integer predictions."""
    return (ys >= 0.5).long()


def subset_accuracy(y_hat, y):
    """Return 1.0 when every entry of ``y`` equals ``y_hat``, else 0.0."""
    return torch.all((y == y_hat)).float()


def subset_pred(y_hat, y):
    """Return 1.0 when the thresholded ``y_hat`` equals ``y`` everywhere, else 0.0."""
    return (torch.all(get_preds(y_hat) == y)).float()


#device = 'cpu'
# Switch to evaluation mode and run the test set without tracking gradients.
m.eval()

with torch.no_grad():
    m = m.to(device)
    for X, y in tqdm(test_dataloader):
        X = X['input_ids'].to(device)
        # The model returns logits; the sigmoid turns them into probabilities.
        y_hat = torch.sigmoid(m(X))
        prediction.append(y_hat.detach().cpu().numpy())
        real.append(y.detach().cpu().numpy())


# One row per test sample after stacking the per-batch arrays.
prediction = np.vstack(prediction)
real = np.vstack(real)

import pickle as pkl

# Persist both arrays so they can be analysed without re-running the model.
for fname, array in (('prediction_wce.pkl', prediction), ('real_wce.pkl', real)):
    with open(fname, 'wb') as f:
        pkl.dump(array, f)
        print(f'Dumped {fname}')

In [None]:
# Display the stacked sigmoid outputs collected from the test loader.
prediction

In [None]:
# Display the stacked ground-truth label rows collected from the test loader.
real

In [None]:
# Fraction of all (sample, label) entries that are positive and predicted as positive.
((real == 1) & (prediction >= 0.5)).mean()

In [None]:
# Number of positive entries, number of 0/1 entries, and the resulting positive rate.
n_positive = np.sum(real == 1)
n_entries = n_positive + np.sum(real == 0)
n_positive, n_entries, n_positive / n_entries

In [None]:
# Count entries by ground truth and by the 0.5-thresholded prediction.
is_positive = real == 1
is_predicted_positive = prediction >= 0.5

all_positives = np.sum(is_positive)
true_positives = np.sum(is_positive & is_predicted_positive)
false_positives = np.sum((real == 0) & is_predicted_positive)
print(f"All Positives: {all_positives} | True Positives: {true_positives} | False Positives: {false_positives}")

In [None]:
# A sample counts as correct only when every one of its labels is predicted correctly.
subset_all = ((real == 1) == (prediction >= 0.5)).all(axis=1)
subset_acc = subset_all.sum() / len(subset_all)
subset_acc