In [None]:
!pip install folium==0.2.1 torch neptune-client==0.15.2 tqdm > /dev/null 
!pip install transformers==2.1.1 pytorch-lightning==1.5.10  > /dev/null
!apt install git git-lfs xz > /dev/null
!git clone https://github.com/gerdiedoo/alg-dataset.git
!mv prototype/* .
!tar xf train.tar.xz
!tar xf test_noise.xz 
!mkdir data
!mv new_test data/test 
!mv train data/train
!git clone https://huggingface.co/microsoft/codebert-base codebert
%cd codebert 
!git lfs install
!git lfs pull 
%cd ..

In [None]:
import pandas as pd

import random
import os

import torch
import torch.nn as nn

from torch.utils.data import DataLoader

import numpy as np

import pytorch_lightning as pl
from transformers import RobertaTokenizer, RobertaModel
from prototype_dataloader import get_datasets_2
from sklearn.metrics import f1_score, hamming_loss


# CUDA_LAUNCH_BLOCKING=1 requests synchronous CUDA kernel launches, which makes
# device-side errors surface at the call that caused them.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'using device: {device}')

def seed_everything(seed=42):
    """
    Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED``, configures cuDNN for deterministic
    execution and finally delegates to ``pl.seed_everything``.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Python reads PYTHONHASHSEED at interpreter start-up, so this only
    # influences interpreters launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning may pick different convolution algorithms from run to run;
    # disable it so the deterministic flag above is actually effective.
    torch.backends.cudnn.benchmark = False
    pl.seed_everything(seed)

# Fix all random number generators up front so repeated runs start from the same state.
seed_everything(seed=1729)

### Data Pre-processing

In [None]:
# Algorithm classes to predict; the classifier head gets one output per entry.
labels = [
    "quicksort",
    "mergesort",
    "selectionsort",
    "insertionsort",
    "bubblesort",
    "linearsearch",
    "binarysearch",
    "linkedlist",
    "hashmap",
]  # shorter list kept from the original cell (unused): ['selectionsort', 'bubblesort', 'binarysearch']

# Tokenizer and encoder are both read from the local `./codebert` checkout.
CODEBERT_DIR = "./codebert"
tokenizer = RobertaTokenizer.from_pretrained(CODEBERT_DIR)
model = RobertaModel.from_pretrained(CODEBERT_DIR)

# CSV indexes for the training split and the noisy test split.
train_csv = pd.read_csv("train.csv")
test_csv = pd.read_csv('test_noise.csv')

train_set, test_set = get_datasets_2(train_csv, test_csv, tokenizer, labels=labels)

# Show the size of each split as the cell output.
(len(train_set), len(test_set))

### Model

In [None]:
def init_weights(m):
    """
    Initialise an ``nn.Linear`` layer in place; leave any other module untouched.

    The weight matrix is drawn from a Xavier (Glorot) uniform distribution and
    the bias, when the layer has one, is filled with the constant 0.01.

    Args:
        m: Any ``nn.Module``. Only ``nn.Linear`` instances are modified.
    """
    if not isinstance(m, nn.Linear):
        return
    nn.init.xavier_uniform_(m.weight)
    # A layer built with ``bias=False`` has ``bias is None``; skip it instead
    # of raising an AttributeError.
    if m.bias is not None:
        nn.init.constant_(m.bias, 0.01)

class OurModel(pl.LightningModule):
    """
    Pretrained transformer encoder followed by a feed-forward multi-label head.

    The encoder's embeddings and its first nine encoder layers are frozen; the
    remaining encoder layers and the classification head are trained. The head
    consumes the flattened encoder output and emits one logit per label.

    Args:
        codebert: Pretrained encoder exposing ``embeddings`` and
            ``encoder.layer`` (e.g. a ``RobertaModel``).
        loss: Callable ``loss(logits, targets)`` returning a scalar tensor.
        input: Number of features of the flattened encoder output fed to the
            first linear layer. The default, 393_216, equals ``768 * 512``
            (presumably hidden size x sequence length - confirm against the
            dataloader).
        hidden: Optional iterable of extra hidden-layer widths appended after
            the first 420-unit layer. ``None`` adds no extra layers.
        labels: Number of output labels (logits).
        train_rate: Learning rate handed to Adam.
        device: Device the encoder and the head are moved to at construction.
    """

    def __init__(self, codebert, loss, input=393_216, hidden=None, labels=9, train_rate=1e-3, device='cuda'):
        super().__init__()

        self.transformer = codebert.to(device)

        # RoBERTa has 12 encoding layers. For this study, freeze the embeddings
        # and the first 9 layers, and retrain the last 3.
        frozen_modules = [self.transformer.embeddings, *self.transformer.encoder.layer[:9]]
        for module in frozen_modules:
            for param in module.parameters():
                param.requires_grad = False

        self.loss = loss
        self.train_rate = train_rate

        # First block of the head. `input` replaces the previously hard-coded
        # 768 * 512 (identical to the default value).
        layers = [nn.Dropout(p=0.1), nn.Linear(input, 420), nn.BatchNorm1d(420), nn.ReLU()]

        self.hidden_is_none = hidden is None
        last = 420
        if hidden is not None:
            # One Dropout -> Linear -> BatchNorm -> ReLU block per requested width.
            for width in hidden:
                layers.append(nn.Dropout(p=0.1))
                layers.append(nn.Linear(last, width))
                layers.append(nn.BatchNorm1d(width))
                layers.append(nn.ReLU())
                last = width
        # Output layer: raw logits, one per label (no activation here).
        layers.append(nn.Linear(last, labels))
        for layer in layers:
            init_weights(layer)

        self.ann = nn.Sequential(*layers).to(device)

    def get_preds(self, y):
        """Threshold a tensor of probabilities at 0.5 and return 0/1 integers."""
        return (y >= 0.5).long()

    def get_preds_numpy(self, y):
        """Threshold a NumPy array of probabilities at 0.5 and return 0/1 integers."""
        return (y >= 0.5).astype(int)

    def forward(self, x):
        """
        Run the encoder and the classification head.

        Args:
            x: Batch of token ids passed straight to the encoder.

        Returns:
            Tensor of raw logits with one column per label.
        """
        # The encoder returns a sequence whose first element is the per-token
        # hidden states; the remaining elements (e.g. the pooled output) are
        # not used here.
        out = self.transformer(x)[0]

        # Flatten everything but the batch dimension so the result can be fed
        # to the feed-forward head.
        out = torch.flatten(out, 1)
        return self.ann(out)

    def configure_optimizers(self):
        """Return the Adam optimizer (lr=``train_rate``, weight_decay=1e-6)."""
        optimizer = torch.optim.Adam(self.parameters(), weight_decay=1e-6, lr=self.train_rate)
        return optimizer

    def training_step(self, train_batch, batch_idx):
        """Compute and log the training loss for one batch; returns the loss."""
        X, y = train_batch
        X = X['input_ids']
        y_hat = self(X)
        loss = self.loss(y_hat, y)
        self.log('train loss', loss)
        return loss

    def validation_step(self, val_batch, batch_idx):
        """
        Log the validation loss, Hamming loss and micro/macro F1 for one batch.

        NOTE(review): the metrics are computed per batch; how they are combined
        across batches depends on Lightning's logging defaults.
        """
        X, y = val_batch
        X = X['input_ids']
        y_hat = self(X)
        loss = self.loss(y_hat, y)
        y_hat = torch.sigmoid(y_hat)

        self.log('validation loss', loss)

        # Move targets and thresholded predictions to the CPU as
        # (batch, n_labels) arrays. An explicit reshape is used instead of
        # squeeze() so a single-sample batch keeps its 2-D multi-label layout.
        n_labels = y_hat.shape[-1]
        y_cpu = y.cpu().detach().numpy().reshape(-1, n_labels)
        y_hat_sigmoid_cpu = self.get_preds(y_hat).cpu().detach().numpy().reshape(-1, n_labels)

        hamming = hamming_loss(y_cpu, y_hat_sigmoid_cpu)

        f1_micro = f1_score(y_cpu, y_hat_sigmoid_cpu, average='micro', zero_division=1)
        f1_macro = f1_score(y_cpu, y_hat_sigmoid_cpu, average='macro', zero_division=1)
        self.log('hamming loss', hamming)
        self.log('Micro F1', f1_micro)
        self.log('Macro F1', f1_macro)

### The Loss Function

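The criterion is described here as the **Focal Loss**, which is defined as an extension of the **cross-entropy loss**. Note, however, that the class implemented in the next cell is a class-weighted binary cross-entropy on logits (`nn.BCEWithLogitsLoss`), not a focal loss.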

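We know that the (binary) cross-entropy loss over $N$ samples and $C$ labels, with logits $z_{n,c}$, targets $y_{n,c} \in \{0, 1\}$ and per-label weights $w_c$, is defined as

$$
\mathcal{L}_{\mathrm{CE}} = -\frac{1}{NC} \sum_{n=1}^{N} \sum_{c=1}^{C} w_c \left[ y_{n,c} \log \sigma(z_{n,c}) + (1 - y_{n,c}) \log\bigl(1 - \sigma(z_{n,c})\bigr) \right],
$$

where $\sigma$ is the sigmoid function.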

In [None]:
class WeightedCrossEntropyLoss(nn.Module):
    """
    Binary cross-entropy on logits with optional per-class weighting.

    Thin wrapper around ``nn.BCEWithLogitsLoss`` that casts both the logits
    and the targets to ``float32`` before computing the loss.

    Args:
        weight: Optional rescaling weight forwarded to ``nn.BCEWithLogitsLoss``.
        pos_weight: Optional weight of positive examples, forwarded likewise.
    """

    def __init__(self, weight=None, pos_weight=None):
        super().__init__()
        self.bce = nn.BCEWithLogitsLoss(weight=weight, pos_weight=pos_weight)

    def forward(self, y_hat, y):
        """
        Compute the mean weighted BCE between logits ``y_hat`` and targets ``y``.

        Returns:
            Scalar ``float32`` tensor. The loss is deliberately NOT cast to
            ``float16``: half precision overflows to ``inf`` above 65504 and
            loses precision for small values, which is harmful once the loss
            is multiplied by a gradient-scaling factor in mixed-precision
            training.
        """
        return self.bce(y_hat.float(), y.float())

### Training

In [None]:
# Per-label rescaling weights for the loss, one value per entry of `labels`
# (presumably in the same order - confirm against how they were derived).
weights = torch.tensor(
    [
        0.7172649927370823,
        0.8717576260634986,
        0.5655737704918032,
        1.0,
        0.9660717991284499,
        0.6380991907034654,
        0.5633948952064743,
        0.2723594106661133,
        0.21311475409836064,
    ]
)

# Loss criterion used for training and validation.
wce_loss = WeightedCrossEntropyLoss(weight=weights)

In [None]:
# Training batches are shuffled, and the last incomplete batch is dropped.
train_dataloader = DataLoader(
    train_set,
    batch_size=32,
    shuffle=True,
    drop_last=True,
    num_workers=2,
)
# Evaluation batches keep the dataset order and include the final partial batch.
test_dataloader = DataLoader(
    test_set,
    batch_size=64,
    shuffle=False,
    num_workers=2,
)

In [None]:
# Wrap the pretrained encoder in the classifier; no extra hidden layers are requested.
m = OurModel(
    model,
    loss=wce_loss,
    train_rate=1e-4,
    hidden=None,
    device='cuda',
    labels=len(labels),
)

# Single-GPU, 16-bit precision run for 5 epochs without checkpoint files.
trainer = pl.Trainer(
    gpus=1,
    precision=16,
    max_epochs=5,
    log_every_n_steps=3,
    enable_checkpointing=False,
    # limit_val_batches=0,
)

# The test dataloader is passed as the validation dataloader.
trainer.fit(m, train_dataloader, test_dataloader)

In [None]:
# Persist only the parameters (state dict) of the trained model.
torch.save(obj=m.state_dict(), f="augmented_only.pth")