In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm
import os

os.environ["WANDB_DISABLED"] = "true"

pd.options.display.width = 180
pd.options.display.max_colwidth = 120

BERT_PATH = "."

data_dir = Path('./AI4Code')

In [2]:
NUM_TRAIN = 20000


def read_notebook(path):
    return (
        pd.read_json(
            path,
            dtype={'cell_type': 'category', 'source': 'str'})
        .assign(id=path.stem)
        .rename_axis('cell_id')
    )


paths_train = list((data_dir / 'train').glob('*.json'))[:NUM_TRAIN]
notebooks_train = [
    read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')
]
df = (
    pd.concat(notebooks_train)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
)
df.source = df.source.str.lower()

Train NBs: 100%|████████████████████████████████████████████████████████████████| 20000/20000 [01:19<00:00, 252.02it/s]


In [3]:
df_orders = pd.read_csv(
    data_dir / 'train_orders.csv',
    index_col='id',
    squeeze=True,
).str.split()



  df_orders = pd.read_csv(


In [4]:
def get_ranks(base, derived):
    return [base.index(d) for d in derived]

In [5]:
df_orders_ = df_orders.to_frame().join(
    df.reset_index('cell_id').groupby('id')['cell_id'].apply(list),
    how='right',
)

ranks = {}
for id_, cell_order, cell_id in df_orders_.itertuples():
    ranks[id_] = {'cell_id': cell_id, 'rank': get_ranks(cell_order, cell_id)}

df_ranks = (
    pd.DataFrame
    .from_dict(ranks, orient='index')
    .rename_axis('id')
    .apply(pd.Series.explode)
    .set_index('cell_id', append=True)
)

In [6]:
df_ancestors = pd.read_csv(data_dir / 'train_ancestors.csv', index_col='id')

In [7]:
df = df.reset_index().merge(df_ranks, on=["id", "cell_id"]).merge(df_ancestors, on=["id"])
df

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id
0,00001756c60be8,1862f0a6,code,# this python 3 environment comes with many helpful analytics libraries installed\n# it is defined by the kaggle/pyt...,0,945aea18,
1,00001756c60be8,2a9e43d6,code,"import numpy as np\nimport pandas as pd\nimport random\n\nfrom sklearn.model_selection import train_test_split, cros...",2,945aea18,
2,00001756c60be8,038b763d,code,import warnings\nwarnings.filterwarnings('ignore'),4,945aea18,
3,00001756c60be8,2eefe0ef,code,matplotlib.rcparams.update({'font.size': 14}),6,945aea18,
4,00001756c60be8,0beab1cd,code,"def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):\n print(""train r2:\...",8,945aea18,
...,...,...,...,...,...,...,...
916551,249632a403a0d4,c33633b1,markdown,### 4. top 10 publisher by:\n* na sales: nintendo.\n* eu sales: nintendo.\n* jp sales: nintendo.\n* other sales:...,47,bee66634,
916552,249632a403a0d4,a3443531,markdown,### top publishers releases over years,62,bee66634,
916553,249632a403a0d4,8e0d6d03,markdown,### 2. top 10 genre by:\n* global sales: action.\n* na sales: action.\n* eu sales: action.\n* jp sales: role-pla...,31,bee66634,
916554,249632a403a0d4,57f3400f,markdown,"### 3) data visualization\n\ntasks:\n\nmake a report of:\n\n* top 10 game by total reveneu worldwide and by eu, jp, ...",16,bee66634,


In [8]:
import regex as re
# import fasttext

re_sc_ch = re.compile(r'\P{L}+')
re_sg_ch = re.compile(r' +\p{L} +')
re_sg_ch_st = re.compile(r'^\p{L} +')
re_mul_sp =  re.compile(r' +')
re_pre_b = re.compile(r'^b +')

def preprocess_text(document):
        # Remove all the special characters
        document = re_sc_ch.sub(' ', str(document))

        # remove all single characters
        document = re_sg_ch.sub(' ', document)

        # Remove single characters from the start
        document = re_sg_ch_st.sub(' ', document)

        # Substituting multiple spaces with single space
        document = re_mul_sp.sub(' ', document)

        # Removing prefixed 'b'
        document =re_pre_b.sub('', document)

        return document

    
def preprocess_df(df):
    """
    This function is for processing sorce of notebook
    returns preprocessed dataframe
    """
    return [preprocess_text(message) for message in df.source]

df.source = df.source.apply(preprocess_text)

In [9]:
dict_cellid_source = dict(zip(df['cell_id'].values, df['source'].values))

In [10]:
from transformers import AutoTokenizer, AutoModel

#tokenizer = AutoTokenizer.from_pretrained('./Model/Pre-trained/tokenizer')
#model = AutoModelWithLMHead.from_pretrained('./Model/Pre-trained')

In [11]:
def generate_triplet(df, mode='train'):
    triplets = []
    ids = df.id.unique()
    random_drop = np.random.random(size=10000)>0.9
    count = 0

    for id, df_tmp in tqdm(df.groupby('id')):
        df_tmp_markdown = df_tmp[df_tmp['cell_type']=='markdown']

        df_tmp_code = df_tmp[df_tmp['cell_type']=='code']
        df_tmp_code_rank = df_tmp_code['rank'].values
        df_tmp_code_cell_id = df_tmp_code['cell_id'].values

        for cell_id, rank in df_tmp_markdown[['cell_id', 'rank']].values:
            labels = np.array([(r==(rank+1)) for r in df_tmp_code_rank]).astype('int')

            for cid, label in zip(df_tmp_code_cell_id, labels):
                count += 1
                if label==1:
                    triplets.append( [cell_id, cid, label] )
                    # triplets.append( [cid, cell_id, label] )
                elif mode == 'test':
                    triplets.append( [cell_id, cid, label] )
                    # triplets.append( [cid, cell_id, label] )
                elif random_drop[count%10000]:
                    triplets.append( [cell_id, cid, label] )
                    # triplets.append( [cid, cell_id, label] )

    return triplets

triplets = generate_triplet(df)

100%|███████████████████████████████████████████████████████████████████████████| 20000/20000 [00:29<00:00, 678.43it/s]


In [12]:
import torch
MAX_LEN = 128
    
class MarkdownModel(torch.nn.Module):
    def __init__(self):
        super(MarkdownModel, self).__init__()
        self.distill_bert = AutoModel.from_pretrained("./Model")
        self.dropout = torch.nn.Dropout(0.2)
        self.top = torch.nn.Linear(512, 1)
        
    def forward(self, ids, mask):
        x = self.distill_bert(ids, mask)[0]
        x = self.dropout(x)
        x = self.top(x[:, 0, :])
        x = torch.sigmoid(x) 
        return x

In [13]:
from torch.utils.data import DataLoader, Dataset



class MarkdownDataset(Dataset):
    def __init__(self, df, max_len, mode='train'):
        super().__init__()
        self.df = df
        self.max_len = max_len
        self.tokenizer = AutoTokenizer.from_pretrained("./Model/tokenizer")
        self.mode=mode
       
    
    def __getitem__(self, index):
        row = self.df[index]
        label = row[-1]
        txt = dict_cellid_source[row[0]] + '[SEP]' + dict_cellid_source[row[1]]
        inputs = self.tokenizer.encode_plus(
            txt,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
        ids = torch.LongTensor(inputs['input_ids'])
        mask = torch.LongTensor(inputs['attention_mask'])
        return ids, mask, torch.FloatTensor([label])
    
    def __len__(self):
        return len(self.df)


train_ds = MarkdownDataset(triplets, max_len=MAX_LEN)

In [14]:
def adjust_lr(optimizer, epoch):
    if epoch < 1:
        lr = 3e-5
    elif epoch < 3:
        lr = 1e-5
    elif epoch < 5:
        lr = 5e-6
    else:
        lr = 1e-7
    for p in optimizer.param_groups:
        p['lr'] = lr
    return lr
    
    
def get_optimizer(net):
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=3e-4, betas=(0.9, 0.999),
                                 eps=1e-8 ) #1e-08)
    return optimizer

BS = 128 
#NW = 8

train_loader = DataLoader(train_ds, batch_size=BS, shuffle=True, drop_last=True)

In [15]:
import sys, os

def read_data(data):
    return tuple(d.cuda() for d in data[:-1]), data[-1].cuda()

def validate(model, val_loader, mode='train'):
    model.eval()
    tbar = tqdm(val_loader, file=sys.stdout)
    preds = []
    labels = []
    with torch.no_grad():
        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)
            pred = model(inputs[0], inputs[1])
            preds.append(pred.detach().cpu().numpy().ravel())
            if mode=='train':
                labels.append(target.detach().cpu().numpy().ravel())
    if mode=='test':
        return np.concatenate(preds)
    else:
        return np.concatenate(labels), np.concatenate(preds)

def train(model, train_loader, epochs, Type='markdown'):
    np.random.seed(0)
    
    optimizer = get_optimizer(model)

    mixed_precision = True
    try:  
        from apex import amp
    except:
        mixed_precision = False  # not installed
        
    # model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=1)
    criterion = torch.nn.L1Loss()
    #criterion = torch.nn.BCELoss()
    
    for e in range(epochs):   
        model.train()
        tbar = tqdm(train_loader, position=0, leave=True)
        lr = adjust_lr(optimizer, e)
        loss_list = []
        preds = []
        labels = []
        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)
            optimizer.zero_grad()
            pred = model(inputs[0], inputs[1])
            loss = criterion(pred, target)
            loss.backward()
            optimizer.step()
            
            loss_list.append(loss.detach().cpu().item())
            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())
            
            avg_loss = np.round(np.mean(loss_list), 4)
            tbar.set_description(f"Epoch {e} Loss: {avg_loss} lr: {lr}")
        
        output_model_file = f"./my_model/my_own_model_{e}.bin"
        model_to_save = model.module if hasattr(model, 'module') else model
        torch.save(model_to_save.state_dict(), output_model_file)

    return model

model = MarkdownModel()
model = model.cuda()

Some weights of the model checkpoint at ./Model were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ./Model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN t

In [None]:
model = train(model, train_loader, epochs=15, Type='markdown')

Epoch 0 Loss: 0.1363 lr: 3e-05: 100%|██████████████████████████████████████████| 52194/52194 [2:11:00<00:00,  6.64it/s]
Epoch 1 Loss: 0.1362 lr: 1e-05: 100%|██████████████████████████████████████████| 52194/52194 [2:10:39<00:00,  6.66it/s]
Epoch 2 Loss: 0.1362 lr: 1e-05:  40%|████████████████▋                         | 20747/52194 [51:33<1:18:45,  6.65it/s]

In [16]:
import tensorflow as tf
import numpy as np

In [20]:
input_vocab_size = 128
embedding_dim = 16
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_vocab_size, embedding_dim)    
])

In [21]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, None, 16)          2048      
                                                                 
Total params: 2,048
Trainable params: 2,048
Non-trainable params: 0
_________________________________________________________________


In [22]:
inputs = np.array([5,1,2,3,4])
model.predict(inputs)



array([[ 0.03175068,  0.04518476,  0.03099047,  0.03902406, -0.0238451 ,
         0.01022248,  0.04638659,  0.00112636, -0.02454869, -0.02420332,
         0.01812441, -0.02569896, -0.00101122,  0.03786495,  0.03084195,
        -0.04268376],
       [ 0.01147168,  0.02028776,  0.01787443,  0.02878406,  0.03316477,
        -0.03730999, -0.02772543,  0.02157864, -0.04378268, -0.04534464,
         0.01430156, -0.04093121, -0.04419219,  0.01793952,  0.04641494,
        -0.02804509],
       [-0.00246056,  0.03388472,  0.01325028, -0.01709906,  0.02511751,
         0.02343695, -0.03716427, -0.0100871 , -0.04942725,  0.00847252,
         0.01380309, -0.01255905,  0.02336795,  0.01358635,  0.02113799,
        -0.03850905],
       [-0.01261082, -0.00207077, -0.00695227,  0.00687863,  0.00174991,
        -0.00712912, -0.03085891,  0.02559337,  0.00947325,  0.03509101,
         0.01380303, -0.02599156,  0.01021805,  0.03814136,  0.01850437,
         0.00975385],
       [ 0.00257504,  0.02435377, -0

In [9]:
input_vocab_size = 128
embedding_dim = 16
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_vocab_size, embedding_dim),
    tf.keras.layers.GRU(256, dropout= 0.2),
    tf.keras.layers.Dense(64, activation='ReLU'),
    tf.keras.layers.Dense(1)
    
])

In [10]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 16)          2048      
                                                                 
 gru_2 (GRU)                 (None, 256)               210432    
                                                                 
 dense_2 (Dense)             (None, 64)                16448     
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 228,993
Trainable params: 228,993
Non-trainable params: 0
_________________________________________________________________


In [157]:
for name, param in torch_model.named_parameters():
    print((param).shape)
    if param.requires_grad:
        print(name, " ", (param.data.shape))

torch.Size([192, 1024])
linear_relu_stack.0.weight_ih_l0   torch.Size([192, 1024])
torch.Size([192, 64])
linear_relu_stack.0.weight_hh_l0   torch.Size([192, 64])
torch.Size([192])
linear_relu_stack.0.bias_ih_l0   torch.Size([192])
torch.Size([192])
linear_relu_stack.0.bias_hh_l0   torch.Size([192])
torch.Size([64, 128])
linear_relu_stack.1.weight   torch.Size([64, 128])
torch.Size([64])
linear_relu_stack.1.bias   torch.Size([64])
torch.Size([64, 64])
linear_relu_stack.2.weight   torch.Size([64, 64])
torch.Size([64])
linear_relu_stack.2.bias   torch.Size([64])
torch.Size([64, 64])
linear_relu_stack.3.weight   torch.Size([64, 64])
torch.Size([64])
linear_relu_stack.3.bias   torch.Size([64])
torch.Size([1024, 64])
linear_relu_stack.4.weight   torch.Size([1024, 64])
torch.Size([1024])
linear_relu_stack.4.bias   torch.Size([1024])
torch.Size([11, 64])
linear_relu_stack.5.weight   torch.Size([11, 64])
torch.Size([11])
linear_relu_stack.5.bias   torch.Size([11])


In [162]:
gru_pytorch = nn.GRU(input_size = 1024, hidden_size =64,num_layers=1, dropout=0.3, bidirectional=False)
sum([p.numel() for p in gru_pytorch.parameters()])

209280

In [96]:
from torch import nn
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.GRU(input_size = 1024, hidden_size =64,num_layers=2, dropout=0.3, bidirectional=True),
            nn.ModuleList([
                nn.Linear(128,64),
                nn.Linear(64,64),
                nn.Linear(64,64)]
            ),
            nn.Linear(64,1),
            nn.Linear(64,11),
            nn.Dropout(0.3)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
torch_model = NeuralNetwork()
print(count_parameters(torch_model))
torch_model

510412


NeuralNetwork(
  (linear_relu_stack): Sequential(
    (0): GRU(1024, 64, num_layers=2, dropout=0.3, bidirectional=True)
    (1): ModuleList(
      (0): Linear(in_features=128, out_features=64, bias=True)
      (1): Linear(in_features=64, out_features=64, bias=True)
      (2): Linear(in_features=64, out_features=64, bias=True)
    )
    (2): Linear(in_features=64, out_features=1, bias=True)
    (3): Linear(in_features=64, out_features=11, bias=True)
    (4): Dropout(p=0.3, inplace=False)
  )
)

In [102]:
gru = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(None,1024)),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64)),
    tf.keras.layers.Dense(64),
    tf.keras.layers.Dense(64),
    tf.keras.layers.Dense(64),
    tf.keras.layers.Dense(1),
    tf.keras.layers.Dense(64, dynamic=True ),
    tf.keras.layers.Dense(11,input_dim=64,input_shape=(None,64)),
])
gru.summary()

Model: "sequential_51"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_112 (Bidirect  (None, None, 128)        418560    
 ional)                                                          
                                                                 
 bidirectional_113 (Bidirect  (None, 128)              74496     
 ional)                                                          
                                                                 
 dense_236 (Dense)           (None, 64)                8256      
                                                                 
 dense_237 (Dense)           (None, 64)                4160      
                                                                 
 dense_238 (Dense)           (None, 64)                4160      
                                                                 
 dense_239 (Dense)           (None, 1)               

In [43]:
import torch, numpy
def count_parameters(model):
    total_param = 0
    for name, param in model.named_parameters():
        if param.requires_grad:
            num_param = numpy.prod(param.size())
            #if param.dim() > 1:
            #    print(name, ':', 'x'.join(str(x) for x in list(param.size())), '=', num_param)
            #else:
            #    print(name, ':', num_param)
            total_param += num_param
    return total_param

model = torch.nn.Sequential(torch.nn.GRU(64, 32, bidirectional=True, num_layers=2, dropout=0.25, batch_first=True))
print(count_parameters(model))
print(model)

37632
Sequential(
  (0): GRU(64, 32, num_layers=2, batch_first=True, dropout=0.25, bidirectional=True)
)


In [31]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.InputLayer(input_shape=(None, 64)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32, return_sequences=True, dropout=0.25, time_major=False)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32, return_sequences=True, dropout=0.25, time_major=False)))
model.summary()

Model: "sequential_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_28 (Bidirecti  (None, None, 64)         18816     
 onal)                                                           
                                                                 
 bidirectional_29 (Bidirecti  (None, None, 64)         18816     
 onal)                                                           
                                                                 
Total params: 37,632
Trainable params: 37,632
Non-trainable params: 0
_________________________________________________________________


In [3]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained("Kaggle_model/GraphCodeBert")

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at Kaggle_model/GraphCodeBert and are newly initialized: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
model

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [6]:
import torch, numpy
def count_parameters(model):
    total_param = 0
    for name, param in model.named_parameters():
        if param.requires_grad:
            num_param = numpy.prod(param.size())
            #if param.dim() > 1:
            #    print(name, ':', 'x'.join(str(x) for x in list(param.size())), '=', num_param)
            #else:
            #    print(name, ':', num_param)
            total_param += num_param
    return total_param
count_parameters(model)

124697433