In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm
import os

os.environ["WANDB_DISABLED"] = "true"

pd.options.display.width = 180
pd.options.display.max_colwidth = 120

BERT_PATH = "."

data_dir = Path('./AI4Code')

In [99]:
NUM_TRAIN = 1


def read_notebook(path):
    return (
        pd.read_json(
            path,
            dtype={'cell_type': 'category', 'source': 'str'})
        .assign(id=path.stem)
        .rename_axis('cell_id')
    )


paths_train = list((data_dir / 'train').glob('*.json'))[10:20]
notebooks_train = [
    read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')
]
df = (
    pd.concat(notebooks_train)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
)
df.source = df.source.str.lower()

Train NBs: 100%|██████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 232.40it/s]


In [100]:
df_orders = pd.read_csv(
    data_dir / 'train_orders.csv',
    index_col='id',
    squeeze=True,
).str.split()



  df_orders = pd.read_csv(


In [101]:
def get_ranks(base, derived):
    return [base.index(d) for d in derived]

In [102]:
df_orders_ = df_orders.to_frame().join(
    df.reset_index('cell_id').groupby('id')['cell_id'].apply(list),
    how='right',
)

ranks = {}
for id_, cell_order, cell_id in df_orders_.itertuples():
    ranks[id_] = {'cell_id': cell_id, 'rank': get_ranks(cell_order, cell_id)}

df_ranks = (
    pd.DataFrame
    .from_dict(ranks, orient='index')
    .rename_axis('id')
    .apply(pd.Series.explode)
    .set_index('cell_id', append=True)
)

In [103]:
df_ancestors = pd.read_csv(data_dir / 'train_ancestors.csv', index_col='id')

In [104]:
df = df.reset_index().merge(df_ranks, on=["id", "cell_id"]).merge(df_ancestors, on=["id"])

In [105]:
import regex as re
# import fasttext

re_sc_ch = re.compile(r'\P{L}+')
re_sg_ch = re.compile(r' +\p{L} +')
re_sg_ch_st = re.compile(r'^\p{L} +')
re_mul_sp =  re.compile(r' +')
re_pre_b = re.compile(r'^b +')

def preprocess_text(document):
        # Remove all the special characters
        document = re_sc_ch.sub(' ', str(document))

        # remove all single characters
        document = re_sg_ch.sub(' ', document)

        # Remove single characters from the start
        document = re_sg_ch_st.sub(' ', document)

        # Substituting multiple spaces with single space
        document = re_mul_sp.sub(' ', document)

        # Removing prefixed 'b'
        document =re_pre_b.sub('', document)

        return document

    
def preprocess_df(df):
    """
    This function is for processing sorce of notebook
    returns preprocessed dataframe
    """
    return [preprocess_text(message) for message in df.source]

df.source = df.source.apply(preprocess_text)

In [106]:
from transformers import AutoTokenizer, AutoModel

#tokenizer = AutoTokenizer.from_pretrained('./Model/Pre-trained/tokenizer')
#model = AutoModelWithLMHead.from_pretrained('./Model/Pre-trained')

In [107]:
def generate_triplet(df, mode='train'):
    triplets = []
    ids = df.id.unique()
    random_drop = np.random.random(size=10000)>0.9
    count = 0

    for id, df_tmp in tqdm(df.groupby('id')):
        df_tmp_markdown = df_tmp[df_tmp['cell_type']=='markdown']

        df_tmp_code = df_tmp[df_tmp['cell_type']=='code']
        df_tmp_code_rank = df_tmp_code['rank'].values
        df_tmp_code_cell_id = df_tmp_code['cell_id'].values

        for cell_id, rank in df_tmp_markdown[['cell_id', 'rank']].values:
            labels = np.array([(r==(rank+1)) for r in df_tmp_code_rank]).astype('int')

            for cid, label in zip(df_tmp_code_cell_id, labels):
                count += 1
                if label==1:
                    triplets.append( [cell_id, cid, label] )
                    # triplets.append( [cid, cell_id, label] )
                elif mode == 'test':
                    triplets.append( [cell_id, cid, label] )
                    # triplets.append( [cid, cell_id, label] )
                elif random_drop[count%10000]:
                    triplets.append( [cell_id, cid, label] )
                    # triplets.append( [cid, cell_id, label] )

    return triplets

#triplets = generate_triplet(df, mode='test')

In [108]:
len(triplets)

10389

In [109]:
import torch
MAX_LEN = 128
    
class MarkdownModel(torch.nn.Module):
    def __init__(self):
        super(MarkdownModel, self).__init__()
        self.distill_bert = AutoModel.from_pretrained("./Model")
        self.dropout = torch.nn.Dropout(0.2)
        self.top = torch.nn.Linear(512, 1)
        
    def forward(self, ids, mask):
        x = self.distill_bert(ids, mask)[0]
        x = self.dropout(x)
        x = self.top(x[:, 0, :])
        x = torch.sigmoid(x) 
        return x

In [110]:
dict_cellid_source = dict(zip(df['cell_id'].values, df['source'].values))

In [111]:
from torch.utils.data import DataLoader, Dataset



class MarkdownDataset(Dataset):
    def __init__(self, df, max_len, mode='train'):
        super().__init__()
        self.df = df
        self.max_len = max_len
        self.tokenizer = AutoTokenizer.from_pretrained("./Model/tokenizer")
        self.mode=mode
    
    
    def __getitem__(self, index):
        row = self.df[index]
        label = row[-1]
        txt = dict_cellid_source[row[0]] + '[SEP]' + dict_cellid_source[row[1]]
        inputs = self.tokenizer.encode_plus(
            txt,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
        ids = torch.LongTensor(inputs['input_ids'])
        mask = torch.LongTensor(inputs['attention_mask'])
        return ids, mask, torch.FloatTensor([label])
    
    def __len__(self):
        return len(self.df)


val_ds = MarkdownDataset(triplets, max_len=MAX_LEN, mode='test')

In [112]:
BS = 128 
#NW = 8

val_loader = DataLoader(val_ds, batch_size=BS*2, shuffle=True, drop_last=False)

In [113]:
import sys, os

def read_data(data):
    return tuple(d.cuda() for d in data[:-1]), data[-1].cuda()


def validate(model, val_loader, mode='train'):
    model.eval()
    tbar = tqdm(val_loader, file=sys.stdout) 
    preds = np.zeros(len(val_loader.dataset), dtype='float32')
    labels = []
    count = 0
    with torch.no_grad():
        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)
            pred = model(inputs[0], inputs[1]).detach().cpu().numpy().ravel()
            preds[count:count+len(pred)] = pred
            count += len(pred)
            if mode=='train':
                labels.append(target.detach().cpu().numpy().ravel())
    if mode=='test':
        return preds
    else:
        return np.concatenate(labels), np.concatenate(preds)

In [114]:
model = MarkdownModel()
model = model.cuda()
model.load_state_dict(torch.load('./my_model/First/my_own_model_21.bin'))
y_test = validate(model, val_loader, mode='test')

Some weights of the model checkpoint at ./Model were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ./Model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN t

100%|██████████████████████████████████████████████████████████████████████████████████| 41/41 [00:15<00:00,  2.56it/s]


In [115]:
preds_copy = y_test

In [116]:
len(preds_copy)

10389

In [117]:
pred_vals = []
count = 0
for id, df_tmp in tqdm(df.groupby('id')):
    df_tmp_mark = df_tmp[df_tmp['cell_type']=='markdown']
    df_tmp_code = df_tmp[df_tmp['cell_type']!='markdown']
    df_tmp_code_rank = df_tmp_code['rank'].rank().values
    N_code = len(df_tmp_code_rank)
    N_mark = len(df_tmp_mark)

    preds_tmp = preds_copy[count:count+N_mark * N_code]
    count += N_mark + N_code

    for i in range(N_mark):
        pred = preds_tmp[i*N_code:i*N_code+N_code] 

        softmax = np.exp((pred-np.mean(pred)) *20)/np.sum(np.exp((pred-np.mean(pred)) *20)) 

        rank = np.sum(softmax * df_tmp_code_rank)
        pred_vals.append(rank)

100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 416.57it/s]


In [118]:
df.loc[df["cell_type"] == "markdown", "pred"] = pred_vals

In [119]:
sub_df = df.sort_values("pred").groupby("id")["cell_id"].apply(lambda x: " ".join(x)).reset_index()
sub_df.rename(columns={"cell_id": "cell_order"}, inplace=True)

In [120]:
from bisect import bisect


def count_inversions(a):
    inversions = 0
    sorted_so_far = []
    for i, u in enumerate(a):
        j = bisect(sorted_so_far, u)
        inversions += i - j
        sorted_so_far.insert(j, u)
    return inversions


def kendall_tau(ground_truth, predictions):
    total_inversions = 0
    total_2max = 0  # twice the maximum possible inversions across all instances
    for gt, pred in zip(ground_truth, predictions):
        ranks = [gt.index(x) for x in pred]  # rank predicted order in terms of ground truth
        total_inversions += count_inversions(ranks)
        n = len(gt)
        total_2max += n * (n - 1)
    return 1 - 4 * total_inversions / total_2max

In [121]:
p = [_p.split() for _p in sub_df['cell_order']]

In [122]:
y = [list(df_orders.loc[x]) for x in sub_df['id']]

In [123]:
kendall_tau(y, p) 
#0.5901620303153493

0.4022510087067318

In [27]:
#kendall_tau(y, p) #21
#0.46545454545454545

In [53]:
df

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pred
0,000624d747afd3,bbdeb43f,code,this python environment comes with many helpful analytics libraries installed it is defined by the kaggle python do...,2,55fbe5b5,,
1,000624d747afd3,fd65ce23,code,pip install scikit allel,4,55fbe5b5,,
2,000624d747afd3,9f08dd38,code,import allel allel version,5,55fbe5b5,,
3,000624d747afd3,8601e3b0,code,callset allel read vcf input end als end als genomics data answerals subset annovar hg anno and geno no intergenic ...,7,55fbe5b5,,
4,000624d747afd3,1ad2d212,code,sorted callset keys,10,55fbe5b5,,
5,000624d747afd3,296862c5,code,callset samples,12,55fbe5b5,,
6,000624d747afd3,ac95ae9c,code,import numpy as np np version,13,55fbe5b5,,
7,000624d747afd3,8d2df64a,code,pip install zarr,14,55fbe5b5,,
8,000624d747afd3,a07f8cf0,code,import zarr zarr version,15,55fbe5b5,,
9,000624d747afd3,7527cb70,code,pip install numcodecs,16,55fbe5b5,,
