In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader, Dataset
import os, sys

os.environ["WANDB_DISABLED"] = "true"

pd.options.display.width = 180
pd.options.display.max_colwidth = 120

model_path = "../input/kagglemodels/GraphCodeBert"
ckpt_path = "../input/kaggle-models/model.bin"
tokenizer_path = "../input/kagglemodels/GraphCodeBert/tokenizer"

data_dir = Path('../input/AI4Code')
total_max_len = 512
TOKENIZERS_PARALLELISM=False

In [2]:
def read_notebook(path):
    return (
        pd.read_json(
            path,
            dtype={'cell_type': 'category', 'source': 'str'})
        .assign(id=path.stem)
        .rename_axis('cell_id')
    )

paths_test = list((data_dir / 'test').glob('*.json'))
notebooks_test = [
    read_notebook(path) for path in tqdm(paths_test, desc='Test NBs')
]
df = (
    pd.concat(notebooks_test)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
).reset_index()
df["rank"] = df.groupby(["id", "cell_type"]).cumcount()
df["pred"] = df.groupby(["id", "cell_type"])["rank"].rank(pct=True)
df.source = df.source.str.lower()

Test NBs: 100%|██████████| 4/4 [00:00<00:00, 66.21it/s]


In [3]:
def clean_code(cell):
    return str(cell).replace("\\n", "\n")


def sample_cells(cells, n):
    cells = [clean_code(cell) for cell in cells]
    if n >= len(cells):
        return [cell[:200] for cell in cells]
    else:
        results = []
        step = len(cells) / n
        idx = 0
        while int(np.round(idx)) < len(cells):
            results.append(cells[int(np.round(idx))])
            idx += step
        assert cells[0] in results
        if cells[-1] not in results:
            results[-1] = cells[-1]
        return results


def get_features(df):
    features = dict()
    df = df.sort_values("rank").reset_index(drop=True)
    for idx, sub_df in tqdm(df.groupby("id")):
        features[idx] = dict()
        total_md = sub_df[sub_df.cell_type == "markdown"].shape[0]
        code_sub_df = sub_df[sub_df.cell_type == "code"]
        total_code = code_sub_df.shape[0]
        codes = sample_cells(code_sub_df.source.values, 20)
        features[idx]["total_code"] = total_code
        features[idx]["total_md"] = total_md
        features[idx]["codes"] = codes
    return features

In [4]:
test_fts = get_features(df)

import regex as re

def preprocess_text_sc(document):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(document))

    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)

    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    return document

df.source = df.source.apply(preprocess_text_sc)

100%|██████████| 4/4 [00:00<00:00, 498.08it/s]


In [5]:
from torch import nn
from transformers import AutoTokenizer, AutoModel

class MarkdownModel(nn.Module):
    def __init__(self, model_path):
        super(MarkdownModel, self).__init__()
        self.model = AutoModel.from_pretrained(model_path)
        self.top = nn.Linear(769, 1)
        
    def forward(self, ids, mask, fts):
        x = self.model(ids, mask)[0]
        x = self.top(torch.cat((x[:, 0, :], fts),1))
        return x

class MarkdownDataset(Dataset):

    def __init__(self, df, tokenizer_path, total_max_len, md_max_len, fts):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.md_max_len = md_max_len
        self.total_max_len = total_max_len  # maxlen allowed by model config
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        self.fts = fts
        self.dict_code_encode = self.code_encodes(self.df, self.fts)
        
    
    def code_encodes(self, df, fts):
        list_index = self.df.id.unique()
        dict_code_encode = {}
        for idx in list_index:
            code_inputs = self.tokenizer.batch_encode_plus(
                [str(x) for x in self.fts[idx]["codes"]],
                add_special_tokens=True,
                max_length=23,
                padding="max_length",
                truncation=True
            )
            dict_code_encode[idx] = code_inputs
        return dict_code_encode
    
    
    def __getitem__(self, index):
        row = self.df.iloc[index]

        inputs = self.tokenizer.encode_plus(
            row.source,
            None,
            add_special_tokens=True,
            max_length=self.md_max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
        code_inputs = self.dict_code_encode[row.id]
        n_md = self.fts[row.id]["total_md"]
        n_code = self.fts[row.id]["total_md"]
        if n_md + n_code == 0:
            fts = torch.FloatTensor([0])
        else:
            fts = torch.FloatTensor([n_md / (n_md + n_code)])

        ids = inputs['input_ids']
        for x in code_inputs['input_ids']:
            ids.extend(x[:-1])
        ids = ids[:self.total_max_len]
        if len(ids) != self.total_max_len:
            ids = ids + [self.tokenizer.pad_token_id, ] * (self.total_max_len - len(ids))
        ids = torch.LongTensor(ids)

        mask = inputs['attention_mask']
        for x in code_inputs['attention_mask']:
            mask.extend(x[:-1])
        mask = mask[:self.total_max_len]
        if len(mask) != self.total_max_len:
            mask = mask + [self.tokenizer.pad_token_id, ] * (self.total_max_len - len(mask))
        mask = torch.LongTensor(mask)

        assert len(ids) == self.total_max_len

        return ids, mask, fts, torch.FloatTensor([row.pct_rank])

    def __len__(self):
        return self.df.shape[0]


In [6]:
def read_data(data):
    return tuple(d.cuda() for d in data[:-1]), data[-1].cuda()


def validate(model, val_loader):
    model.eval()
    
    tbar = tqdm(val_loader, file=sys.stdout)
    
    preds = []
    labels = []

    with torch.no_grad():
        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)

            pred = model(*inputs)

            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())
    
    return np.concatenate(labels), np.concatenate(preds)

def predict(model_path, ckpt_path, tokenizer_path):
    model = MarkdownModel(model_path)
    model = model.to('cuda:0')
    model.eval()
    model.load_state_dict(torch.load(ckpt_path, map_location='cuda:0'))
    BS = 32
    NW = 8
    MAX_LEN = 64
    df["pct_rank"] = 0
    test_ds = MarkdownDataset(df[df["cell_type"] == "markdown"].reset_index(drop=True), 
                              md_max_len=64,
                              tokenizer_path = tokenizer_path,
                              total_max_len = total_max_len,
                              fts=test_fts)
    test_loader = DataLoader(test_ds, batch_size=BS, 
                             shuffle=False,
                             num_workers=NW,
                             pin_memory=False,
                             drop_last=False)
    _, y_test = validate(model, test_loader)
    return y_test

In [7]:
y_test = predict(model_path, ckpt_path, tokenizer_path)

  cpuset_checked))


  0%|          | 0/2 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling pa

In [8]:
df.loc[df.cell_type=='markdown', "pred"] = y_test
sub_df = df.sort_values("pred").groupby("id")["cell_id"].apply(lambda x: " ".join(x)).reset_index()
sub_df.rename(columns={"cell_id": "cell_order"}, inplace=True)
sub_df.head()

Unnamed: 0,id,cell_order
0,0009d135ece78d,0a226b6a ddfd239c 8cb8d28a c6cd22db 1372ae9b e25aa9bd 90ed07ab ba55e576 7f388a41 f9893819 2843a25a 39e937ec 06dbf8cf
1,0010483c12ba9b,7f270e34 54c7cab3 fe66203e 7844d5f8 5ce8863c 4a0777c4 4703bb6d 4a32c095 865ad516 02a0be6d
2,0010a919d60e4f,23607d04 b7578789 aafc3d23 80e077ec bbff12d4 b190ebb4 d3f5c397 8ce62db4 584f6568 ed415c3c 89b1fdd2 322850af 35cd0771...
3,0028856e09c5b7,eb293dfc 012c9d02 d22526d1 3ae7ece3


In [9]:
sub_df.to_csv('submission.csv', index=False)