# 0. Add Data 
## - 1. Accelerator GPU & AI4Code Data
## - 2. Run https://github.com/hotorch/ai4code-baseline/blob/main/code/preprocess.py
## - 3. load codebert

# 1. dataset.py


In [1]:
from torch.utils.data import DataLoader, Dataset
import torch
from transformers import AutoTokenizer

class MarkdownDataset(Dataset):

    def __init__(self, df, model_name_or_path, total_max_len, md_max_len, fts):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.md_max_len = md_max_len
        self.total_max_len = total_max_len  # maxlen allowed by model config
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.fts = fts

    def __getitem__(self, index):
        row = self.df.iloc[index]
        
        # markdown 
        inputs = self.tokenizer.encode_plus(
            row.source,
            None,
            add_special_tokens=True,
            max_length=self.md_max_len,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
        # code
        code_inputs = self.tokenizer.batch_encode_plus(
            [str(x) for x in self.fts[row.id]["codes"]],
            add_special_tokens=True,
            max_length=24, #################
            padding="max_length",
            truncation=True
        )
        n_md = self.fts[row.id]["total_md"]
        n_code = self.fts[row.id]["total_md"]
        if n_md + n_code == 0:
            fts = torch.FloatTensor([0])
        else:
            fts = torch.FloatTensor([n_md / (n_md + n_code)])

        ids = inputs['input_ids']
        for x in code_inputs['input_ids']:
            ids.extend(x[:-1])
        ids = ids[:self.total_max_len]
        if len(ids) != self.total_max_len:
            ids = ids + [self.tokenizer.pad_token_id, ] * (self.total_max_len - len(ids))
        ids = torch.LongTensor(ids)

        mask = inputs['attention_mask']
        for x in code_inputs['attention_mask']:
            mask.extend(x[:-1])
        mask = mask[:self.total_max_len]
        if len(mask) != self.total_max_len:
            mask = mask + [self.tokenizer.pad_token_id, ] * (self.total_max_len - len(mask))
        mask = torch.LongTensor(mask)

        assert len(ids) == self.total_max_len

        return ids, mask, fts, torch.FloatTensor([row.pct_rank])

    def __len__(self):
        return self.df.shape[0]


# 2. Metric.py

In [2]:
from bisect import bisect

def count_inversions(a):
    inversions = 0
    sorted_so_far = []
    for i, u in enumerate(a):
        j = bisect(sorted_so_far, u)
        inversions += i - j
        sorted_so_far.insert(j, u)
    return inversions


def kendall_tau(ground_truth, predictions):
    total_inversions = 0
    total_2max = 0  # twice the maximum possible inversions across all instances
    for gt, pred in zip(ground_truth, predictions):
        ranks = [gt.index(x) for x in pred]  # rank predicted order in terms of ground truth
        total_inversions += count_inversions(ranks)
        n = len(gt)
        total_2max += n * (n - 1)
    return 1 - 4 * total_inversions / total_2max

# 3. model.py

In [3]:
import torch.nn.functional as F
import torch.nn as nn
import torch
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup


class MarkdownModel(nn.Module):
    def __init__(self, model_path):
        super(MarkdownModel, self).__init__()
        self.model = AutoModel.from_pretrained(model_path)
        self.top = nn.Linear(769, 1)

    def forward(self, ids, mask, fts):
        x = self.model(ids, mask)[0]
        x = torch.cat((x[:, 0, :], fts), 1)
        x = self.top(x)
        return x

# 4. Train

In [4]:
import json
from pathlib import Path
# from dataset import *
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
# from model import *
from tqdm import tqdm
import sys, os
# from metrics import *
import torch
# import argparse

# parser = argparse.ArgumentParser(description='Process some arguments')
# parser.add_argument('--model_name_or_path', type=str, default='microsoft/codebert-base')
# parser.add_argument('--train_mark_path', type=str, default='./data/train_mark.csv')
# parser.add_argument('--train_features_path', type=str, default='./data/train_fts.json')
# parser.add_argument('--val_mark_path', type=str, default='./data/val_mark.csv')
# parser.add_argument('--val_features_path', type=str, default='./data/val_fts.csv')
# parser.add_argument('--val_path', type=str, default="./data/val.csv")

# parser.add_argument('--md_max_len', type=int, default=64)
# parser.add_argument('--total_max_len', type=int, default=512)
# parser.add_argument('--batch_size', type=int, default=8)
# parser.add_argument('--accumulation_steps', type=int, default=4)
# parser.add_argument('--epochs', type=int, default=5)
# parser.add_argument('--n_workers', type=int, default=8)

# args = parser.parse_args()
os.mkdir("./outputs")
# data_dir = Path('..//input/')
data_dir = Path('../input/AI4Code')

## 4-0. Define Constants


In [5]:
train_data_start_point = 54321
train_data_num = 15000

valid_data_start_point = 4321
valid_data_num = 2500

## 4-1. Load Preprocessed Data - Train

In [6]:
%%time
# train_df_mark = pd.read_csv(args.train_mark_path).drop("parent_id", axis=1).dropna().reset_index(drop=True)
# train_fts = json.load(open(args.train_features_path))
# val_df_mark = pd.read_csv(args.val_mark_path).drop("parent_id", axis=1).dropna().reset_index(drop=True)
# val_fts = json.load(open(args.val_features_path))
# val_df = pd.read_csv(args.val_path)
train_df_mark = pd.read_csv('../input/ai4code-preprocess/data/train_mark.csv').drop("parent_id", axis=1).dropna().reset_index(drop=True)
train_fts = json.load(open('../input/ai4code-preprocess/data/train_fts.json'))

CPU times: user 15.1 s, sys: 2.08 s, total: 17.1 s
Wall time: 26.9 s


In [7]:
print(train_df_mark.shape)

(1949464, 7)


In [8]:
train_df_mark.id.nunique()

125243

## 학습 데이터 건수가 너무 많아 조정

In [9]:
len(train_df_mark.id.tolist())

1949464

In [10]:
train_sample_mark_id = train_df_mark.id.unique().tolist()[train_data_start_point:train_data_start_point+train_data_num]
# train_sample_mark_id = train_df_mark.id.unique().tolist()[121:191]
train_df_mark = train_df_mark[train_df_mark['id'].isin(train_sample_mark_id)].reset_index(drop=True)

In [11]:
train_df_mark.shape

(233939, 7)

## 4-2. Load Preprocessed Data - Valid

In [12]:
%%time
val_df_mark = pd.read_csv('../input/ai4code-preprocess/data/val_mark.csv').drop("parent_id", axis=1).dropna().reset_index(drop=True)
val_fts = json.load(open('../input/ai4code-preprocess/data/val_fts.json'))
val_df = pd.read_csv('../input/ai4code-preprocess/data/val.csv')

CPU times: user 4.67 s, sys: 300 ms, total: 4.97 s
Wall time: 8.23 s


## 학습 데이터 건수가 너무 많아 조정

In [13]:
# val_sample_mark_id = val_df_mark.id.unique().tolist()[4321:9321]
val_sample_mark_id = val_df_mark.id.unique().tolist()[valid_data_start_point:valid_data_start_point+valid_data_num]
val_df_mark = val_df_mark[val_df_mark['id'].isin(val_sample_mark_id)].reset_index(drop=True)

In [14]:
val_df_mark.cell_type.value_counts()

markdown    38984
Name: cell_type, dtype: int64

In [15]:
val_df_mark.shape

(38984, 7)

## 4-3. Define Constant Values

In [16]:

# parser.add_argument('--md_max_len', type=int, default=64)
# parser.add_argument('--total_max_len', type=int, default=512)
# parser.add_argument('--batch_size', type=int, default=8)
# parser.add_argument('--accumulation_steps', type=int, default=4)
# parser.add_argument('--epochs', type=int, default=5)
# parser.add_argument('--n_workers', type=int, default=8)

md_max_len = 64
total_max_len = 512
batch_size = 8
accumulation_steps = 4
epochs = 2
n_workers = 8
model_name_or_path = '../input/codebertbase'

save_model_prefix_name = f'sample_{md_max_len}_{total_max_len}_{epochs}_{train_data_start_point}_{train_data_num}_model'

print(save_model_prefix_name)

sample_64_512_2_54321_15000_model


## 4-4. Load Origin Data

In [17]:
%%time
order_df = pd.read_csv(data_dir / "train_orders.csv").set_index("id")
df_orders = pd.read_csv(
    data_dir / 'train_orders.csv',
    index_col='id',
    squeeze=True,
).str.split()

CPU times: user 1.98 s, sys: 275 ms, total: 2.25 s
Wall time: 2.71 s


In [18]:
df_orders

id
00001756c60be8    [1862f0a6, 448eb224, 2a9e43d6, 7e2f170a, 038b7...
00015c83e2717b    [2e94bd7a, 3e99dee9, b5e286ea, da4f7550, c4172...
0001bdd4021779    [3fdc37be, 073782ca, 8ea7263c, 80543cd8, 38310...
0001daf4c2c76d    [97266564, a898e555, 86605076, 76cc2642, ef279...
0002115f48f982    [9ec225f0, 18281c6c, e3b6b115, 4a044c54, 365fe...
                                        ...                        
fffc30d5a0bc46    [09727c0c, ff1ea6a0, ddfef603, a01ce9b3, 3ba95...
fffc3b44869198    [978a5137, faa48f03, 28dfb12a, eea2e812, 64fef...
fffc63ff750064    [5015c300, 411b85d9, 8238198c, f4781d1d, b5532...
fffcd063cda949    [7e6266ad, d8281fc5, d4ffcaef, 3e0e4a47, 21387...
fffe1d764579d5    [1a63248d, 9c3b96a5, 1398a873, 4e2d4c2d, f71c5...
Name: cell_order, Length: 139256, dtype: object

In [19]:
# order_df = order_df[order_df.index.isin(train_sample_mark_id)].reset_index(drop = True)
# df_orders = df_orders[df_orders.index.isin(train_sample_mark_id)].reset_index(drop = True)

In [20]:
order_df

Unnamed: 0_level_0,cell_order
id,Unnamed: 1_level_1
00001756c60be8,1862f0a6 448eb224 2a9e43d6 7e2f170a 038b763d 7...
00015c83e2717b,2e94bd7a 3e99dee9 b5e286ea da4f7550 c417225b 5...
0001bdd4021779,3fdc37be 073782ca 8ea7263c 80543cd8 38310c80 0...
0001daf4c2c76d,97266564 a898e555 86605076 76cc2642 ef279279 d...
0002115f48f982,9ec225f0 18281c6c e3b6b115 4a044c54 365fe576 a...
...,...
fffc30d5a0bc46,09727c0c ff1ea6a0 ddfef603 a01ce9b3 3ba953ee b...
fffc3b44869198,978a5137 faa48f03 28dfb12a eea2e812 64fef97c 4...
fffc63ff750064,5015c300 411b85d9 8238198c f4781d1d b5532930 e...
fffcd063cda949,7e6266ad d8281fc5 d4ffcaef 3e0e4a47 21387fc8 c...


In [21]:
model_name_or_path

'../input/codebertbase'

In [22]:
AutoTokenizer.from_pretrained(model_name_or_path)

PreTrainedTokenizerFast(name_or_path='../input/codebertbase', vocab_size=50265, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [23]:
# train_ds = MarkdownDataset(train_df_mark, model_name_or_path=args.model_name_or_path, md_max_len=args.md_max_len,
#                            total_max_len=args.total_max_len, fts=train_fts)
# val_ds = MarkdownDataset(val_df_mark, model_name_or_path=args.model_name_or_path, md_max_len=args.md_max_len,
#                          total_max_len=args.total_max_len, fts=val_fts)
# train_loader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, num_workers=args.n_workers,
#                           pin_memory=False, drop_last=True)
# val_loader = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False, num_workers=args.n_workers,
#                         pin_memory=False, drop_last=False)




train_ds = MarkdownDataset(train_df_mark, model_name_or_path = model_name_or_path, md_max_len=md_max_len,
                           total_max_len=total_max_len, fts=train_fts)
val_ds = MarkdownDataset(val_df_mark, model_name_or_path=model_name_or_path, md_max_len=md_max_len,
                         total_max_len=total_max_len, fts=val_fts)

In [24]:
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=n_workers,
                          pin_memory=False, drop_last=True)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=n_workers,
                        pin_memory=False, drop_last=False)

  cpuset_checked))


In [25]:
val_ds.df.tail(5)

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,pct_rank
38979,7ce09c96a95315,32de4ae3,markdown,# Skewed features\n,19.0,1af46790,0.487179
38980,7ce09c96a95315,d18236e9,markdown,# Save results,36.0,1af46790,0.923077
38981,7ce09c96a95315,122e732c,markdown,# Checking the target,5.0,1af46790,0.128205
38982,7ce09c96a95315,272647ac,markdown,# Data Loading,2.0,1af46790,0.051282
38983,7ce09c96a95315,d722ad32,markdown,FASTAI Modelling,22.0,1af46790,0.564103


In [26]:
val_ds.tokenizer

PreTrainedTokenizerFast(name_or_path='../input/codebertbase', vocab_size=50265, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

In [27]:
def read_data(data):
    return tuple(d.cuda() for d in data[:-1]), data[-1].cuda()


def validate(model, val_loader):
    model.eval()

    tbar = tqdm(val_loader, file=sys.stdout)

    preds = []
    labels = []

    with torch.no_grad():
        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)

            with torch.cuda.amp.autocast():
                pred = model(*inputs)

            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())

    return np.concatenate(labels), np.concatenate(preds)

In [28]:
def train(model, train_loader, val_loader, epochs):
    np.random.seed(4321)
    # Creating optimizer and lr schedulers
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

#     num_train_optimization_steps = int(args.epochs * len(train_loader) / args.accumulation_steps)
    num_train_optimization_steps = int(epochs * len(train_loader) / accumulation_steps)
    optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5,
                      correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_optimization_steps,
                                                num_training_steps=num_train_optimization_steps)  # PyTorch scheduler

    criterion = torch.nn.L1Loss()
    scaler = torch.cuda.amp.GradScaler()

    for e in range(epochs):
        model.train()
        tbar = tqdm(train_loader, file=sys.stdout)
        loss_list = []
        preds = []
        labels = []

        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)

            with torch.cuda.amp.autocast():
                pred = model(*inputs)
                loss = criterion(pred, target)
            scaler.scale(loss).backward()
            # if idx % args.accumulation_steps == 0 or idx == len(tbar) - 1:
            if idx % accumulation_steps == 0 or idx == len(tbar) - 1:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

            loss_list.append(loss.detach().cpu().item())
            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())

            avg_loss = np.round(np.mean(loss_list), 4)

            tbar.set_description(f"Epoch {e + 1} Loss: {avg_loss} lr: {scheduler.get_last_lr()}")

        y_val, y_pred = validate(model, val_loader)
        # print('y_val : ', y_val)
        # print('y_pred : ', y_pred)
        # print('val_df : ', val_df.head(5))
        val_df["pred"] = val_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)
        # print('val_df["pred"] : ', val_df["pred"])
        y_pred = val_df.loc[val_df["cell_type"] == "markdown", "pred"]
        # val_df.loc[val_df["cell_type"] == "markdown", "pred"] = y_pred
        y_dummy = val_df.sort_values("pred").groupby('id')['cell_id'].apply(list)
        # print('y_dummy : ', y_dummy[0:20])
        # print('y_dummy.index : ', y_dummy.index[0:20])
        
        # print('df_orders.loc[y_dummy.index] : ', df_orders.loc[y_dummy.index])
        # print("Preds score", kendall_tau(df_orders.loc[y_dummy.index], y_dummy)) ### ERROR!
        torch.save(model.state_dict(), f"./outputs/{save_model_prefix_name}.bin")

    return model, y_pred

In [29]:
model = MarkdownModel(model_name_or_path)
model = model.cuda()

In [30]:
model, y_pred = train(model, train_loader, val_loader, epochs=epochs)



  0%|          | 0/29242 [00:00<?, ?it/s]

  cpuset_checked))


Epoch 1 Loss: 1.195 lr: [4.103686478353054e-08, 4.103686478353054e-08]:   0%|          | 1/29242 [00:02<21:41:20,  2.67s/it]



Epoch 1 Loss: 0.1905 lr: [1.5786233931727617e-05, 1.5786233931727617e-05]: 100%|██████████| 29242/29242 [3:38:08<00:00,  2.23it/s]
100%|██████████| 4873/4873 [12:54<00:00,  6.29it/s]
Epoch 2 Loss: 0.1324 lr: [0.0, 0.0]: 100%|██████████| 29242/29242 [3:38:31<00:00,  2.23it/s]
100%|██████████| 4873/4873 [12:56<00:00,  6.28it/s]


##

In [31]:
import gc
gc.collect()

21

# 5. Inference

In [32]:
# def read_notebook(path):
#     return (
#         pd.read_json(
#             path,
#             dtype={'cell_type': 'category', 'source': 'str'})
#         .assign(id=path.stem)
#         .rename_axis('cell_id')
#     )

# paths_test = list((data_dir / 'test').glob('*.json'))
# notebooks_test = [
#     read_notebook(path) for path in tqdm(paths_test, desc='Test NBs')
# ]
# test_df = (
#     pd.concat(notebooks_test)
#     .set_index('id', append=True)
#     .swaplevel()
#     .sort_index(level='id', sort_remaining=False)
# ).reset_index()
# test_df["rank"] = test_df.groupby(["id", "cell_type"]).cumcount()
# test_df["pred"] = test_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)

In [33]:

# test_df.tail(5)

In [34]:
# # Additional code cells
# def clean_code(cell):
#     return str(cell).replace("\\n", "\n")


# def sample_cells(cells, n):
#     cells = [clean_code(cell) for cell in cells]
#     if n >= len(cells):
#         return [cell[:200] for cell in cells]
#     else:
#         results = []
#         step = len(cells) / n
#         idx = 0
#         while int(np.round(idx)) < len(cells):
#             results.append(cells[int(np.round(idx))])
#             idx += step
#         assert cells[0] in results
#         if cells[-1] not in results:
#             results[-1] = cells[-1]
#         return results


# def get_features(df):
#     features = dict()
#     df = df.sort_values("rank").reset_index(drop=True)
#     for idx, sub_df in tqdm(df.groupby("id")):
#         features[idx] = dict()
#         total_md = sub_df[sub_df.cell_type == "markdown"].shape[0]
#         code_sub_df = sub_df[sub_df.cell_type == "code"]
#         total_code = code_sub_df.shape[0]
#         codes = sample_cells(code_sub_df.source.values, 20)
#         features[idx]["total_code"] = total_code
#         features[idx]["total_md"] = total_md
#         features[idx]["codes"] = codes
#     return features

In [35]:
# test_fts = get_features(test_df)

In [36]:
# test_fts['0009d135ece78d']

In [37]:
# def predict(model_path, ckpt_path):
#     model = MarkdownModel(model_path)
#     model = model.cuda()
#     model.eval()
#     model.load_state_dict(torch.load(ckpt_path))
#     BS = 32
#     NW = 8
#     MAX_LEN = 64
#     test_df["pct_rank"] = 0
#     test_ds = MarkdownDataset(test_df[test_df["cell_type"] == "markdown"].reset_index(drop=True), md_max_len=64,total_max_len=512, model_name_or_path=model_path, fts=test_fts)
#     test_loader = DataLoader(test_ds, batch_size=BS, shuffle=False, num_workers=NW,
#                               pin_memory=False, drop_last=False)
#     _, y_test = validate(model, test_loader)
#     return y_test



# ckpt_path = "./outputs/sample_model.bin" 
# y_test_2 = predict(model_name_or_path, ckpt_path)

In [38]:
# y_test = y_test_2

In [39]:
# test_df.loc[test_df["cell_type"] == "markdown", "pred"] = y_test

In [40]:
# sub_df = test_df.sort_values("pred").groupby("id")["cell_id"].apply(lambda x: " ".join(x)).reset_index()
# sub_df.rename(columns={"cell_id": "cell_order"}, inplace=True)
# sub_df.head()

In [41]:
# sub_df.to_csv("submission.csv", index=False)