# Training

- https://github.com/suicao/ai4code-baseline 참고
    - metric.py - kendall_tau
    - model.py - class MarkdownModel
    - dataset.py - class MarkdownDataset
    - preprocess.py - read_notebook, ... 등 학습을 위해 필요한 파일 생성 및 저장


- 기존 공개되어있던 Sample code를 BASE로 작성된 코드
    - 모델을 - codeBERT 사용


- 현재 성능 0.8151
    - 20k 데이터셋 사용



## import package

In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm

pd.options.display.width = 180
pd.options.display.max_colwidth = 120

data_dir = Path('./input')

In [2]:
import os

os.makedirs("./outputs", exist_ok=True)

## metric

In [3]:
#metric.py
from bisect import bisect

def count_inversions(a):
    inversions = 0
    sorted_so_far = []
    for i, u in enumerate(a):
        j = bisect(sorted_so_far, u)
        inversions += i - j
        sorted_so_far.insert(j, u)
    return inversions


def kendall_tau(ground_truth, predictions):
    total_inversions = 0
    total_2max = 0  # twice the maximum possible inversions across all instances
    for gt, pred in zip(ground_truth, predictions):
        ranks = [gt.index(x) for x in pred]  # rank predicted order in terms of ground truth
        total_inversions += count_inversions(ranks)
        n = len(gt)
        total_2max += n * (n - 1)
    return 1 - 4 * total_inversions / total_2max


## Model & Dataset 정의

In [4]:
#model.py
from tqdm import tqdm
import sys, os
from transformers import AutoModel, AutoTokenizer
import torch.nn.functional as F
import torch.nn as nn
import torch

class MarkdownModel(nn.Module):
    def __init__(self, model_path):
        super(MarkdownModel, self).__init__()
        self.model = AutoModel.from_pretrained(model_path) #codebert-base
        self.top = nn.Linear(769, 1) #769? 
        
    def forward(self, ids, mask, fts):
        x = self.model(ids, mask)[0]
        x = self.top(torch.cat((x[:, 0, :], fts),1)) #여기 질문
        return x


#dataset.py
from torch.utils.data import DataLoader, Dataset

class MarkdownDataset(Dataset):

    def __init__(self, df, model_name_or_path, total_max_len, md_max_len, fts):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.md_max_len = md_max_len
        self.total_max_len = total_max_len  # maxlen allowed by model config
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.fts = fts #features

    def __getitem__(self, index):
        row = self.df.iloc[index]

        inputs = self.tokenizer.encode_plus( # 마크다운 tokenizer
            row.source,
            None,
            add_special_tokens=True,
            max_length=self.md_max_len,
            padding="max_length", #64
            return_token_type_ids=True,
            truncation=True
        )
        code_inputs = self.tokenizer.batch_encode_plus( # 해당 md를 포함하는 노트북의 모든 코드 셀 tokenizer
            [str(x) for x in self.fts[row.id]["codes"]],
            add_special_tokens=True,
            max_length=35, #?? 실제 input을 보고 판단!
            padding="max_length",
            truncation=True
        )
        n_md = self.fts[row.id]["total_md"]
        n_code = self.fts[row.id]["total_code"] # ["total_code"] 여야할 것 같은데??? total_md 가 아니라
        if n_md + n_code == 0:
            fts = torch.FloatTensor([0])
        else:
            fts = torch.FloatTensor([n_md / (n_md + n_code)]) #fts- 해당 노트북의 마크다운 비율

        ids = inputs['input_ids'] #해당 token ids
        for x in code_inputs['input_ids']:
            ids.extend(x[:-1])
        ids = ids[:self.total_max_len]
        if len(ids) != self.total_max_len:
            ids = ids + [self.tokenizer.pad_token_id, ] * (self.total_max_len - len(ids))
        ids = torch.LongTensor(ids)

        mask = inputs['attention_mask'] #해당 token attention_mask
        for x in code_inputs['attention_mask']:
            mask.extend(x[:-1])
        mask = mask[:self.total_max_len]
        if len(mask) != self.total_max_len:
            mask = mask + [self.tokenizer.pad_token_id, ] * (self.total_max_len - len(mask)) #512
        mask = torch.LongTensor(mask)

        assert len(ids) == self.total_max_len

        return ids, mask, fts, torch.FloatTensor([row.pct_rank])

    def __len__(self):
        return self.df.shape[0]

# Pairwise MarkdownDataset 과의 차이점(의미론적)

코드랑 마크다운의 시퀀스를 고려한게 성능이 더 잘나오는 것 같음 -> pairwise


codebert(이 코드)는 한 마크다운에 대해, 해당 노트북의 모든 코드셀을 뒤에 붙임
- 토크나이징 먼저하고 후에 싹 다 합침

    

pairwise 는, 올바른 순서에 대해서, 한 마크다운&코드 쌍에 대해서만 데이터셋을 만들어서 넘겨줌
- 텍스트를 싹 다 합친 후 토크나이징




# 학습

In [5]:
model_name_or_path = 'microsoft/codebert-base'

In [6]:
import os

os.makedirs("./outputs", exist_ok=True)


In [8]:
# 50k

# train_df_mark = pd.read_csv('./data/train_mark.csv').drop("parent_id", axis=1).dropna().reset_index(drop=True)
# train_fts = json.load(open('./data/train_fts.json'))
# val_df_mark = pd.read_csv('./data/val_mark.csv').drop("parent_id", axis=1).dropna().reset_index(drop=True)
# val_fts = json.load(open('./data/val_fts.json'))
# val_df = pd.read_csv('./data/val.csv')

### 필요한 데이터 로드

In [9]:
# 20k

# train_df_mark = pd.read_csv('./data/train_mark_2.csv').drop("parent_id", axis=1).dropna().reset_index(drop=True)
# train_fts = json.load(open('./data/train_fts_2.json'))
# val_df_mark = pd.read_csv('./data/val_mark_2.csv').drop("parent_id", axis=1).dropna().reset_index(drop=True)
# val_fts = json.load(open('./data/val_fts_2.json'))
# val_df = pd.read_csv('./data/val_2.csv')

In [7]:
# 1k

train_df_mark = pd.read_csv('./data_1k/train_mark.csv').drop("parent_id", axis=1).dropna().reset_index(drop=True)
train_fts = json.load(open('./data_1k/train_fts.json'))
val_df_mark = pd.read_csv('./data_1k/val_mark.csv').drop("parent_id", axis=1).dropna().reset_index(drop=True)
val_fts = json.load(open('./data_1k/val_fts.json'))
val_df = pd.read_csv('./data_1k/val.csv')

In [8]:
data_dir = Path('./input/AI4Code')

order_df = pd.read_csv(data_dir / "train_orders.csv").set_index("id")
df_orders = pd.read_csv(
    data_dir / 'train_orders.csv',
    index_col='id',
    squeeze=True,
).str.split()



  df_orders = pd.read_csv(


In [14]:
# df_orders 정의

In [9]:
md_max_len = 64
total_max_len = 512
batch_size = 8
accumulation_steps = 4
epochs = 3
n_workers = 8


train_ds = MarkdownDataset(train_df_mark, model_name_or_path=model_name_or_path, md_max_len=md_max_len,
                           total_max_len=total_max_len, fts=train_fts)
val_ds = MarkdownDataset(val_df_mark, model_name_or_path=model_name_or_path, md_max_len=md_max_len,
                         total_max_len=total_max_len, fts=val_fts)
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=n_workers,
                          pin_memory=False, drop_last=True)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=n_workers,
                        pin_memory=False, drop_last=False)

### 필요한 함수 정의

In [10]:
def read_data(data):
    return tuple(d.to(device) for d in data[:-1]), data[-1].to(device)


def validate(model, val_loader):
    model.eval()

    tbar = tqdm(val_loader, file=sys.stdout)

    preds = []
    labels = []

    with torch.no_grad():
        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)

            with torch.cuda.amp.autocast():
                pred = model(*inputs)

            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())

    return np.concatenate(labels), np.concatenate(preds)


from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup

def train(model, train_loader, val_loader, epochs):
    np.random.seed(0)
    # 하이퍼파라미터 튜닝하는 부분!!!
    # Scheduler
    # Creating optimizer and lr schedulers
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    num_train_optimization_steps = int(epochs * len(train_loader) / accumulation_steps)
    optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5, #이 부분도 성능에 꽤나 영향을 미침..!
                      correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_optimization_steps,
                                                num_training_steps=num_train_optimization_steps)  # PyTorch scheduler

    # 러닝레이트(lr)-경사하강법으로 loss값 최소화 - 폭 조정.

    criterion = torch.nn.L1Loss() #L1, L2, MSE, ...
    scaler = torch.cuda.amp.GradScaler()

    for e in range(epochs):
        model.train()
        tbar = tqdm(train_loader, file=sys.stdout)
        loss_list = []
        preds = []
        labels = []

        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)

            with torch.cuda.amp.autocast():
                pred = model(*inputs)
                loss = criterion(pred, target)
            scaler.scale(loss).backward()
            if idx % accumulation_steps == 0 or idx == len(tbar) - 1: #파라미터 찾으려고 스케줄러 실행해주는데
                                        #매스텝마다 실행시키면 시간이 오래걸려서, 일정 조건을 주고 하도록 ex) 4번에 1번
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

            loss_list.append(loss.detach().cpu().item())
            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())

            avg_loss = np.round(np.mean(loss_list), 4)

            tbar.set_description(f"Epoch {e + 1} Loss: {avg_loss} lr: {scheduler.get_last_lr()}")

        y_val, y_pred = validate(model, val_loader)
        val_df["pred"] = val_df.groupby(["id", "cell_type"])["rank"].rank(pct=True) #그룹별 순위 구하기, 큰 순서대로 정렬
        val_df.loc[val_df["cell_type"] == "markdown", "pred"] = y_pred #마크다운엔 prediction 값
        y_dummy = val_df.sort_values("pred").groupby('id')['cell_id'].apply(list)
        print("Preds score", kendall_tau(df_orders.loc[y_dummy.index], y_dummy))
        torch.save(model.state_dict(), "./outputs/model_2.bin")

    return model, y_pred


In [11]:
for i, k in val_df.groupby(["id", "cell_type"])["rank"]:#.rank(pct=True)
    display(i,k)
    break


('003f36ab2c577d', 'code')

0      2
1      4
2      6
3      8
4     10
5     11
6     12
7     14
8     16
9     18
10    20
Name: rank, dtype: int64

In [12]:
val_df.loc[val_df['id']=='00062ab8487156']

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pct_rank


In [18]:
val_df["pred"] = val_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)
y_dummy = val_df.sort_values("pred").groupby('id')['cell_id'].apply(list)
y_dummy

id
00062ab8487156    [dcad687f, a2e1fc80, 7d977ee8, 45a82a59, cbbc326a, aa354742, 004d0eee, b9344b3d, 51709b61, 979a5b9a, 96c8449c, af2b0...
002d93ddca8c5d    [b6afdfdb, 0ac314cc, 522b1069, f9a6802d, d8352a91, c215d32e, 2ce6a42e, d7b802ab, d93dc64c, 366b6317, 25e523bd, 21105...
003f36ab2c577d    [386d31f0, 42435be9, 16435878, da99f684, f4bb282f, 82dfbe9a, 4cc5ee5a, 6d31400d, 215ec8c0, 3e1430c4, 3f201013, db8c6...
0048f2cacd521f    [0a160aa1, 0e00f03b, 16b810d8, bd68dd98, 8fa131c3, 601f0f79, 0aac6999, 8dd046f4, 77df26d1, 871b57ce, 2a04ba71, 98b2b...
005bf0cb0cf1e5    [a2dacc28, 4b164dc9, dc6b3a89, 9b368f74, 0c3e045a, d3175678, 042cfce6, 5589cf8a, 6b063353, bed0d2bc, 5182defc, 81c54...
                                                                           ...                                                           
ff26949c53037b    [38b61f5d, 8ba0f38c, e79bffc7, 7194e409, 9b7400f4, c9f7d4eb, fec3ec21, a813061a, b24e21b9, cb94c782, d3e56b0a, 2ba2f...
ff49b6c2609ed9    [42496f3f, f6

In [20]:
y_dummy.index
df_orders.loc[y_dummy.index]


id
00062ab8487156    [dcad687f, a2e1fc80, 7d977ee8, 45a82a59, cbbc326a, aa354742, 004d0eee, b9344b3d, 51709b61, 979a5b9a, af2b0426, 96c84...
002d93ddca8c5d    [0ac314cc, b6afdfdb, d8352a91, 2ce6a42e, 522b1069, f9a6802d, 366b6317, c215d32e, d7b802ab, 211055ea, d93dc64c, 25e52...
003f36ab2c577d    [42435be9, da99f684, 386d31f0, 82dfbe9a, 16435878, 6d31400d, f4bb282f, 3e1430c4, 4cc5ee5a, db8c69de, 215ec8c0, 3f201...
0048f2cacd521f    [0e00f03b, bd68dd98, 0a160aa1, 601f0f79, 16b810d8, 8fa131c3, 8dd046f4, 0aac6999, 871b57ce, 77df26d1, 2a04ba71, 98b2b...
005bf0cb0cf1e5    [9b368f74, a2dacc28, 4b164dc9, dc6b3a89, 0c3e045a, d3175678, 5589cf8a, 042cfce6, 6b063353, 81c543e7, bed0d2bc, 6ca5d...
                                                                           ...                                                           
ff26949c53037b    [38b61f5d, 8ba0f38c, e79bffc7, 7194e409, 9b7400f4, c9f7d4eb, a813061a, b24e21b9, cb94c782, d3e56b0a, 2ba2f9be, fec3e...
ff49b6c2609ed9    [42496f3f, c0

In [13]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [14]:
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name())

True
1
A100-SXM4-40GB MIG 1g.5gb


In [15]:
device

device(type='cuda')

In [16]:
import gc

gc.collect()
torch.cuda.empty_cache()

In [17]:
model = MarkdownModel(model_name_or_path)
model = model.to(device)
# model, y_pred = train(model, train_loader, val_loader, epochs=epochs)


In [18]:
model, y_pred = train(model, train_loader, val_loader, epochs=epochs)




  0%|                                                                                          | 0/1749 [00:01<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 4.75 GiB total capacity; 3.03 GiB already allocated; 4.00 MiB free; 3.10 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF