In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm

pd.options.display.width = 180
pd.options.display.max_colwidth = 120

data_dir = Path('./input/AI4Code')

In [98]:
NUM_TRAIN = 1000

#preprocess.py
def read_notebook(path):
    return (
        pd.read_json(
            path,
            dtype={'cell_type': 'category', 'source': 'str'})
        .assign(id=path.stem)
        .rename_axis('cell_id')
    )

# 리스트 내 확장자가 json인 파일 import
paths_train = list((data_dir / 'train').glob('*.json'))[:NUM_TRAIN]
# 읽어온 notebook을 리스트 형태로 저장
notebooks_train = [
    read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')
]

# pd.dataframe 형태로 저장 
df = (
    pd.concat(notebooks_train)
    .set_index('id', append=True)
    .swaplevel() # 인덱스의 기준값을 변경해줌 
    .sort_index(level='id', sort_remaining=False)
)

df

Train NBs: 100%|███████████████████████████████████████████████████████████████████| 1000/1000 [00:03<00:00, 293.88it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,cell_type,source
id,cell_id,Unnamed: 2_level_1,Unnamed: 3_level_1
000a2f5243e1ca,1d968d84,code,import pandas as pd\nimport numpy as np
000a2f5243e1ca,5774aca9,code,train=pd.read_csv('train.csv')
000a2f5243e1ca,2ddb979d,code,train.head()
000a2f5243e1ca,dc18005d,code,train=train.dropna()
000a2f5243e1ca,3c7d2db1,code,"x=train.drop('label',axis=1)"
...,...,...,...
ff9e97995c0fd1,8704f1b3,code,"from sklearn.ensemble import RandomForestClassifier\nRFC = RandomForestClassifier(random_state=42)\nRFC.fit(X,y)\ny_..."
ff9e97995c0fd1,644da0b1,code,"submission=pd.DataFrame({""label"":y_pred,""actual"":y})\nsubmission.to_csv(""submission.csv"", index=False)\nsubmission"
ff9e97995c0fd1,f1b9d756,code,from IPython.display import FileLink\nFileLink('submission.csv')
ff9e97995c0fd1,08b7cd21,code,"<a href=""./submission.csv""> Download File </a>"


In [99]:
# Train 데이터(notebook)의 order 순서가 적혀있는 데이터(train_orders.csv) import
# 여기서의 id는 notebook 단위

#preprocess.py -2
df_orders = pd.read_csv(
    data_dir / 'train_orders.csv',
    index_col='id',
    squeeze=True,
).str.split()  # cell_id가 텍스트로 붙어있음, 띄어쓰기 단위로 끊어서 리스트화

print(df_orders.shape)
df_orders.head(2)



  df_orders = pd.read_csv(


(139256,)


id
00001756c60be8    [1862f0a6, 448eb224, 2a9e43d6, 7e2f170a, 038b763d, 77e56113, 2eefe0ef, 1ae087ab, 0beab1cd, 8ffe0b25, 9a78ab76, 0d136...
00015c83e2717b    [2e94bd7a, 3e99dee9, b5e286ea, da4f7550, c417225b, 51e3cd89, 2600b4eb, 75b65993, cf195f8b, 25699d02, 72b3201a, f2c75...
Name: cell_order, dtype: object

In [100]:
# 위 결과처럼
# 정렬되어있지 않는 Train 데이터(notebook)의 cell 순서(rank)를 구하는 함수 설정

#preprocess.py -3
def get_ranks(base, derived):
    return [base.index(d) for d in derived]


In [101]:
# 전체 Trainset의 rank 구함

#preprocess.py -4
df_orders_ = df_orders.to_frame().join(
    df.reset_index('cell_id').groupby('id')['cell_id'].apply(list),
    how='right',
)

ranks = {}
for id_, cell_order, cell_id in df_orders_.itertuples():
    ranks[id_] = {'cell_id': cell_id, 'rank': get_ranks(cell_order, cell_id)}

df_ranks = (
    pd.DataFrame
    .from_dict(ranks, orient='index')
    .rename_axis('id')
    .apply(pd.Series.explode)
    .set_index('cell_id', append=True)
)

df_ranks

Unnamed: 0_level_0,Unnamed: 1_level_0,rank
id,cell_id,Unnamed: 2_level_1
000a2f5243e1ca,1d968d84,1
000a2f5243e1ca,5774aca9,2
000a2f5243e1ca,2ddb979d,3
000a2f5243e1ca,dc18005d,4
000a2f5243e1ca,3c7d2db1,5
...,...,...
ff9e97995c0fd1,8704f1b3,2
ff9e97995c0fd1,644da0b1,3
ff9e97995c0fd1,f1b9d756,4
ff9e97995c0fd1,08b7cd21,5


In [102]:
# ancestor & parent_id 데이터 가져옴

#preprocess.py -5
df_ancestors = pd.read_csv(data_dir / 'train_ancestors.csv', index_col='id')
df_ancestors.head(2)

Unnamed: 0_level_0,ancestor_id,parent_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
00001756c60be8,945aea18,
00015c83e2717b,aa2da37e,317b65d12af9df


In [103]:
# train set에 ancestor & parent info merge

#preprocess.py -6
df = df.reset_index().merge(df_ranks, on=["id", "cell_id"]).merge(df_ancestors, on=["id"])
df

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id
0,000a2f5243e1ca,1d968d84,code,import pandas as pd\nimport numpy as np,1,ae9b5889,
1,000a2f5243e1ca,5774aca9,code,train=pd.read_csv('train.csv'),2,ae9b5889,
2,000a2f5243e1ca,2ddb979d,code,train.head(),3,ae9b5889,
3,000a2f5243e1ca,dc18005d,code,train=train.dropna(),4,ae9b5889,
4,000a2f5243e1ca,3c7d2db1,code,"x=train.drop('label',axis=1)",5,ae9b5889,
...,...,...,...,...,...,...,...
45762,ff9e97995c0fd1,8704f1b3,code,"from sklearn.ensemble import RandomForestClassifier\nRFC = RandomForestClassifier(random_state=42)\nRFC.fit(X,y)\ny_...",2,4eb69823,
45763,ff9e97995c0fd1,644da0b1,code,"submission=pd.DataFrame({""label"":y_pred,""actual"":y})\nsubmission.to_csv(""submission.csv"", index=False)\nsubmission",3,4eb69823,
45764,ff9e97995c0fd1,f1b9d756,code,from IPython.display import FileLink\nFileLink('submission.csv'),4,4eb69823,
45765,ff9e97995c0fd1,08b7cd21,code,"<a href=""./submission.csv""> Download File </a>",5,4eb69823,


# 최종 train DataFrame

#df 에 포함된 노트북은 50,000개

#노트북 내 cell 개수까지 모두 합쳐서 2,293,280개

#df_orders 는 모든 노트북에 대한 셀 순서대로를 포함하고 있음

In [104]:
# rank / 각 id마다 cell의 개수 
# 해당 값을 학습 시에는 y LABEL로 활용함
#   - Cell 전체를 0~1로 보았을 때, 해당 Cell이 몇%정도에 위치하고 있는지에 대한 값

#preprocess.py -7
df["pct_rank"] = df["rank"] / df.groupby("id")["cell_id"].transform("count")
# df["pct_rank"].hist(bins=10)
df

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pct_rank
0,000a2f5243e1ca,1d968d84,code,import pandas as pd\nimport numpy as np,1,ae9b5889,,0.022222
1,000a2f5243e1ca,5774aca9,code,train=pd.read_csv('train.csv'),2,ae9b5889,,0.044444
2,000a2f5243e1ca,2ddb979d,code,train.head(),3,ae9b5889,,0.066667
3,000a2f5243e1ca,dc18005d,code,train=train.dropna(),4,ae9b5889,,0.088889
4,000a2f5243e1ca,3c7d2db1,code,"x=train.drop('label',axis=1)",5,ae9b5889,,0.111111
...,...,...,...,...,...,...,...,...
45762,ff9e97995c0fd1,8704f1b3,code,"from sklearn.ensemble import RandomForestClassifier\nRFC = RandomForestClassifier(random_state=42)\nRFC.fit(X,y)\ny_...",2,4eb69823,,0.285714
45763,ff9e97995c0fd1,644da0b1,code,"submission=pd.DataFrame({""label"":y_pred,""actual"":y})\nsubmission.to_csv(""submission.csv"", index=False)\nsubmission",3,4eb69823,,0.428571
45764,ff9e97995c0fd1,f1b9d756,code,from IPython.display import FileLink\nFileLink('submission.csv'),4,4eb69823,,0.571429
45765,ff9e97995c0fd1,08b7cd21,code,"<a href=""./submission.csv""> Download File </a>",5,4eb69823,,0.714286


## Train / Valid 분리

In [105]:
#preprocess.py -8
from sklearn.model_selection import GroupShuffleSplit

NVALID = 0.1  # size of validation set

splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=0)
train_ind, val_ind = next(splitter.split(df, groups=df["ancestor_id"]))

train_df = df.loc[train_ind].reset_index(drop=True)
val_df = df.loc[val_ind].reset_index(drop=True)

train_df.shape, val_df.shape

((41485, 8), (4282, 8))

In [106]:
val_df.head()

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pct_rank
0,003f36ab2c577d,386d31f0,code,import numpy as np\nimport pandas as pd\nimport tensorflow as tf\n# import tf2_0_baseline_w_bert as tf2baseline # ol...,2,8508be37,3bde8d65a3508b,0.095238
1,003f36ab2c577d,16435878,code,def del_all_flags(FLAGS):\n flags_dict = FLAGS._flags()\n keys_list = [keys for keys in flags_dict]\n for k...,4,8508be37,3bde8d65a3508b,0.190476
2,003f36ab2c577d,f4bb282f,code,bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)\n\ntf2baseline.validate_flags_or_throw(bert...,6,8508be37,3bde8d65a3508b,0.285714
3,003f36ab2c577d,4cc5ee5a,code,"test_answers_df = pd.read_json(""/kaggle/working/predictions.json"")",8,8508be37,3bde8d65a3508b,0.380952
4,003f36ab2c577d,215ec8c0,code,"def create_short_answer(entry):\n # if entry[""short_answers_score""] < 1.5:\n # return """"\n \n answer...",10,8508be37,3bde8d65a3508b,0.47619


In [107]:
# train & Valid 중 Markdown 부분만 분리해서 생성

#preprocess.py -9
train_df_mark = train_df[train_df["cell_type"] == "markdown"].reset_index(drop=True)
val_df_mark = val_df[val_df["cell_type"] == "markdown"].reset_index(drop=True)

In [24]:
#preprocess.py -10
train_df_mark.to_csv("./data_1k/train_mark.csv", index=False)
val_df_mark.to_csv("./data_1k/val_mark.csv", index=False)
val_df.to_csv("./data_1k/val.csv", index=False)
train_df.to_csv("./data_1k/train.csv", index=False)

# sample_cells 와 get_features

In [14]:
# Additional code cells

#preprocess.py -11
def clean_code(cell):
    return str(cell).replace("\\n", "\n")


def sample_cells(cells, n):
    cells = [clean_code(cell) for cell in cells]
    if n >= len(cells):
        return [cell[:200] for cell in cells]
    else: #code cell 개수가 지정된 n 보다 넘어가면
        results = []
        step = len(cells) / n #ex) 25/20 = 1.25 씩 뛰어 넘으면서 셀을 추가
        idx = 0
        while int(np.round(idx)) < len(cells):
            results.append(cells[int(np.round(idx))])
            idx += step
        assert cells[0] in results # 마지막 셀 꼭 추가
        if cells[-1] not in results:
            results[-1] = cells[-1]
        return results


def get_features(df):
    features = dict()
    df = df.sort_values("rank").reset_index(drop=True)
    for idx, sub_df in tqdm(df.groupby("id")): # 각 노트북에 대한 정보 저장, rank 순서대로 되어있음
        features[idx] = dict()
        total_md = sub_df[sub_df.cell_type == "markdown"].shape[0]
        code_sub_df = sub_df[sub_df.cell_type == "code"]
        total_code = code_sub_df.shape[0]
        codes = dict_cellid_source[triplets[0][1]]
        features[idx]["total_code"] = total_code
        features[idx]["total_md"] = total_md
        features[idx]["codes"] = codes
    return features

In [16]:
#preprocess.py -12
val_fts = get_features(val_df)
json.dump(val_fts, open("./data_1k/val_fts.json","wt"))
train_fts = get_features(train_df)
json.dump(train_fts, open("./data_1k/train_fts.json","wt"))

100%|███████████████████████████████████████████████████████████████████████████████| 101/101 [00:00<00:00, 1606.79it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 899/899 [00:00<00:00, 1557.53it/s]


## metric

In [17]:
#metric.py
from bisect import bisect

def count_inversions(a):
    inversions = 0
    sorted_so_far = []
    for i, u in enumerate(a):
        j = bisect(sorted_so_far, u)
        inversions += i - j
        sorted_so_far.insert(j, u)
    return inversions


def kendall_tau(ground_truth, predictions):
    total_inversions = 0
    total_2max = 0  # twice the maximum possible inversions across all instances
    for gt, pred in zip(ground_truth, predictions):
        ranks = [gt.index(x) for x in pred]  # rank predicted order in terms of ground truth
        total_inversions += count_inversions(ranks)
        n = len(gt)
        total_2max += n * (n - 1)
    return 1 - 4 * total_inversions / total_2max


In [18]:
def read_notebook(path):
    return (
        pd.read_json(
            path,
            dtype={'cell_type': 'category', 'source': 'str'})
        .assign(id=path.stem)
        .rename_axis('cell_id')
    )

In [19]:
data_dir = Path('./input/AI4Code')

In [20]:
paths_test = list((data_dir / 'test').glob('*.json'))

notebooks_test = [
    read_notebook(path) for path in tqdm(paths_test, desc='Test NBs')
]

test_df = (
    pd.concat(notebooks_test)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
).reset_index()

# 각 노트북 내에서 code셀&md셀 각각 번호가 0부터 매겨짐
test_df["rank"] = test_df.groupby(["id", "cell_type"]).cumcount() #cumcount(): 각 그룹의 각 항목에 0부터 번호를 매김!
test_df["pred"] = test_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)

Test NBs: 100%|██████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 292.87it/s]


In [21]:
test_df

Unnamed: 0,id,cell_id,cell_type,source,rank,pred
0,0009d135ece78d,ddfd239c,code,"import numpy as np # linear algebra\nimport pandas as pd # data processing,\nimport matplotlib.pyplot as plt\nfrom s...",0,0.142857
1,0009d135ece78d,c6cd22db,code,df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')\ndf,1,0.285714
2,0009d135ece78d,1372ae9b,code,"numerical_data = df.loc[:, ~df.columns.isin(['id', ""diagnosis""])]\n\nlabels = df[""diagnosis""].factorize(['B','M'])[0...",2,0.428571
3,0009d135ece78d,90ed07ab,code,"def comparison_plot_maker(data_1, data_2, name, column_name_1, column_name_2):\n # Scaling Data for testing\n ...",3,0.571429
4,0009d135ece78d,7f388a41,code,"# Ploting data with different columns\n#####################################\ncomparison_plot_maker(numerical_data[""...",4,0.714286
...,...,...,...,...,...,...
84,0010a919d60e4f,d3f5c397,markdown,We have 177 rows with missing `Age` and 687 rows with missing `Cabin`,34,1.000000
85,0028856e09c5b7,012c9d02,code,"sns.set()\nsns.pairplot(data1, 2.5)\nplt.show(); = size",0,0.333333
86,0028856e09c5b7,d22526d1,code,"types----------"")\n# is uniques----------"")\n# plt\nimport mis_val +\n = #https://pandas.pydata.org/pandas...",1,0.666667
87,0028856e09c5b7,3ae7ece3,code,"#correlation avoid map\nf,ax verbose 20), 18))\nsns.heatmap(data1.corr(), the annot=True, ; informations bins=50, '....",2,1.000000


In [22]:
test_fts = get_features(test_df)

100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 1297.74it/s]


In [23]:
#test
test_fts

{'0009d135ece78d': {'total_code': 7,
  'total_md': 6,
  'codes': ['import numpy as np # linear algebra\nimport pandas as pd # data processing,\nimport matplotlib.pyplot as plt\nfrom sklearn.decomposition import PCA\nfrom sklearn.preprocessing import StandardScaler\nfrom s',
   "df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')\ndf",
   'numerical_data = df.loc[:, ~df.columns.isin([\'id\', "diagnosis"])]\n\nlabels = df["diagnosis"].factorize([\'B\',\'M\'])[0]\n\nheader_labels = pd.DataFrame(data=labels, columns=["diagnosis"])',
   'def comparison_plot_maker(data_1, data_2, name, column_name_1, column_name_2):\n    # Scaling Data for testing\n    # data_1 = scale(data_1)\n    # data_2 = scale(data_2)\n\n    range =  np.random.randn(le',
   '# Ploting data with different columns\n#####################################\ncomparison_plot_maker(numerical_data["radius_mean"], numerical_data["radius_worst"], "Mean Radius vs Worst Radius", "Mean Ra',
   '# Scaling Data\nscal

## 기존 MakrdownDataset은, input 은 각 마크다운과 해당 노트북의 코드셀이 뒤에 붙고(max_len 512)
## forward는 bert 모델을 통과시켜서, index 0 만 뽑고(마크다운만 뽑으려고인 듯), fts(마크다운비율)을 마지막에 cat 해줌

In [83]:
#model.py
from tqdm import tqdm
import sys, os
from transformers import AutoModel, AutoTokenizer
import torch.nn.functional as F
import torch.nn as nn
import torch

class MarkdownModel(nn.Module):
    def __init__(self, model_path):
        super(MarkdownModel, self).__init__()
        self.model = AutoModel.from_pretrained(model_path)
        self.top = nn.Linear(769, 1) #fts 정보 추가
        
    def forward(self, ids, mask, fts):
        x = self.model(ids, mask)[0] 
        x = self.top(torch.cat((x[:, 0, :], fts),1)) #내생각엔 마크다운 순서를 맞추는거니까 가장 위(마크다운셀)만 뽑는거인듯?
        return x


#dataset.py
from torch.utils.data import DataLoader, Dataset

class MarkdownDataset(Dataset):

    def __init__(self, df, model_name_or_path, total_max_len, md_max_len, fts):
        super().__init__()
        self.df = df.reset_index(drop=True)
        self.max_len = max_len
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.fts = fts

    def __getitem__(self, index): #마크다운 하나에 대한 Input 값
        row = self.df.iloc[index]
        
        md = dict_cellid_source[row.cell_id]
        cd = dict_cellid_source[[i[1] for i in triplets if i[0] == row.cell_id][0]]

        txt = md + '[SEP]' + cd
        inputs = self.tokenizer.encode_plus(
            txt,
            None,
            add_special_tokens=True,
            max_length=self.max_len, #여기 맞춰줘야됨
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
        ids = torch.LongTensor(inputs['input_ids'])
        mask = torch.LongTensor(inputs['attention_mask'])

        n_md = self.fts[row.id]["total_md"]
        n_code = self.fts[row.id]["total_code"]
        if n_md + n_code == 0:
            fts = torch.FloatTensor([0])
        else:
            fts = torch.FloatTensor([n_md / (n_md + n_code)])


        assert len(ids) == self.total_max_len

        return ids, mask, fts, torch.FloatTensor([row.pct_rank])
    

    def __len__(self):
        return self.df.shape[0]

In [110]:
def generate_triplet(df, mode='train'):
    triplets = []
    ids = df.id.unique() #notebook id
    random_drop = np.random.random(size=10000)>0.9
    count = 0

    #df의 모든 노트북 반복
    for id, df_tmp in tqdm(df.groupby('id')): #같은 노트북
        df_tmp_markdown = df_tmp[df_tmp['cell_type']=='markdown'] #마크다운만
#         display(df_tmp_markdown)
        df_tmp_code = df_tmp[df_tmp['cell_type']=='code'] #코드셀
        df_tmp_code_rank = df_tmp_code['rank'].values #코드셀 rank(순서)
        df_tmp_code_cell_id = df_tmp_code['cell_id'].values #코드셀 셀id
#         display(df_tmp_code)
        for cell_id, rank in df_tmp_markdown[['cell_id', 'rank']].values: #마크다운 셀id, 랭크만 가져와서
            labels = np.array([(r==(rank+1)) for r in df_tmp_code_rank]).astype('int') #다음 순서의 코드셀 저장
#             print(labels)
            for cid, label in zip(df_tmp_code_cell_id, labels): #코드셀 id랑, 라벨 가져와서
                count += 1
                if label==1:    #다음 코드셀인거 일단 짝짝쿵은 다 가져옴
                    triplets.append( [cell_id, cid, label] ) #마크다운셀id, 코드셀id, 라벨 저장

    return triplets

triplets = generate_triplet(val_df)
triplets


100%|████████████████████████████████████████████████████████████████████████████████| 101/101 [00:00<00:00, 850.26it/s]


[['3e1430c4', '4cc5ee5a', 1],
 ['6f70d84e', '1d664ca8', 1],
 ['da99f684', '386d31f0', 1],
 ['db8c69de', '215ec8c0', 1],
 ['5948bc1a', 'a3f4e6bc', 1],
 ['00070116', '05bc949b', 1],
 ['82dfbe9a', '16435878', 1],
 ['6d31400d', 'f4bb282f', 1],
 ['c47a4a8e', '5c545af1', 1],
 ['faeeee7e', '0851a599', 1],
 ['96e93e20', '4633b3e1', 1],
 ['74f1d455', '608ccbbb', 1],
 ['d6d69942', 'e220cbc3', 1],
 ['3bfc6346', '50891a3b', 1],
 ['02082534', '314200c7', 1],
 ['fd360046', 'dbcc00fa', 1],
 ['473920fc', 'c6e1d964', 1],
 ['c622ea41', '07ef2c2a', 1],
 ['5d9e42f4', '96a2cfa1', 1],
 ['3a44910c', 'e32c2591', 1],
 ['c91d558e', 'd0308e62', 1],
 ['f99269a9', '856d0f79', 1],
 ['15958824', '16e210dc', 1],
 ['656dea7e', '48a61b9f', 1],
 ['b158954c', 'a87ad072', 1],
 ['b8be971a', 'ab72c18f', 1],
 ['109079a1', '03f4e7ba', 1],
 ['7b17b371', 'f3ede2c6', 1],
 ['78919b51', '050dd584', 1],
 ['c86e1675', '28cbae69', 1],
 ['69605797', 'fd98c389', 1],
 ['3a09ce83', '73668a3c', 1],
 ['34b84b0f', 'dc1bf2e3', 1],
 ['ef7b911

In [111]:
dict_cellid_source = dict(zip(val_df['cell_id'].values, val_df['source'].values))


# test 시작!

In [40]:
model_name_or_path = 'microsoft/codebert-base'

In [42]:
import os

os.makedirs("./outputs", exist_ok=True)
# data_dir = Path('./input/')


In [43]:
train_df_mark = pd.read_csv('./data/train_mark_2.csv').drop("parent_id", axis=1).dropna().reset_index(drop=True)
train_fts = json.load(open('./data/train_fts_2.json'))
val_df_mark = pd.read_csv('./data/val_mark_2.csv').drop("parent_id", axis=1).dropna().reset_index(drop=True)
val_fts = json.load(open('./data/val_fts_2.json'))
val_df = pd.read_csv('./data/val_2.csv')

In [44]:
print(train_df_mark.shape)

(280477, 7)


In [None]:
# df_orders 정의

In [45]:
max_len = 128
batch_size = 8
accumulation_steps = 4
epochs = 5
n_workers = 8


train_ds = MarkdownDataset(train_df_mark, model_name_or_path=model_name_or_path, md_max_len=md_max_len,
                           total_max_len=total_max_len, fts=train_fts)
val_ds = MarkdownDataset(val_df_mark, model_name_or_path=model_name_or_path, md_max_len=md_max_len,
                         total_max_len=total_max_len, fts=val_fts)
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=n_workers,
                          pin_memory=False, drop_last=True)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=n_workers,
                        pin_memory=False, drop_last=False)

In [46]:
def read_data(data):
    return tuple(d.to(device) for d in data[:-1]), data[-1].to(device)


def validate(model, val_loader):
    model.eval()

    tbar = tqdm(val_loader, file=sys.stdout)

    preds = []
    labels = []

    with torch.no_grad():
        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)

            with torch.cuda.amp.autocast():
                pred = model(*inputs)

            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())

    return np.concatenate(labels), np.concatenate(preds)

from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
def train(model, train_loader, val_loader, epochs):
    np.random.seed(0)
    # Creating optimizer and lr schedulers
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    num_train_optimization_steps = int(epochs * len(train_loader) / accumulation_steps)
    optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5,
                      correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_optimization_steps,
                                                num_training_steps=num_train_optimization_steps)  # PyTorch scheduler

    criterion = torch.nn.L1Loss()
    scaler = torch.cuda.amp.GradScaler()

    for e in range(epochs):
        model.train()
        tbar = tqdm(train_loader, file=sys.stdout)
        loss_list = []
        preds = []
        labels = []

        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)

            with torch.cuda.amp.autocast():
                pred = model(*inputs)
                loss = criterion(pred, target)
            scaler.scale(loss).backward()
            if idx % accumulation_steps == 0 or idx == len(tbar) - 1:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                scheduler.step()

            loss_list.append(loss.detach().cpu().item())
            preds.append(pred.detach().cpu().numpy().ravel())
            labels.append(target.detach().cpu().numpy().ravel())

            avg_loss = np.round(np.mean(loss_list), 4)

            tbar.set_description(f"Epoch {e + 1} Loss: {avg_loss} lr: {scheduler.get_last_lr()}")

        y_val, y_pred = validate(model, val_loader)
        val_df["pred"] = val_df.groupby(["id", "cell_type"])["rank"].rank(pct=True)
        val_df.loc[val_df["cell_type"] == "markdown", "pred"] = y_pred
        y_dummy = val_df.sort_values("pred").groupby('id')['cell_id'].apply(list)
        print("Preds score", kendall_tau(df_orders.loc[y_dummy.index], y_dummy))
        torch.save(model.state_dict(), "./outputs/model.bin")

    return model, y_pred


In [50]:
model = MarkdownModel(model_name_or_path)
# model = model.to(device)
# model, y_pred = train(model, train_loader, val_loader, epochs=epochs)


In [52]:
model = model.to(device)


In [53]:
model, y_pred = train(model, train_loader, val_loader, epochs=epochs)




  0%|                                                                                         | 0/35059 [00:01<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 4.75 GiB total capacity; 3.05 GiB already allocated; 4.00 MiB free; 3.10 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
def predict(model_path, ckpt_path):
    model = MarkdownModel(model_path)
    model = model.cuda()
    model.eval()
    model.load_state_dict(torch.load(ckpt_path))
    BS = 32
    NW = 8
    MAX_LEN = 64
    test_df["pct_rank"] = 0
    test_ds = MarkdownDataset(test_df[test_df["cell_type"] == "markdown"].reset_index(drop=True), md_max_len=64,total_max_len=512, model_name_or_path=model_path, fts=test_fts)
    test_loader = DataLoader(test_ds, batch_size=BS, shuffle=False, num_workers=NW,
                              pin_memory=False, drop_last=False)
    _, y_test = validate(model, test_loader)
    return y_test

In [77]:
model_path = "./input/codebert-base/"
ckpt_path = "./outputs/model.bin"


In [None]:
y_test_2 = predict(model_path, ckpt_path)

In [None]:
# y_test = (y_test_1 + y_test_2)/2
y_test = y_test_2

In [None]:
test_df.loc[test_df["cell_type"] == "markdown", "pred"] = y_test

In [None]:
sub_df = test_df.sort_values("pred").groupby("id")["cell_id"].apply(lambda x: " ".join(x)).reset_index()
sub_df.rename(columns={"cell_id": "cell_order"}, inplace=True)
sub_df.head()

In [None]:
sub_df.to_csv("submission.csv", index=False)