In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm

pd.options.display.width = 180
pd.options.display.max_colwidth = 120

data_dir = Path('./input/AI4Code')

In [2]:
NUM_TRAIN = 50000

#preprocess.py
def read_notebook(path):
    return (
        pd.read_json(
            path,
            dtype={'cell_type': 'category', 'source': 'str'})
        .assign(id=path.stem)
        .rename_axis('cell_id')
    )

# 리스트 내 확장자가 json인 파일 import
paths_train = list((data_dir / 'train').glob('*.json'))[:NUM_TRAIN]
# 읽어온 notebook을 리스트 형태로 저장
notebooks_train = [
    read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')
]

# pd.dataframe 형태로 저장 
df = (
    pd.concat(notebooks_train)
    .set_index('id', append=True)
    .swaplevel() # 인덱스의 기준값을 변경해줌 
    .sort_index(level='id', sort_remaining=False)
)

df

Train NBs: 100%|█████████████████████████████████████████████████████████████████████| 50000/50000 [02:37<00:00, 317.83it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,cell_type,source
id,cell_id,Unnamed: 2_level_1,Unnamed: 3_level_1
00001756c60be8,1862f0a6,code,# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/pyt...
00001756c60be8,2a9e43d6,code,"import numpy as np\nimport pandas as pd\nimport random\n\nfrom sklearn.model_selection import train_test_split, cros..."
00001756c60be8,038b763d,code,import warnings\nwarnings.filterwarnings('ignore')
00001756c60be8,2eefe0ef,code,matplotlib.rcParams.update({'font.size': 14})
00001756c60be8,0beab1cd,code,"def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):\n print(""Train R2:\..."
...,...,...,...
fffe1d764579d5,0d770d6b,markdown,## REMOVING THE OUTLIERS
fffe1d764579d5,d45ddc62,markdown,### DIMENSIONALITY CURSE
fffe1d764579d5,1a63248d,markdown,# BANGALORE HOUSE PRICE PREDICTION
fffe1d764579d5,a8ffc8b4,markdown,* We have achieved 75.2% accuracy in predicting the prices of the homes in Banglore using Decision Tree Regressor


In [3]:
#test
# 샘플데이터(notebook) 확인
# 노트북은 코드와 Markdown 순서가 정렬되어있지 않음(disordered)
nb_id = df.index.unique('id')[3]
print('Notebook:', nb_id)

print("The disordered notebook:")
nb = df.loc[nb_id, :]
display(nb)
print()

Notebook: 000597ac4c6700
The disordered notebook:


Unnamed: 0_level_0,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ede4241f,code,import pandas as pd\nimport numpy as np
36b989a3,code,"## Function to reduce the DF size\ndef reduce_mem_usage(df, verbose=True):\n numerics = ['int16', 'int32', 'int64..."
b91fc1a4,code,"train = pd.read_csv(""/kaggle/input/bdg2-class-competition/train.csv"")\ntest = pd.read_csv(""/kaggle/input/bdg2-class-..."
97c1bb21,code,train = reduce_mem_usage(train)\ntest = reduce_mem_usage(test)\nwtrain = reduce_mem_usage(wtrain)\nwtest = reduce_me...
148274fd,code,metadata.info()
7f40f579,code,train.info()
92a7ccd5,code,wtrain.info()
a791bdcd,code,metadata.isna().sum()*100 / len(metadata)
306b0be1,code,# Select columns with more than 50% missing values\nmissing = metadata.isna().sum()*100 / len(metadata)\nto_drop = m...
d8d2abf4,code,metadata.head()





In [4]:
# Train 데이터(notebook)의 order 순서가 적혀있는 데이터(train_orders.csv) import
# 여기서의 id는 notebook 단위

#preprocess.py -2
df_orders = pd.read_csv(
    data_dir / 'train_orders.csv',
    index_col='id',
    squeeze=True,
).str.split()  # cell_id가 텍스트로 붙어있음, 띄어쓰기 단위로 끊어서 리스트화

print(df_orders.shape)
df_orders.head(2)



  df_orders = pd.read_csv(


(139256,)


id
00001756c60be8    [1862f0a6, 448eb224, 2a9e43d6, 7e2f170a, 038b763d, 77e56113, 2eefe0ef, 1ae087ab, 0beab1cd, 8ffe0b25, 9a78ab76, 0d136...
00015c83e2717b    [2e94bd7a, 3e99dee9, b5e286ea, da4f7550, c417225b, 51e3cd89, 2600b4eb, 75b65993, cf195f8b, 25699d02, 72b3201a, f2c75...
Name: cell_order, dtype: object

In [5]:
#test
# 위에서 확인했던 notebook을 train_order.csv를 활용해
# cell 재정렬 후 결과 확인 (실제 복윈되어야 하는 notebook output)

cell_order = df_orders.loc[nb_id]

print("The ordered notebook:")
nb.loc[cell_order, :]

The ordered notebook:


Unnamed: 0_level_0,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ba54a747,markdown,# Sample submission\n\nThis notebooks is an example on how to make a submission to the competition. Steps to be perf...
ede4241f,code,import pandas as pd\nimport numpy as np
fb42ece2,markdown,"The following function helps reduce the memory usage, it was taken from [this amazing notebook](https://www.kaggle.c..."
36b989a3,code,"## Function to reduce the DF size\ndef reduce_mem_usage(df, verbose=True):\n numerics = ['int16', 'int32', 'int64..."
2fa559cb,markdown,## Load data
b91fc1a4,code,"train = pd.read_csv(""/kaggle/input/bdg2-class-competition/train.csv"")\ntest = pd.read_csv(""/kaggle/input/bdg2-class-..."
649083d4,markdown,And -if you want- use the function defined before to reduce memory usage:
97c1bb21,code,train = reduce_mem_usage(train)\ntest = reduce_mem_usage(test)\nwtrain = reduce_mem_usage(wtrain)\nwtest = reduce_me...
148274fd,code,metadata.info()
7f40f579,code,train.info()


In [6]:
# 위 결과처럼
# 정렬되어있지 않는 Train 데이터(notebook)의 cell 순서(rank)를 구하는 함수 설정

#preprocess.py -3
def get_ranks(base, derived):
    return [base.index(d) for d in derived]

cell_ranks = get_ranks(cell_order, list(nb.index))


In [7]:
nb.insert(0, 'rank', cell_ranks)

nb

Unnamed: 0_level_0,rank,cell_type,source
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ede4241f,1,code,import pandas as pd\nimport numpy as np
36b989a3,3,code,"## Function to reduce the DF size\ndef reduce_mem_usage(df, verbose=True):\n numerics = ['int16', 'int32', 'int64..."
b91fc1a4,5,code,"train = pd.read_csv(""/kaggle/input/bdg2-class-competition/train.csv"")\ntest = pd.read_csv(""/kaggle/input/bdg2-class-..."
97c1bb21,7,code,train = reduce_mem_usage(train)\ntest = reduce_mem_usage(test)\nwtrain = reduce_mem_usage(wtrain)\nwtest = reduce_me...
148274fd,8,code,metadata.info()
7f40f579,9,code,train.info()
92a7ccd5,10,code,wtrain.info()
a791bdcd,12,code,metadata.isna().sum()*100 / len(metadata)
306b0be1,13,code,# Select columns with more than 50% missing values\nmissing = metadata.isna().sum()*100 / len(metadata)\nto_drop = m...
d8d2abf4,14,code,metadata.head()


In [8]:
# 전체 Trainset의 rank 구함

#preprocess.py -4
df_orders_ = df_orders.to_frame().join(
    df.reset_index('cell_id').groupby('id')['cell_id'].apply(list),
    how='right',
)

ranks = {}
for id_, cell_order, cell_id in df_orders_.itertuples():
    ranks[id_] = {'cell_id': cell_id, 'rank': get_ranks(cell_order, cell_id)}

df_ranks = (
    pd.DataFrame
    .from_dict(ranks, orient='index')
    .rename_axis('id')
    .apply(pd.Series.explode)
    .set_index('cell_id', append=True)
)

df_ranks

Unnamed: 0_level_0,Unnamed: 1_level_0,rank
id,cell_id,Unnamed: 2_level_1
00001756c60be8,1862f0a6,0
00001756c60be8,2a9e43d6,2
00001756c60be8,038b763d,4
00001756c60be8,2eefe0ef,6
00001756c60be8,0beab1cd,8
...,...,...
fffe1d764579d5,0d770d6b,43
fffe1d764579d5,d45ddc62,33
fffe1d764579d5,1a63248d,0
fffe1d764579d5,a8ffc8b4,69


In [9]:
# ancestor & parent_id 데이터 가져옴

#preprocess.py -5
df_ancestors = pd.read_csv(data_dir / 'train_ancestors.csv', index_col='id')
df_ancestors.head(2)

Unnamed: 0_level_0,ancestor_id,parent_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
00001756c60be8,945aea18,
00015c83e2717b,aa2da37e,317b65d12af9df


In [10]:
# train set에 ancestor & parent info merge

#preprocess.py -6
df = df.reset_index().merge(df_ranks, on=["id", "cell_id"]).merge(df_ancestors, on=["id"])
df

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id
0,00001756c60be8,1862f0a6,code,# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/pyt...,0,945aea18,
1,00001756c60be8,2a9e43d6,code,"import numpy as np\nimport pandas as pd\nimport random\n\nfrom sklearn.model_selection import train_test_split, cros...",2,945aea18,
2,00001756c60be8,038b763d,code,import warnings\nwarnings.filterwarnings('ignore'),4,945aea18,
3,00001756c60be8,2eefe0ef,code,matplotlib.rcParams.update({'font.size': 14}),6,945aea18,
4,00001756c60be8,0beab1cd,code,"def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):\n print(""Train R2:\...",8,945aea18,
...,...,...,...,...,...,...,...
2283275,fffe1d764579d5,0d770d6b,markdown,## REMOVING THE OUTLIERS,43,3c40bfa6,
2283276,fffe1d764579d5,d45ddc62,markdown,### DIMENSIONALITY CURSE,33,3c40bfa6,
2283277,fffe1d764579d5,1a63248d,markdown,# BANGALORE HOUSE PRICE PREDICTION,0,3c40bfa6,
2283278,fffe1d764579d5,a8ffc8b4,markdown,* We have achieved 75.2% accuracy in predicting the prices of the homes in Banglore using Decision Tree Regressor,69,3c40bfa6,


# 최종 train DataFrame

#df 에 포함된 노트북은 50,000개

#노트북 내 cell 개수까지 모두 합쳐서 2,293,280개

#df_orders 는 모든 노트북에 대한 셀 순서대로를 포함하고 있음

In [11]:
# rank / 각 id마다 cell의 개수 
# 해당 값을 학습 시에는 y LABEL로 활용함
#   - Cell 전체를 0~1로 보았을 때, 해당 Cell이 몇%정도에 위치하고 있는지에 대한 값

#preprocess.py -7
df["pct_rank"] = df["rank"] / df.groupby("id")["cell_id"].transform("count")
# df["pct_rank"].hist(bins=10)
df

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pct_rank
0,00001756c60be8,1862f0a6,code,# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/pyt...,0,945aea18,,0.0
1,00001756c60be8,2a9e43d6,code,"import numpy as np\nimport pandas as pd\nimport random\n\nfrom sklearn.model_selection import train_test_split, cros...",2,945aea18,,0.034483
2,00001756c60be8,038b763d,code,import warnings\nwarnings.filterwarnings('ignore'),4,945aea18,,0.068966
3,00001756c60be8,2eefe0ef,code,matplotlib.rcParams.update({'font.size': 14}),6,945aea18,,0.103448
4,00001756c60be8,0beab1cd,code,"def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):\n print(""Train R2:\...",8,945aea18,,0.137931
...,...,...,...,...,...,...,...,...
2283275,fffe1d764579d5,0d770d6b,markdown,## REMOVING THE OUTLIERS,43,3c40bfa6,,0.597222
2283276,fffe1d764579d5,d45ddc62,markdown,### DIMENSIONALITY CURSE,33,3c40bfa6,,0.458333
2283277,fffe1d764579d5,1a63248d,markdown,# BANGALORE HOUSE PRICE PREDICTION,0,3c40bfa6,,0.0
2283278,fffe1d764579d5,a8ffc8b4,markdown,* We have achieved 75.2% accuracy in predicting the prices of the homes in Banglore using Decision Tree Regressor,69,3c40bfa6,,0.958333


In [12]:
df.isnull().sum()

id                   0
cell_id              0
cell_type            0
source               0
rank                 0
ancestor_id          0
parent_id      1979334
pct_rank             0
dtype: int64

In [13]:
df[df['source']=='']

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pct_rank


# 데이터 전처리 과정!

In [14]:
# 코드블록(한 셀) 을 Input 으로 받음
def clean_data(c):
    ret = []
    splitted = c.split('\n') # \n 만 있는 라인 제거
    for s in splitted:
        if len(s.strip()) > 0:
            if s.lstrip().startswith('#'): #주석 제거 -> 마크다운에선 하면 안될듯?
                continue
            s = s.rstrip()
            if '#' in s:
                s = s[:s.index('#')]
            s = s.replace('\n', '')
            s = s.replace('    ', '\t')
            if s == '':
                continue
            ret.append(s)
            
    if not ret:
        splitted = c.split('\n')
        for s in splitted:
            if len(s.strip()) > 0:
                ret.append(s)
    
    return '\n'.join(ret)

In [15]:
import numpy as np
import pandas as pd
import os
import re
# import fasttext
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from pathlib import Path
import nltk
nltk.download('wordnet')

stemmer = WordNetLemmatizer()

def preprocess_text(document, cell_type):
        if cell_type == 'code':
            document = clean_data(document)

        # Remove all the special characters
#         document = re.sub(r'\W', ' ', str(document))
#         print(document)

        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()
        
        return document

        # Lemmatization
#         tokens = document.split()
#         tokens = [stemmer.lemmatize(word) for word in tokens]
#         tokens = [word for word in tokens if len(word) > 3]

#         preprocessed_text = ' '.join(tokens)
#         return preprocessed_text
        

    
def preprocess_df(df):
    """
    This function is for processing sorce of notebook
    returns preprocessed dataframe
    """
    return [preprocess_text(message, c_type) for message, c_type in zip(df.source, df.cell_type)]



[nltk_data] Downloading package wordnet to
[nltk_data]     /home/innoacad05/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
# df.source = df.source.apply(preprocess_text)
preprocessed_df = df.copy()
preprocessed_df.source = preprocess_df(preprocessed_df)

In [17]:
# 전처리 후
preprocessed_df

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pct_rank
0,00001756c60be8,1862f0a6,code,"import numpy as np import pandas as pd import os for dirname, _, filenames in os.walk('/kaggle/input'): for filename...",0,945aea18,,0.0
1,00001756c60be8,2a9e43d6,code,"import numpy as np import pandas as pd import random from sklearn.model_selection import train_test_split, cross_val...",2,945aea18,,0.034483
2,00001756c60be8,038b763d,code,import warnings warnings.filterwarnings('ignore'),4,945aea18,,0.068966
3,00001756c60be8,2eefe0ef,code,matplotlib.rcparams.update({'font.size': 14}),6,945aea18,,0.103448
4,00001756c60be8,0beab1cd,code,"def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values): print(""train r2:\t"" + ...",8,945aea18,,0.137931
...,...,...,...,...,...,...,...,...
2283275,fffe1d764579d5,0d770d6b,markdown,## removing the outliers,43,3c40bfa6,,0.597222
2283276,fffe1d764579d5,d45ddc62,markdown,### dimensionality curse,33,3c40bfa6,,0.458333
2283277,fffe1d764579d5,1a63248d,markdown,# bangalore house price prediction,0,3c40bfa6,,0.0
2283278,fffe1d764579d5,a8ffc8b4,markdown,* we have achieved 75.2% accuracy in predicting the prices of the homes in banglore using decision tree regressor,69,3c40bfa6,,0.958333


In [18]:
preprocessed_df.isnull().sum()

id                   0
cell_id              0
cell_type            0
source               0
rank                 0
ancestor_id          0
parent_id      1979334
pct_rank             0
dtype: int64

In [19]:
preprocessed_df[preprocessed_df['source']=='']#['source'] #오예~~~~~ 누락부분 다 살렸음!

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pct_rank


In [20]:
preprocessed_df[preprocessed_df['cell_id']=='79ab27f3']

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pct_rank
1004,0017062cc1b4ca,79ab27f3,code,#my_test_indices,76,88c3494d,00b4c1b476d136,0.8


### TEST

In [180]:
preprocessed_df[(preprocessed_df['source']=='') & (preprocessed_df['cell_type']=='code')] # 한줄 주석짜리 코드가 5,169개

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pct_rank
108,0017062cc1b4ca,79ab27f3,code,,76,88c3494d,00b4c1b476d136,0.8
191,002aed65301beb,3952ace8,code,,63,4136f24d,,0.984375
330,0035e72f9b3c02,709aed69,code,,40,86aaf838,c062ce1246fb51,0.754717
338,0035e72f9b3c02,2d0d1591,code,,48,86aaf838,c062ce1246fb51,0.90566
339,0035e72f9b3c02,25f783fb,code,,49,86aaf838,c062ce1246fb51,0.924528
...,...,...,...,...,...,...,...,...
458673,ffc922d5c58634,243f01fb,code,,6,a358669e,bb14e65019c899,0.3
458677,ffc922d5c58634,5e5e5e17,code,,12,a358669e,bb14e65019c899,0.6
458680,ffc922d5c58634,8769da67,code,,17,a358669e,bb14e65019c899,0.85
458714,ffdc9028f5144d,da2ec96a,code,,10,21150abd,,0.555556


In [163]:
df[df['cell_id']=='79ab27f3']

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pct_rank
108,0017062cc1b4ca,79ab27f3,code,#my_test_indices,76,88c3494d,00b4c1b476d136,0.8


## Train / Valid 분리

In [21]:
df = preprocessed_df

In [22]:
#preprocess.py -8
from sklearn.model_selection import GroupShuffleSplit

NVALID = 0.01  # size of validation set

splitter = GroupShuffleSplit(n_splits=1, test_size=NVALID, random_state=0)
train_ind, val_ind = next(splitter.split(df, groups=df["ancestor_id"]))

train_df = df.loc[train_ind].reset_index(drop=True)
val_df = df.loc[val_ind].reset_index(drop=True)

train_df.shape, val_df.shape

((2257659, 8), (25621, 8))

In [23]:
#test
# ancestor_id를 key값 비율 확인
print(df.ancestor_id.nunique(), train_df.ancestor_id.nunique(), val_df.ancestor_id.nunique())
print(train_df.ancestor_id.nunique()/df.ancestor_id.nunique(), val_df.ancestor_id.nunique()/df.ancestor_id.nunique())

47093 46622 471
0.9899985135795129 0.010001486420487121


In [24]:
val_df.head()

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pct_rank
0,002d93ddca8c5d,b6afdfdb,code,import numpy as np import matplotlib.pyplot as plt,1,d0cda2e5,,0.02439
1,002d93ddca8c5d,522b1069,code,"c = np.array([[0.022,0.01,-0.001,0.011,0.005],[0.01,0.033,0,0.014,0.01],[-0.001,0,0.019,-0.001,-0.001],[0.011,0.014,...",4,d0cda2e5,,0.097561
2,002d93ddca8c5d,f9a6802d,code,c_invers = np.linalg.inv(c) print(c_invers),5,d0cda2e5,,0.121951
3,002d93ddca8c5d,c215d32e,code,"u = np.array([[1],[1],[1],[1],[1]]) print(u)",7,d0cda2e5,,0.170732
4,002d93ddca8c5d,d7b802ab,code,u_t = u.transpose() print(u_t),8,d0cda2e5,,0.195122


In [25]:
len(val_df.loc[val_df["cell_type"] == "markdown"])

9013

In [26]:
# train & Valid 중 Markdown 부분만 분리해서 생성

#preprocess.py -9
train_df_mark = train_df[train_df["cell_type"] == "markdown"].reset_index(drop=True)
val_df_mark = val_df[val_df["cell_type"] == "markdown"].reset_index(drop=True)

In [27]:
# 공란이 있는데 공란이라고 인식이 안되고 저장되는 거임
len(val_df_mark)

9013

In [28]:
val_df_mark.isnull().sum()

id                0
cell_id           0
cell_type         0
source            0
rank              0
ancestor_id       0
parent_id      7632
pct_rank          0
dtype: int64

In [29]:
#preprocess.py -10
train_df_mark.to_csv("./data_50k_preprocess/train_mark_preprocess.csv", index=False)
val_df_mark.to_csv("./data_50k_preprocess/val_mark_preprocess.csv", index=False)
val_df.to_csv("./data_50k_preprocess/val_preprocess.csv", index=False)
train_df.to_csv("./data_50k_preprocess/train_preprocess.csv", index=False)

In [30]:
len(val_df_mark)
# len(val_df_mark.drop("parent_id", axis=1).dropna().reset_index(drop=True))

9013

In [31]:
len(val_df.loc[val_df["cell_type"] == "markdown"])


9013

# sample_cells 와 get_features

In [32]:
# Additional code cells

#preprocess.py -11
def clean_code(cell):
    return str(cell).replace("\\n", "\n")


def sample_cells(cells, n):
    cells = [clean_code(cell) for cell in cells]
    if n >= len(cells):
        return [cell[:200] for cell in cells]
    else: #code cell 개수가 지정된 n 보다 넘어가면
        results = []
        step = len(cells) / n #ex) 25/20 = 1.25 씩 뛰어 넘으면서 셀을 추가
        idx = 0
        while int(np.round(idx)) < len(cells):
            results.append(cells[int(np.round(idx))])
            idx += step
        assert cells[0] in results # 마지막 셀 꼭 추가
        if cells[-1] not in results:
            results[-1] = cells[-1]
        return results


def get_features(df):
    features = dict()
    df = df.sort_values("rank").reset_index(drop=True)
    for idx, sub_df in tqdm(df.groupby("id")): # 각 노트북에 대한 정보 저장
        features[idx] = dict()
        total_md = sub_df[sub_df.cell_type == "markdown"].shape[0]
        code_sub_df = sub_df[sub_df.cell_type == "code"]
        total_code = code_sub_df.shape[0]
        codes = sample_cells(code_sub_df.source.values, 20) #20개로 제한
        features[idx]["total_code"] = total_code
        features[idx]["total_md"] = total_md
        features[idx]["codes"] = codes
    return features

In [33]:
#preprocess.py -12
val_fts = get_features(val_df)
json.dump(val_fts, open("./data_50k_preprocess/val_fts_preprocess.json","wt"))
train_fts = get_features(train_df)
json.dump(train_fts, open("./data_50k_preprocess/train_fts_preprocess.json","wt"))

100%|███████████████████████████████████████████████████████████████████████████████████| 505/505 [00:00<00:00, 1509.17it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 49495/49495 [00:33<00:00, 1457.30it/s]


In [43]:
train_df_mark = pd.read_csv('data_all_preprocess/train_mark_preprocess.csv').drop("parent_id", axis=1).dropna().reset_index(drop=True)
train_fts = json.load(open('data_all_preprocess/train_fts_preprocess.json'))
val_df_mark = pd.read_csv('data_all_preprocess/val_mark_preprocess.csv').drop("parent_id", axis=1).dropna().reset_index(drop=True)
val_fts = json.load(open('data_all_preprocess/val_fts_preprocess.json'))
val_df = pd.read_csv('data_all_preprocess/val_preprocess.csv')

In [47]:
tr_df = pd.read_csv('data_all_preprocess/train_preprocess.csv')

In [48]:
tr_df[tr_df['source']=='']#['source'] #오예~~~~~ 누락부분 다 살렸음!

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pct_rank


In [36]:
len(val_df_mark)

21411

In [37]:
len(val_df.loc[val_df["cell_type"] == "markdown"])

21411

In [214]:
train_df_mark = pd.read_csv('data_10k_preprocess/train_mark_preprocess.csv').drop("parent_id", axis=1).dropna().reset_index(drop=True)
train_fts = json.load(open('data_10k_preprocess/train_fts_preprocess.json'))
val_df_mark = pd.read_csv('data_10k_preprocess/val_mark_preprocess.csv').drop("parent_id", axis=1).dropna().reset_index(drop=True)
val_fts = json.load(open('data_10k_preprocess/val_fts_preprocess.json'))
val_df = pd.read_csv('data_10k_preprocess/val_preprocess.csv')

In [215]:
len(val_df_mark)

15701

In [119]:
val_df_mark = pd.read_csv('data_10k_preprocess/val_mark_preprocess.csv').drop("parent_id", axis=1)
len(val_df_mark)

15701

In [127]:
val_df_mark[val_df_mark['source'].isnull()==True]

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,pct_rank
1,002d93ddca8c5d,366b6317,markdown,,6,d0cda2e5,0.146341
2,002d93ddca8c5d,e242f99d,markdown,,22,d0cda2e5,0.536585
6,002d93ddca8c5d,d83f6ded,markdown,,20,d0cda2e5,0.487805
7,002d93ddca8c5d,db5a673e,markdown,,14,d0cda2e5,0.341463
13,002d93ddca8c5d,2ce6a42e,markdown,,3,d0cda2e5,0.073171
...,...,...,...,...,...,...,...
15056,f40db975ce722e,be11c0ed,markdown,,105,b8818416,0.362069
15098,f40db975ce722e,9ae31310,markdown,,24,b8818416,0.082759
15103,f40db975ce722e,87d8aba0,markdown,,171,b8818416,0.589655
15229,f629daaf99ff7b,62ae6385,markdown,,60,3f74f2fc,0.821918


In [92]:
len(val_df.loc[val_df["cell_type"] == "markdown"])

15701

In [129]:
df[df['cell_id']=='366b6317']

Unnamed: 0,id,cell_id,cell_type,source,rank,ancestor_id,parent_id,pct_rank
242,002d93ddca8c5d,366b6317,markdown,,6,d0cda2e5,,0.146341
