In [1]:
import gc
import os
import pickle
import random
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
import polars as pl


In [2]:
class Config:
    """Notebook-wide switches and hyper-parameters, grouped in one namespace."""

    # Execution-mode switches.
    PREPROCESS = False
    KAGGLE_NOTEBOOK = False
    DEBUG = True

    # Seed and training hyper-parameters.
    SEED = 42
    EPOCHS = 5
    BATCH_SIZE = 4096
    LR = 1e-3
    WD = 0.05
    PATIENCE = 5
    NBR_FOLDS = 15
    SELECTED_FOLDS = [0]


# Row cap derived from the debug switch: 1000 rows when debugging, otherwise no cap.
n_rows = 10**3 if Config.DEBUG else None
    

In [3]:
# Directory layout: local relative paths by default, Kaggle input mounts when
# Config.KAGGLE_NOTEBOOK is set (output and model dirs are empty strings there).
if not Config.KAGGLE_NOTEBOOK:
    RAW_DIR, PROCESSED_DIR = "../data/raw/", "../data/processed/"
    OUTPUT_DIR, MODEL_DIR = "../data/result/", "../models/"
else:
    RAW_DIR, PROCESSED_DIR = "/kaggle/input/leash-BELKA/", "/kaggle/input/belka-enc-dataset"
    OUTPUT_DIR, MODEL_DIR = "", ""

# File name of the encoded training set and the directory that receives its chunks.
TRAIN_DATA_NAME = "local_train_enc.parquet"
SAVE_PATH = "../data/chuncked-dataset/"

データを10分割して保存

In [None]:

# Split the encoded training set into several parquet files.
train = pl.read_parquet(os.path.join(PROCESSED_DIR, TRAIN_DATA_NAME), n_rows=None)

# Shuffle all rows reproducibly (fixed seed) before chunking.
train = train.sample(fraction=1, seed=Config.SEED, shuffle=True)

# Save train as 10 chunks.
# The target directory is created up front so the first write cannot fail on a
# missing folder.
os.makedirs(SAVE_PATH, exist_ok=True)
n = 10
chunk_size = len(train) // n
for i in range(n):
    start = i * chunk_size
    # The last chunk runs to the end of the frame, so the len(train) % n
    # trailing rows are kept instead of being silently dropped.
    stop = len(train) if i == n - 1 else start + chunk_size
    chunk = train[start:stop]
    chunk.write_parquet(os.path.join(SAVE_PATH, f"local_train_enc_{i}.parquet"))
    

マスクを保存

In [14]:
# Names of the 142 encoded feature columns: enc0 .. enc141.
FEATURES = [f'enc{i}' for i in range(142)]

# For each of the 10 saved chunks, write a 0/1 mask marking which feature
# values are strictly positive.
for i in range(10):
    train = pl.read_parquet(os.path.join(SAVE_PATH, f"local_train_enc_{i}.parquet"), n_rows=None).to_pandas()
    # Select the feature columns *before* comparing, so that any other column
    # in the chunk is neither compared against 0 nor copied into a temporary
    # full-width array that would be discarded right after.
    feature_values = train[FEATURES].values
    mask_df = pd.DataFrame((feature_values > 0).astype(int), columns=FEATURES)
    mask_df.to_parquet(os.path.join(SAVE_PATH, f"local_train_mask_{i}.parquet"))

In [4]:
# Load the first 1000 rows of chunk 0 and of its mask file for a quick check.
enc_chunk_path = os.path.join(SAVE_PATH, "local_train_enc_0.parquet")
mask_chunk_path = os.path.join(SAVE_PATH, "local_train_mask_0.parquet")
train = pl.read_parquet(enc_chunk_path, n_rows=1000).to_pandas()
mask = pl.read_parquet(mask_chunk_path, n_rows=1000).to_pandas()

In [10]:
# Sanity check: the columns flagged 1 in the mask should be the columns whose
# value is > 0 in the data, for the same row.
# The original expression read row 100 but filtered it with a boolean mask
# built from row 0; the resulting labels depend only on row 0, so the check is
# now written against that single row to make explicit which row is verified.
check_row = 0
mask_row = mask.loc[check_row]
train_row = train.loc[check_row]
mask_row[mask_row == 1].index, train_row[train_row > 0].index

(Index(['enc0', 'enc1', 'enc2', 'enc3', 'enc4', 'enc5', 'enc6', 'enc7', 'enc8',
        'enc9', 'enc10', 'enc11', 'enc12', 'enc13', 'enc14', 'enc15', 'enc16',
        'enc17', 'enc18', 'enc19', 'enc20', 'enc21', 'enc22', 'enc23', 'enc24',
        'enc25', 'enc26', 'enc27', 'enc28', 'enc29', 'enc30', 'enc31', 'enc32',
        'enc33', 'enc34', 'enc35', 'enc36', 'enc37', 'enc38', 'enc39', 'enc40',
        'enc41', 'enc42', 'enc43', 'enc44', 'enc45', 'enc46', 'enc47', 'enc48',
        'enc49', 'enc50', 'enc51', 'enc52', 'enc53', 'enc54', 'enc55', 'enc56',
        'enc57', 'enc58', 'enc59', 'enc60', 'enc61', 'enc62', 'enc63', 'enc64',
        'enc65', 'enc66', 'enc67', 'enc68', 'enc69', 'enc70', 'enc71', 'enc72',
        'enc73', 'enc74', 'enc75', 'enc76', 'enc77', 'enc78', 'enc79'],
       dtype='object'),
 Index(['enc0', 'enc1', 'enc2', 'enc3', 'enc4', 'enc5', 'enc6', 'enc7', 'enc8',
        'enc9', 'enc10', 'enc11', 'enc12', 'enc13', 'enc14', 'enc15', 'enc16',
        'enc17', 'enc18', 

生データを10分割して保存

In [15]:
# Read the complete raw training table (no row limit) as a polars DataFrame.
raw_train_path = os.path.join(RAW_DIR, "train.parquet")
train = pl.read_parquet(raw_train_path, n_rows=None)

In [16]:
# Split the raw table into 10 chunks (row order unchanged) and save them.
SAVE_PATH = "../data/chuncked-raw-dataset"
# Create the target directory up front so the first write cannot fail on a
# missing folder.
os.makedirs(SAVE_PATH, exist_ok=True)
n = 10
chunk_size = len(train) // n
for i in range(n):
    start = i * chunk_size
    # The last chunk runs to the end of the frame, so the len(train) % n
    # trailing rows are kept instead of being silently dropped.
    stop = len(train) if i == n - 1 else start + chunk_size
    chunk = train[start:stop]
    path = os.path.join(SAVE_PATH, f"train_{i}.parquet")
    chunk.write_parquet(path)
    print(f"data saved to {path}", chunk.shape)

In [13]:
# Preview the first 1000 rows of the first saved chunk.
# NOTE: SAVE_PATH is reassigned in several cells of this notebook, so the file
# opened here depends on which of those cells ran last.
train0 = pl.read_parquet(
    os.path.join(SAVE_PATH, "train_0.parquet"),
    n_rows=1000,
)
train0

id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds
i64,str,str,str,str,str,i64
0,"""C#CC[C@@H](CC(=O)O)NC(=O)OCC1c…","""C#CCOc1ccc(CN)cc1.Cl""","""Br.Br.NCC1CCCN1c1cccnn1""","""C#CCOc1ccc(CNc2nc(NCC3CCCN3c3c…","""BRD4""",0
1,"""C#CC[C@@H](CC(=O)O)NC(=O)OCC1c…","""C#CCOc1ccc(CN)cc1.Cl""","""Br.Br.NCC1CCCN1c1cccnn1""","""C#CCOc1ccc(CNc2nc(NCC3CCCN3c3c…","""HSA""",0
2,"""C#CC[C@@H](CC(=O)O)NC(=O)OCC1c…","""C#CCOc1ccc(CN)cc1.Cl""","""Br.Br.NCC1CCCN1c1cccnn1""","""C#CCOc1ccc(CNc2nc(NCC3CCCN3c3c…","""sEH""",0
3,"""C#CC[C@@H](CC(=O)O)NC(=O)OCC1c…","""C#CCOc1ccc(CN)cc1.Cl""","""Br.NCc1cccc(Br)n1""","""C#CCOc1ccc(CNc2nc(NCc3cccc(Br)…","""BRD4""",0
4,"""C#CC[C@@H](CC(=O)O)NC(=O)OCC1c…","""C#CCOc1ccc(CN)cc1.Cl""","""Br.NCc1cccc(Br)n1""","""C#CCOc1ccc(CNc2nc(NCc3cccc(Br)…","""HSA""",0
…,…,…,…,…,…,…
95,"""C#CC[C@@H](CC(=O)O)NC(=O)OCC1c…","""C#CCOc1ccc(CN)cc1.Cl""","""CC1CC(CN)C(C)O1""","""C#CCOc1ccc(CNc2nc(NCC3CC(C)OC3…","""sEH""",0
96,"""C#CC[C@@H](CC(=O)O)NC(=O)OCC1c…","""C#CCOc1ccc(CN)cc1.Cl""","""CC1CCc2nc(CN)sc2C1""","""C#CCOc1ccc(CNc2nc(NCc3nc4c(s3)…","""BRD4""",0
97,"""C#CC[C@@H](CC(=O)O)NC(=O)OCC1c…","""C#CCOc1ccc(CN)cc1.Cl""","""CC1CCc2nc(CN)sc2C1""","""C#CCOc1ccc(CNc2nc(NCc3nc4c(s3)…","""HSA""",0
98,"""C#CC[C@@H](CC(=O)O)NC(=O)OCC1c…","""C#CCOc1ccc(CN)cc1.Cl""","""CC1CCc2nc(CN)sc2C1""","""C#CCOc1ccc(CNc2nc(NCc3nc4c(s3)…","""sEH""",0


前処理してからシャッフルして10分割して保存

In [50]:
# Pivot the raw table to one row per molecule with three label columns
# (bind1 = BRD4, bind2 = HSA, bind3 = sEH), shuffle it, and save it as 10 chunks.
n_rows = None
train_raw = pl.read_parquet(os.path.join(RAW_DIR, "train.parquet"), n_rows=n_rows).to_pandas()

# One boolean row selector per protein, computed once and reused below.
protein_rows = {
    protein: train_raw['protein_name'] == protein
    for protein in ('BRD4', 'HSA', 'sEH')
}

# The BRD4 rows provide the molecule columns; the per-row fields are dropped.
train = train_raw[protein_rows['BRD4']].copy()
train = train.drop(columns=['protein_name', "binds", "id"])

base_smiles = train['molecule_smiles'].to_numpy()
for label_column, protein in (('bind1', 'BRD4'), ('bind2', 'HSA'), ('bind3', 'sEH')):
    protein_frame = train_raw.loc[protein_rows[protein], ['molecule_smiles', 'binds']]
    # Labels are attached by position, so every protein subset must list the
    # same molecules in the same order as the BRD4 subset; fail loudly instead
    # of silently pairing a label with the wrong molecule.
    if not np.array_equal(protein_frame['molecule_smiles'].to_numpy(), base_smiles):
        raise ValueError(
            f"rows for protein {protein!r} are not aligned with the BRD4 rows; "
            "cannot attach labels by position"
        )
    train[label_column] = protein_frame['binds'].to_numpy()

# Convert to polars and shuffle all rows reproducibly (fixed seed).
train = pl.DataFrame(train)
train = train.sample(fraction=1, seed=Config.SEED, shuffle=True)

# Split into 10 chunks and save.
SAVE_PATH = "../data/shuffled-dataset/"
# Create the target directory up front so the first write cannot fail on a
# missing folder.
os.makedirs(SAVE_PATH, exist_ok=True)
n = 10
chunk_size = len(train) // n
for i in range(n):
    start = i * chunk_size
    # The last chunk runs to the end of the frame, so the len(train) % n
    # trailing rows are kept instead of being silently dropped.
    stop = len(train) if i == n - 1 else start + chunk_size
    chunk = train[start:stop]
    path = os.path.join(SAVE_PATH, f"train_{i}.parquet")
    chunk.write_parquet(path)
    print(f"data saved to {path}", chunk.shape)

data saved to ../data/shuffled-dataset/train_0.parquet (9841561, 7)
data saved to ../data/shuffled-dataset/train_1.parquet (9841561, 7)
data saved to ../data/shuffled-dataset/train_2.parquet (9841561, 7)
data saved to ../data/shuffled-dataset/train_3.parquet (9841561, 7)
data saved to ../data/shuffled-dataset/train_4.parquet (9841561, 7)
data saved to ../data/shuffled-dataset/train_5.parquet (9841561, 7)
data saved to ../data/shuffled-dataset/train_6.parquet (9841561, 7)
data saved to ../data/shuffled-dataset/train_7.parquet (9841561, 7)
data saved to ../data/shuffled-dataset/train_8.parquet (9841561, 7)
data saved to ../data/shuffled-dataset/train_9.parquet (9841561, 7)
