In [1]:
import gc
import os
import pickle
import random
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
import polars as pl


In [2]:
class Config:
    """Run-wide switches and hyperparameters, kept in one place for easy tuning."""

    # --- execution switches ---
    PREPROCESS = False
    KAGGLE_NOTEBOOK = False  # selects the Kaggle vs. local directory layout
    DEBUG = True  # when True, `n_rows` below is capped to a small sample

    # --- reproducibility ---
    SEED = 42

    # --- optimisation hyperparameters ---
    EPOCHS = 5
    BATCH_SIZE = 4096
    LR = 1e-3
    WD = 0.05
    PATIENCE = 5

    # --- cross-validation layout ---
    NBR_FOLDS = 15
    SELECTED_FOLDS = [0]


# Row cap for data loading: 1000 rows in debug mode, otherwise no cap (None).
n_rows = 10**3 if Config.DEBUG else None
    

In [3]:
# Directory layout depends on where the notebook runs.
if not Config.KAGGLE_NOTEBOOK:
    # Local checkout: everything lives relative to the notebook directory.
    RAW_DIR, PROCESSED_DIR = "../data/raw/", "../data/processed/"
    OUTPUT_DIR, MODEL_DIR = "../data/result/", "../models/"
else:
    # Kaggle: inputs are mounted datasets; outputs go to the working directory.
    RAW_DIR, PROCESSED_DIR = "/kaggle/input/leash-BELKA/", "/kaggle/input/belka-enc-dataset"
    OUTPUT_DIR = MODEL_DIR = ""

# File name of the encoded training set inside PROCESSED_DIR.
TRAIN_DATA_NAME = "local_train_enc.parquet"
# Destination directory for the chunked parquet files (same for both environments).
SAVE_PATH = "../data/chuncked-dataset/"

データを10分割して保存

In [None]:

# Split the encoded training set into several parquet files.
# The full file is always read here (n_rows=None), independent of Config.DEBUG.
train = pl.read_parquet(os.path.join(PROCESSED_DIR, TRAIN_DATA_NAME), n_rows=None)

# Shuffle all rows with a fixed seed so the chunk assignment is reproducible.
train = train.sample(fraction=1, seed=Config.SEED, shuffle=True)

# Make sure the destination exists; write_parquet does not create directories.
os.makedirs(SAVE_PATH, exist_ok=True)

# Split `train` into 10 chunks and save each one.
n = 10
n_total = len(train)
chunk_size = n_total // n
for i in range(n):
    start = i * chunk_size
    # The last chunk also takes the remainder rows; plain floor division would
    # silently drop up to n-1 rows when n_total is not divisible by n.
    stop = n_total if i == n - 1 else start + chunk_size
    chunk = train[start:stop]
    chunk.write_parquet(os.path.join(SAVE_PATH, f"local_train_enc_{i}.parquet"))
    

マスクを保存

In [14]:
# Names of the encoded token columns (enc0 .. enc141).
FEATURES = [f'enc{i}' for i in range(142)]

# For every chunk file, write a companion 0/1 mask marking feature values > 0.
# The range must match the number of chunk files written by the split cell.
for i in range(10):
    train = pl.read_parquet(os.path.join(SAVE_PATH, f"local_train_enc_{i}.parquet"), n_rows=None).to_pandas()
    # Restrict to the feature columns *before* comparing: the other columns
    # would be discarded anyway, so this avoids masking them and skips the
    # intermediate full-width array and DataFrame copies.
    mask_df = (train[FEATURES] > 0).astype(int)
    mask_df.to_parquet(os.path.join(SAVE_PATH, f"local_train_mask_{i}.parquet"))

In [9]:
# Show whatever `train` frame is currently in the kernel (pandas truncates the
# display). NOTE(review): this cell's execution count precedes the mask cell's,
# so which chunk is shown depends on run order — verify with Restart & Run All.
train

Unnamed: 0,enc0,enc1,enc2,enc3,enc4,enc5,enc6,enc7,enc8,enc9,...,enc135,enc136,enc137,enc138,enc139,enc140,enc141,bind1,bind2,bind3
0,8,26,8,17,8,19,8,28,8,8,...,0,0,0,0,0,0,0,0,0,0
1,8,28,8,17,26,28,19,12,27,12,...,0,0,0,0,0,0,0,0,0,0
2,8,28,12,27,12,12,17,8,33,12,...,0,0,0,0,0,0,0,0,0,0
3,8,12,27,10,12,17,8,8,33,12,...,0,0,0,0,0,0,0,0,0,0
4,8,28,8,17,26,28,19,12,27,10,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9743140,8,28,12,27,12,12,12,17,8,19,...,0,0,0,0,0,0,0,0,0,0
9743141,8,28,12,27,35,12,12,17,33,12,...,0,0,0,0,0,0,0,0,0,0
9743142,8,28,8,28,8,8,8,33,12,27,...,0,0,0,0,0,0,0,0,0,0
9743143,8,28,12,27,12,12,17,8,33,12,...,0,0,0,0,0,0,0,0,0,0
