In [1]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import StratifiedKFold

In [2]:
def create_kfold_dataframe(df: pd.DataFrame, n_splits=5, random_state=0):
    """Return a copy of ``df`` with a stratified K-fold ``fold`` column added.

    Each row is assigned the number (0 .. n_splits - 1) of the fold in which
    it belongs to the validation split. Stratification uses the ``label``
    column.

    Args:
        df: Frame containing at least the ``image_path`` and ``label``
            columns. It may carry any index (it does not have to be a
            default 0..n-1 RangeIndex) and is not modified.
        n_splits: Number of folds passed to ``StratifiedKFold``.
        random_state: Seed passed to ``StratifiedKFold`` (shuffling is on).

    Returns:
        A new DataFrame with the same rows and index as ``df`` plus an
        integer ``fold`` column.
    """
    # Configure the stratified K-fold splitter.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    X, y = df["image_path"], df["label"]

    # Fold numbers are tracked by row position, starting at -1 (unassigned).
    folds = pd.Series(-1, index=range(len(df)))

    for fold_number, (_, valid_index) in enumerate(skf.split(X, y)):
        # split() yields positional indices, so assign with .iloc rather than
        # label-based .loc; the latter is only right for a default RangeIndex.
        folds.iloc[valid_index] = fold_number

    # Attach by position (plain array) to avoid any index alignment.
    df['fold'] = folds.to_numpy()

    return df

In [3]:
# Directory that is searched recursively for the training .tif images.
ROOT: str = "/workspace/data/typhoon/train"
# Number of folds passed to create_kfold_dataframe.
N_SPLITS: int = 5

In [4]:
# Root directory of the dataset, wrapped in a Path for the recursive search.
root_directory: Path = Path(ROOT)

In [5]:
# Recursively collect the paths of all .tif files under the root directory.
# The result is sorted so the row order (and therefore the seeded fold
# assignment built from it) does not depend on filesystem enumeration order.
tif_files = sorted(root_directory.rglob("*.tif"))

In [6]:
# Derive the per-file columns from each collected path:
#   paths     - the full path as a string
#   labels    - name of the directory that directly contains the file
#   dirs      - name of that directory's parent
#   filenames - the file's own name
paths = [str(tif.absolute() if False else tif) for tif in tif_files]
labels = [str(tif.parent.name) for tif in tif_files]
dirs = [str(tif.parent.parent.name) for tif in tif_files]
filenames = [str(tif.name) for tif in tif_files]

In [7]:
# One row per image: full path, grandparent directory name, file name, and
# the label taken from the containing directory's name.
df = pd.DataFrame(
    dict(image_path=paths, subdir=dirs, file_name=filenames, label=labels)
)

In [8]:
# Build the K-fold data frame (adds the 'fold' column).
df = create_kfold_dataframe(df, n_splits=N_SPLITS)

In [9]:
# Write the fold assignments to CSV; the row index is not written.
df.to_csv(path_or_buf='../inputs/train_val_kfold_split.csv', index=False)