In [1]:
import os
import random
import pandas as pd
from sklearn.model_selection import KFold

In [2]:
# Load the full dataset from CSV; the fold-splitting cell below partitions this frame.
allData = pd.read_csv(filepath_or_buffer='../dataset/output/xlsum_indonesia.csv')

In [3]:
# Build 5 shuffled cross-validation folds from allData and save each as CSV.
# In every fold the held-out ~20% is split in half, which yields an
# ~80/10/10 train/validation/test partition.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# DataFrame.to_csv does not create missing directories, so make sure the
# target directory exists before the first write.
os.makedirs("output", exist_ok=True)

for fold, (train_index, validate_index) in enumerate(kf.split(allData), start=1):

    # Select the rows for the current fold
    trainDF = allData.iloc[train_index]        # ~80% of the data (training)
    validateDF = allData.iloc[validate_index]  # ~20% of the data (validation + test)

    # Split validateDF into validation (~10%) and test (~10%).
    # The sampling seed is derived deterministically from the fold number, so
    # re-running the cell reproduces the same split for every fold.
    random.seed(fold)
    number = random.randint(1*fold, 1000000*fold)
    print(f"Random number for fold {fold}: {number}")
    testDF = validateDF.sample(frac=0.5, random_state=number)  # 50% of 20% = 10% of the total
    valDF = validateDF.drop(index=testDF.index)                # remaining rows = 10% of the total

    # Sanity check: the three splits must account for every row exactly once.
    assert len(trainDF) + len(valDF) + len(testDF) == len(allData), \
        f"fold {fold}: split sizes do not add up to the dataset size"

    print(f"Fold #{fold}, Training Size: {len(trainDF)} ({len(trainDF)/len(allData)*100:.1f}%), "
          f"Validation Size: {len(valDF)} ({len(valDF)/len(allData)*100:.1f}%), "
          f"Test Size: {len(testDF)} ({len(testDF)/len(allData)*100:.1f}%)")

    # Save the splits to CSV files
    trainDF.to_csv(f"output/train_fold{fold}.csv", index=False, encoding='utf-8')
    valDF.to_csv(f"output/val_fold{fold}.csv", index=False, encoding='utf-8')
    testDF.to_csv(f"output/test_fold{fold}.csv", index=False, encoding='utf-8')

Random number for fold 1: 140892
Fold #1, Training Size: 38241 (80.0%), Validation Size: 4781 (10.0%), Test Size: 4780 (10.0%)
Random number for fold 2: 1810073
Fold #2, Training Size: 38241 (80.0%), Validation Size: 4781 (10.0%), Test Size: 4780 (10.0%)
Random number for fold 3: 998098
Fold #3, Training Size: 38242 (80.0%), Validation Size: 4780 (10.0%), Test Size: 4780 (10.0%)
Random number for fold 4: 990061
Fold #4, Training Size: 38242 (80.0%), Validation Size: 4780 (10.0%), Test Size: 4780 (10.0%)
Random number for fold 5: 2142833
Fold #5, Training Size: 38242 (80.0%), Validation Size: 4780 (10.0%), Test Size: 4780 (10.0%)
