# In [None]:
import os

import pandas as pd
from sklearn.model_selection import StratifiedKFold

# Input settings: path of the CSV file to load and its field delimiter.
file_path = ""
separator = ','

# Columns to delete (alternative considered: only ['Class_Flare']).
# NOTE(review): this list is not consumed anywhere in the visible code.
columns_to_delete = ['T_REC', 'harpnum', 'Class_Flare']

raw_df = pd.read_csv(file_path, sep=separator)

# T_REC may hold either a full timestamp or a bare date, so parse it with
# both formats; values that do not match a format become NaT ('coerce').
date1_ta = pd.to_datetime(raw_df['T_REC'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
date2_ta = pd.to_datetime(raw_df['T_REC'], format='%Y-%m-%d', errors='coerce')

# Prefer the full-timestamp parse, fall back to the date-only parse, and
# drop any timezone information so that dates are not shifted.
raw_df['T_REC'] = date1_ta.fillna(date2_ta).dt.tz_localize(None)

# Chronologically ordered copy used by the fold-splitting step.
cleaned_df = raw_df.sort_values('T_REC')


# divide_folds

def _flare_label_per_region(df):
    """Return one row per ``harpnum`` with ``Class`` = 1 if any of its rows has ``Class == 1``, else 0."""
    has_positive = df['Class'].eq(1).groupby(df['harpnum']).any()
    return has_positive.astype(int).reset_index()


def divide_dataset_k_fold_harpnum(cleaned_df, n_splits=5, random_state=42):
    """Split a dataset into train/validation/test folds grouped by ``harpnum``.

    All rows sharing a ``harpnum`` always land in the same subset. Regions are
    stratified on a region-level label: 1 if any row of the region has
    ``Class == 1``, otherwise 0.

    Args:
        cleaned_df: DataFrame with at least the columns ``harpnum``, ``Class``
            and ``T_REC``. Either a pandas DataFrame or an object exposing
            ``.compute()`` (e.g. a Dask DataFrame) that returns one.
        n_splits: Number of outer folds, i.e. number of
            (train, validation, test) triples returned.
        random_state: Seed used for both the outer and the inner shuffled
            stratified splits.

    Returns:
        A list of ``n_splits`` tuples ``(df_train, df_val, df_test)``, each
        frame sorted by ``T_REC``.

    Raises:
        ValueError: Propagated from ``StratifiedKFold`` when the region-level
            classes are too small for the requested number of splits.
    """
    # Only lazy frames need materialising; a pandas DataFrame has no
    # ``compute`` attribute and the unconditional call raised AttributeError.
    df = cleaned_df.compute() if hasattr(cleaned_df, 'compute') else cleaned_df

    # One row per active region (harpnum) with its region-level label.
    df_regions = _flare_label_per_region(df)

    # Outer splitter: separates test regions from the rest.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # Inner splitter: 1 / 0.2 = 5 parts, so roughly 20% of the remaining
    # regions become validation. It is loop-invariant, so build it once.
    skf_train = StratifiedKFold(n_splits=int(1 / 0.2), shuffle=True, random_state=random_state)

    folds = []

    for fold, (train_idx, test_idx) in enumerate(skf.split(df_regions, df_regions['Class'])):
        test_harpnums = df_regions.iloc[test_idx]['harpnum'].values
        df_test = df[df['harpnum'].isin(test_harpnums)]

        # Region labels for the non-test regions: same rows, order and values
        # as regrouping the training rows, without recomputing the groupby.
        train_regions = df_regions.iloc[train_idx].reset_index(drop=True)

        # Use the inner split whose index matches the outer fold (wrapping
        # around if there are more outer folds than inner splits).
        inner_splits = list(skf_train.split(train_regions, train_regions['Class']))
        train_idx_fold, val_idx_fold = inner_splits[fold % len(inner_splits)]

        train_harpnums = train_regions.iloc[train_idx_fold]['harpnum'].values
        val_harpnums = train_regions.iloc[val_idx_fold]['harpnum'].values

        # Select rows per subset and sort each by time.
        df_train_split = df[df['harpnum'].isin(train_harpnums)].sort_values(by='T_REC')
        df_val_split = df[df['harpnum'].isin(val_harpnums)].sort_values(by='T_REC')
        df_test = df_test.sort_values(by='T_REC')

        folds.append((df_train_split, df_val_split, df_test))

    print("######################################## Fold sizes ######################################,", len(folds))

    # Report the class balance of every training split actually produced
    # (previously hard-coded to exactly five folds).
    for i, (train_df, _val_df, _test_df) in enumerate(folds):
        neg_t = int((train_df['Class'] == 0).sum())
        pos_t = int((train_df['Class'] == 1).sum())

        print("Positive:", pos_t, " Negative:", neg_t, " Fold:", i)

    return folds


# Build the folds once; the split is deterministic, so a second call would
# only repeat the work and the printed summary.
folds = divide_dataset_k_fold_harpnum(cleaned_df)
num_folds = len(folds)

# DataFrame.to_csv does not create missing directories, so make sure the
# output directory exists before writing.
output_dir = "data/kfold"
os.makedirs(output_dir, exist_ok=True)

# Write one train/validation/test CSV triple per fold.
for n_execution, (train_df, val_df, test_df) in enumerate(folds):
    train_df.to_csv(f"{output_dir}/train_df_kfold_{n_execution}.csv", index=False)
    val_df.to_csv(f"{output_dir}/val_df_kfold_{n_execution}.csv", index=False)
    test_df.to_csv(f"{output_dir}/test_df_kfold_{n_execution}.csv", index=False)
