In [1]:
import warnings

import pandas as pd
from sklearn.model_selection import train_test_split

def split_holdout_set(input_csv, output_trainval_csv, output_holdout_csv, label_cols, test_size=0.1, random_state=42):
    """
    Split a CSV dataset into train/validation and holdout parts and save both.

    The split is stratified on a per-row class index, computed as the argmax
    over the one-hot label columns.

    Parameters:
        input_csv (str): Path to the full input dataset CSV.
        output_trainval_csv (str): Path to save the train/validation CSV.
        output_holdout_csv (str): Path to save the holdout CSV.
        label_cols (list): List of one-hot label column names.
        test_size (float): Proportion to hold out (default 0.1).
        random_state (int): Seed for reproducibility.

    Returns:
        tuple: ``(train_val_df, holdout_df)`` -- the two DataFrames that were
        written to ``output_trainval_csv`` and ``output_holdout_csv``.

    Warns:
        UserWarning: If any row is not strictly one-hot over ``label_cols``
            (exactly one 1, every other entry 0). Such rows are still split,
            using the same argmax class index as before.
    """
    df = pd.read_csv(input_csv)
    label_matrix = df[label_cols].to_numpy()

    # argmax returns the first maximal column, so an all-zero row lands in
    # class 0 and a multi-hot row in its first hot column. Surface that
    # instead of stratifying on a misleading class without any signal.
    ones_per_row = (label_matrix == 1).sum(axis=1)
    zeros_per_row = (label_matrix == 0).sum(axis=1)
    not_one_hot = (ones_per_row != 1) | (zeros_per_row != label_matrix.shape[1] - 1)
    n_bad = int(not_one_hot.sum())
    if n_bad:
        warnings.warn(
            f"{n_bad} of {len(df)} rows are not strictly one-hot over "
            f"{list(label_cols)}; argmax assigns each of them to its first "
            "maximal label column for stratification.",
            UserWarning,
            stacklevel=2,
        )

    y = label_matrix.argmax(axis=1)  # Stratify on class index

    train_val_df, holdout_df = train_test_split(
        df, test_size=test_size, stratify=y, random_state=random_state
    )

    # index=False: the positional index carries no information worth persisting.
    train_val_df.to_csv(output_trainval_csv, index=False)
    holdout_df.to_csv(output_holdout_csv, index=False)

    print(f"✅ Split complete: {len(train_val_df)} train_val | {len(holdout_df)} holdout")
    print(f"📁 Saved: {output_trainval_csv}")
    print(f"📁 Saved: {output_holdout_csv}")

    # Hand the frames back so callers can inspect or assert on the split
    # without re-reading the CSVs.
    return train_val_df, holdout_df



  from pandas.core import (


In [2]:

if __name__ == "__main__":
    # One-hot label columns of the dataset; the split is stratified on them.
    label_columns = [
        'oestrus',
        'calving',
        'lameness',
        'mastitis',
        'other_disease',
        'OK',
    ]

    # Split the selected dataset, relying on the function's default
    # test_size and random_state.
    split_holdout_set(
        label_cols=label_columns,
        input_csv=r"C:/Users/lamia/Downloads/final_selected_dataset1.csv",
        output_holdout_csv=r"C:/Users/lamia/Downloads/final_selected_dataset1_holdout.csv",
        output_trainval_csv=r"C:/Users/lamia/Downloads/final_selected_dataset1_trainval.csv",
    )


✅ Split complete: 94356 train_val | 10485 holdout
📁 Saved: C:/Users/lamia/Downloads/final_selected_dataset1_trainval.csv
📁 Saved: C:/Users/lamia/Downloads/final_selected_dataset1_holdout.csv


In [2]:

if __name__ == "__main__":
    # One-hot label columns of the dataset; the split is stratified on them.
    label_columns = [
        'oestrus',
        'calving',
        'lameness',
        'mastitis',
        'other_disease',
        'OK',
    ]

    # Split the augmented dataset, relying on the function's default
    # test_size and random_state.
    # NOTE(review): the input stem is "augmented_dataset11" while both outputs
    # use "augmented_dataset1" -- confirm this naming difference is intended.
    split_holdout_set(
        label_cols=label_columns,
        input_csv=r"C:/Users/lamia/Downloads/augmented_dataset11.csv",
        output_holdout_csv=r"C:/Users/lamia/Downloads/augmented_dataset1_holdout.csv",
        output_trainval_csv=r"C:/Users/lamia/Downloads/augmented_dataset1_trainval.csv",
    )

✅ Split complete: 108531 train_val | 12060 holdout
📁 Saved: C:/Users/lamia/Downloads/augmented_dataset1_trainval.csv
📁 Saved: C:/Users/lamia/Downloads/augmented_dataset1_holdout.csv
