In [None]:

# Mount Google Drive at /content/drive; every dataset path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Load the dataset file list to be split

In [None]:
import pandas as pd

# Read the EchoNet-Dynamic file list (one row per video) from the mounted Drive.
file_list_path = '/content/drive/MyDrive/kaggle_data/input/echonet-dynamic/EchoNet-Dynamic/FileList.csv'
df = pd.read_csv(file_list_path)

In [None]:
# Preview the first rows to confirm the file list loaded with the expected columns.
df.head()

Unnamed: 0,FileName,EF,ESV,EDV,FrameHeight,FrameWidth,FPS,NumberOfFrames,Split
0,0X100009310A3BD7FC,78.498406,14.881368,69.210534,112,112,50,174,VAL
1,0X1002E8FBACD08477,59.101988,40.383876,98.742884,112,112,50,215,TRAIN
2,0X1005D03EED19C65B,62.363798,14.267784,37.909734,112,112,50,104,TRAIN
3,0X10075961BC11C88E,54.545097,33.143084,72.91421,112,112,55,122,TRAIN
4,0X10094BA0A028EAC3,24.887742,127.581945,169.855024,112,112,52,207,VAL


### Split with a mixed (normal + abnormal EF) test set

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Build a split in which the model only ever trains on normal-EF studies:
#   * TRAIN / VAL receive only rows whose EF lies in [min_ef, max_ef] ('N'),
#   * TEST is a balanced mix of normal ('N') and abnormal ('A') rows,
#   * abnormal rows that are not drawn into TEST keep the 'UNASSIGNED' label.
# NOTE: the 'Split' column that was loaded into `df` is overwritten here.

# Define EF range (both bounds inclusive)
min_ef = 53
max_ef = 70
test_proportion = 0.1        # 10% of the normal-EF rows go to TEST
validation_proportion = 0.1  # 10% of the normal-EF rows go to VAL
random_seed = 42             # shared by every random draw below

# Add 'EF_Status' column: 'N' when EF is inside the range, otherwise 'A'.
# A missing EF fails the range check and is therefore labelled 'A'.
df['EF_Status'] = df['EF'].between(min_ef, max_ef).map({True: 'N', False: 'A'})
df['Split'] = 'UNASSIGNED'
test_abnormal_df = df[df['EF_Status'] == 'A']

# Split normal EF data into TRAIN_VAL (90%) and TEST_NORMAL (10%)
normal_ef_df = df[df['EF_Status'] == 'N']
train_val_df, test_normal_df = train_test_split(
    normal_ef_df,
    test_size=test_proportion,
    random_state=random_seed
)


# Downsample the larger class so TEST holds as many normal as abnormal rows.
# Rows dropped by this step are not reassigned and stay 'UNASSIGNED'.
if len(test_normal_df) > len(test_abnormal_df):
    test_normal_df = test_normal_df.sample(n=len(test_abnormal_df), random_state=random_seed)
else:
    test_abnormal_df = test_abnormal_df.sample(n=len(test_normal_df), random_state=random_seed)

# Combine the balanced TEST set
balanced_test_df = pd.concat([test_normal_df, test_abnormal_df])

# TEST split
df.loc[balanced_test_df.index, 'Split'] = 'TEST'

# Split the remaining normal-EF rows into TRAIN and VAL.
# validation_proportion is expressed relative to ALL normal rows, so it is
# rescaled to the share of the rows that remain after TEST was removed.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=validation_proportion / (1 - test_proportion),
    random_state=random_seed
)

# TRAIN and VAL splits
df.loc[train_df.index, 'Split'] = 'TRAIN'
df.loc[val_df.index, 'Split'] = 'VAL'

# Sanity checks: every selected row was labelled exactly once (this fails if
# the index has duplicate labels) and TRAIN/VAL contain only normal-EF rows.
assert (df['Split'] != 'UNASSIGNED').sum() == len(train_df) + len(val_df) + len(balanced_test_df), \
    "TRAIN/VAL/TEST overlap or the DataFrame index is not unique"
assert (df.loc[df['Split'].isin(['TRAIN', 'VAL']), 'EF_Status'] == 'N').all(), \
    "abnormal-EF rows leaked into TRAIN/VAL"

# print splits
print("Counts per Split:")
print(df['Split'].value_counts())
print("\nCounts per EF_Status:")
print(df['EF_Status'].value_counts())
print("\nEF_Status counts within each Split:")
print(df.groupby('Split')['EF_Status'].value_counts())

# Save the updated csv file with the updated splits
df.to_csv('/content/drive/MyDrive/kaggle_data/input/echonet-dynamic/EchoNet-Dynamic/updated_file.csv', index=False)

Counts per Split:
Split
TRAIN         5616
UNASSIGNED    2305
TEST          1406
VAL            703
Name: count, dtype: int64

Counts per EF_Status:
EF_Status
N    7022
A    3008
Name: count, dtype: int64

EF_Status counts within each Split:
Split       EF_Status
TEST        A             703
            N             703
TRAIN       N            5616
UNASSIGNED  A            2305
VAL         N             703
Name: count, dtype: int64


### Perform the dataset split with a specified number of samples per split

In [None]:
import os
import shutil
import pandas as pd

# Copy a random sample of videos for each split into Videos/<SPLIT>/,
# using the 'Split' labels stored in updated_file.csv.
base_path = '/content/drive/MyDrive/kaggle_data/input/echonet-dynamic/EchoNet-Dynamic/Videos/'
csv_path = '/content/drive/MyDrive/kaggle_data/input/echonet-dynamic/EchoNet-Dynamic/updated_file.csv'

# Define the specific sample size for each split. A value larger than the
# number of rows carrying that label is clamped to what is available.
sample_sizes = {
    'TRAIN': 300,
    'TEST': 2000,
    'VAL': 100
}
verbose = True  # print one line per copy attempt; set to False to keep the output short

# clear existing folders and recreate them
# WARNING: anything already inside Videos/<SPLIT>/ is deleted.
for set_name in sample_sizes:
    folder_path = os.path.join(base_path, set_name)
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
    os.makedirs(folder_path)

# load the CSV file
df = pd.read_csv(csv_path)

# process specified sample sizes, recording what actually happened per split
copy_summary = {}
for tag, sample_size in sample_sizes.items():

    tagged_files = df[df['Split'] == tag]
    n_to_copy = min(sample_size, len(tagged_files))
    sampled_files = tagged_files.sample(n=n_to_copy, random_state=42)

    copied = 0
    missing = 0
    for raw_name in sampled_files['FileName']:
        # The CSV may list names without the extension; the videos on disk are .avi
        filename = raw_name.strip()
        if not filename.endswith('.avi'):
            filename += '.avi'

        src_path = os.path.join(base_path, filename)
        dest_path = os.path.join(base_path, tag, filename)

        if verbose:
            print(f"Attempting to copy: {src_path} to {dest_path}")
        if os.path.exists(src_path):
            shutil.copy(src_path, dest_path)
            copied += 1
        else:
            print(f"File not found: {src_path}")
            missing += 1

    copy_summary[tag] = {
        'requested': sample_size,
        'available': len(tagged_files),
        'copied': copied,
        'missing': missing,
    }

# output summary: report the files that were really copied, not just the CSV labels
print("File distribution across splits:")
for tag, stats in copy_summary.items():
    print(
        f"{tag}: copied {stats['copied']} "
        f"(requested {stats['requested']}, available {stats['available']}, "
        f"missing on disk {stats['missing']})"
    )
print("\nSplit labels in the CSV:")
print(df['Split'].value_counts())

Attempting to copy: /content/drive/MyDrive/kaggle_data/input/echonet-dynamic/EchoNet-Dynamic/Videos/0X6E00FB047E13A652.avi to /content/drive/MyDrive/kaggle_data/input/echonet-dynamic/EchoNet-Dynamic/Videos/TRAIN/0X6E00FB047E13A652.avi
Attempting to copy: /content/drive/MyDrive/kaggle_data/input/echonet-dynamic/EchoNet-Dynamic/Videos/0X47531256525CE22B.avi to /content/drive/MyDrive/kaggle_data/input/echonet-dynamic/EchoNet-Dynamic/Videos/TRAIN/0X47531256525CE22B.avi
Attempting to copy: /content/drive/MyDrive/kaggle_data/input/echonet-dynamic/EchoNet-Dynamic/Videos/0X11BE656DE80C9CF2.avi to /content/drive/MyDrive/kaggle_data/input/echonet-dynamic/EchoNet-Dynamic/Videos/TRAIN/0X11BE656DE80C9CF2.avi
Attempting to copy: /content/drive/MyDrive/kaggle_data/input/echonet-dynamic/EchoNet-Dynamic/Videos/0X3A58A66B6AF3DE55.avi to /content/drive/MyDrive/kaggle_data/input/echonet-dynamic/EchoNet-Dynamic/Videos/TRAIN/0X3A58A66B6AF3DE55.avi
Attempting to copy: /content/drive/MyDrive/kaggle_data/input