<a href="https://colab.research.google.com/github/jcw5937/340W/blob/main/data_resplit_70_10_20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# CBIS-DDSM Calc/Mass 70/10/20 Re-split (No Adjustments)

This notebook does the following:
1. Load the provided **calc** and **mass** *train* and *test* CSVs.  
2. Merge the splits within each category (calc-only, mass-only).  
3. Apply **one random shuffle** and split into **70% train / 10% val / 20% test**.  
4. Save the new CSVs.

**Note:** the split is purely random — no stratification is applied, and no columns are added, removed, or modified.

In [1]:

from pathlib import Path
import pandas as pd
import numpy as np
from google.colab import drive

# Mount Google Drive so the files under MyDrive are reachable from this runtime
drive.mount('/content/drive')

# Input folder: contains the old (original) train/test CSVs
base_dir = Path('/content/drive/MyDrive/340W/Breast-Cancer-Data')

# Original calc CSVs
calc_train_path = base_dir.joinpath('calc_case_description_train_set.csv')
calc_test_path = base_dir.joinpath('calc_case_description_test_set.csv')

# Original mass CSVs
mass_train_path = base_dir.joinpath('mass_case_description_train_set.csv')
mass_test_path = base_dir.joinpath('mass_case_description_test_set.csv')

# Output folder: destination for the new adjusted data
out_dir = Path('/content/drive/MyDrive/340W/Adj-Breast-Cancer-Data')

# ---- Split settings ----
TRAIN_FRAC, VAL_FRAC, TEST_FRAC = 0.70, 0.10, 0.20

# Floating-point arithmetic is inexact (0.70 + 0.10 + 0.20 is not exactly 1.0),
# so the sum is compared against 1.0 with a tolerance of one billionth.
assert abs(1.0 - (TRAIN_FRAC + VAL_FRAC + TEST_FRAC)) < 1e-9, "Fractions must sum to 1."

SEED = 42       # seed for the random shuffle, fixed for reproducibility
SHUFFLE = True  # shuffle once before splitting


Mounted at /content/drive


In [2]:

def simple_split(df: pd.DataFrame, train_frac=0.7, val_frac=0.1, test_frac=0.2, seed=42, shuffle=True):
    """
    Return train, val, test DataFrames from a single random shuffle.

    Parameters
    ----------
    df : pd.DataFrame
        Rows to split. It is not modified; the returned frames are copies.
    train_frac, val_frac, test_frac : float
        Fractions of the rows assigned to each split. They must be
        non-negative and sum to 1 (within 1e-9).
    seed : int
        Seed for the NumPy random generator used for the shuffle.
    shuffle : bool
        If True, row positions are shuffled once before slicing; if False,
        the rows are sliced in their existing order.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame, pd.DataFrame)
        Train, validation and test frames. Each keeps the index labels of
        ``df``. Train and val sizes are ``int(n * frac)`` (rounded down);
        test receives every remaining row, so no row is dropped.

    Raises
    ------
    ValueError
        If a fraction is negative or the fractions do not sum to 1.
    """
    # Previously test_frac was silently ignored; validate so the caller cannot
    # request one split and quietly receive another.
    if min(train_frac, val_frac, test_frac) < 0:
        raise ValueError("Fractions must be non-negative.")
    # `not (... < tol)` also rejects NaN, which a plain `>=` comparison would let through.
    if not abs((train_frac + val_frac + test_frac) - 1.0) < 1e-9:
        raise ValueError("Fractions must sum to 1.")

    n = len(df)
    idx = np.arange(n)  # row positions [0, 1, ..., n-1]
    if shuffle:
        # A fixed seed reproduces the same permutation on every run
        np.random.default_rng(seed).shuffle(idx)

    train_end = int(n * train_frac)            # end position (exclusive) of the train slice
    val_end = train_end + int(n * val_frac)    # end position (exclusive) of the val slice

    train_idx = idx[:train_end]
    val_idx = idx[train_end:val_end]
    test_idx = idx[val_end:]                   # remainder, absorbs rounding leftovers

    # .iloc selects by position, so this works for any index labels
    return df.iloc[train_idx].copy(), df.iloc[val_idx].copy(), df.iloc[test_idx].copy()


In [3]:

# Read the four original CSVs as-is; nothing is changed after loading
calc_train, calc_test = (pd.read_csv(csv_path) for csv_path in (calc_train_path, calc_test_path))
mass_train, mass_test = (pd.read_csv(csv_path) for csv_path in (mass_train_path, mass_test_path))

# Report (rows, columns) of each loaded frame
print("Loaded shapes:")
print("calc_train:", calc_train.shape, "| calc_test:", calc_test.shape)
print("mass_train:", mass_train.shape, "| mass_test:", mass_test.shape)


Loaded shapes:
calc_train: (1546, 14) | calc_test: (326, 14)
mass_train: (1318, 14) | mass_test: (378, 14)


In [4]:

# Stack the original train and test rows of each category into one frame.
# ignore_index=True renumbers the rows 0..n-1 so every merged row has a unique label.
calc_merged = pd.concat([calc_train, calc_test], ignore_index=True)
mass_merged = pd.concat([mass_train, mass_test], ignore_index=True)

# Report (rows, columns) of each merged frame
print("Merged shapes:")
print("calc_merged:", calc_merged.shape)
print("mass_merged:", mass_merged.shape)


Merged shapes:
calc_merged: (1872, 14)
mass_merged: (1696, 14)


In [5]:

# Re-split the merged calc rows into train / val / test with the shared settings
calc_train_new, calc_val_new, calc_test_new = simple_split(
    calc_merged,
    train_frac=TRAIN_FRAC,
    val_frac=VAL_FRAC,
    test_frac=TEST_FRAC,
    seed=SEED,
    shuffle=SHUFFLE,
)

# Report (rows, columns) of each new calc split
print("Calc-only new splits:")
print("  Train:", calc_train_new.shape)
print("  Val  :", calc_val_new.shape)
print("  Test :", calc_test_new.shape)


Calc-only new splits:
  Train: (1310, 14)
  Val  : (187, 14)
  Test : (375, 14)


In [6]:
# Re-split the merged mass rows into train / val / test with the shared settings
mass_train_new, mass_val_new, mass_test_new = simple_split(
    mass_merged,
    train_frac=TRAIN_FRAC,
    val_frac=VAL_FRAC,
    test_frac=TEST_FRAC,
    seed=SEED,
    shuffle=SHUFFLE,
)

# Report (rows, columns) of each new mass split
print("Mass-only new splits:")
print("  Train:", mass_train_new.shape)
print("  Val  :", mass_val_new.shape)
print("  Test :", mass_test_new.shape)


Mass-only new splits:
  Train: (1187, 14)
  Val  : (169, 14)
  Test : (340, 14)


In [7]:

# Make sure the output folder exists before writing: DataFrame.to_csv does not
# create missing directories and would fail on a Drive where it is absent.
out_dir.mkdir(parents=True, exist_ok=True)

# Build the output file paths (one train/val/test trio per category)
calc_train_out = out_dir / 'calc_merged_train_70_10_20_nostrat.csv'
calc_val_out   = out_dir / 'calc_merged_val_70_10_20_nostrat.csv'
calc_test_out  = out_dir / 'calc_merged_test_70_10_20_nostrat.csv'

mass_train_out = out_dir / 'mass_merged_train_70_10_20_nostrat.csv'
mass_val_out   = out_dir / 'mass_merged_val_70_10_20_nostrat.csv'
mass_test_out  = out_dir / 'mass_merged_test_70_10_20_nostrat.csv'

# Write each split; index=False keeps the row labels out of the CSV so the
# columns match the input files.
calc_train_new.to_csv(calc_train_out, index=False)
calc_val_new.to_csv(calc_val_out, index=False)
calc_test_new.to_csv(calc_test_out, index=False)

mass_train_new.to_csv(mass_train_out, index=False)
mass_val_new.to_csv(mass_val_out, index=False)
mass_test_new.to_csv(mass_test_out, index=False)

# List every file that was written
print("Wrote:")
for p in [calc_train_out, calc_val_out, calc_test_out, mass_train_out, mass_val_out, mass_test_out]:
    print(" -", p)


Wrote:
 - /content/drive/MyDrive/340W/Adj-Breast-Cancer-Data/calc_merged_train_70_10_20_nostrat.csv
 - /content/drive/MyDrive/340W/Adj-Breast-Cancer-Data/calc_merged_val_70_10_20_nostrat.csv
 - /content/drive/MyDrive/340W/Adj-Breast-Cancer-Data/calc_merged_test_70_10_20_nostrat.csv
 - /content/drive/MyDrive/340W/Adj-Breast-Cancer-Data/mass_merged_train_70_10_20_nostrat.csv
 - /content/drive/MyDrive/340W/Adj-Breast-Cancer-Data/mass_merged_val_70_10_20_nostrat.csv
 - /content/drive/MyDrive/340W/Adj-Breast-Cancer-Data/mass_merged_test_70_10_20_nostrat.csv


In [8]:

# Sanity checks: verify that splits don't overlap (and, optionally, that they cover the source)
def check_disjoint_and_cover(a, b, c, full=None):  # a: train, b: val, c: test
    """
    Assert that three splits share no rows and, optionally, cover a source frame.

    Rows are identified by their index labels, so the check is only meaningful
    when the splits keep the labels of the frame they were cut from and those
    labels are unique.

    Parameters
    ----------
    a, b, c : pd.DataFrame
        The train, val and test splits.
    full : pd.DataFrame, optional
        The frame the splits were cut from. When given, the union of the three
        splits' index labels must equal ``full``'s index labels. When omitted,
        only disjointness is checked.

    Raises
    ------
    AssertionError
        If any two splits share an index label, or if ``full`` is given and
        the splits do not cover it exactly.
    """
    # Treat each DataFrame's index labels as the row IDs
    a_ids = set(a.index)
    b_ids = set(b.index)
    c_ids = set(c.index)

    assert a_ids.isdisjoint(b_ids), "train and val share rows"
    assert a_ids.isdisjoint(c_ids), "train and test share rows"
    assert b_ids.isdisjoint(c_ids), "val and test share rows"

    if full is not None:
        # Coverage: every source row landed in some split, and nothing extra appeared
        assert (a_ids | b_ids | c_ids) == set(full.index), "splits do not cover the source rows"

# Note: the splits keep the row labels of the merged frames (built with ignore_index=True,
# so the labels are unique), which makes comparing index sets a valid test for shared rows.
check_disjoint_and_cover(calc_train_new, calc_val_new, calc_test_new)
check_disjoint_and_cover(mass_train_new, mass_val_new, mass_test_new)

# Disjointness alone would not notice dropped rows, so also confirm that the split
# sizes add up to the merged totals.
assert len(calc_train_new) + len(calc_val_new) + len(calc_test_new) == len(calc_merged), \
    "calc split sizes do not add up to the merged row count"
assert len(mass_train_new) + len(mass_val_new) + len(mass_test_new) == len(mass_merged), \
    "mass split sizes do not add up to the merged row count"

print("Sanity checks passed: splits are disjoint for both calc and mass.")


Sanity checks passed: splits are disjoint for both calc and mass.
