In [2]:
import pandas as pd
import numpy as np

# Input CSV to read, and the CSV that will receive the balanced dataset.
FILE_ID = "alzheimers_disease_data.csv"
OUTPUT_FILENAME = "balanced_alzheimers_data.csv"

# --- 1. Load the Data ---
# Read the real dataset. If it is not on disk, report that and carry on with a
# small hard-coded demo frame so the remaining steps can still be exercised.
try:
    df = pd.read_csv(FILE_ID)
except FileNotFoundError:
    print(f"Error: The file with ID {FILE_ID} was not found.")
    # Demo-only stand-in (10 rows); Ethnicity is distributed 0 -> 6, 1 -> 2, 2 -> 2,
    # so class 0 is the majority.
    # NOTE: drop this fallback in an environment where file access is guaranteed.
    df = pd.DataFrame({
        'PatientID': list(range(100, 110)),
        'Age': [70, 65, 80, 72, 68, 75, 60, 85, 71, 69],
        'Ethnicity': [0, 1, 0, 2, 0, 1, 0, 0, 2, 0],
        'BMI': [25.1, 22.5, 30.0, 26.5, 24.9, 23.3, 21.0, 29.5, 27.0, 24.5],
        'Diagnosis': [1, 0, 1, 1, 0, 0, 1, 1, 0, 1],
    })


# Features eligible for noise injection, restricted to the ones that actually
# exist in this dataframe. 'PatientID' and the categorical columns are not in
# the candidate list, so they never receive noise.
numerical_cols = [
    name
    for name in (
        'Age', 'BMI', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality',
        'SleepQuality', 'SystolicBP', 'DiastolicBP', 'CholesterolTotal',
        'CholesterolLDL', 'CholesterolHDL', 'CholesterolTriglycerides',
        'MMSE', 'FunctionalAssessment', 'ADL',
    )
    if name in df.columns
]

# --- 2. Analyze Balance and Determine Needs ---
# Count rows per ethnicity. The column is looked up only if it exists, so a
# missing column reaches the error message below instead of raising KeyError.
if 'Ethnicity' in df.columns:
    ethnicity_counts = df['Ethnicity'].value_counts()
else:
    ethnicity_counts = pd.Series(dtype='int64')

if ethnicity_counts.empty:
    print("Error: 'Ethnicity' column is empty or missing. Cannot balance.")
    # Raise SystemExit directly: exit() is a helper added by the `site` module
    # for interactive sessions and is not meant for use in programs. A non-zero
    # status marks this as a failure.
    raise SystemExit(1)

# Every class is topped up to the size of the largest one; `ethnicity_needs`
# holds how many extra rows each class requires (0 for the majority).
majority_count = ethnicity_counts.max()
ethnicity_needs = majority_count - ethnicity_counts
minority_ethnicities = ethnicity_needs[ethnicity_needs > 0].index.tolist()

if not minority_ethnicities:
    # Nothing to generate: still write the output file so later steps find it.
    print("Dataset is already balanced. No synthetic data generated.")
    df.to_csv(OUTPUT_FILENAME, index=False)
    raise SystemExit(0)

print(f"Original Ethnicity Counts:\n{ethnicity_counts}")
print(f"Majority Class Count: {majority_count}")

# --- 3. Generate Synthetic Data ---
# For each under-represented ethnicity: resample its rows with replacement,
# jitter the numerical features with small Gaussian noise, and assign fresh
# PatientIDs that continue after the largest existing one.
synthetic_data_list = []
current_max_id = df['PatientID'].max() if 'PatientID' in df.columns else 0
new_id_counter = current_max_id + 1

for eth_id in minority_ethnicities:
    # Explicit label lookup; plain int so it is safe as a size / arange bound.
    needed_samples = int(ethnicity_needs.loc[eth_id])
    print(f"Generating {needed_samples} samples for Ethnicity {eth_id}")

    # Subset the original data for this ethnicity
    minority_df = df[df['Ethnicity'] == eth_id].copy()

    if minority_df.empty:
        print(f"Warning: No data found for Ethnicity {eth_id}. Skipping.")
        continue

    # Sample original rows with replacement to create the synthetic base
    sample_indices = np.random.choice(minority_df.index, size=needed_samples, replace=True)
    synthetic_base_df = minority_df.loc[sample_indices].reset_index(drop=True)

    # 3a. Add noise to numerical columns
    # (numerical_cols has already been filtered to columns present in df.)
    for col in numerical_cols:
        source_dtype = minority_df[col].dtype

        # Noise scale is 5% of the within-group standard deviation.
        noise_scale = minority_df[col].std() * 0.05

        # Fall back to a small fixed scale when the spread is ~zero OR undefined.
        # A group with a single row has std() == NaN, and `NaN < 1e-6` is False,
        # so without the explicit NaN test the noise itself would be all-NaN.
        if pd.isna(noise_scale) or noise_scale < 1e-6:
            noise_scale = 0.01

        noise = np.random.normal(0, noise_scale, size=needed_samples)

        # clip() keeps values non-negative and leaves missing values missing;
        # the builtin max(0, NaN) would return 0 and fabricate a measurement.
        noisy = (synthetic_base_df[col] + noise).clip(lower=0)

        if pd.api.types.is_integer_dtype(source_dtype):
            # Columns that were integer in the source stay integer, so the
            # combined output keeps the original column types.
            noisy = noisy.round().astype(source_dtype)
        elif col == 'Age':
            # Age is always reported as a whole number; only cast to int when
            # there are no missing values (NaN cannot be stored in an int column).
            noisy = noisy.round()
            if noisy.notna().all():
                noisy = noisy.astype(int)

        synthetic_base_df[col] = noisy

    # 3b. Assign unique PatientIDs
    if 'PatientID' in synthetic_base_df.columns:
        synthetic_base_df['PatientID'] = np.arange(
            new_id_counter, new_id_counter + needed_samples
        )
        new_id_counter += needed_samples

    synthetic_data_list.append(synthetic_base_df)

# --- 4. Combine and Save Data ---
# Start from the original rows and append the synthetic ones only if any exist.
balanced_df = df
if synthetic_data_list:
    synthetic_df = pd.concat(synthetic_data_list, ignore_index=True)
    balanced_df = pd.concat([df, synthetic_df], ignore_index=True)

# Report the per-ethnicity totals after balancing.
print("\nFinal Ethnicity Counts:", balanced_df['Ethnicity'].value_counts(), sep="\n")

# Write the combined frame to disk without the index column.
balanced_df.to_csv(OUTPUT_FILENAME, index=False)

print(f"\nSuccessfully generated and saved balanced data to {OUTPUT_FILENAME}")

Original Ethnicity Counts:
Ethnicity
0    1278
1     454
3     211
2     206
Name: count, dtype: int64
Majority Class Count: 1278
Generating 824 samples for Ethnicity 1
Generating 1067 samples for Ethnicity 3
Generating 1072 samples for Ethnicity 2

Final Ethnicity Counts:
Ethnicity
0    1278
3    1278
1    1278
2    1278
Name: count, dtype: int64

Successfully generated and saved balanced data to balanced_alzheimers_data.csv
