In [13]:
import os
import pandas as pd
import matplotlib.pyplot as plt

### Balanced Dataset

In [18]:
# Parent of the current working directory; this assumes the notebook is
# started from a folder one level below the project root.
base_dir = os.path.split(os.getcwd())[0]

# Target number of rows in the balanced dataset.
total_count = 5000

# Source CSV and destination CSV, both under "<base_dir>/01 Data".
input_file = os.path.join(base_dir, "01 Data", "Master_Data_Preprocessed.csv")
output_file = os.path.join(
    base_dir, "01 Data", "Derived Datasets", "balanced_dataset.csv"
)

def create_balanced_dataset(input_file, total_count, output_file):
    """Write a 50/50 class-balanced sample of a CSV to ``output_file``.

    Rows are split on the ``Binary_credit_mix`` column (1 = 'Good',
    0 = 'Bad') and ``total_count // 2`` rows are drawn from each class, so
    an odd ``total_count`` produces ``total_count - 1`` rows. A class with
    fewer rows than requested is sampled with replacement, i.e. some of its
    rows are duplicated. Sampling and shuffling use ``random_state=42``, so
    repeated runs on the same input give the same output.

    Args:
        input_file: Path of the CSV to read; must contain a
            ``Binary_credit_mix`` column.
        total_count: Desired number of rows in the output dataset.
        output_file: Destination for the CSV. When it is a path, missing
            parent directories are created first.

    Raises:
        ValueError: If rows are requested from a class that has none.
    """
    # Load the data
    df = pd.read_csv(input_file)

    # Separate 'Good' and 'Bad' rows
    good_rows = df[df['Binary_credit_mix'] == 1]
    bad_rows = df[df['Binary_credit_mix'] == 0]

    # Each class contributes half of the requested rows
    half_count = total_count // 2
    # Print counts for debugging
    print(f"Good rows: {len(good_rows)}, Bad rows: {len(bad_rows)}")

    # Sampling with replacement cannot invent rows for an empty class; fail
    # with an explicit message instead of the low-level sampling error.
    for label, rows in (("Good", good_rows), ("Bad", bad_rows)):
        if half_count > 0 and rows.empty:
            raise ValueError(
                f"Cannot sample {half_count} '{label}' rows: "
                f"the input contains no '{label}' rows."
            )

    # Sample from both groups; oversample (with replacement) only when a
    # class is too small to supply half_count distinct rows
    good_sample = good_rows.sample(
        n=half_count, random_state=42, replace=len(good_rows) < half_count
    )
    bad_sample = bad_rows.sample(
        n=half_count, random_state=42, replace=len(bad_rows) < half_count
    )

    # Combine and shuffle
    balanced_df = (
        pd.concat([good_sample, bad_sample])
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

    # to_csv does not create directories, so make sure the target folder
    # exists. Non-path targets (e.g. open buffers) are passed through as-is.
    if isinstance(output_file, (str, os.PathLike)):
        out_dir = os.path.dirname(output_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # Save to CSV
    balanced_df.to_csv(output_file, index=False)

# Build the balanced dataset from the paths and row count configured above.
create_balanced_dataset(
    input_file=input_file,
    total_count=total_count,
    output_file=output_file,
)


Good rows: 10212, Bad rows: 21534


### Unbalanced Datasets

In [19]:
# Settings for the unbalanced dataset.
# base_dir is the parent of the current working directory.
base_dir = os.path.split(os.getcwd())[0]
total_count = 10000  # Target number of rows in the unbalanced dataset

# Source CSV and destination CSV, both under "<base_dir>/01 Data".
input_file = os.path.join(base_dir, "01 Data", "Master_Data_Preprocessed.csv")
unbalanced_file = os.path.join(
    base_dir, "01 Data", "Derived Datasets", "derived_imbalanced_2.csv"
)


def create_unbalanced_dataset(input_file, total_count, output_file, good_ratio=0.35):
    """Write a class-imbalanced sample of a CSV to ``output_file``.

    Rows are split on the ``Binary_credit_mix`` column (1 = 'Good',
    0 = 'Bad'). ``int(total_count * good_ratio)`` 'Good' rows are drawn and
    the remainder of ``total_count`` is filled with 'Bad' rows. A class with
    fewer rows than requested is sampled with replacement, i.e. some of its
    rows are duplicated. Sampling and shuffling use ``random_state=42``, so
    repeated runs on the same input give the same output.

    Args:
        input_file: Path of the CSV to read; must contain a
            ``Binary_credit_mix`` column.
        total_count: Number of rows in the output dataset.
        output_file: Destination for the CSV. When it is a path, missing
            parent directories are created first.
        good_ratio: Fraction of the output made up of 'Good' rows, between
            0 and 1 inclusive. Defaults to 0.35 (a 35/65 split).

    Raises:
        ValueError: If ``good_ratio`` is outside [0, 1], or rows are
            requested from a class that has none.
    """
    if not 0 <= good_ratio <= 1:
        raise ValueError(f"good_ratio must be between 0 and 1, got {good_ratio}")

    # Load the data
    df = pd.read_csv(input_file)

    # Separate 'Good' and 'Bad' rows
    good_rows = df[df['Binary_credit_mix'] == 1]
    bad_rows = df[df['Binary_credit_mix'] == 0]

    # 'Good' gets the truncated share; 'Bad' takes whatever is left so the
    # two counts always add up to total_count
    good_credit = int(total_count * good_ratio)
    bad_credit = total_count - good_credit

    # Print counts for debugging
    print(f"Available Good rows: {len(good_rows)}, Requested: {good_credit}")
    print(f"Available Bad rows: {len(bad_rows)}, Requested: {bad_credit}")

    # Sampling with replacement cannot invent rows for an empty class; fail
    # with an explicit message instead of the low-level sampling error.
    for label, rows, requested in (
        ("Good", good_rows, good_credit),
        ("Bad", bad_rows, bad_credit),
    ):
        if requested > 0 and rows.empty:
            raise ValueError(
                f"Cannot sample {requested} '{label}' rows: "
                f"the input contains no '{label}' rows."
            )

    # Sample from both groups; oversample (with replacement) only when a
    # class is too small to supply the requested number of distinct rows
    good_sample = good_rows.sample(
        n=good_credit, random_state=42, replace=len(good_rows) < good_credit
    )
    bad_sample = bad_rows.sample(
        n=bad_credit, random_state=42, replace=len(bad_rows) < bad_credit
    )

    # Combine and shuffle
    unbalanced_df = (
        pd.concat([good_sample, bad_sample])
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

    # to_csv does not create directories, so make sure the target folder
    # exists. Non-path targets (e.g. open buffers) are passed through as-is.
    if isinstance(output_file, (str, os.PathLike)):
        out_dir = os.path.dirname(output_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # Save to CSV
    unbalanced_df.to_csv(output_file, index=False)
    print(f"Unbalanced dataset saved to {output_file}")

# Build the unbalanced dataset from the paths and row count configured above.
create_unbalanced_dataset(
    input_file=input_file,
    total_count=total_count,
    output_file=unbalanced_file,
)

Available Good rows: 10212, Requested: 3500
Available Bad rows: 21534, Requested: 6500
Unbalanced dataset saved to /Users/mdelaluz/Documents/MSC KIDS UCL/Foundations of Machine Learning/Assessment/ML-Assignment/01 Data/Derived Datasets/derived_imbalanced_2.csv
