In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

### Balanced Dataset

In [3]:
def create_balanced_dataset(input_file, total_count, output_file):
    # Load the data
    df = pd.read_csv(input_file)
    
    # Separate 'Good' and 'Bad' rows
    good_rows = df[df['Binary_credit_mix'] == '1']
    bad_rows = df[df['Binary_credit_mix'] == '0']
    
    # Determine the number of samples from each group
    half_count = total_count // 2
    # Print counts for debugging
    print(f"Good rows: {len(good_rows)}, Bad rows: {len(bad_rows)}")

    # Sample from both groups
    good_sample = good_rows.sample(n=half_count, random_state=42, replace=len(good_rows) < half_count)
    bad_sample = bad_rows.sample(n=half_count, random_state=42, replace=len(bad_rows) < half_count)
    
    # Combine and shuffle
    balanced_df = pd.concat([good_sample, bad_sample]).sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Save to CSV
    balanced_df.to_csv(output_file, index=False)

# Example usage
total_count = 5000  # Total rows in the balanced dataset
#base_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))  #Works in Jupyter Notebook
input_file = "/Users/mdelaluz/Documents/MSC KIDS UCL/Foundations of Machine Learning/Assessment/ML-Assignment/01 Data/Master_Data_Preprocessed.csv"
output_file = "/Users/mdelaluz/Documents/MSC KIDS UCL/Foundations of Machine Learning/Assessment/ML-Assignment/01 Data/Derived Datasets/balanced_dataset.csv"
create_balanced_dataset(input_file, total_count, output_file)
print(f"Ruta del archivo de salida: {output_file}")

Good rows: 0, Bad rows: 0


ValueError: a must be greater than 0 unless no samples are taken

### Unbalanced Dataset

In [None]:
def create_unbalanced_dataset(input_file, total_count, output_file):
    # Load the data
    df = pd.read_csv(input_file)
    
    # Separate 'Good' and 'Bad' rows
    good_rows = df[df['Binary_credit_mix'] == '1']
    bad_rows = df[df['Binary_credit_mix'] == '0']
    
    # Determine the number of samples from each group
    good_credit = int(total_count * 0.15)
    bad_credit = total_count - good_credit

    # Print counts for debugging
    print(f"Available Good rows: {len(good_rows)}, Requested: {good_credit}")
    print(f"Available Bad rows: {len(bad_rows)}, Requested: {bad_credit}")


    # Sample from both groups
    good_sample = good_rows.sample(n=good_credit, random_state=42, replace=len(good_rows) < good_credit)
    bad_sample = bad_rows.sample(n=bad_credit, random_state=42, replace=len(bad_rows) < bad_credit)

    # Combine and shuffle
    unbalanced_df = pd.concat([good_sample, bad_sample]).sample(frac=1, random_state=42).reset_index(drop=True)
    # Save to CSV
    unbalanced_df.to_csv(output_file, index=False)
    print(f"Unbalanced dataset saved to {output_file}")

# Example usage
total_count = 10000  # Total rows in the dataset
current_dir = os.path.dirname(os.path.abspath(__file__))
input_file = os.path.join(current_dir, "../01 Data/Master_Data_Preprocessed.csv")
unbalanced_file = "/Users/mdelaluz/Documents/MSC KIDS UCL/Foundations of Machine Learning/Assessment/Database/unbalanced_dataset_3.csv"

create_unbalanced_dataset(input_file, total_count, unbalanced_file)