In [16]:
import pandas as pd
import numpy as np

In [9]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/raw/fairlabs_data.csv")

In [10]:
# Column used as the row key, e.g. to select which rows to drop after
# outlier detection.
KEY_COLUMN_NAME = "encounter_id"

# NOTE(review): the names below are not referenced in the visible cells;
# presumably they are consumed by later analysis/plotting cells -- confirm.
ACTION_COLUMN_NAME = "uds_order_id"
RESULTS_COLUMN_NAME = "cps_reporting_date"
SENSITIVE_COLUMN_NAME = "maternal_race"

# Hex colour string.
COLOR = "#009999"

#### Outlier Handling

In [11]:
def detect_outliers_iqr(data, column, multiplier=1.5, key_column='encounter_id'):
    """
    Detect outliers in one DataFrame column using the Interquartile Range (IQR) method.

    A value is an outlier when it lies strictly below Q1 - multiplier * IQR or
    strictly above Q3 + multiplier * IQR. Missing values are ignored when the
    quartiles are computed and are never reported as outliers.

    Parameters:
        data (DataFrame): DataFrame containing the data.
        column (str): Name of the numeric column to check for outliers.
        multiplier (float): Multiplier for the IQR. Defaults to 1.5.
        key_column (str): Name of the column whose values identify the
            outlier rows. Defaults to 'encounter_id'.

    Returns:
        outliers (list): Outlier values, in row order.
        outliers_encounter_id (list): Values of `key_column` for the same
            rows, aligned one-to-one with `outliers`.
    """
    values = data[column]

    # Calculate the first and third quartiles.
    # Series.quantile skips NaN (and uses the same linear interpolation as
    # np.percentile); np.percentile would return NaN as soon as one value is
    # missing, which turns both bounds into NaN and silently disables detection.
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)

    # Calculate the interquartile range (IQR)
    IQR = Q3 - Q1

    # Define the outlier bounds
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR

    # Build the mask once so the values and their identifiers stay aligned.
    # Comparisons against NaN are False, so missing values are never flagged.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    outliers = values[is_outlier].tolist()
    outliers_encounter_id = data.loc[is_outlier, key_column].tolist()

    return outliers, outliers_encounter_id

In [12]:
def remove_rows_by_column_value(data, column_name, column_values):
    """
    Return a copy of the DataFrame without the rows whose `column_name`
    value appears in `column_values`.

    The shapes before and after the removal are printed. The input
    DataFrame is left unchanged.

    Parameters:
        data (DataFrame): DataFrame containing the data.
        column_name (str): Name of the column to check for values.
        column_values (list): List of values to remove.

    Returns:
        cleaned_data (DataFrame): DataFrame with specified rows removed.
    """
    # True for every row that has to go.
    rows_to_drop = data[column_name].isin(column_values)

    # Copy so that later edits to the result never touch `data`.
    cleaned_data = data.loc[~rows_to_drop].copy()

    print("Dataframe shape before removing rows:", data.shape)
    print("Dataframe shape after removing rows:", cleaned_data.shape)

    return cleaned_data

In [18]:
def find_and_remove_outliers(data=None, column_name="maternal_age", multiplier=1.5):
    """
    Report the IQR outliers of one column and return the data without those rows.

    Each detected outlier is printed together with its encounter ID, then the
    matching rows are removed by their KEY_COLUMN_NAME value.

    Parameters:
        data (DataFrame): DataFrame containing the data. When omitted, the
            notebook-level `df` is used.
        column_name (str): Name of the column to check for outliers.
            Defaults to "maternal_age".
        multiplier (float): Multiplier for the IQR. Defaults to 1.5, the same
            default as detect_outliers_iqr.

    Returns:
        cleaned_data (DataFrame): Copy of `data` with the outlier rows removed.
    """
    # Defaults are plain literals (not the globals `column_name` / `multiplier`)
    # so that this cell can be defined on a fresh kernel, before the cell that
    # assigns those globals has run.
    if data is None:
        # Look the notebook-level frame up at call time, not at definition time.
        data = df

    # Use the arguments that were passed in, not the global frame or a
    # hard-coded column name.
    outliers, outliers_encounter_id = detect_outliers_iqr(data=data, column=column_name, multiplier=multiplier)
    print(f"Outliers detected in the column {column_name} using IQR method with multiplier {multiplier}:")
    for outlier, encounter_id in zip(outliers, outliers_encounter_id):
        print(f"Outlier: {outlier}, Encounter ID: {encounter_id}")

    return remove_rows_by_column_value(data=data, column_name=KEY_COLUMN_NAME, column_values=outliers_encounter_id)

In [19]:
# Drop maternal_age outliers using a wider IQR fence (2.5) than the
# detect_outliers_iqr default of 1.5.
multiplier = 2.5
column_name = "maternal_age"
new_df = find_and_remove_outliers(
    data=df,
    column_name=column_name,
    multiplier=multiplier,
)

Outliers detected in the column maternal_age using IQR method with multiplier 2.5:
Outlier: 123, Encounter ID: encounter_6813
Dataframe shape before removing rows: (6643, 55)
Dataframe shape after removing rows: (6642, 55)
