In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
import sqlite3

In [2]:
# Open a connection to the SQLite database file that a later cell writes the raw table into.
# NOTE(review): no visible cell closes this connection — consider conn.close() once the export is done.
conn = sqlite3.connect('../data/dataset.db')

In [2]:
# Load the raw 2022 public LAR CSV.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type object columns.
df = pd.read_csv("../data/2022_public_lar.csv", low_memory=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
# Summarise the columns and dtypes of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16080210 entries, 0 to 16080209
Data columns (total 99 columns):
 #   Column                                    Dtype  
---  ------                                    -----  
 0   activity_year                             int64  
 1   lei                                       object 
 2   derived_msa_md                            int64  
 3   state_code                                object 
 4   county_code                               float64
 5   census_tract                              float64
 6   conforming_loan_limit                     object 
 7   derived_loan_product_type                 object 
 8   derived_dwelling_category                 object 
 9   derived_ethnicity                         object 
 10  derived_race                              object 
 11  derived_sex                               object 
 12  action_taken                              int64  
 13  purchaser_type                            int64  
 14  

In [4]:
# Persist the frame to SQLite, replacing any existing table with the same name.
# chunksize bounds how many rows are converted and inserted per batch, so the
# whole frame is not written in one single batch.
df.to_sql('2022_public_lar', conn, if_exists='replace', index=False, chunksize=100_000)

In [4]:
# Remove the row limit on displayed output so per-column summaries are shown in full.
pd.set_option('display.max_rows', None)

In [5]:
# Count missing values per column.
df.isnull().sum()

activity_year                                      0
lei                                                0
derived_msa_md                                     0
state_code                                    184147
county_code                                   299660
census_tract                                  347665
conforming_loan_limit                          72197
derived_loan_product_type                          0
derived_dwelling_category                          0
derived_ethnicity                                  0
derived_race                                       0
derived_sex                                        0
action_taken                                       0
purchaser_type                                     0
preapproval                                        0
loan_type                                          0
loan_purpose                                       0
lien_status                                        0
reverse_mortgage                              

In [3]:
def recode_categorical_cols(df, recode_map):
    """Apply per-column value replacements to ``df`` and return it.

    ``recode_map`` maps a column name to a ``{old_value: new_value}`` dict that
    is handed to ``Series.replace``; values without an entry are left as they
    are. The columns are reassigned on the frame that was passed in, so the
    caller's object is modified and that same object is returned.
    """
    for column_name in recode_map:
        replacements = recode_map[column_name]
        df[column_name] = df[column_name].replace(to_replace=replacements)
    return df

In [4]:
# Drop unnecessary, redundant, and sparse columns (based on domain knowledge).
# `unnecessary` is picked by position in the raw 99-column layout; the other
# two groups are listed by name.
unnecessary = df.columns[[*range(47, 77), *range(78, 99), 0, 1, 13, 18]]
redundant = [
    'census_tract',
    'derived_msa_md',
    'county_code',
    'loan_type',
    'lien_status',
    'construction_method',
]
sparse = [
    'discount_points',
    'total_points_and_fees',
    'lender_credits',
    'prepayment_penalty_term',
    'intro_rate_period',
    'multifamily_affordable_units',
]
df = df.drop(columns=unnecessary).drop(columns=redundant + sparse)

In [5]:
# Shape after the column drops.
df.shape

(16080210, 32)

In [6]:
# Drop observations whose demographic fields carry no usable value
# (e.g., Not Available, Free Form Text Only, Joint, or the 8888 age code).
df = df[
    ~(
        df['derived_ethnicity'].isin(['Ethnicity Not Available', 'Free Form Text Only', 'Joint'])
        | df['derived_race'].isin(['Race Not Available', 'Free Form Text Only', 'Joint'])
        | df['derived_sex'].isin(['Sex Not Available', 'Joint'])
        | (df['applicant_age'] == '8888')
    )
]

In [7]:
# Keep only originated (1) and denied (3) applications and derive the binary target.
# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of the previous frame (SettingWithCopyWarning).
df = df[df['action_taken'].isin([1, 3])].copy()
# Vectorised comparison instead of a per-row lambda: 1 = originated, 0 = denied.
df['loan_approved'] = (df['action_taken'] == 1).astype('int64')
df = df.drop(columns=['action_taken'])

In [8]:
# Shape after the row filters and target derivation.
df.shape

(5165521, 32)

In [12]:
# Re-check the remaining columns and their dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5165521 entries, 0 to 16080206
Data columns (total 32 columns):
 #   Column                                    Dtype  
---  ------                                    -----  
 0   state_code                                object 
 1   conforming_loan_limit                     object 
 2   derived_loan_product_type                 object 
 3   derived_dwelling_category                 object 
 4   derived_ethnicity                         object 
 5   derived_race                              object 
 6   derived_sex                               object 
 7   preapproval                               int64  
 8   loan_purpose                              int64  
 9   open_end_line_of_credit                   int64  
 10  business_or_commercial_purpose            int64  
 11  loan_amount                               int64  
 12  combined_loan_to_value_ratio              object 
 13  interest_rate                             object 
 14  r

In [9]:
# Rename certain columns: strip the `derived_` prefix and shorten `applicant_age`.
rename_map = {
    'derived_loan_product_type': 'loan_product_type',
    'derived_dwelling_category': 'dwelling_category',
    'derived_ethnicity': 'ethnicity',
    'derived_race': 'race',
    'derived_sex': 'sex',
    'applicant_age': 'age',
}
df = df.rename(columns=rename_map)

In [10]:
# Columns to convert to a numeric dtype (several show up as object dtype in df.info()).
numeric_cols = ['combined_loan_to_value_ratio', 'interest_rate', 'rate_spread', 'origination_charges', 'loan_term', 'property_value', 'total_units', 'total_loan_costs']

# errors='coerce' turns every value that cannot be parsed as a number into NaN.
# NOTE(review): the column notes list bucketed values such as '5-24' for total_units —
# those would become NaN here; confirm that is intended.
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [11]:
# Replace values before one-hot encoding
# Each entry maps a column name to {raw value: replacement}; values not listed stay unchanged.
recode_map = {
    'preapproval': {2: 0},
    'loan_purpose': {1: 'Purchase', 2: 'Improvement', 31: 'Refinancing', 32: 'Cash-out', 4: 'Other', 5: 'N/a'},
    'hoepa_status': {1: 'Yes', 2: 'No', 3: 'N/a'},
    'occupancy_type': {1: 'Principal', 2: 'Second', 3: 'Investment'},
    'manufactured_home_land_property_interest': {1: 'Direct', 2: 'Indirect', 3: 'Paid', 4: 'Unpaid', 5: 'N/a', 1111: 'Exempt'},
    'debt_to_income_ratio': {'39.0': '39', '38.0': '38'},
    'open_end_line_of_credit': {1: 'Yes', 2: 'No', 1111: 'Exempt'}
}
df = recode_categorical_cols(df, recode_map)
# Store the 0/1 preapproval flag with object dtype.
df['preapproval'] = df['preapproval'].astype(object)

In [12]:
# Separate the binary target from the feature columns.
y = df['loan_approved']
X = df.drop(columns=['loan_approved'])

In [19]:
# Save the full feature matrix as CSV without the index column.
X.to_csv("../data/X.csv", index=False)

In [None]:
# Save the full target column as CSV without the index column.
y.to_csv("../data/y.csv", index=False)

In [40]:
# Draw one set of 100,000 row positions and apply it to both X and y, so the
# two samples are aligned by construction rather than by relying on two
# separate .sample() calls producing the same draw.
sampled_positions = pd.Series(np.arange(len(X))).sample(100000, random_state=334).to_numpy()
X_sampled = X.iloc[sampled_positions].reset_index(drop=True)
y_sampled = y.iloc[sampled_positions].reset_index(drop=True)

In [41]:
# Hold out 30% of the sampled rows as the test split, with a fixed seed.
# NOTE(review): the split is not stratified on the target — confirm that is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X_sampled, y_sampled, test_size=0.3, random_state=334)

In [42]:
# Give each split a fresh 0..n-1 index, discarding the shuffled one from the split.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True) for part in (X_train, X_test, y_train, y_test)
)

In [17]:
# Save the training features before imputation, scaling and encoding.
X_train.to_csv("../data/X_train.csv", index=False)

In [18]:
# Save the test features before imputation, scaling and encoding.
X_test.to_csv("../data/X_test.csv", index=False)

In [19]:
# Save the training target.
y_train.to_csv("../data/y_train.csv", index=False)

In [20]:
# Save the test target.
y_test.to_csv("../data/y_test.csv", index=False)

In [43]:
def imputing(X_train, X_test):
    """Fill missing values column by column using statistics from the training split.

    ``float64`` columns are filled with the training median; every other column
    is filled with the training mode (most frequent value). The imputer is
    fitted on ``X_train`` and then applied to ``X_test``.

    Args:
        X_train: Training features (DataFrame).
        X_test: Test features containing the same columns as ``X_train``.

    Returns:
        Tuple ``(X_train, X_test)`` of imputed copies; the frames passed in are
        not modified.
    """
    # Work on copies so the caller's frames are left untouched.
    X_train, X_test = X_train.copy(), X_test.copy()
    imputer_mode = SimpleImputer(strategy='most_frequent')
    imputer_median = SimpleImputer(strategy='median')
    for col in X_train.columns:
        # Median only for floating point columns; mode for everything else.
        imputer = imputer_median if X_train[col].dtype == 'float64' else imputer_mode
        # The imputer returns an (n, 1) array; ravel() flattens it to one column.
        X_train[col] = imputer.fit_transform(X_train[[col]]).ravel()
        X_test[col] = imputer.transform(X_test[[col]]).ravel()

    return X_train, X_test

In [44]:
# Impute both splits; fill values come from the training split.
X_train, X_test = imputing(X_train, X_test)

In [45]:
def scaling(X_train, X_test, numeric_cols=None):
    """Standardise numeric columns using statistics from the training split.

    Each listed column is scaled with a ``StandardScaler`` fitted on
    ``X_train`` and the same fitted transform is applied to ``X_test``.

    Args:
        X_train: Training features (DataFrame).
        X_test: Test features containing the same columns as ``X_train``.
        numeric_cols: Optional list of column names to scale. Defaults to the
            notebook's fixed list of numeric loan columns.

    Returns:
        Tuple ``(X_train, X_test)`` of scaled copies; the frames passed in are
        not modified.

    Raises:
        KeyError: If a listed column is missing from either frame.
    """
    if numeric_cols is None:
        numeric_cols = ['loan_amount', 'combined_loan_to_value_ratio', 'interest_rate', 'rate_spread',
                        'total_loan_costs', 'origination_charges', 'loan_term', 'property_value', 'total_units', 'income']
    # Work on copies so the caller's frames are left untouched.
    X_train, X_test = X_train.copy(), X_test.copy()
    scaler = StandardScaler()
    for col in numeric_cols:
        # Refit per column on the training data; ravel() flattens the (n, 1) result.
        X_train[col] = scaler.fit_transform(X_train[[col]]).ravel()
        X_test[col] = scaler.transform(X_test[[col]]).ravel()

    return X_train, X_test

In [46]:
# Standardise the numeric columns of both splits using training-split statistics.
X_train, X_test = scaling(X_train, X_test)

In [49]:
def select_features(X_train, X_test, y_train):
    """Drop features flagged by a correlation screen computed on the training split.

    Absolute Pearson correlations are computed on the numeric/bool columns of
    ``X_train`` joined column-wise with ``y_train``, and two rules are applied:

    * a feature whose correlation with the target exceeds 0.8 is dropped;
    * for every feature pair whose mutual correlation exceeds 0.9, the feature
      with the weaker target correlation is dropped (on a tie, the earlier one).

    Non-numeric columns take no part in the screen and are always kept.

    Args:
        X_train: Training features (DataFrame).
        X_test: Test features containing the same columns as ``X_train``.
        y_train: Training target, row-aligned with ``X_train`` by index and
            expected to be numeric so that it is the last column of the
            correlation matrix.

    Returns:
        Tuple ``(X_train, X_test)`` with the flagged columns removed from both.
    """
    train_data = pd.concat([X_train, y_train], axis=1)
    # Select numeric/bool columns explicitly: DataFrame.corr() no longer skips
    # object columns silently in pandas >= 2.0.
    corr_matrix = train_data.select_dtypes(include=['number', 'bool']).corr()
    feature_names = corr_matrix.columns[:-1]  # the target is the last column
    target_corr = corr_matrix.iloc[:-1, -1].abs()
    col_to_drop = set(target_corr[target_corr > 0.8].index.tolist())

    n_features = len(feature_names)
    for i in range(n_features):
        for j in range(i + 1, n_features):
            if abs(corr_matrix.iloc[i, j]) > 0.9:
                # target_corr is label-indexed, so positions must go through .iloc
                if target_corr.iloc[i] > target_corr.iloc[j]:
                    col_to_drop.add(feature_names[j])
                else:
                    col_to_drop.add(feature_names[i])

    X_train = X_train.drop(columns=list(col_to_drop))
    X_test = X_test.drop(columns=list(col_to_drop))

    return X_train, X_test

In [51]:
# First correlation screen, run before one-hot encoding.
X_train, X_test = select_features(X_train, X_test, y_train)

In [52]:
# Inspect the training features that remain after the correlation screen.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 26 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   state_code                                70000 non-null  object 
 1   conforming_loan_limit                     70000 non-null  object 
 2   loan_product_type                         70000 non-null  object 
 3   dwelling_category                         70000 non-null  object 
 4   ethnicity                                 70000 non-null  object 
 5   race                                      70000 non-null  object 
 6   sex                                       70000 non-null  object 
 7   preapproval                               70000 non-null  object 
 8   loan_purpose                              70000 non-null  object 
 9   open_end_line_of_credit                   70000 non-null  object 
 10  loan_amount                       

In [53]:
# Cast this coded column to object dtype in both splits.
# NOTE(review): presumably so the one-hot encoding step treats it as categorical — confirm.
X_train['manufactured_home_secured_property_type'] = X_train['manufactured_home_secured_property_type'].astype(object)
X_test['manufactured_home_secured_property_type'] = X_test['manufactured_home_secured_property_type'].astype(object)

In [54]:
def encoding(X_train, X_test, categorical_cols=None):
    """One-hot encode categorical columns so both splits share the same columns.

    The selected columns are passed to ``pd.get_dummies`` separately for each
    split. The two results are then aligned on the union of their columns,
    with 0 filled in for a category that occurs in only one split, and the
    encoded columns replace the original ones.

    Args:
        X_train: Training features (DataFrame).
        X_test: Test features containing the same columns as ``X_train``.
        categorical_cols: Optional list of column names to encode. When
            omitted, the notebook's positional selection is used, which
            assumes the 26-column layout produced by the earlier cells.

    Returns:
        Tuple ``(X_train, X_test)`` of new frames: the untouched columns
        followed by the aligned one-hot columns.
    """
    if categorical_cols is None:
        # Positions in the 26-column frame; position 7 is deliberately absent.
        positions = list(range(0, 7)) + [8, 9, 14, 19, 20, 21, 24, 25]
        categorical_names = X_train.columns[positions]
    else:
        categorical_names = pd.Index(categorical_cols)

    X_train_encoded = pd.get_dummies(X_train[categorical_names])
    X_test_encoded = pd.get_dummies(X_test[categorical_names])

    # Outer join on columns: each split gains any dummy column it lacks, filled with 0.
    X_train_encoded, X_test_encoded = X_train_encoded.align(X_test_encoded, join='outer', axis=1, fill_value=0)

    X_train = X_train.drop(columns=categorical_names)
    X_test = X_test.drop(columns=categorical_names)

    X_train = pd.concat([X_train, X_train_encoded], axis=1)
    X_test = pd.concat([X_test, X_test_encoded], axis=1)

    return X_train, X_test

In [55]:
# One-hot encode the categorical columns of both splits.
X_train, X_test = encoding(X_train, X_test)

In [56]:
# Second correlation screen, now that the one-hot columns exist.
X_train, X_test = select_features(X_train, X_test, y_train)

In [59]:
# Save the fully preprocessed training features.
X_train.to_csv("../data/preprocessed_X_train.csv", index=False)

In [60]:
# Save the fully preprocessed test features.
X_test.to_csv("../data/preprocessed_X_test.csv", index=False)

In [61]:
# Save the training target next to the preprocessed features.
y_train.to_csv("../data/preprocessed_y_train.csv", index=False)

In [62]:
# Save the test target next to the preprocessed features.
y_test.to_csv("../data/preprocessed_y_test.csv", index=False)

In [26]:
# Privileged: 1, Unprivileged: 0
# Vectorised comparisons replace the per-row lambdas. Missing values compare as
# False, so they map to 0 for race/ethnicity/sex and to 1 for age, as before.
# NOTE(review): this cell is not idempotent — re-running it on the already
# binarised columns maps every race/ethnicity/sex value to 0.
df['race'] = (df['race'] == 'White').astype('int64')
df['ethnicity'] = (df['ethnicity'] == 'Not Hispanic or Latino').astype('int64')
df['sex'] = (df['sex'] == 'Male').astype('int64')
df['age'] = (~df['age'].isin(['<25', '>74'])).astype('int64')

In [27]:
# Check dtypes after binarising the protected attributes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5165521 entries, 0 to 16080206
Data columns (total 25 columns):
 #   Column                                    Dtype  
---  ------                                    -----  
 0   state_code                                object 
 1   conforming_loan_limit                     object 
 2   loan_product_type                         object 
 3   dwelling_category                         object 
 4   ethnicity                                 int64  
 5   race                                      int64  
 6   sex                                       int64  
 7   preapproval                               int64  
 8   loan_purpose                              int64  
 9   loan_amount                               int64  
 10  combined_loan_to_value_ratio              float64
 11  interest_rate                             float64
 12  rate_spread                               float64
 13  hoepa_status                              int64  
 14  t

In [28]:
# Recode coded values to labels; each entry maps a column name to {raw value: replacement}.
# NOTE(review): this repeats the earlier recode_map without the 'open_end_line_of_credit'
# entry — consider defining the mapping once and reusing it.
recode_map = {
    'preapproval': {2: 0},
    'loan_purpose': {1: 'Purchase', 2: 'Improvement', 31: 'Refinancing', 32: 'Cash-out', 4: 'Other', 5: 'N/a'},
    'hoepa_status': {1: 'Yes', 2: 'No', 3: 'N/a'},
    'occupancy_type': {1: 'Principal', 2: 'Second', 3: 'Investment'},
    'manufactured_home_land_property_interest': {1: 'Direct', 2: 'Indirect', 3: 'Paid', 4: 'Unpaid', 5: 'N/a', 1111: 'Exempt'},
    'debt_to_income_ratio': {'39.0': '39', '38.0': '38'}
    }
df = recode_categorical_cols(df, recode_map)
# Store the 0/1 preapproval flag with object dtype.
df['preapproval'] = df['preapproval'].astype(object)

In [29]:
# Save the recoded frame; scaling and one-hot encoding happen in later cells.
df.to_csv("../data/preprocessed_2022_public_lar.csv", index=False)

In [30]:
# Standardise the numeric columns of the full frame, one column at a time.
# NOTE(review): the scaler is fitted on all rows here, whereas scaling() fits on the
# training split only — confirm that is intended for this export.
scaler = StandardScaler()
numeric_cols = ['loan_amount', 'combined_loan_to_value_ratio', 'interest_rate', 'rate_spread', 'total_loan_costs', 'origination_charges', 'loan_term', 'property_value', 'total_units', 'income']
for col in numeric_cols:
    df[col] = scaler.fit_transform(df[[col]])

In [31]:
# Check dtypes after recoding and scaling.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5165521 entries, 0 to 16080206
Data columns (total 25 columns):
 #   Column                                    Dtype  
---  ------                                    -----  
 0   state_code                                object 
 1   conforming_loan_limit                     object 
 2   loan_product_type                         object 
 3   dwelling_category                         object 
 4   ethnicity                                 int64  
 5   race                                      int64  
 6   sex                                       int64  
 7   preapproval                               object 
 8   loan_purpose                              object 
 9   loan_amount                               float64
 10  combined_loan_to_value_ratio              float64
 11  interest_rate                             float64
 12  rate_spread                               float64
 13  hoepa_status                              object 
 14  t

In [32]:
# One-hot encode the categorical columns, selected by position in the current frame.
categorical_cols = list(range(0, 4)) + [8, 13, 18, 19, 22]
encoded = pd.get_dummies(df.iloc[:, categorical_cols])
# Give both frames a fresh 0..n-1 index so the column-wise concat pairs rows by position.
df = df.reset_index(drop=True)
encoded = encoded.reset_index(drop=True)
df = pd.concat([df, encoded], axis=1)
# Drop the source columns; the concat appended to the right, so their positions are unchanged.
df = df.drop(df.columns[categorical_cols], axis=1)

In [33]:
# Remove the row limit on displayed output (same option as set in an earlier cell).
pd.set_option('display.max_rows', None)

In [34]:
# List every column name after one-hot encoding.
df.columns.tolist()

['ethnicity',
 'race',
 'sex',
 'preapproval',
 'loan_amount',
 'combined_loan_to_value_ratio',
 'interest_rate',
 'rate_spread',
 'total_loan_costs',
 'origination_charges',
 'loan_term',
 'property_value',
 'total_units',
 'income',
 'age',
 'loan_approved',
 'state_code_AK',
 'state_code_AL',
 'state_code_AR',
 'state_code_AZ',
 'state_code_CA',
 'state_code_CO',
 'state_code_CT',
 'state_code_DC',
 'state_code_DE',
 'state_code_FL',
 'state_code_GA',
 'state_code_GU',
 'state_code_HI',
 'state_code_IA',
 'state_code_ID',
 'state_code_IL',
 'state_code_IN',
 'state_code_KS',
 'state_code_KY',
 'state_code_LA',
 'state_code_MA',
 'state_code_MD',
 'state_code_ME',
 'state_code_MI',
 'state_code_MN',
 'state_code_MO',
 'state_code_MS',
 'state_code_MT',
 'state_code_NC',
 'state_code_ND',
 'state_code_NE',
 'state_code_NH',
 'state_code_NJ',
 'state_code_NM',
 'state_code_NV',
 'state_code_NY',
 'state_code_OH',
 'state_code_OK',
 'state_code_OR',
 'state_code_PA',
 'state_code_PR',
 

In [18]:
# Save the scaled, one-hot encoded frame.
df.to_csv("../data/preprocessed_data.csv", index=False)

In [3]:
# Reload the preprocessed frame from disk.
df = pd.read_csv("../data/preprocessed_data.csv")

In [35]:
# Draw a reproducible random sample of 100,000 rows.
sampled_df = df.sample(100000, random_state=334)

In [36]:
# Replace the sampled row labels with a fresh 0..n-1 index.
sampled_df = sampled_df.reset_index(drop=True)

In [37]:
# Save the sampled, preprocessed frame.
sampled_df.to_csv("../data/sampled_preprocessed_data.csv", index=False)

## Columns
* `activity_year`: The calendar year the data submission covers
* `lei`: A financial institution’s Legal Entity Identifier
* `derived_msa_md`: The 5 digit derived MSA (metropolitan statistical area) or MD (metropolitan division) code. An MSA/MD is an area that has at least one urbanized area of 50,000 or more population.
* `state_code`: Two-letter state code
* `county_code`: State-county FIPS code
* `census_tract`: 11 digit census tract number
* `derived_loan_product_type`: Derived loan product type from Loan Type and Lien Status fields for easier querying of specific records
    - Conventional:First Lien
    - FHA:First Lien
    - VA:First Lien
    - FSA/RHS:First Lien
    - Conventional:Subordinate Lien
    - FHA:Subordinate Lien
    - VA:Subordinate Lien
    - FSA/RHS:Subordinate Lien
* `derived_dwelling_category`: Derived dwelling type from Construction Method and Total Units fields for easier querying of specific records
    - Single Family (1-4 Units):Site-Built
    - Multifamily:Site-Built (5+ Units)
    - Single Family (1-4 Units):Manufactured
    - Multifamily:Manufactured (5+ Units)
* `conforming_loan_limit`: Indicates whether the reported loan amount exceeds the GSE (government sponsored enterprise) conforming loan limit
    - C (Conforming)
    - NC (Nonconforming)
    - U (Undetermined)
    - NA (Not Applicable)
* `derived_ethnicity`: Single aggregated ethnicity categorization derived from applicant/borrower and co-applicant/co-borrower ethnicity fields
    - Hispanic or Latino
    - Not Hispanic or Latino
    - Joint
    - Ethnicity Not Available
    - Free Form Text Only
* `derived_race`: Single aggregated race categorization derived from applicant/borrower and co-applicant/co-borrower race fields
    - American Indian or Alaska Native
    - Asian
    - Black or African American
    - Native Hawaiian or Other Pacific Islander
    - White
    - 2 or more minority races
    - Joint
    - Free Form Text Only
    - Race Not Available
* `derived_sex`: Single aggregated sex categorization derived from applicant/borrower and co-applicant/co-borrower sex fields
    - Male
    - Female
    - Joint
    - Sex Not Available
* `action_taken`: The action taken on the covered loan or application
    - 1 - Loan originated
    - 2 - Application approved but not accepted
    - 3 - Application denied
    - 4 - Application withdrawn by applicant
    - 5 - File closed for incompleteness
    - 6 - Purchased loan
    - 7 - Preapproval request denied
    - 8 - Preapproval request approved but not accepted
* `purchaser_type`: Type of entity purchasing a covered loan from the institution
    - 0 - Not applicable
    - 1 - Fannie Mae
    - 2 - Ginnie Mae
    - 3 - Freddie Mac
    - 4 - Farmer Mac
    - 5 - Private securitizer
    - 6 - Commercial bank, savings bank, or savings association
    - 71 - Credit union, mortgage company, or finance company
    - 72 - Life insurance company
    - 8 - Affiliate institution
    - 9 - Other type of purchaser
* `preapproval`: Whether the covered loan or application involved a request for a preapproval of a home purchase loan under a preapproval program
    - 1 - Preapproval requested
    - 2 - Preapproval not requested
* `loan_type`: The type of covered loan or application
    - 1 - Conventional (not insured or guaranteed by FHA, VA, RHS, or FSA)
    - 2 - Federal Housing Administration insured (FHA)
    - 3 - Veterans Affairs guaranteed (VA)
    - 4 - USDA Rural Housing Service or Farm Service Agency guaranteed (RHS or FSA)
* `loan_purpose`: The purpose of the covered loan or application
    - 1 - Home purchase
    - 2 - Home improvement
    - 31 - Refinancing
    - 32 - Cash-out refinancing
    - 4 - Other purpose
    - 5 - Not applicable
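* `lien_status`: Lien status of the property securing the covered loan or, in the case of an application, proposed to secure the covered loan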
    - 1 - Secured by a first lien
    - 2 - Secured by a subordinate lien
* `reverse_mortgage`: Whether the covered loan or application is for a reverse mortgage
    - 1 - Reverse mortgage
    - 2 - Not a reverse mortgage
    - 1111 - Exempt
* `open_end_line_of_credit`: Whether the covered loan or application is for an open-end line of credit
    - 1 - Open-end line of credit
    - 2 - Not an open-end line of credit
    - 1111 - Exempt
* `business_or_commercial_purpose`: Whether the covered loan or application is primarily for a business or commercial purpose
    - 1 - Primarily for a business or commercial purpose
    - 2 - Not primarily for a business or commercial purpose
    - 1111 - Exempt
* `loan_amount`: The amount of the covered loan, or the amount applied for
* `combined_loan_to_value_ratio`: The ratio of the total amount of debt secured by the property to the value of the property relied on in making the credit decision
* `interest_rate`: The interest rate for the covered loan or application
* `rate_spread`: The difference between the covered loan’s annual percentage rate (APR) and the average prime offer rate (APOR) for a comparable transaction as of the date the interest rate is set
* `hoepa_status`: Whether the covered loan is a high-cost mortgage
    - 1 - High-cost mortgage
    - 2 - Not a high-cost mortgage
    - 3 - Not applicable
* `total_loan_costs`: The amount, in dollars, of total loan costs
* `total_points_and_fees`: The total points and fees, in dollars, charged in connection with the covered loan
* `origination_charges`: The total of all itemized amounts, in dollars, that are designated borrower-paid at or before closing
* `discount_points`: The points paid, in dollars, to the creditor to reduce the interest rate
* `lender_credits`: The amount, in dollars, of lender credits
* `loan_term`: The number of months after which the legal obligation will mature or terminate, or would have matured or terminated
* `prepayment_penalty_term`: The term, in months, of any prepayment penalty
* `intro_rate_period`: The number of months, or proposed number of months in the case of an application, until the first date the interest rate may change after closing or account opening
* `negative_amortization`: Whether the contractual terms include, or would have included, a term that would cause the covered loan to be a negative amortization loan
    - 1 - Negative amortization
    - 2 - No negative amortization
    - 1111 - Exempt
* `interest_only_payment`: Whether the contractual terms include, or would have included, interest-only payments
    - 1 - Interest-only payments
    - 2 - No interest-only payments
    - 1111 - Exempt
* `balloon_payment`: Whether the contractual terms include, or would have included, a balloon payment
    - 1 - Balloon payment
    - 2 - No balloon payment
    - 1111 - Exempt
* `other_nonamortizing_features`: Whether the contractual terms include, or would have included, any term, other than those described in Paragraphs 1003.4(a)(27)(i), (ii), and (iii) that would allow for payments other than fully amortizing payments during the loan term
    - 1 - Other non-fully amortizing features
    - 2 - No other non-fully amortizing features
    - 1111 - Exempt
* `property_value`: The value of the property securing the covered loan or, in the case of an application, proposed to secure the covered loan, relied on in making the credit decision
* `construction_method`: Construction method for the dwelling
    - 1 - Site-built
    - 2 - Manufactured home
* `occupancy_type`: Occupancy type for the dwelling
    - 1 - Principal residence
    - 2 - Second residence
    - 3 - Investment property
* `manufactured_home_secured_property_type`: Whether the covered loan or application is, or would have been, secured by a manufactured home and land, or by a manufactured home and not land
    - 1 - Manufactured home and land
    - 2 - Manufactured home and not land
    - 3 - Not applicable
    - 1111 - Exempt
* `manufactured_home_land_property_interest`: The applicant’s or borrower’s land property interest in the land on which a manufactured home is, or will be, located
    - 1 - Direct ownership
    - 2 - Indirect ownership
    - 3 - Paid leasehold
    - 4 - Unpaid leasehold
    - 5 - Not applicable
    - 1111 - Exempt
* `total_units`: The number of individual dwelling units related to the property securing the covered loan or, in the case of an application, proposed to secure the covered loan
    - 1
    - 2
    - 3
    - 4
    - 5-24
    - 25-49
    - 50-99
    - 100-149
    - \>149
* `applicant_age`: The age of the applicant
    - <25
    - 25-34
    - 35-44
    - 45-54 
    - 55-64
    - 65-74
    - \>74
    - 8888
* `multifamily_affordable_units`: Reported values as a percentage, rounded to the nearest whole number, of the value reported for Total Units 
* `income`: The gross annual income, in thousands of dollars, relied on in making the credit decision, or if a credit decision was not made, the gross annual income relied on in processing the application
* `debt_to_income_ratio`: The ratio, as a percentage, of the applicant’s or borrower’s total monthly debt to the total monthly income relied on in making the credit decision
    - <20%
    - 20%-<30%
    - 30%-<36%
    - 36%
    - 37%
    - 38%
    - 39%
    - 40%
    - 41%
    - 42%
    - 43%
    - 44%
    - 45%
    - 46%
    - 47%
    - 48%
    - 49%
    - 50%-60%
    - \>60%
    - NA
    - Exempt


## Steps
1. Identify Protected Attributes (e.g., race, age, gender, income level, etc.)
2. Data Cleaning (e.g., handling missing values, removing duplicates, and correcting errors)
3. Feature Selection
4. Data Transformation (e.g., standardization, one-hot encoding, etc.)
5. Exploratory Data Analysis (e.g., summary, visualization, bias exploration)
6. Data Splitting
7. Model Selection (e.g., logistic regression, decision trees, random forest, XGBoost, neural networks)
8. Hyperparameter Tuning
9. Model Evaluation
10. Suggestion on Bias Mitigation