# DATA ACQUISITION

In [None]:
import pandas as pd

# Location of the CSV extract, relative to the notebook's working directory.
filePath = './state_MA.csv'

# Columns are grouped by the dtype pandas should parse them as; the groups are
# merged into the single `columnDataTypes` mapping handed to read_csv below.
integerColumns = [
    'activity_year', 'derived_msa-md', 'action_taken', 'purchaser_type', 'preapproval',
    'loan_type', 'loan_purpose', 'lien_status', 'reverse_mortgage', 'open-end_line_of_credit',
    'business_or_commercial_purpose', 'hoepa_status', 'negative_amortization',
    'interest_only_payment', 'balloon_payment', 'other_nonamortizing_features',
    'construction_method', 'occupancy_type', 'manufactured_home_secured_property_type',
    'manufactured_home_land_property_interest', 'applicant_credit_score_type',
    'co-applicant_credit_score_type', 'applicant_ethnicity_observed',
    'co-applicant_ethnicity_observed', 'applicant_race_observed', 'co-applicant_race_observed',
    'applicant_sex', 'co-applicant_sex', 'applicant_sex_observed', 'co-applicant_sex_observed',
    'submission_of_application', 'initially_payable_to_institution', 'aus-1', 'denial_reason-1',
    'tract_population', 'ffiec_msa_md_median_family_income', 'tract_owner_occupied_units',
    'tract_one_to_four_family_homes', 'tract_median_age_of_housing_units',
]

floatColumns = [
    'county_code', 'census_tract', 'loan_amount', 'income',
    'tract_minority_population_percent', 'tract_to_msa_income_percentage',
]
# Numbered families: (co-)applicant ethnicity/race 1-5, aus 2-5 and denial_reason 2-4.
floatColumns += [
    '{}_{}-{}'.format(party, trait, rank)
    for party in ('applicant', 'co-applicant')
    for trait in ('ethnicity', 'race')
    for rank in range(1, 6)
]
floatColumns += ['aus-{}'.format(rank) for rank in range(2, 6)]
floatColumns += ['denial_reason-{}'.format(rank) for rank in range(2, 5)]

objectColumns = [
    'lei', 'state_code', 'derived_loan_product_type', 'derived_dwelling_category',
    'derived_ethnicity', 'derived_race', 'derived_sex', 'applicant_age', 'co-applicant_age',
    # Mixed datatypes: imported as object (string) first so that pre-processing
    # can decide on the final conversion.
    'loan_to_value_ratio', 'interest_rate', 'rate_spread', 'total_loan_costs',
    'total_points_and_fees', 'origination_charges', 'discount_points', 'lender_credits',
    'loan_term', 'prepayment_penalty_term', 'intro_rate_period', 'property_value',
    'total_units', 'multifamily_affordable_units', 'applicant_age_above_62',
    'conforming_loan_limit', 'debt_to_income_ratio', 'co-applicant_age_above_62',
]

columnDataTypes = {
    **dict.fromkeys(integerColumns, 'int64'),
    **dict.fromkeys(floatColumns, 'float64'),
    **dict.fromkeys(objectColumns, 'object'),
}

# Parse the file with the explicit dtypes, then summarise what was loaded.
raw_data = pd.read_csv(filePath, dtype=columnDataTypes)
print("PD Dataframe Shape:" , raw_data.shape)
raw_data.info()

In [None]:
#Preview the first five rows of the freshly loaded frame.
raw_data.head(n=5)

In [None]:
#Print the frequency table of every column, each followed by a separating newline.
for columnName in raw_data.columns:
    frequencyTable = raw_data[columnName].value_counts()
    print(frequencyTable, "\n")

# DATA PREPROCESSING

## Categorical Features

We will use the data dictionary provided by the FFIEC to ensure that the categorical feature values are encoded to their correct values. For some features the datatype selected by pandas is numerical, while the HMDA dictionary states that certain entries should allow "NA" as text instead of 'nan' (Not a Number).


In [None]:
#The dataset covers a single year (2023) and a single state (MA), so 'activity_year' and 'state_code' carry no information.
#Analysis of the data also showed that 'co-applicant_ethnicity-4' and 'co-applicant_ethnicity-5' hold no data, so they are dropped too.
columnsToDrop = {'activity_year','state_code','co-applicant_ethnicity-4', 'co-applicant_ethnicity-5'}
#Only drop the columns that are still present, so re-running this cell does not raise an error.
columnsStillPresent = [name for name in raw_data.columns if name in columnsToDrop]
raw_data.drop(columns=columnsStillPresent, inplace=True)

#Valid column values for Loan/Application Register (LAR) : https://ffiec.cfpb.gov/documentation/publications/loan-level-datasets/lar-data-fields
#If a LAR column is of type varied (Ex: loan_amount), we will process these separately.

#Code tables shared by several LAR columns. Every column in hdmaDictionary gets its OWN copy
#(via ** unpacking), so editing one column's table never affects another column.
_ethnicityCodes = {
    1: "Hispanic or Latino",
    11: "Mexican",
    12: "Puerto Rican",
    13: "Cuban",
    14: "Other Hispanic or Latino",
    2: "Not Hispanic or Latino",
    3: "Information not provided by applicant in mail, internet, or telephone application",
    4: "Not applicable"
}
_raceCodes = {
    1: "American Indian or Alaska Native",
    2: "Asian",
    21: "Asian Indian",
    22: "Chinese",
    23: "Filipino",
    24: "Japanese",
    25: "Korean",
    26: "Vietnamese",
    27: "Other Asian",
    3: "Black or African American",
    4: "Native Hawaiian or Other Pacific Islander",
    41: "Native Hawaiian",
    42: "Guamanian or Chamorro",
    43: "Samoan",
    44: "Other Pacific Islander",
    5: "White",
    6: "Information not provided by applicant in mail, internet, or telephone application",
    7: "Not applicable"
}
_observedCodes = {
    1: "Collected on the basis of visual observation or surname",
    2: "Not collected on the basis of visual observation or surname",
    3: "Not applicable"
}
_creditScoreCodes = {
    1: "Equifax Beacon 5.0",
    2: "Experian Fair Isaac",
    3: "FICO Risk Score Classic 04",
    4: "FICO Risk Score Classic 98",
    5: "VantageScore 2.0",
    6: "VantageScore 3.0",
    7: "More than one credit scoring model",
    8: "Other credit scoring model",
    9: "Not applicable"
}
_ageBands = {
    "<25": "<25",
    "25-34": "25-34",
    "35-44": "35-44",
    "45-54": "45-54",
    "55-64": "55-64",
    "65-74": "65-74",
    ">74": ">74",
    "8888": "Unknown or Not Provided"
}
_ageAbove62Codes = {
    "Yes": "Yes",
    "No": "No",
    "NA": "NA"
}
_ausCodes = {
    1: "Desktop Underwriter (DU)",
    2: "Loan Prospector (LP) or Loan Product Advisor",
    3: "Technology Open to Approved Lenders (TOTAL) Scorecard",
    4: "Guaranteed Underwriting System (GUS)",
    5: "Other",
    6: "Not applicable",
    7: "Internal Proprietary System"
}
_denialReasonCodes = {
    1: "Debt-to-income ratio",
    2: "Employment history",
    3: "Credit history",
    4: "Collateral",
    5: "Insufficient cash (downpayment, closing costs)",
    6: "Unverifiable information",
    7: "Credit application incomplete",
    8: "Mortgage insurance denied",
    9: "Other"
}
#Checked updated filing instructions release (https://ffiec.cfpb.gov/documentation/fig/2023/overview):
#the secondary aus-2..5 and denial_reason-2..4 fields are left blank when they do not contain an entry.
_blankEntry = {"": "Field does not contain entry"}
#FICO Score 9 (code 11) was introduced in the updated filing instructions
#(https://ffiec.cfpb.gov/documentation/fig/2023/overview), after the original dictionary was created.
_newerCreditScoreCodes = {11: "FICO Score 9", 1111: "Exempt"}

#Maps each categorical LAR column to {raw value: human readable label}.
#The keys of every inner dict are the only values the column is expected to contain.
hdmaDictionary = {
    #MSA / Metropolitan Division codes that occur for MA, labelled with the area each code
    #actually identifies (14454 and 15764 are the two MA divisions of the Boston MSA).
    "derived_msa-md" : {
        15764: "Cambridge-Newton-Framingham, MA Metropolitan Division",
        38340: "Pittsfield, MA MSA",
        39300: "Providence-Warwick, RI-MA MSA",
        14454: "Boston, MA Metropolitan Division",
        49340: "Worcester, MA-CT MSA",
        12700: "Barnstable Town, MA MSA",
        44140: "Springfield, MA MSA",
        99999 : "NA / Unknown"
    },
    "conforming_loan_limit" : {
        "C": "Conforming",
        "NC": "Nonconforming",
        "U": "Undetermined",
        "NA": "Not Applicable"
    },
    "derived_loan_product_type" : {
        "Conventional:First Lien": "Conventional:First Lien",
        "FHA:First Lien": "FHA:First Lien",
        "VA:First Lien": "VA:First Lien",
        "FSA/RHS:First Lien": "FSA/RHS:First Lien",
        "Conventional:Subordinate Lien": "Conventional:Subordinate Lien",
        "FHA:Subordinate Lien": "FHA:Subordinate Lien",
        "VA:Subordinate Lien": "VA:Subordinate Lien",
        "FSA/RHS:Subordinate Lien": "FSA/RHS:Subordinate Lien"
    },
    "derived_dwelling_category" : {
        "Single Family (1-4 Units):Site-Built": "Single Family (1-4 Units):Site-Built",
        "Multifamily:Site-Built": "Multifamily:Site-Built (5+ Units)",
        "Single Family (1-4 Units):Manufactured": "Single Family (1-4 Units):Manufactured",
        "Multifamily:Manufactured": "Multifamily:Manufactured (5+ Units)"
    },
    "derived_ethnicity" : {
        "Hispanic or Latino": "Hispanic or Latino",
        "Not Hispanic or Latino": "Not Hispanic or Latino",
        "Joint": "Joint",
        "Ethnicity Not Available": "Ethnicity Not Available",
        "Free Form Text Only": "Free Form Text Only"
    },
    "derived_race": {
        "American Indian or Alaska Native": "American Indian or Alaska Native",
        "Asian": "Asian",
        "Black or African American": "Black or African American",
        "Native Hawaiian or Other Pacific Islander": "Native Hawaiian or Other Pacific Islander",
        "White": "White",
        "2 or more minority races": "2 or more minority races",
        "Joint": "Joint",
        "Free Form Text Only": "Free Form Text Only",
        "Race Not Available": "Race Not Available"
    },
    "derived_sex"  : {
        "Male": "Male",
        "Female": "Female",
        "Joint": "Joint",
        "Sex Not Available": "Sex Not Available"
    },
    "action_taken": {
        1: "Loan originated",
        2: "Application approved but not accepted",
        3: "Application denied",
        4: "Application withdrawn by applicant",
        5: "File closed for incompleteness",
        6: "Loan purchased by the institution",
        7: "Preapproval request denied",
        8: "Preapproval request approved but not accepted"
    },
    "purchaser_type": {
        0: "Not applicable",
        1: "Fannie Mae",
        2: "Ginnie Mae",
        3: "Freddie Mac",
        4: "Farmer Mac",
        5: "Private securitizer",
        6: "Commercial bank, savings bank, or savings association",
        71: "Credit union, mortgage company, or finance company",
        72: "Life insurance company",
        8: "Affiliate institution",
        9: "Other type of purchaser"
    },
    "preapproval": {
        1: "Preapproval requested",
        2: "Preapproval not requested"
    },
    "loan_type" : {
        1: "Conventional (not insured or guaranteed by FHA, VA, RHS, or FSA)",
        2: "Federal Housing Administration insured (FHA)",
        3: "Veterans Affairs guaranteed (VA)",
        4: "USDA Rural Housing Service or Farm Service Agency guaranteed (RHS or FSA)"
    },
    "loan_purpose" : {
        1: "Home purchase",
        2: "Home improvement",
        31: "Refinancing",
        32: "Cash-out refinancing",
        4: "Other purpose",
        5: "Not applicable"
    },
    "lien_status" : {
        1: "Secured by a first lien",
        2: "Secured by a subordinate lien"
    },
    "reverse_mortgage" : {
        1: "Reverse mortgage",
        2: "Not a reverse mortgage",
        1111: "Exempt"
    },
    "open-end_line_of_credit" : {
        1: "Open-end line of credit",
        2: "Not an open-end line of credit",
        1111: "Exempt"
    },
    "business_or_commercial_purpose" : {
        1: "Primarily for a business or commercial purpose",
        2: "Not primarily for a business or commercial purpose",
        1111: "Exempt"
    },
    "hoepa_status" : {
        1: "High-cost mortgage",
        2: "Not a high-cost mortgage",
        3: "Not applicable"
    },
    "negative_amortization" :{
        1: "Negative amortization",
        2: "No negative amortization",
        1111: "Exempt"
    },
    "interest_only_payment" : {
        1: "Interest-only payments",
        2: "No interest-only payments",
        1111: "Exempt"
    },
    "balloon_payment" : {
        1: "Balloon payment",
        2: "No balloon payment",
        1111: "Exempt"
    },
    "other_nonamortizing_features" : {
        1: "Other non-fully amortizing features",
        2: "No other non-fully amortizing features",
        1111: "Exempt"
    },
    "construction_method" : {
        1: "Site-built",
        2: "Manufactured home"
    },
    "occupancy_type" : {
        1: "Principal residence",
        2: "Second residence",
        3: "Investment property"
    },
    "manufactured_home_secured_property_type" : {
        1: "Manufactured home and land",
        2: "Manufactured home and not land",
        3: "Not applicable",
        1111: "Exempt"
    },
    "manufactured_home_land_property_interest" : {
        1: "Direct ownership",
        2: "Indirect ownership",
        3: "Paid leasehold",
        4: "Unpaid leasehold",
        5: "Not applicable",
        1111: "Exempt"
    },
    #Read as object (string), hence the string keys for the single-unit counts.
    "total_units" :{
        '1': "1",
        '2': "2",
        '3': "3",
        '4': "4",
        "5-24": "5-24",
        "25-49": "25-49",
        "50-99": "50-99",
        "100-149": "100-149",
        ">149": ">149"
    },
    #Ratios from 36 to 49 are reported as bare integers; everything else is a labelled band.
    "debt_to_income_ratio" : {
        "<20%": "<20%",
        "20%-<30%": "20%-<30%",
        "30%-<36%": "30%-<36%",
        **{str(percent): "{}%".format(percent) for percent in range(36, 50)},
        "50%-60%": "50%-60%",
        ">60%": ">60%",
        "NA": "NA",
        "Exempt": "Exempt"
    },
    "applicant_credit_score_type" : {
        **_creditScoreCodes,
        **_newerCreditScoreCodes
    },
    "co-applicant_credit_score_type" :{
        **_creditScoreCodes,
        10: "No co-applicant",
        **_newerCreditScoreCodes
    },
    "applicant_ethnicity-1" : {**_ethnicityCodes},
    "applicant_ethnicity-2" : {**_ethnicityCodes},
    "applicant_ethnicity-3" : {**_ethnicityCodes},
    "applicant_ethnicity-4" : {**_ethnicityCodes},
    "applicant_ethnicity-5" : {**_ethnicityCodes},
    "co-applicant_ethnicity-1" : {
        **_ethnicityCodes,
        #Filing instructions: if the co-applicant selected no ethnicity and only filled the free form
        #text field for Other Hispanic or Latino, the field is either left blank or coded 14.
        5: "No co-applicant"
    },
    "co-applicant_ethnicity-2" : {**_ethnicityCodes},
    "co-applicant_ethnicity-3" : {**_ethnicityCodes},
    "applicant_ethnicity_observed": {**_observedCodes},
    "co-applicant_ethnicity_observed": {
        **_observedCodes,
        4: "No co-applicant"
    },
    "applicant_race-1" : {**_raceCodes},
    "applicant_race-2" : {**_raceCodes},
    "applicant_race-3" : {**_raceCodes},
    "applicant_race-4" : {**_raceCodes},
    "applicant_race-5" : {**_raceCodes},
    "co-applicant_race-1" : {
        **_raceCodes,
        #Filing instructions: if the co-applicant selected no race and only filled the free form text
        #fields (American Indian or Alaska Native tribe, Other Asian, Other Pacific Islander),
        #the field is either left blank or coded 1, 27, or 44 as appropriate.
        8: "No co-applicant"
    },
    "co-applicant_race-2" : {**_raceCodes},
    "co-applicant_race-3" : {**_raceCodes},
    "co-applicant_race-4" : {**_raceCodes},
    "co-applicant_race-5" : {**_raceCodes},
    "applicant_race_observed" : {**_observedCodes},
    "co-applicant_race_observed" : {
        **_observedCodes,
        4: "No co-applicant"
    },
    "applicant_sex" : {
        1: "Male",
        2: "Female",
        3: "Information not provided by applicant in mail, internet, or telephone application",
        4: "Not applicable",
        6: "Applicant selected both male and female"
    },
    "co-applicant_sex" : {
        1: "Male",
        2: "Female",
        3: "Information not provided by applicant in mail, internet, or telephone application",
        4: "Not applicable",
        5 :  "No co-applicant",
        6: "Applicant selected both male and female"
    },
    #Code 4 ("No co-applicant") only exists for the co-applicant variant of this field.
    "applicant_sex_observed" : {**_observedCodes},
    "co-applicant_sex_observed" : {
        **_observedCodes,
        4: "No co-applicant"
    },
    "applicant_age" : {**_ageBands},
    "co-applicant_age" : {
        **_ageBands,
        "9999" : "No co-applicant"
    },
    "applicant_age_above_62" : {**_ageAbove62Codes},
    "co-applicant_age_above_62" : {**_ageAbove62Codes},
    "submission_of_application" : {
        1: "Submitted directly to your institution",
        2: "Not submitted directly to your institution",
        3: "Not applicable",
        1111: "Exempt"
    },
    "initially_payable_to_institution" : {
        1: "Initially payable to your institution",
        2: "Not initially payable to your institution",
        3: "Not applicable",
        1111: "Exempt"
    },
    "aus-1": {
        **_ausCodes,
        1111: "Exempt"
    },
    "aus-2": {**_ausCodes, **_blankEntry},
    "aus-3": {**_ausCodes, **_blankEntry},
    "aus-4": {**_ausCodes, **_blankEntry},
    "aus-5": {**_ausCodes, **_blankEntry},
    "denial_reason-1" : {
        **_denialReasonCodes,
        10: "Not applicable",
        1111 : "Exempt" #Checked updated filing instructions release (https://ffiec.cfpb.gov/documentation/fig/2023/overview) determined that this is a new exception for denial.
    },
    "denial_reason-2" : {**_denialReasonCodes, **_blankEntry},
    "denial_reason-3" : {**_denialReasonCodes, **_blankEntry},
    "denial_reason-4" : {**_denialReasonCodes, **_blankEntry}
}

#Report every value of a categorical column that is not an expected key of hdmaDictionary.
#Columns without a dictionary entry are the "varied type" fields and are skipped here.
#NOTE(review): valueNotFound keeps only the LAST unexpected value seen per column; the printed
#report below is the complete list. Confirm whether later cells need every value before changing this.
valueNotFound = dict()
totalRows = raw_data.shape[0]
for col in raw_data.columns:
    if col not in hdmaDictionary:
        continue
    expectedValues = hdmaDictionary[col].keys()
    for unique in pd.unique(raw_data[col]):
        #Missing values (NaN) are never equal to a key, so they are always reported.
        if unique in expectedValues:
            continue
        #Add to dictionary
        valueNotFound[col] = unique
        print("----------------------------------------------------------------------")
        print("Col: {} of type: {} with uniqueValue: {} Does NOT EXIST in options: {}".format(col, raw_data[col].dtype, unique, expectedValues))
        #pd.isna covers NaN, None and pd.NA; an equality comparison would never match a missing value.
        if pd.isna(unique):
            instances = raw_data[col].isna().sum()
        else:
            instances = (raw_data[col] == unique).sum()
        print("With Number of Instances : {} ({:.2f} %)".format(instances, (instances / totalRows) * 100))

## Categorical Values with nan

Many of the categorical features covered by this dictionary, such as 'debt_to_income_ratio', include nan entries. These entries need to be re-encoded to their expected value, as this is the correct value that should be displayed according to the HMDA LAR (Loan/Application Register) documentation.

In [None]:
# Replacement values for missing (nan) entries, per column.
# Columns whose missing entries are re-encoded as the literal "NA".
# NOTE(review): co-applicant_ethnicity-4/-5 are not listed here although the applicant
# columns run 1..5 — confirm this is intentional.
_naEncodedColumns = (
    ["conforming_loan_limit", "debt_to_income_ratio"]
    + ["applicant_ethnicity-{}".format(n) for n in range(1, 6)]
    + ["co-applicant_ethnicity-{}".format(n) for n in range(1, 4)]
    + ["applicant_race-{}".format(n) for n in range(1, 6)]
    + ["co-applicant_race-{}".format(n) for n in range(1, 6)]
    + ["applicant_age_above_62", "co-applicant_age_above_62"]
)

# Columns that are left blank when there is no entry (see the filing instructions:
# https://ffiec.cfpb.gov/documentation/fig/2023/overview).
_blankEncodedColumns = (
    ["aus-{}".format(n) for n in range(2, 6)]
    + ["denial_reason-{}".format(n) for n in range(2, 5)]
)

hdmaNANMapping = {
    **dict.fromkeys(_naEncodedColumns, "NA"),
    **dict.fromkeys(_blankEncodedColumns, ""),
}

# fillna with a dict only touches the listed columns; raw_data itself is left untouched.
processed_data = raw_data.fillna(value=hdmaNANMapping)
processed_data.head()

In [None]:
# Show the value distribution of every re-encoded column after the fill.
for reencodedCol in hdmaNANMapping:
    counts = processed_data[reencodedCol].value_counts()
    print("{} \n".format(counts))

#### Categorical Feature Type Reassignment 

Given that we have modified these columns to contain the HMDA-required values of "NA" or "" for nan/null cases, we must ensure that the datatype in the pandas dataframe reflects this. 
Some columns were initially loaded as numerical, e.g. applicant_ethnicity-1 = [1,2,13,11,12,14]; however, due to the inclusion of "NA" as a value, we must set the type to string to support this.

In [None]:
# Every re-encoded column now holds "NA"/"" markers, so store it as text.
for reencodedCol in hdmaNANMapping:
    asText = processed_data[reencodedCol].astype(str)
    processed_data[reencodedCol] = asText
    print(asText.dtype)

## Numerical Features

For processing the numerical features, we simply determine which columns have not been modified previously and briefly analyze the data. We can note that for several of these remaining 25 features, although they are supposed to hold numerical values, we also see an 'Exempt' option, for instance in loan_to_value_ratio. After some investigation, we found that these values correspond to special loan products in the following cases:

1. Government-Backed Loans
2. Special Programs and Grants
3. Certain Refinancing Programs

As such, we must pay attention to these cases and make sure that setting a numeric datatype will not discard the 'Exempt' values.


In [None]:
#Pre-processing varied i.e: Numerical,  column types
# The numerical/varied columns are the DataFrame columns that have no entry in hdmaDictionary.
hdmaDictKeys = set(hdmaDictionary.keys())
processed_dataColumns = set(processed_data.columns)
# Plain set difference (columns minus dictionary keys). A symmetric difference would also
# return dictionary keys that are not columns of processed_data, and indexing the frame
# with those raises KeyError.
numericalColumns = processed_dataColumns - hdmaDictKeys

# Sets have no stable order; sort so the output is the same on every run.
for col in sorted(numericalColumns):
    print("{} \n".format(processed_data[col].value_counts()))

# 

In [None]:
# Find columns whose cells hold more than one Python type
# (i.e: numeric values mixed with a textual 'Exempt' case).
mixed_columns = dict()
for column in processed_data.columns:
    cellTypes = processed_data[column].map(type)
    if cellTypes.nunique() <= 1:
        continue
    mixed_columns[column] = cellTypes.unique()
    print("Column: {}, Types: {}".format(column, mixed_columns[column]))

Similarly to the re-encoding of categorical variables, we must make sure that the 'Exempt' cases are not disregarded; thus, we must also re-encode these columns as strings to support this case. Given that blank values were set to null in the previous case, we can also re-encode these to the expected "NA" value as set by the HMDA LAR standard. We can perform both of these operations at once with the code below.

In [None]:
# Mixed-type columns: missing entries become "NA", then the whole column is stored as text.
for mixedCol in mixed_columns:
    filled = processed_data[mixedCol].fillna("NA")
    processed_data[mixedCol] = filled.astype(str)

# Re-inspect the value distribution of the numerical/varied columns.
for numericCol in numericalColumns:
    print("{} \n".format(processed_data[numericCol].value_counts()))


Perform last check to determine if there are null values for numerical value columns.

In [None]:
# Report every column that still contains missing values, with its count and dtype.
for f in processed_data.columns:
    missingCount = processed_data[f].isna().sum()
    if missingCount <= 0:
        continue
    print("Feature: {} : {} : Type: {}".format(f, missingCount, processed_data[f].dtype))

After further reading of the HMDA Regulatory Reference Chart (https://files.consumerfinance.gov/f/documents/cfpb_reportable-hmda-data_regulatory-and-reporting-overview-reference-chart_2023-02.pdf):
 - For County Code and Census Tract: Enter “NA” for: Applications only if the state, county, or census tract in which the property is located is not known before the application is denied, withdrawn, or closed for incompleteness.
 - For Income: Enter “NA” for: Covered loans or applications for which the credit decision did not consider, or would not have considered income, Covered loans or applications when applicant or co-applicant is not a natural person, Covered loan is secured by, or application is proposed to be secured by, a multifamily dwelling, Purchased covered loans for which the financial institution chooses not to report the income, Covered loan to, or an application from, the institution’s employees to protect their privacy, even if the institution relied on their income in making the credit decision

In [None]:
# Columns for which the reference chart allows "NA": fill the gaps and store them as text.
additionalMixedColumns = ['county_code','census_tract','income']
for naAllowedCol in additionalMixedColumns:
    withNA = processed_data[naAllowedCol].fillna("NA")
    processed_data[naAllowedCol] = withNA.astype(str)

# Show dtype and value distribution of the numerical/varied columns after the change.
for numericCol in numericalColumns:
    series = processed_data[numericCol]
    print("{}: {} \n".format(series.dtype, series.value_counts()))


## Feature Analysis

We can now finally perform feature analysis by encoding features as necessary first then performing standardization

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np

# Split the frame into the outcome column and the predictor columns.
target = processed_data['action_taken']
features = processed_data.drop(columns='action_taken')

# Untouched copy of the predictors, kept for plotting during univariate analysis
originalFeatures = features.copy(deep=True)

# Partition the predictors by dtype: numeric vs. everything else
numericalFeatures = features.select_dtypes(include=[np.number]).columns
categoricalFeatures = features.select_dtypes(exclude=[np.number]).columns

print("Numerical Features : ", numericalFeatures)
print("Categorical Features : ", categoricalFeatures)

# Integer-encode each non-numeric column; the single encoder is re-fitted per column,
# so after the loop it only remembers the classes of the last column.
encoder = LabelEncoder()
for column in categoricalFeatures:
    encodedValues = encoder.fit_transform(features[column])
    features[column] = encodedValues

# Standardize the columns that were numeric before encoding
scaler = StandardScaler()
standardizedValues = scaler.fit_transform(features[numericalFeatures])
features[numericalFeatures] = standardizedValues

# Preview the resulting frame
features.head()

In [None]:
# Summary statistics of the encoded and standardized feature columns
features.describe()

## Feature Selection

Given that we have a large number of features, it is crucial that we make a deliberate effort to limit the number of features used in our model. To that end we can use a cross-correlation matrix, which helps in determining the relationships between features in a dataset. With it we can identify redundant, highly correlated features, such as loan_type and derived_loan_product_type. By performing this analysis we can reduce dimensionality, reduce multicollinearity, and identify important features.


In [None]:
#Correlations
# Calculate the pairwise correlation matrix of the encoded/standardized feature columns
correlationMatrix = features.corr()

# Bare expression: rendered as the cell output
correlationMatrix

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated heatmap of the full correlation matrix (very large canvas: one cell per feature pair)
corrFig, corrAx = plt.subplots(figsize=(50, 50))
sns.heatmap(correlationMatrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=corrAx)
corrAx.set_title('Cross Correlation Matrix')
plt.show()

We can now proceed to remove features that show a high degree of correlation. 

In [None]:
# Absolute correlations: the sign does not matter for redundancy
correlation_matrix = features.corr().abs()

# Pairs correlated more strongly than this are treated as redundant
threshold = 0.8

# Keep only the strict upper triangle so each unordered pair appears once and the
# diagonal (self-correlation) is excluded; argwhere walks it in row-major order.
upper_mask = np.triu((correlation_matrix > threshold).to_numpy(), k=1)
high_corr_pairs = [
    (correlation_matrix.index[row], correlation_matrix.columns[col])
    for row, col in np.argwhere(upper_mask)
]

# Greedy selection: for each pair where neither member is already dropped,
# drop the second member (the one appearing later in column order).
features_to_drop = set()
for first, second in high_corr_pairs:
    if first in features_to_drop or second in features_to_drop:
        continue
    features_to_drop.add(second)

# New frame without the redundant columns; `features` itself is unchanged
features_reduced = features.drop(columns=features_to_drop)

print("Features to drop due to high correlation:", features_to_drop)
print("Remaining features:", features_reduced.columns)

In [None]:
# Correlations among the features that survived the reduction step
reduced_correlation_matrix = features_reduced.corr()

# Annotated heatmap of the reduced matrix
reducedFig, reducedAx = plt.subplots(figsize=(50, 50))
sns.heatmap(reduced_correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=reducedAx)
reducedAx.set_title('Reduced Cross Correlation Matrix')
plt.show()

We can now perform univariate analysis on our features by displaying histograms for numerical values and bar plots for categorical values.

**WARNING: Running the code below will take a long time. To run it, uncomment the two `save_..._features_plots(...)` calls at the end of the cell (cell lines 36 and 39).**


In [None]:
import os

# Directory that receives the saved univariate plots; created if it doesn't exist (no error if it does)
plot_dir = "UnivariateAnalysisPlots"
os.makedirs(plot_dir, exist_ok=True)

def save_numerical_features_plots(data, numerical_features, plot_dir):
    """Save one histogram (with KDE curve) per numerical feature as '<feature>_distribution.png' in plot_dir."""
    for feature in numerical_features:
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.histplot(data[feature], kde=True, ax=ax)
        ax.set_title(f'Distribution of {feature}')
        plt.xticks(rotation=45)  # tilt the x tick labels
        fig.subplots_adjust(bottom=0.2, left=0.2)
        fig.savefig(os.path.join(plot_dir, f'{feature}_distribution.png'))
        plt.close(fig)

def save_categorical_features_plots(data, categorical_features, plot_dir):
    """Save one horizontal count plot per categorical feature, most frequent first, as '<feature>_count.png' in plot_dir."""
    for feature in categorical_features:
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.countplot(y=data[feature], order=data[feature].value_counts().index, ax=ax)
        ax.set_title(f'Count of {feature}')
        plt.xticks(rotation=45)  # tilt the x tick labels
        fig.subplots_adjust(bottom=0.2, left=0.2)
        fig.savefig(os.path.join(plot_dir, f'{feature}_count.png'))
        plt.close(fig)

#Drop features as per cross correlation reduction; drop() returns a new frame, so rebind it (errors='ignore' keeps the cell re-runnable)
originalFeatures = originalFeatures.drop(columns=list(features_to_drop), errors='ignore')

originalNumericalFeatures = originalFeatures.select_dtypes(include=[np.number]).columns
originalCategoricalFeatures = originalFeatures.select_dtypes(exclude=[np.number]).columns

# Save numerical features plots
# save_numerical_features_plots(originalFeatures, originalNumericalFeatures, plot_dir)
# 
# # Save categorical features plots
# save_categorical_features_plots(originalFeatures, originalCategoricalFeatures, plot_dir)

In [None]:
# (rows, columns) of the feature frame after the highly correlated columns were dropped
features_reduced.shape

# Model Selection

Given the fact that the outcome variable for this dataset is not binary, we must use a multinomial model. A simple model fit for this task is a multinomial logistic regression model. This is a suitable choice due to the fact that:

- We have a categorical dependent variable that is multinomial -> action_taken = [1,2,3,4,5,6,7,8]
- It is likely that the features relate non-linearly to the outcome probability (for instance, a linear reduction in debt-to-income ratio will have a more than linearly proportional effect on the chances of obtaining a mortgage loan). Logistic regression is linear only in the log-odds, so it does not assume a linear relationship between the features and the class probabilities.

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, make_scorer, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import numpy as np

# Multinomial logistic regression classifier
model = LogisticRegression(multi_class='multinomial', max_iter=500, n_jobs=-1)

# Scorers for cross-validation. Precision/recall/F1 are macro-averaged, and
# zero_division=0 scores a class with no predictions as 0.
scoring = {'accuracy': make_scorer(accuracy_score)}
for scorer_name, scorer_fn in (('precision', precision_score), ('recall', recall_score), ('f1', f1_score)):
    scoring[scorer_name] = make_scorer(scorer_fn, average='macro', zero_division=0)

# 5-fold cross-validation, keeping the train-side scores as well
cv_results = cross_validate(model, features_reduced, target, cv=5, scoring=scoring, return_train_score=True)

# Mean and spread of every metric over the folds, train first and then test
print("Cross-validation results:\n")
for metric in scoring:
    for split_label, split_prefix in (("Train", "train"), ("Test", "test")):
        fold_scores = cv_results[f'{split_prefix}_{metric}']
        print(f"{metric} - {split_label}: {np.mean(fold_scores)} (+/- {np.std(fold_scores)})")

# Refit on the whole dataset and predict on that same data, so the figures below
# are in-sample (use a separate test set if available)
y_pred = model.fit(features_reduced, target).predict(features_reduced)

# In-sample accuracy and per-class report
accuracy = accuracy_score(target, y_pred)
report = classification_report(target, y_pred, zero_division=0)

print(f'\nAccuracy: {accuracy}')
print(f'Classification Report:\n{report}')

In [None]:
# Display the hdmaDictionary entry for action_taken (the target column)
hdmaDictionary['action_taken']

In [None]:
# Classification report as a table: one row per class/summary entry, 'support' removed
report = classification_report(target, y_pred, zero_division=0, output_dict=True)
report_df = (
    pd.DataFrame(report)
    .transpose()
    .reset_index()
    .drop(columns=['support'])
)

class_labels = hdmaDictionary['action_taken']

# First pass: direct replacement of row names that match a dictionary key
report_df['index'] = report_df['index'].replace(class_labels)

metrics = ['precision', 'recall', 'f1-score', 'support']
# Second pass: row names that are digit strings are looked up by their integer value;
# summary rows such as 'accuracy' are left as they are
report_df['index'] = [
    class_labels[int(row_name)] if row_name.isdigit() and int(row_name) in class_labels else row_name
    for row_name in report_df['index']
]

# Heatmap of the metric columns (everything after the row-name column)
report_fig, report_ax = plt.subplots(figsize=(10, 6))
heatmap = sns.heatmap(report_df.iloc[:, 1:], annot=True, cmap="YlGnBu", cbar=True, fmt='.2f', ax=report_ax)
heatmap.set_xticklabels(heatmap.get_xticklabels(), rotation=45, horizontalalignment='right')
heatmap.set_yticklabels(report_df['index'], rotation=0)

# Title and axis labels
heatmap.set_title('Classification Report Heatmap')
heatmap.set_xlabel('Metrics')
heatmap.set_ylabel('Classes')
plt.show()

In [None]:
# Display names for the classes, taken from the dictionary in ascending key order
action_taken_names = hdmaDictionary['action_taken']
labels = [action_taken_names[code] for code in sorted(action_taken_names)]

# Confusion matrix of the in-sample predictions, drawn with the names above
conf_matrix = confusion_matrix(target, y_pred)
disp = ConfusionMatrixDisplay(conf_matrix, display_labels=labels)
disp.plot(cmap=plt.cm.Blues)
plt.setp(disp.ax_.get_xticklabels(), rotation=45, ha='right')  # tilt x labels for readability
disp.ax_.set_title('Confusion Matrix')
plt.show()

In [None]:
# Calculate and display feature importance
# model.coef_ holds one row of coefficients per class. Flattening it and zipping with the
# feature names would stop after the first row, i.e. report only the first class.
# Instead, summarize each feature by its mean absolute coefficient across all classes.
feature_importance = np.abs(model.coef_).mean(axis=0)
feature_names = features_reduced.columns  # Assuming features_reduced is a DataFrame
feature_importance_dict = dict(zip(feature_names, feature_importance))

# Sort features by importance, largest first
sorted_features = sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True)

# Visualize feature importance.
# Separate names are used here so the `features` DataFrame is not overwritten.
ranked_feature_names, ranked_importances = zip(*sorted_features)
plt.figure(figsize=(30, 30))
plt.barh(ranked_feature_names, ranked_importances, align='center')
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Feature Importance for Logistic Regression Model')
plt.gca().invert_yaxis()  # Invert y-axis to have the most important feature at the top
plt.show()

In [None]:
# Visualize top 10 feature importance
# Take the ten highest-ranked (feature, importance) pairs from sorted_features
top_10_features = sorted_features[:10]
# Separate names are used here so the `features` DataFrame is not overwritten,
# which would break earlier cells on re-run.
top_feature_names, top_importances = zip(*top_10_features)
plt.figure(figsize=(10, 8))
plt.barh(top_feature_names, top_importances, align='center')
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Top 10 Feature Importance for Multinomial Logistic Regression Model')
plt.gca().invert_yaxis()  # Invert y-axis to have the most important feature at the top
plt.show()

In [None]:
# Get feature importance for a particular class, selected by its action_taken label.
# NOTE(review): label 1 is assumed to be the "loan originated" code named in the plot
# title — confirm against hdmaDictionary['action_taken'].
class_label = 1  # Change this label to get feature importance for a different class
# Rows of model.coef_ follow the order of model.classes_, so the label must be translated
# into a row position; using the label itself as the row index picks the wrong class
# whenever the labels do not start at 0.
matching_rows = np.flatnonzero(model.classes_ == class_label)
if matching_rows.size == 0:
    raise ValueError(f"action_taken class {class_label} was not seen by the model")
class_index = int(matching_rows[0])
feature_importance = model.coef_[class_index]
feature_names = features_reduced.columns  # Assuming features_reduced is a DataFrame
feature_importance_dict = dict(zip(feature_names, feature_importance))

# Sort features by coefficient magnitude for the specified class
sorted_features = sorted(feature_importance_dict.items(), key=lambda item: abs(item[1]), reverse=True)

# Display the top 10 features for the specified class
top_10_features = sorted_features[:10]

print(f"\nTop 10 Feature Importance for Class {class_label}:")
for feature, importance in top_10_features:
    print(f"{feature}: {importance}")

# Visualize top 10 feature importance for the specified class (signed coefficients).
# Separate names are used here so the `features` DataFrame is not overwritten.
top_feature_names, top_importances = zip(*top_10_features)
plt.figure(figsize=(10, 8))
plt.barh(top_feature_names, top_importances, align='center')
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title(f'Top 10 Feature Importance for Loan Origination in Logistic Regression Model')
plt.gca().invert_yaxis()  # Invert y-axis to have the most important feature at the top
plt.show()

In [None]:
# Get feature importance for a particular class, selected by its action_taken label.
# NOTE(review): label 3 is assumed to be the "application denied" code named in the plot
# title — confirm against hdmaDictionary['action_taken'].
class_label = 3  # Change this label to get feature importance for a different class
# Rows of model.coef_ follow the order of model.classes_, so the label must be translated
# into a row position; using the label itself as the row index picks the wrong class
# whenever the labels do not start at 0.
matching_rows = np.flatnonzero(model.classes_ == class_label)
if matching_rows.size == 0:
    raise ValueError(f"action_taken class {class_label} was not seen by the model")
class_index = int(matching_rows[0])
feature_importance = model.coef_[class_index]
feature_names = features_reduced.columns  # Assuming features_reduced is a DataFrame
feature_importance_dict = dict(zip(feature_names, feature_importance))

# Sort features by coefficient magnitude for the specified class
sorted_features = sorted(feature_importance_dict.items(), key=lambda item: abs(item[1]), reverse=True)

# Display the top 10 features for the specified class
top_10_features = sorted_features[:10]

print(f"\nTop 10 Feature Importance for Class {class_label}:")
for feature, importance in top_10_features:
    print(f"{feature}: {importance}")

# Visualize top 10 feature importance for the specified class (signed coefficients).
# Separate names are used here so the `features` DataFrame is not overwritten.
top_feature_names, top_importances = zip(*top_10_features)
plt.figure(figsize=(10, 8))
plt.barh(top_feature_names, top_importances, align='center')
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title(f'Top 10 Feature Importance for Application Denial in Logistic Regression Model')
plt.gca().invert_yaxis()  # Invert y-axis to have the most important feature at the top
plt.show()

In [None]:
#Class Percentages: share of rows (in %) for each action_taken value in the target
target.value_counts() / target.shape[0] * 100

After running the model, we note that we obtain a relatively high degree of accuracy as a whole; however, the prediction performance for classes 2, 5, 7 and 8 is extremely poor, with class 4 being only slightly better than chance. This is due to the extremely unbalanced dataset, with classes 2, 4, 5, 7 and 8 accounting for only 17.8% of the entire dataset.

We could apply additional measures such as SMOTE (Synthetic Minority Over-sampling Technique) or ensemble methods to improve the performance of the model for the under-represented classes.