## Import data

<ul>
    <li><a href="https://ffiec.cfpb.gov/data-publication/dynamic-national-loan-level-dataset/2022">Dynamic National Loan Level Dataset 2022</a></li>
    <ul>
        <li><a href="https://ffiec.cfpb.gov/documentation/publications/loan-level-datasets/lar-data-fields">LAR Field label information</a></li>
    </ul>
    <li>Loan/Application Records (LAR) & Transmittal Sheet Records (TS)
</ul>


In [53]:
import pandas as pd
import numpy as np
data = "C:\\Users\\WilliamRobinson\\OneDrive - HW Publishing LLC\\Documents\\Data\\HMDA\\2022\\2022_lar.txt"

In [54]:
df = pd.read_csv(data, delimiter='|')

  df = pd.read_csv(data, delimiter='|')


In [55]:
df.head()

Unnamed: 0,activity_year,lei,derived_msa_md,state_code,county_code,census_tract,conforming_loan_limit,derived_loan_product_type,derived_dwelling_category,derived_ethnicity,...,denial_reason_2,denial_reason_3,denial_reason_4,tract_population,tract_minority_population_percent,ffiec_msa_md_median_family_income,tract_to_msa_income_percentage,tract_owner_occupied_units,tract_one_to_four_family_homes,tract_median_age_of_housing_units
0,2022,549300E2UX99HKDBR481,12580,MD,24027.0,24027610000.0,C,Conventional:First Lien,Single Family (1-4 Units):Site-Built,Not Hispanic or Latino,...,,,,6287,64.75,116100,91.0,1520,1690,51
1,2022,549300E2UX99HKDBR481,27260,FL,12031.0,12031020000.0,C,Conventional:First Lien,Single Family (1-4 Units):Site-Built,Not Hispanic or Latino,...,,,,3661,34.36,86100,102.0,847,1195,36
2,2022,549300E2UX99HKDBR481,41180,MO,29189.0,29189210000.0,C,Conventional:First Lien,Single Family (1-4 Units):Site-Built,Not Hispanic or Latino,...,,,,3429,55.23,96800,63.0,489,1173,58
3,2022,549300E2UX99HKDBR481,16740,NC,37119.0,37119010000.0,C,Conventional:First Lien,Single Family (1-4 Units):Site-Built,Hispanic or Latino,...,,,,3718,35.53,91700,103.0,1134,1469,41
4,2022,549300E2UX99HKDBR481,34940,FL,12021.0,12021010000.0,C,Conventional:First Lien,Single Family (1-4 Units):Site-Built,Not Hispanic or Latino,...,,,,2321,10.77,98600,126.0,1133,1048,23


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16085841 entries, 0 to 16085840
Data columns (total 99 columns):
 #   Column                                    Dtype  
---  ------                                    -----  
 0   activity_year                             int64  
 1   lei                                       object 
 2   derived_msa_md                            int64  
 3   state_code                                object 
 4   county_code                               float64
 5   census_tract                              float64
 6   conforming_loan_limit                     object 
 7   derived_loan_product_type                 object 
 8   derived_dwelling_category                 object 
 9   derived_ethnicity                         object 
 10  derived_race                              object 
 11  derived_sex                               object 
 12  action_taken                              int64  
 13  purchaser_type                            int64  
 14  

# Filter data

In [57]:
# keep only originated loans
originated = df.loc[df.action_taken == 1]
len(originated)

8397078

In [58]:
# keep only single-family homes (including manufactured homes)
singleFam = originated.loc[df.derived_dwelling_category.isin(['Single Family (1-4 Units):Site-Built', 'Single Family (1-4 Units):Manufactured'])]
len(singleFam)

8336005

In [59]:
# exclude business loans
nonbusiness = singleFam.loc[singleFam.business_or_commercial_purpose == 2]
len(nonbusiness)

7701735

In [60]:
# reduce columns
final = nonbusiness[[
    'activity_year', 'lei', 'derived_msa_md', 'state_code', 'county_code', 'census_tract', 
    'conforming_loan_limit', 'purchaser_type', 'loan_type', 'loan_purpose', 'lien_status', 
    'reverse_mortgage', 'loan_amount', 'combined_loan_to_value_ratio', 'interest_rate', 
    'rate_spread', 'hoepa_status', 'total_loan_costs', 'total_points_and_fees', 
    'origination_charges', 'discount_points', 'lender_credits', 'loan_term', 'intro_rate_period',
    'property_value', 'construction_method', 'occupancy_type', 'total_units',
    'debt_to_income_ratio'
]]

# Clean data

In [61]:
# translate categorical data

purchasers = {
    0:  'NA',
    1:  'Fannie Mae',
    2:  'Ginnia Mae',
    3:  'Freddie Mac',
    4:  'Farmer Mac',
    5:  'Private securitizer',
    6:  'Commercial bank, savings bank, or savings association',
    71: 'Credit union, mortgage company, or finance company',
    72: 'Life insurance company',
    8:  'Affiliate institution',
    9:  'Other type of purchaser'
}
final.loc[:, 'purchaser_type'] = [purchasers[k] for k in final.purchaser_type]

loanTypes = {
    1: 'Conventional (not insured or guaranteed by FHA, VA, RHS, or FSA)',
    2: 'Federal Housing Administration insured (FHA)',
    3: 'Veterans Affairs guaranteed (VA)',
    4: 'USDA Rural Housing Service or Farm Service Agency guaranteed (RHS or FSA)'
}
final.loc[:, 'loan_type'] = [loanTypes[k] for k in final.loan_type]

purposes = {
    1:  'Home purchase',
    2:  'Home improvement',
    31: 'Refinancing',
    32: 'Cash-out refinancing',
    4:  'Other purpose',
    5:  'NA'
}
final.loc[:, 'loan_purpose'] = [purposes[k] for k in final.loan_purpose]

liens = {
    1: 'First lien',
    2: 'Subordinate lien'
}
final.loc[:, 'lien_status'] = [liens[k] for k in final.lien_status]

revMor = {
    1:    'Yes',
    2:    'No', 
    1111: 'Exempt'
}
final.loc[:, 'reverse_mortgage'] = [revMor[k] for k in final.reverse_mortgage]

hoepas = {
    1: 'High-cost mortgage', 
    2: 'Not high-cost mortgage',
    3: 'NA'
}
final.loc[:, 'hoepa_status'] = [hoepas[k] for k in final.hoepa_status]

constructions = {
    1: 'Site-built',
    2: 'Manufactured home'
}
final.loc[:, 'construction_method'] = [constructions[k] for k in final.construction_method]

occupancies = {
    1: 'Principal residence',
    2: 'Second residence',
    3: 'Investment property'
}
final.loc[:, 'occupancy_type'] = [occupancies[k] for k in final.occupancy_type]


In [62]:
# convert value columns to float

fltCols = [ 'combined_loan_to_value_ratio', 'interest_rate', 'rate_spread', 'total_loan_costs',
            'total_points_and_fees', 'origination_charges', 'discount_points', 'lender_credits',
            'loan_term', 'intro_rate_period', 'property_value', 'total_units' ]
for col in fltCols:
    final.loc[:, col] = final[col].replace('Exempt', np.nan).astype('float64')

# Final data

In [63]:
final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7701735 entries, 0 to 16085832
Data columns (total 29 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   activity_year                 int64  
 1   lei                           object 
 2   derived_msa_md                int64  
 3   state_code                    object 
 4   county_code                   float64
 5   census_tract                  float64
 6   conforming_loan_limit         object 
 7   purchaser_type                object 
 8   loan_type                     object 
 9   loan_purpose                  object 
 10  lien_status                   object 
 11  reverse_mortgage              object 
 12  loan_amount                   int64  
 13  combined_loan_to_value_ratio  object 
 14  interest_rate                 object 
 15  rate_spread                   object 
 16  hoepa_status                  object 
 17  total_loan_costs              object 
 18  total_points_and_fees     

In [64]:
final.to_csv('sfmOrigNonbiz22.csv')