In [1]:
import pandas as pd
import csv
import glob
import plotly.graph_objects as go
import plotly_express as px

In [2]:
### Column headers on the public_lar file data. Definitions: https://ffiec.cfpb.gov/documentation/2018/lar-data-fields/

names = ['activity_year', 'lei', 'derived_msa_md', 'state_code', 'county_code', 'census_tract', 'conforming_loan_limit', 'derived_loan_product_type', 'derived_dwelling_category', 'derived_ethnicity', 'derived_race', 'derived_sex', 'action_taken', 'purchaser_type', 'preapproval', 'loan_type', 'loan_purpose', 'lien_status', 'reverse_mortgage', 'open_end_line_of_credit', 'business_or_commercial_purpose', 'loan_amount', 'loan_to_value_ratio', 'interest_rate', 'rate_spread', 'hoepa_status', 'total_loan_costs', 'total_points_and_fees', 'origination_charges', 'discount_points', 'lender_credits', 'loan_term', 'prepayment_penalty_term', 'intro_rate_period', 'negative_amortization', 'interest_only_payment', 'balloon_payment', 'other_nonamortizing_features', 'property_value', 'construction_method', 'occupancy_type', 'manufactured_home_secured_property_type', 'manufactured_home_land_property_interest', 'total_units', 'multifamily_affordable_units', 'income', 'debt_to_income_ratio', 'applicant_credit_score_type', 'co_applicant_credit_score_type', 'applicant_ethnicity_1', 'applicant_ethnicity_2', 'applicant_ethnicity_3', 'applicant_ethnicity_4', 'applicant_ethnicity_5', 'co_applicant_ethnicity_1', 'co_applicant_ethnicity_2', 'co_applicant_ethnicity_3', 'co_applicant_ethnicity_4', 'co_applicant_ethnicity_5', 'applicant_ethnicity_observed', 'co_applicant_ethnicity_observed', 'applicant_race_1', 'applicant_race_2', 'applicant_race_3', 'applicant_race_4', 'applicant_race_5', 'co_applicant_race_1', 'co_applicant_race_2', 'co_applicant_race_3', 'co_applicant_race_4', 'co_applicant_race_5', 'applicant_race_observed', 'co_applicant_race_observed', 'applicant_sex', 'co_applicant_sex', 'applicant_sex_observed', 'co_applicant_sex_observed', 'applicant_age', 'co_applicant_age', 'applicant_age_above_62', 'co_applicant_age_above_62', 'submission_of_application', 'initially_payable_to_institution', 'aus_1', 'aus_2', 'aus_3', 'aus_4', 'aus_5', 'denial_reason_1', 'denial_reason_2', 'denial_reason_3', 'denial_reason_4', 'tract_population', 'tract_minority_population_percent', 'ffiec_msa_md_median_family_income', 'tract_to_msa_income_percentage', 'tract_owner_occupied_units', 'tract_one_to_four_family_homes', 'tract_median_age_of_housing_units']

In [3]:
### Setting variables for what we might want to filter by

mo = ['MO']
il = ['IL']
stl = ['41180']
stl_c = [29189]
stl_city = [29510]
sf = ['Single Family (1-4 Units):Site-Built', 'Single Family (1-4 Units):Manufactured']
orig = ['1']

In [4]:
### Reading in public_lar csv files one at a time and filtering to get only Missouri/Illinois data

# df31 = pd.read_csv('./data/public_lar/public_lar_2018-31.csv', names=names, low_memory=False)


### Combines all public_lar files nationwide and loads into a huge df. Takes forever. 

# path = './data/public_lar/'
# all_files = glob.glob(path + "/*.csv")

# li = []

# for filename in all_files:
    # df = pd.read_csv(filename, index_col=None, header=0, low_memory=False)
    # li.append(df)

# frame = pd.concat(li, axis=0, ignore_index=True)

### Combines separate Missouri/Illinois loan app files generated from prev processing
### Combines into a large dataframe for each state.
### Takes a little less time. 

# test_path = './data/public_lar/test/'
mo_path = './data/public_lar/mo/'
il_path = './data/public_lar/il/'
# mo_files = glob.glob(mo_path + "/*.csv")
# il_files = glob.glob(il_path + "/*.csv")

# for filename in mo_files:
    # df = pd.read_csv(filename, index_col=None, low_memory=False)
    # mo_list.append(df)

# for filename in il_files:
    # df = pd.read_csv(filename, index_col=None, low_memory=False)
    # il_list.append(df)

# mo_frame = pd.concat(mo_list, axis=0, ignore_index=True)
# il_frame = pd.concat(il_list, axis=0, ignore_index=True)

In [5]:
### Exporting each state's loan application data into its own file

# mo_frame.to_csv('./data/public_lar/missouri.csv')
# il_frame.to_csv('./data/public_lar/illinois.csv')

### Loading the public_lar files for Missouri and Illinois, adding each df to a list, and concatenating all frames 
### So there is one df for each state

mo_filepath = ('./data/public_lar/mo/*.csv')
il_filepath = ('./data/public_lar/il/*.csv')

my_list_mo = []
my_list_il = []

final_mo = []
final_il = []

# for name in glob.glob(mo_filepath):
    # my_list_mo.append(name)
    
# for name in glob.glob(il_filepath):
    # my_list_il.append(name)

# for file in my_list_mo:
   # temp = pd.read_csv(file, engine='python')
# final_mo.append(temp)

# for file in my_list_il:
    # temp = pd.read_csv(file, engine='python')
# final_il.append(temp)

# mo_frame = pd.read_csv('./data/public_lar/mo/missouri.csv', low_memory = False)
# il_frame = pd.read_csv('./data/public_lar/il/illinois.csv', low_memory = False)

In [6]:
mo_07 = pd.read_csv('./data/public_lar/mo/hmda_2007_mo_all-records_labels.csv', low_memory=False)
mo_08 = pd.read_csv('./data/public_lar/mo/hmda_2008_mo_all-records_labels.csv', low_memory=False)
mo_09 = pd.read_csv('./data/public_lar/mo/hmda_2009_mo_all-records_labels.csv', low_memory=False)
mo_10 = pd.read_csv('./data/public_lar/mo/hmda_2010_mo_all-records_labels.csv', low_memory=False)
mo_11 = pd.read_csv('./data/public_lar/mo/hmda_2011_mo_all-records_labels.csv', low_memory=False)
mo_12 = pd.read_csv('./data/public_lar/mo/hmda_2012_mo_all-records_labels.csv', low_memory=False)
mo_13 = pd.read_csv('./data/public_lar/mo/hmda_2013_mo_all-records_labels.csv', low_memory=False)
mo_14 = pd.read_csv('./data/public_lar/mo/hmda_2014_mo_all-records_labels.csv', low_memory=False)
mo_15 = pd.read_csv('./data/public_lar/mo/hmda_2015_mo_all-records_labels.csv', low_memory=False)
mo_16 = pd.read_csv('./data/public_lar/mo/hmda_2016_mo_all-records_labels.csv', low_memory=False)
mo_17 = pd.read_csv('./data/public_lar/mo/hmda_2017_mo_all-records_labels.csv', low_memory=False)
mo_18 = pd.read_csv('./data/public_lar/mo/missouri.csv', low_memory=False)

In [7]:
il_07 = pd.read_csv('./data/public_lar/il/hmda_2007_il_all-records_labels.csv', low_memory=False)
il_08 = pd.read_csv('./data/public_lar/il/hmda_2008_il_all-records_labels.csv', low_memory=False)
il_09 = pd.read_csv('./data/public_lar/il/hmda_2009_il_all-records_labels.csv', low_memory=False)
il_10 = pd.read_csv('./data/public_lar/il/hmda_2010_il_all-records_labels.csv', low_memory=False)
il_11 = pd.read_csv('./data/public_lar/il/hmda_2011_il_all-records_labels.csv', low_memory=False)
il_12 = pd.read_csv('./data/public_lar/il/hmda_2012_il_all-records_labels.csv', low_memory=False)
il_13 = pd.read_csv('./data/public_lar/il/hmda_2013_il_all-records_labels.csv', low_memory=False)
il_14 = pd.read_csv('./data/public_lar/il/hmda_2014_il_all-records_labels.csv', low_memory=False)
il_15 = pd.read_csv('./data/public_lar/il/hmda_2015_il_all-records_labels.csv', low_memory=False)
il_16 = pd.read_csv('./data/public_lar/il/hmda_2016_il_all-records_labels.csv', low_memory=False)
il_17 = pd.read_csv('./data/public_lar/il/hmda_2017_il_all-records_labels.csv', low_memory=False)
il_18 = pd.read_csv('./data/public_lar/il/illinois.csv', low_memory=False)

In [8]:
m07 = mo_07['as_of_year'].count()
m08 = mo_08['as_of_year'].count()
m09 = mo_09['as_of_year'].count()
m10 = mo_10['as_of_year'].count()
m11 = mo_11['as_of_year'].count()
m12 = mo_12['as_of_year'].count()
m13 = mo_13['as_of_year'].count()
m14 = mo_14['as_of_year'].count()
m15 = mo_15['as_of_year'].count()
m16 = mo_16['as_of_year'].count()
m17 = mo_17['as_of_year'].count()
m18 = mo_18['activity_year'].count()

In [9]:
i07 = il_07['as_of_year'].count()
i08 = il_08['as_of_year'].count()
i09 = il_09['as_of_year'].count()
i10 = il_10['as_of_year'].count()
i11 = il_11['as_of_year'].count()
i12 = il_12['as_of_year'].count()
i13 = il_13['as_of_year'].count()
i14 = il_14['as_of_year'].count()
i15 = il_15['as_of_year'].count()
i16 = il_16['as_of_year'].count()
i17 = il_17['as_of_year'].count()
i18 = il_18['activity_year'].count()

In [10]:
count_headers = ['2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018']
mo_counts = [m07, m08, m09, m10, m11, m12, m13, m14, m15, m16, m17, m18]
il_counts = [i07, i08, i09, i10, i11, i12, i13, i14, i15, i16, i17, i18]
mo_counts_df = pd.DataFrame([mo_counts], columns=count_headers)
il_counts_df = pd.DataFrame([il_counts], columns=count_headers)
mo_counts_df['state'] = 'missouri'
il_counts_df['state'] = 'illinois'

In [11]:
final_counts_df = pd.concat([mo_counts_df, il_counts_df])

In [12]:
final_counts = final_counts_df.transpose()

In [13]:
final_counts = final_counts.reset_index()

In [14]:
final_counts.columns = ['year', 'missouri', 'illinois']

In [89]:
fig_mo = px.line(final_counts, x='year', y='missouri')

In [90]:
fig_mo.update_yaxes(rangemode="tozero")

In [91]:
fig_il = px.line(final_counts, x='year', y='illinois')

In [92]:
fig_il.update_yaxes(rangemode="tozero")

In [21]:
stl_07 = mo_07.loc[mo_07['msamd'].isin(stl)]
stl_08 = mo_08.loc[mo_08['msamd'].isin(stl)]
stl_09 = mo_09.loc[mo_09['msamd'].isin(stl)]
stl_10 = mo_10.loc[mo_10['msamd'].isin(stl)]
stl_11 = mo_11.loc[mo_11['msamd'].isin(stl)]
stl_12 = mo_12.loc[mo_12['msamd'].isin(stl)]
stl_13 = mo_13.loc[mo_13['msamd'].isin(stl)]
stl_14 = mo_14.loc[mo_14['msamd'].isin(stl)]
stl_15 = mo_15.loc[mo_15['msamd'].isin(stl)]
stl_16 = mo_16.loc[mo_16['msamd'].isin(stl)]
stl_17 = mo_17.loc[mo_17['msamd'].isin(stl)]
stl_18 = mo_18.loc[mo_18['derived_msa_md'].isin(stl)]

In [74]:
stl_mo_all_years_list = [stl_07, stl_08, stl_09, stl_10, stl_11, stl_12, stl_13, stl_14, stl_15, stl_16, stl_17]

In [75]:
final_mo_stl = pd.concat(stl_mo_all_years_list)

In [106]:
final_mo_stl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1704226 entries, 2 to 277842
Data columns (total 78 columns):
as_of_year                        int64
respondent_id                     object
agency_name                       object
agency_abbr                       object
agency_code                       int64
loan_type_name                    object
loan_type                         int64
property_type_name                object
property_type                     int64
loan_purpose_name                 object
loan_purpose                      int64
owner_occupancy_name              object
owner_occupancy                   int64
loan_amount_000s                  float64
preapproval_name                  object
preapproval                       int64
action_taken_name                 object
action_taken                      int64
msamd_name                        object
msamd                             float64
state_name                        object
state_abbr                       

In [22]:
stl_07_il = il_07.loc[il_07['msamd'].isin(stl)]
stl_08_il = il_08.loc[il_08['msamd'].isin(stl)]
stl_09_il = il_09.loc[il_09['msamd'].isin(stl)]
stl_10_il = il_10.loc[il_10['msamd'].isin(stl)]
stl_11_il = il_11.loc[il_11['msamd'].isin(stl)]
stl_12_il = il_12.loc[il_12['msamd'].isin(stl)]
stl_13_il = il_13.loc[il_13['msamd'].isin(stl)]
stl_14_il = il_14.loc[il_14['msamd'].isin(stl)]
stl_15_il = il_15.loc[il_15['msamd'].isin(stl)]
stl_16_il = il_16.loc[il_16['msamd'].isin(stl)]
stl_17_il = il_17.loc[il_17['msamd'].isin(stl)]
stl_18_il = il_18.loc[il_18['derived_msa_md'].isin(stl)]

In [76]:
stl_il_all_years_list = [stl_07_il, stl_08_il, stl_09_il, stl_10_il, stl_11_il, stl_12_il, stl_13_il, stl_14_il, stl_15_il, stl_16_il, stl_17_il]

In [77]:
final_il_stl = pd.concat(stl_il_all_years_list)

In [78]:
final_mo_stl.to_csv('stl_mo_07-17.csv')
stl_18.to_csv('stl_mo_18.csv')

In [79]:
final_il_stl.to_csv('stl_il_07-17.csv')
stl_18_il.to_csv('stl_il_18.csv')

In [23]:
stl_07_c = stl_07['as_of_year'].count()
stl_08_c = stl_08['as_of_year'].count()
stl_09_c = stl_09['as_of_year'].count()
stl_10_c = stl_10['as_of_year'].count()
stl_11_c = stl_11['as_of_year'].count()
stl_12_c = stl_12['as_of_year'].count()
stl_13_c = stl_13['as_of_year'].count()
stl_14_c = stl_14['as_of_year'].count()
stl_15_c = stl_15['as_of_year'].count()
stl_16_c = stl_16['as_of_year'].count()
stl_17_c = stl_17['as_of_year'].count()
stl_18_c = stl_18['activity_year'].count()

In [33]:
stl_07_il_c = stl_07_il['as_of_year'].count()
stl_08_il_c = stl_08_il['as_of_year'].count()
stl_09_il_c = stl_09_il['as_of_year'].count()
stl_10_il_c = stl_10_il['as_of_year'].count()
stl_11_il_c = stl_11_il['as_of_year'].count()
stl_12_il_c = stl_12_il['as_of_year'].count()
stl_13_il_c = stl_13_il['as_of_year'].count()
stl_14_il_c = stl_14_il['as_of_year'].count()
stl_15_il_c = stl_15_il['as_of_year'].count()
stl_16_il_c = stl_16_il['as_of_year'].count()
stl_17_il_c = stl_17_il['as_of_year'].count()
stl_18_il_c = stl_18_il['activity_year'].count()

In [34]:
stl_counts_mo = [stl_07_c, stl_08_c, stl_09_c, stl_10_c, stl_11_c, stl_12_c, stl_13_c, stl_14_c, stl_15_c, stl_16_c, stl_17_c, stl_18_c]

In [35]:
stl_counts_il = [stl_07_il_c, stl_08_il_c, stl_09_il_c, stl_10_il_c, stl_11_il_c, stl_12_il_c, stl_13_il_c, stl_14_il_c, stl_15_il_c, stl_16_il_c, stl_17_il_c, stl_18_il_c]

In [36]:
stl_mo_df = pd.DataFrame([stl_counts_mo], columns=count_headers)
stl_il_df = pd.DataFrame([stl_counts_il], columns=count_headers)

In [37]:
stl_mo_df['state'] = 'missouri'
stl_il_df['state'] = 'illinois'

In [54]:
stl_final_df = pd.concat([stl_mo_df, stl_il_df])

In [55]:
stl_final = stl_final_df.transpose()

In [56]:
stl_final = stl_final.reset_index()

In [57]:
stl_final.columns = ['year', 'missouri', 'illinois']

In [66]:
stl_final = stl_final.drop(stl_final.index[12])

In [68]:
stl_final['both'] = stl_final['missouri'] + stl_final['illinois']

In [69]:
print stl_final

    year missouri illinois    both
0   2007   232507    58717  291224
1   2008   167538    43679  211217
2   2009   207826    53578  261404
3   2010   166614    45822  212436
4   2011   136530    38238  174768
5   2012   174480    46208  220688
6   2013   152249    38873  191122
7   2014    97056    26749  123805
8   2015   120427    30142  150569
9   2016   134399    33124  167523
10  2017   114600    29216  143816
11  2018   109757    27119  136876


In [93]:
stl_fig = px.line(stl_final, x='year', y='both')

In [94]:
stl_fig.update_yaxes(rangemode="tozero")

In [109]:
final_mo_groups = final_mo_stl.groupby(['property_type', 'as_of_year']).size()
print final_mo_groups

property_type  as_of_year
1              2007          229141
               2008          164832
               2009          206116
               2010          164975
               2011          134833
               2012          172609
               2013          150035
               2014           95610
               2015          118988
               2016          132800
               2017          112693
2              2007            2975
               2008            2297
               2009            1478
               2010            1391
               2011            1435
               2012            1598
               2013            1898
               2014            1157
               2015            1080
               2016            1281
               2017            1538
3              2007             391
               2008             409
               2009             232
               2010             248
               2011             262
  

In [112]:
stl_18_groups = stl_18.groupby(['derived_dwelling_category', 'activity_year']).size()
print stl_18_groups

derived_dwelling_category               activity_year
Multifamily:Manufactured                2018                 18
Multifamily:Site-Built                  2018                467
Single Family (1-4 Units):Manufactured  2018               1328
Single Family (1-4 Units):Site-Built    2018             107944
dtype: int64


In [113]:
final_il_groups = final_il_stl.groupby(['property_type', 'as_of_year']).size()
print final_il_groups

property_type  as_of_year
1              2007          57600
               2008          42730
               2009          52903
               2010          44953
               2011          37486
               2012          45483
               2013          38213
               2014          26089
               2015          29477
               2016          32501
               2017          28535
2              2007            997
               2008            776
               2009            577
               2010            770
               2011            661
               2012            617
               2013            536
               2014            570
               2015            561
               2016            519
               2017            565
3              2007            120
               2008            173
               2009             98
               2010             99
               2011             91
               2012          

In [114]:
stl_18_il_groups = stl_18_il.groupby(['derived_dwelling_category', 'activity_year']).size()
print stl_18_il_groups

derived_dwelling_category               activity_year
Multifamily:Manufactured                2018                 6
Multifamily:Site-Built                  2018               183
Single Family (1-4 Units):Manufactured  2018               538
Single Family (1-4 Units):Site-Built    2018             26392
dtype: int64


In [118]:
final_mo_groups_race = final_mo_stl.groupby(['applicant_race_name_1', 'as_of_year']).size()
print final_mo_groups_race['Black or African American']

as_of_year
2007    33189
2008    19031
2009    12137
2010     8560
2011     6517
2012     8110
2013     8943
2014     6953
2015     7803
2016     8806
2017     8230
dtype: int64


In [119]:
print final_mo_groups_race['White']

as_of_year
2007    142000
2008    107204
2009    140415
2010    123035
2011     96260
2012    126827
2013    107269
2014     63407
2015     79743
2016     90646
2017     76956
dtype: int64


In [126]:
stl_18_groups_race = stl_18.groupby(['applicant_race_1', 'activity_year']).size()
print stl_18_groups_race[3]

activity_year
2018    7990
dtype: int64


In [127]:
print stl_18_groups_race[5]

activity_year
2018    74723
dtype: int64


In [None]:
#############################

In [120]:
### Putting the two dataframes into a list

stl_frames_list = [stl_frame1, stl_frame2]

In [121]:
### Concatenating the two separate StL dfs into one, exporting a .csv of loan applications in the StL metro area

stl_frames = pd.concat(stl_frames_list)
stl_frames.to_csv('./data/public_lar/stl_metro.csv')

In [151]:
### Df for StL city

stl_city_df = stl_frames.loc[stl_frames['county_code'].isin(stl_city)]

In [152]:
### Checking out StL city info

stl_city_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11507 entries, 46 to 162491
Data columns (total 100 columns):
Unnamed: 0                                  11507 non-null int64
activity_year                               11507 non-null int64
lei                                         11507 non-null object
derived_msa_md                              11507 non-null int64
state_code                                  11507 non-null object
county_code                                 11507 non-null object
census_tract                                11507 non-null object
conforming_loan_limit                       11306 non-null object
derived_loan_product_type                   11507 non-null object
derived_dwelling_category                   11507 non-null object
derived_ethnicity                           11507 non-null object
derived_race                                11507 non-null object
derived_sex                                 11507 non-null object
action_taken                     

In [153]:
### Filtering to just single-family homes in the metro

stl_sf = stl_frames.loc[stl_frames['derived_dwelling_category'].isin(sf)]

### To just single-family in the city

stl_city_sf = stl_sf.loc[stl_sf['county_code'].isin(stl_city)]

In [161]:
### Checking out how many applications for single-family mortgages

stl_city_sf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11306 entries, 46 to 162491
Data columns (total 100 columns):
Unnamed: 0                                  11306 non-null int64
activity_year                               11306 non-null int64
lei                                         11306 non-null object
derived_msa_md                              11306 non-null int64
state_code                                  11306 non-null object
county_code                                 11306 non-null object
census_tract                                11306 non-null object
conforming_loan_limit                       11306 non-null object
derived_loan_product_type                   11306 non-null object
derived_dwelling_category                   11306 non-null object
derived_ethnicity                           11306 non-null object
derived_race                                11306 non-null object
derived_sex                                 11306 non-null object
action_taken                     

In [162]:
### How many single-family home loans were made in the city of St. Louis?

stl_city_sf_made = stl_city_sf.loc[stl_city_sf['action_taken'].isin(orig)]
stl_city_sf_made.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5252 entries, 84 to 162491
Data columns (total 100 columns):
Unnamed: 0                                  5252 non-null int64
activity_year                               5252 non-null int64
lei                                         5252 non-null object
derived_msa_md                              5252 non-null int64
state_code                                  5252 non-null object
county_code                                 5252 non-null object
census_tract                                5252 non-null object
conforming_loan_limit                       5252 non-null object
derived_loan_product_type                   5252 non-null object
derived_dwelling_category                   5252 non-null object
derived_ethnicity                           5252 non-null object
derived_race                                5252 non-null object
derived_sex                                 5252 non-null object
action_taken                                525

In [156]:
# StL Metro single-family home mortgage data

stl_sf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136202 entries, 0 to 502571
Data columns (total 100 columns):
Unnamed: 0                                  136202 non-null int64
activity_year                               136202 non-null int64
lei                                         136202 non-null object
derived_msa_md                              136202 non-null int64
state_code                                  136202 non-null object
county_code                                 136198 non-null object
census_tract                                136202 non-null object
conforming_loan_limit                       136202 non-null object
derived_loan_product_type                   136202 non-null object
derived_dwelling_category                   136202 non-null object
derived_ethnicity                           136202 non-null object
derived_race                                136202 non-null object
derived_sex                                 136202 non-null object
action_taken        

In [130]:
stl_frames.county_code.unique()

array([29099.0, 29189.0, 29183.0, 29510.0, 29071.0, 29113.0, 29219.0,
       17163.0, 17005.0, '29071', '29189', '29113', '29183', '29099',
       '29510', '29219', nan, 17133.0, 17119.0, 17117.0, 17027.0, 17083.0,
       17013.0], dtype=object)

In [122]:
stl_frames.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136876 entries, 0 to 502571
Data columns (total 100 columns):
Unnamed: 0                                  136876 non-null int64
activity_year                               136876 non-null int64
lei                                         136876 non-null object
derived_msa_md                              136876 non-null int64
state_code                                  136876 non-null object
county_code                                 136872 non-null object
census_tract                                136876 non-null object
conforming_loan_limit                       136202 non-null object
derived_loan_product_type                   136876 non-null object
derived_dwelling_category                   136876 non-null object
derived_ethnicity                           136876 non-null object
derived_race                                136876 non-null object
derived_sex                                 136876 non-null object
action_taken        

In [100]:
### Filtering the larger dataframes into just the states we are interested in

df31_mo = df31.loc[df31['state_code'].isin(mo)]
df31_il = df31.loc[df31['state_code'].isin(il)]

In [101]:
### Exporting each state frame to a .csv

df31_mo.to_csv('./data/public_lar/mo/mo31.csv')
df31_il.to_csv('./data/public_lar/il/il31.csv')

In [None]:
###############################################

In [None]:
frame = pd.concat(df1, df2)

In [9]:
d1_mo = d1.loc[d1['state_code'].isin(mo)]

In [15]:
d1_stl = d1.loc[d1['derived_msa_md'].isin(stl)]

In [None]:
d1_stl = d1.loc[d1['derived_msa_md'].isin(stl)]

In [24]:
# To preview data 

row_count = 5
chunks = []
for chunk in pd.read_csv('./data/2018_public_lar_csv.csv', chunksize=row_count): 
    print chunk.head()
    # yeschunk = ['derived_msa_md']=='41180'

   activity_year                   lei  derived_msa_md state_code  \
0           2018  549300HW662MN1WU8550           40140         CA   
1           2018  549300HW662MN1WU8550           33460         MN   
2           2018  549300HW662MN1WU8550           47664         MI   
3           2018  549300HW662MN1WU8550           12060         GA   
4           2018  549300HW662MN1WU8550           29820         NV   

   county_code  census_tract conforming_loan_limit derived_loan_product_type  \
0         6071    6071009709                     C            FHA:First Lien   
1        27141   27141030101                     C   Conventional:First Lien   
2        26125   26125183900                     C   Conventional:First Lien   
3        13057   13057090400                     C   Conventional:First Lien   
4        32003   32003005853                     C   Conventional:First Lien   

              derived_dwelling_category       derived_ethnicity  ...  \
0  Single Family (1-4 Units):Sit

    activity_year                   lei  derived_msa_md state_code  \
20           2018  549300HW662MN1WU8550           27340         NC   
21           2018  549300HW662MN1WU8550           47664         MI   
22           2018  549300HW662MN1WU8550           47664         MI   
23           2018  549300HW662MN1WU8550           16974         IL   
24           2018  549300HW662MN1WU8550           40140         CA   

    county_code  census_tract conforming_loan_limit derived_loan_product_type  \
20        37133   37133002201                     C            FHA:First Lien   
21        26125   26125128900                     C   Conventional:First Lien   
22        26125   26125145400                     C   Conventional:First Lien   
23        17031   17031808002                     C   Conventional:First Lien   
24         6065    6065046403                     C   Conventional:First Lien   

               derived_dwelling_category       derived_ethnicity  ...  \
20  Single Family (

    activity_year                   lei  derived_msa_md state_code  \
40           2018  549300HW662MN1WU8550           29180         LA   
41           2018  549300HW662MN1WU8550           48424         FL   
42           2018  549300HW662MN1WU8550           27260         FL   
43           2018  549300HW662MN1WU8550           29460         FL   
44           2018  549300HW662MN1WU8550           31084         CA   

    county_code  census_tract conforming_loan_limit derived_loan_product_type  \
40        22055   22055002103                     C            FHA:First Lien   
41        12099   12099005814                     C            FHA:First Lien   
42        12109   12109020804                     C   Conventional:First Lien   
43        12105   12105011902                     C   Conventional:First Lien   
44         6037    6037236100                     C   Conventional:First Lien   

               derived_dwelling_category        derived_ethnicity  ...  \
40  Single Family 

    activity_year                   lei  derived_msa_md state_code  \
60           2018  549300HW662MN1WU8550           35614         NJ   
61           2018  549300HW662MN1WU8550           11244         CA   
62           2018  549300HW662MN1WU8550           36740         FL   
63           2018  549300HW662MN1WU8550           38300         PA   
64           2018  549300HW662MN1WU8550           36740         FL   

    county_code  census_tract conforming_loan_limit derived_loan_product_type  \
60        34017   34017006100                     C   Conventional:First Lien   
61         6059    6059063903                     C   Conventional:First Lien   
62        12097   12097043204                     C             VA:First Lien   
63        42003   42003413201                     C   Conventional:First Lien   
64        12095   12095017804                     C            FHA:First Lien   

               derived_dwelling_category       derived_ethnicity  ...  \
60  Single Family (

    activity_year                   lei  derived_msa_md state_code  \
80           2018  549300HW662MN1WU8550           12580         MD   
81           2018  549300HW662MN1WU8550           31140         KY   
82           2018  549300HW662MN1WU8550           36260         UT   
83           2018  549300HW662MN1WU8550           30860         UT   
84           2018  549300HW662MN1WU8550           30860         UT   

    county_code  census_tract conforming_loan_limit derived_loan_product_type  \
80        24005   24005400400                     C   Conventional:First Lien   
81        21111   21111011110                     C   Conventional:First Lien   
82        49011   49011126002                     C   Conventional:First Lien   
83        49005   49005000202                     C   Conventional:First Lien   
84        49005   49005000202                     C   Conventional:First Lien   

               derived_dwelling_category        derived_ethnicity  ...  \
80  Single Family 

    activity_year                   lei  derived_msa_md state_code  \
95           2018  549300HW662MN1WU8550           45300         FL   
96           2018  549300HW662MN1WU8550           12060         GA   
97           2018  549300HW662MN1WU8550           26420         TX   
98           2018  549300HW662MN1WU8550           45300         FL   
99           2018  549300HW662MN1WU8550           12060         GA   

    county_code  census_tract conforming_loan_limit derived_loan_product_type  \
95        12103   12103022501                     C            FHA:First Lien   
96        13067   13067030905                     C   Conventional:First Lien   
97        48201   48201250701                     C   Conventional:First Lien   
98        12057   12057012213                     C   Conventional:First Lien   
99        13057   13057090702                     C   Conventional:First Lien   

               derived_dwelling_category        derived_ethnicity  ...  \
95  Single Family 

     activity_year                   lei  derived_msa_md state_code  \
115           2018  549300HW662MN1WU8550           99999         MS   
116           2018  549300HW662MN1WU8550           15764         MA   
117           2018  549300HW662MN1WU8550           45300         FL   
118           2018  549300HW662MN1WU8550           23420         CA   
119           2018  549300HW662MN1WU8550           29460         FL   

     county_code  census_tract conforming_loan_limit  \
115        28087   28087000101                     C   
116        25017   25017310200                     C   
117        12101   12101032902                     C   
118         6019    6019006700                     C   
119        12105   12105012506                     C   

    derived_loan_product_type             derived_dwelling_category  \
115             VA:First Lien  Single Family (1-4 Units):Site-Built   
116   Conventional:First Lien  Single Family (1-4 Units):Site-Built   
117             VA:Firs

     activity_year                   lei  derived_msa_md state_code  \
135           2018  549300HW662MN1WU8550           21660         OR   
136           2018  549300HW662MN1WU8550           39740         PA   
137           2018  549300HW662MN1WU8550           35614         NY   
138           2018  549300HW662MN1WU8550           47664         MI   
139           2018  549300HW662MN1WU8550           47664         MI   

     county_code  census_tract conforming_loan_limit  \
135        41039   41039005100                     C   
136        42011   42011011902                     C   
137        36087   36087011102                     C   
138        26099   26099225800                     C   
139        26125   26125194400                     C   

    derived_loan_product_type             derived_dwelling_category  \
135   Conventional:First Lien  Single Family (1-4 Units):Site-Built   
136   Conventional:First Lien  Single Family (1-4 Units):Site-Built   
137   Conventional:Firs

     activity_year                   lei  derived_msa_md state_code  \
155           2018  549300HW662MN1WU8550           22744         FL   
156           2018  549300HW662MN1WU8550           36260         UT   
157           2018  549300HW662MN1WU8550           16974         IL   
158           2018  549300HW662MN1WU8550           19740         CO   
159           2018  549300HW662MN1WU8550           12060         GA   

     county_code  census_tract conforming_loan_limit  \
155        12011   12011043100                     C   
156        49011   49011126801                     C   
157        17031   17031081403                     C   
158         8005    8005007302                     C   
159        13067   13067030310                     C   

    derived_loan_product_type             derived_dwelling_category  \
155   Conventional:First Lien  Single Family (1-4 Units):Site-Built   
156   Conventional:First Lien  Single Family (1-4 Units):Site-Built   
157   Conventional:Firs

     activity_year                   lei  derived_msa_md state_code  \
170           2018  549300HW662MN1WU8550           10580         NY   
171           2018  549300HW662MN1WU8550           37964         PA   
172           2018  549300HW662MN1WU8550           24540         CO   
173           2018  549300HW662MN1WU8550           33124         FL   
174           2018  549300HW662MN1WU8550           11460         MI   

     county_code  census_tract conforming_loan_limit  \
170        36083   36083051600                     C   
171        42101   42101003702                     C   
172         8123    8123001005                     C   
173        12086   12086016900                     C   
174        26161   26161407600                     C   

    derived_loan_product_type             derived_dwelling_category  \
170            FHA:First Lien  Single Family (1-4 Units):Site-Built   
171            FHA:First Lien  Single Family (1-4 Units):Site-Built   
172   Conventional:Firs

     activity_year                   lei  derived_msa_md state_code  \
190           2018  549300HW662MN1WU8550           38060         AZ   
191           2018  549300HW662MN1WU8550           38940         FL   
192           2018  549300HW662MN1WU8550           48300         WA   
193           2018  549300HW662MN1WU8550           38060         AZ   
194           2018  549300HW662MN1WU8550           35614         NY   

     county_code  census_tract conforming_loan_limit  \
190         4013    4013420214                     C   
191        12111   12111380400                     C   
192        53017   53017950700                     C   
193         4013    4013420210                     C   
194        36081   36081137700                     C   

    derived_loan_product_type             derived_dwelling_category  \
190            FHA:First Lien  Single Family (1-4 Units):Site-Built   
191   Conventional:First Lien  Single Family (1-4 Units):Site-Built   
192            FHA:Firs

     activity_year                   lei  derived_msa_md state_code  \
210           2018  549300HW662MN1WU8550           44140         MA   
211           2018  549300HW662MN1WU8550           99999         NE   
212           2018  549300HW662MN1WU8550           29340         LA   
213           2018  549300HW662MN1WU8550           15380         NY   
214           2018  549300HW662MN1WU8550           45540         FL   

     county_code  census_tract conforming_loan_limit  \
210        25013   25013811000                     C   
211        31053   31053963800                     C   
212        22019   22019001000                     C   
213        36029   36029000800                     C   
214        12119   12119911200                     C   

    derived_loan_product_type             derived_dwelling_category  \
210   Conventional:First Lien  Single Family (1-4 Units):Site-Built   
211            FHA:First Lien  Single Family (1-4 Units):Site-Built   
212   Conventional:Firs

KeyboardInterrupt: 

In [2]:
# To split into smaller csvs of 500,000

divisor = 500000

outfileno = 1
outfile = None

with open('./data/2018_public_lar_csv.csv', 'r') as infile:
    for index, row in enumerate(csv.reader(infile)):
        if index % divisor == 0:
            if outfile is not None:
                outfile.close()
            outfilename = './data/public_lar_2018-{}.csv'.format(outfileno)
            outfile = open(outfilename, 'w')
            outfileno += 1
            writer = csv.writer(outfile)
        writer.writerow(row)