# Dataset Ingestion and Combination
Key Factors of Interest:
**Outcome** - House Price per County

Predictors:
* Income level by county
* Climate Risk by county
* Population
* Type of climate risk by county
* Population Density or County Size
* Average School District rating
* Employment Rate
* Demographic data by county
* Democratic vs Republican party affiliation
* Tax Rate by County
* Crime Rate

# PROGRESS: Risk, House Price and Average income have been cleaned/merged 
This includes the factors: income level, climate risk, population, county size, house prices.

Remaining datasets: 
* Demographics (Atreya)
* Crime Rate (J)
* Tax Rate (S)
* Political Leaning (J)
* Employment Rate (S)
* School Ratings -- if we find data!

In [2]:
# load libraries
import pandas as pd
import numpy as np
import re

## 1. Dataset: National Risk Index by County

In [4]:
risk = pd.read_csv('data/NRI_Table-Counties.csv')
risk.head()

Unnamed: 0,OID_,NRI_ID,STATE,STATEABBRV,STATEFIPS,COUNTY,COUNTYTYPE,COUNTYFIPS,STCOFIPS,POPULATION,...,WNTW_EALS,WNTW_EALR,WNTW_ALRB,WNTW_ALRP,WNTW_ALRA,WNTW_ALR_NPCTL,WNTW_RISKV,WNTW_RISKS,WNTW_RISKR,NRI_VER
0,68,C02013,Alaska,AK,2,Aleutians East,Borough,13,2013,3374,...,21.603219,Relatively Low,1.34042e-06,2.98117e-07,0.0,73.939957,19576.20893,23.926185,Very Low,Mar-23
1,69,C02016,Alaska,AK,2,Aleutians West,Census Area,16,2016,5168,...,22.253172,Relatively Low,6.66633e-07,1.96719e-07,0.0,58.712473,17666.42991,21.826281,Very Low,Mar-23
2,70,C02020,Alaska,AK,2,Anchorage,Municipality,20,2020,290985,...,44.599195,Relatively Low,1.63971e-09,1.10758e-08,2e-06,8.356546,44202.71367,42.729876,Relatively Low,Mar-23
3,71,C02050,Alaska,AK,2,Bethel,Census Area,50,2050,18633,...,66.914268,Relatively Moderate,5.23063e-07,4.26661e-07,0.0,80.810894,160878.4458,75.119313,Relatively Moderate,Mar-23
4,72,C02060,Alaska,AK,2,Bristol Bay,Borough,60,2060,843,...,11.513463,Very Low,1.03385e-06,4.30138e-07,0.0,74.58991,5566.346968,8.335985,Very Low,Mar-23


### 1.1. Feature Evaluation:
What is the difference between RISK_VALUE, RISK_SCORE AND RISK_RATNG? Which do we want to focus on?
## Decision: Use RISK_SCORE

In [6]:
# Create order to risk ratings (can later be used instead of categories for ordinal variable)
risk_ord = pd.DataFrame({'RISK_RATNG':['Insufficient Data', 'Very Low', 'Relatively Low', 'Relatively Moderate', 'Relatively High', 
                                       'Very High'], 
                         'risk_level_num':[0,1,2,3,4,5]})
risk_ord.columns

Index(['RISK_RATNG', 'risk_level_num'], dtype='object')

In [7]:
# Compare Risk Rating to Risk Values
rate_val = risk[['RISK_RATNG', 'RISK_VALUE']].groupby('RISK_RATNG').describe().reset_index()
rate_val.columns = [col_tup[0] + col_tup[1] for col_tup in rate_val.columns]
pd.merge(rate_val, risk_ord, on = 'RISK_RATNG',how="left").sort_values(
    'risk_level_num').set_index('RISK_RATNG').drop(columns = ['risk_level_num'])

Unnamed: 0_level_0,RISK_VALUEcount,RISK_VALUEmean,RISK_VALUEstd,RISK_VALUEmin,RISK_VALUE25%,RISK_VALUE50%,RISK_VALUE75%,RISK_VALUEmax
RISK_RATNG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Insufficient Data,0.0,,,,,,,
Very Low,1511.0,3046019.0,1520213.0,35741.27,1822784.0,3038482.0,4244946.0,5928197.0
Relatively Low,1091.0,11628200.0,4793438.0,5945910.0,7593445.0,10428520.0,14581740.0,23838670.0
Relatively Moderate,397.0,46984400.0,21437230.0,23882580.0,28830070.0,40390640.0,58408930.0,105313200.0
Relatively High,129.0,223091700.0,104134500.0,108473000.0,140565400.0,190796000.0,283543100.0,553807400.0
Very High,15.0,1518860000.0,1190628000.0,656902700.0,921100600.0,1246484000.0,1446401000.0,5326193000.0


In [8]:
# Compare Risk Rating to Risk Scores
rate_score = risk[['RISK_RATNG', 'RISK_SCORE']].groupby('RISK_RATNG').describe().reset_index()
rate_score.columns = [col_tup[0] + col_tup[1] for col_tup in rate_score.columns]
pd.merge(rate_score, risk_ord, on = 'RISK_RATNG',how="left").sort_values(
    'risk_level_num').set_index('RISK_RATNG').drop(columns = ['risk_level_num'])

Unnamed: 0_level_0,RISK_SCOREcount,RISK_SCOREmean,RISK_SCOREstd,RISK_SCOREmin,RISK_SCORE25%,RISK_SCORE50%,RISK_SCORE75%,RISK_SCOREmax
RISK_RATNG,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Insufficient Data,0.0,,,,,,,
Very Low,1511.0,24.053452,13.882674,0.031817,12.042634,24.053452,36.06427,48.075088
Relatively Low,1091.0,65.447025,10.025099,48.106904,56.776965,65.447025,74.117086,82.787146
Relatively Moderate,397.0,89.118676,3.650916,82.818963,85.96882,89.118676,92.268533,95.41839
Relatively High,129.0,97.486478,1.18941,95.450207,96.468342,97.486478,98.504613,99.522749
Very High,15.0,99.777283,0.142289,99.554566,99.665924,99.777283,99.888641,100.0


In [16]:
# Key Metrics
main_df = risk[['STATEABBRV', 'COUNTY', 'POPULATION', 'AREA', 'RISK_VALUE', 'RISK_SCORE', 'RISK_RATNG']]
main_df.columns = [col.lower() for col in main_df.columns]
main_df = main_df.rename(columns = {'stateabbrv': 'state'})

# convert county names St. -> Saint
new_county = []
for county in main_df['county'].to_list():
    if county[:3] == 'St.':
        county = 'Saint' + county[3:]
    county = county.replace('ñ', 'n')
    new_county.append(county)
main_df['county'] = new_county

# make county match col which removes all special characters, spaces and lowercases county name
main_df['county_match'] = [re.sub("[^a-zA-Z]", "", county).lower() for county in main_df['county'].to_list()]
    
main_df.head()

Unnamed: 0,state,county,population,area,risk_value,risk_score,risk_ratng,county_match
0,AK,Aleutians East,3374,15167.69423,894965.3,4.231626,Very Low,aleutianseast
1,AK,Aleutians West,5168,14258.9931,4151810.0,35.221126,Very Low,aleutianswest
2,AK,Anchorage,290985,1966.338483,94565710.0,94.845689,Relatively Moderate,anchorage
3,AK,Bethel,18633,46015.50707,2354071.0,16.926503,Very Low,bethel
4,AK,Bristol Bay,843,857.255664,105560.5,0.159084,Very Low,bristolbay


## 2. Dataset: Average House Price by County
These are average house price for the 3 bedroom homes in that county.

In [18]:
house_price = pd.read_csv('data/County_house_price.csv')
house_price.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,Metro,StateCodeFIPS,MunicipalCodeFIPS,2000-01-31,...,2024-01-31,2024-02-29,2024-03-31,2024-04-30,2024-05-31,2024-06-30,2024-07-31,2024-08-31,2024-09-30,2024-10-31
0,3101,0,Los Angeles County,county,CA,CA,"Los Angeles-Long Beach-Anaheim, CA",6,37,201912.261325,...,835213.356408,830842.704551,827397.218457,828352.309368,831693.814162,834118.549764,837980.932568,843352.706676,850013.948373,855185.284307
1,139,1,Cook County,county,IL,IL,"Chicago-Naperville-Elgin, IL-IN-WI",17,31,138786.730715,...,299896.888776,301161.714525,303681.845144,306872.630594,309311.366294,310861.359712,311880.935833,312853.381077,313759.921596,314437.668729
2,1090,2,Harris County,county,TX,TX,"Houston-The Woodlands-Sugar Land, TX",48,201,93763.835667,...,251433.448989,251981.268405,252949.926573,253822.006471,254347.717766,254240.281757,253935.30037,253584.97344,253302.433848,252839.522624
3,2402,3,Maricopa County,county,AZ,AZ,"Phoenix-Mesa-Chandler, AZ",4,13,128670.519165,...,436641.580342,437524.657383,439110.043819,440866.652127,442237.549448,442554.952236,442213.24428,441299.790052,440419.600392,439637.593015
4,2841,4,San Diego County,county,CA,CA,"San Diego-Chula Vista-Carlsbad, CA",6,73,203600.293045,...,856166.371406,860330.459584,867899.083605,877411.697822,885481.043789,889656.992704,891049.286486,890860.303286,890429.118481,889771.059763


In [20]:
print(f'The average house price dataset contains {house_price.shape[0]} counties.')
print(f'The risk dataset contains {main_df.shape[0]} rows.')
print('So there will not be complete overlap for all data.')

The average house price dataset contains 2802 counties.
The risk dataset contains 3231 rows.
So there will not be complete overlap for all data.


In [22]:
# Key Factors Subset
price_subset = house_price[['RegionName', 'State', '2023-12-31']]
price_subset.columns = ['county', 'state', 'price']
price_subset

Unnamed: 0,county,state,price
0,Los Angeles County,CA,835952.468135
1,Cook County,IL,299433.169006
2,Harris County,TX,251136.409812
3,Maricopa County,AZ,435967.020147
4,San Diego County,CA,854004.938333
...,...,...,...
2797,Keya Paha County,NE,229801.607299
2798,Golden Valley County,MT,248437.014530
2799,Mineral County,CO,444288.990340
2800,Hooker County,NE,101293.812292


### 2.1. Combine House Price

In [25]:
# strip 'County', 'Borough' from RegionName
new_reg_name = []
for county in price_subset['county']:
    if county[-6:] == 'County':
        county = county[:-7]
    elif county[-7:] == 'Borough':
        county = county[:-8]
    elif county[-4:] == 'City':
        county = county[:-5]
    elif county[-6:] == 'Parish':
        county = county[:-7]
    new_reg_name.append(county)
price_subset['county'] = new_reg_name

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  price_subset['county'] = new_reg_name


In [27]:
price_subset['county_match'] = [re.sub("[^a-zA-Z]", "", county).lower() for county in price_subset['county'].to_list()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  price_subset['county_match'] = [re.sub("[^a-zA-Z]", "", county).lower() for county in price_subset['county'].to_list()]


In [29]:
merged = pd.merge(main_df, price_subset, on = ['state', 'county_match'], how = 'outer')
merged

Unnamed: 0,state,county_x,population,area,risk_value,risk_score,risk_ratng,county_match,county_y,price
0,AK,Aleutians East,3374.0,15167.694230,8.949653e+05,4.231626,Very Low,aleutianseast,,
1,AK,Aleutians West,5168.0,14258.993100,4.151810e+06,35.221126,Very Low,aleutianswest,,
2,AK,Anchorage,290985.0,1966.338483,9.456571e+07,94.845689,Relatively Moderate,anchorage,Anchorage,3.733779e+05
3,AK,Bethel,18633.0,46015.507070,2.354071e+06,16.926503,Very Low,bethel,,
4,AK,Bristol Bay,843.0,857.255664,1.055605e+05,0.159084,Very Low,bristolbay,,
...,...,...,...,...,...,...,...,...,...,...
3240,WY,Sweetwater,42238.0,10597.074000,1.189081e+06,6.617881,Very Low,sweetwater,Sweetwater,2.761582e+05
3241,WY,Teton,23250.0,4259.833177,1.141156e+07,68.501432,Relatively Low,teton,Teton,2.334408e+06
3242,WY,Uinta,20412.0,2109.220457,2.821381e+06,21.699014,Very Low,uinta,Uinta,2.883248e+05
3243,WY,Washakie,7662.0,2265.341661,6.211963e+05,2.513522,Very Low,washakie,Washakie,2.509363e+05


In [31]:
# Counties that exist in House Price df, but not in risk df
merged[merged['risk_ratng'].isnull()]

Unnamed: 0,state,county_x,population,area,risk_value,risk_score,risk_ratng,county_match,county_y,price
1501,MO,,,,,,,saintegenevieve,Sainte Genevieve,223287.106181
1977,NV,,,,,,,carson,Carson,446056.873731


In [33]:
# Counties that exist in Risk df, but not in House Price df
merged[merged['price'].isnull()]

Unnamed: 0,state,county_x,population,area,risk_value,risk_score,risk_ratng,county_match,county_y,price
0,AK,Aleutians East,3374.0,15167.694230,8.949653e+05,4.231626,Very Low,aleutianseast,,
1,AK,Aleutians West,5168.0,14258.993100,4.151810e+06,35.221126,Very Low,aleutianswest,,
3,AK,Bethel,18633.0,46015.507070,2.354071e+06,16.926503,Very Low,bethel,,
4,AK,Bristol Bay,843.0,857.255664,1.055605e+05,0.159084,Very Low,bristolbay,,
5,AK,Chugach,7011.0,15296.227860,1.412609e+06,8.017817,Very Low,chugach,,
...,...,...,...,...,...,...,...,...,...,...
3040,VI,Saint John,3882.0,92.690701,,,Insufficient Data,saintjohn,,
3041,VI,Saint Thomas,41412.0,311.621622,,,Insufficient Data,saintthomas,,
3062,WA,Columbia,3947.0,882.252750,7.855905e+05,3.595291,Very Low,columbia,,
3067,WA,Garfield,2285.0,725.536389,3.339078e+05,0.827235,Very Low,garfield,,


In [35]:
# Define new main_df as merged:
main_df = merged

## 3. Dataset: Average Income by County

In [38]:
income = pd.read_excel('data/county_income.xlsx', header = [0,1,2,3])
income.drop([0,3218, 3219, 3220], inplace=True)

In [82]:
income

Unnamed: 0_level_0,"Table 1. Per Capita Personal Income, by County, 2021–2023","Table 1. Per Capita Personal Income, by County, 2021–2023","Table 1. Per Capita Personal Income, by County, 2021–2023","Table 1. Per Capita Personal Income, by County, 2021–2023","Table 1. Per Capita Personal Income, by County, 2021–2023","Table 1. Per Capita Personal Income, by County, 2021–2023","Table 1. Per Capita Personal Income, by County, 2021–2023","Table 1. Per Capita Personal Income, by County, 2021–2023"
Unnamed: 0_level_1,Unnamed: 0_level_1,Per capita personal income1,Per capita personal income1,Per capita personal income1,Per capita personal income1,Percent change from preceding period,Percent change from preceding period,Percent change from preceding period
Unnamed: 0_level_2,Unnamed: 0_level_2,Dollars,Dollars,Dollars,Rank in state,Percent change,Percent change,Rank in state
Unnamed: 0_level_3,Unnamed: 0_level_3,2021,2022,2023,2023,2022,2023,2023
1,,,,,,,,
2,Alabama,50483.0,51683.0,54209.0,--,2.4,4.9,--
3,Autauga,49174.0,49811.0,53079.0,10,1.3,6.6,6
4,Baldwin,56285.0,57621.0,60969.0,4,2.4,5.8,12
5,Barbour,40954.0,41031.0,41531.0,56,0.2,1.2,58
...,...,...,...,...,...,...,...,...
3213,Sweetwater,56150.0,60115.0,64115.0,11,7.1,6.7,8
3214,Teton,353263.0,418669.0,471751.0,1,18.5,12.7,1
3215,Uinta,44685.0,46401.0,49350.0,23,3.8,6.4,11
3216,Washakie,58601.0,54752.0,57474.0,17,-6.6,5.0,17


In [40]:
income_subset = income[[('Table 1. Per Capita Personal Income, by County, 2021–2023',
  'Unnamed: 0_level_1',
  'Unnamed: 0_level_2',
  'Unnamed: 0_level_3'),('Table 1. Per Capita Personal Income, by County, 2021–2023',
  'Per capita personal income1',
  'Dollars',
  2023)]]
income_subset.columns = ['county', 'income']
income_subset['state'] = [None]*income_subset.shape[0]
income_subset.drop(income_subset.index[2986:3018], inplace = True)
income_subset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  income_subset['state'] = [None]*income_subset.shape[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  income_subset.drop(income_subset.index[2986:3018], inplace = True)


Unnamed: 0,county,income,state
1,,,
2,Alabama,54209.0,
3,Autauga,53079.0,
4,Baldwin,60969.0,
5,Barbour,41531.0,
...,...,...,...
3213,Sweetwater,64115.0,
3214,Teton,471751.0,
3215,Uinta,49350.0,
3216,Washakie,57474.0,


In [42]:
null_indexes = list(income_subset[income_subset['county'].isna()].index) + [3218]
for i in range(len(null_indexes)-1):
    start, end = null_indexes[i], null_indexes[i+1]
    income_subset.loc[start+1:end-1, "state"] = income_subset.loc[start+1]['county']

In [44]:
null_indexes = null_indexes[:-1]
income_subset.drop(null_indexes, inplace = True)
income_subset.drop(np.array(null_indexes) + 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  income_subset.drop(null_indexes, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  income_subset.drop(np.array(null_indexes) + 1, inplace = True)


In [46]:
states = [
    ('Alabama', 'AL'), ('Alaska', 'AK'), ('Arizona', 'AZ'), ('Arkansas', 'AR'), ('California', 'CA'),
    ('Colorado', 'CO'), ('Connecticut', 'CT'), ('Delaware', 'DE'), ('Florida', 'FL'), ('Georgia', 'GA'),
    ('Hawaii', 'HI'), ('Idaho', 'ID'), ('Illinois', 'IL'), ('Indiana', 'IN'), ('Iowa', 'IA'),
    ('Kansas', 'KS'), ('Kentucky', 'KY'), ('Louisiana', 'LA'), ('Maine', 'ME'), ('Maryland', 'MD'),
    ('Massachusetts', 'MA'), ('Michigan', 'MI'), ('Minnesota', 'MN'), ('Mississippi', 'MS'), ('Missouri', 'MO'),
    ('Montana', 'MT'), ('Nebraska', 'NE'), ('Nevada', 'NV'), ('New Hampshire', 'NH'), ('New Jersey', 'NJ'),
    ('New Mexico', 'NM'), ('New York', 'NY'), ('North Carolina', 'NC'), ('North Dakota', 'ND'), ('Ohio', 'OH'),
    ('Oklahoma', 'OK'), ('Oregon', 'OR'), ('Pennsylvania', 'PA'), ('Rhode Island', 'RI'), ('South Carolina', 'SC'),
    ('South Dakota', 'SD'), ('Tennessee', 'TN'), ('Texas', 'TX'), ('Utah', 'UT'), ('Vermont', 'VT'),
    ('Virginia', 'VA'), ('Washington', 'WA'), ('West Virginia', 'WV'), ('Wisconsin', 'WI'), ('Wyoming', 'WY')
]

state_abbr_df = pd.DataFrame(states, columns=['state', 'abbreviation'])

In [48]:
inc = pd.merge(income_subset, state_abbr_df, on = 'state', how='left')
inc.drop(columns = 'state', inplace = True)
inc.rename(columns = {'abbreviation': 'state'}, inplace = True)
inc

Unnamed: 0,county,income,state
0,Autauga,53079.0,AL
1,Baldwin,60969.0,AL
2,Barbour,41531.0,AL
3,Bibb,39835.0,AL
4,Blount,45021.0,AL
...,...,...,...
3078,Sweetwater,64115.0,WY
3079,Teton,471751.0,WY
3080,Uinta,49350.0,WY
3081,Washakie,57474.0,WY


### 3.1. Merge Income Data

In [51]:
print(f'Income Data has {inc.shape[0]} counties. So, some counties that may be in price/risk datasets will not be here.')

Income Data has 3083 counties. So, some counties that may be in price/risk datasets will not be here.


In [53]:
main_df.rename(columns = {'county_x': 'county_risk', 'county_y':'county_price'}, inplace = True)

In [55]:
# remove endings like "borough" from end of county name
new_county = []
for county in inc['county'].to_list():
    tails = ['City and Borough', 'Borough', 'Census Area', 'Municipality']
    for end in tails:
        if county[-len(end):] == end:
            county = county[:-len(end)-1]
    if county == 'Wise + Norton':
        county = 'Wise'
    if county == 'York + Poquoson':
        county = "York"
    new_county.append(county.replace('St.', 'Saint'))
inc['county'] = new_county

inc['county_match'] = [re.sub("[^a-zA-Z]", "", county).lower() for county in inc['county'].to_list()]
inc.rename(columns = {'county': 'county_inc'}, inplace = True)
inc

Unnamed: 0,county_inc,income,state,county_match
0,Autauga,53079.0,AL,autauga
1,Baldwin,60969.0,AL,baldwin
2,Barbour,41531.0,AL,barbour
3,Bibb,39835.0,AL,bibb
4,Blount,45021.0,AL,blount
...,...,...,...,...
3078,Sweetwater,64115.0,WY,sweetwater
3079,Teton,471751.0,WY,teton
3080,Uinta,49350.0,WY,uinta
3081,Washakie,57474.0,WY,washakie


In [57]:
merged = pd.merge(main_df, inc, on = ['state','county_match'], how = 'left')
merged

Unnamed: 0,state,county_risk,population,area,risk_value,risk_score,risk_ratng,county_match,county_price,price,county_inc,income
0,AK,Aleutians East,3374.0,15167.694230,8.949653e+05,4.231626,Very Low,aleutianseast,,,Aleutians East,65421.0
1,AK,Aleutians West,5168.0,14258.993100,4.151810e+06,35.221126,Very Low,aleutianswest,,,Aleutians West,72234.0
2,AK,Anchorage,290985.0,1966.338483,9.456571e+07,94.845689,Relatively Moderate,anchorage,Anchorage,3.733779e+05,Anchorage,79183.0
3,AK,Bethel,18633.0,46015.507070,2.354071e+06,16.926503,Very Low,bethel,,,Bethel,48831.0
4,AK,Bristol Bay,843.0,857.255664,1.055605e+05,0.159084,Very Low,bristolbay,,,Bristol Bay,169751.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3240,WY,Sweetwater,42238.0,10597.074000,1.189081e+06,6.617881,Very Low,sweetwater,Sweetwater,2.761582e+05,Sweetwater,64115.0
3241,WY,Teton,23250.0,4259.833177,1.141156e+07,68.501432,Relatively Low,teton,Teton,2.334408e+06,Teton,471751.0
3242,WY,Uinta,20412.0,2109.220457,2.821381e+06,21.699014,Very Low,uinta,Uinta,2.883248e+05,Uinta,49350.0
3243,WY,Washakie,7662.0,2265.341661,6.211963e+05,2.513522,Very Low,washakie,Washakie,2.509363e+05,Washakie,57474.0


Income data does not have US Territory data - only data within the 50 states. It is also missing granular data in Virginia, but has data regarding combinations of areas, which we may be able to leverage if interested.

In [60]:
# reset main_df to merged
main_df = merged
main_df

Unnamed: 0,state,county_risk,population,area,risk_value,risk_score,risk_ratng,county_match,county_price,price,county_inc,income
0,AK,Aleutians East,3374.0,15167.694230,8.949653e+05,4.231626,Very Low,aleutianseast,,,Aleutians East,65421.0
1,AK,Aleutians West,5168.0,14258.993100,4.151810e+06,35.221126,Very Low,aleutianswest,,,Aleutians West,72234.0
2,AK,Anchorage,290985.0,1966.338483,9.456571e+07,94.845689,Relatively Moderate,anchorage,Anchorage,3.733779e+05,Anchorage,79183.0
3,AK,Bethel,18633.0,46015.507070,2.354071e+06,16.926503,Very Low,bethel,,,Bethel,48831.0
4,AK,Bristol Bay,843.0,857.255664,1.055605e+05,0.159084,Very Low,bristolbay,,,Bristol Bay,169751.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3240,WY,Sweetwater,42238.0,10597.074000,1.189081e+06,6.617881,Very Low,sweetwater,Sweetwater,2.761582e+05,Sweetwater,64115.0
3241,WY,Teton,23250.0,4259.833177,1.141156e+07,68.501432,Relatively Low,teton,Teton,2.334408e+06,Teton,471751.0
3242,WY,Uinta,20412.0,2109.220457,2.821381e+06,21.699014,Very Low,uinta,Uinta,2.883248e+05,Uinta,49350.0
3243,WY,Washakie,7662.0,2265.341661,6.211963e+05,2.513522,Very Low,washakie,Washakie,2.509363e+05,Washakie,57474.0


## 4. Crime Rate

In [70]:
crime = pd.read_csv('data/crime_report.csv')
crime.drop(crime[crime['report_year'] != 2015].index, inplace = True)
crime

Unnamed: 0,report_year,agency_code,agency_jurisdiction,population,violent_crimes,homicides,rapes,assaults,robberies,months_reported,crimes_percapita,homicides_percapita,rapes_percapita,assaults_percapita,robberies_percapita
2760,2015,NM00101,"Albuquerque, NM",559721.0,5406.0,43.0,404.0,3273.0,1686.0,,965.84,7.68,72.18,584.76,301.22
2761,2015,TX22001,"Arlington, TX",387565.0,1946.0,8.0,208.0,1201.0,529.0,,502.11,2.06,53.67,309.88,136.49
2762,2015,GAAPD00,"Atlanta, GA",464710.0,5203.0,94.0,170.0,2944.0,1995.0,,1119.62,20.23,36.58,633.51,429.30
2763,2015,CO00101,"Aurora, CO",360237.0,1660.0,24.0,352.0,837.0,447.0,,460.81,6.66,97.71,232.35,124.08
2764,2015,TX22701,"Austin, TX",938728.0,3497.0,23.0,487.0,2058.0,929.0,,372.53,2.45,51.88,219.23,98.96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2824,2015,OK07205,"Tulsa, OK",401520.0,3628.0,55.0,365.0,2354.0,854.0,,903.57,13.70,90.90,586.27,212.69
2825,2015,VA12800,"Virginia Beach, VA",452797.0,626.0,19.0,103.0,234.0,270.0,,138.25,4.20,22.75,51.68,59.63
2826,2015,DCMPD00,"Washington, DC",672228.0,8084.0,162.0,494.0,4024.0,3404.0,,1202.57,24.10,73.49,598.61,506.38
2827,2015,KS08703,"Wichita, KS",389824.0,3839.0,27.0,349.0,2730.0,733.0,,984.80,6.93,89.53,700.32,188.03


In [78]:
crime['total_crimes'] = (crime['crimes_percapita'] + crime['homicides_percapita'] + crime['rapes_percapita'] + 
                         crime['assaults_percapita'] + crime['robberies_percapita'])


In [None]:
crime[]