# Import Data and Create Dataframes

- set up the dataframes
- clean up as needed
- join as needed
- save cleaned and joined dataframes

## DEBUGDEBUG
- population_projections has float columns that should be int
- agi_by_state_2017 and agi_by_zip_2017 dataframes display oddly when I call `.info()`. Take a closer look.


# Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Import and Setup Dataframes

In [2]:
# Locations of the raw input files, relative to this notebook
population_projections_csv = '../data/State Population Projections 2004-2030.csv'
agi_by_zip_2017_csv = '../data/income by zip 2017.csv'
agi_by_state_2017_csv = '../data/Adjusted Gross Income Percentiles by State Tax Year 2017.csv'
cbsa2name_csv = '../data/core-based-statistical-areas-cbsas-and-combined-statistical-areas-csas.csv'
outpatient_csv = '../data/tennessee_outpatient_clean.csv'

# Load every file into its own dataframe.
# The CBSA lookup is parsed with the pure-python engine.
cbsa_to_name = pd.read_csv(cbsa2name_csv, engine='python')

# The three provider text columns are pinned to object dtype; low_memory=False
# makes pandas read the file in one pass instead of in chunks.
outpatient_text_dtypes = dict.fromkeys(
    ["provider_name", "provider_street_address", "provider_city"], object)
outpatient = pd.read_csv(outpatient_csv, dtype=outpatient_text_dtypes, low_memory=False)

agi_by_state_2017 = pd.read_csv(agi_by_state_2017_csv)
agi_by_zip_2017 = pd.read_csv(agi_by_zip_2017_csv)

# The population projections file is read as tab-separated.
population_projections = pd.read_csv(population_projections_csv, sep='\t')



In [3]:
# Confirm imports: print a labelled .info() summary for every dataframe.
#
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly. Wrapping it in print() would append a stray "None" line after
# each summary.
imported_frames = [
    ('cbsa_to_name', cbsa_to_name),
    ('outpatient', outpatient),
    ('agi_by_state_2017', agi_by_state_2017),
    ('agi_by_zip_2017', agi_by_zip_2017),
    ('population_projections', population_projections),
]

for position, (frame_label, frame) in enumerate(imported_frames):
    # Every heading after the first is preceded by two blank lines.
    heading_gap = '\n\n' if position else ''
    print(heading_gap + 'This is: ' + frame_label)
    frame.info()




This is: cbsa_to_name
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1918 entries, 0 to 1917
Data columns (total 15 columns):
CBSA Code                                     1918 non-null object
Metropolitan Division Code                    111 non-null object
CSA Code                                      1256 non-null object
CBSA Title                                    1916 non-null object
Metropolitan/Micropolitan Statistical Area    1915 non-null object
Metropolitan Division Title                   1915 non-null object
CSA Title                                     110 non-null object
County/County Equivalent                      1255 non-null object
State Name                                    1915 non-null object
FIPS State Code                               1915 non-null object
FIPS County Code                              1915 non-null object
Central/Outlying County                       1915 non-null object
columnm                                       1915 non-null object
co

# Clean Up Column Names and Dtypes

In [4]:
def snake_case_columns(frame):
    """Return the column labels of ``frame`` with spaces replaced by underscores
    and all letters lowercased.

    Parameters
    ----------
    frame : pandas.DataFrame
        Dataframe whose column labels are strings.

    Returns
    -------
    pandas.Index
        The transformed labels, in the original column order. ``frame`` itself
        is not modified; assign the result to ``frame.columns`` to apply it.
    """
    return frame.columns.str.replace(' ', '_').str.lower()


# Convert spaces to underscores and make lowercase.
# NOTE: move snake_case_columns into my shared library once it is stable.
# The outpatient dataframe is not renamed in this cell.
cbsa_to_name.columns = snake_case_columns(cbsa_to_name)
agi_by_state_2017.columns = snake_case_columns(agi_by_state_2017)
agi_by_zip_2017.columns = snake_case_columns(agi_by_zip_2017)
population_projections.columns = snake_case_columns(population_projections)


In [5]:
# Confirm conversion: print a labelled .info() summary for each dataframe whose
# column names were just normalised.
#
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly. Wrapping it in print() would append a stray "None" line after
# each summary.
renamed_frames = [
    ('cbsa_to_name', cbsa_to_name),
    ('agi_by_state_2017', agi_by_state_2017),
    ('agi_by_zip_2017', agi_by_zip_2017),
    ('population_projections', population_projections),
]

for position, (frame_label, frame) in enumerate(renamed_frames):
    # Every heading after the first is preceded by two blank lines.
    heading_gap = '\n\n' if position else ''
    print(heading_gap + 'This is: ' + frame_label)
    frame.info()




This is: cbsa_to_name
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1918 entries, 0 to 1917
Data columns (total 15 columns):
cbsa_code                                     1918 non-null object
metropolitan_division_code                    111 non-null object
csa_code                                      1256 non-null object
cbsa_title                                    1916 non-null object
metropolitan/micropolitan_statistical_area    1915 non-null object
metropolitan_division_title                   1915 non-null object
csa_title                                     110 non-null object
county/county_equivalent                      1255 non-null object
state_name                                    1915 non-null object
fips_state_code                               1915 non-null object
fips_county_code                              1915 non-null object
central/outlying_county                       1915 non-null object
columnm                                       1915 non-null object
co

In [6]:
# Cast CBSA code to numeric. errors='coerce' turns any value that cannot be
# parsed as a number into NaN instead of raising, so the column ends up as a
# float dtype whenever such values are present.
cbsa_to_name['cbsa_code'] = pd.to_numeric(cbsa_to_name['cbsa_code'], errors='coerce')

# info() prints the summary itself and returns None; calling it directly avoids
# the stray "None" line that print(cbsa_to_name.info()) would add.
cbsa_to_name.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1918 entries, 0 to 1917
Data columns (total 15 columns):
cbsa_code                                     1915 non-null float64
metropolitan_division_code                    111 non-null object
csa_code                                      1256 non-null object
cbsa_title                                    1916 non-null object
metropolitan/micropolitan_statistical_area    1915 non-null object
metropolitan_division_title                   1915 non-null object
csa_title                                     110 non-null object
county/county_equivalent                      1255 non-null object
state_name                                    1915 non-null object
fips_state_code                               1915 non-null object
fips_county_code                              1915 non-null object
central/outlying_county                       1915 non-null object
columnm                                       1915 non-null object
columnn                

In [7]:
# Keep only the rows that have an age group.
# Per the original note, the rows with NaN in 'age_group' are the ones where
# 'notes' == 'Total'.
#
# .copy() makes the filtered result an independent dataframe, so the column
# assignment below does not write into a slice of the unfiltered frame (which
# is what triggers pandas' SettingWithCopyWarning).
population_projections = population_projections[
    population_projections['age_group'].notna()
].copy()

# Convert the year and population columns to int.
# astype(int) raises if any of these columns still contains NaN.
int_columns = ['year', 'year_code', 'projected_populations']
population_projections[int_columns] = population_projections[int_columns].astype(int)

population_projections

Unnamed: 0,notes,year,year_code,age_group,age_group_code,projected_populations
0,,2025,2025,0-4 years,0-4,481714
1,,2025,2025,5-9 years,5-9,466987
2,,2025,2025,10-14 years,10-14,462242
3,,2025,2025,15-19 years,15-19,464989
4,,2025,2025,20-24 years,20-24,466395
5,,2025,2025,25-29 years,25-29,442275
6,,2025,2025,30-34 years,30-34,445935
7,,2025,2025,35-39 years,35-39,436057
8,,2025,2025,40-44 years,40-44,427959
9,,2025,2025,45-49 years,45-49,419593


In [8]:
# The two AGI dataframes looked wrong earlier, so print the first rows of each
# for a closer look.
for frame_label, frame in (('agi_by_state_2017', agi_by_state_2017),
                           ('agi_by_zip_2017', agi_by_zip_2017)):
    print('\n\nThis is: ' + frame_label)
    print(frame.head())




This is: agi_by_state_2017
   statefips state     state_name      total   top_01   top_05    top_10  \
0          0    US  United States  140871623  1408716  7043581  14087162   
1          1    AL        Alabama    1912769    19128    95638    191277   
2          2    AK         Alaska     317892     3179    15895     31789   
3          4    AZ        Arizona    2837210    28372   141861    283721   
4          5    AR       Arkansas    1147443    11474    57372    114744   

     top_25    top_50     top_75  ...  sum_scorp_50  num_scorp_75  \
0  35217906  70435812  105653717  ...  7.268620e+11       7962545   
1    478192    956385    1434577  ...  7.302828e+09         85913   
2     79473    158946     238419  ...  1.315947e+09         17060   
3    709303   1418605    2127908  ...  1.118380e+10        148837   
4    286861    573722     860582  ...  3.876006e+09         60420   

   sum_scorp_75     total_tax    sum_tax_01    sum_tax_05    sum_tax_10  \
0  7.318500e+11  1.60269

# Join Files? 
- Not sure if I should do this yet, as the files are still having issues. Commented out for now.

In [9]:
# Attach the CBSA lookup columns to the outpatient records with an inner join of
# outpatient['cbsa'] against cbsa_to_name['cbsa_code'].
#
# pandas matches null join keys to each other, so lookup rows whose cbsa_code is
# missing are excluded first. Otherwise every outpatient row with a missing
# 'cbsa' would be paired with each of those lookup rows.
# NOTE(review): the lookup appears to hold several rows per cbsa_code (it has
# county-level columns), which would repeat each outpatient row once per match.
# Confirm that is intended.
# NOTE(review): this cell overwrites `outpatient`, so running it a second time
# merges the already-merged frame again.
cbsa_lookup = cbsa_to_name[cbsa_to_name['cbsa_code'].notna()]
outpatient = pd.merge(left=outpatient, right=cbsa_lookup,
                      left_on='cbsa', right_on='cbsa_code', how='inner')

# info() prints the summary itself and returns None; calling it directly avoids
# the stray "None" line that print(outpatient.info()) would add.
outpatient.info()
outpatient.head(30)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 191227 entries, 0 to 191226
Data columns (total 38 columns):
Unnamed: 0                                    191227 non-null int64
provider_id                                   417 non-null float64
provider_name                                 417 non-null object
provider_street_address                       417 non-null object
provider_city                                 417 non-null object
provider_state                                417 non-null object
provider_zip_code                             417 non-null float64
provider_hospital_referral_region_(hrr)       417 non-null object
apc                                           417 non-null float64
apc_description                               417 non-null object
beneficiaries                                 415 non-null float64
comprehensive_apc_services                    417 non-null float64
average_estimated_total_submitted_charges     417 non-null float64
average_medicare_allowe

Unnamed: 0.1,Unnamed: 0,provider_id,provider_name,provider_street_address,provider_city,provider_state,provider_zip_code,provider_hospital_referral_region_(hrr),apc,apc_description,...,metropolitan_division_title,csa_title,county/county_equivalent,state_name,fips_state_code,fips_county_code,central/outlying_county,columnm,columnn,columno
0,0,440002.0,Jackson-Madison County General Hospital,620 Skyline Drive,Jackson,TN,38301.0,TN - Jackson,5302.0,Level 2 Upper GI Procedures,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
1,17633,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
2,17637,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
3,17655,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
4,17670,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
5,17684,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
6,17687,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
7,17689,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
8,17693,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
9,17878,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,


# Look at data
- .info()
- .head()
- .tail()
- .shape

# Export clean files

In [12]:
# Write each cleaned dataframe to its CSV file under ../data.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default, which comes back as an 'Unnamed: 0' column when the file is re-read
# without index_col. Consider index=False if nothing downstream needs it.
export_targets = (
    (outpatient, '../data/outpatient_cbsa_clean.csv'),
    # (cbsa_to_name, '../data/cbsa2name_clean.csv'),  # export currently disabled
    (agi_by_state_2017, '../data/agi_by_state_2017_clean.csv'),
    (agi_by_zip_2017, '../data/agi_by_zip_2017_clean.csv'),
    (population_projections, '../data/population_projections_clean.csv'),
)

for frame, destination in export_targets:
    frame.to_csv(destination)
