# Import Data and Look Around

- set up the dataframes
- take a look at their characteristics
- clean up as needed
- join as needed
- save cleaned and joined dataframes

# Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Import and Setup Dataframes

In [7]:
outpatient_csv = '../data/tennessee_outpatient_clean.csv'
cbsa2name_csv = '../data/core-based-statistical-areas-cbsas-and-combined-statistical-areas-csas.csv'

cbsa_to_name = pd.read_csv(cbsa2name_csv, engine='python')
outpatient = pd.read_csv(outpatient_csv, 
                         dtype={"provider_name": object, "provider_street_address": object, "provider_city": object})

# NOTE: was getting an error so changed these dtypes on import. The ones that should be 'int' caused more errors
#       so they're commented out
#                          dtype={"provider_id": int, "provider_name": object, "provider_street_address": object, 
#                                 "provider_city": object, "provider_zip_code": int, "apc": int})

print(cbsa_to_name.info())
print(outpatient.info())
outpatient.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1918 entries, 0 to 1917
Data columns (total 15 columns):
CBSA Code                                     1918 non-null object
Metropolitan Division Code                    111 non-null object
CSA Code                                      1256 non-null object
CBSA Title                                    1916 non-null object
Metropolitan/Micropolitan Statistical Area    1915 non-null object
Metropolitan Division Title                   1915 non-null object
CSA Title                                     110 non-null object
County/County Equivalent                      1255 non-null object
State Name                                    1915 non-null object
FIPS State Code                               1915 non-null object
FIPS County Code                              1915 non-null object
Central/Outlying County                       1915 non-null object
columnm                                       1915 non-null object
columnn                 

Unnamed: 0.1,Unnamed: 0,provider_id,provider_name,provider_street_address,provider_city,provider_state,provider_zip_code,provider_hospital_referral_region_(hrr),apc,apc_description,...,average_medicare_allowed_amount,average_medicare_payment_amount,outlier_comprehensive_apc_services,average_medicare_outlier_amount,zip,cbsa,res_ratio,bus_ratio,oth_ratio,tot_ratio
0,0,440002.0,Jackson-Madison County General Hospital,620 Skyline Drive,Jackson,TN,38301.0,TN - Jackson,5302.0,Level 2 Upper GI Procedures,...,1141.086681,899.775086,0.0,0.0,38301,15140,0.000167,0.000428,0.0,0.000185
1,1,440002.0,Jackson-Madison County General Hospital,620 Skyline Drive,Jackson,TN,38301.0,TN - Jackson,5302.0,Level 2 Upper GI Procedures,...,1141.086681,899.775086,0.0,0.0,38301,27180,0.999833,0.999572,1.0,0.999815
2,2,440003.0,Sumner Regional Medical Center,555 Hartsville Pike,Gallatin,TN,37066.0,TN - Nashville,5302.0,Level 2 Upper GI Procedures,...,1223.454825,968.390614,0.0,0.0,37066,34980,1.0,1.0,1.0,1.0
3,3,440006.0,Tristar Skyline Medical Center,3441 Dickerson Pike,Nashville,TN,37207.0,TN - Nashville,5302.0,Level 2 Upper GI Procedures,...,1248.62,994.84,0.0,0.0,37207,34980,1.0,1.0,1.0,1.0
4,4,440009.0,Cumberland Medical Center,421 S Main St,Crossville,TN,38555.0,TN - Nashville,5302.0,Level 2 Upper GI Procedures,...,1152.663556,914.163556,0.0,0.0,38555,18900,1.0,1.0,1.0,1.0


# Clean Up Column Names and Dtypes

In [9]:
# convert spaces to underscores and make lowercase
cbsa_to_name.columns = cbsa_to_name.columns.str.replace(' ', '_').str.lower()



# DEBUGDEBUG: This is not working cbsa_code is still an object, see below
# Cast CBSA code to numeric
cbsa_to_name['cbsa_code'] = pd.to_numeric(cbsa_to_name['cbsa_code'], errors='coerce')
print(cbsa_to_name.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1918 entries, 0 to 1917
Data columns (total 15 columns):
cbsa_code                                     1915 non-null float64
metropolitan_division_code                    111 non-null object
csa_code                                      1256 non-null object
cbsa_title                                    1916 non-null object
metropolitan/micropolitan_statistical_area    1915 non-null object
metropolitan_division_title                   1915 non-null object
csa_title                                     110 non-null object
county/county_equivalent                      1255 non-null object
state_name                                    1915 non-null object
fips_state_code                               1915 non-null object
fips_county_code                              1915 non-null object
central/outlying_county                       1915 non-null object
columnm                                       1915 non-null object
columnn                

# Join Files

In [4]:
outpatient = pd.merge(left=outpatient, right=cbsa_to_name, 
                                      left_on='cbsa', right_on='cbsa_code', how='inner')

print(outpatient.info())
outpatient.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 202424 entries, 0 to 202423
Data columns (total 38 columns):
Unnamed: 0                                    202421 non-null float64
provider_id                                   427 non-null float64
provider_name                                 427 non-null object
provider_street_address                       427 non-null object
provider_city                                 427 non-null object
provider_state                                427 non-null object
provider_zip_code                             427 non-null float64
provider_hospital_referral_region_(hrr)       427 non-null object
apc                                           427 non-null float64
apc_description                               427 non-null object
beneficiaries                                 425 non-null float64
comprehensive_apc_services                    427 non-null float64
average_estimated_total_submitted_charges     427 non-null float64
average_medicare_allo

Unnamed: 0.1,Unnamed: 0,provider_id,provider_name,provider_street_address,provider_city,provider_state,provider_zip_code,provider_hospital_referral_region_(hrr),apc,apc_description,...,metropolitan_division_title,csa_title,county/county_equivalent,state_name,fips_state_code,fips_county_code,central/outlying_county,columnm,columnn,columno
0,0.0,440002.0,Jackson-Madison County General Hospital,620 Skyline Drive,Jackson,TN,38301.0,TN - Jackson,5302.0,Level 2 Upper GI Procedures,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
1,17633.0,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
2,17637.0,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
3,17655.0,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
4,17670.0,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,


# Look at data
- .info()
- .head()
- .tail()
- .shape

# Drop Extra Columns

In [5]:
# Drop all but listed tables for outpatient
outpatient_clean = outpatient[[
    'provider_name',
    'apc',
    'apc_description',
    'beneficiaries',
    'comprehensive_apc_services',
    'average_estimated_total_submitted_charges',
    'average_medicare_allowed_amount',
    'average_medicare_payment_amount',
    'average_medicare_outlier_amount',
    'zip',
    'cbsa',
    'cbsa_title']]
# outpatient = outpatient.dropna()
outpatient

Unnamed: 0.1,Unnamed: 0,provider_id,provider_name,provider_street_address,provider_city,provider_state,provider_zip_code,provider_hospital_referral_region_(hrr),apc,apc_description,...,metropolitan_division_title,csa_title,county/county_equivalent,state_name,fips_state_code,fips_county_code,central/outlying_county,columnm,columnn,columno
0,0.0,440002.0,Jackson-Madison County General Hospital,620 Skyline Drive,Jackson,TN,38301.0,TN - Jackson,5302.0,Level 2 Upper GI Procedures,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
1,17633.0,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
2,17637.0,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
3,17655.0,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
4,17670.0,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202419,47405.0,,,,,,,,,,...,Micropolitan Statistical Area,,,Ketchikan Gateway Borough,Alaska,2,130,Central,,
202420,47415.0,,,,,,,,,,...,Micropolitan Statistical Area,,,Ketchikan Gateway Borough,Alaska,2,130,Central,,
202421,,,,,,,,,,,...,,,,,,,,,,
202422,,,,,,,,,,,...,,,,,,,,,,
