# Import Data and Create Dataframes

- set up the dataframes
- clean up as needed
- join as needed
- save cleaned and joined dataframes

## DEBUGDEBUG
- population_projections has float columns that should be int
- agi_by_state_2017 and agi_by_zip dataframes are displaying weird when I do .info. Take a closer look


# Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Import and Setup Dataframes

In [2]:
# Setup filename variables
outpatient_csv = '../data/tennessee_outpatient_clean.csv'
cbsa2name_csv = '../data/core-based-statistical-areas-cbsas-and-combined-statistical-areas-csas.csv'
agi_by_state_2017_csv = '../data/Adjusted Gross Income Percentiles by State Tax Year 2017.csv'
agi_by_zip_2017_csv = '../data/income by zip 2017.csv'
population_projections_csv = '../data/State Population Projections 2004-2030.csv'

# import files to dataframes
cbsa_to_name = pd.read_csv(cbsa2name_csv, engine='python')
outpatient = pd.read_csv(outpatient_csv, low_memory=False,
                         dtype={"provider_name": object, "provider_street_address": object, "provider_city": object})
agi_by_state_2017 = pd.read_csv(agi_by_state_2017_csv)
agi_by_zip_2017 = pd.read_csv(agi_by_zip_2017_csv)
population_projections = pd.read_csv(population_projections_csv, sep='\t')



In [3]:
# confirm imports
print('This is: cbsa_to_name')
print(cbsa_to_name.info())

print('\n\nThis is: outpatient')
print(outpatient.info())

print('\n\nThis is: agi_by_state_2017')
print(agi_by_state_2017.info())

print('\n\nThis is: agi_by_zip_2017')
print(agi_by_zip_2017.info())

print('\n\nThis is: population_projections')
print(population_projections.info())




This is: cbsa_to_name
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1918 entries, 0 to 1917
Data columns (total 15 columns):
CBSA Code                                     1918 non-null object
Metropolitan Division Code                    111 non-null object
CSA Code                                      1256 non-null object
CBSA Title                                    1916 non-null object
Metropolitan/Micropolitan Statistical Area    1915 non-null object
Metropolitan Division Title                   1915 non-null object
CSA Title                                     110 non-null object
County/County Equivalent                      1255 non-null object
State Name                                    1915 non-null object
FIPS State Code                               1915 non-null object
FIPS County Code                              1915 non-null object
Central/Outlying County                       1915 non-null object
columnm                                       1915 non-null object
co

# Clean Up Column Names and Dtypes

In [4]:
# convert spaces to underscores and make lowercase
# NOTE: Make this a function and add to my library
cbsa_to_name.columns = cbsa_to_name.columns.str.replace(' ', '_').str.lower()
agi_by_state_2017.columns = agi_by_state_2017.columns.str.replace(' ', '_').str.lower()
agi_by_zip_2017.columns = agi_by_zip_2017.columns.str.replace(' ', '_').str.lower()
population_projections.columns = population_projections.columns.str.replace(' ', '_').str.lower()


In [5]:
# confirm conversion
print('This is: cbsa_to_name')
print(cbsa_to_name.info())

print('\n\nThis is: agi_by_state_2017')
print(agi_by_state_2017.info())

print('\n\nThis is: agi_by_zip_2017')
print(agi_by_zip_2017.info())

print('\n\nThis is: population_projections')
print(population_projections.info())




This is: cbsa_to_name
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1918 entries, 0 to 1917
Data columns (total 15 columns):
cbsa_code                                     1918 non-null object
metropolitan_division_code                    111 non-null object
csa_code                                      1256 non-null object
cbsa_title                                    1916 non-null object
metropolitan/micropolitan_statistical_area    1915 non-null object
metropolitan_division_title                   1915 non-null object
csa_title                                     110 non-null object
county/county_equivalent                      1255 non-null object
state_name                                    1915 non-null object
fips_state_code                               1915 non-null object
fips_county_code                              1915 non-null object
central/outlying_county                       1915 non-null object
columnm                                       1915 non-null object
co

In [6]:
# Cast CBSA code to numeric
cbsa_to_name['cbsa_code'] = pd.to_numeric(cbsa_to_name['cbsa_code'], errors='coerce')
print(cbsa_to_name.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1918 entries, 0 to 1917
Data columns (total 15 columns):
cbsa_code                                     1915 non-null float64
metropolitan_division_code                    111 non-null object
csa_code                                      1256 non-null object
cbsa_title                                    1916 non-null object
metropolitan/micropolitan_statistical_area    1915 non-null object
metropolitan_division_title                   1915 non-null object
csa_title                                     110 non-null object
county/county_equivalent                      1255 non-null object
state_name                                    1915 non-null object
fips_state_code                               1915 non-null object
fips_county_code                              1915 non-null object
central/outlying_county                       1915 non-null object
columnm                                       1915 non-null object
columnn                

In [7]:
###################################################
# drop the rows where 'notes'=='Total', they have NaN in 'age_group' column
# this leaves us with only year and population info

# df = df[df['EPS'].notna()]
population_projections=population_projections[population_projections['age_group'].notna()]

# Convert Year and Population columns to int.
# >>> df[list("ABCD")] = df[list("ABCD")].astype(int)
population_projections[['year','year_code','projected_populations']] = population_projections[['year','year_code','projected_populations']].astype(int)

# drop notes column
population_projections = population_projections.drop(['notes'], axis=1)
population_projections

Unnamed: 0,year,year_code,age_group,age_group_code,projected_populations
0,2025,2025,0-4 years,0-4,481714
1,2025,2025,5-9 years,5-9,466987
2,2025,2025,10-14 years,10-14,462242
3,2025,2025,15-19 years,15-19,464989
4,2025,2025,20-24 years,20-24,466395
5,2025,2025,25-29 years,25-29,442275
6,2025,2025,30-34 years,30-34,445935
7,2025,2025,35-39 years,35-39,436057
8,2025,2025,40-44 years,40-44,427959
9,2025,2025,45-49 years,45-49,419593


In [8]:
# these look wrong, take a closer look
print('\n\nThis is: agi_by_state_2017')
print(agi_by_state_2017.head())

print('\n\nThis is: agi_by_zip_2017')
print(agi_by_zip_2017.head())




This is: agi_by_state_2017
   statefips state     state_name      total   top_01   top_05    top_10  \
0          0    US  United States  140871623  1408716  7043581  14087162   
1          1    AL        Alabama    1912769    19128    95638    191277   
2          2    AK         Alaska     317892     3179    15895     31789   
3          4    AZ        Arizona    2837210    28372   141861    283721   
4          5    AR       Arkansas    1147443    11474    57372    114744   

     top_25    top_50     top_75  ...  sum_scorp_50  num_scorp_75  \
0  35217906  70435812  105653717  ...  7.268620e+11       7962545   
1    478192    956385    1434577  ...  7.302828e+09         85913   
2     79473    158946     238419  ...  1.315947e+09         17060   
3    709303   1418605    2127908  ...  1.118380e+10        148837   
4    286861    573722     860582  ...  3.876006e+09         60420   

   sum_scorp_75     total_tax    sum_tax_01    sum_tax_05    sum_tax_10  \
0  7.318500e+11  1.60269

# Join Files? 
- not sure if I should do this yet as file are still having issues. Commented out for now.

In [9]:
outpatient = pd.merge(left=outpatient, right=cbsa_to_name, 
                                      left_on='cbsa', right_on='cbsa_code', how='inner')

print(outpatient.info())
outpatient.head(30)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 191227 entries, 0 to 191226
Data columns (total 38 columns):
Unnamed: 0                                    191227 non-null int64
provider_id                                   417 non-null float64
provider_name                                 417 non-null object
provider_street_address                       417 non-null object
provider_city                                 417 non-null object
provider_state                                417 non-null object
provider_zip_code                             417 non-null float64
provider_hospital_referral_region_(hrr)       417 non-null object
apc                                           417 non-null float64
apc_description                               417 non-null object
beneficiaries                                 415 non-null float64
comprehensive_apc_services                    417 non-null float64
average_estimated_total_submitted_charges     417 non-null float64
average_medicare_allowe

Unnamed: 0.1,Unnamed: 0,provider_id,provider_name,provider_street_address,provider_city,provider_state,provider_zip_code,provider_hospital_referral_region_(hrr),apc,apc_description,...,metropolitan_division_title,csa_title,county/county_equivalent,state_name,fips_state_code,fips_county_code,central/outlying_county,columnm,columnn,columno
0,0,440002.0,Jackson-Madison County General Hospital,620 Skyline Drive,Jackson,TN,38301.0,TN - Jackson,5302.0,Level 2 Upper GI Procedures,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
1,17633,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
2,17637,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
3,17655,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
4,17670,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
5,17684,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
6,17687,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
7,17689,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
8,17693,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,
9,17878,,,,,,,,,,...,Micropolitan Statistical Area,,Jackson-Brownsville,TN,Haywood County,Tennessee,47,75,Central,


In [10]:
# Still some extraneous columns, let's everything but this list:
# provider_id
# provider_name
# provider_street_address
# provider_city
# provider_state
# provider_zip_code
# apc
# apc_description
# beneficiaries
# comprehensive_apc_services
# cbsa
# cbsa_title

apc_locations = outpatient[[
    'provider_id', 'provider_name', 'provider_street_address', 'provider_city', 'provider_state', 'provider_zip_code',
    'apc', 'apc_description', 'beneficiaries', 'comprehensive_apc_services', 'cbsa', 'cbsa_title'
]].copy()
print(apc_locations.info())
apc_locations = apc_locations.dropna()
apc_locations.tail(30)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 191227 entries, 0 to 191226
Data columns (total 12 columns):
provider_id                   417 non-null float64
provider_name                 417 non-null object
provider_street_address       417 non-null object
provider_city                 417 non-null object
provider_state                417 non-null object
provider_zip_code             417 non-null float64
apc                           417 non-null float64
apc_description               417 non-null object
beneficiaries                 415 non-null float64
comprehensive_apc_services    417 non-null float64
cbsa                          191227 non-null int64
cbsa_title                    191227 non-null object
dtypes: float64(5), int64(1), object(6)
memory usage: 19.0+ MB
None


Unnamed: 0,provider_id,provider_name,provider_street_address,provider_city,provider_state,provider_zip_code,apc,apc_description,beneficiaries,comprehensive_apc_services,cbsa,cbsa_title
4895,440091.0,"Memorial Healthcare System, Inc",2525 Desales Ave,Chattanooga,TN,37404.0,5302.0,Level 2 Upper GI Procedures,315.0,373.0,16860,Chattanooga
4896,440091.0,"Memorial Healthcare System, Inc",2525 Desales Ave,Chattanooga,TN,37404.0,5302.0,Level 2 Upper GI Procedures,315.0,373.0,16860,Chattanooga
4897,440091.0,"Memorial Healthcare System, Inc",2525 Desales Ave,Chattanooga,TN,37404.0,5302.0,Level 2 Upper GI Procedures,315.0,373.0,16860,Chattanooga
4898,440091.0,"Memorial Healthcare System, Inc",2525 Desales Ave,Chattanooga,TN,37404.0,5302.0,Level 2 Upper GI Procedures,315.0,373.0,16860,Chattanooga
4899,440091.0,"Memorial Healthcare System, Inc",2525 Desales Ave,Chattanooga,TN,37404.0,5302.0,Level 2 Upper GI Procedures,315.0,373.0,16860,Chattanooga
4900,440091.0,"Memorial Healthcare System, Inc",2525 Desales Ave,Chattanooga,TN,37404.0,5302.0,Level 2 Upper GI Procedures,315.0,373.0,16860,Chattanooga
4901,440156.0,Parkridge Medical Center,2333 Mccallie Ave,Chattanooga,TN,37404.0,5302.0,Level 2 Upper GI Procedures,37.0,40.0,16860,Chattanooga
4902,440156.0,Parkridge Medical Center,2333 Mccallie Ave,Chattanooga,TN,37404.0,5302.0,Level 2 Upper GI Procedures,37.0,40.0,16860,Chattanooga
4903,440156.0,Parkridge Medical Center,2333 Mccallie Ave,Chattanooga,TN,37404.0,5302.0,Level 2 Upper GI Procedures,37.0,40.0,16860,Chattanooga
4904,440156.0,Parkridge Medical Center,2333 Mccallie Ave,Chattanooga,TN,37404.0,5302.0,Level 2 Upper GI Procedures,37.0,40.0,16860,Chattanooga


# Export clean files

In [11]:
apc_locations.to_csv('../data/apc_locations.csv')
# cbsa_to_name.to_csv('../data/cbsa2name_clean.csv')
# agi_by_state_2017.to_csv('../data/agi_by_state_2017_clean.csv')
# agi_by_zip_2017.to_csv('../data/agi_by_zip_2017_clean.csv')
population_projections.to_csv('../data/population_projections_clean.csv')


# Import next set of data
- .info()
- .head()
- .tail()
- .shape

In [12]:
# Setup filename variables
tenn_health_rankings_excel = '../data/2020 County Health Rankings Tennessee Data - v1_0.xlsx'
cities_csv = '../data/500_Cities__Local_Data_for_Better_Health__2019_release.csv'

# import files to dataframes
tenn_health_rankings = pd.read_excel(tenn_health_rankings_excel)
cities = pd.read_csv(cities_csv)

tenn_health_rankings.info()
cities.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 258 entries, 0 to 257
Data columns (total 3 columns):
Unnamed: 0    49 non-null object
Unnamed: 1    241 non-null object
Unnamed: 2    192 non-null object
dtypes: object(3)
memory usage: 6.2+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15299 entries, 0 to 15298
Data columns (total 24 columns):
Year                          15299 non-null int64
StateAbbr                     15299 non-null object
StateDesc                     15299 non-null object
CityName                      15299 non-null object
GeographicLevel               15299 non-null object
DataSource                    15299 non-null object
Category                      15299 non-null object
UniqueID                      15299 non-null object
Measure                       15299 non-null object
Data_Value_Unit               15299 non-null object
DataValueTypeID               15299 non-null object
Data_Value_Type               15299 non-null object
Data_Value             

# Look at AGI files again

In [13]:
print(agi_by_zip_2017.info())
agi_by_zip_2017.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166537 entries, 0 to 166536
Columns: 153 entries, statefips to a12000
dtypes: int64(152), object(1)
memory usage: 194.4+ MB
None


Unnamed: 0,statefips,state,zipcode,agi_stub,n1,mars1,mars2,mars4,elf,cprep,...,n85300,a85300,n11901,a11901,n11900,a11900,n11902,a11902,n12000,a12000
0,1,AL,0,1,802640,474470,99850,216600,717050,44090,...,0,0,64680,53602,700940,1803125,698100,1796343,2860,4917
1,1,AL,0,2,499070,218590,137460,129760,448190,26230,...,0,0,77660,118725,419640,1175607,416180,1165352,4250,8894
2,1,AL,0,3,268590,89780,134440,38280,241060,14160,...,0,0,67820,156752,201030,560461,197060,547812,5440,13482
3,1,AL,0,4,170880,32180,124070,11660,154120,7980,...,0,0,48440,141721,121930,396526,118460,383588,3160,12369
4,1,AL,0,5,229870,22810,196990,5540,208380,11120,...,50,19,91100,465160,138250,588068,130970,529001,8430,55564


In [14]:
# in new dataframe: filter out all but Tennessee, exclude zip codes of 0 and 99999
# df.loc[df['column_name'] == some_value]
# df.loc[(df['column_name'] >= A) & (df['column_name'] <= B)]


tenn_agi_by_zip_2017 = agi_by_zip_2017.loc[(agi_by_zip_2017['state']  == 'TN') 
                    & (agi_by_zip_2017['zipcode'] > 0)
                    & (agi_by_zip_2017['zipcode'] != 99999)]

# confirm new dataframe
print(tenn_agi_by_zip_2017.info())
print(tenn_agi_by_zip_2017.head(30))
# tenn_agi_by_zip_2017.tail(30)

# save new dataframe
tenn_agi_by_zip_2017.to_csv('../data/tenn_agi_by_zip_2017.csv')


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3528 entries, 134992 to 138519
Columns: 153 entries, statefips to a12000
dtypes: int64(152), object(1)
memory usage: 4.1+ MB
None
        statefips state  zipcode  agi_stub     n1  mars1  mars2  mars4    elf  \
134992         47    TN    37010         1    830    620    130     70    700   
134993         47    TN    37010         2    450    190    180     60    410   
134994         47    TN    37010         3    390    100    230     50    330   
134995         47    TN    37010         4    290     30    240     20    260   
134996         47    TN    37010         5    460     20    430      0    420   
134997         47    TN    37010         6    100      0     90      0     90   
134998         47    TN    37012         1    340    230     70     40    320   
134999         47    TN    37012         2    280    110    100     60    260   
135000         47    TN    37012         3    160     60    110      0    160   
135001    

In [15]:
cities.columns


Index(['Year', 'StateAbbr', 'StateDesc', 'CityName', 'GeographicLevel',
       'DataSource', 'Category', 'UniqueID', 'Measure', 'Data_Value_Unit',
       'DataValueTypeID', 'Data_Value_Type', 'Data_Value',
       'Low_Confidence_Limit', 'High_Confidence_Limit',
       'Data_Value_Footnote_Symbol', 'Data_Value_Footnote', 'PopulationCount',
       'GeoLocation', 'CategoryID', 'MeasureId', 'CityFIPS', 'TractFIPS',
       'Short_Question_Text'],
      dtype='object')

In [16]:
# I think thi sline is now redundant
# cities.columns = cities.columns.str.replace(' ', '_').str.lower()

# rename these columns before export
new_names = ['year', 'state_abbr', 'state_desc', 'city_name', 'geographic_level', 'data_source', 
             'category', 'uid', 'measure', 'data_value_unit', 'data_value_type_id', 'data_value_type',
             'data_value', 'low_confidence_limit', 'high_confidence_limit', 'data_value_footnote_symbol', 
             'data_value_footnote', 'population_count', 'geolocation', 'category_id', 'measure_id', 'city_fips', 
             'tract_fips', 'short_question_text']

cities.columns = new_names

cities.head()

Unnamed: 0,year,state_abbr,state_desc,city_name,geographic_level,data_source,category,uid,measure,data_value_unit,...,high_confidence_limit,data_value_footnote_symbol,data_value_footnote,population_count,geolocation,category_id,measure_id,city_fips,tract_fips,short_question_text
0,2017,TN,Tennessee,Memphis,Census Tract,BRFSS,Health Outcomes,4748000-47157007300,Chronic kidney disease among adults aged >=18 ...,%,...,3.2,,,4127,"(35.1209840406, -89.9363799177)",HLTHOUT,KIDNEY,4748000,47157010000.0,Chronic Kidney Disease
1,2017,TN,Tennessee,Nashville,Census Tract,BRFSS,Health Outcomes,4752006-47037018700,Stroke among adults aged >=18 Years,%,...,2.6,,,565,"(36.0524304194, -86.8041894006)",HLTHOUT,STROKE,4752006,47037020000.0,Stroke
2,2017,TN,Tennessee,Memphis,Census Tract,BRFSS,Unhealthy Behaviors,4748000-47157001900,Current smoking among adults aged >=18 Years,%,...,35.3,,,1601,"(35.1633019396, -90.0233116647)",UNHBEH,CSMOKING,4748000,47157000000.0,Current Smoking
3,2016,TN,Tennessee,Memphis,Census Tract,BRFSS,Prevention,4748000-47157001500,Papanicolaou smear use among adult women aged ...,%,...,86.4,,,1745,"(35.1529878945, -89.9740999194)",PREVENT,PAPTEST,4748000,47157000000.0,Pap Smear Test
4,2017,TN,Tennessee,Nashville,Census Tract,BRFSS,Unhealthy Behaviors,4752006-47037018102,Binge drinking among adults aged >=18 Years,%,...,13.6,,,3529,"(36.1276431581, -86.8678623837)",UNHBEH,BINGE,4752006,47037020000.0,Binge Drinking


In [17]:
# before saving, need to split geolocation column into longitude and latitude
# df[['First','Last']] = df.Name.str.split(expand=True) 
cities[['latitude','longitude']] = cities.geolocation.str.split(expand=True)

# df.Num_of_employees = df.Num_of_employees.replace({"10-Jan": "1-10",
#                                                    "Nov-50": "11-50"})

cities.latitude = cities.latitude.str.replace('(','') 
cities.latitude = cities.latitude.str.replace(',','') 
cities.longitude = cities.longitude.str.replace(')','') 

cities.head()

Unnamed: 0,year,state_abbr,state_desc,city_name,geographic_level,data_source,category,uid,measure,data_value_unit,...,data_value_footnote,population_count,geolocation,category_id,measure_id,city_fips,tract_fips,short_question_text,latitude,longitude
0,2017,TN,Tennessee,Memphis,Census Tract,BRFSS,Health Outcomes,4748000-47157007300,Chronic kidney disease among adults aged >=18 ...,%,...,,4127,"(35.1209840406, -89.9363799177)",HLTHOUT,KIDNEY,4748000,47157010000.0,Chronic Kidney Disease,35.1209840406,-89.9363799177
1,2017,TN,Tennessee,Nashville,Census Tract,BRFSS,Health Outcomes,4752006-47037018700,Stroke among adults aged >=18 Years,%,...,,565,"(36.0524304194, -86.8041894006)",HLTHOUT,STROKE,4752006,47037020000.0,Stroke,36.0524304194,-86.8041894006
2,2017,TN,Tennessee,Memphis,Census Tract,BRFSS,Unhealthy Behaviors,4748000-47157001900,Current smoking among adults aged >=18 Years,%,...,,1601,"(35.1633019396, -90.0233116647)",UNHBEH,CSMOKING,4748000,47157000000.0,Current Smoking,35.1633019396,-90.0233116647
3,2016,TN,Tennessee,Memphis,Census Tract,BRFSS,Prevention,4748000-47157001500,Papanicolaou smear use among adult women aged ...,%,...,,1745,"(35.1529878945, -89.9740999194)",PREVENT,PAPTEST,4748000,47157000000.0,Pap Smear Test,35.1529878945,-89.9740999194
4,2017,TN,Tennessee,Nashville,Census Tract,BRFSS,Unhealthy Behaviors,4752006-47037018102,Binge drinking among adults aged >=18 Years,%,...,,3529,"(36.1276431581, -86.8678623837)",UNHBEH,BINGE,4752006,47037020000.0,Binge Drinking,36.1276431581,-86.8678623837


In [18]:
# save new dataframe
cities.to_csv('../data/500cities_clean.csv')


## Look at County Health Rankings
- what to do with this file?
- what am I looking at?
- is this relevant?

In [19]:
tenn_health_rankings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 258 entries, 0 to 257
Data columns (total 3 columns):
Unnamed: 0    49 non-null object
Unnamed: 1    241 non-null object
Unnamed: 2    192 non-null object
dtypes: object(3)
memory usage: 6.2+ KB


In [23]:
# how many header rows?
tenn_health_rankings.head(30)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2
0,A collaboration between the Robert Wood Johnso...,,
1,,,
2,This Excel file contains the ranks and scores ...,,
3,,,
4,For additional information about how the Count...,,
5,,,
6,Contents:,,
7,Outcomes & Factors Rankings,,
8,Outcomes & Factors Sub Rankings,,
9,Ranked Measures Data (including measure values...,,
