# Import Data and Create Provider Dataframes

- set up the dataframes: outpatient, provider
- clean up as needed
- join as needed
- save cleaned and joined dataframes as 'all_providers'

## Questions
- What's the best way to join 'all_providers' with Tenn. City Boundaries … spatial join? 

## Notes
- This notebook needs some cleaning up. There are several cells that have been commented out as they're no longer used.

# Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Import and Setup Dataframes
- outpatients seems to list hospitals and medical centers
- providers lists individual providers
- need to plot locations of these in order to determine underserved locations

In [2]:
# Setup filename variables
outpatient_csv = '../data/original/tennessee_outpatient_clean.csv'
providers_tsv = '../data/original/Medicare_Provider_Util_Payment_PUF_CY2017.tsv'

# import files to dataframes
outpatient = pd.read_csv(outpatient_csv, low_memory=False,
                         dtype={"provider_name": object, "provider_street_address": object, "provider_city": object})
providers = pd.read_csv(providers_tsv, sep='\t', low_memory=False) 



In [3]:
providers.tail(50)

Unnamed: 0,npi,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_mi,nppes_credentials,nppes_provider_gender,nppes_entity_code,nppes_provider_street1,nppes_provider_street2,nppes_provider_city,...,hcpcs_code,hcpcs_description,hcpcs_drug_indicator,line_srvc_cnt,bene_unique_cnt,bene_day_srvc_cnt,average_Medicare_allowed_amt,average_submitted_chrg_amt,average_Medicare_payment_amt,average_Medicare_standard_amt
9847394,1992999122,JOHNSON,CHARLES,R,D.O.,M,I,1601 CLINT MOORE RD,155,BOCA RATON,...,99238,"Hospital discharge day management, 30 minutes ...",N,108.0,89.0,108.0,76.26,90.0,59.05787,56.9675
9847395,1992999122,JOHNSON,CHARLES,R,D.O.,M,I,1601 CLINT MOORE RD,155,BOCA RATON,...,G0179,Physician re-certification for medicare-covere...,N,28.0,17.0,28.0,43.32,50.0,32.756786,31.753929
9847396,1992999122,JOHNSON,CHARLES,R,D.O.,M,I,1601 CLINT MOORE RD,155,BOCA RATON,...,G0180,Physician certification for medicare-covered h...,N,50.0,45.0,50.0,56.51,60.0,43.4238,41.9146
9847397,1992999122,JOHNSON,CHARLES,R,D.O.,M,I,1601 CLINT MOORE RD,155,BOCA RATON,...,G0328,Colorectal cancer screening; fecal occult bloo...,N,160.0,160.0,160.0,21.82,30.0,21.38,21.38
9847398,1992999122,JOHNSON,CHARLES,R,D.O.,M,I,1601 CLINT MOORE RD,155,BOCA RATON,...,G0402,Initial preventive physical examination; face-...,N,15.0,15.0,15.0,171.941333,192.0,168.499333,165.3
9847399,1992999122,JOHNSON,CHARLES,R,D.O.,M,I,1601 CLINT MOORE RD,155,BOCA RATON,...,G0403,"Electrocardiogram, routine ecg with 12 leads; ...",N,15.0,15.0,15.0,17.99,25.0,10.34,9.9
9847400,1992999122,JOHNSON,CHARLES,R,D.O.,M,I,1601 CLINT MOORE RD,155,BOCA RATON,...,G0439,"Annual wellness visit, includes a personalized...",N,353.0,353.0,353.0,120.478754,128.498584,118.070255,115.371048
9847401,1992999437,RIVERA,JOSE,L,M.D.,M,I,1600 S ANDREWS AVE,,FORT LAUDERDALE,...,99284,"Emergency department visit, problem of high se...",N,35.0,35.0,35.0,132.35,1144.342857,99.901714,89.847429
9847402,1992999437,RIVERA,JOSE,L,M.D.,M,I,1600 S ANDREWS AVE,,FORT LAUDERDALE,...,99285,"Emergency department visit, problem with signi...",N,224.0,216.0,224.0,195.88,1775.571429,148.791562,133.422098
9847403,1992999437,RIVERA,JOSE,L,M.D.,M,I,1600 S ANDREWS AVE,,FORT LAUDERDALE,...,99291,Critical care delivery critically ill or injur...,N,49.0,49.0,49.0,249.79,2041.081633,190.435714,172.425714


In [4]:
# confirm imports
print('\n\nThis is: outpatient')
print(outpatient.info())

print('\n\nThis is: providers')
print(providers.info())




This is: outpatient
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47417 entries, 0 to 47416
Data columns (total 23 columns):
Unnamed: 0                                   47417 non-null int64
provider_id                                  79 non-null float64
provider_name                                79 non-null object
provider_street_address                      79 non-null object
provider_city                                79 non-null object
provider_state                               79 non-null object
provider_zip_code                            79 non-null float64
provider_hospital_referral_region_(hrr)      79 non-null object
apc                                          79 non-null float64
apc_description                              79 non-null object
beneficiaries                                77 non-null float64
comprehensive_apc_services                   79 non-null float64
average_estimated_total_submitted_charges    79 non-null float64
average_medicare_allowed_amou

# Clean Up Column Names and Dtypes

In [5]:
# make provider names and cities lowercase
providers['nppes_provider_last_org_name'] = providers['nppes_provider_last_org_name'].str.capitalize()
providers['nppes_provider_first_name'] = providers['nppes_provider_first_name'].str.capitalize()
providers['nppes_provider_street1'] = providers['nppes_provider_street1'].str.capitalize()
providers['nppes_provider_street2'] = providers['nppes_provider_street2'].str.capitalize()
providers['nppes_provider_city'] = providers['nppes_provider_city'].str.capitalize()


Keep these columns from providers:
- nppes_provider_last_org_name,
- nppes_provider_first_name,
- nppes_provider_street1,
- nppes_provider_street2,
- nppes_provider_city,
- nppes_provider_zip,
- nppes_provider_state,
- provider_type,
- hcpcs_code,
- hcpcs_description


In [6]:
# create temp files with the fields we need
# providers_tmp = providers[['nppes_provider_last_org_name', 'nppes_provider_first_name', 
#                           'nppes_provider_street1', 'nppes_provider_street2', 'nppes_provider_city',
#                           'nppes_provider_state', 'nppes_provider_zip', 'provider_type', 'hcpcs_code',
#                           'hcpcs_description']]

# UPDATE: I don't think I need provider_type and hcpcs fields so let's not copy them 
providers_tmp = providers[['nppes_provider_last_org_name', 'nppes_provider_first_name', 
                          'nppes_provider_street1', 'nppes_provider_street2', 'nppes_provider_city',
                          'nppes_provider_state', 'nppes_provider_zip']]

providers_tmp = providers_tmp.drop([0])
providers_tmp.head()


Unnamed: 0,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_street1,nppes_provider_street2,nppes_provider_city,nppes_provider_state,nppes_provider_zip
1,Enkeshafi,Ardalan,900 seton dr,,Cumberland,MD,215021854
2,Enkeshafi,Ardalan,900 seton dr,,Cumberland,MD,215021854
3,Enkeshafi,Ardalan,900 seton dr,,Cumberland,MD,215021854
4,Enkeshafi,Ardalan,900 seton dr,,Cumberland,MD,215021854
5,Enkeshafi,Ardalan,900 seton dr,,Cumberland,MD,215021854


In [7]:
# combine first and last names into 'full_name' field
# df['Name'] = df['First'].str.cat(df['Last'],sep=" ")

providers_tmp['full_name'] = providers_tmp['nppes_provider_first_name'].str.cat(providers_tmp['nppes_provider_last_org_name'],sep=' ')
                                         
                                         

providers_tmp.tail()


Unnamed: 0,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_street1,nppes_provider_street2,nppes_provider_city,nppes_provider_state,nppes_provider_zip,full_name
9847439,Deschenes,Geoffrey,1100 9th ave,Ms:m4-pfs,Seattle,WA,981012756,Geoffrey Deschenes
9847440,Joffe,Gabriella,8260 atlee rd,"Mob 2, suite 319",Mechanicsville,VA,231161844,Gabriella Joffe
9847441,Joffe,Gabriella,8260 atlee rd,"Mob 2, suite 319",Mechanicsville,VA,231161844,Gabriella Joffe
9847442,Joffe,Gabriella,8260 atlee rd,"Mob 2, suite 319",Mechanicsville,VA,231161844,Gabriella Joffe
9847443,Joffe,Gabriella,8260 atlee rd,"Mob 2, suite 319",Mechanicsville,VA,231161844,Gabriella Joffe


In [8]:
# exclude non-Tennessee records
tennessee_providers = providers_tmp.loc[providers_tmp ['nppes_provider_state'] == 'TN'] 

# convert zip code to int
tennessee_providers.nppes_provider_zip = tennessee_providers.nppes_provider_zip.astype('int64') 

# confirm the fix
print(tennessee_providers.info())
tennessee_providers.tail()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 269452 entries, 416 to 9847080
Data columns (total 8 columns):
nppes_provider_last_org_name    269448 non-null object
nppes_provider_first_name       259479 non-null object
nppes_provider_street1          269452 non-null object
nppes_provider_street2          115651 non-null object
nppes_provider_city             269452 non-null object
nppes_provider_state            269452 non-null object
nppes_provider_zip              269452 non-null int64
full_name                       259475 non-null object
dtypes: int64(1), object(7)
memory usage: 18.5+ MB
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_street1,nppes_provider_street2,nppes_provider_city,nppes_provider_state,nppes_provider_zip,full_name
9847076,Giles,Wesley,979 e 3rd st ste 300,,Chattanooga,TN,374032187,Wesley Giles
9847077,Giles,Wesley,979 e 3rd st ste 300,,Chattanooga,TN,374032187,Wesley Giles
9847078,Giles,Wesley,979 e 3rd st ste 300,,Chattanooga,TN,374032187,Wesley Giles
9847079,Giles,Wesley,979 e 3rd st ste 300,,Chattanooga,TN,374032187,Wesley Giles
9847080,Giles,Wesley,979 e 3rd st ste 300,,Chattanooga,TN,374032187,Wesley Giles


Keep these columns from outpatient:
- provider_name,
- provider_street_address,
- provider_city,
- provider_state,
- provider_zip_code,
- apc,
- apc_description,
- beneficiaries

In [9]:
# create temp files with the fields we need
# outpatient_tmp = outpatient[['provider_name', 'provider_street_address', 'provider_city', 'provider_state',
#                              'provider_zip_code', 'apc', 'apc_description'
# ]]

# UPDATE: don't think I need apc fields
outpatient_tmp = outpatient[['provider_name', 'provider_street_address', 'provider_city', 'provider_state',
                             'provider_zip_code']]

print(outpatient_tmp.info())
outpatient_tmp.head()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47417 entries, 0 to 47416
Data columns (total 5 columns):
provider_name              79 non-null object
provider_street_address    79 non-null object
provider_city              79 non-null object
provider_state             79 non-null object
provider_zip_code          79 non-null float64
dtypes: float64(1), object(4)
memory usage: 1.8+ MB
None


Unnamed: 0,provider_name,provider_street_address,provider_city,provider_state,provider_zip_code
0,Jackson-Madison County General Hospital,620 Skyline Drive,Jackson,TN,38301.0
1,Jackson-Madison County General Hospital,620 Skyline Drive,Jackson,TN,38301.0
2,Sumner Regional Medical Center,555 Hartsville Pike,Gallatin,TN,37066.0
3,Tristar Skyline Medical Center,3441 Dickerson Pike,Nashville,TN,37207.0
4,Cumberland Medical Center,421 S Main St,Crossville,TN,38555.0


# Clean up Column Names

In [10]:
# rename columns
# tennessee_providers.columns = ['last_name', 'first_name', 'address', 'address2', 'city', 'state', 'zip',
#                               'provider_type', 'hcpcs_code', 'hcpcs_description', 'full_name'] 

tennessee_providers.columns = ['last_name', 'first_name', 'address', 'address2', 'city', 'state', 'zip', 'full_name'] 

# outpatient_tmp.columns = ['full_name', 'address', 'city', 'state', 'zip', 'apc', 'apc_description']
outpatient_tmp.columns = ['full_name', 'address', 'city', 'state', 'zip']

In [11]:
# confirm renames
print('This is tennessee_providers')
tennessee_providers.info()

This is tennessee_providers
<class 'pandas.core.frame.DataFrame'>
Int64Index: 269452 entries, 416 to 9847080
Data columns (total 8 columns):
last_name     269448 non-null object
first_name    259479 non-null object
address       269452 non-null object
address2      115651 non-null object
city          269452 non-null object
state         269452 non-null object
zip           269452 non-null int64
full_name     259475 non-null object
dtypes: int64(1), object(7)
memory usage: 18.5+ MB


In [12]:
print('\n\nThis is outpatient_tmp')
outpatient_tmp.info()



This is outpatient_tmp
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47417 entries, 0 to 47416
Data columns (total 5 columns):
full_name    79 non-null object
address      79 non-null object
city         79 non-null object
state        79 non-null object
zip          79 non-null float64
dtypes: float64(1), object(4)
memory usage: 1.8+ MB


## Look for NaNs and duplicates

In [13]:
tennessee_providers.head(10)

Unnamed: 0,last_name,first_name,address,address2,city,state,zip,full_name
416,Walgreen co.,,5104 bobby hicks hwy,,Gray,TN,376156217,
417,Walgreen co.,,5104 bobby hicks hwy,,Gray,TN,376156217,
418,Walgreen co.,,5104 bobby hicks hwy,,Gray,TN,376156217,
419,Walgreen co.,,5104 bobby hicks hwy,,Gray,TN,376156217,
420,Walgreen co.,,5104 bobby hicks hwy,,Gray,TN,376156217,
1747,Cudzilo,Corey,2240 sutherland ave,Suite 103,Knoxville,TN,379192333,Corey Cudzilo
1748,Cudzilo,Corey,2240 sutherland ave,Suite 103,Knoxville,TN,379192333,Corey Cudzilo
1749,Cudzilo,Corey,2240 sutherland ave,Suite 103,Knoxville,TN,379192333,Corey Cudzilo
1750,Cudzilo,Corey,2240 sutherland ave,Suite 103,Knoxville,TN,379192333,Corey Cudzilo
1751,Cudzilo,Corey,2240 sutherland ave,Suite 103,Knoxville,TN,379192333,Corey Cudzilo


In [14]:
tennessee_providers.tail(10)

Unnamed: 0,last_name,first_name,address,address2,city,state,zip,full_name
9847071,Giles,Wesley,979 e 3rd st ste 300,,Chattanooga,TN,374032187,Wesley Giles
9847072,Giles,Wesley,979 e 3rd st ste 300,,Chattanooga,TN,374032187,Wesley Giles
9847073,Giles,Wesley,979 e 3rd st ste 300,,Chattanooga,TN,374032187,Wesley Giles
9847074,Giles,Wesley,979 e 3rd st ste 300,,Chattanooga,TN,374032187,Wesley Giles
9847075,Giles,Wesley,979 e 3rd st ste 300,,Chattanooga,TN,374032187,Wesley Giles
9847076,Giles,Wesley,979 e 3rd st ste 300,,Chattanooga,TN,374032187,Wesley Giles
9847077,Giles,Wesley,979 e 3rd st ste 300,,Chattanooga,TN,374032187,Wesley Giles
9847078,Giles,Wesley,979 e 3rd st ste 300,,Chattanooga,TN,374032187,Wesley Giles
9847079,Giles,Wesley,979 e 3rd st ste 300,,Chattanooga,TN,374032187,Wesley Giles
9847080,Giles,Wesley,979 e 3rd st ste 300,,Chattanooga,TN,374032187,Wesley Giles


In [15]:
# drop duplicates
tennessee_providers = tennessee_providers.drop_duplicates(keep=False)
tennessee_providers

Unnamed: 0,last_name,first_name,address,address2,city,state,zip,full_name
6354,Stone,Ralph,1700 carmack blvd,,Columbia,TN,38401,Ralph Stone
15768,State of tennessee,,214 west longview drive,,Portland,TN,37148,
17080,Davis,Patrick,1222 trotwood ave ste 603,,Columbia,TN,38401,Patrick Davis
26250,Horn,Cassondra,1 medical park blvd,,Bristol,TN,376207430,Cassondra Horn
27899,Hiatt,Emily,3310 w end ave,Suite 590,Nashville,TN,372031028,Emily Hiatt
...,...,...,...,...,...,...,...,...
9834934,Shah,Jasmine,975 e 3rd st,,Chattanooga,TN,374032147,Jasmine Shah
9834942,Basty,Marie,713 cheatham st,,Springfield,TN,371722828,Marie Basty
9839309,Bell,Amanda,317 n hickory ave,,Cookeville,TN,385012428,Amanda Bell
9840208,Gill,Farrukh,5301 virginia way,Suite 300,Brentwood,TN,370277541,Farrukh Gill


In [16]:
outpatient_tmp.head(80)

Unnamed: 0,full_name,address,city,state,zip
0,Jackson-Madison County General Hospital,620 Skyline Drive,Jackson,TN,38301.0
1,Jackson-Madison County General Hospital,620 Skyline Drive,Jackson,TN,38301.0
2,Sumner Regional Medical Center,555 Hartsville Pike,Gallatin,TN,37066.0
3,Tristar Skyline Medical Center,3441 Dickerson Pike,Nashville,TN,37207.0
4,Cumberland Medical Center,421 S Main St,Crossville,TN,38555.0
...,...,...,...,...,...
75,Tennova Healthcare-Lebanon,1411 Baddour Parkway,Lebanon,TN,37087.0
76,Tristar Hendersonville Medical Center,355 New Shackle Island Rd,Hendersonville,TN,37075.0
77,Tristar Southern Hills Medical Center,391 Wallace Rd,Nashville,TN,37211.0
78,Tristar Stonecrest Medical Center,200 Stonecrest Boulevard,Smyrna,TN,37167.0


In [17]:
outpatient_tmp.tail(47339)

Unnamed: 0,full_name,address,city,state,zip
78,Tristar Stonecrest Medical Center,200 Stonecrest Boulevard,Smyrna,TN,37167.0
79,,,,,
80,,,,,
81,,,,,
82,,,,,
...,...,...,...,...,...
47412,,,,,
47413,,,,,
47414,,,,,
47415,,,,,


In [18]:
# drop NaN columns
outpatient_tmp.dropna()

Unnamed: 0,full_name,address,city,state,zip
0,Jackson-Madison County General Hospital,620 Skyline Drive,Jackson,TN,38301.0
1,Jackson-Madison County General Hospital,620 Skyline Drive,Jackson,TN,38301.0
2,Sumner Regional Medical Center,555 Hartsville Pike,Gallatin,TN,37066.0
3,Tristar Skyline Medical Center,3441 Dickerson Pike,Nashville,TN,37207.0
4,Cumberland Medical Center,421 S Main St,Crossville,TN,38555.0
...,...,...,...,...,...
74,Saint Thomas Highlands Hospital,401 Sewell Dr,Sparta,TN,38583.0
75,Tennova Healthcare-Lebanon,1411 Baddour Parkway,Lebanon,TN,37087.0
76,Tristar Hendersonville Medical Center,355 New Shackle Island Rd,Hendersonville,TN,37075.0
77,Tristar Southern Hills Medical Center,391 Wallace Rd,Nashville,TN,37211.0


In [19]:
# drop duplicates
outpatient_tmp = outpatient_tmp.drop_duplicates(keep=False)
outpatient_tmp

Unnamed: 0,full_name,address,city,state,zip
2,Sumner Regional Medical Center,555 Hartsville Pike,Gallatin,TN,37066.0
3,Tristar Skyline Medical Center,3441 Dickerson Pike,Nashville,TN,37207.0
4,Cumberland Medical Center,421 S Main St,Crossville,TN,38555.0
5,Blount Memorial Hospital,907 E Lamar Alexander Parkway,Maryville,TN,37804.0
6,Wellmont Bristol Regional Medical Center,One Medical Park Blvd,Bristol,TN,37620.0
9,Wellmont Holston Valley Medical Center,130 West Ravine Road,Kingsport,TN,37662.0
14,Williamson Medical Center,4321 Carothers Parkway,Franklin,TN,37067.0
15,Morristown Hamblen Hospital Association,908 W 4th North St,Morristown,TN,37814.0
16,Lakeway Regional Hospital,726 Mcfarland St,Morristown,TN,37814.0
17,Roane Medical Center,8045 Roane Medical Center Drive,Harriman,TN,37748.0


In [20]:
# save this version of outpatient_tmp for later
# outpatient_providers = outpatient_tmp.drop(['apc','apc_description'], axis=1)

outpatient_providers = outpatient_tmp

# save_me
outpatient_providers.to_csv('../data/clean/outpatient_providers.csv')


In [21]:
# tennessee providers has NaNs in the full_name field. Fix them.
tennessee_providers['full_name'].fillna(tennessee_providers['last_name'], inplace = True) 

# confirm that there are no NaNs in the full_name field. Ok to have them in other fields
null_columns=tennessee_providers.columns[tennessee_providers.isnull().any()]
tennessee_providers[null_columns].isnull().sum()


first_name      38
address2      1283
dtype: int64

In [22]:
# drop first_ and last_name from tennessee_providers, will only use full_name
tennessee_providers = tennessee_providers.drop(['first_name', 'last_name'], axis=1)

tennessee_providers

Unnamed: 0,address,address2,city,state,zip,full_name
6354,1700 carmack blvd,,Columbia,TN,38401,Ralph Stone
15768,214 west longview drive,,Portland,TN,37148,State of tennessee
17080,1222 trotwood ave ste 603,,Columbia,TN,38401,Patrick Davis
26250,1 medical park blvd,,Bristol,TN,376207430,Cassondra Horn
27899,3310 w end ave,Suite 590,Nashville,TN,372031028,Emily Hiatt
...,...,...,...,...,...,...
9834934,975 e 3rd st,,Chattanooga,TN,374032147,Jasmine Shah
9834942,713 cheatham st,,Springfield,TN,371722828,Marie Basty
9839309,317 n hickory ave,,Cookeville,TN,385012428,Amanda Bell
9840208,5301 virginia way,Suite 300,Brentwood,TN,370277541,Farrukh Gill


In [23]:
tennessee_providers.address.value_counts()

110 29th ave n                43
3601 tvc                      38
3601 the vanderbilt clinic    35
110 29th ave n ste 202        25
501 20th st                   22
                              ..
2000 old fort pkwy             1
114 highway 70 e               1
1112 nashville pike            1
105 cherokee rd                1
6000 ramsey way                1
Name: address, Length: 1361, dtype: int64

In [24]:
# extract addresses from tennessee_providers for geocoding
tennessee_providers_addresses = tennessee_providers[['address', 'city', 'state', 'zip']]

tennessee_providers_addresses


Unnamed: 0,address,city,state,zip
6354,1700 carmack blvd,Columbia,TN,38401
15768,214 west longview drive,Portland,TN,37148
17080,1222 trotwood ave ste 603,Columbia,TN,38401
26250,1 medical park blvd,Bristol,TN,376207430
27899,3310 w end ave,Nashville,TN,372031028
...,...,...,...,...
9834934,975 e 3rd st,Chattanooga,TN,374032147
9834942,713 cheatham st,Springfield,TN,371722828
9839309,317 n hickory ave,Cookeville,TN,385012428
9840208,5301 virginia way,Brentwood,TN,370277541


In [25]:
# save these versions of tennessee_providers_addresses and tennessee_providers for later
tennessee_providers_addresses.to_csv('../data/clean/tennessee_providers_addresses.csv')
tennessee_providers.to_csv('../data/clean/tennessee_providers.csv')


In [26]:
# hey, stop here
# get geocodes for tennessee_providers_addresses, then clean up results file in BBEdit … 
#     will need to remove double quotes around each value and add names to columns w/o them

## Import and Join Geocodes to DataFrames

In [27]:
# NOTE: outpatient_geocoded_addresses does not include the three public clinics in davidson County. 
# I added them manually 
#     42	East Public Health Center	1015 East Trinity Lane	Nashville	TN		Nashville	TN		-86.745286	36.204273
#     43	Woodbine Public Health Center	224 Oriel Avenue	Nashville	TN		Nashville	TN		-86.743627	36.122097
#     44	Lentz Public Health Center	2500 Charlotte Avenue	Nashville	TN		Nashville	TN		-86.812991	36.155043



In [28]:
# set up import variables

# import geocoded address files for joining
tennessee_providers_geocodes_csv = '../data/location/tennessee_providers_geocoded_addresses.csv'
outpatient_geocodes_csv = '../data/clean/all_providers_geocodes.csv'
# outpatient_geocodes_csv = '../data/clean/outpatient_geocodes.csv'   # Not sure what this file was for


# import only the needed columns from outpatient_geocodes
columns = ['address', ' city', ' state', ' zip', 'longitude', 'latitude'] 
# import file to create dataframe
outpatient_geocodes = pd.read_csv(outpatient_geocodes_csv, usecols=columns)
# delete initial space in column names
outpatient_geocodes.columns = outpatient_geocodes.columns.str.replace(' ', '').str.lower()


# repeat for tennessee_providers_geocodes, note lack of leading space in column names
columns = ['address', 'city', 'state', 'zip', 'longitude', 'latitude'] 
tennessee_providers_geocodes = pd.read_csv(tennessee_providers_geocodes_csv, usecols=columns)


# confirm imports
print('This is outpatient with geocodes')
print(outpatient_geocodes.info())
print('This is tennessee_providers with geocodes')
print(tennessee_providers_geocodes.info())


This is outpatient with geocodes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 806 entries, 0 to 805
Data columns (total 6 columns):
address      806 non-null object
city         806 non-null object
state        806 non-null object
zip          806 non-null int64
longitude    576 non-null float64
latitude     576 non-null float64
dtypes: float64(2), int64(1), object(3)
memory usage: 37.9+ KB
None
This is tennessee_providers with geocodes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1997 entries, 0 to 1996
Data columns (total 6 columns):
address      1997 non-null object
city         1997 non-null object
state        1997 non-null object
zip          1996 non-null float64
longitude    1461 non-null float64
latitude     1462 non-null float64
dtypes: float64(3), object(3)
memory usage: 93.7+ KB
None


In [29]:
# join outpatient files
# note, if I include 'city', 'state', 'zip' in my join, I don't get longitude and latitude
#     Joining on 'address' only give more complete results. Only two Nashville addresses w/o longitude and latitude

outpatient_locations = pd.merge(outpatient_tmp, outpatient_geocodes, how='left', on=['address'])

outpatient_locations.tail(50)

Unnamed: 0,full_name,address,city_x,state_x,zip_x,city_y,state_y,zip_y,longitude,latitude
0,Sumner Regional Medical Center,555 Hartsville Pike,Gallatin,TN,37066.0,Gallatin,TN,37066.0,,
1,Tristar Skyline Medical Center,3441 Dickerson Pike,Nashville,TN,37207.0,Nashville,TN,37207.0,,
2,Cumberland Medical Center,421 S Main St,Crossville,TN,38555.0,Crossville,TN,38555.0,,
3,Blount Memorial Hospital,907 E Lamar Alexander Parkway,Maryville,TN,37804.0,Maryville,TN,37804.0,-83.9591,35.754974
4,Wellmont Bristol Regional Medical Center,One Medical Park Blvd,Bristol,TN,37620.0,Bristol,TN,37620.0,,
5,Wellmont Holston Valley Medical Center,130 West Ravine Road,Kingsport,TN,37662.0,Kingsport,TN,37662.0,,
6,Williamson Medical Center,4321 Carothers Parkway,Franklin,TN,37067.0,Franklin,TN,37067.0,-86.81695,35.917603
7,Morristown Hamblen Hospital Association,908 W 4th North St,Morristown,TN,37814.0,Morristown,TN,37814.0,-83.30439,36.211937
8,Lakeway Regional Hospital,726 Mcfarland St,Morristown,TN,37814.0,Morristown,TN,37814.0,-83.30431,36.214622
9,Roane Medical Center,8045 Roane Medical Center Drive,Harriman,TN,37748.0,Harriman,TN,37748.0,-84.555176,35.88938


In [30]:
# export geocoded outpatient addresses
outpatient_locations.to_csv('../data/clean/outpatient_geocoded_addresses.csv')


In [31]:
# join tennessee_providers files
# note, if I include 'city', 'state', 'zip' in my join, I don't get longitude and latitude
#     Joining on 'address' only give more complete results. Only two Nashville addresses w/o longitude and latitude
result = pd.merge(tennessee_providers, tennessee_providers_geocodes, how='left', on=['address'])

result.head(50)

Unnamed: 0,address,address2,city_x,state_x,zip_x,full_name,city_y,state_y,zip_y,longitude,latitude
0,1700 carmack blvd,,Columbia,TN,38401,Ralph Stone,Columbia,TN,38401.0,,
1,214 west longview drive,,Portland,TN,37148,State of tennessee,Portland,TN,37148.0,-86.51546,36.560825
2,1222 trotwood ave ste 603,,Columbia,TN,38401,Patrick Davis,Columbia,TN,38401.0,-87.06556,35.6061
3,1 medical park blvd,,Bristol,TN,376207430,Cassondra Horn,Bristol,TN,376207430.0,-82.2538,36.58927
4,1 medical park blvd,,Bristol,TN,376207430,Cassondra Horn,Bristol,TN,376207430.0,-82.2538,36.58927
5,1 medical park blvd,,Bristol,TN,376207430,Cassondra Horn,Bristol,TN,376207430.0,-82.2538,36.58927
6,1 medical park blvd,,Bristol,TN,376207430,Cassondra Horn,Bristol,TN,376207430.0,-82.2538,36.58927
7,1 medical park blvd,,Bristol,TN,376207430,Cassondra Horn,Bristol,TN,376207430.0,-82.2538,36.58927
8,1 medical park blvd,,Bristol,TN,376207430,Cassondra Horn,Bristol,TN,376207430.0,-82.2538,36.58927
9,1 medical park blvd,,Bristol,TN,376207430,Cassondra Horn,Bristol,TN,376207430.0,-82.2538,36.58927


In [32]:
# drop duplicates
tennessee_providers = result.drop_duplicates(keep=False)
tennessee_providers

Unnamed: 0,address,address2,city_x,state_x,zip_x,full_name,city_y,state_y,zip_y,longitude,latitude
0,1700 carmack blvd,,Columbia,TN,38401,Ralph Stone,Columbia,TN,38401.0,,
1,214 west longview drive,,Portland,TN,37148,State of tennessee,Portland,TN,37148.0,-86.51546,36.560825
2,1222 trotwood ave ste 603,,Columbia,TN,38401,Patrick Davis,Columbia,TN,38401.0,-87.06556,35.606100
18,3310 w end ave,Suite 590,Nashville,TN,372031028,Emily Hiatt,Nashville,TN,372031028.0,-86.81769,36.140644
19,1740 n germantown pkwy,Suite #6,Cordova,TN,380163307,Charles Hogan,Cordova,TN,380163307.0,-89.79291,35.173042
...,...,...,...,...,...,...,...,...,...,...,...
10074,3601 the vanderbilt clinic,,Nashville,TN,372320014,Tiffany Street,Nashville,TN,372322012.0,,
10101,975 e 3rd st,,Chattanooga,TN,374032147,Jasmine Shah,Chattanooga,TN,374032103.0,,
10102,975 e 3rd st,,Chattanooga,TN,374032147,Jasmine Shah,Chattanooga,TN,37403.0,,
10109,5301 virginia way,Suite 300,Brentwood,TN,370277541,Farrukh Gill,Brentwood,TN,370277541.0,-86.80924,36.032856


In [33]:
# export geocoded outpatient addresses
tennessee_providers.to_csv('../data/clean/tennessee_providers_geocoded_addresses.csv')


In [39]:
# hey, stop here

In [40]:
# join temp dataframes so that all providers are in one geocoded file
all_providers = outpatient_locations.append(tennessee_providers)

# fix column names
all_providers.columns = ['address', 'address2', 'city', 'city_y', 'full_name', 'latitude', 'longitude', 
           'state', 'state_y', 'zip', 'zip_y']
# drop extra columns
all_providers = all_providers.drop(['city_y', 'state_y', 'zip_y'], axis=1)
all_providers.head(50)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Unnamed: 0,address,address2,city,full_name,latitude,longitude,state,zip
0,555 Hartsville Pike,,Gallatin,Sumner Regional Medical Center,,,TN,37066.0
1,3441 Dickerson Pike,,Nashville,Tristar Skyline Medical Center,,,TN,37207.0
2,421 S Main St,,Crossville,Cumberland Medical Center,,,TN,38555.0
3,907 E Lamar Alexander Parkway,,Maryville,Blount Memorial Hospital,35.754974,-83.9591,TN,37804.0
4,One Medical Park Blvd,,Bristol,Wellmont Bristol Regional Medical Center,,,TN,37620.0
5,130 West Ravine Road,,Kingsport,Wellmont Holston Valley Medical Center,,,TN,37662.0
6,4321 Carothers Parkway,,Franklin,Williamson Medical Center,35.917603,-86.81695,TN,37067.0
7,908 W 4th North St,,Morristown,Morristown Hamblen Hospital Association,36.211937,-83.30439,TN,37814.0
8,726 Mcfarland St,,Morristown,Lakeway Regional Hospital,36.214622,-83.30431,TN,37814.0
9,8045 Roane Medical Center Drive,,Harriman,Roane Medical Center,35.88938,-84.555176,TN,37748.0


In [41]:
# # drop duplicates
# all_providers = all_providers.drop_duplicates(keep=False)
# all_providers

In [42]:
all_providers.tail()

Unnamed: 0,address,address2,city,full_name,latitude,longitude,state,zip
10074,3601 the vanderbilt clinic,,Nashville,Tiffany Street,,,TN,372320014.0
10101,975 e 3rd st,,Chattanooga,Jasmine Shah,,,TN,374032147.0
10102,975 e 3rd st,,Chattanooga,Jasmine Shah,,,TN,374032147.0
10109,5301 virginia way,Suite 300,Brentwood,Farrukh Gill,36.032856,-86.80924,TN,370277541.0
10110,1124 new highway 52 e,,Westmoreland,Amanda Perry,,,TN,371865060.0


In [43]:
# export unique addresses for geocoding later
# all_providers_addresses = all_providers[['address','city', 'state', 'zip']]

# drop duplicate addresses
all_providers.drop_duplicates(keep=False,inplace=True)

all_providers.tail()

Unnamed: 0,address,address2,city,full_name,latitude,longitude,state,zip
10049,710 carl perkins parkway,,Tiptonville,Carol Guess,36.369045,-89.4685,TN,380791305.0
10050,2805 old fort pkwy,Suite d,Murfreesboro,Amy Bennett,35.844967,-86.44899,TN,371285115.0
10054,415 n lindell st,,Martin,Phillip Elliott,,,TN,38237.0
10109,5301 virginia way,Suite 300,Brentwood,Farrukh Gill,36.032856,-86.80924,TN,370277541.0
10110,1124 new highway 52 e,,Westmoreland,Amanda Perry,,,TN,371865060.0


In [44]:
all_providers.head()

Unnamed: 0,address,address2,city,full_name,latitude,longitude,state,zip
0,555 Hartsville Pike,,Gallatin,Sumner Regional Medical Center,,,TN,37066.0
1,3441 Dickerson Pike,,Nashville,Tristar Skyline Medical Center,,,TN,37207.0
2,421 S Main St,,Crossville,Cumberland Medical Center,,,TN,38555.0
3,907 E Lamar Alexander Parkway,,Maryville,Blount Memorial Hospital,35.754974,-83.9591,TN,37804.0
4,One Medical Park Blvd,,Bristol,Wellmont Bristol Regional Medical Center,,,TN,37620.0


In [45]:
# all_providers_addresses.info()

# # NOTE: zip colim needs to be an integer to work with online geocoder at: 
# # https://geocoding.geo.census.gov/geocoder/geographies/addressbatch?form

# # df['column name'] = df['column name'].astype(np.int64)
# all_providers_addresses['zip'] = all_providers_addresses['zip'].astype(np.int64)


In [46]:
all_providers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1366 entries, 0 to 10110
Data columns (total 8 columns):
address      1366 non-null object
address2     448 non-null object
city         1366 non-null object
full_name    1366 non-null object
latitude     973 non-null float64
longitude    973 non-null float64
state        1366 non-null object
zip          1366 non-null float64
dtypes: float64(3), object(5)
memory usage: 96.0+ KB


In [47]:
# export address dataframe for geocoding
# all_providers.to_csv('../data/clean/all_providers.csv')
all_providers.to_csv('../data/clean/all_providers_geocoded_addresses.csv')



## NOTE: This notebook needs some cleaning up. The cells below are no longer being used. There are also a few above that have been commented out. 

In [48]:
# NOTE This info may no longer be needed


########################### READ ME FOR LATER #################################
# NOTE: The addresses file will go through the geocoder and return the following format: "","xxx",""
#     This is a comma-delimited format with double quotes around each cell. I've been goint into BBEdit and
#     deleting the double quotes. Loads fine after that, but you'll still need to add column names, see below.
   
# Keep these columns 
#     - drop_me
#     - address
#     - city
#     - state
#     - zip
#     - No_Match
#     - match_type
#     - longitude
#     - latitude

# Drop these columns
#     - street2
#     - city2
#     - state2
#     - zip2
#     - unknown
#     - unknown2
#     - unknown3
#     - unknown4
#     - unknown5
#     - unknown6


In [49]:
# # set up import variables
# all_providers_geocodes_csv = '../data/clean/all_providers_geocodes.csv'
# columns = ['address', ' city', ' state', ' zip', 'longitude', 'latitude'] 


# # import files to dataframes
# all_providers_geocodes = pd.read_csv(all_providers_geocodes_csv, usecols=columns)

# # delete initial space in column names
# all_providers_geocodes.columns = all_providers_geocodes.columns.str.replace(' ', '').str.lower()

# all_providers_geocodes.info()



In [50]:
# # join geocoded addresses
# # pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
# #          left_index=False, right_index=False, sort=True,
# #          suffixes=('_x', '_y'), copy=True, indicator=False,
# #          validate=None)

# #DEBUGDEBUG This is not working correctly
# result = pd.merge(all_providers, all_providers_geocodes, how='right', on=['address'])

# result.tail(50)

In [51]:
# # fix column names
# new_names = ['address', 'address2', 'apc', 'apc_description', 'city_x', 'first_name',
#        'full_name', 'hcpcs_code', 'hcpcs_description', 'last_name',
#        'provider_type', 'state_x', 'zip_x', 'city', 'state', 'zip',
#        'longitude', 'latitude']


# result.columns = new_names

# result.columns

In [52]:
# # create all_providers_geocode_addresses
# all_providers_geocoded_addresses = result[['full_name', 'first_name', 'last_name', 'address', 'address2',
#                                           'city', 'state', 'zip', 'longitude', 'latitude', 'apc', 'apc_description',
#                                           'hcpcs_code', 'hcpcs_description', 'provider_type']]

# all_providers_geocoded_addresses.tail(50)

In [53]:
# # save new dataframes
# all_providers.to_csv('../data/clean/all_providers.csv')
# all_providers_geocoded_addresses.to_csv('../data/clean/all_providers_geocoded_addresses.csv')

