# Import Data and Create Dataframes

- set up the dataframes
- clean up as needed
- join as needed
- save cleaned and joined dataframes

## DEBUGDEBUG
- I am lost in getting the locations on a map. The ASC map seems to have more locations than the outpatients one. It also seems to be less crowded than the one I deleted.
- I need to find the best way to divide up Davidson County and look at it. Thinking that neighborhoods would be the best divisor but not sure I can find map files for that. Maybe go back to census tracts/blocks? Could also use political districts or maybe go back to zip codes

# Import Libraries

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Import and Setup Dataframes

In [4]:
# Setup filename variables
outpatient_csv = '../data/original/tennessee_outpatient_clean.csv'
providers_tsv = '../data/original/Medicare_Provider_Util_Payment_PUF_CY2017.tsv'

# import files to dataframes
outpatient = pd.read_csv(outpatient_csv, low_memory=False,
                         dtype={"provider_name": object, "provider_street_address": object, "provider_city": object})
providers = pd.read_csv(providers_tsv, sep='\t', low_memory=False) 



In [5]:
providers.tail(50)

Unnamed: 0,npi,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_mi,nppes_credentials,nppes_provider_gender,nppes_entity_code,nppes_provider_street1,nppes_provider_street2,nppes_provider_city,...,hcpcs_code,hcpcs_description,hcpcs_drug_indicator,line_srvc_cnt,bene_unique_cnt,bene_day_srvc_cnt,average_Medicare_allowed_amt,average_submitted_chrg_amt,average_Medicare_payment_amt,average_Medicare_standard_amt
9847394,1992999122,JOHNSON,CHARLES,R,D.O.,M,I,1601 CLINT MOORE RD,155,BOCA RATON,...,99238,"Hospital discharge day management, 30 minutes ...",N,108.0,89.0,108.0,76.26,90.0,59.05787,56.9675
9847395,1992999122,JOHNSON,CHARLES,R,D.O.,M,I,1601 CLINT MOORE RD,155,BOCA RATON,...,G0179,Physician re-certification for medicare-covere...,N,28.0,17.0,28.0,43.32,50.0,32.756786,31.753929
9847396,1992999122,JOHNSON,CHARLES,R,D.O.,M,I,1601 CLINT MOORE RD,155,BOCA RATON,...,G0180,Physician certification for medicare-covered h...,N,50.0,45.0,50.0,56.51,60.0,43.4238,41.9146
9847397,1992999122,JOHNSON,CHARLES,R,D.O.,M,I,1601 CLINT MOORE RD,155,BOCA RATON,...,G0328,Colorectal cancer screening; fecal occult bloo...,N,160.0,160.0,160.0,21.82,30.0,21.38,21.38
9847398,1992999122,JOHNSON,CHARLES,R,D.O.,M,I,1601 CLINT MOORE RD,155,BOCA RATON,...,G0402,Initial preventive physical examination; face-...,N,15.0,15.0,15.0,171.941333,192.0,168.499333,165.3
9847399,1992999122,JOHNSON,CHARLES,R,D.O.,M,I,1601 CLINT MOORE RD,155,BOCA RATON,...,G0403,"Electrocardiogram, routine ecg with 12 leads; ...",N,15.0,15.0,15.0,17.99,25.0,10.34,9.9
9847400,1992999122,JOHNSON,CHARLES,R,D.O.,M,I,1601 CLINT MOORE RD,155,BOCA RATON,...,G0439,"Annual wellness visit, includes a personalized...",N,353.0,353.0,353.0,120.478754,128.498584,118.070255,115.371048
9847401,1992999437,RIVERA,JOSE,L,M.D.,M,I,1600 S ANDREWS AVE,,FORT LAUDERDALE,...,99284,"Emergency department visit, problem of high se...",N,35.0,35.0,35.0,132.35,1144.342857,99.901714,89.847429
9847402,1992999437,RIVERA,JOSE,L,M.D.,M,I,1600 S ANDREWS AVE,,FORT LAUDERDALE,...,99285,"Emergency department visit, problem with signi...",N,224.0,216.0,224.0,195.88,1775.571429,148.791562,133.422098
9847403,1992999437,RIVERA,JOSE,L,M.D.,M,I,1600 S ANDREWS AVE,,FORT LAUDERDALE,...,99291,Critical care delivery critically ill or injur...,N,49.0,49.0,49.0,249.79,2041.081633,190.435714,172.425714


In [6]:
# confirm imports
print('\n\nThis is: outpatient')
print(outpatient.info())

print('\n\nThis is: providers')
print(providers.info())




This is: outpatient
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47417 entries, 0 to 47416
Data columns (total 23 columns):
Unnamed: 0                                   47417 non-null int64
provider_id                                  79 non-null float64
provider_name                                79 non-null object
provider_street_address                      79 non-null object
provider_city                                79 non-null object
provider_state                               79 non-null object
provider_zip_code                            79 non-null float64
provider_hospital_referral_region_(hrr)      79 non-null object
apc                                          79 non-null float64
apc_description                              79 non-null object
beneficiaries                                77 non-null float64
comprehensive_apc_services                   79 non-null float64
average_estimated_total_submitted_charges    79 non-null float64
average_medicare_allowed_amou

# Clean Up Column Names and Dtypes

# Join Files? 
- not sure if I should do this yet as file are still having issues. Commented out for now.

In [7]:
# make provider names and cities lowercase
providers['nppes_provider_last_org_name'] = providers['nppes_provider_last_org_name'].str.capitalize()
providers['nppes_provider_first_name'] = providers['nppes_provider_first_name'].str.capitalize()
providers['nppes_provider_city'] = providers['nppes_provider_city'].str.capitalize()


Keep these columns from providers:
- nppes_provider_last_org_name,
- nppes_provider_first_name,
- nppes_provider_street1,
- nppes_provider_street2,
- nppes_provider_city,
- nppes_provider_zip,
- nppes_provider_state,
- provider_type,
- hcpcs_code,
- hcpcs_description


In [8]:
# create temp files with the fields we need
# providers_tmp = providers[['nppes_provider_last_org_name', 'nppes_provider_first_name', 
#                           'nppes_provider_street1', 'nppes_provider_street2', 'nppes_provider_city',
#                           'nppes_provider_state', 'nppes_provider_zip', 'provider_type', 'hcpcs_code',
#                           'hcpcs_description']]

providers_tmp = providers[['nppes_provider_last_org_name', 'nppes_provider_first_name', 
                          'nppes_provider_street1', 'nppes_provider_street2', 'nppes_provider_city',
                          'nppes_provider_state', 'nppes_provider_zip']]

providers_tmp = providers_tmp.drop([0])
providers_tmp.head()


Unnamed: 0,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_street1,nppes_provider_street2,nppes_provider_city,nppes_provider_state,nppes_provider_zip
1,Enkeshafi,Ardalan,900 SETON DR,,Cumberland,MD,215021854
2,Enkeshafi,Ardalan,900 SETON DR,,Cumberland,MD,215021854
3,Enkeshafi,Ardalan,900 SETON DR,,Cumberland,MD,215021854
4,Enkeshafi,Ardalan,900 SETON DR,,Cumberland,MD,215021854
5,Enkeshafi,Ardalan,900 SETON DR,,Cumberland,MD,215021854


In [9]:
# combine first and last names into 'full_name' field
# df['Name'] = df['First'].str.cat(df['Last'],sep=" ")

providers_tmp['full_name'] = providers_tmp['nppes_provider_first_name'].str.cat(providers_tmp['nppes_provider_last_org_name'],sep=' ')
                                         
                                         

providers_tmp.tail()


Unnamed: 0,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_street1,nppes_provider_street2,nppes_provider_city,nppes_provider_state,nppes_provider_zip,full_name
9847439,Deschenes,Geoffrey,1100 9TH AVE,MS:M4-PFS,Seattle,WA,981012756,Geoffrey Deschenes
9847440,Joffe,Gabriella,8260 ATLEE RD,"MOB 2, SUITE 319",Mechanicsville,VA,231161844,Gabriella Joffe
9847441,Joffe,Gabriella,8260 ATLEE RD,"MOB 2, SUITE 319",Mechanicsville,VA,231161844,Gabriella Joffe
9847442,Joffe,Gabriella,8260 ATLEE RD,"MOB 2, SUITE 319",Mechanicsville,VA,231161844,Gabriella Joffe
9847443,Joffe,Gabriella,8260 ATLEE RD,"MOB 2, SUITE 319",Mechanicsville,VA,231161844,Gabriella Joffe


In [10]:
# exclude non-Tennessee records
tennessee_providers = providers_tmp.loc[providers_tmp ['nppes_provider_state'] == 'TN'] 

# convert zip code to int
tennessee_providers.nppes_provider_zip = tennessee_providers.nppes_provider_zip.astype('int64') 


print(tennessee_providers.info())
tennessee_providers.tail()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 269452 entries, 416 to 9847080
Data columns (total 8 columns):
nppes_provider_last_org_name    269448 non-null object
nppes_provider_first_name       259479 non-null object
nppes_provider_street1          269452 non-null object
nppes_provider_street2          115651 non-null object
nppes_provider_city             269452 non-null object
nppes_provider_state            269452 non-null object
nppes_provider_zip              269452 non-null int64
full_name                       259475 non-null object
dtypes: int64(1), object(7)
memory usage: 18.5+ MB
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,nppes_provider_last_org_name,nppes_provider_first_name,nppes_provider_street1,nppes_provider_street2,nppes_provider_city,nppes_provider_state,nppes_provider_zip,full_name
9847076,Giles,Wesley,979 E 3RD ST STE 300,,Chattanooga,TN,374032187,Wesley Giles
9847077,Giles,Wesley,979 E 3RD ST STE 300,,Chattanooga,TN,374032187,Wesley Giles
9847078,Giles,Wesley,979 E 3RD ST STE 300,,Chattanooga,TN,374032187,Wesley Giles
9847079,Giles,Wesley,979 E 3RD ST STE 300,,Chattanooga,TN,374032187,Wesley Giles
9847080,Giles,Wesley,979 E 3RD ST STE 300,,Chattanooga,TN,374032187,Wesley Giles


Keep these columns from outpatient:
- provider_name,
- provider_street_address,
- provider_city,
- provider_state,
- provider_zip_code,
- apc,
- apc_description,
- beneficiaries

In [11]:
# create temp files with the fields we need
outpatient_tmp = outpatient[['provider_name', 'provider_street_address', 'provider_city', 'provider_state',
                             'provider_zip_code', 'apc', 'apc_description'
]]

print(outpatient_tmp.info())
outpatient_tmp.head()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47417 entries, 0 to 47416
Data columns (total 7 columns):
provider_name              79 non-null object
provider_street_address    79 non-null object
provider_city              79 non-null object
provider_state             79 non-null object
provider_zip_code          79 non-null float64
apc                        79 non-null float64
apc_description            79 non-null object
dtypes: float64(2), object(5)
memory usage: 2.5+ MB
None


Unnamed: 0,provider_name,provider_street_address,provider_city,provider_state,provider_zip_code,apc,apc_description
0,Jackson-Madison County General Hospital,620 Skyline Drive,Jackson,TN,38301.0,5302.0,Level 2 Upper GI Procedures
1,Jackson-Madison County General Hospital,620 Skyline Drive,Jackson,TN,38301.0,5302.0,Level 2 Upper GI Procedures
2,Sumner Regional Medical Center,555 Hartsville Pike,Gallatin,TN,37066.0,5302.0,Level 2 Upper GI Procedures
3,Tristar Skyline Medical Center,3441 Dickerson Pike,Nashville,TN,37207.0,5302.0,Level 2 Upper GI Procedures
4,Cumberland Medical Center,421 S Main St,Crossville,TN,38555.0,5302.0,Level 2 Upper GI Procedures


# Clean up Column Names

In [12]:
# rename columns
# tennessee_providers.columns = ['last_name', 'first_name', 'address', 'address2', 'city', 'state', 'zip',
#                               'provider_type', 'hcpcs_code', 'hcpcs_description', 'full_name'] 

tennessee_providers.columns = ['last_name', 'first_name', 'address', 'address2', 'city', 'state', 'zip', 'full_name'] 

outpatient_tmp.columns = ['full_name', 'address', 'city', 'state', 'zip', 'apc', 'apc_description']

In [13]:
# confirm renames
print('This is tennessee_providers')
tennessee_providers.info()

This is tennessee_providers
<class 'pandas.core.frame.DataFrame'>
Int64Index: 269452 entries, 416 to 9847080
Data columns (total 8 columns):
last_name     269448 non-null object
first_name    259479 non-null object
address       269452 non-null object
address2      115651 non-null object
city          269452 non-null object
state         269452 non-null object
zip           269452 non-null int64
full_name     259475 non-null object
dtypes: int64(1), object(7)
memory usage: 18.5+ MB


In [14]:
print('\n\nThis is outpatient_tmp')
outpatient_tmp.info()



This is outpatient_tmp
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47417 entries, 0 to 47416
Data columns (total 7 columns):
full_name          79 non-null object
address            79 non-null object
city               79 non-null object
state              79 non-null object
zip                79 non-null float64
apc                79 non-null float64
apc_description    79 non-null object
dtypes: float64(2), object(5)
memory usage: 2.5+ MB


## Look for NaNs and duplicates

In [15]:
tennessee_providers.head(10)

Unnamed: 0,last_name,first_name,address,address2,city,state,zip,full_name
416,Walgreen co.,,5104 BOBBY HICKS HWY,,Gray,TN,376156217,
417,Walgreen co.,,5104 BOBBY HICKS HWY,,Gray,TN,376156217,
418,Walgreen co.,,5104 BOBBY HICKS HWY,,Gray,TN,376156217,
419,Walgreen co.,,5104 BOBBY HICKS HWY,,Gray,TN,376156217,
420,Walgreen co.,,5104 BOBBY HICKS HWY,,Gray,TN,376156217,
1747,Cudzilo,Corey,2240 SUTHERLAND AVE,SUITE 103,Knoxville,TN,379192333,Corey Cudzilo
1748,Cudzilo,Corey,2240 SUTHERLAND AVE,SUITE 103,Knoxville,TN,379192333,Corey Cudzilo
1749,Cudzilo,Corey,2240 SUTHERLAND AVE,SUITE 103,Knoxville,TN,379192333,Corey Cudzilo
1750,Cudzilo,Corey,2240 SUTHERLAND AVE,SUITE 103,Knoxville,TN,379192333,Corey Cudzilo
1751,Cudzilo,Corey,2240 SUTHERLAND AVE,SUITE 103,Knoxville,TN,379192333,Corey Cudzilo


In [16]:
tennessee_providers.tail(10)

Unnamed: 0,last_name,first_name,address,address2,city,state,zip,full_name
9847071,Giles,Wesley,979 E 3RD ST STE 300,,Chattanooga,TN,374032187,Wesley Giles
9847072,Giles,Wesley,979 E 3RD ST STE 300,,Chattanooga,TN,374032187,Wesley Giles
9847073,Giles,Wesley,979 E 3RD ST STE 300,,Chattanooga,TN,374032187,Wesley Giles
9847074,Giles,Wesley,979 E 3RD ST STE 300,,Chattanooga,TN,374032187,Wesley Giles
9847075,Giles,Wesley,979 E 3RD ST STE 300,,Chattanooga,TN,374032187,Wesley Giles
9847076,Giles,Wesley,979 E 3RD ST STE 300,,Chattanooga,TN,374032187,Wesley Giles
9847077,Giles,Wesley,979 E 3RD ST STE 300,,Chattanooga,TN,374032187,Wesley Giles
9847078,Giles,Wesley,979 E 3RD ST STE 300,,Chattanooga,TN,374032187,Wesley Giles
9847079,Giles,Wesley,979 E 3RD ST STE 300,,Chattanooga,TN,374032187,Wesley Giles
9847080,Giles,Wesley,979 E 3RD ST STE 300,,Chattanooga,TN,374032187,Wesley Giles


In [17]:
# drop duplicates
tennessee_providers = tennessee_providers.drop_duplicates(keep=False)
tennessee_providers

Unnamed: 0,last_name,first_name,address,address2,city,state,zip,full_name
6354,Stone,Ralph,1700 CARMACK BLVD,,Columbia,TN,38401,Ralph Stone
15768,State of tennessee,,214 WEST LONGVIEW DRIVE,,Portland,TN,37148,
17080,Davis,Patrick,1222 TROTWOOD AVE STE 603,,Columbia,TN,38401,Patrick Davis
26250,Horn,Cassondra,1 MEDICAL PARK BLVD,,Bristol,TN,376207430,Cassondra Horn
27899,Hiatt,Emily,3310 W END AVE,SUITE 590,Nashville,TN,372031028,Emily Hiatt
...,...,...,...,...,...,...,...,...
9834934,Shah,Jasmine,975 E 3RD ST,,Chattanooga,TN,374032147,Jasmine Shah
9834942,Basty,Marie,713 CHEATHAM ST,,Springfield,TN,371722828,Marie Basty
9839309,Bell,Amanda,317 N HICKORY AVE,,Cookeville,TN,385012428,Amanda Bell
9840208,Gill,Farrukh,5301 VIRGINIA WAY,SUITE 300,Brentwood,TN,370277541,Farrukh Gill


In [18]:
outpatient_tmp.head(80)

Unnamed: 0,full_name,address,city,state,zip,apc,apc_description
0,Jackson-Madison County General Hospital,620 Skyline Drive,Jackson,TN,38301.0,5302.0,Level 2 Upper GI Procedures
1,Jackson-Madison County General Hospital,620 Skyline Drive,Jackson,TN,38301.0,5302.0,Level 2 Upper GI Procedures
2,Sumner Regional Medical Center,555 Hartsville Pike,Gallatin,TN,37066.0,5302.0,Level 2 Upper GI Procedures
3,Tristar Skyline Medical Center,3441 Dickerson Pike,Nashville,TN,37207.0,5302.0,Level 2 Upper GI Procedures
4,Cumberland Medical Center,421 S Main St,Crossville,TN,38555.0,5302.0,Level 2 Upper GI Procedures
...,...,...,...,...,...,...,...
75,Tennova Healthcare-Lebanon,1411 Baddour Parkway,Lebanon,TN,37087.0,5302.0,Level 2 Upper GI Procedures
76,Tristar Hendersonville Medical Center,355 New Shackle Island Rd,Hendersonville,TN,37075.0,5302.0,Level 2 Upper GI Procedures
77,Tristar Southern Hills Medical Center,391 Wallace Rd,Nashville,TN,37211.0,5302.0,Level 2 Upper GI Procedures
78,Tristar Stonecrest Medical Center,200 Stonecrest Boulevard,Smyrna,TN,37167.0,5302.0,Level 2 Upper GI Procedures


In [19]:
outpatient_tmp.tail(47339)

Unnamed: 0,full_name,address,city,state,zip,apc,apc_description
78,Tristar Stonecrest Medical Center,200 Stonecrest Boulevard,Smyrna,TN,37167.0,5302.0,Level 2 Upper GI Procedures
79,,,,,,,
80,,,,,,,
81,,,,,,,
82,,,,,,,
...,...,...,...,...,...,...,...
47412,,,,,,,
47413,,,,,,,
47414,,,,,,,
47415,,,,,,,


In [20]:
# drop NaN columns
outpatient_tmp.dropna()

Unnamed: 0,full_name,address,city,state,zip,apc,apc_description
0,Jackson-Madison County General Hospital,620 Skyline Drive,Jackson,TN,38301.0,5302.0,Level 2 Upper GI Procedures
1,Jackson-Madison County General Hospital,620 Skyline Drive,Jackson,TN,38301.0,5302.0,Level 2 Upper GI Procedures
2,Sumner Regional Medical Center,555 Hartsville Pike,Gallatin,TN,37066.0,5302.0,Level 2 Upper GI Procedures
3,Tristar Skyline Medical Center,3441 Dickerson Pike,Nashville,TN,37207.0,5302.0,Level 2 Upper GI Procedures
4,Cumberland Medical Center,421 S Main St,Crossville,TN,38555.0,5302.0,Level 2 Upper GI Procedures
...,...,...,...,...,...,...,...
74,Saint Thomas Highlands Hospital,401 Sewell Dr,Sparta,TN,38583.0,5302.0,Level 2 Upper GI Procedures
75,Tennova Healthcare-Lebanon,1411 Baddour Parkway,Lebanon,TN,37087.0,5302.0,Level 2 Upper GI Procedures
76,Tristar Hendersonville Medical Center,355 New Shackle Island Rd,Hendersonville,TN,37075.0,5302.0,Level 2 Upper GI Procedures
77,Tristar Southern Hills Medical Center,391 Wallace Rd,Nashville,TN,37211.0,5302.0,Level 2 Upper GI Procedures


In [21]:
# drop duplicates
outpatient_tmp = outpatient_tmp.drop_duplicates(keep=False)
outpatient_tmp

Unnamed: 0,full_name,address,city,state,zip,apc,apc_description
2,Sumner Regional Medical Center,555 Hartsville Pike,Gallatin,TN,37066.0,5302.0,Level 2 Upper GI Procedures
3,Tristar Skyline Medical Center,3441 Dickerson Pike,Nashville,TN,37207.0,5302.0,Level 2 Upper GI Procedures
4,Cumberland Medical Center,421 S Main St,Crossville,TN,38555.0,5302.0,Level 2 Upper GI Procedures
5,Blount Memorial Hospital,907 E Lamar Alexander Parkway,Maryville,TN,37804.0,5302.0,Level 2 Upper GI Procedures
6,Wellmont Bristol Regional Medical Center,One Medical Park Blvd,Bristol,TN,37620.0,5302.0,Level 2 Upper GI Procedures
9,Wellmont Holston Valley Medical Center,130 West Ravine Road,Kingsport,TN,37662.0,5302.0,Level 2 Upper GI Procedures
14,Williamson Medical Center,4321 Carothers Parkway,Franklin,TN,37067.0,5302.0,Level 2 Upper GI Procedures
15,Morristown Hamblen Hospital Association,908 W 4th North St,Morristown,TN,37814.0,5302.0,Level 2 Upper GI Procedures
16,Lakeway Regional Hospital,726 Mcfarland St,Morristown,TN,37814.0,5302.0,Level 2 Upper GI Procedures
17,Roane Medical Center,8045 Roane Medical Center Drive,Harriman,TN,37748.0,5302.0,Level 2 Upper GI Procedures


In [22]:
# save this version of outpatient_tmp for later
outpatient_addresses = outpatient_tmp.drop(['apc','apc_description'], axis=1)

# save_me
outpatient_addresses.to_csv('../data/clean/outpatient_providers.csv')


hey, stop here

SyntaxError: invalid syntax (<ipython-input-22-32f1eb22b672>, line 8)

## Import and Join Geocodes to DataFrames

In [None]:
# set up import variables
outpatient_geocodes_csv = '../data/clean/all_providers_geocodes.csv'

# import only the needed columns from outpatient_geocodes
columns = ['address', ' city', ' state', ' zip', 'longitude', 'latitude'] 

# import files to dataframes
outpatient_geocodes = pd.read_csv(outpatient_geocodes_csv, usecols=columns)

# delete initial space in column names
outpatient_geocodes.columns = outpatient_geocodes.columns.str.replace(' ', '').str.lower()

outpatient_geocodes.info()



In [None]:
# join
result = pd.merge(outpatient_addresses, outpatient_geocodes, how='left', on=['address'])

result.tail(50)

In [None]:
# export geocoded outpatient addresses
result.to_csv('../data/clean/outpatient_geocoded_addresses.csv')


In [None]:
hey, stop here

In [None]:
# join temp dataframes
# all_providers = pd.concat([tennessee_providers, outpatient_tmp], sort=False)
# df1.append(df2) 

all_providers = outpatient_tmp.append(tennessee_providers)

# all_providers.head(47339)
# drop NaN
all_providers.dropna


In [None]:
# drop duplicates
all_providers = all_providers.drop_duplicates(keep=False)
all_providers

In [None]:
all_providers.tail()

In [None]:
# export unique addresses for geocoding later
all_providers_addresses = all_providers[['address','city', 'state', 'zip']]

# drop duplicate addresses
all_providers_addresses.drop_duplicates(keep=False,inplace=True)



In [None]:
all_providers_addresses.head()

In [None]:
all_providers_addresses.info()

# NOTE: zip colim needs to be an integer to work with online geocoder at: 
# https://geocoding.geo.census.gov/geocoder/geographies/addressbatch?form

# df['column name'] = df['column name'].astype(np.int64)
all_providers_addresses['zip'] = all_providers_addresses['zip'].astype(np.int64)


In [None]:
all_providers_addresses.info()

In [None]:
# export address dataframe for geocoding
# all_providers.to_csv('../data/clean/all_providers.csv')
all_providers_addresses.to_csv('../data/clean/all_providers_addresses.csv')


########################### READ ME FOR LATER #################################
# NOTE: The addresses file will go through the geocoder and return the following format: "","xxx",""
#     This is a comma-delimited format with double quotes around each cell. I've been goint into BBEdit and
#     deleting the double quotes. Loads fine after that, but you'll still need to add column names, see below.
   
# Keep these columns 
#     - drop_me
#     - address
#     - city
#     - state
#     - zip
#     - No_Match
#     - match_type
#     - longitude
#     - latitude

# Drop these columns
#     - street2
#     - city2
#     - state2
#     - zip2
#     - unknown
#     - unknown2
#     - unknown3
#     - unknown4
#     - unknown5
#     - unknown6


In [None]:
# set up import variables
all_providers_geocodes_csv = '../data/clean/all_providers_geocodes.csv'
columns = ['address', ' city', ' state', ' zip', 'longitude', 'latitude'] 


# import files to dataframes
all_providers_geocodes = pd.read_csv(all_providers_geocodes_csv, usecols=columns)

# delete initial space in column names
all_providers_geocodes.columns = all_providers_geocodes.columns.str.replace(' ', '').str.lower()

all_providers_geocodes.info()



In [None]:
# join geocoded addresses
# pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
#          left_index=False, right_index=False, sort=True,
#          suffixes=('_x', '_y'), copy=True, indicator=False,
#          validate=None)

#DEBUGDEBUG This is not working correctly
result = pd.merge(all_providers, all_providers_geocodes, how='right', on=['address'])

result.tail(50)

In [None]:
# fix column names
new_names = ['address', 'address2', 'apc', 'apc_description', 'city_x', 'first_name',
       'full_name', 'hcpcs_code', 'hcpcs_description', 'last_name',
       'provider_type', 'state_x', 'zip_x', 'city', 'state', 'zip',
       'longitude', 'latitude']


result.columns = new_names

result.columns

In [None]:
# create all_providers_geocode_addresses
all_providers_geocoded_addresses = result[['full_name', 'first_name', 'last_name', 'address', 'address2',
                                          'city', 'state', 'zip', 'longitude', 'latitude', 'apc', 'apc_description',
                                          'hcpcs_code', 'hcpcs_description', 'provider_type']]

all_providers_geocoded_addresses.tail(50)

In [None]:
# save new dataframes
all_providers.to_csv('../data/clean/all_providers.csv')
all_providers_geocoded_addresses.to_csv('../data/clean/all_providers_geocoded_addresses.csv')

