In [85]:
from collections import Counter

import bokeh
import pandas as pd
import pandas_bokeh


In [86]:
# Send pandas_bokeh figures to inline notebook output so that each plot is
# embedded directly below the cell that creates it.
pandas_bokeh.output_notebook()

# Data overview

In [87]:
# Read the hospital charges dataset and preview its first three rows.
df = pd.read_csv(filepath_or_buffer='hospital-charges.csv')
df.head(n=3)

Unnamed: 0,DRG Definition,Provider Id,Provider Name,Provider Street Address,Provider City,Provider State,Provider Zip Code,Hospital Referral Region Description,Total Discharges,Average Covered Charges,Average Total Payments,Average Medicare Payments
0,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,10001,SOUTHEAST ALABAMA MEDICAL CENTER,1108 ROSS CLARK CIRCLE,DOTHAN,AL,36301,AL - Dothan,91,$32963.07,$5777.24,$4763.73
1,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,10005,MARSHALL MEDICAL CENTER SOUTH,2505 U S HIGHWAY 431 NORTH,BOAZ,AL,35957,AL - Birmingham,14,$15131.85,$5787.57,$4976.71
2,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,10006,ELIZA COFFEE MEMORIAL HOSPITAL,205 MARENGO STREET,FLORENCE,AL,35631,AL - Birmingham,24,$37560.37,$5434.95,$4453.79


In [88]:
# Inspect column names, non-null counts and dtypes. The output below shows that
# several column names carry leading spaces and that the charge/payment columns
# are stored as strings (object dtype); both are cleaned up in the next section.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163065 entries, 0 to 163064
Data columns (total 12 columns):
 #   Column                                Non-Null Count   Dtype 
---  ------                                --------------   ----- 
 0   DRG Definition                        163065 non-null  object
 1   Provider Id                           163065 non-null  int64 
 2   Provider Name                         163065 non-null  object
 3   Provider Street Address               163065 non-null  object
 4   Provider City                         163065 non-null  object
 5   Provider State                        163065 non-null  object
 6   Provider Zip Code                     163065 non-null  int64 
 7   Hospital Referral Region Description  163065 non-null  object
 8    Total Discharges                     163065 non-null  int64 
 9    Average Covered Charges              163065 non-null  object
 10   Average Total Payments               163065 non-null  object
 11  Average Medic

# Format names and data

In [89]:
# Normalise the headers: strip the stray whitespace around the column names
# (visible in the df.info() output) and shorten the three monetary column names,
# tagging them with their unit. `df` is rebound to the renamed frame instead of
# being mutated with `inplace=True`, so the step reads as a single chain and can
# be re-run safely (both renames are no-ops on already-clean headers).
df = (
    df.rename(columns=str.strip)
      .rename(columns={
          'Average Covered Charges': 'Avg Covered Charges [$]',
          'Average Total Payments': 'Avg Total Payments [$]',
          'Average Medicare Payments': 'Avg Medicare Payments [$]',
      })
)
df.head(3)

Unnamed: 0,DRG Definition,Provider Id,Provider Name,Provider Street Address,Provider City,Provider State,Provider Zip Code,Hospital Referral Region Description,Total Discharges,Avg Covered Charges [$],Avg Total Payments [$],Avg Medicare Payments [$]
0,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,10001,SOUTHEAST ALABAMA MEDICAL CENTER,1108 ROSS CLARK CIRCLE,DOTHAN,AL,36301,AL - Dothan,91,$32963.07,$5777.24,$4763.73
1,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,10005,MARSHALL MEDICAL CENTER SOUTH,2505 U S HIGHWAY 431 NORTH,BOAZ,AL,35957,AL - Birmingham,14,$15131.85,$5787.57,$4976.71
2,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,10006,ELIZA COFFEE MEMORIAL HOSPITAL,205 MARENGO STREET,FLORENCE,AL,35631,AL - Birmingham,24,$37560.37,$5434.95,$4453.79


In [90]:

# Strip the "$" prefix from the monetary columns and convert them to float.
# The pattern is written as a raw string: in a plain literal, '\$' is an invalid
# escape sequence that Python warns about.
money_columns = ['Avg Covered Charges [$]', 'Avg Total Payments [$]', 'Avg Medicare Payments [$]']
df[money_columns] = df[money_columns].replace(r'\$', '', regex=True).astype(float)

# Derived metric used throughout the analysis: average total payment divided by
# the number of discharges of the same row.
# NOTE(review): 'Avg Total Payments [$]' is presumably already a per-discharge
# average, so dividing it by 'Total Discharges' again may not represent a true
# cost per patient - confirm the intended definition.
df['Avg. Cost/attended patient [$]'] = df['Avg Total Payments [$]'] / df['Total Discharges']

In [91]:
# Preview two rows to confirm that the monetary columns are now plain numbers
# and that the derived cost-per-attended-patient column has been added.
df.head(2)

Unnamed: 0,DRG Definition,Provider Id,Provider Name,Provider Street Address,Provider City,Provider State,Provider Zip Code,Hospital Referral Region Description,Total Discharges,Avg Covered Charges [$],Avg Total Payments [$],Avg Medicare Payments [$],Avg. Cost/attended patient [$]
0,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,10001,SOUTHEAST ALABAMA MEDICAL CENTER,1108 ROSS CLARK CIRCLE,DOTHAN,AL,36301,AL - Dothan,91,32963.07,5777.24,4763.73,63.486154
1,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,10005,MARSHALL MEDICAL CENTER SOUTH,2505 U S HIGHWAY 431 NORTH,BOAZ,AL,35957,AL - Birmingham,14,15131.85,5787.57,4976.71,413.397857


# Answer questions

### 1. Which are the most expensive states and the cheapest ones?
The best variable to compare is **Average Total Payments** rather than **Average Covered Charges**, because the latter is only the list (sticker) price, not the amount actually paid.

But...

In [92]:
# Median of the payment metrics for each state.
# The metric columns are selected explicitly before aggregating: taking the
# median of the whole frame would also try to aggregate the text columns
# (provider name, address, ...), which raises a TypeError on pandas >= 2.0.
# Selecting up front also replaces the later in-place drop of unwanted columns.
state_metrics = [
    'Avg Total Payments [$]',
    'Avg Medicare Payments [$]',
    'Avg. Cost/attended patient [$]',
]
df1 = df.groupby('Provider State')[state_metrics].median()
df1.head()

Unnamed: 0_level_0,Avg Total Payments [$],Avg Medicare Payments [$],Avg. Cost/attended patient [$]
Provider State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,11190.08,9594.93,550.763226
AL,5876.37,4811.25,235.660588
AR,6151.68,5165.3,231.121667
AZ,7671.36,6471.11,316.4925
CA,9378.01,8380.895,393.354306


In [93]:
# Cast the medians to whole dollars for plotting. Note that astype(int) drops
# the fractional part instead of rounding (e.g. 6151.68 becomes 6151), which is
# why the figures quoted in the conclusions are truncated values.
df1 = df1.astype(int)

In [94]:
# Order the states by their median total payment (ascending, the default).
sorted_df1 = df1.sort_values(by='Avg Total Payments [$]')

In [95]:
# Bar chart of the per-state medians, with states ordered by total payments.
by_states = sorted_df1.plot_bokeh.bar(
    figsize=(1200, 400),
    title='Hospital Charges in America by States',
    show_figure=True,
    legend='top_left',
)

In [96]:
# Same chart, but ranking the states by the median cost per attended patient
# and plotting only that metric.
sort_by_patients = df1.sort_values(by='Avg. Cost/attended patient [$]')
by_patient = sort_by_patients.plot_bokeh.bar(
    y='Avg. Cost/attended patient [$]',
    figsize=(1200, 400),
    title='Hospital Charges in America by Attended Patient',
    show_figure=True,
    legend='top_left',
)

In [97]:
# Working frame for the per-DRG analysis: remove the identifier, volume and
# payment columns that are not needed there. The labels are passed as a plain
# list through `columns=` (the original handed a DataFrame slice to drop(),
# which only worked because iterating a DataFrame yields its column names).
df_drg = df.drop(columns=[
    'Provider Id',
    'Provider Zip Code',
    'Total Discharges',
    'Avg Covered Charges [$]',
    'Avg Medicare Payments [$]',
    'Avg Total Payments [$]',
])

In [107]:
# Median cost per attended patient for every (DRG, state) pair.
# numeric_only=True keeps the aggregation on the numeric column only; without
# it pandas >= 2.0 raises a TypeError on the remaining text columns.
df_drg_max = df_drg.groupby(['DRG Definition', 'Provider State']).median(numeric_only=True)

# Within each DRG (ascending), put the most expensive state first (descending).
sorted_df_drg_max = df_drg_max.sort_values(
    ['DRG Definition', 'Avg. Cost/attended patient [$]'],
    ascending=[True, False],
)

# Keep only the first row of each DRG, i.e. the state with the highest median.
sorti = sorted_df_drg_max.groupby(level=0).head(1)

In [109]:
# Distinct states that rank as the most expensive for at least one DRG.
sorti.index.unique(level='Provider State')

Index(['DC', 'AK', 'HI', 'UT', 'WY', 'CA', 'OR', 'VT', 'NM', 'RI', 'NV', 'CT',
       'SD'],
      dtype='object', name='Provider State')

In [111]:
# State label of every per-DRG "most expensive" row (one entry per DRG).
listi = sorti.index.get_level_values(level='Provider State')

In [128]:
# Count, for each state, the number of DRGs in which it has the highest median
# cost per attended patient. Counter is imported in the notebook's import cell.
a = Counter(listi)
a

Counter({'DC': 8,
         'AK': 42,
         'HI': 21,
         'UT': 2,
         'WY': 13,
         'CA': 1,
         'OR': 1,
         'VT': 7,
         'NM': 1,
         'RI': 1,
         'NV': 1,
         'CT': 1,
         'SD': 1})

In [153]:
# Turn the per-state DRG tally into a DataFrame indexed by state so it can be
# plotted. The column names are given at construction time instead of being
# renamed in place afterwards. Counter is imported in the notebook's import cell.
a = Counter(listi)
b = dict(a)
c = pd.DataFrame(list(b.items()), columns=['Provider State', 'DRG_qty'])
d = c.set_index('Provider State')
d



Unnamed: 0_level_0,DRG_qty
Provider State,Unnamed: 1_level_1
DC,8
AK,42
HI,21
UT,2
WY,13
CA,1
OR,1
VT,7
NM,1
RI,1


In [155]:
# Rank the states by the number of DRGs they top and draw the result as bars.
sorted_d = d.sort_values(by=['DRG_qty'])
sorted_d.plot_bokeh.bar(
    legend='top_left',
    show_figure=True,
)

To answer that question, it is necessary to clarify which measure is being compared:

1.1. By **Avg Total Payments [$]**

💰 The three most expensive states✳️ are:
1. **Alaska** with $11190.
2. **Hawaii** with $10148.
3. **California** with $9378.  

✳️ **Washington, D.C.** is not a state; however, it is considered a special district because it is the capital city of the United States.

💰 The three cheapest states are:
1. **Alabama** with $5876.
2. **West Virginia** with $6102.
3. **Arkansas** with $6151.  

1.2. By **Discharges** (attended patient)

🤕 The three most expensive states are:
1. **Alaska** with $550.
2. **Hawaii** with $503.
3. **Wyoming** with $443.  


🤕 The three cheapest states are:
1. **Delaware** with $215.
2. **Kentucky** with $226.
3. **Tennessee** with $227. 


1.3. By **DRG**

💉 The states that are most often the most expensive one for a DRG are:
* **Alaska**: 42 DRG
* **Hawaii**: 21 DRG
* **Wyoming**: 13 DRG
* **Washington, D.C.**: 8 DRG
* **Vermont**: 7 DRG

💉 The cheapest states by DRG are:



### 2. Are the cheapest hospitals in the same state?

 

In [102]:
# Totals per (state, zip code). numeric_only=True restricts the sum to the
# numeric columns; without it pandas >= 2.0 would also "sum" the text columns
# by concatenating their strings.
# NOTE(review): the result still adds up 'Provider Id' and the per-row average
# columns, which does not look like a meaningful quantity - confirm whether a
# mean/median of the payment columns was intended here.
df2 = df.groupby(['Provider State', 'Provider Zip Code']).sum(numeric_only=True)
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Provider Id,Total Discharges,Avg Covered Charges [$],Avg Total Payments [$],Avg Medicare Payments [$],Avg. Cost/attended patient [$]
Provider State,Provider Zip Code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AK,99508,1181300,1403,2285991.93,889383.89,767378.66,47241.020366
AK,99519,1520076,2613,4031236.53,1132781.72,1015596.76,52750.061937
AK,99559,120108,137,45876.14,59637.32,54540.84,3029.116208
AK,99645,640192,793,1226802.20,371122.45,322782.67,18702.211819
AK,99669,360432,344,482658.18,207639.95,182150.37,12073.501473
...,...,...,...,...,...,...,...
WY,82716,8480032,311,291255.24,162774.63,147219.99,9433.991693
WY,82801,7950090,375,283548.15,137845.21,124673.64,6692.835522
WY,82901,4240088,127,142791.27,80474.15,71877.93,5799.567604
WY,82930,2120128,69,95796.40,34527.16,29749.98,2008.243758


In [103]:
# Check whether covered charges, total payments and Medicare payments are
# correlated with each other (pairwise Pearson coefficients).
payment_columns = ['Avg Covered Charges [$]', 'Avg Total Payments [$]', 'Avg Medicare Payments [$]']
df.loc[:, payment_columns].corr(method='pearson')

Unnamed: 0,Avg Covered Charges [$],Avg Total Payments [$],Avg Medicare Payments [$]
Avg Covered Charges [$],1.0,0.774112,0.768927
Avg Total Payments [$],0.774112,1.0,0.989362
Avg Medicare Payments [$],0.768927,0.989362,1.0
