In [130]:
!pip install pandas
!pip install bokeh
!pip install pandas_bokeh



In [100]:
import pandas as pd
import pandas_bokeh
import bokeh

In [101]:
# Direct pandas_bokeh output to the notebook so that every plot below is
# embedded in the output area of the cell that creates it.
pandas_bokeh.output_notebook()

# Data overview

In [102]:
# Load the raw hospital charges dataset (path is relative to the notebook's
# working directory) and preview the first rows.
df = pd.read_csv('hospital-charges.csv')
df.head()

Unnamed: 0,DRG Definition,Provider Id,Provider Name,Provider Street Address,Provider City,Provider State,Provider Zip Code,Hospital Referral Region Description,Total Discharges,Average Covered Charges,Average Total Payments,Average Medicare Payments
0,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,10001,SOUTHEAST ALABAMA MEDICAL CENTER,1108 ROSS CLARK CIRCLE,DOTHAN,AL,36301,AL - Dothan,91,$32963.07,$5777.24,$4763.73
1,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,10005,MARSHALL MEDICAL CENTER SOUTH,2505 U S HIGHWAY 431 NORTH,BOAZ,AL,35957,AL - Birmingham,14,$15131.85,$5787.57,$4976.71
2,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,10006,ELIZA COFFEE MEMORIAL HOSPITAL,205 MARENGO STREET,FLORENCE,AL,35631,AL - Birmingham,24,$37560.37,$5434.95,$4453.79
3,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,10011,ST VINCENT'S EAST,50 MEDICAL PARK EAST DRIVE,BIRMINGHAM,AL,35235,AL - Birmingham,25,$13998.28,$5417.56,$4129.16
4,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,10016,SHELBY BAPTIST MEDICAL CENTER,1000 FIRST STREET NORTH,ALABASTER,AL,35007,AL - Birmingham,18,$31633.27,$5658.33,$4851.44


In [103]:
# Inspect column names, non-null counts and dtypes before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163065 entries, 0 to 163064
Data columns (total 12 columns):
 #   Column                                Non-Null Count   Dtype 
---  ------                                --------------   ----- 
 0   DRG Definition                        163065 non-null  object
 1   Provider Id                           163065 non-null  int64 
 2   Provider Name                         163065 non-null  object
 3   Provider Street Address               163065 non-null  object
 4   Provider City                         163065 non-null  object
 5   Provider State                        163065 non-null  object
 6   Provider Zip Code                     163065 non-null  int64 
 7   Hospital Referral Region Description  163065 non-null  object
 8    Total Discharges                     163065 non-null  int64 
 9    Average Covered Charges              163065 non-null  object
 10   Average Total Payments               163065 non-null  object
 11  Average Medic

# Format names and data

In [104]:
# Normalise the headers: some raw column names carry leading whitespace, so
# trim them first, then shorten the three monetary columns and tag them with
# their unit.
df.columns = df.columns.str.strip()
money_column_names = {
    'Average Covered Charges': 'Avg Covered Charges [$]',
    'Average Total Payments': 'Avg Total Payments [$]',
    'Average Medicare Payments': 'Avg Medicare Payments [$]',
}
df.rename(columns=money_column_names, inplace=True)

In [105]:
# Strip the "$" symbol from the three monetary columns and convert them from
# strings to floats.
# The pattern is written as a raw string so that the backslash reaches the
# regex engine intact; in a normal string literal '\$' is an invalid escape
# sequence, which newer Python versions warn about.
money_columns = ['Avg Covered Charges [$]', 'Avg Total Payments [$]', 'Avg Medicare Payments [$]']
df[money_columns] = df[money_columns].replace(r'\$', '', regex=True).astype(float)

In [106]:
# Add 'Avg. Cost/attended patient [$]': the average total payment divided by
# the number of discharges, computed row by row.
# NOTE(review): if 'Avg Total Payments [$]' is already a per-discharge
# average, dividing by 'Total Discharges' again does not yield a per-patient
# cost - confirm against the dataset's data dictionary.
df['Avg. Cost/attended patient [$]'] = df['Avg Total Payments [$]']/df['Total Discharges']

In [107]:
# Spot-check two random rows after the transformations. The fixed seed keeps
# the displayed sample identical across Restart & Run All.
df.sample(2, random_state=42)

Unnamed: 0,DRG Definition,Provider Id,Provider Name,Provider Street Address,Provider City,Provider State,Provider Zip Code,Hospital Referral Region Description,Total Discharges,Avg Covered Charges [$],Avg Total Payments [$],Avg Medicare Payments [$],Avg. Cost/attended patient [$]
95445,378 - G.I. HEMORRHAGE W CC,520138,AURORA ST LUKES MEDICAL CENTER,2900 W OKLAHOMA AVE,MILWAUKEE,WI,53215,WI - Milwaukee,159,28441.81,7805.78,6451.0,49.092956
114777,473 - CERVICAL SPINAL FUSION W/O CC/MCC,450869,DOCTORS HOSPITAL AT RENAISSANCE,5501 SOUTH MCCOLL,EDINBURG,TX,78539,TX - McAllen,18,41459.72,17545.5,15114.5,974.75


# Answer questions

### 1. Which are the most expensive states and the cheapest ones?
The best variable to compare is **Average Total Payments** rather than **Average Covered Charges**, because the latter is only the list ("sticker") price billed by the hospital, not the amount actually paid.

But...

In [108]:
# Check whether Average Covered Charges, Average Total Payments and Average
# Medicare Payments are correlated with each other (Pearson coefficient).
payment_measures = ['Avg Covered Charges [$]', 'Avg Total Payments [$]', 'Avg Medicare Payments [$]']
df[payment_measures].corr(method='pearson')

Unnamed: 0,Avg Covered Charges [$],Avg Total Payments [$],Avg Medicare Payments [$]
Avg Covered Charges [$],1.0,0.774112,0.768927
Avg Total Payments [$],0.774112,1.0,0.989362
Avg Medicare Payments [$],0.768927,0.989362,1.0


In [109]:
# Remove the two integer identifier columns and the covered-charges column so
# they do not take part in the histograms and aggregations below.
# The labels are named directly instead of being passed as a sliced DataFrame
# (which only worked because iterating a DataFrame yields its column names),
# and errors='ignore' makes re-running this cell a harmless no-op instead of
# a KeyError.
df = df.drop(columns=['Provider Id', 'Provider Zip Code', 'Avg Covered Charges [$]'], errors='ignore')

In [110]:
# Quick look at the distributions of the remaining columns via a histogram.
df.plot_bokeh.hist()

In [111]:
# Total each payment measure per state.
# The numeric columns to sum are selected explicitly: relying on pandas to
# silently drop the text columns during .sum() no longer works on pandas 2.x
# (the strings would be aggregated too and the later integer cast would
# fail). Selecting the columns also removes the need to drop
# 'Total Discharges' afterwards.
state_total_columns = ['Avg Total Payments [$]', 'Avg Medicare Payments [$]', 'Avg. Cost/attended patient [$]']
df_by_state = df.groupby('Provider State')[state_total_columns].sum()

In [112]:
# Cast the per-state totals to integers for a more readable table and chart.
# Note that astype(int) discards the fractional part rather than rounding.
df_by_state = df_by_state.astype(int)
df_by_state.head(4)

Unnamed: 0_level_0,Avg Total Payments [$],Avg Medicare Payments [$],Avg. Cost/attended patient [$]
Provider State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,3366222,2993521,176355
AL,27510523,23329455,1194685
AR,16575787,14303062,695235
AZ,28950559,25162119,1332213


In [113]:
# Order the states from lowest to highest summed total payments.
sorted_df_by_state = df_by_state.sort_values('Avg Total Payments [$]')

In [114]:
# Bar chart of the per-state totals, in the order given by sorted_df_by_state.
by_states = sorted_df_by_state.plot_bokeh.bar(
    title='Hospital Charges in America by States',
    figsize=(1200, 400),
    legend='top_left',
    disable_scientific_axes='y',
    show_figure=True,
)

#### By Patient by State

In [115]:
# Median of every numeric column per state.
# numeric_only=True is required on pandas 2.x, where taking the median of the
# text columns raises instead of silently skipping them.
df_by_patient = df.groupby('Provider State').median(numeric_only=True)

In [116]:
# Order the states from lowest to highest median cost per attended patient.
sorted_df_by_patient = df_by_patient.sort_values('Avg. Cost/attended patient [$]')

In [117]:
# Bar chart of the median cost per attended patient for each state, in the
# order given by sorted_df_by_patient.
by_patient = sorted_df_by_patient.plot_bokeh.bar(
    y='Avg. Cost/attended patient [$]',
    title='Hospital Charges in America by Attended Patient',
    figsize=(1200, 400),
    legend='top_left',
    show_figure=True,
)

#### Most expensive states by DRG

In [118]:
# Median of every numeric column per (DRG, state) pair, ordered by DRG and,
# within each DRG, from the most to the least expensive state.
# numeric_only=True is required on pandas 2.x, where taking the median of the
# text columns raises instead of silently skipping them. The sort directions
# are spelled as booleans rather than 1/0 for readability.
df_by_drg = (
    df.groupby(['DRG Definition', 'Provider State'])
    .median(numeric_only=True)
    .sort_values(['DRG Definition', 'Avg. Cost/attended patient [$]'], ascending=[True, False])
)

In [119]:
# Keep only the first row of every DRG group. df_by_drg is ordered by
# descending cost within each DRG, so that row is the most expensive state.
sorted_df_by_drg = df_by_drg.groupby(level='DRG Definition').head(1)
sorted_df_by_drg.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Total Discharges,Avg Total Payments [$],Avg Medicare Payments [$],Avg. Cost/attended patient [$]
DRG Definition,Provider State,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,DC,14.0,9479.57,8680.14,677.112143
057 - DEGENERATIVE NERVOUS SYSTEM DISORDERS W/O MCC,AK,11.0,8799.0,8081.54,799.909091
064 - INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION W MCC,AK,22.5,21608.01,20555.255,1417.867941


In [120]:
# Extract the state part of the (DRG, state) index: one state label per DRG.
drg_by_states = sorted_df_by_drg.index.get_level_values('Provider State')
drg_by_states

Index(['DC', 'AK', 'AK', 'AK', 'HI', 'AK', 'UT', 'HI', 'DC', 'HI', 'AK', 'DC',
       'WY', 'AK', 'AK', 'HI', 'AK', 'HI', 'AK', 'WY', 'CA', 'OR', 'AK', 'AK',
       'HI', 'VT', 'WY', 'AK', 'HI', 'WY', 'AK', 'AK', 'WY', 'DC', 'HI', 'UT',
       'VT', 'HI', 'AK', 'AK', 'AK', 'WY', 'NM', 'AK', 'HI', 'WY', 'AK', 'AK',
       'WY', 'AK', 'AK', 'VT', 'AK', 'AK', 'WY', 'AK', 'AK', 'AK', 'AK', 'VT',
       'HI', 'WY', 'HI', 'RI', 'AK', 'DC', 'HI', 'DC', 'HI', 'WY', 'DC', 'AK',
       'AK', 'HI', 'HI', 'HI', 'VT', 'HI', 'AK', 'AK', 'WY', 'HI', 'AK', 'AK',
       'VT', 'HI', 'AK', 'NV', 'HI', 'AK', 'AK', 'DC', 'AK', 'WY', 'AK', 'CT',
       'AK', 'SD', 'VT', 'AK'],
      dtype='object', name='Provider State')

In [121]:
from collections import Counter

# Count how many DRGs each state appears in, then turn the counts into a
# one-column DataFrame ('DRG_qty') indexed by 'Provider State'.
drg_max_states_counter = Counter(drg_by_states)
drg_max_states_dict = dict(drg_max_states_counter)
drg_max_states_df = pd.DataFrame(
    list(drg_max_states_dict.items()),
    columns=['Provider State', 'DRG_qty'],
).set_index('Provider State')

In [122]:
# Order the states by ascending DRG count and plot the counts as a bar chart.
sorted_drg_max_states_df = drg_max_states_df.sort_values('DRG_qty')
sorted_drg_max_states_df.plot_bokeh.bar(
    title='States with more expensive DRG',
    legend='top_left',
    show_figure=True,
)

#### Cheapest states by DRG

In [123]:
# Re-order the per-(DRG, state) medians so that, within each DRG, the states
# run from the least to the most expensive.
df_by_drg_min = df_by_drg.sort_values(
    ['DRG Definition', 'Avg. Cost/attended patient [$]'],
    ascending=[True, True],
)

In [124]:
# Keep only the first row of every DRG group. df_by_drg_min is ordered by
# ascending cost within each DRG, so that row is the cheapest state.
sorted_df_by_drg_min = df_by_drg_min.groupby(level='DRG Definition').head(1)
sorted_df_by_drg_min.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Total Discharges,Avg Total Payments [$],Avg Medicare Payments [$],Avg. Cost/attended patient [$]
DRG Definition,Provider State,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,ME,75.0,7225.66,5470.51,96.342133
057 - DEGENERATIVE NERVOUS SYSTEM DISORDERS W/O MCC,DE,57.0,6899.31,5840.745,137.787576
064 - INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION W MCC,KY,43.0,11293.04,10192.765,222.920249


In [125]:
# Extract the state part of the (DRG, state) index: one state label per DRG.
drg_by_states_min = sorted_df_by_drg_min.index.get_level_values('Provider State')

In [126]:
# Count how many DRGs each state appears in, then turn the counts into a
# one-column DataFrame ('DRG_qty') indexed by 'Provider State'.
drg_min_states_counter = Counter(drg_by_states_min)
drg_min_states_dict = dict(drg_min_states_counter)
drg_min_states_df = pd.DataFrame(
    list(drg_min_states_dict.items()),
    columns=['Provider State', 'DRG_qty'],
).set_index('Provider State')

In [127]:
# Order the states by ascending DRG count and plot the counts as a bar chart.
sorted_drg_min_states_df = drg_min_states_df.sort_values('DRG_qty')
sorted_drg_min_states_df.plot_bokeh.bar(
    title='States with cheapest DRG',
    legend='top_left',
    show_figure=True,
)

To answer that question, it is necessary to clarify which measure of cost is being compared:

1.1. By **Avg Total Payments [$]**

💰 According to **Average Total Payments**  the three most expensive states✳️ are:

1. **California** with $164993988.  
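2. **Texas** with $10967057 (TODO: verify this figure; it is smaller than New York's total below, so a digit appears to be missing).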
3. **New York** with $108259026.

💰 According to **Average Total Payments**, the three cheapest states are:

1. **West Virginia** with $2815426.
2. **Vermont** with $3176902.
3. **Alaska** with $3366222.  
  

1.2. By **Discharges** (attended patient)

🤕 The three most expensive states are:
1. **Alaska** with $550.
2. **Hawaii** with $503.
3. **Wyoming** with $443.  


🤕 The three cheapest states are:
1. **Delaware** with $215.
2. **Kentucky** with $226.
3. **Tennessee** with $227. 


1.3. By **DRG**

💉 The most expensive states by DRG are:
* **Alaska**: 42 DRG
* **Hawaii**: 21 DRG
* **Wyoming**: 13 DRG
* **District of Columbia**: 8 DRG
* **Vermont**: 7 DRG

💉 The cheapest states by DRG are:
* **Delaware**: 20 DRG
* **Michigan**: 14 DRG
* **New Jersey**: 12 DRG
* **Maine**: 8 DRG
* **California**: 6 DRG


### 2. Are the cheapest hospitals in the same state?

 