In [1]:
import pandas as pd
import pandas_bokeh
import bokeh

In [2]:
# Embedding plots in Jupyter Notebook
pandas_bokeh.output_notebook()

# Data overview

In [3]:
df = pd.read_csv('hospital-charges.csv')
df.head()

Unnamed: 0,DRG Definition,Provider Id,Provider Name,Provider Street Address,Provider City,Provider State,Provider Zip Code,Hospital Referral Region Description,Total Discharges,Average Covered Charges,Average Total Payments,Average Medicare Payments
0,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,10001,SOUTHEAST ALABAMA MEDICAL CENTER,1108 ROSS CLARK CIRCLE,DOTHAN,AL,36301,AL - Dothan,91,$32963.07,$5777.24,$4763.73
1,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,10005,MARSHALL MEDICAL CENTER SOUTH,2505 U S HIGHWAY 431 NORTH,BOAZ,AL,35957,AL - Birmingham,14,$15131.85,$5787.57,$4976.71
2,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,10006,ELIZA COFFEE MEMORIAL HOSPITAL,205 MARENGO STREET,FLORENCE,AL,35631,AL - Birmingham,24,$37560.37,$5434.95,$4453.79
3,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,10011,ST VINCENT'S EAST,50 MEDICAL PARK EAST DRIVE,BIRMINGHAM,AL,35235,AL - Birmingham,25,$13998.28,$5417.56,$4129.16
4,039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,10016,SHELBY BAPTIST MEDICAL CENTER,1000 FIRST STREET NORTH,ALABASTER,AL,35007,AL - Birmingham,18,$31633.27,$5658.33,$4851.44


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163065 entries, 0 to 163064
Data columns (total 12 columns):
 #   Column                                Non-Null Count   Dtype 
---  ------                                --------------   ----- 
 0   DRG Definition                        163065 non-null  object
 1   Provider Id                           163065 non-null  int64 
 2   Provider Name                         163065 non-null  object
 3   Provider Street Address               163065 non-null  object
 4   Provider City                         163065 non-null  object
 5   Provider State                        163065 non-null  object
 6   Provider Zip Code                     163065 non-null  int64 
 7   Hospital Referral Region Description  163065 non-null  object
 8    Total Discharges                     163065 non-null  int64 
 9    Average Covered Charges              163065 non-null  object
 10   Average Total Payments               163065 non-null  object
 11  Average Medic

# Format names and data

In [5]:
# To remove spaces and rename columns
df.columns = df.columns.str.strip()
df.rename(columns={'Average Covered Charges': 'Avg Covered Charges [$]',
                    'Average Total Payments' : 'Avg Total Payments [$]', 
                    'Average Medicare Payments': 'Avg Medicare Payments [$]'
                    }, inplace=True)

In [6]:
# To remove $  symbol and convert into float
df[['Avg Covered Charges [$]', 'Avg Total Payments [$]', 'Avg Medicare Payments [$]']] =  df[['Avg Covered Charges [$]', 
                'Avg Total Payments [$]', 'Avg Medicare Payments [$]']].replace('\$','',regex=True).astype(float)

In [7]:
# To add 'Avg. Cost/attended patient [$]'
df['Avg. Cost/attended patient [$]'] = df['Avg Total Payments [$]']/df['Total Discharges']

In [8]:
df.sample(2)

Unnamed: 0,DRG Definition,Provider Id,Provider Name,Provider Street Address,Provider City,Provider State,Provider Zip Code,Hospital Referral Region Description,Total Discharges,Avg Covered Charges [$],Avg Total Payments [$],Avg Medicare Payments [$],Avg. Cost/attended patient [$]
99781,390 - G.I. OBSTRUCTION W/O CC/MCC,370008,NORMAN REGIONAL HEALTH SYSTEM,901 NORTH PORTER,NORMAN,OK,73070,OK - Oklahoma City,42,18280.64,3550.47,2794.59,84.535
36039,195 - SIMPLE PNEUMONIA & PLEURISY W/O CC/MCC,310108,JFK MEDICAL CTR - ANTHONY M. YELENCSICS COMMUNITY,65 JAMES STREET,EDISON,NJ,8818,NJ - New Brunswick,76,37380.61,5235.57,4311.89,68.889079


# Answer questions

### 1. Which are the most expensive states and the cheapest ones?
The best variable to compare is **Average Total Payments** instead of **Average Covered Charges** because the last one is only the ticket price, not the real amount paid.

But...

In [9]:
df.drop(df[['Provider Id', 'Provider Zip Code', 'Avg Covered Charges [$]']], axis=1, inplace=True)

In [10]:
df.plot_bokeh.hist()

In [11]:
df_by_state = df.groupby(df['Provider State']).sum()
df_by_state.drop(df[['Total Discharges']], axis=1, inplace=True)

In [12]:
df_by_state = df_by_state.astype(int)
df_by_state.head(4)

Unnamed: 0_level_0,Avg Total Payments [$],Avg Medicare Payments [$],Avg. Cost/attended patient [$]
Provider State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,3366222,2993521,176355
AL,27510523,23329455,1194685
AR,16575787,14303062,695235
AZ,28950559,25162119,1332213


In [13]:
sorted_df_by_state = df_by_state.sort_values(by=['Avg Total Payments [$]'])

In [14]:
by_states = sorted_df_by_state.plot_bokeh.bar(figsize=(1200,400), 
                                            title='Hospital Charges in America by States', 
                                            show_figure=True, legend = 'top_left', disable_scientific_axes='y')

####  By Pacient by State

In [25]:
df_by_patient = df.groupby(df['Provider State']).median()

In [33]:
sorted_df_by_patient = df_by_patient.sort_values(by=['Avg. Cost/attended patient [$]'])

In [31]:
by_patient = sorted_df_by_patient.plot_bokeh.bar(figsize=(1200,400), 
                                y='Avg. Cost/attended patient [$]',
                                title='Hospital Charges in America by Attended Patient', 
                                show_figure=True, legend = 'top_left')

In [174]:
df_drg = df.drop(df[['Provider Id',	'Provider Zip Code', 'Total Discharges', 'Avg Covered Charges [$]', 'Avg Medicare Payments [$]', 'Avg Total Payments [$]']], axis=1)

KeyError: "['Provider Id', 'Provider Zip Code', 'Avg Covered Charges [$]'] not in index"

#### More expensive States by DRG

In [None]:
df_drg_max = df_drg.groupby(['DRG Definition', 'Provider State']).median()
sorted_df_drg_max = df_drg_max.sort_values(['DRG Definition','Avg. Cost/attended patient [$]'], ascending=[1,0])
sorti = sorted_df_drg_max.groupby(level=0).head(1)
sorti

Unnamed: 0_level_0,Unnamed: 1_level_0,Avg. Cost/attended patient [$]
DRG Definition,Provider State,Unnamed: 2_level_1
039 - EXTRACRANIAL PROCEDURES W/O CC/MCC,DC,677.112143
057 - DEGENERATIVE NERVOUS SYSTEM DISORDERS W/O MCC,AK,799.909091
064 - INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION W MCC,AK,1417.867941
065 - INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION W CC,AK,793.107500
066 - INTRACRANIAL HEMORRHAGE OR CEREBRAL INFARCTION W/O CC/MCC,HI,518.900302
...,...,...
885 - PSYCHOSES,CT,686.596667
897 - ALCOHOL/DRUG ABUSE OR DEPENDENCE W/O REHABILITATION THERAPY W/O MCC,AK,516.298205
917 - POISONING & TOXIC EFFECTS OF DRUGS W MCC,SD,1402.751818
918 - POISONING & TOXIC EFFECTS OF DRUGS W/O MCC,VT,427.085184


In [None]:
sorti.index.unique(level=1)

Index(['DC', 'AK', 'HI', 'UT', 'WY', 'CA', 'OR', 'VT', 'NM', 'RI', 'NV', 'CT',
       'SD'],
      dtype='object', name='Provider State')

In [None]:
listi= sorti.index.get_level_values('Provider State')

In [None]:
from collections import Counter
a = Counter(listi)
b = dict(a)
c = pd.DataFrame(b.items())
c.rename(columns={0: 'Provider State',1:'DRG_qty'}, inplace=True)
d = c.set_index('Provider State')

In [None]:
sorted_d = d.sort_values(by='DRG_qty')
sorted_d.plot_bokeh.bar(show_figure=True, legend = 'top_left', title='States with more expensive DRG')

#### More sheapest States by DRG

In [None]:
sorted_df_drg_min = df_drg_max.sort_values(['DRG Definition','Avg. Cost/attended patient [$]'], ascending=[1,1])
drg_min = sorted_df_drg_min.groupby(level=0).head(1)


In [None]:
drg_min_states = drg_min.index.get_level_values('Provider State')


In [None]:
drg_min_states_counter = Counter(drg_min_states)
drg_min_states_dict = dict(drg_min_states_counter)
drg_min_states_df = pd.DataFrame(drg_min_states_dict.items())
drg_min_states_df.rename(columns={0: 'Provider State',1:'DRG_qty'}, inplace=True)
drg_min_states_df = drg_min_states_df.set_index('Provider State')

In [None]:
sorted_drg_min_states_df = drg_min_states_df.sort_values(by='DRG_qty')
sorted_drg_min_states_df.plot_bokeh.bar(show_figure=True, legend = 'top_left', title='States with cheapest DRG')

To answer that question is necesary to clarify:

1.1. By **Avg Total Payments [$]**

💰 According to **Average Total Payments**  the three most expensive states✳️ are:

1. **California** with $164993988.  
2. **Texas** with $10967057.
3. **New York** with $108259026.

💰 According to **Average Total Payments**  the three most cheapest states are:

1. **West Virginia** with $2815426.
2. **Vermont** with $3176902.
3. **Arkansas** with $3366222.  
  

1.2. By **Discharges** (attended patient)

🤕 The three most expensive states are:
1. **Alaska** with $550.
2. **Hawaii** with $503.
3. **Wyoming** with $443.  


🤕 The three most cheapest states are:
1. **Delaware** with $215.
2. **Kentucky** with $226.
3. **Tennessee** with $227. 


2.3. By  **DRG** 

💉 The most expensive states by DRG are:
* **Alaska**: 42 DRG
* **Hawaii**: 21 DRG
* **Utah**: 13 DRG
* **Wyoming**: 8 DRG
* **West Virginia**: 7 DRG

💉 The most cheapest states by DRG are:
* **Delaware**: 20 DRG
* **Michigan**: 14 DRG
* **New Jersey**: 12 DRG
* **Maine**: 8 DRG
* **California**: 6 DRG


### 2. Cheapest Hospitals are in the same state?

 

In [None]:
df2 = df.groupby(['Provider State', 'Provider Zip Code']).sum()
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Provider Id,Total Discharges,Avg Covered Charges [$],Avg Total Payments [$],Avg Medicare Payments [$],Avg. Cost/attended patient [$]
Provider State,Provider Zip Code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AK,99508,1181300,1403,2285991.93,889383.89,767378.66,47241.020366
AK,99519,1520076,2613,4031236.53,1132781.72,1015596.76,52750.061937
AK,99559,120108,137,45876.14,59637.32,54540.84,3029.116208
AK,99645,640192,793,1226802.20,371122.45,322782.67,18702.211819
AK,99669,360432,344,482658.18,207639.95,182150.37,12073.501473
...,...,...,...,...,...,...,...
WY,82716,8480032,311,291255.24,162774.63,147219.99,9433.991693
WY,82801,7950090,375,283548.15,137845.21,124673.64,6692.835522
WY,82901,4240088,127,142791.27,80474.15,71877.93,5799.567604
WY,82930,2120128,69,95796.40,34527.16,29749.98,2008.243758


In [None]:
# To answer if exist some correlation between Average Covered Charges, Average Total Payments and Average Medicare Payments
df[['Avg Covered Charges [$]', 'Avg Total Payments [$]', 'Avg Medicare Payments [$]']].corr(method ='pearson')

Unnamed: 0,Avg Covered Charges [$],Avg Total Payments [$],Avg Medicare Payments [$]
Avg Covered Charges [$],1.0,0.774112,0.768927
Avg Total Payments [$],0.774112,1.0,0.989362
Avg Medicare Payments [$],0.768927,0.989362,1.0
