In [1]:
import os, sys
sys.path.append('/data/spark15/python/')

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (18, 9)
import seaborn as sb
import numpy as np
import pandas as pd

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import HiveContext
from pyspark.sql.types import *
import pyspark.sql.functions

sc = SparkContext("local", "hospital_compare")
sqlContext = SQLContext(sc)
hc = HiveContext(sc)

### What hospitals are models of high-quality care? That is, which hospitals have the most consistently high scores for a variety of procedures. 

In [3]:
print(hc.tableNames())

[u'effective_care', u'effective_care_scores', u'hospitals', u'hospitals_reduced', u'measures', u'readmissions', u'readmissions_reduced', u'survey_responses', u'survey_responses_reduced']


We are asked to find the hospitals that have the most consistently high scores for a variet of procedures. We will examine all the hospitals in the below two tables and join on their scores for each measure

In [89]:
hospitals = hc.sql('select * from hospitals_reduced').toPandas()
effective_care = hc.sql('select * from effective_care_scores').toPandas()

In [5]:
hospitals.head()

Unnamed: 0,provider_id,hospital_name,city,state,zip_code,county_name,hospital_type,hospital_ownership,meets_criteria,hospital_overall_rating,mortality_national_compare,safety_of_care_national_comparison,readmission_national_comparison,patient_experience_national_comparison,effectiveness_of_care_national_comparison,timeliness_of_care_national_comparison
0,10001,SOUTHEAST ALABAMA MEDICAL CENTER,DOTHAN,AL,36301,HOUSTON,Acute Care Hospitals,Government - Hospital District or Authority,Y,3,Same as the national average,Above the national average,Same as the national average,Below the national average,Same as the national average,Same as the national average
1,10005,MARSHALL MEDICAL CENTER SOUTH,BOAZ,AL,35957,MARSHALL,Acute Care Hospitals,Government - Hospital District or Authority,Y,3,Below the national average,Same as the national average,Above the national average,Same as the national average,Same as the national average,Above the national average
2,10006,ELIZA COFFEE MEMORIAL HOSPITAL,FLORENCE,AL,35631,LAUDERDALE,Acute Care Hospitals,Government - Hospital District or Authority,Y,2,Below the national average,Same as the national average,Same as the national average,Below the national average,Same as the national average,Above the national average
3,10007,MIZELL MEMORIAL HOSPITAL,OPP,AL,36467,COVINGTON,Acute Care Hospitals,Voluntary non-profit - Private,Y,2,Same as the national average,Not Available,Below the national average,Same as the national average,Below the national average,Above the national average
4,10008,CRENSHAW COMMUNITY HOSPITAL,LUVERNE,AL,36049,CRENSHAW,Acute Care Hospitals,Proprietary,Y,3,Same as the national average,Not Available,Same as the national average,Not Available,Same as the national average,Above the national average


In [9]:
effective_care.head()

Unnamed: 0,provider_id,measure_id,measure_name,condition,hospital_name,state,measure_start_date,measure_end_date,score
0,10001,ED_1b,ED1,Emergency Department,SOUTHEAST ALABAMA MEDICAL CENTER,AL,01/01/2016,12/31/2016,
1,10001,ED_2b,ED2,Emergency Department,SOUTHEAST ALABAMA MEDICAL CENTER,AL,01/01/2016,12/31/2016,
2,10001,IMM_2,Immunization for influenza,Preventive Care,SOUTHEAST ALABAMA MEDICAL CENTER,AL,10/01/2015,03/31/2016,
3,10001,IMM_3_OP_27_FAC_ADHPCT,Healthcare workers given influenza vaccination,Preventive Care,SOUTHEAST ALABAMA MEDICAL CENTER,AL,10/01/2016,03/31/2017,
4,10001,OP_18b,OP 18,Emergency Department,SOUTHEAST ALABAMA MEDICAL CENTER,AL,01/01/2016,12/31/2016,


In [16]:
effective_care = effective_care.drop(['hospital_name', 'state'], axis=1)

#### Let's join the hospital_reduced to the effective_care_scores table on provider_id

In [59]:
df = pd.merge(hospitals, effective_care, how='inner', left_on='provider_id', right_on='provider_id')

#### Finding overall best rated hospitals

In [60]:
df['hospital_overall_rating'] = df['hospital_overall_rating'].replace('Not Available', np.nan).apply(pd.to_numeric)

In [61]:
(df.groupby(['provider_id', 'hospital_name']).mean()[['hospital_overall_rating']]
                                             .sort_values(by='hospital_overall_rating', 
                                                          ascending=False)).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,hospital_overall_rating
provider_id,hospital_name,Unnamed: 2_level_1
40147,ARKANSAS SURGICAL HOSPITAL,5.0
370222,MCBRIDE ORTHOPEDIC HOSPITAL,5.0
450610,MEMORIAL HERMANN MEMORIAL CITY MEDICAL CENTER,5.0
450604,HILL COUNTRY MEMORIAL HOSPITAL INC,5.0
330270,HOSPITAL FOR SPECIAL SURGERY,5.0
220088,NEW ENGLAND BAPTIST HOSPITAL,5.0
140291,ADVOCATE GOOD SHEPHERD HOSPITAL,5.0
230035,SPECTRUM HEALTH UNITED HOSPITAL,5.0
140202,ADVOCATE CONDELL MEDICAL CENTER,5.0
140186,RIVERSIDE MEDICAL CENTER,5.0


#### Finding average score by procedure along with variability in score

In [58]:
results = df.set_index(['provider_id', 'hospital_name', 'measure_id'])[['score']].unstack()
results = results.apply(pd.to_numeric)
results.loc[:, 'Average_Score'] = results.mean(axis=1)
results.loc[:, 'StDev_Score'] = results.std(axis=1)

results.sort_values(by='Average_Score', ascending=False).head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,score,score,score,score,score,score,score,score,score,score,score,score,score,score,score,score,score,score,score,Average_Score,StDev_Score
Unnamed: 0_level_1,measure_id,ED_1b,ED_2b,IMM_2,IMM_3_OP_27_FAC_ADHPCT,OP_1,OP_18b,OP_2,OP_20,OP_21,OP_22,...,OP_31,OP_3b,OP_4,OP_5,PC_01,STK_4,VTE_5,VTE_6,Unnamed: 21_level_1,Unnamed: 22_level_1
provider_id,hospital_name,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
331311,LITTLE FALLS HOSPITAL,,,,,,,,,,,...,,,,9.0,,,,,9.0,0.0
451386,MEMORIAL HOSPITAL,,,,,,,,,,,...,,,,9.0,9.0,,,,9.0,0.0
301304,NEW LONDON HOSPITAL,,,,,,,,,,,...,,,,9.0,,,,,9.0,0.0
281336,YORK GENERAL HOSPITAL,,,,,,,,,,,...,,,,9.0,,,,,9.0,0.0
181304,TRIGG COUNTY HOSPITAL,,,,,,,,,,,...,,,,,,,,,9.0,0.0
21313,SOUTH PENINSULA HOSPITAL,,,,,,,,,,,...,,,,9.0,,,,,9.0,0.0
51306,GLENN MEDICAL CENTER,,,9.0,,,,,,,,...,,,,,,,,,9.0,0.0
111308,CLINCH MEMORIAL HOSPITAL,,,,,,,,,,,...,,,,9.0,,,,,9.0,0.0
391316,SOLDIERS AND SAILORS MEMORIAL HOSPITAL,,,,,,,,,,,...,,,,9.0,,,,,9.0,0.0
161330,AUDUBON COUNTY MEMORIAL HOSPITAL,,,,,,,,9.0,,,...,,,,,,,,,9.0,0.0
