In [293]:
"""Cleaning of Readmissions Data
The goal of this cleaning is to reduce the data from ratings by facility to ratings by state.
The rating by state will be found by finding the average rating for all facilities within a state.
The number of facilities per state is captured by counting them and adding a column to the final DataFrame."""
import pandas as pd
import numpy as np

# Load the raw readmissions file and replace its header with short snake_case names.
# The list has to name every column of the CSV in file order; pandas raises a
# ValueError on the assignment below if the number of names does not match.
col = ['hospital_name', 'provider_number', 'state', 'measure', 'discharges', 'footnote',
       'readmission_ratio', 'predicted_rate', 'expected_rate', 'readmissions',
       'start_date', 'end_date']  # previously misspelled as 'starte_date' / 'end_Date'
df = pd.read_csv('Readmissions.csv')
df.columns = col


In [294]:
"""Initial removal of the following columns:
measure: measures are to be combined on the state level to get the overall state readmission ratio,
         rendering this column unnecessary to this analysis.
footnote:  footnotes are associated with a lack of information, most of which will be removed in the
           cleaning process.
start_date:  Does not provide any useful information for this analysis.  Also the same for all rows.
end_date: Does not provide any useful information for this analysis.  Also the same for all rows."""

'Initial removal of the following columns:\nmeasure: measures are to be combined on the state level to get the overall state readmission ratio,\n         rendering this column unnecessary to this analysis.\nfootnote:  footnotes are associated with a lack of information, most of which will be removed in the\n           cleaning process.\nstart_date:  Does not provide any useful information for this analysis.  Also the same for all rows.\nend_date: Does not provide any useful information for this analysis.  Also the same for all rows.'

In [295]:
# Keep only the identifying columns and the numeric measures used downstream;
# 'measure', 'footnote' and the two date columns are left out.
usecols = [
    'hospital_name', 'provider_number', 'state',
    'discharges', 'readmission_ratio', 'predicted_rate', 'expected_rate', 'readmissions',
]
df = df.loc[:, usecols]
df.head()

Unnamed: 0,hospital_name,provider_number,state,discharges,readmission_ratio,predicted_rate,expected_rate,readmissions
0,HIGHLANDS MEDICAL CENTER,10061,AL,Not Available,Not Available,Not Available,Not Available,Not Available
1,CLAY COUNTY HOSPITAL,10073,AL,103,0.9853,14.4,14.6,14
2,NORTHEAST ALABAMA REGIONAL MEDICAL CENTER,10078,AL,404,1.4044,6.1,4.3,31
3,NORTHEAST ALABAMA REGIONAL MEDICAL CENTER,10078,AL,544,0.9653,16.7,17.3,89
4,ATHENS LIMESTONE HOSPITAL,10079,AL,Not Available,1.0204,4.3,4.2,Too Few to Report


In [296]:
#Explore Data Pre-Clean
# Print each column's dtype and non-null count before any values are coerced to numbers.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19830 entries, 0 to 19829
Data columns (total 8 columns):
hospital_name        19830 non-null object
provider_number      19830 non-null int64
state                19830 non-null object
discharges           19830 non-null object
readmission_ratio    19830 non-null object
predicted_rate       19830 non-null object
expected_rate        19830 non-null object
readmissions         19830 non-null object
dtypes: int64(1), object(7)
memory usage: 1.2+ MB


In [297]:
# Number of distinct hospitals (unique provider numbers) in each state.
# No rows are filtered out first, so facilities with unavailable figures are counted too.
hospital_count = df.groupby('state')['provider_number'].nunique()



In [298]:
# Measure columns that are loaded as text because they mix numbers with
# placeholders such as 'Not Available' and 'Too Few to Report'.
tonumeric = ['discharges', 'readmission_ratio', 'predicted_rate', 'expected_rate', 'readmissions']
# Parse every measure column; any value that is not a number becomes NaN.
dfa = df.loc[:, tonumeric].apply(lambda column: pd.to_numeric(column, errors='coerce'))
# Identifying columns, left as they are, to be re-joined with the parsed measures.
dfb = df.loc[:, ['hospital_name', 'provider_number', 'state']]


In [299]:
# Place the identifier columns and the parsed numeric columns side by side
# (rows line up on the shared index), then check dtypes and non-null counts.
df2 = pd.concat([dfb, dfa], axis='columns')
df2.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19830 entries, 0 to 19829
Data columns (total 8 columns):
hospital_name        19830 non-null object
provider_number      19830 non-null int64
state                19830 non-null object
discharges           11758 non-null float64
readmission_ratio    14411 non-null float64
predicted_rate       14411 non-null float64
expected_rate        14411 non-null float64
readmissions         11638 non-null float64
dtypes: float64(5), int64(1), object(2)
memory usage: 1.2+ MB


In [300]:
# Count, per state, the rows whose readmission ratio exceeds 1 ("excessive" readmissions).
# Every state starts at 0 so that states without any excessive row still appear in the
# result; keys keep the order in which the states first occur in df2.
dictionary = dict.fromkeys(df2['state'], 0)
# Select by column name rather than by position and let pandas do the counting.
# Rows with a NaN ratio compare False against 1 and are therefore not counted.
excessive_rows = df2.loc[df2['readmission_ratio'] > 1, 'state']
for state_code, row_count in excessive_rows.value_counts().items():
    dictionary[state_code] = int(row_count)


In [301]:
# Turn the {state: excessive count} mapping into a two-column frame ready to merge on 'state'.
# Naming the columns at construction also gives a valid, empty two-column frame when the
# mapping is empty, instead of failing on a column-count mismatch afterwards.
data = pd.DataFrame(list(dictionary.items()), columns=['state', 'excessive_count'])


In [302]:
# Build the per-state table: the hospital count plus one aggregate per measure, indexed by state.
# NOTE(review): predicted_rate and expected_rate are summed across facilities while
# readmission_ratio is averaged -- confirm that summing the rate columns is intended.
state_aggregates = df2.groupby('state').agg({
    'readmission_ratio': 'mean',
    'discharges': 'sum',
    'predicted_rate': 'sum',
    'expected_rate': 'sum',
    'readmissions': 'sum',
})
# Fix the column order explicitly, then align the aggregates with the hospital counts by state.
measure_order = ['readmission_ratio', 'discharges', 'predicted_rate', 'expected_rate', 'readmissions']
cleaned = hospital_count.to_frame(name='hospital_count').join(state_aggregates[measure_order])

In [303]:
# Move 'state' out of the index into a regular column so it can serve as the merge key.
# Guarded so that re-running this cell does not add a spurious 'index' column.
if 'state' not in cleaned.columns:
    cleaned = cleaned.reset_index()


In [304]:
# Attach each state's excessive-readmission count to its aggregated figures.
# This is an inner join on 'state', which is also what pd.merge does by default.
final = cleaned.merge(data, how='inner', on='state')

In [305]:
# Write the state-level table to disk, then show it as the cell output.
# The row index is written as well, as a leading unnamed column (the to_csv default).
final.to_csv('Readmissions_Cleaned.csv', index=True)
final

Unnamed: 0,state,hospital_count,readmission_ratio,discharges,predicted_rate,expected_rate,readmissions,excessive_count
0,AK,8,0.969563,5019.0,530.2,548.7,606.0,11
1,AL,85,1.017475,95303.0,5351.5,5308.2,15305.0,188
2,AR,45,1.032275,61703.0,2973.3,2879.7,9965.0,127
3,AZ,63,0.988116,76353.0,3930.2,3990.3,10290.0,104
4,CA,297,1.000689,303151.0,19823.2,19733.7,49252.0,580
5,CO,48,0.955349,39738.0,2824.6,2966.9,4483.0,37
6,CT,30,1.00891,64338.0,2316.9,2293.9,10596.0,81
7,DC,7,1.037523,11194.0,603.2,578.9,1898.0,25
8,DE,6,0.996822,22779.0,507.0,504.5,3529.0,16
9,FL,171,1.032694,347461.0,14225.8,13759.9,60967.0,547
