In [19]:
from __future__ import division

import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn import preprocessing
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

### 1. Set Parameters

In [20]:
# Scoring mode switch; can be True or False.
#   True  -> the data is split by the values of partition_column and each
#            split is scored with the pickled model named after that value.
#   False -> all rows are scored with the single model named by
#            without_partition_model_name.
score_models_by_partition = True

In [21]:

# Input CSV that is loaded and scored.
data_file = "rand_2019.csv"
# Placeholder column lists; in the visible cells they are referenced only by
# commented-out code in the data-loading cell.
table_columns = ['a','b','c','d','e']
new_model_columns = ['a','b','c','d','e']

# Column whose distinct values define the partitions (one model per value).
partition_column = 'QualityStarCluster'
# Model name used when scoring without partitions.
without_partition_model_name = 'All'

# Columns removed from the frame before the feature columns are selected.
drop_col_for_traintest = ['QualityStarCluster','Medicare_provider_number','Hospital_name','State',
                          'Hospital_Compare_5-star_rating_(October_2018,_NA=Not_Available)',
                          'holdout_sample','Class_3', 'Class_5', 'Class_6','Class_10','Relative_price_for_outpatient_services']
# Columns carried into the predictions table next to each prediction.
# Copied rather than aliased so that editing one list can never silently
# change the other.
id_cols_for_prediction_table = list(drop_col_for_traintest)
# Not referenced by the visible cells of this notebook.
target_col = 'Class_5'

# Output File names
predictions_table_file = "predictions_table.csv"

# Model readable list of important variables
important_variables_file = 'important_variables.pkl'

# Output Directories
# Every directory value ends with '/' because paths are built by plain
# string concatenation.
Output_results_directory = 'QoS_Output_Results/'
pickled_model_directory  = 'QoS_Pickled_Models/'
partitioned_models_directory = 'Partitioned_Models/'
unpartitioned_models_directory = 'Unpartitioned_Models/'
unseen_scored_predictions = 'unseen_scored_predictions/'




In [22]:
# Directory the predictions CSV is written to; the middle path component
# follows the scoring mode chosen in score_models_by_partition.
output_directory_path = (
    Output_results_directory
    + (partitioned_models_directory if score_models_by_partition == True
       else unpartitioned_models_directory)
    + unseen_scored_predictions
)

print(output_directory_path)

QoS_Output_Results/Partitioned_Models/unseen_scored_predictions/


In [23]:
def load_model(region):
    """Load and return the pickled model stored for ``region``.

    The file is ``<region>_model.pkl`` inside the partitioned or the
    unpartitioned sub-directory of ``pickled_model_directory``, depending on
    ``score_models_by_partition``. The path is printed after a successful load.
    """
    if score_models_by_partition == True:
        models_subdirectory = partitioned_models_directory
    else:
        models_subdirectory = unpartitioned_models_directory
    model_name = pickled_model_directory + models_subdirectory + region + '_model.pkl'

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(model_name, 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    print(model_name)
    return loaded_model

In [24]:
def load_important_variables(region):
    """Return the sorted important-variable names stored for ``region``.

    Unpickles ``important_variables_file`` from the partitioned or the
    unpartitioned sub-directory of ``pickled_model_directory`` (chosen by
    ``score_models_by_partition``), looks up the entry for ``region`` and
    returns its keys in sorted order. The file path is printed before
    returning; an unknown ``region`` raises ``KeyError``.
    """
    if score_models_by_partition == True:
        models_subdirectory = partitioned_models_directory
    else:
        models_subdirectory = unpartitioned_models_directory
    file_name = pickled_model_directory + models_subdirectory + important_variables_file

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(file_name, 'rb') as variables_file:
        variables_by_region = pickle.load(variables_file)

    imp_variables = sorted(variables_by_region[region].keys())

    print(file_name)
    return imp_variables

### 2. FastExport Script: Teradata to CSV

In [25]:
#%run ./Medxoom_code.ipynb

### 3. Load data in Pandas dataframe

In [26]:
# Load the input CSV named by data_file into a DataFrame.
df = pd.read_csv(data_file)
# Disabled steps kept for reference: column renaming / selection with the
# placeholder column lists, and dropping two flag columns.
#df.columns = table_columns

#df = df[new_model_columns]
#df = df.drop('saleflag_15', axis =1)
#df = df.drop('saleflag_30', axis =1)
# Show the row/column counts, then preview the first ten rows.
print(df.shape)
df.head(10)

(3050, 310)


Unnamed: 0,Medicare_provider_number,Hospital_name,State,"Hospital_Compare_5-star_rating_(October_2018,_NA=Not_Available)",Number_of_outpatient_services,Relative_price_for_outpatient_services,tot_expenses_employee_benefits,tot_expenses_admin_and_general,tot_expenses_housekeeping,tot_expenses_dietary,...,rural_urban_6.0,rural_urban_7.0,rural_urban_8.0,rural_urban_9.0,QualityStarCluster,Class_3,Class_5,Class_6,Class_10,holdout_sample
0,10001,Southeast Alabama Medical\nCenter,AL,2.0,54.0,122.0,14745010.0,37652600.0,2934629.0,4840752.0,...,0,0,0,0,1Star,1.0,1.0,1.0,1.0,0.0
1,390198,Millcreek Community Hospital,PA,2.0,209.0,143.0,7034446.0,7532066.0,407855.8,1300091.0,...,0,0,0,0,1Star,1.0,1.0,1.0,1.0,0.0
2,390079,Robert Packer Hospital,PA,2.0,337.0,385.0,27643740.0,48223220.0,2328390.0,1588517.0,...,1,0,0,0,1Star,3.0,5.0,6.0,9.0,1.0
3,390076,Brandywine Hospital,PA,2.0,71.0,156.0,4433410.0,29639980.0,1460756.0,2343273.0,...,0,0,0,0,1Star,1.0,1.0,1.0,2.0,0.0
4,390073,Altoona Regional Health\nSystem,PA,2.0,2546.0,204.0,24141430.0,99283640.0,6957286.0,5696613.0,...,0,0,0,0,1Star,1.0,2.0,2.0,3.0,1.0
5,390072,Berwick Hospital Center,PA,2.0,268.0,371.0,252916.3,10378010.0,522885.2,1922285.0,...,0,0,0,0,1Star,3.0,5.0,6.0,9.0,0.0
6,370093,Amend #1 Ou Medical Center,OK,1.0,160.0,315.0,49457750.0,256341200.0,9576080.0,16990820.0,...,0,0,0,0,1Star,3.0,4.0,5.0,8.0,0.0
7,370089,Tahlequah City Hospital,OK,2.0,36.0,227.0,6147690.0,12239150.0,741621.4,2059471.0,...,1,0,0,0,1Star,2.0,3.0,3.0,4.0,0.0
8,370078,Osu Medical Center,OK,2.0,18.0,257.0,13323120.0,27447520.0,1381148.0,2904929.0,...,0,0,0,0,1Star,2.0,3.0,4.0,6.0,0.0
9,370056,Comanche County Memorial\nHospital,OK,1.0,25.0,287.0,17469890.0,28676930.0,2266768.0,2581934.0,...,0,0,0,0,1Star,2.0,4.0,4.0,7.0,0.0


In [27]:
# Replace every missing value in every column with 0 before scoring.
# NOTE(review): this applies to non-numeric and identifier columns too —
# confirm that 0 is the intended fill value for those.
df = df.fillna(0)

In [28]:
# Build the list of model names to score (Region_list) and the rows each one
# scores (region_dataframes: name -> DataFrame).
if score_models_by_partition == False:
    # Single-model mode: the whole frame is scored under one name.
    Region_list = [without_partition_model_name]
    region_dataframes = {without_partition_model_name: df}
else:
    # Partitioned mode: one frame per distinct value of the partition column.
    Region_list = df[partition_column].unique().tolist()
    region_dataframes = {}
    for region in Region_list:
        region_dataframes[region] = df.loc[df[partition_column] == region]


In [29]:
def drop_columns_for_modeling_test(x_test):
    """Return ``x_test`` without the columns listed in ``drop_col_for_traintest``.

    The input frame is not modified; a missing column raises ``KeyError``.
    """
    return x_test.drop(columns=drop_col_for_traintest)

In [30]:
def create_prediction_table(x_data, prediction_probability, traintest_flag, modelname, id_cols=None):
    """Combine the identifier columns of ``x_data`` with its predictions.

    Parameters
    ----------
    x_data : pandas.DataFrame
        The rows that were scored; must contain every column in ``id_cols``.
    prediction_probability : array-like
        One prediction per row of ``x_data``, in the same row order.
    traintest_flag : scalar
        Value stored in the ``TrainTest`` column of every row.
    modelname : scalar
        Value stored in the ``Model`` column of every row.
    id_cols : list of str, optional
        Columns of ``x_data`` to carry into the result. Defaults to the
        module-level ``id_cols_for_prediction_table``.

    Returns
    -------
    pandas.DataFrame
        The ``id_cols`` columns followed by ``predict``, ``TrainTest`` and
        ``Model``, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of rows.
    """
    if id_cols is None:
        id_cols = id_cols_for_prediction_table

    # A length mismatch would otherwise be padded with NaN by the
    # column-wise concat below and go unnoticed.
    if len(prediction_probability) != len(x_data):
        raise ValueError(
            "Got %d predictions for %d rows" % (len(prediction_probability), len(x_data))
        )

    prediction_tab = pd.DataFrame()
    prediction_tab['predict'] = prediction_probability
    prediction_tab['TrainTest'] = traintest_flag
    prediction_tab['Model'] = modelname

    # Both sides get a fresh positional index so rows are matched by position,
    # not by whatever index x_data carried over from the source frame.
    prediction_table = pd.concat(
        [x_data[id_cols].reset_index(drop=True), prediction_tab.reset_index(drop=True)],
        axis=1,
    )
    return prediction_table

In [31]:
# Start from an empty frame; the scoring loop below fills it in.
predictions_table = pd.DataFrame()       # Final table with predictions

In [32]:
def predict_proba(region_dataframes,region,imp_variables,model):
    """Score the rows of ``region`` with ``model`` and return the predictions.

    Despite the name, this calls ``model.predict`` rather than
    ``model.predict_proba``, so the result is whatever ``predict`` returns.

    The columns in ``drop_col_for_traintest`` are removed first, so an entry
    of ``imp_variables`` that is also one of those columns raises ``KeyError``
    instead of being fed to the model. The variable list is printed before
    scoring.
    """
    features = drop_columns_for_modeling_test(region_dataframes[region])

    print("Important Vars for",region,"are",imp_variables)
    selected_features = features[imp_variables]

    return model.predict(selected_features)

In [33]:
# Score every region with its own model and important-variable list, collecting
# one prediction table per region.
region_prediction_tables = []
for region in Region_list:
    print("\n\n")
    print(partition_column," = ", region)
    model = load_model(region)
    important_variables = load_important_variables(region)
    predictions_test=predict_proba(region_dataframes,region,important_variables,model)
    region_prediction_tables.append(
        create_prediction_table(region_dataframes[region], predictions_test, "test", "GBT")
    )

# DataFrame.append was removed in pandas 2.0, and growing a frame inside the
# loop is quadratic. Concatenating once also makes the cell idempotent:
# re-running it rebuilds the table instead of appending duplicate rows.
# pd.concat raises on an empty list, hence the fallback to an empty frame.
predictions_table = (
    pd.concat(region_prediction_tables) if region_prediction_tables else pd.DataFrame()
)




QualityStarCluster  =  1Star
QoS_Pickled_Models/Partitioned_Models/1Star_model.pkl
QoS_Pickled_Models/Partitioned_Models/important_variables.pkl
Important Vars for 1Star are ['OPPS_Medicare_Allowed_Amount_STATE_SCORE', 'OPPS_Medicare_Payment_Amount_STATE_SCORE', 'OPPS_Total_Submitted_Charges_NATIONAL_SCORE', 'any_home_office_costs_Y', 'charity_uncomp_expnsshr_only10', 'chow_cnt', 'commercial_rev_est', 'cost_to_charge_ratio_for_uncomp', 'critical_access_hosp_hcr_Y', 'expns_admin_and_general_other', 'income_cont_invest_approp', 'internres_per_bed', 'ipps_hosp_pos_Y', 'mdcr_inpat_costs_per_day', 'mdcr_inpat_costs_per_disc', 'mdcr_inpat_day_share', 'mdcr_inpat_discharge_share', 'mdcr_margin', 'mdcr_outpat_cost_to_charge_ratio', 'mdcr_outpat_margin', 'minor_teaching', 'net_expenses_anc_radiology_diag', 'outpat_charges_devices_chgd', 'outpat_totcosts_phys_therapy', 'outpat_totcosts_radiology_diag', 'ownership_forprofit', 'rural_urban_4.0', 'tot_expenses_admin_and_general', 'total_uncomp_e

In [34]:
# Preview the first rows of the combined predictions table.
predictions_table.head()

Unnamed: 0,QualityStarCluster,Medicare_provider_number,Hospital_name,State,"Hospital_Compare_5-star_rating_(October_2018,_NA=Not_Available)",holdout_sample,Class_3,Class_5,Class_6,Class_10,Relative_price_for_outpatient_services,predict,TrainTest,Model
0,1Star,10001,Southeast Alabama Medical\nCenter,AL,2.0,0.0,1.0,1.0,1.0,1.0,122.0,1,test,GBT
1,1Star,390198,Millcreek Community Hospital,PA,2.0,0.0,1.0,1.0,1.0,1.0,143.0,1,test,GBT
2,1Star,390079,Robert Packer Hospital,PA,2.0,1.0,3.0,5.0,6.0,9.0,385.0,3,test,GBT
3,1Star,390076,Brandywine Hospital,PA,2.0,0.0,1.0,1.0,1.0,2.0,156.0,4,test,GBT
4,1Star,390073,Altoona Regional Health\nSystem,PA,2.0,1.0,1.0,2.0,2.0,3.0,204.0,2,test,GBT


In [35]:
# Create the output directory on first use so the export does not fail when
# the directory tree has not been created yet.
os.makedirs(output_directory_path, exist_ok=True)
# Write the predictions without the DataFrame index column.
predictions_table.to_csv(output_directory_path+predictions_table_file,index=False)

In [36]:
# Sanity check: the row count should match the number of rows loaded into df.
predictions_table.shape

(3050, 14)