In [1]:
import pickle
import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder

In [13]:
import plotly.graph_objs as go
from plotly.offline import iplot

import matplotlib.pyplot as plt

In [2]:
# Load the dataset that will be scored.
data = pd.read_csv('diabetic_data.csv')

# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load these artifacts when the .pkl files come from a trusted source.
# Each file is opened in a context manager so its handle is closed as soon as
# the object has been read, instead of staying open until garbage collection.
with open('rf_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
with open('scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)
with open('OH_Encoder.pkl', 'rb') as encoder_file:
    OH_Encoder = pickle.load(encoder_file)

In [6]:
def prepare_data(data, OH, sscaler):
    """Build the model-ready feature frame from the raw data.

    The columns not listed in ``no_OH`` are passed through the already-fitted
    encoder ``OH``, the columns in ``X_continuous`` are passed through the
    already-fitted scaler ``sscaler``, and a binary ``readmitted`` column is
    appended.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw input. Must contain every column named in ``no_OH``; all remaining
        columns are handed to ``OH.transform`` (presumably the same columns the
        encoder was fitted on - confirm against the training code).
    OH : object
        Fitted encoder exposing ``transform`` (whose result has ``toarray()``)
        and either ``get_feature_names`` or ``get_feature_names_out``.
    sscaler : object
        Fitted scaler exposing ``transform``.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, on a fresh 0..n-1 index:
        encoded columns, then scaled continuous columns, then ``readmitted``
        (0 where the raw value equals 'NO', 1 otherwise).
    """
    # Columns that must not be one-hot encoded (ids, continuous features,
    # columns that are dropped entirely, and the target).
    no_OH = ['encounter_id', 'patient_nbr', 'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient',
             'number_emergency', 'number_inpatient', 'number_diagnoses', 'medical_specialty', 'payer_code', 'readmitted']

    X_continuous = ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient',
             'number_emergency', 'number_inpatient', 'number_diagnoses']

    # One-hot encode the remaining columns.
    X_OH = OH.transform(data.drop(no_OH, axis=1))
    # Prefer the legacy accessor when the encoder still has it so column names
    # stay exactly as before; fall back to the replacement API on encoders
    # where the legacy method no longer exists.
    if hasattr(OH, 'get_feature_names'):
        oh_columns = OH.get_feature_names()
    else:
        oh_columns = OH.get_feature_names_out()
    X_OH_df = pd.DataFrame(X_OH.toarray(), columns=oh_columns)

    # Feature scaling of the continuous columns.
    X_normed = pd.DataFrame(sscaler.transform(data[X_continuous]), columns=X_continuous)

    # Final frame: both parts carry a fresh positional index, so they line up
    # row by row.
    final_df = pd.concat([X_OH_df, X_normed], axis=1)

    # Assign the target positionally (as an array). Assigning the Series itself
    # would align on ``data``'s index labels and produce NaN / mismatched
    # targets whenever ``data`` does not have a default 0..n-1 index.
    final_df['readmitted'] = data['readmitted'].ne('NO').astype('int64').to_numpy()

    return final_df

In [8]:
def count_predictions(data, model):
    """Return the share of each predicted class as a two-column frame.

    Every column of ``data`` except ``readmitted`` is passed to
    ``model.predict``. The result has one row per distinct predicted value
    with columns ``Class`` ('Readmitted' for a prediction equal to 1,
    'Not Readmitted' for anything else) and ``Percent`` (the normalised
    count, i.e. a fraction of all predictions).
    """
    def _label(predicted_class):
        # Only the value 1 is treated as a readmission.
        if predicted_class == 1:
            return 'Readmitted'
        return 'Not Readmitted'

    feature_mask = data.columns != 'readmitted'
    predicted = pd.Series(model.predict(data.loc[:, feature_mask]), name='Predictions')

    # Normalised frequency of each predicted value, most frequent first.
    shares = predicted.value_counts(normalize=True)
    prediction_counts = shares.rename_axis('Class').reset_index(name='Percent')

    prediction_counts['Class'] = prediction_counts['Class'].apply(_label)
    return prediction_counts

In [17]:
def plot_outputs(data):
    """Display a bar chart of the prediction distribution.

    ``data`` must expose ``Class`` (bar labels) and ``Percent`` (bar heights)
    attributes, each with a ``.values`` array. The figure is rendered with
    ``iplot``; nothing is returned.
    """
    # Styling for the bars: fill colour plus a darker outline.
    marker_style = dict(color="#007dcc", line=dict(color="#002f4c", width=1.5))
    # The y axis is pinned to 0..1 and formatted as a percentage.
    y_axis = dict(title='Percent', tickformat='%', range=(0,1))

    bars = go.Bar(
        x=data.Class.values,
        y=data.Percent.values,
        opacity=0.5,
        textposition="outside",
        texttemplate="%{y:%}",
        marker=marker_style,
    )

    figure = go.Figure(
        data=[bars],
        layout=go.Layout(title='Model Prediction Distribution', yaxis=y_axis),
    )
    iplot(figure)

In [18]:
# Turn the raw table into model features, then summarise the share of each predicted class.
final_df = prepare_data(data=data, OH=OH_Encoder, sscaler=scaler)
prediction_counts = count_predictions(data=final_df, model=model)

In [None]:
# Draw the bar chart of the predicted class distribution.
plot_outputs(data=prediction_counts)