James Fisher
9/1/2024
ANA680

Final Project: Predicting Medicare Provider Use of Annual Wellness Visits

In this notebook, I simplify the model a bit, which lowers its accuracy by a couple of percentage points (it is still 72.8%) but dramatically shrinks the pkl file, from over 1.1 GB to under 75 MB.

In [1]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
import pickle as pkl



In [2]:
#load the prepared dataset from the local csv export
source_csv = 'C:/Users/unkno/Desktop/MS Data Science/Class 9 - ANA680/Week 4/fp/prepared_data.csv'
data = pd.read_csv(source_csv)

##temporary
#remove rows whose Rndrng_Prvdr_State_Abrvtn is not a US state: PR, GU, VI, MP, AS, MH, PW, FM, DC
#the same filter also removes the codes AA, AE, AP, XX and ZZ
excluded_codes = ['PR', 'GU', 'VI', 'MP', 'AS', 'MH', 'PW', 'FM', 'DC', 'AA', 'AE', 'AP', 'XX', 'ZZ']
is_excluded = data.Rndrng_Prvdr_State_Abrvtn.isin(excluded_codes)
data = data[~is_excluded]


In [3]:
#columns carried forward: the target (Rndrng_Prvdr_Gndr) followed by the 12 model features
selected_columns = [
    'Rndrng_Prvdr_Gndr', 'Tot_Srvcs', 'Tot_Benes', 'Med_Tot_Benes', 'Med_Tot_Srvcs',
    'Bene_Avg_Risk_Scre', 'has_MD', 'Bene_Feml_Cnt', 'Tot_HCPCS_Cds',
    'Tot_Mdcr_Pymt_Amt', 'is_NP', 'Tot_Sbmtd_Chrg', 'Bene_Male_Cnt',
]

#shrink the data with a seeded 5% random sample of the rows, then keep only the selected columns
data = data.sample(frac=0.05, random_state=66)[selected_columns]

In [4]:
#provider gender (Rndrng_Prvdr_Gndr) is recoded to numbers: M becomes 0, then F becomes 1
gender_codes = {'M': 0, 'F': 1}
for label, code in gender_codes.items():
    data['Rndrng_Prvdr_Gndr'] = data['Rndrng_Prvdr_Gndr'].replace(label, code)

#blank cells in the original data arrived as NaN on import; they mean no services rendered, so fill them with 0
#NOTE(review): the fill covers every column, including the gender target - confirm the target has no NaN values
data = data.fillna(0)

#show the column names, then the first rows
print(data.columns)
print(data.head())

Index(['Rndrng_Prvdr_Gndr', 'Tot_Srvcs', 'Tot_Benes', 'Med_Tot_Benes',
       'Med_Tot_Srvcs', 'Bene_Avg_Risk_Scre', 'has_MD', 'Bene_Feml_Cnt',
       'Tot_HCPCS_Cds', 'Tot_Mdcr_Pymt_Amt', 'is_NP', 'Tot_Sbmtd_Chrg',
       'Bene_Male_Cnt'],
      dtype='object')
        Rndrng_Prvdr_Gndr  Tot_Srvcs  Tot_Benes  Med_Tot_Benes  Med_Tot_Srvcs  \
675206                  0      341.0        303          303.0          341.0   
772164                  0      525.0        280          280.0          525.0   
540112                  0      654.0        151          151.0          566.0   
252305                  0      598.0         52           52.0          598.0   
47309                   0     3635.0        307          307.0         3635.0   

        Bene_Avg_Risk_Scre  has_MD  Bene_Feml_Cnt  Tot_HCPCS_Cds  \
675206              1.6488       1          157.0             21   
772164              2.3284       1          141.0             28   
540112              1.2990       0          10

In [5]:
#separate the target column from the feature columns
from sklearn.model_selection import train_test_split

target_column = 'Rndrng_Prvdr_Gndr'

#target as a flat 1D numpy array
y = data[target_column].to_numpy().ravel()

#features are every remaining column
X = data.drop(columns=target_column)

#hold out 20% of the rows for testing (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=66)

In [6]:
##FEATURE VARIABLE ENCODING & SCALING
#the target, 'Rndrng_Prvdr_Gndr' (categorical), is deliberately absent from both feature lists
continuous_features = ['Tot_HCPCS_Cds', 'Tot_Benes', 'Tot_Srvcs', 'Tot_Sbmtd_Chrg', 'Tot_Mdcr_Pymt_Amt',
                       'Med_Tot_Benes', 'Med_Tot_Srvcs', 'Bene_Feml_Cnt', 'Bene_Male_Cnt', 'Bene_Avg_Risk_Scre']
categorical_features = ['has_MD', 'is_NP']

#one scaler for all continuous columns, one encoder per categorical column
scaler = StandardScaler()
label_encoders = {}
for feature in categorical_features:
    label_encoders[feature] = LabelEncoder()

#continuous columns: fit the scaler on the training split only, then apply it to both splits
X_train_continuous = scaler.fit_transform(X_train[continuous_features])
X_test_continuous = scaler.transform(X_test[continuous_features])

#categorical columns: work on copies, fitting each encoder on the training split only
X_train_categorical = X_train[categorical_features].copy()
X_test_categorical = X_test[categorical_features].copy()
for feature, encoder in label_encoders.items():
    X_train_categorical[feature] = encoder.fit_transform(X_train[feature])
    X_test_categorical[feature] = encoder.transform(X_test[feature])

#rebuild the feature frames: scaled continuous columns first, then the encoded categorical columns
#the categorical index is reset so its rows line up with the fresh index of the scaled frame
X_train_processed = pd.concat(
    [
        pd.DataFrame(X_train_continuous, columns=continuous_features),
        X_train_categorical.reset_index(drop=True),
    ],
    axis=1,
)

X_test_processed = pd.concat(
    [
        pd.DataFrame(X_test_continuous, columns=continuous_features),
        X_test_categorical.reset_index(drop=True),
    ],
    axis=1,
)


In [7]:
#print the shape of each split (train features, test features, train target, test target) for comparison
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)


(32614, 12)
(8154, 12)
(32614,)
(8154,)


In [11]:
#Random Forest hyperparameters, gathered in one place
rf_settings = {
    'n_estimators': 200,
    'max_depth': 20,
    'min_samples_split': 10,
    'min_samples_leaf': 2,
    'random_state': 66,
    'n_jobs': -1,
}

#build the classifier and fit it on the processed training features
model = RandomForestClassifier(**rf_settings)
model.fit(X_train_processed, y_train)

In [12]:
#score the fitted model on the processed test features
y_pred = model.predict(X_test_processed)
acc = accuracy(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

#report the accuracy first, then the confusion matrix
print(f'Accuracy: {acc}')
print(cm)

Accuracy: 0.7281089036055923
[[3551  941]
 [1276 2386]]


A 20% sample of the total data resulted in < 1% reduction in predictive power of the model.

One observation: the use of any specific preventive care procedure or immunization was not among the top 12 features on which this model focuses.

In [13]:
#save the model
#the with-block closes the file handle as soon as the dump finishes, so the pickle is fully written to disk
#NOTE(review): only the classifier is pickled; the StandardScaler fitted in the scaling cell is not saved with it - confirm how consumers of this file scale their inputs
with open('simpler_model.pkl', 'wb') as model_file:
    pkl.dump(model, model_file)