# Libraries

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
from functools import partial
from collections import Counter

from joblib import load

# modeling
from sklearn import preprocessing, metrics
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import recall_score, confusion_matrix
from imblearn.over_sampling import SMOTE

# ignore warnings
import warnings
warnings.filterwarnings(action="ignore")

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

# Get dataset

In [2]:
# get pickle file
providers = load('./data/Providers_Final.pkl')
providers.set_index('Provider', inplace=True)

In [3]:
providers

Unnamed: 0_level_0,PotentialFraud,Perc_Outpatient,DualPatientProvider,Perc_DualPatientType,Ratio_ClaimsPerPatient,Ratio_ClaimsPerAttPhys,Perc_ClaimsPerTopFraudState,Perc_HasTop5AdmtCode,PatientsPerAttPhys,PatientsPerOperPhys,PatientsPerOthPhys,Perc_MultHospAttPhys,Perc_MultHospOperPhys,Perc_MultHospOtherPhys,IP_Perc_Duplicates,IP_Count_UniquePatients,IP_Count_UniqueState,IP_Mean_AgeAtClaim,IP_Perc_HasDied,IP_Perc_GenderZero,IP_Perc_RaceOne,IP_Perc_RaceTwo,IP_Perc_RaceThree,IP_Mean_NumChronicConds,IP_Perc_Alzheimers_Chronic,IP_Perc_Cancer_Chronic,IP_Perc_Depression_Chronic,IP_Perc_Diabetes_Chronic,IP_Perc_HeartFailure_Chronic,IP_Perc_IschemicHeart_Chronic,IP_Perc_KidneyDisease_Chronic,IP_Perc_ObstrPulmonary_Chronic,IP_Perc_Osteoporosis_Chronic,IP_Perc_RheumatoidArthritis_Chronic,IP_Perc_Stroke_Chronic,IP_Perc_HasRenalDisease,IP_Mean_ClaimDuration,IP_Mean_AdmitDuration,IP_Mean_NoOfMonths_PartACov,IP_Mean_NoOfMonths_PartBCov,IP_Mean_ClaimCost,IP_Mean_DailyClaimCost,IP_Mean_DeductibleAmtPaid,IP_Mean_InscClaimAmtReimbursed,IP_Mean_InsReimbursementRatio,IP_Mean_AnnualDeductibleAmt,IP_Mean_AnnualReimbursementAmt,IP_Perc_No_ProcCode,IP_Perc_HasAllPhys,IP_Perc_HasNoPhys,IP_Perc_MultHosp,OP_Perc_Duplicates,OP_Count_UniquePatients,OP_Count_UniqueState,OP_Mean_AgeAtClaim,OP_Perc_HasDied,OP_Perc_GenderZero,OP_Perc_RaceOne,OP_Perc_RaceTwo,OP_Perc_RaceThree,OP_Mean_NumChronicConds,OP_Perc_Alzheimers_Chronic,OP_Perc_Cancer_Chronic,OP_Perc_Depression_Chronic,OP_Perc_Diabetes_Chronic,OP_Perc_HeartFailure_Chronic,OP_Perc_IschemicHeart_Chronic,OP_Perc_KidneyDisease_Chronic,OP_Perc_ObstrPulmonary_Chronic,OP_Perc_Osteoporosis_Chronic,OP_Perc_RheumatoidArthritis_Chronic,OP_Perc_Stroke_Chronic,OP_Perc_HasRenalDisease,OP_Mean_ClaimDuration,OP_Mean_NoOfMonths_PartACov,OP_Mean_NoOfMonths_PartBCov,OP_Mean_ClaimCost,OP_Mean_DailyClaimCost,OP_Mean_DeductibleAmtPaid,OP_Mean_InscClaimAmtReimbursed,OP_Mean_InsReimbursementRatio,OP_Mean_AnnualDeductibleAmt,OP_Mean_AnnualReimbursementAmt,OP_Perc_No_DiagCode,OP_Perc_HasAllPhys,OP_Perc_HasNoPhys,OP_Perc_MultHosp
Provider,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1
PRV51001,0,0.800000,1.0,0.400000,1.041667,1.785714,0.0,0.040000,10,19,16,0.240000,0.040000,0.000000,0.000000,5.0,1.0,77.600000,0.000000,0.400000,1.000000,0.000000,0.0,6.000000,0.400000,0.200000,0.800000,0.800000,0.800000,0.800000,0.800000,0.400000,0.000000,0.600000,0.400000,0.400000,6.000000,6.000000,12.000000,12.000000,20468.000000,4077.360000,1068.0,19400.000000,0.878279,897.120000,17606.000000,0.600000,0.2,0.0,0.160000,0.550000,19.0,1.0,77.950000,0.000000,0.350000,0.800000,0.200000,0.000000,5.450000,0.650000,0.200000,0.250000,0.850000,0.750000,0.950000,0.650000,0.400000,0.300000,0.250000,0.200000,0.300000,1.550000,12.000000,12.000000,382.000000,307.000000,0.000000,382.000000,1.000000,463.920000,2615.200000,0.000000,0.050000,0.000000,0.920000
PRV51003,1,0.530303,1.0,0.500000,1.128205,3.000000,0.0,0.060606,73,110,95,0.007576,0.000000,0.000000,0.016129,53.0,3.0,69.935484,0.016129,0.338710,0.790323,0.209677,0.0,4.919355,0.516129,0.112903,0.403226,0.790323,0.580645,0.887097,0.629032,0.370968,0.209677,0.306452,0.112903,0.274194,6.161290,6.161290,11.806452,11.806452,10309.935484,2384.941628,1068.0,9241.935484,0.821059,931.424242,7568.181818,0.370968,0.0,0.0,0.045455,0.357143,66.0,3.0,68.371429,0.000000,0.471429,0.828571,0.157143,0.000000,4.214286,0.342857,0.042857,0.414286,0.728571,0.628571,0.814286,0.357143,0.257143,0.285714,0.271429,0.071429,0.171429,3.357143,11.828571,11.928571,467.714286,336.440760,1.000000,466.714286,0.994032,737.121212,2678.181818,0.000000,0.057143,0.000000,0.818182
PRV51004,0,1.000000,0.0,0.208054,1.079710,3.921053,0.0,0.013423,100,119,112,0.167785,0.013423,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,434.953020,4351.879195,0.000000,0.0,0.0,0.060403,0.461538,138.0,9.0,71.302013,0.006711,0.308725,0.000000,0.000000,0.000000,4.342282,0.429530,0.107383,0.422819,0.704698,0.590604,0.724832,0.335570,0.275168,0.328859,0.308725,0.114094,0.154362,2.429530,11.865772,11.959732,352.214765,250.363050,2.080537,350.134228,0.978485,622.751678,2194.899329,0.040268,0.080537,0.000000,0.899329
PRV51005,1,1.000000,0.0,0.248069,2.353535,194.166667,0.0,0.032618,489,489,491,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,379.162232,3623.991416,0.000000,0.0,0.0,0.048069,0.452257,495.0,4.0,69.567382,0.003433,0.438627,0.805369,0.161074,0.033557,4.335622,0.365665,0.141631,0.416309,0.685837,0.583691,0.768240,0.435193,0.253219,0.295279,0.284120,0.106438,0.222318,2.088412,11.907296,11.939914,244.300429,196.533055,3.175966,241.124464,0.980747,636.328755,2109.733906,0.011159,0.084120,0.001717,0.731330
PRV51007,0,0.958333,1.0,0.277778,1.241379,7.200000,0.0,0.027778,48,53,51,0.597222,0.083333,0.027778,0.000000,3.0,1.0,78.000000,0.000000,0.333333,0.000000,0.000000,0.0,5.666667,0.666667,0.000000,0.666667,1.000000,1.000000,1.000000,0.333333,0.000000,0.000000,0.333333,0.666667,0.333333,6.333333,6.333333,12.000000,12.000000,7401.333333,1255.588889,1068.0,6333.333333,0.829955,445.000000,3050.000000,0.666667,0.0,0.0,0.069444,0.420290,56.0,2.0,67.956522,0.014493,0.478261,0.766524,0.224893,0.008584,4.101449,0.347826,0.173913,0.391304,0.666667,0.536232,0.695652,0.304348,0.231884,0.304348,0.304348,0.144928,0.144928,1.768116,11.826087,11.826087,214.057971,199.685990,0.869565,213.188406,0.992157,469.722222,1729.722222,0.000000,0.115942,0.000000,0.902778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PRV57759,0,1.000000,0.0,0.214286,1.166667,28.000000,0.0,0.000000,23,23,23,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,457.714286,3962.142857,0.000000,0.0,0.0,0.178571,0.518519,24.0,1.0,73.000000,0.000000,0.571429,0.931034,0.034483,0.034483,5.250000,0.500000,0.142857,0.321429,0.714286,0.714286,1.000000,0.535714,0.392857,0.464286,0.321429,0.142857,0.178571,3.142857,12.000000,12.000000,384.642857,200.629252,4.642857,380.000000,0.983401,886.785714,3241.785714,0.035714,0.035714,0.000000,0.928571
PRV57760,0,1.000000,0.0,0.363636,2.444444,7.333333,0.0,0.045455,6,7,7,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,436.909091,2785.454545,0.000000,0.0,0.0,0.000000,0.590909,9.0,1.0,60.590909,0.000000,0.772727,0.923077,0.076923,0.000000,3.500000,0.136364,0.000000,0.318182,0.818182,0.500000,1.000000,0.090909,0.090909,0.500000,0.045455,0.000000,0.000000,1.318182,12.000000,11.727273,216.818182,216.022727,0.000000,216.818182,1.000000,805.454545,1492.727273,0.000000,0.136364,0.000000,1.000000
PRV57761,0,1.000000,0.0,0.341463,1.223881,41.000000,0.0,0.000000,65,62,65,0.000000,0.060976,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,586.097561,7026.585366,0.000000,0.0,0.0,0.097561,0.390244,67.0,1.0,71.134146,0.012195,0.487805,0.919437,0.080563,0.000000,4.841463,0.439024,0.170732,0.463415,0.670732,0.682927,0.756098,0.487805,0.365854,0.390244,0.292683,0.121951,0.280488,2.390244,12.000000,12.000000,229.756098,157.134674,4.512195,225.243902,0.935979,707.317073,2928.414634,0.000000,0.109756,0.000000,0.951220
PRV57762,0,1.000000,0.0,1.000000,1.000000,1.000000,0.0,0.000000,0,1,1,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1068.000000,15000.000000,0.000000,0.0,0.0,0.000000,1.000000,1.0,1.0,67.000000,0.000000,1.000000,0.600000,0.000000,0.400000,5.000000,0.000000,1.000000,0.000000,1.000000,0.000000,1.000000,1.000000,1.000000,0.000000,0.000000,0.000000,0.000000,1.000000,12.000000,12.000000,1900.000000,1900.000000,0.000000,1900.000000,1.000000,400.000000,2540.000000,0.000000,0.000000,0.000000,1.000000


# Upsampling

In [4]:
scores = providers.drop(['PotentialFraud'], axis=1)
scores_cols = scores.columns
scores = preprocessing.normalize(scores, norm='l1')
scores_norm = pd.DataFrame(scores, columns = scores_cols)

decision = providers["PotentialFraud"]

In [5]:
# # Indicies of each class' observations
# i_class0 = np.where(decision == 0)[0] 
# i_class1 = np.where(decision == 1)[0]

# # Number of observations in each class
# n_class0 = len(i_class0)
# n_class1 = len(i_class1)

# n_class0, n_class1


# # For every observation, randomly sample with replacement
# i_class0_upsampled = np.random.choice(i_class1, size=n_class0, replace=True) 
# # Join together class 0's upsampled target vector with class 1's target vector
# decision = np.concatenate((decision[i_class0_upsampled], decision[i_class0]))



# upsample minority class with SMOTE
oversample = SMOTE(random_state = 0)
scores, decision = oversample.fit_resample(scores, decision)

# Count of target variable after upsampling
Counter(decision)

Counter({0: 4904, 1: 4904})

In [6]:
Xtrain, Xtest, ytrain, ytest = train_test_split(scores, decision, random_state=0, stratify = decision)

# Modeling


In [7]:
# some functions that will be used in modeling
def plot_feature_importances(model):
    n_features = scores.shape[1]
    plt.barh(range(n_features), model.feature_importances_, align='center') 
    plt.yticks(np.arange(n_features), scores_cols.to_numpy()) 
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")
    

# Stratified Cross Validation needs to be applied
skf = StratifiedKFold(n_splits = 3, random_state = 0, shuffle = True)

### train a simple classifier model and handling imbalanced classes

In [8]:
logistic_regression = LogisticRegression(random_state=0)
logreg = logistic_regression.fit(Xtrain, ytrain)

print("Training set score: {:.3f}".format(recall_score(ytrain, logreg.predict(Xtrain))))
print("Test set score: {:.3f}".format(recall_score(ytest, logreg.predict(Xtest))))

Training set score: 0.918
Test set score: 0.914


In [9]:
logreg100 = LogisticRegression(C=11, random_state=0).fit(Xtrain, ytrain)
print("Training set score: {:.3f}".format(recall_score(ytrain, logreg100.predict(Xtrain))))
print("Test set score: {:.3f}".format(recall_score(ytest, logreg100.predict(Xtest))))

Training set score: 0.922
Test set score: 0.920


In [10]:
confusion_matrix(ytest,logreg100.predict(Xtest))

array([[ 864,  362],
       [  98, 1128]])

In [11]:
param_grid = [{'C': np.logspace(-1,1,100)}]
logreg100cv = GridSearchCV(logreg100, param_grid = param_grid).fit(Xtrain,ytrain)

### decision tree

In [None]:
tree = DecisionTreeClassifier(max_depth=4, random_state=0) 
tree.fit(Xtrain, ytrain)
print("Accuracy on training set: {:.3f}".format(tree.score(Xtrain, ytrain))) 
print("Accuracy on test set: {:.3f}".format(tree.score(Xtest, ytest)))

display(recall_score(ytrain, tree.predict(Xtrain)))
display(recall_score(ytest, tree.predict(Xtest)))

In [None]:
plt.figure(figsize=(10,20))
plot_feature_importances(tree)

### random forest

In [12]:
forest = RandomForestClassifier(n_estimators=100, random_state=0, max_depth = 30, max_features = 5) 
forest.fit(Xtrain, ytrain)

display(recall_score(ytrain, forest.predict(Xtrain)))
display(recall_score(ytest, forest.predict(Xtest)))

Accuracy on training set: 1.000
Accuracy on test set: 0.960


1.0

0.9722675367047309

In [13]:
param_grid = {
    'n_estimators': [200, 600, 1000, 1400, 2000],
    'max_features': ['auto', 'sqrt'],
    'min_samples_split': [2, 4, 6]
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=0), param_grid, cv=skf)
grid_search.fit(Xtrain, ytrain)

print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))
print("Best estimator:\n{}".format(grid_search.best_estimator_))

Best parameters: {'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 600}
Best cross-validation score: 0.95
Best estimator:
RandomForestClassifier(n_estimators=600, random_state=0)


In [14]:
display(recall_score(ytrain, grid_search.best_estimator_.predict(Xtrain)))
display(recall_score(ytest, grid_search.best_estimator_.predict(Xtest)))

1.0

0.9771615008156607

In [None]:
plt.figure(figsize=(10,20))
plot_feature_importances(grid_search.best_estimator_)

### gradient boosting

In [None]:
gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(Xtrain, ytrain)

print("Accuracy on training set: {:.3f}".format(gbrt.score(Xtrain, ytrain)))
print("Accuracy on test set: {:.3f}".format(gbrt.score(Xtest, ytest)))

In [None]:
gbrt = GradientBoostingClassifier(random_state=0, max_depth=1) 
gbrt.fit(Xtrain, ytrain)

print("Accuracy on training set: {:.3f}".format(gbrt.score(Xtrain, ytrain))) 
print("Accuracy on test set: {:.3f}".format(gbrt.score(Xtest, ytest)))

In [None]:
gbrt = GradientBoostingClassifier(random_state=0, learning_rate=0.01) 
gbrt.fit(Xtrain, ytrain)

print("Accuracy on training set: {:.3f}".format(gbrt.score(Xtrain, ytrain))) 
print("Accuracy on test set: {:.3f}".format(gbrt.score(Xtest, ytest)))

In [None]:
gb_param_grid = {
    'learning_rate':[0.1,0.05,0.01], 
    'n_estimators':[100,500,1000,1500],
    'max_depth':[2,4,6]
}

gb_grid_search = GridSearchCV(GradientBoostingClassifier(random_state=0), gb_param_grid, cv=3)
gb_grid_search.fit(Xtrain, ytrain)

print("Best parameters: {}".format(gb_grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(gb_grid_search.best_score_))
print("Best estimator:\n{}".format(gb_grid_search.best_estimator_))

In [None]:
display(recall_score(ytrain, gb_grid_search.best_estimator_.predict(Xtrain)))
display(recall_score(ytest, gb_grid_search.best_estimator_.predict(Xtest)))

In [None]:
plt.figure(figsize=(10,20))
plot_feature_importances(gb_grid_search.best_estimator_)

### support vector machine

In [None]:
svc = SVC() 
svc.fit(Xtrain, ytrain)

print("Accuracy on training set: {:.2f}".format(svc.score(Xtrain, ytrain))) 
print("Accuracy on test set: {:.2f}".format(svc.score(Xtest, ytest)))

In [None]:
svc = SVC(C=1000) 
svc.fit(Xtrain, ytrain)

print("Accuracy on training set: {:.2f}".format(svc.score(Xtrain, ytrain))) 
print("Accuracy on test set: {:.2f}".format(svc.score(Xtest, ytest)))