<h3> Reading in Data

In [337]:
#import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

#display plots inline
%matplotlib inline

In [338]:
#import the raw training datasets: there are four, one with provider information, one with beneficiary data, and two with claim data. 
fraud_tr = pd.read_csv('/Users/zhengzhang/Downloads/healthcare-provider-fraud-detection-analysis/Train-1542865627584.csv')
bene_tr = pd.read_csv('/Users/zhengzhang/Downloads/healthcare-provider-fraud-detection-analysis/Train_Beneficiarydata-1542865627584.csv')
inpatient_tr = pd.read_csv('/Users/zhengzhang/Downloads/healthcare-provider-fraud-detection-analysis/Train_Inpatientdata-1542865627584.csv')
outpatient_tr = pd.read_csv('/Users/zhengzhang/Downloads/healthcare-provider-fraud-detection-analysis/Train_Outpatientdata-1542865627584.csv')

<h3> In-depth Data Analysis

We will use several machine learning algorithm to predict the provider who may be fraudulent. 

We will investigate the methods for analyzing imbalanced data since <10% of the providers are deemed fraudulent. 

In [339]:
# Calculate the number of inpatient claims each provider submitted and create a panda dataframe with provider ID
# and the number of inpatient claims
In_Provider=pd.DataFrame([inpatient_tr['Provider'].value_counts().index,inpatient_tr['Provider'].value_counts()]).transpose().rename({0:'Provider',1:'Inpatient_Claim_Number'},axis=1)
#In_Provider

In [340]:
Out_Provider=pd.DataFrame([outpatient_tr['Provider'].value_counts().index,outpatient_tr['Provider'].value_counts()]).transpose().rename({0:'Provider',1:'Outpatient_Claim_Number'},axis=1)
#Out_Provider

In [341]:
# merge the inpatient and outpatient claims per provider dataframes
provider_claimnumber=pd.merge(In_Provider,Out_Provider,on='Provider',how='outer')

In [342]:
# Calculate total amount of inpatient claims each provider submitted and create a panda dataframe with provider ID
# and total amount of inpatient claims
inreimburse_by_provider=inpatient_tr.groupby('Provider')['InscClaimAmtReimbursed'].sum()
inreimburse_temp1=pd.DataFrame([inreimburse_by_provider.index,inreimburse_by_provider])
inreimburse_temp2=inreimburse_temp1.transpose()
In_Provider_Claim=inreimburse_temp2.rename({0:'Provider',1:'Inpatient_Claim_Amount'},axis=1)
#In_Provider_Claim

In [343]:
outreimburse_by_provider=outpatient_tr.groupby('Provider')['InscClaimAmtReimbursed'].sum()
outreimburse_temp1=pd.DataFrame([outreimburse_by_provider.index,outreimburse_by_provider])
outreimburse_temp2=outreimburse_temp1.transpose()
Out_Provider_Claim=outreimburse_temp2.rename({0:'Provider',1:'Outpatient_Claim_Amount'},axis=1)
#Out_Provider_Claim

In [344]:
# merge the inpatient and outpatient claim amounts per provider dataframes
provider_claimamount=pd.merge(In_Provider_Claim,Out_Provider_Claim,on='Provider',how='outer')
#provider_claimamount

In [345]:
# Merge provider fraud data wth claim number and claim amount datasets
provider_list1=pd.merge(fraud_tr,provider_claimnumber,on='Provider',how='outer')
#provider_list1
provider_list2=pd.merge(provider_list1,provider_claimamount,on='Provider',how='outer')
#provider_list2

In [346]:
# Create arrays for the features and the response variable
provider_clean=provider_list2.fillna(0)
y = provider_clean['PotentialFraud'].values
X = provider_clean.drop(['PotentialFraud','Provider'], axis=1).values

<h4> Use K nearest neighbors for prediction

In [347]:
# Use GridSearchCV to choose hyperparameter for KNN, n_neighbors
# Do hyperparameter tuning on training set
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21, stratify=y)
param_grid = {'n_neighbors': np.arange(1, 15)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, param_grid, cv=5)
knn_cv.fit(X_train, y_train)
knn_cv.best_params_

{'n_neighbors': 8}

In [348]:
y_pred=knn_cv.predict(X_test)

In [349]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

          No       0.98      0.95      0.96      1521
         Yes       0.47      0.70      0.56       102

    accuracy                           0.93      1623
   macro avg       0.72      0.82      0.76      1623
weighted avg       0.95      0.93      0.94      1623



In [350]:
print(confusion_matrix(y_pred,y_test))

[[1440   81]
 [  31   71]]


Conclusion: KNN is not a good method with <50% sensitivity. 

<h4> Use logistic regression for prediction

In [351]:
import warnings
warnings. filterwarnings('ignore')

In [352]:
from sklearn.linear_model import LogisticRegression

# Create the hyperparameter grid
c_space = np.logspace(-5, 8, 15)
param_grid = {'C': c_space, 'penalty': ['l1', 'l2']}

# Instantiate the logistic regression classifier: logreg
logreg = LogisticRegression()

# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=21,stratify=y)

# Instantiate the GridSearchCV object: logreg_cv
logreg_cv = GridSearchCV(logreg,param_grid,cv=5)

# Fit it to the training data
logreg_cv.fit(X_train,y_train)

# Print the optimal parameters and best score
print("Tuned Logistic Regression Parameter: {}".format(logreg_cv.best_params_))
print("Tuned Logistic Regression Accuracy: {}".format(logreg_cv.best_score_))


Tuned Logistic Regression Parameter: {'C': 0.05179474679231213, 'penalty': 'l1'}
Tuned Logistic Regression Accuracy: 0.9355690520200687


In [353]:
y_pred=logreg_cv.predict(X_test)

In [354]:
print(confusion_matrix(y_pred,y_test))

[[1449   82]
 [  22   70]]


In [355]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

          No       0.99      0.95      0.97      1531
         Yes       0.46      0.76      0.57        92

    accuracy                           0.94      1623
   macro avg       0.72      0.85      0.77      1623
weighted avg       0.96      0.94      0.94      1623



Conclusion: logistic regression is not a good method with <50% sensitivity, but oversampling the positive cases may help. 

In [356]:
# Use oversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=21)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
from collections import Counter
print(sorted(Counter(y_resampled).items()))

[('No', 3433), ('Yes', 3433)]


In [357]:
# Fit logistic regression to the over-sampled data
logreg_cv.fit(X_resampled,y_resampled)
# Print the optimal parameters and best score
print("Tuned Logistic Regression Parameter: {}".format(logreg_cv.best_params_))
print("Tuned Logistic Regression Accuracy: {}".format(logreg_cv.best_score_))

Tuned Logistic Regression Parameter: {'C': 11787686.347935867, 'penalty': 'l1'}
Tuned Logistic Regression Accuracy: 0.8598893096417128


In [358]:
y_pred=logreg_cv.predict(X_test)

In [359]:
print(confusion_matrix(y_pred,y_test))

[[1338   34]
 [ 133  118]]


In [360]:
print(balanced_accuracy_score(y_test, y_pred))
print(sensitivity_specificity_support(y_test,y_pred))

0.8429505527925865
(array([0.90958532, 0.77631579]), array([0.77631579, 0.90958532]), array([1471,  152]))


Conclusion: logistic regression with oversampling the positive cases have dramatically improved the accuracy with sensitivity of 78% sensitivity and 91% specificity. 

<h4> Use Random Forest

In [361]:
# Try Gridsearch on original imbalanced sampling
from sklearn.ensemble import RandomForestClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21, stratify=y)
param_grid = {'n_estimators': np.arange(50, 100)}
model=RandomForestClassifier(random_state=21)
rf_cv = GridSearchCV(model, param_grid, cv=5)
rf_cv.fit(X_train, y_train)
rf_cv.best_params_

{'n_estimators': 50}

In [362]:
rf_cv.score(X_test,y_test)

0.9229821318545902

In [363]:
y_pred=rf_cv.predict(X_test)

In [364]:
print(confusion_matrix(y_pred,y_test))

[[1426   80]
 [  45   72]]


In [365]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

          No       0.97      0.95      0.96      1506
         Yes       0.47      0.62      0.54       117

    accuracy                           0.92      1623
   macro avg       0.72      0.78      0.75      1623
weighted avg       0.93      0.92      0.93      1623



Conclusion: random forests is not working very well on the imbalanced data. 

In [366]:
#! conda install -y -c conda-forge imbalanced-learn

In [367]:
#! conda update -y -n base -c defaults conda

Try random forest with balanced sampling

In [368]:
param_grid = {'n_estimators': np.arange(50, 100)}
brf = BalancedRandomForestClassifier(random_state=21)
brf_cv = GridSearchCV(brf, param_grid, cv=5)
brf_cv.fit(X_train, y_train)
brf_cv.best_params_

{'n_estimators': 67}

In [369]:
y_pred = brf_cv.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))
print(sensitivity_specificity_support(y_test,y_pred))

0.8602320297685069
(array([0.83888511, 0.88157895]), array([0.88157895, 0.83888511]), array([1471,  152]))


In [370]:
print(confusion_matrix(y_pred,y_test))

[[1234   18]
 [ 237  134]]


Random forest with balanced data improved the performance to 88% sensitivity and 84% specificity. 

In [371]:
provider_clean=provider_list2.fillna(0)
y = provider_clean['PotentialFraud'].values
X = provider_clean.drop(['PotentialFraud','Provider'], axis=1).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21, stratify=y)

Try balanced bagging

In [372]:
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(), sampling_strategy='auto', replacement=False,
                               random_state=21)
bbc.fit(X_train, y_train) 
y_pred = bbc.predict(X_test)
balanced_accuracy_score(y_test, y_pred)  

0.8247052667358403

In [373]:
print(sensitivity_specificity_support(y_test,y_pred))

(array([0.87967369, 0.76973684]), array([0.76973684, 0.87967369]), array([1471,  152]))


In [374]:
print(confusion_matrix(y_pred,y_test))

[[1294   35]
 [ 177  117]]


Balanced bagging is not the best method with 77% sensitivity and 88% specificity. 

Try boosting.

In [375]:
from imblearn.ensemble import RUSBoostClassifier
param_grid = {'n_estimators': np.arange(50, 100)}
rusboost = RUSBoostClassifier(algorithm='SAMME.R',random_state=21)
rusboost_cv = GridSearchCV(rusboost, param_grid, cv=5)
rusboost_cv.fit(X_train, y_train)
rusboost_cv.best_params_
#rusboost = RUSBoostClassifier(n_estimators=50, algorithm='SAMME.R',random_state=0)
#rusboost.fit(X_train, y_train)  
#y_pred = rusboost.predict(X_test)
#balanced_accuracy_score(y_test, y_pred)  

{'n_estimators': 73}

In [376]:
y_pred = rusboost_cv.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))  
print(confusion_matrix(y_pred,y_test))

0.8423154674585853
[[1220   22]
 [ 251  130]]


In [377]:
print(sensitivity_specificity_support(y_test,y_pred))

(array([0.82936778, 0.85526316]), array([0.85526316, 0.82936778]), array([1471,  152]))


Boosting has 86% sensitivity and 83% specificity.

Try Adaboosting.

In [378]:
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(random_state=21)
eec.fit(X_train, y_train) 
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)  

0.8561419907689005

In [379]:
print(confusion_matrix(y_pred,y_test))

[[1251   21]
 [ 220  131]]


In [380]:
print(sensitivity_specificity_support(y_test,y_pred))

(array([0.85044188, 0.86184211]), array([0.86184211, 0.85044188]), array([1471,  152]))


Adaboosting has the 86% sensitivity and 85% specificity. 

<h3> Predicting the test data

In [381]:
#import the test datasets: there are four, one with provider information, one with beneficiary data, and two with claim data. 
fraud_tx = pd.read_csv('/Users/zhengzhang/Downloads/healthcare-provider-fraud-detection-analysis/Test-1542969243754.csv')
bene_tx = pd.read_csv('/Users/zhengzhang/Downloads/healthcare-provider-fraud-detection-analysis/Test_Beneficiarydata-1542969243754.csv')
inpatient_tx = pd.read_csv('/Users/zhengzhang/Downloads/healthcare-provider-fraud-detection-analysis/Test_Inpatientdata-1542969243754.csv')
outpatient_tx = pd.read_csv('/Users/zhengzhang/Downloads/healthcare-provider-fraud-detection-analysis/Test_Outpatientdata-1542969243754.csv')

In [382]:
#Create the test dataset for prediction
In_Provider=pd.DataFrame([inpatient_tx['Provider'].value_counts().index,inpatient_tx['Provider'].value_counts()]).transpose().rename({0:'Provider',1:'Inpatient_Claim_Number'},axis=1)
Out_Provider=pd.DataFrame([outpatient_tx['Provider'].value_counts().index,outpatient_tx['Provider'].value_counts()]).transpose().rename({0:'Provider',1:'Outpatient_Claim_Number'},axis=1)
provider_claimnumber=pd.merge(In_Provider,Out_Provider,on='Provider',how='outer')
inreimburse_by_provider=inpatient_tx.groupby('Provider')['InscClaimAmtReimbursed'].sum()
inreimburse_temp1=pd.DataFrame([inreimburse_by_provider.index,inreimburse_by_provider])
inreimburse_temp2=inreimburse_temp1.transpose()
In_Provider_Claim=inreimburse_temp2.rename({0:'Provider',1:'Inpatient_Claim_Amount'},axis=1)
outreimburse_by_provider=outpatient_tx.groupby('Provider')['InscClaimAmtReimbursed'].sum()
outreimburse_temp1=pd.DataFrame([outreimburse_by_provider.index,outreimburse_by_provider])
outreimburse_temp2=outreimburse_temp1.transpose()
Out_Provider_Claim=outreimburse_temp2.rename({0:'Provider',1:'Outpatient_Claim_Amount'},axis=1)
provider_claimamount=pd.merge(In_Provider_Claim,Out_Provider_Claim,on='Provider',how='outer')
provider_list1=pd.merge(fraud_tx,provider_claimnumber,on='Provider',how='outer')
provider_list2=pd.merge(provider_list1,provider_claimamount,on='Provider',how='outer')
provider_clean=provider_list2.fillna(0)
provider_clean
X = provider_clean.drop(['Provider'], axis=1).values

In [383]:
# AdaBoosting
eec_pred = eec.predict(X)
pd.value_counts(eec_pred)

No     1070
Yes     283
dtype: int64

In [384]:
# Rusboosting
rus_pred = rusboost_cv.predict(X)
pd.value_counts(rus_pred)

No     1047
Yes     306
dtype: int64

In [385]:
# Balanced Bagging Classifier
bbc_pred = bbc.predict(X)
pd.value_counts(bbc_pred)

No     1122
Yes     231
dtype: int64

In [386]:
# Balanced random forest
brf_pred = brf_cv.predict(X)
pd.value_counts(brf_pred)

No     1040
Yes     313
dtype: int64

In [387]:
# Logistic regression with oversampling
logreg_pred = logreg_cv.predict(X)
pd.value_counts(logreg_pred)

No     1137
Yes     216
dtype: int64

In [388]:
# Random Forest has a lot more positive cases than the logistic regression method. 
pd.crosstab(logreg_pred,brf_pred)

col_0,No,Yes
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
No,1039,98
Yes,1,215


In [389]:
# Balanced bagging disagree with logistic regression a lot.
pd.crosstab(logreg_pred,bbc_pred)

col_0,No,Yes
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
No,1085,52
Yes,37,179


In [390]:
# Adaboosting has a lot more positive cases than the logistic regression method. 
pd.crosstab(logreg_pred,eec_pred)

col_0,No,Yes
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
No,1067,70
Yes,3,213


In [391]:
# Rusboosting disagree with logistic regression a lot.
pd.crosstab(logreg_pred,rus_pred)

col_0,No,Yes
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
No,1024,113
Yes,23,193
