### 1. Read in the enriched PartB data for year 2013 & 2014

In [1]:
import pandas as pd
import numpy as np
import os 
from dateutil.relativedelta import relativedelta
import warnings
warnings.filterwarnings('ignore')
parent = os.path.dirname(os.path.dirname(os.getcwd()))

# read in the claim data with year from 2013 to 2014
df = pd.read_csv(parent + '/CMS_datasets/data/year2013_2014_combined_with_labels.csv')

# fill nulls with 0
df.fillna(0, inplace=True)
print(df.shape)
sum(df[df.Fraud_Indicator ==1].groupby('Rndrng_NPI').size()==2)

# assign categorical features to the numeric code for each category
df['Rndrng_Prvdr_Type'] = df['Rndrng_Prvdr_Type'].astype('category').cat.codes
df['Rndrng_Prvdr_Gndr'] = df['Rndrng_Prvdr_Gndr'].astype('category').cat.codes
df['Rndrng_Prvdr_Mdcr_Prtcptg_Ind'] = df['Rndrng_Prvdr_Mdcr_Prtcptg_Ind'].astype('category').cat.codes

(1847558, 89)


### 2. Split the data to a train set (used for cross validation) and a test set (holdout)

In [2]:
import random

# split out a holdout set for final evaluation, using train_X and train_y for train set evaluation and model tuning
random.seed(42)
holdout_rate = 0.1

uniq_id = df.Rndrng_NPI.unique()
holdout_set = random.sample(list(uniq_id), int(len(uniq_id)*holdout_rate))
train_set = list(set(uniq_id) - set(holdout_set))

train=df.query("`Rndrng_NPI` in @train_set")
holdout=df.query("`Rndrng_NPI` in @holdout_set")

train_X = train.drop(columns=['Rndrng_NPI', 'Fraud_Indicator','YEAR'])
train_y = train.Fraud_Indicator

test_X = holdout.drop(columns=['Rndrng_NPI', 'Fraud_Indicator','YEAR'])
test_y = holdout.Fraud_Indicator

char_feat = ['Rndrng_Prvdr_Type', 'Rndrng_Prvdr_Gndr', 'Rndrng_Prvdr_Mdcr_Prtcptg_Ind']
num_feat = list(set(train_X.columns) - set(char_feat))

### 3. build up a pipeline including evaluation

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, f1_score

# define pipeline
def smote_choose(scoring= None, oversample_rate = None, undersample_rate= None):
    model = LogisticRegression(random_state=42, class_weight='balanced')

    cleaner = ColumnTransformer(
        transformers=[            
            ('scaling', StandardScaler(), num_feat),
            ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'), char_feat)], remainder='drop')
    
    if (oversample_rate != None) & (undersample_rate != None):
        over = SMOTE(sampling_strategy=oversample_rate, random_state=42)
        under = RandomUnderSampler(sampling_strategy=undersample_rate, random_state=42)
        steps = [('over', over), 
                 ('under', under), 
                 ("cleaner", cleaner),
                 ('model', model)]
        
    elif (oversample_rate != None) & (undersample_rate == None):
        over = SMOTE(sampling_strategy=oversample_rate, random_state=42)
        steps = [('over', over),
                 ("cleaner", cleaner),
                 ('model', model)]    
        
    elif (oversample_rate== None) & (undersample_rate != None):
        under = RandomUnderSampler(sampling_strategy=undersample_rate, random_state=42)
        steps = [('under', under), 
                 ("cleaner", cleaner),
                 ('model', model)]     
        
    elif (oversample_rate== None) & (undersample_rate == None):
        steps = [("cleaner", cleaner),
                 ('model', model)]
        
    pipe = Pipeline(steps=steps)


    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

    if scoring == 'roc_auc':
        scores = cross_val_score(pipe, train_X, train_y, scoring = scoring, cv=skf, n_jobs=-1)
        fitted_pipe=pipe.fit(train_X, train_y)
        val_auc = roc_auc_score(test_y, fitted_pipe.predict_proba(test_X)[:, 1])
        return round(np.nanmean(scores), 4), round(np.nanstd(scores), 5), round(val_auc, 4), fitted_pipe
    
    if scoring != 'roc_auc':
        if scoring == 'binary':
            scores = cross_val_score(pipe, train_X, train_y, scoring = 'f1', cv=skf, n_jobs=-1)
        else:
            scores = cross_val_score(pipe, train_X, train_y, scoring = f'f1_{scoring}', cv=skf, n_jobs=-1)
        fitted_pipe=pipe.fit(train_X, train_y)
        f1 = f1_score(test_y, fitted_pipe.predict(test_X), average= scoring)
        return round(np.nanmean(scores), 4), round(np.nanstd(scores), 5), round(f1, 4), fitted_pipe

### 4. Output the roc_auc score - (Train set mean score, Train set score standard deviation, Test score)

In [17]:
# oversample the minority labels to the same amount (1:1) of the majority labels
logreg_3 = smote_choose('roc_auc', 1, None)
print(logreg_3[:3])
fitted_pipe3 = logreg_3[3]

(0.7885, 0.00814, 0.7503)


In [18]:
# oversample the minority to 1/2 of the majority labels
logreg_2 = smote_choose('roc_auc', 0.5, None)
print(logreg_2[:3])
fitted_pipe2 = logreg_2[3]

(0.7935, 0.00802, 0.7506)


In [19]:
# oversample the minority to 1/10 of the majority labels
logreg_1 = smote_choose('roc_auc', 0.1, None)
print(logreg_1[:3])
fitted_pipe1 = logreg_1[3]

(0.7892, 0.00459, 0.7536)


In [27]:
# No oversampling nor undersampling
logreg_4 = smote_choose('roc_auc', None, None)
print(logreg_4[:3])
fitted_pipe4 = logreg_4[3]

(0.8366, 0.00681, 0.814)


In [28]:
# First oversample to 1/10 of the majority labels, then undersample the majority label to  
# THen setting desired ratio (1/2) of the number of samples in the minority class 
# over the number of samples in the majority class after resampling
logreg_0 = smote_choose('roc_auc', 0.1, 0.5)
print(logreg_0[:3])
fitted_pipe0 = logreg_0[3]

(0.7894, 0.00771, 0.7565)


In [20]:
# First oversample to 1/10 of the majority labels, then undersample the majority label to  
# THen setting desired ratio (1/10) of the number of samples in the minority class 
# over the number of samples in the majority class after resampling
logreg_5 = smote_choose('roc_auc', 0.1, 0.1)
print(logreg_5[:3])
fitted_pipe5 = logreg_5[3]

(0.7892, 0.00836, 0.7533)


In [30]:
# First oversample to 1/10 of the majority labels, then undersample the majority label to  
# THen setting desired ratio (1/4) of the number of samples in the minority class 
# over the number of samples in the majority class after resampling
logreg_6 = smote_choose('roc_auc', 0.1, 0.25)
print(logreg_6[:3])
fitted_pipe6 = logreg_6[3]

(0.7905, 0.0084, 0.7539)


In [31]:
# undersample the majority/minority labels to ratio 10:1
logreg_7 = smote_choose('roc_auc', None, 0.1)
print(logreg_7[:3])
fitted_pipe7 = logreg_7[3]

(0.8377, 0.00667, 0.8112)


In [32]:
# undersample the majority/minority labels to ratio 4:1
logreg_8 = smote_choose('roc_auc', None, 0.25)
print(logreg_8[:3])
fitted_pipe8 = logreg_8[3]

# logreg_8 = smote_choose('binary', None, 0.25)
# print(logreg_8[:2])
# fitted_pipe81 = logreg_8[2]

(0.8379, 0.00752, 0.8169)


In [33]:
# undersample the majority/minority labels to ratio 2:1
logreg_9 = smote_choose('roc_auc', None, 0.5)
print(logreg_9[:3])
fitted_pipe9 = logreg_9[3]

# logreg_9 = smote_choose('binary', None, 0.5)
# print(logreg_9[:2])
# fitted_pipe91 = logreg_9[2]

(0.8364, 0.00687, 0.8174)


In [34]:
# undersample the majority/minority labels to ratio 1:1
logreg_10 = smote_choose('roc_auc', None, 1)
print(logreg_10[:3])
fitted_pipe10 = logreg_10[3]

(0.8336, 0.00678, 0.8117)


### 5. Addtional evaluation with 2015 data, in which the NPI not in training set

In [17]:
# read in the full dataset, keeping the data collect for year 2015 only
dfx = pd.read_csv(parent + '/CMS_datasets_2020-2021/data/year2013_to_2021_combined_with_labels.csv')
dfx = dfx[dfx.YEAR == 2015]
dfx.fillna(0, inplace=True)

# assign categorical features to the numeric code for each category
dfx['Rndrng_Prvdr_Type'] = dfx['Rndrng_Prvdr_Type'].astype('category').cat.codes
dfx['Rndrng_Prvdr_Gndr'] = dfx['Rndrng_Prvdr_Gndr'].astype('category').cat.codes
dfx['Rndrng_Prvdr_Mdcr_Prtcptg_Ind'] = dfx['Rndrng_Prvdr_Mdcr_Prtcptg_Ind'].astype('category').cat.codes
print(dfx.shape)

eval_2015 = list(set(dfx.Rndrng_NPI.unique()) - set(train_set))
evalset_2015 = dfx.query("`Rndrng_NPI` in @eval_2015")
print(evalset_2015.shape)
eval_X = evalset_2015.drop(columns=['Rndrng_NPI', 'Fraud_Indicator', "YEAR"])
eval_y = evalset_2015.Fraud_Indicator

(968203, 89)
(173127, 89)


In [18]:
resample_dict = {
    0: '0.1 over /0.5 under',
    1: '1/10 oversampling',
    2: '1/2 oversampling',
    3: '1/1 oversampling',
    4: 'None / None (No resampling)',
    5: '0.1 over/0.1 under',
    6: '0.1 over/ 0.25 under',
    7: 'undersampling 10:1',
    8: 'undersampling  4:1',
    9: 'undersampling  2:1',
    10: 'undersampling  1:1'}

for i in range(11):
    _fitted_pipe = eval(f'fitted_pipe{i}')
    eval_score = roc_auc_score(eval_y, _fitted_pipe.predict_proba(eval_X)[:, 1])
    print(resample_dict[i], ': ', eval_score)

0.1 over /0.5 under :  0.7799114207968557
1/10 oversampling :  0.7813499226221817
1/2 oversampling :  0.7757947845530115
1/1 oversampling :  0.7760327805516417
None / None (No resampling) :  0.8463308971780499
0.1 over/0.1 under :  0.7809293638056789
0.1 over/ 0.25 under :  0.780156405258741
undersampling 10:1 :  0.8451180968326647
undersampling  4:1 :  0.8464673447661717
undersampling  2:1 :  0.8430098672787296
undersampling  1:1 :  0.8441423434353439
