### 1. Read in the enriched PartB data for year 2013 & 2014, only keeping the features in original PartB

In [1]:
import pandas as pd
import numpy as np
import os 
from dateutil.relativedelta import relativedelta
import warnings
warnings.filterwarnings('ignore')
parent = os.path.dirname(os.getcwd())
df = pd.read_csv(parent + '/data/year2013_2014_combined_with_labels.csv')
print(df.shape)

# only keep the features collected from PartB
partB_feat = ['Rndrng_NPI', 'Rndrng_Prvdr_Type', 'Rndrng_Prvdr_Gndr',
       'Tot_Srvcs_sum', 'Tot_Srvcs_mean', 'Tot_Srvcs_median',
       'Tot_Srvcs_std', 'Tot_Srvcs_min', 'Tot_Srvcs_max', 'Tot_Benes_sum',
       'Tot_Benes_mean', 'Tot_Benes_median', 'Tot_Benes_std', 'Tot_Benes_min',
       'Tot_Benes_max', 'Tot_Bene_Day_Srvcs_sum', 'Tot_Bene_Day_Srvcs_mean',
       'Tot_Bene_Day_Srvcs_median', 'Tot_Bene_Day_Srvcs_std',
       'Tot_Bene_Day_Srvcs_min', 'Tot_Bene_Day_Srvcs_max',
       'Avg_Sbmtd_Chrg_sum', 'Avg_Sbmtd_Chrg_mean', 'Avg_Sbmtd_Chrg_median',
       'Avg_Sbmtd_Chrg_std', 'Avg_Sbmtd_Chrg_min', 'Avg_Sbmtd_Chrg_max',
       'Avg_Mdcr_Pymt_Amt_sum', 'Avg_Mdcr_Pymt_Amt_mean',
       'Avg_Mdcr_Pymt_Amt_median', 'Avg_Mdcr_Pymt_Amt_std',
       'Avg_Mdcr_Pymt_Amt_min', 'Avg_Mdcr_Pymt_Amt_max',
       'Avg_Mdcr_Stdzd_Amt_sum', 'Avg_Mdcr_Stdzd_Amt_mean',
       'Avg_Mdcr_Stdzd_Amt_median', 'Avg_Mdcr_Stdzd_Amt_std',
       'Avg_Mdcr_Stdzd_Amt_min', 'Avg_Mdcr_Stdzd_Amt_max', 'Fraud_Indicator']
df = df[partB_feat]

# Convert the categorical features to numeric code
df['Rndrng_Prvdr_Type'] = df['Rndrng_Prvdr_Type'].astype('category').cat.codes
df['Rndrng_Prvdr_Gndr'] = df['Rndrng_Prvdr_Gndr'].astype('category').cat.codes
print(df.shape)

(1847558, 89)
(1847558, 40)


### 2. Split the data to a train set (used for cross validation) and a test set (holdout)

In [2]:
import random

random.seed(42)
holdout_rate = 0.1

# get a holdout set by unique NPI, preventing from data leakage
uniq_id = df.Rndrng_NPI.unique()
holdout_set = random.sample(list(uniq_id), int(len(uniq_id)*holdout_rate))
train_set = list(set(uniq_id) - set(holdout_set))

train=df.query("`Rndrng_NPI` in @train_set")
holdout=df.query("`Rndrng_NPI` in @holdout_set")

train_X = train.drop(columns=['Rndrng_NPI', 'Fraud_Indicator'])
train_y = train.Fraud_Indicator

test_X = holdout.drop(columns=['Rndrng_NPI', 'Fraud_Indicator'])
test_y = holdout.Fraud_Indicator

char_feat = ['Rndrng_Prvdr_Type', 'Rndrng_Prvdr_Gndr']
num_feat = list(set(train_X.columns) - set(char_feat))

### 3. build up a pipeline including evaluation

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, f1_score

# define pipeline
def smote_choose(scoring= None, oversample_rate = None, undersample_rate= None):
    model = LogisticRegression(random_state=42, class_weight='balanced')

    cleaner = ColumnTransformer(
        transformers=[            
            ('scaling', StandardScaler(), num_feat),
            ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'), char_feat)], remainder='drop')
    
    if (oversample_rate != None) & (undersample_rate != None):
        over = SMOTE(sampling_strategy=oversample_rate, random_state=42)
        under = RandomUnderSampler(sampling_strategy=undersample_rate, random_state=42)
        steps = [('over', over), 
                 ('under', under), 
                 ("cleaner", cleaner),
                 ('model', model)]
        
    elif (oversample_rate != None) & (undersample_rate == None):
        over = SMOTE(sampling_strategy=oversample_rate, random_state=42)
        steps = [('over', over),
                 ("cleaner", cleaner),
                 ('model', model)]    
        
    elif (oversample_rate== None) & (undersample_rate != None):
        under = RandomUnderSampler(sampling_strategy=undersample_rate, random_state=42)
        steps = [('under', under), 
                 ("cleaner", cleaner),
                 ('model', model)]     
        
    elif (oversample_rate== None) & (undersample_rate == None):
        steps = [("cleaner", cleaner),
                 ('model', model)]
        
    pipe = Pipeline(steps=steps)


    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

    if scoring == 'roc_auc':
        scores = cross_val_score(pipe, train_X, train_y, scoring = scoring, cv=skf, n_jobs=-1)
        fitted_pipe=pipe.fit(train_X, train_y)
        val_auc = roc_auc_score(test_y, fitted_pipe.predict_proba(test_X)[:, 1])
        return round(np.nanmean(scores), 4), round(np.nanstd(scores), 5), round(val_auc, 4), fitted_pipe
    
    if scoring != 'roc_auc':
        if scoring == 'binary':
            scores = cross_val_score(pipe, train_X, train_y, scoring = 'f1', cv=skf, n_jobs=-1)
        else:
            scores = cross_val_score(pipe, train_X, train_y, scoring = f'f1_{scoring}', cv=skf, n_jobs=-1)
        fitted_pipe=pipe.fit(train_X, train_y)
        f1 = f1_score(test_y, fitted_pipe.predict(test_X), average= scoring)
        return round(np.nanmean(scores), 4), round(np.nanstd(scores), 5), round(f1, 4), fitted_pipe

### 4. Output the roc_auc score - (Train set mean score, Train set score standard deviation, Test score)

In [51]:
# First oversample to 1/10 of the majority labels, then undersample the majority label to  
# THen setting desired ratio (1/2) of the number of samples in the minority class 
# over the number of samples in the majority class after resampling
logreg_0 = smote_choose('roc_auc', 0.1, 0.5)
print(logreg_0[:3])
fitted_pipe0 = logreg_0[3]

(0.7069, 0.01093, 0.7094)


In [52]:
# oversample the minority to 1/10 of the majority labels
logreg_1 = smote_choose('roc_auc', 0.1, None)
print(logreg_1[:3])
fitted_pipe1 = logreg_1[3]

(0.7065, 0.01021, 0.7096)


In [13]:
# oversample the minority to 1/2 of the majority labels
logreg_2 = smote_choose('roc_auc', 0.5, None)
print(logreg_2[:3])
fitted_pipe2 = logreg_2[3]

(0.7057, 0.01058, 0.707)


In [14]:
# oversample the minority labels to the same amount (1:1) of the majority labels
logreg_3 = smote_choose('roc_auc', 1, None)
print(logreg_3[:3])
fitted_pipe3 = logreg_3[3]

(0.7048, 0.01065, 0.7065)


In [55]:
# No oversampling nor undersampling
logreg_4 = smote_choose('roc_auc', None, None)
print(logreg_4[:3])
fitted_pipe4 = logreg_4[3]

(0.7796, 0.0116, 0.7809)


In [56]:
# First oversample to 1/10 of the majority labels, then undersample the majority label to  
# THen setting desired ratio (1/10) of the number of samples in the minority class 
# over the number of samples in the majority class after resampling
logreg_5 = smote_choose('roc_auc', 0.1, 0.1)
print(logreg_5[:3])
fitted_pipe5 = logreg_5[3]

(0.7065, 0.01028, 0.7096)


In [57]:
# First oversample to 1/10 of the majority labels, then undersample the majority label to  
# THen setting desired ratio (1/4) of the number of samples in the minority class 
# over the number of samples in the majority class after resampling
logreg_6 = smote_choose('roc_auc', 0.1, 0.25)
print(logreg_6[:3])
fitted_pipe6 = logreg_6[3]

(0.7071, 0.01074, 0.7087)


In [58]:
# undersample the majority/minority labels to ratio 10:1
logreg_7 = smote_choose('roc_auc', None, 0.1)
print(logreg_7[:3])
fitted_pipe7 = logreg_7[3]

(0.7818, 0.01045, 0.7855)


In [59]:
# undersample the majority/minority labels to ratio 4:1
logreg_8 = smote_choose('roc_auc', None, 0.25)
print(logreg_8[:3])
fitted_pipe8 = logreg_8[3]

(0.782, 0.00997, 0.7836)


In [60]:
# undersample the majority/minority labels to ratio 2:1
logreg_9 = smote_choose('roc_auc', None, 0.5)
print(logreg_9[:3])
fitted_pipe9 = logreg_9[3]

(0.7804, 0.01061, 0.783)


In [15]:
# undersample the majority/minority labels to ratio 1:1
logreg_10 = smote_choose('roc_auc', None, 1)
print(logreg_10[:3])
fitted_pipe10 = logreg_10[3]

(0.7799, 0.01023, 0.778)
