### 1. Read in the enriched Part B data for years 2013 and 2014

In [1]:
import pandas as pd
import numpy as np
import os 
from dateutil.relativedelta import relativedelta
import warnings
warnings.filterwarnings('ignore')
# Directory two levels above the current working directory; the dataset
# folder is addressed relative to it.
parent = os.path.dirname(os.path.dirname(os.getcwd()))

# Load the combined, labelled claim data covering 2013 and 2014.
df = pd.read_csv(parent + '/CMS_datasets/data/year2013_2014_combined_with_labels.csv')

# Same preprocessing as the unsupervised-learning minibatch code:
# every missing value is replaced with 0.
df = df.fillna(0)
print(df.shape)

(1847558, 89)


### 2. Split the data into a training set (used for cross-validation) and a test set (holdout)

In [10]:
import random

# Fixed seed so the provider-level holdout sample is reproducible.
random.seed(42)
holdout_rate = 0.1

# Split by provider id (Rndrng_NPI) rather than by row, so that all rows of
# one provider end up on the same side of the split.
uniq_id = df.Rndrng_NPI.unique()
holdout_set = random.sample(list(uniq_id), int(len(uniq_id)*holdout_rate))
train_set = list(set(uniq_id) - set(holdout_set))

train = df[df['Rndrng_NPI'].isin(train_set)]
holdout = df[df['Rndrng_NPI'].isin(holdout_set)]

# Columns removed from the feature matrices: the label itself plus the
# identifier / descriptive columns listed here.
ctodrop = ['Fraud_Indicator','Rndrng_Prvdr_Mdcr_Prtcptg_Ind','Rndrng_NPI','Rndrng_Prvdr_Gndr','Rndrng_Prvdr_Type','YEAR']

train_X, train_y = train.drop(columns=ctodrop), train['Fraud_Indicator']
test_X, test_y = holdout.drop(columns=ctodrop), holdout['Fraud_Indicator']

### 3. Build a pipeline with dimensionality reduction by IncrementalPCA

In [11]:
from sklearn.decomposition import TruncatedSVD ,IncrementalPCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, f1_score

# define pipeline
def smote_MiniBatchPCA(scoring=None, oversample_rate=None, undersample_rate=None, n_comp=3):
    """Cross-validate, fit and holdout-score a resampling + PCA + logistic-regression pipeline.

    The pipeline is assembled in this order (resampling steps are optional):
    SMOTE oversampling -> random undersampling -> IncrementalPCA ->
    class-weight-balanced LogisticRegression.

    The function reads the module-level ``train_X``, ``train_y``, ``test_X``
    and ``test_y``: cross-validation and the final fit use the training data,
    and the returned holdout score is computed on the test data.

    Parameters
    ----------
    scoring : str
        ``'roc_auc'`` scores with ROC AUC (cross-validation scorer
        ``'roc_auc'``; holdout via ``roc_auc_score`` on the positive-class
        probability). ``'binary'`` scores with the ``'f1'`` scorer and
        ``f1_score(average='binary')``. Any other value is used as the F1
        averaging mode: scorer ``f'f1_{scoring}'`` and
        ``f1_score(average=scoring)``.
        NOTE(review): the default ``None`` yields the scorer name
        ``'f1_None'``, which is presumably rejected by scikit-learn - pass an
        explicit value.
    oversample_rate : float or None
        ``sampling_strategy`` handed to ``SMOTE``; ``None`` skips oversampling.
    undersample_rate : float or None
        ``sampling_strategy`` handed to ``RandomUnderSampler``; ``None`` skips
        undersampling.
    n_comp : int
        Number of components kept by ``IncrementalPCA``.

    Returns
    -------
    tuple
        ``(cv_mean, cv_std, holdout_score, fitted_pipe)``: NaN-ignoring mean
        (rounded to 4 places) and standard deviation (rounded to 5 places) of
        the 5-fold stratified cross-validation scores, the holdout score
        (rounded to 4 places), and the pipeline fitted on the full training
        data.
    """
    # Assemble the steps incrementally; each resampler is added only when
    # its rate was supplied.
    steps = []
    if oversample_rate is not None:
        steps.append(('over', SMOTE(sampling_strategy=oversample_rate, random_state=42)))
    if undersample_rate is not None:
        steps.append(('under', RandomUnderSampler(sampling_strategy=undersample_rate, random_state=42)))
    steps.append(('pca', IncrementalPCA(n_components=n_comp, batch_size=500)))
    steps.append(('model', LogisticRegression(random_state=42, class_weight='balanced')))
    pipe = Pipeline(steps=steps)

    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

    # Translate the requested metric into a scikit-learn scorer name.
    use_auc = scoring == 'roc_auc'
    if use_auc:
        cv_scoring = 'roc_auc'
    elif scoring == 'binary':
        cv_scoring = 'f1'
    else:
        cv_scoring = f'f1_{scoring}'

    scores = cross_val_score(pipe, train_X, train_y, scoring=cv_scoring, cv=skf, n_jobs=-1)

    # Refit on the whole training set before scoring the holdout.
    fitted_pipe = pipe.fit(train_X, train_y)

    if use_auc:
        # ROC AUC needs the positive-class probability, not the hard label.
        holdout_score = roc_auc_score(test_y, fitted_pipe.predict_proba(test_X)[:, 1])
    else:
        holdout_score = f1_score(test_y, fitted_pipe.predict(test_X), average=scoring)

    return round(np.nanmean(scores), 4), round(np.nanstd(scores), 5), round(holdout_score, 4), fitted_pipe

In [12]:
# SMOTE only: oversample the minority class up to a 1:1 ratio with the
# majority class. The first three returned values are the scores; the last
# one is the fitted pipeline.
logreg_3 = smote_MiniBatchPCA(scoring='roc_auc', oversample_rate=1, undersample_rate=None)
fitted_pipe3 = logreg_3[-1]
print(logreg_3[:-1])

(0.6302, 0.00556, 0.6965)


In [13]:
# SMOTE only: oversample the minority class to 1/2 of the majority class size.
logreg_2 = smote_MiniBatchPCA(scoring='roc_auc', oversample_rate=0.5, undersample_rate=None)
fitted_pipe2 = logreg_2[-1]
print(logreg_2[:-1])

(0.6373, 0.00156, 0.6964)


In [14]:
# SMOTE only: oversample the minority class to 1/10 of the majority class size.
logreg_1 = smote_MiniBatchPCA(scoring='roc_auc', oversample_rate=0.1, undersample_rate=None)
fitted_pipe1 = logreg_1[-1]
print(logreg_1[:-1])

(0.6469, 0.00709, 0.6953)


In [15]:
# Baseline: neither oversampling nor undersampling (PCA + model only).
logreg_4 = smote_MiniBatchPCA(scoring='roc_auc', oversample_rate=None, undersample_rate=None)
fitted_pipe4 = logreg_4[-1]
print(logreg_4[:-1])

(0.6391, 0.01077, 0.6917)


In [16]:
# Combined resampling:
#   1. oversample the minority class to 1/10 of the majority class size;
#   2. then undersample the majority class so that, after resampling, the
#      minority:majority sample ratio is 1/2.
logreg_0 = smote_MiniBatchPCA(scoring='roc_auc', oversample_rate=0.1, undersample_rate=0.5)
fitted_pipe0 = logreg_0[-1]
print(logreg_0[:-1])

(0.6414, 0.01394, 0.6968)


In [17]:
# Combined resampling:
#   1. oversample the minority class to 1/10 of the majority class size;
#   2. then undersample the majority class so that, after resampling, the
#      minority:majority sample ratio is 1/10.
logreg_5 = smote_MiniBatchPCA(scoring='roc_auc', oversample_rate=0.1, undersample_rate=0.1)
fitted_pipe5 = logreg_5[-1]
print(logreg_5[:-1])

(0.6402, 0.00419, 0.6954)


In [18]:
# Combined resampling:
#   1. oversample the minority class to 1/10 of the majority class size;
#   2. then undersample the majority class so that, after resampling, the
#      minority:majority sample ratio is 1/4.
logreg_6 = smote_MiniBatchPCA(scoring='roc_auc', oversample_rate=0.1, undersample_rate=0.25)
fitted_pipe6 = logreg_6[-1]
print(logreg_6[:-1])

(0.6472, 0.00724, 0.6967)


In [19]:
# Undersampling only: reduce the majority class until the
# majority:minority ratio is 10:1.
logreg_7 = smote_MiniBatchPCA(scoring='roc_auc', oversample_rate=None, undersample_rate=0.1)
fitted_pipe7 = logreg_7[-1]
print(logreg_7[:-1])

(0.6361, 0.01206, 0.6872)


In [20]:
# Undersampling only: reduce the majority class until the
# majority:minority ratio is 4:1.
logreg_8 = smote_MiniBatchPCA(scoring='roc_auc', oversample_rate=None, undersample_rate=0.25)
fitted_pipe8 = logreg_8[-1]
print(logreg_8[:-1])

(0.635, 0.01319, 0.6854)


In [21]:
# Undersampling only: reduce the majority class until the
# majority:minority ratio is 2:1.
logreg_9 = smote_MiniBatchPCA(scoring='roc_auc', oversample_rate=None, undersample_rate=0.5)
fitted_pipe9 = logreg_9[-1]
print(logreg_9[:-1])

(0.636, 0.01265, 0.6877)


In [22]:
# Undersampling only: reduce the majority class until the
# majority:minority ratio is 1:1.
logreg_10 = smote_MiniBatchPCA(scoring='roc_auc', oversample_rate=None, undersample_rate=1)
fitted_pipe10 = logreg_10[-1]
print(logreg_10[:-1])

(0.6416, 0.00672, 0.6876)


### 5. Additional evaluation with 2015 data, using only NPIs that are not in the training set

In [None]:
# Load the full multi-year dataset and keep only the rows collected for 2015.
dfx = pd.read_csv(parent + '/CMS_datasets_2020-2021/data/year2013_to_2021_combined_with_labels.csv')
dfx = dfx.loc[dfx['YEAR'] == 2015]
print(dfx.shape)

# Restrict the evaluation set to providers whose NPI is not in the
# training set.
eval_2015 = list(set(dfx.Rndrng_NPI.unique()) - set(train_set))
evalset_2015 = dfx[dfx['Rndrng_NPI'].isin(eval_2015)]
print(evalset_2015.shape)

# NOTE(review): only the id and the label are dropped here, whereas the
# training features were built by dropping every column in `ctodrop`; nulls
# are also not filled with 0 in this cell. Confirm that the feature columns
# are aligned with train_X before scoring with a fitted pipeline.
eval_X = evalset_2015.drop(columns=['Rndrng_NPI', 'Fraud_Indicator'])
eval_y = evalset_2015['Fraud_Indicator']