# Data Wrangling 

In [362]:
import numpy as np
import pandas as pd
import datetime
from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 150)

from itertools import cycle
from collections import Counter

In [363]:
# Dataset available here: https://www.kaggle.com/rohitrox/healthcare-provider-fraud-detection-analysis

### Various dataframes:

In [364]:
# Directory holding the raw Kaggle CSVs. Edit this single line to point the
# notebook at a different copy of the data.
DATA_DIR = '/Users/Julia/Documents/bootcamp/fraud_capstone/datasets'


def _read_raw(file_name):
    """Read one raw CSV from DATA_DIR.

    low_memory=False makes pandas read each file in one pass, so a column's
    dtype is inferred from all of its values rather than chunk by chunk.
    """
    return pd.read_csv(DATA_DIR + '/' + file_name, low_memory=False)


test_provider_data = _read_raw('Test.csv')
test_bene_data = _read_raw('Test_Beneficiarydata.csv')
test_inp_data = _read_raw('Test_Inpatientdata.csv')
test_outp_data = _read_raw('Test_Outpatientdata.csv')

train_provider_data = _read_raw('Train.csv')
train_bene_data = _read_raw('Train_Beneficiarydata.csv')
train_inp_data = _read_raw('Train_Inpatientdata.csv')
train_outp_data = _read_raw('Train_Outpatientdata.csv')

In [365]:
# Report the size of the provider test table, then preview it.
n_test_providers = len(test_provider_data)
print("There are %.0f observations in the provider test data." % n_test_providers)
test_provider_data.head()

There are 1353 observations in the provider test data.


Unnamed: 0,Provider
0,PRV51002
1,PRV51006
2,PRV51009
3,PRV51010
4,PRV51018


In [366]:
# Report the size of the provider training table, then preview it.
n_train_providers = len(train_provider_data)
print("There are %.0f observations in the training provider data." % n_train_providers)
train_provider_data.head()

There are 5410 observations in the training provider data.


Unnamed: 0,Provider,PotentialFraud
0,PRV51001,No
1,PRV51003,Yes
2,PRV51004,No
3,PRV51005,Yes
4,PRV51007,No


In [367]:
# The test set does not list potential fraud.
# We will prepare the training set for study.
# We will separately apply the same steps to the test data.

In [368]:
# Preview the first rows of the raw test beneficiary table.
test_bene_data.head()

Unnamed: 0,BeneID,DOB,DOD,Gender,Race,RenalDiseaseIndicator,State,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt
0,BENE11001,1943-01-01,,1,1,0,39,230,12,12,1,2,1,2,2,1,1,1,2,1,1,36000,3204,60,70
1,BENE11007,1940-09-01,2009-12-01,1,2,0,45,610,12,12,1,1,2,2,2,2,1,2,1,1,2,0,0,1490,160
2,BENE11010,1936-07-01,,2,1,0,41,30,12,12,2,1,2,1,1,2,1,1,1,2,2,0,0,1170,660
3,BENE11011,1914-03-01,,2,2,0,1,360,12,12,2,1,1,2,2,1,1,2,2,1,1,5000,1068,250,320
4,BENE11014,1938-04-01,,2,1,Y,45,780,12,12,2,1,1,2,1,1,2,1,2,2,2,21260,2136,120,100


In [369]:
# Preview the first rows of the raw training beneficiary table.
train_bene_data.head()

Unnamed: 0,BeneID,DOB,DOD,Gender,Race,RenalDiseaseIndicator,State,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt
0,BENE11001,1943-01-01,,1,1,0,39,230,12,12,1,2,1,2,2,1,1,1,2,1,1,36000,3204,60,70
1,BENE11002,1936-09-01,,2,1,0,39,280,12,12,2,2,2,2,2,2,2,2,2,2,2,0,0,30,50
2,BENE11003,1936-08-01,,1,1,0,52,590,12,12,1,2,2,2,2,2,2,1,2,2,2,0,0,90,40
3,BENE11004,1922-07-01,,1,1,0,39,270,12,12,1,1,2,2,2,2,1,1,1,1,2,0,0,1810,760
4,BENE11005,1935-09-01,,1,1,0,24,680,12,12,2,2,2,2,1,2,1,2,2,2,2,0,0,1790,1200


### Missing data notes:
* ##### We will transform DOD and can remove claim procedure codes 4-6 from data
* Beneficiary data:
    * missing data for deaths
* Test Outpatient data:
  * higher-numbered Claim Diagnosis/Procedure Codes have more missing data
  * no information on Claim Procedure Codes 4-6, and barely any for 1-3
* Train Outpatient data:
  * no information on Claim Procedure Codes 5-6, and barely any for 1-4
* Test Inpatient data:
  * no information on Claim Procedure Code 6
  * less missing data than outpatient in general
* Train Inpatient data:
  * no information on Claim Procedure Code 6


In [370]:
def perc_nan(df):
    """Print, for each column of ``df``, the fraction of its rows that are null.

    Nothing is returned; the per-column fractions are only printed.
    """
    null_counts = df.isnull().sum(axis=0)
    n_rows = len(df)
    print(null_counts / n_rows)

# Print the null fraction of every column, table by table.
# test
for frame in (test_bene_data, test_provider_data, test_outp_data, test_inp_data):
    perc_nan(frame)

# train
for frame in (train_bene_data, train_provider_data, train_outp_data, train_inp_data):
    perc_nan(frame)


BeneID                             0.000000
DOB                                0.000000
DOD                                0.991027
Gender                             0.000000
Race                               0.000000
RenalDiseaseIndicator              0.000000
State                              0.000000
County                             0.000000
NoOfMonths_PartACov                0.000000
NoOfMonths_PartBCov                0.000000
ChronicCond_Alzheimer              0.000000
ChronicCond_Heartfailure           0.000000
ChronicCond_KidneyDisease          0.000000
ChronicCond_Cancer                 0.000000
ChronicCond_ObstrPulmonary         0.000000
ChronicCond_Depression             0.000000
ChronicCond_Diabetes               0.000000
ChronicCond_IschemicHeart          0.000000
ChronicCond_Osteoporasis           0.000000
ChronicCond_rheumatoidarthritis    0.000000
ChronicCond_stroke                 0.000000
IPAnnualReimbursementAmt           0.000000
IPAnnualDeductibleAmt           

In [371]:
# The four claim tables, in the order they are rebound below.
claims_dat = [train_outp_data, train_inp_data, test_outp_data, test_inp_data]
null_codes = ['ClmProcedureCode_6', 'ClmProcedureCode_5', 'ClmProcedureCode_4']

# Rebind each name to a copy without the procedure-code columns listed above.
# claims_dat itself keeps pointing at the original, un-dropped frames.
train_outp_data, train_inp_data, test_outp_data, test_inp_data = (
    frame.drop(columns=null_codes) for frame in claims_dat
)

In [372]:
# Preview the test inpatient claims after dropping ClmProcedureCode_4..6.
test_inp_data.head()

Unnamed: 0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,ClmAdmitDiagnosisCode,DeductibleAmtPaid,DischargeDt,DiagnosisGroupCode,ClmDiagnosisCode_1,ClmDiagnosisCode_2,ClmDiagnosisCode_3,ClmDiagnosisCode_4,ClmDiagnosisCode_5,ClmDiagnosisCode_6,ClmDiagnosisCode_7,ClmDiagnosisCode_8,ClmDiagnosisCode_9,ClmDiagnosisCode_10,ClmProcedureCode_1,ClmProcedureCode_2,ClmProcedureCode_3
0,BENE11014,CLM67387,2009-09-09,2009-09-16,PRV57070,9000,PHY317786,PHY427017,,2009-09-09,5789,1068.0,2009-09-16,332,5780,5533,496,V420,40390,2851,5990,570,41071.0,4280.0,4443.0,5849.0,
1,BENE11017,CLM31237,2008-12-25,2009-01-08,PRV54750,14000,PHY314656,PHY426644,,2008-12-25,5939,1068.0,2009-01-08,661,1889,41071,5990,5601,4588,5845,4549,29570,34831.0,,5551.0,,
2,BENE11026,CLM78930,2009-12-09,2009-12-13,PRV53758,2000,PHY349495,,,2009-12-09,4019,1068.0,2009-12-13,241,4010,78791,60000,41401,V1254,4372,78650,7813,4254.0,,,,
3,BENE11031,CLM56810,2009-06-23,2009-07-06,PRV55825,16000,PHY429538,PHY371893,,2009-06-23,8208,1068.0,2009-07-06,564,8208,4168,920,5990,40391,2859,4254,41400,5849.0,41401.0,8152.0,3320.0,
4,BENE11085,CLM34625,2009-01-20,2009-01-31,PRV52338,19000,PHY397161,,,2009-01-20,4279,1068.0,2009-01-31,880,29654,V142,78702,30503,V140,V4582,V6109,7242,,,,,


In [373]:
# Preview the first ten training inpatient claims after dropping ClmProcedureCode_4..6.
train_inp_data.head(10)

Unnamed: 0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,ClmAdmitDiagnosisCode,DeductibleAmtPaid,DischargeDt,DiagnosisGroupCode,ClmDiagnosisCode_1,ClmDiagnosisCode_2,ClmDiagnosisCode_3,ClmDiagnosisCode_4,ClmDiagnosisCode_5,ClmDiagnosisCode_6,ClmDiagnosisCode_7,ClmDiagnosisCode_8,ClmDiagnosisCode_9,ClmDiagnosisCode_10,ClmProcedureCode_1,ClmProcedureCode_2,ClmProcedureCode_3
0,BENE11001,CLM46614,2009-04-12,2009-04-18,PRV55912,26000,PHY390922,,,2009-04-12,7866,1068.0,2009-04-18,201,1970,4019,5853,7843.0,2768,71590.0,2724.0,19889.0,5849.0,,,,
1,BENE11001,CLM66048,2009-08-31,2009-09-02,PRV55907,5000,PHY318495,PHY318495,,2009-08-31,6186,1068.0,2009-09-02,750,6186,2948,56400,,,,,,,,7092.0,,
2,BENE11001,CLM68358,2009-09-17,2009-09-20,PRV56046,5000,PHY372395,,PHY324689,2009-09-17,29590,1068.0,2009-09-20,883,29623,30390,71690,34590.0,V1581,32723.0,,,,,,,
3,BENE11011,CLM38412,2009-02-14,2009-02-22,PRV52405,5000,PHY369659,PHY392961,PHY349768,2009-02-14,431,1068.0,2009-02-22,67,43491,2762,7843,32723.0,V1041,4254.0,25062.0,40390.0,4019.0,,331.0,,
4,BENE11014,CLM63689,2009-08-13,2009-08-30,PRV56614,10000,PHY379376,PHY398258,,2009-08-13,78321,1068.0,2009-08-30,975,42,3051,34400,5856.0,42732,486.0,5119.0,29620.0,20300.0,,3893.0,,
5,BENE11017,CLM70950,2009-10-06,2009-10-12,PRV54986,8000,PHY402711,PHY402711,PHY402711,2009-10-06,1749,1068.0,2009-10-12,597,1745,V4571,78702,28521.0,7019,1963.0,2948.0,25000.0,25002.0,,863.0,,
6,BENE11018,CLM32075,2009-01-02,2009-01-07,PRV54090,8000,PHY412314,PHY347494,,2009-01-02,5699,1068.0,2009-01-07,390,1536,73300,7230,3659.0,2859,4019.0,2948.0,2809.0,56210.0,,4576.0,,
7,BENE11028,CLM62376,2009-08-03,2009-08-07,PRV51148,6000,PHY346286,PHY405514,,2009-08-03,78605,1068.0,2009-08-07,379,56212,25000,30000,73300.0,2766,53081.0,2851.0,4439.0,41401.0,,9904.0,,
8,BENE11031,CLM62784,2009-08-06,2009-08-09,PRV55839,7000,PHY385030,,,2009-08-06,2859,1068.0,2009-08-09,294,42823,4280,6822,40390.0,43811,78322.0,,,,,,,
9,BENE11034,CLM31519,2008-12-29,2009-01-05,PRV55215,29000,PHY355604,PHY415867,,2008-12-29,41401,1068.0,2009-01-05,262,41041,3669,V851,25062.0,V074,2766.0,4019.0,4111.0,4589.0,,3612.0,4139.0,


In [374]:
# Preview the test outpatient claims after dropping ClmProcedureCode_4..6.
test_outp_data.head()

Unnamed: 0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,ClmDiagnosisCode_1,ClmDiagnosisCode_2,ClmDiagnosisCode_3,ClmDiagnosisCode_4,ClmDiagnosisCode_5,ClmDiagnosisCode_6,ClmDiagnosisCode_7,ClmDiagnosisCode_8,ClmDiagnosisCode_9,ClmDiagnosisCode_10,ClmProcedureCode_1,ClmProcedureCode_2,ClmProcedureCode_3,DeductibleAmtPaid,ClmAdmitDiagnosisCode
0,BENE11001,CLM392397,2009-06-02,2009-06-02,PRV55962,30,PHY347633,,PHY347633,V5832,V5861,2724.0,3182.0,V5869,42731.0,,,,,,,,0,
1,BENE11001,CLM430760,2009-06-23,2009-06-23,PRV56112,30,PHY381777,,PHY381777,9594,E9174,4019.0,,,,,,,,,,,0,
2,BENE11007,CLM233081,2009-03-07,2009-03-07,PRV56979,200,PHY425311,,PHY425311,7248,,,,,,,,,,,,,0,
3,BENE11007,CLM496381,2009-07-29,2009-07-29,PRV56573,10,PHY393253,PHY347995,,58889,2449,,,,,,,,,,,,0,5939.0
4,BENE11007,CLM521391,2009-08-12,2009-08-12,PRV56573,10,PHY417685,,PHY382041,V666,,,,,,,,,,,,,0,


In [375]:
# Preview the training outpatient claims after dropping ClmProcedureCode_4..6.
train_outp_data.head()

Unnamed: 0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,ClmDiagnosisCode_1,ClmDiagnosisCode_2,ClmDiagnosisCode_3,ClmDiagnosisCode_4,ClmDiagnosisCode_5,ClmDiagnosisCode_6,ClmDiagnosisCode_7,ClmDiagnosisCode_8,ClmDiagnosisCode_9,ClmDiagnosisCode_10,ClmProcedureCode_1,ClmProcedureCode_2,ClmProcedureCode_3,DeductibleAmtPaid,ClmAdmitDiagnosisCode
0,BENE11002,CLM624349,2009-10-11,2009-10-11,PRV56011,30,PHY326117,,,78943,V5866,V1272,,,,,,,,,,,0,56409.0
1,BENE11003,CLM189947,2009-02-12,2009-02-12,PRV57610,80,PHY362868,,,6115,,,,,,,,,,,,,0,79380.0
2,BENE11003,CLM438021,2009-06-27,2009-06-27,PRV57595,10,PHY328821,,,2723,,,,,,,,,,,,,0,
3,BENE11004,CLM121801,2009-01-06,2009-01-06,PRV56011,40,PHY334319,,,71988,,,,,,,,,,,,,0,
4,BENE11004,CLM150998,2009-01-22,2009-01-22,PRV56011,200,PHY403831,,,82382,30000,72887,4280.0,7197.0,V4577,,,,,,,,0,71947.0


### Modifying original dataframes

In [376]:
#removing letters: keep only the first run of digits in every identifier column
# (e.g. 'BENE11001' -> '11001'). The values stay strings; missing values stay NaN.
# Raw string on purpose: '\d' in a plain literal is an invalid escape sequence,
# which newer Python versions warn about.
digits_pattern = r'(\d+)'

for df in [train_bene_data, train_inp_data, train_outp_data, test_bene_data, test_inp_data, test_outp_data]:
    df['BeneID'] = df['BeneID'].str.extract(digits_pattern, expand=False)

for df in [train_inp_data, train_outp_data, train_provider_data, test_inp_data, test_outp_data, test_provider_data]:
    df['Provider'] = df['Provider'].str.extract(digits_pattern, expand=False)

# Claim-level identifiers exist only in the four claim tables.
claim_id_cols = ['ClaimID', 'AttendingPhysician', 'OperatingPhysician', 'OtherPhysician']
for df in [train_inp_data, train_outp_data, test_inp_data, test_outp_data]:
    for id_col in claim_id_cols:
        df[id_col] = df[id_col].str.extract(digits_pattern, expand=False)


In [377]:
# creating inpatient / outpatient identifier:
# In_Out is 1 on the inpatient claim tables and 0 on the outpatient ones.
flag_to_frames = (
    (1, (train_inp_data, test_inp_data)),
    (0, (train_outp_data, test_outp_data)),
)
for flag, frames in flag_to_frames:
    for df in frames:
        df['In_Out'] = flag

In [378]:
# chronic conditions to binary:
chronic_cond_list = [
    'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
    'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
    'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
    'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
    'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
    'ChronicCond_stroke',
]

# Recode every 2 in these columns to 0; all other values are left as they are.
for bene in (train_bene_data, test_bene_data):
    bene[chronic_cond_list] = bene[chronic_cond_list].replace(2, 0)
        

In [379]:
# to datetime
# Parse the date columns of every table. The inpatient tables carry admission
# and discharge dates in addition to the claim start/end dates.
inpatient_date_cols = ['ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt']
outpatient_date_cols = ['ClaimStartDt', 'ClaimEndDt']

# TRAIN
train_bene_data['DOB'] = pd.to_datetime(train_bene_data['DOB'])
for date_col in inpatient_date_cols:
    train_inp_data[date_col] = pd.to_datetime(train_inp_data[date_col])
for date_col in outpatient_date_cols:
    train_outp_data[date_col] = pd.to_datetime(train_outp_data[date_col])

# TEST
test_bene_data['DOB'] = pd.to_datetime(test_bene_data['DOB'])
for date_col in inpatient_date_cols:
    test_inp_data[date_col] = pd.to_datetime(test_inp_data[date_col])
# Fix: this section previously converted train_outp_data a second time,
# leaving the test outpatient claim dates as plain strings.
for date_col in outpatient_date_cols:
    test_outp_data[date_col] = pd.to_datetime(test_outp_data[date_col])

# Duration: new column
# Length of each inpatient stay in whole days (DischargeDt - AdmissionDt),
# with 0 when either date is missing.
# The day count is read from the timedelta itself rather than parsed out of its
# string form: string parsing depends on how pandas prints timedeltas, and it
# yields no value for a negative stay, which broke the column assignment.
for inp in (train_inp_data, test_inp_data):
    stay = inp['DischargeDt'] - inp['AdmissionDt']
    inp['Duration'] = stay.dt.days.fillna(0).astype('int64')


# Death column as binary variable
# Death is 1 when a date of death is recorded and 0 otherwise.
# The flag is taken from the parsed date rather than from a comparison with the
# string 'nan'. The string comparison marked every beneficiary as dead when the
# cell was re-run after DOD had become a datetime column, because missing values
# then print as 'NaT'. pd.to_datetime accepts raw strings, NaN and
# already-parsed dates, so this is safe to re-run. DOD itself is left untouched.
for bene in (train_bene_data, test_bene_data):
    bene['Death'] = pd.to_datetime(bene['DOD']).notna().astype('int64')


# Creating age column.
# Age is in whole years (days / 365, rounded). It is measured at the date of
# death when one is recorded, otherwise at the latest date of death found in
# the same table.
for bene in (train_bene_data, test_bene_data):
    bene['DOD'] = pd.to_datetime(bene['DOD'])
    age_at_death = ((bene['DOD'] - bene['DOB']).dt.days / 365).round()
    age_at_last_dod = ((bene['DOD'].max() - bene['DOB']).dt.days / 365).round()
    bene['Age'] = age_at_death.fillna(age_at_last_dod)

#making gender binary variable:
# Shift the raw codes down by one (1 -> 0, 2 -> 1).
# Not idempotent: re-running this cell shifts the values again.
for bene in (train_bene_data, test_bene_data):
    bene['Gender'] = bene['Gender'].sub(1)


# RenalDiseaseIndicator to numeric type:
# 1 where the raw value is 'Y', 0 for anything else. This is a vectorised
# comparison in place of a row-by-row iterrows loop; the resulting int64
# column is the same.
for bene in (train_bene_data, test_bene_data):
    bene['RenalDiseaseIndicator'] = (bene['RenalDiseaseIndicator'] == 'Y').astype('int64')

### Merging dataframes:

In [380]:
# List the columns present in the inpatient table but absent from the outpatient one.
print("These columns are only applicable to inpatient claims:")
outpatient_columns = set(train_outp_data.columns)
for col in train_inp_data.columns:
    if col in outpatient_columns:
        continue
    print("-", col)

These columns are only applicable to inpatient claims:
- AdmissionDt
- DischargeDt
- DiagnosisGroupCode
- Duration


In [381]:
# Every outpatient column is used as a merge key, so the outer merge stacks the
# two claim tables and keeps the inpatient-only columns as extra columns.
col_list = [col for col in train_outp_data.columns]

# The claim dates are merge keys, so both sides must hold them as datetimes.
# pd.to_datetime leaves already-converted columns unchanged.
for claims in (train_outp_data, train_inp_data, test_outp_data, test_inp_data):
    for date_col in ('ClaimStartDt', 'ClaimEndDt'):
        claims[date_col] = pd.to_datetime(claims[date_col])

train_all_claims = train_outp_data.merge(train_inp_data, how = 'outer', on = col_list)
# Fix: the test claims were previously built from the *train* tables.
test_all_claims = test_outp_data.merge(test_inp_data, how = 'outer', on = col_list)

In [382]:
# Attach beneficiary attributes to every claim. The outer join also keeps
# beneficiaries without claims and claims without a beneficiary record.
train_claims_bene_data = train_all_claims.merge(train_bene_data, how = 'outer', on = 'BeneID')
# Fix: test claims must be joined to the *test* beneficiary table (was train_bene_data).
test_claims_bene_data = test_all_claims.merge(test_bene_data, how = 'outer', on = 'BeneID')

In [383]:
# Attach the provider table to the claim + beneficiary rows (outer join on Provider).
train_all_data = pd.merge(train_claims_bene_data, train_provider_data, how='outer', on='Provider')
test_all_data = pd.merge(test_claims_bene_data, test_provider_data, how='outer', on='Provider')

In [384]:
# Split each merged table by the In_Out flag (1 = inpatient, 0 = outpatient).
# Rows whose flag is missing end up in neither subset.
train_all_in_data, train_all_out_data = (
    train_all_data[train_all_data['In_Out'] == flag] for flag in (1, 0)
)
test_all_in_data, test_all_out_data = (
    test_all_data[test_all_data['In_Out'] == flag] for flag in (1, 0)
)

In [385]:
# Directory receiving the wrangled tables. Edit this single line to write elsewhere.
OUT_DIR = '/Users/Julia/Documents/bootcamp/fraud_capstone/data_out'

# (file name, frame) pairs, written in this order without the index column.
wrangled_outputs = [
    ('train_all_df.csv', train_all_data),
    ('test_all_df.csv', test_all_data),
    ('train_inp_data.csv', train_inp_data),
    ('test_inp_data.csv', test_inp_data),
    ('train_outp_data.csv', train_outp_data),
    ('test_outp_data.csv', test_outp_data),
    ('train_bene_data.csv', train_bene_data),
    ('test_bene_data.csv', test_bene_data),
    ('train_provider_data.csv', train_provider_data),
    ('test_provider_data.csv', test_provider_data),
]

for file_name, frame in wrangled_outputs:
    frame.to_csv(OUT_DIR + '/' + file_name, index=False)

# Aggregation
### Aggregating information on beneficiaries, according to claims data

In [410]:
# Preview the merged claim + beneficiary + provider training table.
train_all_data.head()

Unnamed: 0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,ClmDiagnosisCode_1,ClmDiagnosisCode_2,ClmDiagnosisCode_3,ClmDiagnosisCode_4,ClmDiagnosisCode_5,ClmDiagnosisCode_6,ClmDiagnosisCode_7,ClmDiagnosisCode_8,ClmDiagnosisCode_9,ClmDiagnosisCode_10,ClmProcedureCode_1,ClmProcedureCode_2,ClmProcedureCode_3,DeductibleAmtPaid,ClmAdmitDiagnosisCode,In_Out,AdmissionDt,DischargeDt,DiagnosisGroupCode,Duration,DOB,DOD,Gender,Race,RenalDiseaseIndicator,State,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,Death,Age,PotentialFraud
0,11002,624349,2009-10-11,2009-10-11,56011,30,326117,,,78943,V5866,V1272,,,,,,,,,,,0.0,56409.0,0,NaT,NaT,,,1936-09-01,NaT,1,1,0,39,280,12,12,0,0,0,0,0,0,0,0,0,0,0,0,0,30,50,0,73.0,Yes
1,11004,121801,2009-01-06,2009-01-06,56011,40,334319,,,71988,,,,,,,,,,,,,0.0,,0,NaT,NaT,,,1922-07-01,NaT,0,1,0,39,270,12,12,1,1,0,0,0,0,1,1,1,1,0,0,0,1810,760,0,87.0,Yes
2,11004,150998,2009-01-22,2009-01-22,56011,200,403831,,,82382,30000,72887,4280,7197.0,V4577,,,,,,,,0.0,71947.0,0,NaT,NaT,,,1922-07-01,NaT,0,1,0,39,270,12,12,1,1,0,0,0,0,1,1,1,1,0,0,0,1810,760,0,87.0,Yes
3,11004,173224,2009-02-03,2009-02-03,56011,20,339887,,,20381,,,,,,,,,,,,,0.0,,0,NaT,NaT,,,1922-07-01,NaT,0,1,0,39,270,12,12,1,1,0,0,0,0,1,1,1,1,0,0,0,1810,760,0,87.0,Yes
4,11004,224741,2009-03-03,2009-03-03,56011,40,345721,,,V6546,4280,2449,V854,,,,,,,,,,0.0,,0,NaT,NaT,,,1922-07-01,NaT,0,1,0,39,270,12,12,1,1,0,0,0,0,1,1,1,1,0,0,0,1810,760,0,87.0,Yes


In [436]:
# general information about a provider's patients and costs
# Builds train_final_data with one row per provider: it starts from the provider
# table and gains one column per aggregate computed over train_all_data.

train_final_data = train_provider_data.copy()

# mean beneficiary data, duration of stay, and charges
# Source columns to average per provider.
mean_list = ['Age', 'Death', 'Gender', 'Duration', 'RenalDiseaseIndicator', 'NoOfMonths_PartACov', 'NoOfMonths_PartBCov', \
        'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease', \
        'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression', \
        'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart', 'ChronicCond_Osteoporasis', \
        'ChronicCond_rheumatoidarthritis', 'ChronicCond_stroke', 'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt',
       'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt',  'InscClaimAmtReimbursed', 'In_Out']

# Output name for each entry of mean_list, keyed by its position in that list.
# Entries 4-17 keep the source column name (no 'Mean_' prefix) even though the
# stored value is a per-provider mean.
mean_dict = {0:'Mean_Age', 1:'Mean_Death', 2:'Mean_Gender', 3:'Mean_Duration', 4:'RenalDiseaseIndicator', 5:'NoOfMonths_PartACov', 6:'NoOfMonths_PartBCov', 
        7:'ChronicCond_Alzheimer', 8:'ChronicCond_Heartfailure', 9:'ChronicCond_KidneyDisease', 
        10:'ChronicCond_Cancer', 11:'ChronicCond_ObstrPulmonary', 12:'ChronicCond_Depression', 
        13:'ChronicCond_Diabetes', 14:'ChronicCond_IschemicHeart', 15:'ChronicCond_Osteoporasis', 
        16:'ChronicCond_rheumatoidarthritis', 17:'ChronicCond_stroke', 18:'Mean_IPAnnualReimbursementAmt', 19:'Mean_IPAnnualDeductibleAmt',
       20:'Mean_OPAnnualReimbursementAmt', 21:'Mean_OPAnnualDeductibleAmt',  22:'Mean_InscClaimAmtReimbursed', 23:'Mean_In_Out'}

# (source column, position) pairs; the hard-coded 24 must match len(mean_list).
mean_tuples = list(zip(mean_list, range(0, 24)))

#averages for all values above
# One groupby + merge per column. pd.merge defaults to an inner join, so only
# providers present on both sides are kept.
for col_, num in mean_tuples:
    grouped = (train_all_data.groupby(['Provider'])[col_].mean()).reset_index()
    grouped = grouped.rename(index=str, columns={col_:mean_dict[num]})
    train_final_data = pd.merge(grouped, train_final_data, on='Provider')

    
# Source columns that also get a per-provider median.
cols = ['Age', 'Duration', 'IPAnnualReimbursementAmt', 'IPAnnualDeductibleAmt',
       'OPAnnualReimbursementAmt', 'OPAnnualDeductibleAmt',  'InscClaimAmtReimbursed', 'NoOfMonths_PartACov',
       'NoOfMonths_PartBCov']

# Output name for each entry of cols, keyed by its position in that list.
median_dict = {0:'Median_Age', 1:'Median_Duration', 2:'Median_IPAnnualReimbursementAmt', 3:'Median_IPAnnualDeductibleAmt',
       4:'Median_OPAnnualReimbursementAmt', 5:'Median_OPAnnualDeductibleAmt',  6:'Median_InscClaimAmtReimbursed', 7:'Median_NoOfMonths_PartACov',
       8:'Median_NoOfMonths_PartBCov'}

#median values for non-binary variables above
# (source column, position) pairs; the hard-coded 9 must match len(cols).
median_tuples = list(zip(cols, range(0, 9)))

# Same groupby + merge pattern as the means, using the median instead.
for col_, num in median_tuples:
    grouped = (train_all_data.groupby(['Provider'])[col_].median()).reset_index()
    grouped = grouped.rename(index=str, columns={col_:median_dict[num]})
    train_final_data = pd.merge(grouped, train_final_data, on='Provider')


In [437]:
# ALTERNATIVE IF HIGH DIMENSIONAL DATA IS PROBLEMATIC:
# For each provider, the share of its rows that belong to its single most
# common Race value.

# Number of rows per (Provider, Race) pair, computed once and reused.
race_counts = train_all_data.groupby(['Provider', 'Race'])['Race'].count()
max_one_race = race_counts.groupby('Provider').max()
max_all_people = race_counts.groupby('Provider').sum()
# Fix: the denominator was the undefined name `all_people`, a NameError on a fresh kernel.
race_ratio = max_one_race / max_all_people
# train_final_data = pd.merge(race_ratio.to_frame('High_Race_Perc'), train_final_data, left_index=True, right_on='Provider')

def mode_col(col, t_data):
    """Add to the global ``train_final_data`` each provider's most frequent ``col`` value.

    Parameters
    ----------
    col : str
        Column of ``t_data`` whose per-provider mode is wanted. It is also the
        name of the column written to ``train_final_data``.
    t_data : pandas.DataFrame
        Table with a 'Provider' column and the ``col`` column.

    The values are matched to ``train_final_data`` rows by Provider. The
    previous positional list assignment was only correct when
    ``train_final_data`` had exactly one row per provider in sorted provider
    order. Providers absent from ``t_data`` get NaN.
    """
    counts = t_data.groupby(['Provider', col])[col].count()
    # idxmax on the two-level index yields (Provider, value) tuples; keep the value.
    top_keys = counts.groupby('Provider').idxmax()
    mode_by_provider = pd.Series([key[1] for key in top_keys], index=top_keys.index)
    train_final_data[col] = train_final_data['Provider'].map(mode_by_provider)
    
# mode_col('State', train_all_data)
# mode_col('County', train_all_data)

In [438]:
def proportional_categories(col_name, train_final_data, claims_data=None):
    """Return, per provider, the share of its claims in each value of `col_name`.

    Parameters
    ----------
    col_name : str
        Categorical column to expand (one output column per distinct value).
    train_final_data : pandas.DataFrame
        Unused; kept so existing calls keep working.
    claims_data : pandas.DataFrame, optional
        Frame with 'Provider', 'ClaimID' and `col_name` columns. Defaults to
        the notebook-level `train_all_data`.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Provider', with columns named '<col_name>_<value>'. Each
        cell is the count of non-null ClaimIDs with that value divided by the
        provider's total count of non-null ClaimIDs.

    Notes
    -----
    Numerator and denominator are aligned on the provider index instead of by
    row position. A provider whose `col_name` values are all missing is kept
    with zero in every column rather than shifting or breaking the division.
    """
    claims = train_all_data if claims_data is None else claims_data
    claims_per_provider = claims.groupby('Provider')['ClaimID'].count()
    groups = (
        claims[['Provider', col_name, 'ClaimID']]
        .groupby(['Provider', col_name])
        .count()
        .reset_index()
        .pivot(index='Provider', columns=col_name, values='ClaimID')
        # make sure every provider in the denominator has a row
        .reindex(claims_per_provider.index)
        .fillna(0)
    )
    groups = groups.div(claims_per_provider, axis=0)
    groups.columns = [col_name + '_' + str(col) for col in groups.columns]
    return groups
    
# Add the per-provider proportion columns for each categorical variable,
# in the same order as before: Race, then County, then State.
for category in ('Race', 'County', 'State'):
    cols = proportional_categories(category, train_final_data)
    train_final_data = pd.merge(cols, train_final_data, on='Provider')

In [439]:
# Preview the first rows of the provider-level table after the proportion columns were merged in.
train_final_data.head()

Unnamed: 0,Provider,State_1,State_2,State_3,State_4,State_5,State_6,State_7,State_8,State_9,State_10,State_11,State_12,State_13,State_14,State_15,State_16,State_17,State_18,State_19,State_20,State_21,State_22,State_23,State_24,State_25,State_26,State_27,State_28,State_29,State_30,State_31,State_32,State_33,State_34,State_35,State_36,State_37,State_38,State_39,State_41,State_42,State_43,State_44,State_45,State_46,State_47,State_49,State_50,State_51,State_52,State_53,State_54,County_0,County_1,County_10,County_11,County_14,County_20,County_25,County_30,County_34,County_40,County_50,County_55,County_60,County_70,County_80,County_84,County_88,County_90,County_100,County_110,County_111,County_113,County_117,County_120,County_130,County_131,County_140,County_141,County_150,County_160,County_161,County_170,County_180,County_190,County_191,County_194,County_200,County_210,County_211,County_212,County_213,County_220,County_221,County_222,County_223,County_224,County_230,County_240,County_241,County_250,County_251,County_260,County_270,County_271,County_280,County_281,County_288,County_290,County_291,County_292,County_300,County_301,County_310,County_311,County_312,County_320,County_321,County_328,County_330,County_331,County_340,County_341,County_342,County_343,County_350,County_360,County_361,County_362,County_370,County_380,County_381,County_390,County_391,County_392,County_400,County_410,County_411,County_412,County_420,County_421,County_430,County_431,County_440,County_441,County_450,County_451,County_460,County_461,County_462,County_470,County_471,County_480,County_490,County_500,County_510,County_511,County_520,County_521,County_522,County_530,County_531,County_540,County_541,County_542,County_550,County_551,County_552,County_560,County_561,County_562,County_563,County_564,County_570,County_580,County_581,County_582,County_583,County_590,County_591,County_592,County_600,County_601,County_610,County_611,County_612,County_620,County_621,County_622,County_630,County_631,C
ounty_632,County_640,County_641,County_650,County_651,County_652,County_653,County_654,County_660,County_661,County_662,County_670,County_671,County_672,County_680,County_681,County_690,County_691,County_700,County_701,County_702,County_703,County_710,County_711,County_712,County_720,County_722,County_730,County_731,County_734,County_740,County_741,County_742,County_743,County_744,County_750,County_751,County_752,County_753,County_754,County_755,County_756,County_757,County_758,County_760,County_761,County_770,County_771,County_772,County_780,County_782,County_783,County_784,County_785,County_790,County_791,County_792,County_793,County_794,County_795,County_796,County_797,County_800,County_801,County_802,County_803,County_804,County_810,County_811,County_812,County_820,County_821,County_822,County_830,County_831,County_832,County_834,County_835,County_838,County_840,County_841,County_842,County_843,County_844,County_845,County_850,County_851,County_860,County_861,County_862,County_867,County_870,County_871,County_873,County_874,County_875,County_876,County_878,County_879,County_880,County_881,County_882,County_883,County_884,County_885,County_886,County_887,County_888,County_890,County_891,County_892,County_893,County_900,County_901,County_902,County_903,County_904,County_905,County_910,County_911,County_912,County_913,County_920,County_921,County_930,County_931,County_932,County_940,County_941,County_942,County_943,County_944,County_945,County_946,County_947,County_948,County_949,County_950,County_951,County_952,County_953,County_954,County_955,County_960,County_961,County_962,County_963,County_970,County_971,County_972,County_973,County_974,County_975,County_976,County_977,County_978,County_979,County_980,County_981,County_982,County_983,County_984,County_985,County_986,County_987,County_988,County_989,County_990,County_991,County_992,County_993,County_994,County_996,County_999,Race_1,Race_2,Race_3,Race_5,Median_NoOfMonths_PartBCov,Median_NoOfMonths_PartACov,Media
n_InscClaimAmtReimbursed,Median_OPAnnualDeductibleAmt,Median_OPAnnualReimbursementAmt,Median_IPAnnualDeductibleAmt,Median_IPAnnualReimbursementAmt,Median_Duration,Median_Age,Mean_In_Out,Mean_InscClaimAmtReimbursed,Mean_OPAnnualDeductibleAmt,Mean_OPAnnualReimbursementAmt,Mean_IPAnnualDeductibleAmt,Mean_IPAnnualReimbursementAmt,ChronicCond_stroke,ChronicCond_rheumatoidarthritis,ChronicCond_Osteoporasis,ChronicCond_IschemicHeart,ChronicCond_Diabetes,ChronicCond_Depression,ChronicCond_ObstrPulmonary,ChronicCond_Cancer,ChronicCond_KidneyDisease,ChronicCond_Heartfailure,ChronicCond_Alzheimer,NoOfMonths_PartBCov,NoOfMonths_PartACov,RenalDiseaseIndicator,Mean_Duration,Mean_Gender,Mean_Death,Mean_Age,PotentialFraud
0,51001,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.84,0.16,0.0,0.0,12.0,12,400,150,1590,0,0,4.0,80.0,0.2,4185.6,463.92,2615.2,897.12,17606.0,0.24,0.32,0.24,0.92,0.84,0.36,0.4,0.2,0.68,0.76,0.6,12.0,12.0,0.32,5.0,0.64,0.0,78.84,No
1,51003,0.719697,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174242,0.106061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068182,0.0,0.0,0.0,0.0,0.0,0.0,0.022727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.098485,0.007576,0.0,0.022727,0.0,0.05303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.022727,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.181818,0.0,0.0,0.0,0.0,0.106061,0.0,0.0,0.0,0.0,0.0,0.075758,0.0,0.0,0.0,0.007576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022727,0.007576,0.022727,0.015152,0.0,0.0,0.0,0.0,0.0,0.0,0.022727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007576,0.0,0.0,0.0,0.0,0.015152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.810606,0.181818,0.0,0.007576,12.0,12,1750,210,1235,1068,4580,4.0,72.0,0.469697,4588.409091,737.121212,2678.181818,931.424242,7568.181818,0.090909,0.287879,0.25,0.848485,0.757576,0.409091,0.310606,0.075758,0.484848,0.606061,0.424242,11.871212,11.818182,0.219697,5.16129,0.590909,0.007576,70.022727,Yes
2,51004,0.503356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040268,0.080537,0.0,0.0,0.026846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.234899,0.0,0.006711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.053691,0.0,0.0,0.0,0.0,0.0,0.006711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04698,0.0,0.0,0.0,0.0,0.006711,0.0,0.0,0.026846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026846,0.0,0.0,0.0,0.0,0.147651,0.013423,0.0,0.053691,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026846,0.0,0.0,0.0,0.0,0.0,0.073826,0.0,0.006711,0.0,0.013423,0.040268,0.0,0.0,0.0,0.0,0.006711,0.0,0.0,0.0,0.0,0.006711,0.0,0.0,0.013423,0.0,0.0,0.006711,0.0,0.0,0.0,0.0,0.0,0.006711,0.114094,0.0,0.0,0.0,0.020134,0.0,0.020134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.107383,0.0,0.0,0.0,0.0,0.0,0.0,0.080537,0.0,0.073826,0.013423,0.0,0.033557,0.0,0.0,0.0,0.0,0.013423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013423,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.805369,0.161074,0.033557,0.0,12.0,12,70,380,1140,0,0,,73.0,0.0,350.134228,622.751678,2194.899329,434.95302,4351.879195,0.114094,0.308725,0.328859,0.724832,0.704698,0.422819,0.275168,0.107383,0.33557,0.590604,0.42953,11.959732,11.865772,0.154362,,0.691275,0.006711,72.161074,No
3,51005,0.985408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000858,0.0,0.0,0.0,0.000858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000858,0.0,0.000858,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002575,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008584,0.000858,0.0,0.000858,0.0,0.0,0.0,0.0,0.043777,0.0,0.0,0.006867,0.0,0.0,0.0,0.0,0.024893,0.0,0.0,0.016309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.301288,0.0,0.0,0.020601,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001717,0.0,0.002575,0.0,0.015451,0.0,0.0103,0.0,0.0,0.0,0.0,0.026609,0.0,0.013734,0.000858,0.0,0.012876,0.0,0.0,0.049785,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019742,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.404292,0.0,0.0,0.006867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.766524,0.224893,0.008584,0.0,12.0,12,70,360,1200,0,0,,71.0,0.0,241.124464,636.328755,2109.733906,379.162232,3623.991416,0.106438,0.28412,0.295279,0.76824,0.685837,0.416309,0.253219,0.141631,0.435193,0.583691,0.365665,11.939914,11.907296,0.222318,,0.561373,0.003433,70.475536,Yes
4,51007,0.916667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.375,0.0,0.0,0.0,0.472222,0.0,0.0,0.0,0.0,0.027778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013889,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.805556,0.194444,0.0,0.0,12.0,12,80,240,1290,0,0,5.0,70.5,0.041667,468.194444,469.722222,1729.722222,445.0,3050.0,0.166667,0.305556,0.291667,0.708333,0.680556,0.402778,0.222222,0.166667,0.305556,0.555556,0.361111,11.833333,11.833333,0.152778,5.333333,0.527778,0.013889,69.291667,No


### Aggregating information on physicians and codes from claims data:

In [440]:
# How much diversity is there in the physicians used by each provider?
# For each physician role we count the distinct (non-null) physicians per
# provider and divide by the provider's number of claims.

count_dict = {0:'Div_AttendingPhysician', 1:'Div_OperatingPhysician', 2:'Div_OtherPhysician'}

count_cols = ['AttendingPhysician', 'OperatingPhysician', 'OtherPhysician']

count_tuples = list(zip(count_cols, range(0, 3)))

# Claims per provider: the same denominator for every role, so compute it once.
div = train_all_data.groupby(['Provider'])['ClaimID'].count().reset_index()

for col_, num in count_tuples:
    # nunique (not count) so the feature measures distinct physicians, as described
    # above; count would only give the number of rows with a physician recorded.
    grouped = train_all_data.groupby(['Provider'])[col_].nunique().reset_index()
    # grouped and div are both sorted by Provider with a fresh RangeIndex, so rows line up.
    grouped[col_] = grouped[col_]/div['ClaimID']
    grouped = grouped.rename(index=str, columns={col_:count_dict[num]})
    train_final_data = pd.merge(grouped, train_final_data, on="Provider" )


In [441]:
# Are physicians making duplicate claims per patient?
# For each provider: the largest number of claims sharing one
# (attending physician, beneficiary) pair.

pair_keys = ['Provider', 'AttendingPhysician', 'BeneID']
counts = train_all_data.groupby(pair_keys)['ClaimID'].count()
counts_maxs = (
    counts
    .groupby(level='Provider')
    .max()
    .rename('Phys_Bene_Repeats')
    .reset_index()
)
train_final_data = pd.merge(counts_maxs, train_final_data, on='Provider')


In [442]:
# ALTERNATIVE IF HIGH-DIMENSIONAL DATA BECOMES PROBLEMATIC:
# How many claims is the provider processing?

# The result is built and then discarded because the merge below is disabled.
grouped = (
    train_all_data
    .groupby(['Provider', 'ClaimID'])['ClaimID']
    .count()
    .groupby('Provider')
    .sum()
    .reset_index()
    .rename(index=str, columns={'ClaimID': 'Sum_ClaimID'})
)
del grouped
# train_final_data = pd.merge(grouped, train_final_data, on="Provider")

In [443]:
# max average reimbursement per doctor
# We will use ClmDiagnosisCode_1 to classify outpatient claims.
# We will use DiagnosisGroupCode to classify inpatient claims.

# ALTERNATIVE IF HIGH-DIMENSIONAL DATA BECOMES PROBLEMATIC:
# series = train_all_out_data[['ClmDiagnosisCode_1', 'ClaimID']].groupby('ClmDiagnosisCode_1').count().reset_index()
# keep_list_out = list((series['ClmDiagnosisCode_1'][series['ClaimID'] >= 30]))

# series = train_all_in_data[['DiagnosisGroupCode', 'ClaimID']].groupby('DiagnosisGroupCode').count().reset_index()
# keep_list_in = list((series['DiagnosisGroupCode'][series['ClaimID'] >= 30]))


def _max_avg_reimbursement(claims, code_col, prefix):
    """Per provider and code, the highest physician-level mean reimbursement.

    The mean of 'InscClaimAmtReimbursed' is taken for every
    (Provider, code, AttendingPhysician) combination, then the maximum of
    those means is kept per (Provider, code). The result is pivoted to one
    row per provider and one column per code (named prefix + code), with 0
    where a provider has no claims for a code.
    """
    physician_means = (
        claims
        .groupby(['Provider', code_col, 'AttendingPhysician'])['InscClaimAmtReimbursed']
        .mean()
        .reset_index()
        .drop('AttendingPhysician', axis=1)
    )
    code_maxs = physician_means.groupby(['Provider', code_col])['InscClaimAmtReimbursed'].max().reset_index()
    return (
        code_maxs
        .pivot(index='Provider', columns=code_col, values='InscClaimAmtReimbursed')
        .fillna(0)
        .add_prefix(prefix)
    )


#inpatient
# alternative: train_all_in_data_agg = train_all_in_data[(train_all_in_data['DiagnosisGroupCode'].isin(keep_list_in))]
in_maxs = _max_avg_reimbursement(train_all_in_data, 'DiagnosisGroupCode', 'in_c_')
# how='right' keeps every provider already in train_final_data, even those without inpatient claims
train_final_data = pd.merge(in_maxs, train_final_data, how='right', on = 'Provider')

#outpatient
# alternative: train_all_out_data_agg = train_all_out_data[(train_all_out_data['ClmDiagnosisCode_1'].isin(keep_list_out))]
# Uses the unfiltered outpatient frame, mirroring the inpatient half: the filtered
# train_all_out_data_agg only exists if the disabled alternative above is run, so
# reading it here fails (or silently uses stale data) after a kernel restart.
out_maxs = _max_avg_reimbursement(train_all_out_data, 'ClmDiagnosisCode_1', 'out_c_')
train_final_data = pd.merge(out_maxs, train_final_data, how='right', on = 'Provider')


In [444]:
# Counts per diagnosis and procedure groups.

def _code_count_features(claims, code_cols, prefix):
    """Count, per provider, how often each code occurs across `code_cols`.

    For every column in `code_cols` the non-null ClaimIDs are counted per
    (Provider, code); the per-column counts are then added together so each
    code ends up in exactly one output column named prefix + code.

    Summing here, before merging, avoids merging many frames that share
    column names: repeated merges only work through '_x'/'_y' suffixes,
    which produce duplicate column names (rejected with a MergeError by
    recent pandas versions).

    Returns a DataFrame indexed by 'Provider' with 0 where a provider never
    used a code.
    """
    totals = None
    for code_col in code_cols:
        counts = claims.groupby(['Provider', code_col])['ClaimID'].count().reset_index()
        pivot = counts.pivot(index='Provider', columns=code_col, values='ClaimID').add_prefix(prefix)
        # fill_value=0 treats a code missing from one side as zero occurrences
        totals = pivot if totals is None else totals.add(pivot, fill_value=0)
    return totals.fillna(0)


#diagnosis codes
diag_cols = ['ClmDiagnosisCode_1', 'ClmDiagnosisCode_2',
             'ClmDiagnosisCode_3', 'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5',
             'ClmDiagnosisCode_6', 'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8',
             'ClmDiagnosisCode_9', 'ClmDiagnosisCode_10']
diag_counts = _code_count_features(train_all_data, diag_cols, 'diag_')
# how='right' keeps every provider already present in train_final_data
train_final_data = pd.merge(diag_counts, train_final_data, how='right', on = 'Provider')

#procedure codes
proc_cols = ['ClmProcedureCode_1', 'ClmProcedureCode_2', 'ClmProcedureCode_3']
proc_counts = _code_count_features(train_all_data, proc_cols, 'proc_')
train_final_data = pd.merge(proc_counts, train_final_data, how='right', on = 'Provider')

In [445]:
# Consolidate columns that share a label once merge suffixes are removed.
#
# Repeated merges can leave the same `diag_<code>` / `proc_<code>` feature
# spread over several columns (`..._x`, `..._y`, or exact duplicates). Strip
# the suffix from those two families only, then add up all columns that end up
# with the same label.
train_final_data = train_final_data.sort_index(axis=1)
col_list = train_final_data.columns

# Labels are assumed to be strings (startswith/endswith are called on each one).
new_cols = [
    label[:-2]
    if label.startswith(('proc_', 'diag_')) and label.endswith(('_y', '_x'))
    else label
    for label in col_list
]

train_final_data.columns = new_cols

# Sum columns with identical labels. `groupby(..., axis=1)` is deprecated since
# pandas 2.1, so group the transposed frame by its index (the column labels)
# and transpose back. The sum skips NaN, so missing values come out as 0.
train_final_data = train_final_data.T.groupby(level=0).sum().T

In [455]:
# Encode the target as 1 for 'Yes' and 0 for anything else.
# An already-encoded 1 is accepted too: without that, re-running this cell
# would compare integers to 'Yes' and silently reset every label to 0.
train_final_data['PotentialFraud'] = (
    train_final_data['PotentialFraud'].eq('Yes')
    | train_final_data['PotentialFraud'].eq(1)
).astype(int)

In [456]:
# True if any cell of the final table is missing.
train_final_data.isna().to_numpy().any()

False

In [457]:
# Preview the first rows of the final table.
train_final_data.head()

Unnamed: 0,ChronicCond_Alzheimer,ChronicCond_Cancer,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_Heartfailure,ChronicCond_IschemicHeart,ChronicCond_KidneyDisease,ChronicCond_ObstrPulmonary,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,County_0,County_1,County_10,County_100,County_11,County_110,County_111,County_113,County_117,County_120,County_130,County_131,County_14,County_140,County_141,County_150,County_160,County_161,County_170,County_180,County_190,County_191,County_194,County_20,County_200,County_210,County_211,County_212,County_213,County_220,County_221,County_222,County_223,County_224,County_230,County_240,County_241,County_25,County_250,County_251,County_260,County_270,County_271,County_280,County_281,County_288,County_290,County_291,County_292,County_30,County_300,County_301,County_310,County_311,County_312,County_320,County_321,County_328,County_330,County_331,County_34,County_340,County_341,County_342,County_343,County_350,County_360,County_361,County_362,County_370,County_380,County_381,County_390,County_391,County_392,County_40,County_400,County_410,County_411,County_412,County_420,County_421,County_430,County_431,County_440,County_441,County_450,County_451,County_460,County_461,County_462,County_470,County_471,County_480,County_490,County_50,County_500,County_510,County_511,County_520,County_521,County_522,County_530,County_531,County_540,County_541,County_542,County_55,County_550,County_551,County_552,County_560,County_561,County_562,County_563,County_564,County_570,County_580,County_581,County_582,County_583,County_590,County_591,County_592,County_60,County_600,County_601,County_610,County_611,County_612,County_620,County_621,County_622,County_630,County_631,County_632,County_640,County_641,County_650,County_651,County_652,County_653,County_654,County_660,County_661,County_662,County_670,County_671,County_672,County_680,County_681,County_690,County_691,County_70,County_700,County_701,County_702,County_703,Co
unty_710,County_711,County_712,County_720,County_722,County_730,County_731,County_734,County_740,County_741,County_742,County_743,County_744,County_750,County_751,County_752,County_753,County_754,County_755,County_756,County_757,County_758,County_760,County_761,County_770,County_771,County_772,County_780,County_782,County_783,County_784,County_785,County_790,County_791,County_792,County_793,County_794,County_795,County_796,County_797,County_80,County_800,County_801,County_802,County_803,County_804,County_810,County_811,County_812,County_820,County_821,County_822,County_830,County_831,County_832,County_834,County_835,County_838,County_84,County_840,County_841,County_842,County_843,County_844,County_845,County_850,County_851,County_860,County_861,County_862,County_867,County_870,County_871,County_873,County_874,County_875,County_876,County_878,County_879,County_88,County_880,...,proc_864.0,proc_8659.0,proc_8666.0,proc_8667.0,proc_8669.0,proc_8670.0,proc_8674.0,proc_8675.0,proc_8683.0,proc_8689.0,proc_8693.0,proc_8694.0,proc_8698.0,proc_8703.0,proc_8708.0,proc_8721.0,proc_8722.0,proc_8724.0,proc_8737.0,proc_8741.0,proc_8744.0,proc_8749.0,proc_8751.0,proc_8753.0,proc_8754.0,proc_8761.0,proc_8769.0,proc_8773.0,proc_8774.0,proc_8775.0,proc_8777.0,proc_8778.0,proc_8801.0,proc_881.0,proc_8814.0,proc_8819.0,proc_8823.0,proc_8826.0,proc_8827.0,proc_8828.0,proc_8829.0,proc_8837.0,proc_8838.0,proc_8841.0,proc_8842.0,proc_8843.0,proc_8844.0,proc_8845.0,proc_8847.0,proc_8848.0,proc_8849.0,proc_8853.0,proc_8855.0,proc_8856.0,proc_8857.0,proc_8866.0,proc_8867.0,proc_8871.0,proc_8872.0,proc_8873.0,proc_8874.0,proc_8875.0,proc_8876.0,proc_8877.0,proc_8879.0,proc_8881.0,proc_889.0,proc_8891.0,proc_8893.0,proc_8894.0,proc_8897.0,proc_8903.0,proc_8904.0,proc_8905.0,proc_8909.0,proc_8914.0,proc_8915.0,proc_8919.0,proc_8921.0,proc_8922.0,proc_8929.0,proc_8937.0,proc_8938.0,proc_8939.0,proc_8941.0,proc_8944.0,proc_8945.0,proc_8949.0,proc_8951.0,proc_8952.0,proc_8954.0,proc_8959.0,proc_8961
.0,proc_8962.0,proc_8964.0,proc_8965.0,proc_8968.0,proc_90.0,proc_9012.0,proc_9031.0,proc_9052.0,proc_9054.0,proc_9064.0,proc_9065.0,proc_9072.0,proc_9092.0,proc_9112.0,proc_9172.0,proc_9181.0,proc_920.0,proc_9201.0,proc_9204.0,proc_9205.0,proc_9214.0,proc_9215.0,proc_9218.0,proc_9219.0,proc_9222.0,proc_9224.0,proc_9226.0,proc_9227.0,proc_9228.0,proc_9229.0,proc_9301.0,proc_9308.0,proc_9311.0,proc_9316.0,proc_9322.0,proc_9326.0,proc_9331.0,proc_9338.0,proc_9339.0,proc_9351.0,proc_9353.0,proc_9354.0,proc_9356.0,proc_9357.0,proc_9359.0,proc_9365.0,proc_9375.0,proc_9383.0,proc_9389.0,proc_9390.0,proc_9391.0,proc_9394.0,proc_9396.0,proc_9399.0,proc_9408.0,proc_9411.0,proc_9419.0,proc_9423.0,proc_9425.0,proc_9427.0,proc_9435.0,proc_9437.0,proc_9438.0,proc_9439.0,proc_9444.0,proc_9449.0,proc_9461.0,proc_9462.0,proc_9463.0,proc_9464.0,proc_9465.0,proc_9466.0,proc_9467.0,proc_9468.0,proc_9469.0,proc_9604.0,proc_9605.0,proc_9607.0,proc_9608.0,proc_9609.0,proc_9619.0,proc_9625.0,proc_9626.0,proc_9627.0,proc_9633.0,proc_9636.0,proc_9638.0,proc_9639.0,proc_9642.0,proc_9648.0,proc_9649.0,proc_9652.0,proc_9656.0,proc_9659.0,proc_966.0,proc_9671.0,proc_9672.0,proc_9702.0,proc_9703.0,proc_9705.0,proc_9714.0,proc_9723.0,proc_9738.0,proc_9749.0,proc_9751.0,proc_9761.0,proc_9762.0,proc_9764.0,proc_9784.0,proc_9787.0,proc_9789.0,proc_9805.0,proc_9815.0,proc_9851.0,proc_9903.0,proc_9904.0,proc_9905.0,proc_9906.0,proc_9907.0,proc_9910.0,proc_9914.0,proc_9915.0,proc_9916.0,proc_9917.0,proc_9918.0,proc_9919.0,proc_9920.0,proc_9921.0,proc_9922.0,proc_9923.0,proc_9925.0,proc_9926.0,proc_9928.0,proc_9929.0,proc_9938.0,proc_9939.0,proc_9952.0,proc_9955.0,proc_9959.0,proc_9960.0,proc_9961.0,proc_9962.0,proc_9969.0,proc_9971.0,proc_9972.0,proc_9973.0,proc_9974.0,proc_9975.0,proc_9978.0,proc_9979.0,proc_9982.0,proc_9984.0,proc_9986.0,proc_9992.0,proc_9995.0,proc_9998.0,proc_9999.0
0,0.365759,0.233463,0.451362,0.754864,0.564202,0.762646,0.474708,0.400778,0.272374,0.330739,0.105058,0.011673,0.0,0.011673,0.011673,0.0,0.0,0.0,0.0,0.0,0.0,0.015564,0.0,0.0,0.003891,0.0,0.07393,0.0,0.0,0.0,0.003891,0.011673,0.0,0.0,0.003891,0.0,0.0,0.0,0.0,0.0,0.011673,0.0,0.0,0.0,0.0,0.007782,0.011673,0.0,0.0,0.054475,0.0,0.0,0.035019,0.0,0.0,0.0,0.0,0.007782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07393,0.0,0.0,0.0,0.0,0.07393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042802,0.0,0.0,0.0,0.0,0.0,0.019455,0.0,0.003891,0.0,0.0,0.0,0.003891,0.0,0.0,0.019455,0.0,0.0,0.0,0.003891,0.038911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003891,0.0,0.0,0.0,0.019455,0.0,0.0,0.042802,0.0,0.0,0.0,0.0,0.011673,0.011673,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015564,0.0,0.0,0.0,0.0,0.0,0.077821,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.093385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.159533,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.426901,0.175439,0.444444,0.730994,0.649123,0.807018,0.473684,0.380117,0.280702,0.345029,0.076023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023392,0.0,0.0,0.005848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070175,0.0,0.0,0.0,0.0,0.0,0.192982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.233918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023392,0.0,0.0,0.0,0.005848,0.0,0.0,0.040936,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.146199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011696,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.429515,0.229075,0.451542,0.685022,0.596916,0.799559,0.398678,0.34141,0.370044,0.290749,0.063877,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156388,0.0,0.0,0.0,0.0,0.0,0.002203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.325991,0.008811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04185,0.0,0.0,0.0,0.0,0.0,0.0,0.453744,0.0,0.0,0.0,0.0,0.0,0.0,0.011013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.496454,0.191489,0.446809,0.77305,0.624113,0.794326,0.460993,0.304965,0.326241,0.326241,0.099291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134752,0.0,0.156028,0.0,0.0,0.0,0.0,0.312057,0.0,0.0,0.0,0.0,0.0,0.0,0.12766,0.0,0.0,0.0,0.007092,0.0,0.0,0.0,0.0,0.0,0.007092,0.0,0.0,0.049645,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049645,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.322917,0.15625,0.385417,0.645833,0.645833,0.6875,0.395833,0.302083,0.291667,0.270833,0.104167,0.0,0.0,0.03125,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.020833,0.0,0.0,0.0,0.0,0.0,0.135417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052083,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,0.010417,0.0,0.0,0.0,0.0,0.0,0.010417,0.0,0.0,0.0,0.0,0.0,0.34375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [458]:
# Write the final table to disk; index=False leaves the row index out of the file.
train_final_data.to_csv(
    '/Users/Julia/Documents/bootcamp/fraud_capstone/data_out/train_final_data.csv',
    index=False,
)

In [448]:
# (rows, columns) of the final table.
# NOTE(review): this cell's execution count (448) is lower than those of the
# preceding cells (455-458), so the stored output predates their last run --
# re-run top to bottom to confirm the shape.
train_final_data.shape

(5410, 16888)