## Setup and Data Import

In [1]:
import sys
sys.path.insert(0, '..')

from joblib import load

import Functions as fxns
from Sita_Functions import np, pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from datetime import timedelta

## Claims DF

In [2]:
# !python ../Preprocessing.py

In [3]:
claims = load('../claims.pkl')

### New Columns

In [4]:
claims['AgeAtService'] = ((claims.ClaimStartDt - claims.DOB)
                              / timedelta(days=365)).astype(int)
claims['HasDied'] = claims.DOD.notna()

physician_cols = \
    claims.columns[claims.columns.str.contains('Physician')].to_list()
claims['HasAnyPhysician'] = claims[physician_cols].notna().any(axis=1)
claims['HasAllPhysicians'] = claims[physician_cols].notna().all(axis=1)

claims['ClaimDuration'] = \
    (claims.ClaimEndDt  - claims.ClaimStartDt).dt.days + 1
claims['IPDuration'] = \
    (claims.DischargeDt - claims.AdmissionDt).dt.days + 1

outpatient_claims = claims.loc[claims.IsOutpatient == 1]
inpatient_claims  = claims.loc[claims.IsOutpatient == 0]
claims['IPClaimDuration'] = \
    (inpatient_claims.ClaimEndDt
     - inpatient_claims.ClaimStartDt).dt.days + 1
claims['OPClaimDuration'] = \
    (outpatient_claims.ClaimEndDt
     - outpatient_claims.ClaimStartDt).dt.days + 1

claims['ClaimCost'] = \
    claims.InscClaimAmtReimbursed + claims.DeductibleAmtPaid
claims['PercInsCovered'] = \
    round((claims.InscClaimAmtReimbursed / claims.ClaimCost) * 100)
claims['DailyClaimCost'] = \
    round(claims.ClaimCost / claims.ClaimDuration)

In [5]:
# add date cols containing only day, week, year for each column
fxns.split_date(claims, ['ClaimStartDt', 'ClaimEndDt',
                         'AdmissionDt', 'DischargeDt'])

In [6]:
claims = fxns.add_count_per_col(
    claims, 'BeneID', 'ClaimID', 'ClaimsPerBene')
claims = fxns.add_count_per_col(
    claims, 'AttendingPhysician', 'ClaimID', 'ClaimsPerAttendingPhysician')
claims = fxns.add_count_per_col(
    claims, 'OperatingPhysician', 'ClaimID', 'ClaimsPerOperatingPhysician')
claims = fxns.add_count_per_col(
    claims, 'OtherPhysician', 'ClaimID', 'ClaimsPerOtherPhysician')

In [7]:
claims.columns[claims.columns.str.contains('Reimb')].to_list()

['InscClaimAmtReimbursed',
 'IPAnnualReimbursementAmt',
 'OPAnnualReimbursementAmt']

In [8]:
outpatient_claims = claims.loc[claims.IsOutpatient == 1]
inpatient_claims  = claims.loc[claims.IsOutpatient == 0]

# Mean amount of reimbursed claims
# NEED TO REMOVE ReimbPerIP_sum
inpatient_claims = fxns.add_mean_per_col(
    inpatient_claims, 'BeneID', 'InscClaimAmtReimbursed', 'ReimbPerIP')
outpatient_claims = fxns.add_mean_per_col(
    outpatient_claims, 'BeneID', 'InscClaimAmtReimbursed', 'ReimbPerOP')
claims = pd.concat([inpatient_claims, outpatient_claims])


# Total (or mean) deductible paid for inpatients
# Total (or mean) deductible paid for outpatients

In [9]:
claims.columns

Index(['BeneID', 'ClaimID', 'ClaimStartDt', 'ClaimEndDt', 'Provider',
       'InscClaimAmtReimbursed', 'AttendingPhysician', 'OperatingPhysician',
       'OtherPhysician', 'AdmissionDt', 'ClmAdmitDiagnosisCode',
       'DeductibleAmtPaid', 'DischargeDt', 'DiagnosisGroupCode',
       'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
       'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
       'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
       'ClmDiagnosisCode_10', 'ClmProcedureCode_1', 'ClmProcedureCode_2',
       'ClmProcedureCode_3', 'ClmProcedureCode_4', 'ClmProcedureCode_5',
       'ClmProcedureCode_6', 'IsOutpatient', 'DOB', 'DOD', 'Gender', 'Race',
       'RenalDiseaseIndicator', 'State', 'County', 'NoOfMonths_PartACov',
       'NoOfMonths_PartBCov', 'ChronicCond_Alzheimer',
       'ChronicCond_Heartfailure', 'ChronicCond_KidneyDisease',
       'ChronicCond_Cancer', 'ChronicCond_ObstrPulmonary',
       'ChronicCond_Depressi

### Pre-processing

In [10]:
# encode bool colums to 0/1
fxns.re_encode_bool(claims,
    ['HasDied', 'HasAnyPhysician', 'HasAllPhysicians'])

In [11]:
# drop ChronicCond_ prefix from applicable cols
fxns.drop_chronic_prefix(claims)

In [12]:
# change object cols to dtype category
fxns.to_category_dtype(claims)

### Variables

In [13]:
numeric_cols     = fxns.cols_by_dtype(claims)[0]
categorical_cols = fxns.cols_by_dtype(claims)[1]
date_cols        = fxns.cols_by_dtype(claims)[2]

physician_cols = \
    claims.columns[claims.columns.str.contains('Physician')].to_list()
chronic_cols   = \
    claims.columns[claims.columns.str.contains('Chronic')].to_list()
diagnosis_cols = \
    claims.columns[claims.columns.str.contains('Diagnosis')].to_list()
procedure_cols = \
    claims.columns[claims.columns.str.contains('Procedure')].to_list()

### Summary Data

In [14]:
outpatient_claims = claims.loc[claims.IsOutpatient == 1]
inpatient_claims  = claims.loc[claims.IsOutpatient == 0]

In [15]:
groupby_providers   = claims.groupby('Provider')

In [16]:
mean_by_provider    = \
    groupby_providers.mean().add_suffix('_mean').reset_index()

In [17]:
sum_by_provider     = \
    groupby_providers.sum().add_suffix('_sum').reset_index()

In [18]:
chronic_by_provider = claims[['Provider'] + chronic_cols
                            ].groupby('Provider').sum().reset_index()

In [19]:
# need to get unique counts
count_by_provider = groupby_providers[
    ['BeneID', 'ClaimID', 'County', 'State', 'IPDuration']
    + physician_cols + diagnosis_cols + procedure_cols
    ].count().add_suffix('_count').reset_index()

# ADD PHYSICIAN RATIOS, SUMS OF EACH
# CLAIM ADMIT CODE, DIAGNOSIS GROUP CODE?
# ADD RANGE OF AGE

In [20]:
providers = pd.merge(mean_by_provider, sum_by_provider, on='Provider')
providers = pd.merge(providers, count_by_provider, on='Provider')
providers = pd.merge(providers, chronic_by_provider, on='Provider')

In [21]:
providers.sort_index(axis=1, inplace=True)

In [22]:
# print(claims.columns, '\n')

# Can't use, but have IPDuration count/mean/sum:
#     'AdmissionDt', 'AdmissionDt_Week',
#     'DischargeDt', 'DischargeDt_Week'

# Can't use, but have ClaimDuration count/mean/sum:
#     'ClaimEndDt', 'ClaimEndDt_Week', 'ClaimID'
#     'ClaimStartDt', 'ClaimStartDt_Week',


# Can't use, but have AgeAtService mean/sum and HasDied sum/ratio:
#     'DOB', 'DOD'

In [23]:
print(providers.shape, '\n')
print(providers.columns)
providers.sample(5)

(5410, 87) 

Index(['AgeAtService_mean', 'AgeAtService_sum', 'Alzheimers_Chronic',
       'AttendingPhysician_count', 'BeneID_count', 'Cancer_Chronic',
       'ClaimCost_mean', 'ClaimCost_sum', 'ClaimDuration_mean',
       'ClaimDuration_sum', 'ClaimID_count',
       'ClaimsPerAttendingPhysician_count', 'ClaimsPerAttendingPhysician_mean',
       'ClaimsPerAttendingPhysician_sum', 'ClaimsPerBene_mean',
       'ClaimsPerBene_sum', 'ClaimsPerOperatingPhysician_count',
       'ClaimsPerOperatingPhysician_mean', 'ClaimsPerOperatingPhysician_sum',
       'ClaimsPerOtherPhysician_count', 'ClaimsPerOtherPhysician_mean',
       'ClaimsPerOtherPhysician_sum', 'ClmAdmitDiagnosisCode_count',
       'ClmDiagnosisCode_10_count', 'ClmDiagnosisCode_1_count',
       'ClmDiagnosisCode_2_count', 'ClmDiagnosisCode_3_count',
       'ClmDiagnosisCode_4_count', 'ClmDiagnosisCode_5_count',
       'ClmDiagnosisCode_6_count', 'ClmDiagnosisCode_7_count',
       'ClmDiagnosisCode_8_count', 'ClmDiagnosisCode_9_cou

Unnamed: 0,AgeAtService_mean,AgeAtService_sum,Alzheimers_Chronic,AttendingPhysician_count,BeneID_count,Cancer_Chronic,ClaimCost_mean,ClaimCost_sum,ClaimDuration_mean,ClaimDuration_sum,ClaimID_count,ClaimsPerAttendingPhysician_count,ClaimsPerAttendingPhysician_mean,ClaimsPerAttendingPhysician_sum,ClaimsPerBene_mean,ClaimsPerBene_sum,ClaimsPerOperatingPhysician_count,ClaimsPerOperatingPhysician_mean,ClaimsPerOperatingPhysician_sum,ClaimsPerOtherPhysician_count,ClaimsPerOtherPhysician_mean,ClaimsPerOtherPhysician_sum,ClmAdmitDiagnosisCode_count,ClmDiagnosisCode_10_count,ClmDiagnosisCode_1_count,ClmDiagnosisCode_2_count,ClmDiagnosisCode_3_count,ClmDiagnosisCode_4_count,ClmDiagnosisCode_5_count,ClmDiagnosisCode_6_count,ClmDiagnosisCode_7_count,ClmDiagnosisCode_8_count,ClmDiagnosisCode_9_count,ClmProcedureCode_1_count,ClmProcedureCode_2_count,ClmProcedureCode_3_count,ClmProcedureCode_4_count,ClmProcedureCode_5_count,ClmProcedureCode_6_count,County_count,DailyClaimCost_mean,DailyClaimCost_sum,DeductibleAmtPaid_mean,DeductibleAmtPaid_sum,Depression_Chronic,Diabetes_Chronic,DiagnosisGroupCode_count,HasAllPhysicians_count,HasAnyPhysician_count,HeartFailure_Chronic,IPAnnualDeductibleAmt_mean,IPAnnualDeductibleAmt_sum,IPAnnualReimbursementAmt_mean,IPAnnualReimbursementAmt_sum,IPClaimDuration_mean,IPClaimDuration_sum,IPDuration_count,IPDuration_mean,IPDuration_sum,InscClaimAmtReimbursed_mean,InscClaimAmtReimbursed_sum,IschemicHeart_Chronic,KidneyDisease_Chronic,NoOfMonths_PartACov_mean,NoOfMonths_PartACov_sum,NoOfMonths_PartBCov_mean,NoOfMonths_PartBCov_sum,OPAnnualDeductibleAmt_mean,OPAnnualDeductibleAmt_sum,OPAnnualReimbursementAmt_mean,OPAnnualReimbursementAmt_sum,OPClaimDuration_mean,OPClaimDuration_sum,ObstrPulmonary_Chronic,OperatingPhysician_count,Osteoporosis_Chronic,OtherPhysician_count,PercInsCovered_mean,PercInsCovered_sum,Provider,ReimbPerIP_mean_mean,ReimbPerIP_mean_sum,ReimbPerOP_mean_mean,ReimbPerOP_mean_sum,RheumatoidArthritis_Chronic,State_count,Stroke_Chronic
3663,73.6,1472,7,20,20,2,114.0,2280.0,3.4,68,20,20,1.55,31.0,6.6,132,3,1.0,3.0,5,1.4,7.0,4,0,19,10,9,2,2,2,0,0,0,0,0,0,0,0,0,20,87.4,1748.0,2.0,40.0,7,14,0,20,20,10,480.6,9612,4502.0,90040,,0.0,0,,0.0,112.0,2240,14,7,12.0,240,12.0,240,363.5,7270,1618.0,32360,3.4,68.0,7,3,8,5,99.631579,1893.0,PRV55589,,0.0,225.429113,4508.582251,7,20,3
3096,70.333333,211,2,3,3,0,11568.0,23136.0,11.666667,35,3,3,1.666667,5.0,2.333333,7,3,3.0,9.0,0,,0.0,3,0,3,3,3,3,3,3,2,2,2,3,1,0,0,0,0,3,4525.5,9051.0,1068.0,2136.0,1,2,3,3,3,1,1068.0,3204,26326.666667,78980,11.666667,35.0,3,11.666667,35.0,19333.333333,58000,3,2,12.0,36,12.0,36,70.0,210,233.333333,700,,0.0,1,3,0,0,90.5,181.0,PRV54876,13333.333333,40000.0,,0.0,1,3,0
5340,70.292308,4569,25,65,65,8,327.538462,21290.0,1.476923,96,65,65,65.0,4225.0,6.292308,409,13,13.0,169.0,24,24.0,576.0,13,0,65,39,26,12,5,2,1,0,0,1,0,0,0,0,0,65,311.6,20254.0,2.769231,180.0,33,45,0,65,65,38,312.184615,20292,3233.692308,210190,,0.0,0,,0.0,324.769231,21110,42,20,12.0,780,12.0,780,629.692308,40930,1814.461538,117940,1.476923,96.0,17,13,13,24,98.238095,6189.0,PRV57684,,0.0,276.222302,17954.449643,21,65,3
50,74.678571,2091,12,28,28,1,2899.703704,78292.0,3.928571,110,28,28,17.142857,480.0,5.821429,163,11,6.636364,73.0,6,2.0,12.0,16,1,28,22,18,15,14,13,10,9,9,8,1,0,0,0,0,28,673.851852,18194.0,360.074074,9722.0,16,20,10,28,28,19,719.857143,20156,5653.214286,158290,6.6,66.0,10,6.6,66.0,2734.642857,76570,23,10,11.571429,324,12.0,336,580.714286,16260,1754.285714,49120,2.444444,44.0,9,11,6,6,92.269231,2399.0,PRV51065,7058.333333,70583.333333,206.040123,3708.722222,5,28,7
2290,70.9,1418,14,20,20,3,8118.0,162360.0,5.05,101,20,20,3.6,72.0,7.5,150,12,2.333333,28.0,4,1.0,4.0,20,3,20,20,20,20,20,19,19,18,14,11,0,0,0,0,0,20,1943.15,38863.0,1068.0,21360.0,11,16,20,20,20,15,2189.4,43788,19812.5,396250,5.05,101.0,20,5.05,101.0,7050.0,141000,19,17,12.0,240,12.0,240,741.5,14830,1803.0,36060,,0.0,9,12,3,4,79.9,1598.0,PRV53855,8516.666667,170333.333333,,0.0,7,20,1


## New Columns

### Beneficiaries

In [24]:
providers['IPClaims_count'] = \
    inpatient_claims.groupby('Provider').IsOutpatient.count().values
providers['OPClaims_count'] = \
    outpatient_claims.groupby('Provider').IsOutpatient.count().values

# Ratio of inpatient claims can be determined from the above - do we need a separate col for it?

In [25]:
def binary_ratios(df, col, group1, group2):
    '''
    Adds a new column to a df that lists the ratio of one value from a
    binary column in the same df. (The ratio of the other value in the
    binary column is the inverse.)
    
    Arguments: A single dataframe.
    
    Output: None.
    
    Returns: Dataframe altered in place.
    '''
    by_provider = df.groupby(['Provider', col]
                            ).ClaimID.count().reset_index()

    colname = pd.DataFrame()
    colname[group1] = by_provider[by_provider[col] == 1].ClaimID.to_list()
    colname[group2] = by_provider[by_provider[col] == 0].ClaimID.to_list()
    
    newcolname1 = f'{group1}_ratio'
    
    providers[newcolname1] = round(
        (colname[group1] / (colname[group1] + colname[group2])) * 100, 2)

In [26]:
binary_ratios(claims, 'Gender', 'GenderZero', 'GenderOne')
binary_ratios(claims, 'RenalDisease', 'HasRenalDisease', 'NotRenalDisease')
binary_ratios(claims, 'HasDied', 'HasDied', 'NotDead')

In [27]:
race_by_provider = \
    claims.groupby(['Provider', 'Race']).ClaimID.count().reset_index()

race = pd.DataFrame()
race['race1'] = race_by_provider[race_by_provider.Race == 1
                                ].ClaimID.to_list()
race['race0'] = race_by_provider[race_by_provider.Race == 0
                                ].ClaimID.to_list()
race['race3'] = race_by_provider[race_by_provider.Race == 3
                                ].ClaimID.to_list()
race['race5'] = race_by_provider[race_by_provider.Race == 5
                                ].ClaimID.to_list()

providers['RaceZero_ratio']  = round(
    (race.race0 / np.sum(race, axis=1) * 100), 2)
providers['RaceThree_ratio'] = round(
    (race.race3 / np.sum(race, axis=1) * 100), 2)
providers['RaceFive_ratio']  = round(
    (race.race5 / np.sum(race, axis=1) * 100), 2)

In [28]:
# DO WE WANT CHRONIC CONDITION RATIOS? WHY/WHY NOT?

In [29]:
# ratio_cols = providers.columns[providers.columns.str.contains('_ratio')
#                               ].to_list()
# providers[['Provider'] + ratio_cols].sample(5)

### Inpatients/Outpatiens

In [30]:
binary_ratios(claims, 'IsOutpatient', 'IsOutpatient', 'Inpatient')

In [31]:
# Based on providers.IsOutpatient_ratio, do we need this?
# Whether the provider serves both in/out patients

### Doctors

In [32]:
providers.columns

Index(['AgeAtService_mean', 'AgeAtService_sum', 'Alzheimers_Chronic',
       'AttendingPhysician_count', 'BeneID_count', 'Cancer_Chronic',
       'ClaimCost_mean', 'ClaimCost_sum', 'ClaimDuration_mean',
       'ClaimDuration_sum', 'ClaimID_count',
       'ClaimsPerAttendingPhysician_count', 'ClaimsPerAttendingPhysician_mean',
       'ClaimsPerAttendingPhysician_sum', 'ClaimsPerBene_mean',
       'ClaimsPerBene_sum', 'ClaimsPerOperatingPhysician_count',
       'ClaimsPerOperatingPhysician_mean', 'ClaimsPerOperatingPhysician_sum',
       'ClaimsPerOtherPhysician_count', 'ClaimsPerOtherPhysician_mean',
       'ClaimsPerOtherPhysician_sum', 'ClmAdmitDiagnosisCode_count',
       'ClmDiagnosisCode_10_count', 'ClmDiagnosisCode_1_count',
       'ClmDiagnosisCode_2_count', 'ClmDiagnosisCode_3_count',
       'ClmDiagnosisCode_4_count', 'ClmDiagnosisCode_5_count',
       'ClmDiagnosisCode_6_count', 'ClmDiagnosisCode_7_count',
       'ClmDiagnosisCode_8_count', 'ClmDiagnosisCode_9_count',
       '

### Codes

### Money

### Time

### Location

In [72]:
providers[providers.columns[providers.columns.str.contains('Reimb')].to_list()].iloc[:,-5:].sample(5)

Unnamed: 0,OPAnnualReimbursementAmt_sum,ReimbPerIP_mean_mean,ReimbPerIP_mean_sum,ReimbPerOP_mean_mean,ReimbPerOP_mean_sum
953,31630,,0.0,216.4843,4979.138889
4953,107410,,0.0,278.600625,20337.845599
3507,44840,,0.0,212.285714,10402.0
3948,34890,,0.0,240.641023,5775.38456
2495,406540,10256.17284,553833.333333,202.310673,39450.581169


In [35]:
providers.sample(10)

Unnamed: 0,AgeAtService_mean,AgeAtService_sum,Alzheimers_Chronic,AttendingPhysician_count,BeneID_count,Cancer_Chronic,ClaimCost_mean,ClaimCost_sum,ClaimDuration_mean,ClaimDuration_sum,ClaimID_count,ClaimsPerAttendingPhysician_count,ClaimsPerAttendingPhysician_mean,ClaimsPerAttendingPhysician_sum,ClaimsPerBene_mean,ClaimsPerBene_sum,ClaimsPerOperatingPhysician_count,ClaimsPerOperatingPhysician_mean,ClaimsPerOperatingPhysician_sum,ClaimsPerOtherPhysician_count,ClaimsPerOtherPhysician_mean,ClaimsPerOtherPhysician_sum,ClmAdmitDiagnosisCode_count,ClmDiagnosisCode_10_count,ClmDiagnosisCode_1_count,ClmDiagnosisCode_2_count,ClmDiagnosisCode_3_count,ClmDiagnosisCode_4_count,ClmDiagnosisCode_5_count,ClmDiagnosisCode_6_count,ClmDiagnosisCode_7_count,ClmDiagnosisCode_8_count,ClmDiagnosisCode_9_count,ClmProcedureCode_1_count,ClmProcedureCode_2_count,ClmProcedureCode_3_count,ClmProcedureCode_4_count,ClmProcedureCode_5_count,ClmProcedureCode_6_count,County_count,DailyClaimCost_mean,DailyClaimCost_sum,DeductibleAmtPaid_mean,DeductibleAmtPaid_sum,Depression_Chronic,Diabetes_Chronic,DiagnosisGroupCode_count,HasAllPhysicians_count,HasAnyPhysician_count,HeartFailure_Chronic,IPAnnualDeductibleAmt_mean,IPAnnualDeductibleAmt_sum,IPAnnualReimbursementAmt_mean,IPAnnualReimbursementAmt_sum,IPClaimDuration_mean,IPClaimDuration_sum,IPDuration_count,IPDuration_mean,IPDuration_sum,InscClaimAmtReimbursed_mean,InscClaimAmtReimbursed_sum,IschemicHeart_Chronic,KidneyDisease_Chronic,NoOfMonths_PartACov_mean,NoOfMonths_PartACov_sum,NoOfMonths_PartBCov_mean,NoOfMonths_PartBCov_sum,OPAnnualDeductibleAmt_mean,OPAnnualDeductibleAmt_sum,OPAnnualReimbursementAmt_mean,OPAnnualReimbursementAmt_sum,OPClaimDuration_mean,OPClaimDuration_sum,ObstrPulmonary_Chronic,OperatingPhysician_count,Osteoporosis_Chronic,OtherPhysician_count,PercInsCovered_mean,PercInsCovered_sum,Provider,ReimbPerIP_mean_mean,ReimbPerIP_mean_sum,ReimbPerOP_mean_mean,ReimbPerOP_mean_sum,RheumatoidArthritis_Chronic,State_count,Stroke_Chronic,IPClaims_count,OPClaims_count,GenderZero_ratio,HasRenalDisease_ratio,HasDied_ratio,RaceZero_ratio,RaceThree_ratio,RaceFive_ratio,IsOutpatient_ratio
1416,79.0,79,0,1,1,0,50.0,50.0,1.0,1,1,1,8.0,8.0,9.0,9,0,,0.0,0,,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,50.0,50.0,0.0,0.0,1,1,0,1,1,0,0.0,0,0.0,0,,0.0,0,,0.0,50.0,50,0,1,12.0,12,12.0,12,640.0,640,970.0,970,1.0,1.0,0,0,1,0,100.0,100.0,PRV52757,,0.0,96.666667,96.666667,0,1,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,100.0
4352,69.485714,92416,494,1323,1330,184,973.853806,1292304.0,2.753383,3662,1330,1323,42.807256,56634.0,6.803008,9048,275,9.534545,2622.0,520,20.869231,10852.0,326,8,1300,837,564,389,271,199,152,119,94,49,11,2,0,0,0,1330,338.150716,448726.0,65.677468,87154.0,554,923,81,1330,1330,746,425.178947,565488,3725.601504,4955050,6.703704,543.0,81,6.703704,543.0,927.932331,1234150,958,555,11.96391,15912,11.974436,15926,569.676692,757670,2118.075188,2817040,2.497198,3119.0,432,275,395,520,97.272304,125384.0,PRV56442,10978.395062,889250.0,304.66232,380523.237787,447,1330,123,81,1249,44.06,22.41,1.05,13.91,0.75,0.0,93.91
1390,78.5,157,1,2,2,0,190.0,380.0,11.0,22,2,2,2.0,4.0,7.5,15,1,1.0,1.0,1,1.0,1.0,1,0,2,2,2,2,0,0,0,0,0,0,0,0,0,0,0,2,152.0,304.0,0.0,0.0,0,1,0,2,2,1,0.0,0,0.0,0,,0.0,0,,0.0,190.0,380,1,1,12.0,24,12.0,24,280.0,560,1150.0,2300,11.0,22.0,0,1,1,1,100.0,200.0,PRV52726,,0.0,148.660714,297.321429,0,2,0,0,2,50.0,0.0,0.0,0.0,0.0,50.0,100.0
4025,70.307692,914,7,13,13,1,119.230769,1550.0,1.230769,16,13,13,3.846154,50.0,8.923077,116,2,1.0,2.0,3,1.666667,5.0,2,0,13,6,2,1,1,1,0,0,0,0,0,0,0,0,0,13,113.461538,1475.0,1.538462,20.0,7,10,0,13,13,9,328.615385,4272,1270.0,16510,,0.0,0,,0.0,117.692308,1530,10,9,12.0,156,12.0,156,565.384615,7350,2497.692308,32470,1.230769,16.0,5,2,6,3,99.75,1197.0,PRV56034,,0.0,268.604052,3491.852675,0,13,1,0,13,30.77,30.77,0.0,15.38,0.0,0.0,100.0
2579,74.1,1482,9,20,20,1,235.5,4710.0,3.25,65,20,20,4.8,96.0,4.5,90,1,1.0,1.0,6,2.333333,14.0,5,0,18,9,3,2,1,0,0,0,0,0,0,0,0,0,0,20,193.55,3871.0,5.0,100.0,10,17,0,20,20,14,213.6,4272,2263.5,45270,,0.0,0,,0.0,230.5,4610,16,7,12.0,240,12.0,240,433.0,8660,1456.0,29120,3.25,65.0,3,1,7,6,97.368421,1850.0,PRV54210,,0.0,229.927778,4598.555556,7,20,2,0,20,45.0,10.0,0.0,20.0,0.0,0.0,100.0
2354,74.092784,7187,36,97,97,15,229.072165,22220.0,2.628866,255,97,97,2.28866,222.0,6.659794,646,20,1.35,27.0,37,1.378378,51.0,18,0,95,57,38,29,18,13,6,5,2,0,0,0,0,0,0,97,152.278351,14771.0,3.402062,330.0,45,72,0,97,97,57,420.742268,40812,5115.257732,496180,,0.0,0,,0.0,225.670103,21890,75,39,11.938144,1158,12.0,1164,683.608247,66310,2309.587629,224030,2.628866,255.0,29,20,30,37,97.544444,8779.0,PRV53940,,0.0,281.599957,27315.195804,30,97,4,0,97,45.36,18.56,0.0,15.46,2.06,1.03,100.0
1441,73.284289,29387,173,400,401,66,291.197007,116770.0,2.139651,858,401,400,200.045,80018.0,6.159601,2470,77,77.0,5929.0,168,168.0,28224.0,83,2,394,250,163,100,59,41,28,18,15,0,0,0,0,0,0,401,219.142145,87876.0,3.715711,1490.0,171,274,0,401,401,250,531.371571,213080,4548.902743,1824110,,0.0,0,,0.0,287.481297,115280,318,172,11.885287,4766,11.785536,4726,614.064838,246240,2171.620948,870820,2.139651,858.0,128,77,133,168,97.486842,37045.0,PRV52788,,0.0,274.873208,110224.156336,121,401,39,0,401,42.89,17.46,0.25,15.96,3.49,1.75,100.0
331,98.0,98,0,1,1,1,150.0,150.0,1.0,1,1,1,1.0,1.0,8.0,8,1,1.0,1.0,0,,0.0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,150.0,150.0,100.0,100.0,0,1,0,1,1,1,0.0,0,0.0,0,,0.0,0,,0.0,50.0,50,1,1,12.0,12,12.0,12,1710.0,1710,3440.0,3440,1.0,1.0,1,1,0,0,33.0,33.0,PRV51423,,0.0,431.25,431.25,0,1,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,100.0
3505,79.962963,2159,3,27,27,4,1444.222222,38994.0,2.259259,61,27,27,3.888889,105.0,7.740741,209,4,1.0,4.0,9,2.333333,21.0,7,0,27,19,15,10,6,6,4,3,3,2,0,0,0,0,0,27,421.037037,11368.0,118.666667,3204.0,9,20,3,27,27,20,237.333333,6408,1888.888889,51000,5.333333,16.0,3,5.333333,16.0,1325.555556,35790,23,6,12.0,324,12.0,324,1060.740741,28640,1438.518519,38840,1.875,45.0,15,4,15,9,98.615385,2564.0,PRV55399,10333.333333,31000.0,159.377671,3825.064103,5,27,1,3,24,40.74,22.22,0.0,0.0,0.0,0.0,88.89
1280,73.2,3660,21,50,50,4,3895.52,194776.0,4.34,217,50,50,6.6,330.0,6.76,338,19,4.052632,77.0,16,2.5,40.0,25,1,50,36,30,25,21,18,17,16,15,10,1,0,0,0,0,50,1259.32,62966.0,363.12,18156.0,21,42,17,50,50,25,640.8,32040,5548.2,277410,5.058824,86.0,17,5.058824,86.0,3532.4,176620,41,21,11.04,552,11.88,594,764.8,38240,2687.8,134390,3.969697,131.0,16,19,13,16,95.0,4560.0,PRV52588,9666.666667,164333.333333,406.060783,13400.005828,12,50,10,17,33,42.0,22.0,0.0,8.0,0.0,2.0,66.0


### Size

In [34]:
# Ratio of attending physicians serving for different hospitals (75% threshold)
# Ratio of operating physicians serving for different hospitals (75% threshold)
# Ratio of other physicians serving for different hospitals (75% threshold)
# Ratio of inpatients going to different hospitals (75% threshold)
# Ratio of outpatients going to different hospitals (75% threshold)
# Ratio of inpatients that receive both in/out patient service
# Ratio of outpatients that receive both in/out patient service
# Mean amount of reimbursed claims in dollars for inpatients
# Mean amount of reimbursed claims in dollars for outpatients
# Total (or mean) deductible paid for inpatients
# Total (or mean) deductible paid for outpatients
# Ratio of claims that didn’t have any physician involved
# Ratio of claims that had all physicians involved
# Mean admission duration for inpatients
# Average age of patients
# Ratio of inpatients with top 5 frequent chronic disease (from PotentialFraud)
# Ratio of outpatients with top 5 frequent chronic disease (from PotentialFraud)
# The mean number of chronic condition for inpatients
# The mean number of chronic condition for outpatients
# Ratio of inpatient claims with top 5 admtcode (from PotentialFraud)
# Ratio of outpatient claims with top 5 admtcode (from PotentialFraud)
# avg Insurance covered Ratio for inpatients (Reimbursement/(Reimbursement+Deductible)
# avg Insurance covered Ratio for outpatients
# Avg revenue per day for inpatients
# Avg revenue per day for outpatients
# Inpatient duplicate ratio
# Outpatient duplicate ratio
# All Diag codes are NaN ratio
# All Proc codes are NaN ratio
# The mean number of chronic condition
# Number of states for inpatient per provider
# Number of states for outpatient per provider
# Ratio of claims from top 5 fraudulent states per provider