# Outlier Handling Notebook: Models 1 & 2

**Medicare DeSYNPuf Data: 
2009 Inpatient Claims for Risk Adjustment Modeling and Correspondence Analysis**

HDS 823 Final Project: Advanced Statistics in Healthcare
Kyle P. Rasku RN BSN

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

In [2]:
# Read the 2009 inpatient + Elixhauser extract (the file written by the Cleaning Notebook)
ip2009_csv_path = "/home/kylier/python/DS/data/PUF/PUF_RWJF_IP2009.csv"
desynpuf = pd.read_csv(ip2009_csv_path)

In [3]:
# Preview the first five rows of the loaded frame
desynpuf.head(n=5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,DESYNPUF_ID,BENE_AGE,B_SEX,NH_WHITE,AA_BLACK,OTHER,NW_HISPANIC,B_DIVERSE,...,COUNTY,NUM_COUNTIES,HO_RANK,HO_PROP_RANK,HO_QUARTILE,HF_RANK,HF_PROP_RANK,HF_QUARTILE,StateCode,CountyFIPS
0,0,0,0000438E79D01BEA,73.0,1,1,0,0,0,0.0,...,Hillsborough,67.0,27.0,0.597,2.0,31.0,0.537,2.0,12.0,57.0
1,1,1,00010E7BEA69142C,72.0,0,1,0,0,0,0.0,...,New Castle,3.0,1.0,0.667,1.0,1.0,0.667,1.0,10.0,3.0
2,2,2,00015BF6509E1DF7,76.0,0,1,0,0,0,0.0,...,Mecklenburg,100.0,5.0,0.95,1.0,14.0,0.86,1.0,37.0,119.0
3,3,3,000240D599ED789C,67.0,0,0,1,0,0,1.0,...,Eau Claire,72.0,13.0,0.819,1.0,4.0,0.944,1.0,55.0,35.0
4,4,4,0003E73AFBA840A8,68.0,0,1,0,0,0,0.0,...,Monroe,62.0,31.0,0.5,2.0,17.0,0.726,2.0,36.0,55.0


In [4]:
# Remove the "Unnamed: 0" / "Unnamed: 0.1" columns when they are present
# (errors="ignore" makes this a no-op for any that are absent),
# then report the number of missing values in every remaining column.
stale_index_columns = ["Unnamed: 0", "Unnamed: 0.1"]
desynpuf.drop(columns=stale_index_columns, inplace=True, errors="ignore")
desynpuf.isnull().sum()

DESYNPUF_ID             0
BENE_AGE                0
B_SEX                   0
NH_WHITE                0
AA_BLACK                0
OTHER                   0
NW_HISPANIC             0
B_DIVERSE               0
B_ALZHDMTA              0
B_CHF                   0
B_CNCR                  0
B_COPD                  0
B_DEPRESSN              0
B_DIABETES              0
B_ISCHMCHT              0
B_OSTEOPRS              0
B_CHRNKIDN              0
B_RA_OA                 0
B_STRKETIA              0
B_ESRD                  0
B_DIED                  0
CVRG_MOS                0
RX_CVRG_MOS             0
MEAN_ELIX_SCORE         0
TOTAL_VISITS            0
TOTAL_CODES             0
MEAN_CODES_VISIT        0
MEAN_CHRONIC            0
ALLCOSTS                0
MEAN_COST_VISIT         0
TOTAL_LOS               0
MEAN_LOS            95763
TOTAL_PHYS              0
MEAN_PHYS_VISIT         0
CBSA                    0
FULL_FIPS_CODE          0
FIPS_STATE_CODE         0
FIPS_COUNTY_CODE        0
TOTCHRONIC  

In [5]:
# Replace missing values column by column, assigning the result back to the frame.
# Calling .fillna(..., inplace=True) on desynpuf[col] mutates an intermediate
# selection: pandas 2.x warns about it and, with Copy-on-Write enabled, the
# parent frame is never updated. Explicit assignment works on every version.
fill_values = {
    "MEAN_LOS": 0,      # null in 95,763 rows per the null count above
    "FIPS": 0,
    "STATE": "None",
    "COUNTY": "None",
}
for column, value in fill_values.items():
    desynpuf[column] = desynpuf[column].fillna(value)

# Drop the county-count, raw rank and duplicate state/county code columns;
# errors="ignore" keeps the cell re-runnable after they are already gone.
desynpuf = desynpuf.drop(
    columns=["NUM_COUNTIES", "HO_RANK", "HF_RANK", "StateCode", "CountyFIPS"],
    errors="ignore",
)

In [6]:
# Display the (rows, columns) dimensions of the frame at this point
desynpuf.shape

(114380, 49)

## Deaths in 2009

In [7]:
# Collapse every B_DIED value greater than 1 into 1.0,
# then show the share of rows carrying each resulting value.
died_above_one = desynpuf["B_DIED"].gt(1)
desynpuf.loc[died_above_one, "B_DIED"] = 1.0
desynpuf["B_DIED"].value_counts(normalize=True)

0.0    0.984927
1.0    0.015073
Name: B_DIED, dtype: float64

In [8]:
# Numeric utilization / cost / comorbidity columns to be screened for outliers
outlier_columns = [
    "MEAN_ELIX_SCORE",
    "TOTAL_LOS",
    "MEAN_LOS",
    "TOTAL_VISITS",
    "TOTAL_CODES",
    "MEAN_CODES_VISIT",
    "TOTAL_PHYS",
    "MEAN_PHYS_VISIT",
    "ALLCOSTS",
    "MEAN_COST_VISIT",
    "TOTCHRONIC",
    "MEAN_CHRONIC",
]
s = desynpuf[outlier_columns]

In [9]:
# Summary statistics with one row per screened column.
# Stored as `outlier_summary` rather than `sb`, because `sb` is the alias the
# import cell gives to seaborn and rebinding it would make seaborn unreachable.
outlier_summary = s.describe().T
# X6 = mean + 6 standard deviations, a reference point for choosing cut-offs
outlier_summary["X6"] = outlier_summary["mean"] + (outlier_summary["std"] * 6)
outlier_summary

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,X6
MEAN_ELIX_SCORE,114380.0,1.277365,2.188974,-1.0,0.0,0.0,2.0,34.0,14.41121
TOTAL_LOS,114380.0,1.222635,4.52545,0.0,0.0,0.0,0.0,216.0,28.375337
MEAN_LOS,114380.0,0.889788,3.023198,0.0,0.0,0.0,0.0,140.0,19.028974
TOTAL_VISITS,114380.0,0.217346,0.564258,0.0,0.0,0.0,0.0,8.0,3.602893
TOTAL_CODES,114380.0,2.078703,5.599753,0.0,0.0,0.0,0.0,94.0,35.677223
MEAN_CODES_VISIT,114380.0,1.547467,3.654879,0.0,0.0,0.0,0.0,16.0,23.476743
TOTAL_PHYS,114380.0,0.434079,1.126821,0.0,0.0,0.0,0.0,16.0,7.195006
MEAN_PHYS_VISIT,114380.0,0.325153,0.737642,0.0,0.0,0.0,0.0,2.0,4.751003
ALLCOSTS,114380.0,4260.904494,21220.42163,-2932.0,0.0,0.0,0.0,1570688.0,131583.434272
MEAN_COST_VISIT,114380.0,2514.398654,8297.948786,-2932.0,0.0,0.0,0.0,196336.0,52302.091369


## Outlier Removal

Remove: 

Rows where MEAN_ELIX_SCORE >= 10

Rows where TOTAL_VISITS > 6

Rows where TOTAL_LOS > 65

Rows where MEAN_LOS >= 21

Rows where MEAN_CODES_VISIT >= 16

Rows where TOTAL_CODES > 40

Rows where TOTAL_PHYS > 8

Rows where ALLCOSTS > 400000

Rows where TOTCHRONIC >= 10

Rows where MEAN_COST_VISIT > 100000

Rows where ALLCOSTS < 0 (negative total cost)

In [10]:
# Display the (rows, columns) dimensions of the frame at this point
desynpuf.shape

(114380, 49)

In [11]:
# Keep only rows whose MEAN_ELIX_SCORE is strictly below 10
keep_elix = desynpuf["MEAN_ELIX_SCORE"].lt(10)
desynpuf = desynpuf.loc[keep_elix]
desynpuf.shape

(113145, 49)

In [12]:
# Keep only rows whose TOTAL_VISITS is strictly below 7
keep_visits = desynpuf["TOTAL_VISITS"].lt(7)
desynpuf = desynpuf.loc[keep_visits]
desynpuf.shape

(113140, 49)

In [13]:
# Keep only rows whose TOTAL_LOS is strictly below 66
keep_total_los = desynpuf["TOTAL_LOS"].lt(66)
desynpuf = desynpuf.loc[keep_total_los]
desynpuf.shape

(113096, 49)

In [14]:
# Keep only rows whose MEAN_LOS is strictly below 21
keep_mean_los = desynpuf["MEAN_LOS"].lt(21)
desynpuf = desynpuf.loc[keep_mean_los]
desynpuf.shape

(112749, 49)

In [15]:
# Keep only rows whose MEAN_CODES_VISIT is strictly below 16
keep_mean_codes = desynpuf["MEAN_CODES_VISIT"].lt(16)
desynpuf = desynpuf.loc[keep_mean_codes]
desynpuf.shape

(112673, 49)

In [16]:
# Keep only rows whose TOTAL_CODES is strictly below 41
keep_total_codes = desynpuf["TOTAL_CODES"].lt(41)
desynpuf = desynpuf.loc[keep_total_codes]
desynpuf.shape

(112502, 49)

In [17]:
# Keep only rows whose TOTAL_PHYS is strictly below 9
keep_total_phys = desynpuf["TOTAL_PHYS"].lt(9)
desynpuf = desynpuf.loc[keep_total_phys]
desynpuf.shape

(112501, 49)

In [18]:
# Keep only rows whose ALLCOSTS is strictly below 400001
keep_allcosts_cap = desynpuf["ALLCOSTS"].lt(400001)
desynpuf = desynpuf.loc[keep_allcosts_cap]
desynpuf.shape

(112500, 49)

In [19]:
# Keep only rows whose TOTCHRONIC is strictly below 10
keep_totchronic = desynpuf["TOTCHRONIC"].lt(10)
desynpuf = desynpuf.loc[keep_totchronic]
desynpuf.shape

(112206, 49)

In [20]:
# Keep only rows whose MEAN_COST_VISIT is strictly below 100001
keep_mean_cost = desynpuf["MEAN_COST_VISIT"].lt(100001)
desynpuf = desynpuf.loc[keep_mean_cost]
desynpuf.shape

(112183, 49)

In [21]:
# Keep only rows whose ALLCOSTS is zero or positive (drops negative totals)
keep_nonnegative_cost = desynpuf["ALLCOSTS"].ge(0)
desynpuf = desynpuf.loc[keep_nonnegative_cost]
desynpuf.shape

(112182, 49)

## Total Rows Removed: 2198

In [22]:
# Persist the outlier-filtered frame.
# index=False: after the row filters the index is just the surviving original
# row labels; writing it would add an unnamed leading column that comes back as
# "Unnamed: 0" on the next read_csv (the kind of column this notebook has to
# drop right after loading its own input).
# NOTE(review): confirm no downstream reader relies on that index column.
desynpuf.to_csv("/home/kylier/python/DS/data/PUF/PUF_ELIX_IP2009.csv", index=False)