# Outlier Handling Notebook: Models 1 & 2

**Medicare DeSYNPuf Data: 
2009 Inpatient Claims for Risk Adjustment Modeling and Correspondence Analysis**

HDS 823 Final Project: Advanced Statistics in Healthcare
Kyle P. Rasku RN BSN

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

In [2]:
# Read the 2009 IP & ELIX extract that the Cleaning Notebook wrote out.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
ip2009_csv_path = "/home/kylier/python/DS/data/PUF/PUF_RWJF_ALLIP2009.csv"
desynpuf = pd.read_csv(ip2009_csv_path)

In [3]:
# Preview the first rows of the freshly loaded frame to check columns and values.
desynpuf.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,DESYNPUF_ID,FIPS_COUNTY_CODE,FULL_FIPS_CODE,CBSA,CBSA_NAME,FIPS_STATE_CODE,CLM_ID,BENE_AGE,...,COUNTY,NUM_COUNTIES,HO_RANK,HO_PROP_RANK,HO_QUARTILE,HF_RANK,HF_PROP_RANK,HF_QUARTILE,StateCode,CountyFIPS
0,0,0,0000438E79D01BEA,57,12057,45300,"Tampa-St. Petersburg-Clearwater, FL",12,,73.0,...,Hillsborough,67.0,27.0,0.597,2.0,31.0,0.537,2.0,12.0,57.0
1,1,1,00010E7BEA69142C,3,10003,48864,"Wilmington, DE-MD-NJ",10,,72.0,...,New Castle,3.0,1.0,0.667,1.0,1.0,0.667,1.0,10.0,3.0
2,2,2,00015BF6509E1DF7,119,37119,16740,"Charlotte-Gastonia-Concord, NC-SC",37,,76.0,...,Mecklenburg,100.0,5.0,0.95,1.0,14.0,0.86,1.0,37.0,119.0
3,3,3,000240D599ED789C,35,55035,20740,"Eau Claire, WI",55,692591100000000.0,67.0,...,Eau Claire,72.0,13.0,0.819,1.0,4.0,0.944,1.0,55.0,35.0
4,4,4,0003E73AFBA840A8,55,36055,40380,"Rochester, NY",36,,68.0,...,Monroe,62.0,31.0,0.5,2.0,17.0,0.726,2.0,36.0,55.0


In [4]:
# Remove the stray "Unnamed" columns (presumably saved indexes from earlier CSV
# round-trips — TODO confirm); errors="ignore" skips any that are not present.
leftover_index_cols = ["Unnamed: 0", "Unnamed: 0.1"]
desynpuf = desynpuf.drop(columns=leftover_index_cols, errors="ignore")

# Number of missing values in each remaining column.
desynpuf.isnull().sum()

DESYNPUF_ID             0
FIPS_COUNTY_CODE        0
FULL_FIPS_CODE          0
CBSA                    0
CBSA_NAME           27839
FIPS_STATE_CODE         0
CLM_ID              95763
BENE_AGE                0
AGE_GROUP               0
B_SEX                   0
B_ALZHDMTA              0
B_CHF                   0
B_CNCR                  0
B_COPD                  0
B_DEPRESSN              0
B_DIABETES              0
B_ISCHMCHT              0
B_OSTEOPRS              0
B_CHRNKIDN              0
B_RA_OA                 0
B_STRKETIA              0
B_ESRD                  0
NH_WHITE                0
AA_BLACK                0
OTHER                   0
NW_HISPANIC             0
MEAN_ELIX_SCORE         0
CODES_COUNT             0
TOTCHRONIC              0
ALLCOSTS                0
CVRG_MOS                0
RX_CVRG_MOS             0
LENGTH_OF_STAY      95763
PHYS_COUNT              0
B_DIED                  0
PERCENT_COSTS           0
ANY_COSTS               0
POC                     0
FIPS        

In [5]:
# Fill missing LENGTH_OF_STAY with 0 days (95,763 rows are null — the same count as
# CLM_ID, so presumably beneficiaries with no inpatient claim; TODO confirm).
# The result is assigned back instead of calling fillna(inplace=True) on the column
# selection: that chained in-place form acts on a temporary object, emits a
# FutureWarning on pandas 2.x, and leaves the frame unchanged under Copy-on-Write.
desynpuf["LENGTH_OF_STAY"] = desynpuf["LENGTH_OF_STAY"].fillna(0)

# Drop columns that are not carried forward; errors="ignore" tolerates any already absent.
desynpuf = desynpuf.drop(
    columns=["FIPS", "NUM_COUNTIES", "StateCode", "CountyFIPS"],
    errors="ignore",
)

In [6]:
# Rows with a missing STATE: per the original note these are the foreign
# beneficiaries with no FIPS codes. They also have no Health Outcomes or
# Proportional Ranks.

state_is_missing = desynpuf["STATE"].isnull()
null_states = desynpuf.loc[state_is_missing]
null_states.head()

Unnamed: 0,DESYNPUF_ID,FIPS_COUNTY_CODE,FULL_FIPS_CODE,CBSA,CBSA_NAME,FIPS_STATE_CODE,CLM_ID,BENE_AGE,AGE_GROUP,B_SEX,...,ANY_COSTS,POC,STATE,COUNTY,HO_RANK,HO_PROP_RANK,HO_QUARTILE,HF_RANK,HF_PROP_RANK,HF_QUARTILE
43,00187098A769FC5E,0,0,0,,0,,55.0,45-64,0,...,0.0,1.0,,,,,,,,
77,002B54E82E2DD278,0,0,0,,0,,63.0,45-64,0,...,0.0,0.0,,,,,,,,
113,003F7869F3D9735E,0,0,0,,0,,61.0,45-64,0,...,0.0,0.0,,,,,,,,
176,005F4826CFA35337,0,0,0,,0,,53.0,45-64,1,...,0.0,0.0,,,,,,,,
196,006D06D8AD310405,0,0,0,,0,,61.0,45-64,0,...,0.0,0.0,,,,,,,,


In [7]:
# (rows, columns) of the frame after the column drops above.
desynpuf.shape

(120623, 46)

In [8]:
# Number of rows for each B_DIED value (presumably a death indicator — TODO confirm).
desynpuf["B_DIED"].value_counts()

0.0    118896
1.0      1727
Name: B_DIED, dtype: int64

In [9]:
# Share of rows for each B_DIED value; normalize=True returns proportions instead of counts.
desynpuf["B_DIED"].value_counts(normalize=True)

0.0    0.985683
1.0    0.014317
Name: B_DIED, dtype: float64

In [10]:
# Numeric columns that are screened for outliers in the cells that follow.
outlier_screen_cols = [
    "MEAN_ELIX_SCORE",
    "LENGTH_OF_STAY",
    "CODES_COUNT",
    "PHYS_COUNT",
    "ALLCOSTS",
    "TOTCHRONIC",
]
s = desynpuf[outlier_screen_cols]

In [11]:
# Per-column summary statistics (one row per screened column) plus "X6",
# six times the standard deviation, used as a guide for the outlier cut-offs.
# Stored as `outlier_summary` rather than `sb` so the `seaborn as sb` alias from
# the import cell is not overwritten by a DataFrame.
outlier_summary = s.describe().T
outlier_summary["X6"] = outlier_summary["std"] * 6
outlier_summary

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,X6
MEAN_ELIX_SCORE,120623.0,1.434454,2.300053,-1.0,0.0,0.0,2.285714,34.0,13.80032
LENGTH_OF_STAY,120623.0,1.159356,3.659783,0.0,0.0,0.0,0.0,140.0,21.958701
CODES_COUNT,120623.0,1.971117,4.05385,0.0,0.0,0.0,0.0,16.0,24.323098
PHYS_COUNT,120623.0,0.411613,0.808224,0.0,0.0,0.0,0.0,2.0,4.849343
ALLCOSTS,120623.0,4040.375849,11649.869488,-2932.0,0.0,0.0,0.0,196336.0,69899.216927
TOTCHRONIC,120623.0,2.798521,2.654667,0.0,0.0,2.0,5.0,11.0,15.928004


## Outlier Removal

Remove: 

Rows where MEAN_ELIX_SCORE >= 14

Rows where LENGTH_OF_STAY > 22

Rows where CODES_COUNT > 24

Rows where PHYS_COUNT > 5

Rows where ALLCOSTS > 100000

Rows where TOTCHRONIC > 11

In [12]:
# Baseline (rows, columns) before any outlier filter is applied.
desynpuf.shape

(120623, 46)

In [13]:
# Keep only rows whose MEAN_ELIX_SCORE is strictly below 14, then show the new size.
elix_in_range = desynpuf["MEAN_ELIX_SCORE"] < 14
desynpuf = desynpuf.loc[elix_in_range]
desynpuf.shape

(120203, 46)

In [14]:
# Keep only rows whose LENGTH_OF_STAY is strictly below 23, then show the new size.
los_in_range = desynpuf["LENGTH_OF_STAY"] < 23
desynpuf = desynpuf.loc[los_in_range]
desynpuf.shape

(119655, 46)

In [15]:
# Keep only rows whose CODES_COUNT is strictly below 25, then show the new size.
codes_in_range = desynpuf["CODES_COUNT"] < 25
desynpuf = desynpuf.loc[codes_in_range]
desynpuf.shape

(119655, 46)

In [16]:
# Keep only rows whose PHYS_COUNT is strictly below 6, then show the new size.
phys_in_range = desynpuf["PHYS_COUNT"] < 6
desynpuf = desynpuf.loc[phys_in_range]
desynpuf.shape

(119655, 46)

In [17]:
# Keep only rows whose ALLCOSTS is strictly below 100001, then show the new size.
# NOTE(review): the stated rule is "ALLCOSTS > 100000"; `< 100001` matches it only
# if costs are whole numbers — confirm.
costs_in_range = desynpuf["ALLCOSTS"] < 100001
desynpuf = desynpuf.loc[costs_in_range]
desynpuf.shape

(119508, 46)

In [18]:
# Keep only rows whose TOTCHRONIC is strictly below 12, then show the new size.
chronic_in_range = desynpuf["TOTCHRONIC"] < 12
desynpuf = desynpuf.loc[chronic_in_range]
desynpuf.shape

(119508, 46)

## Total Rows Removed: 1115

In [19]:
# Persist the outlier-filtered frame for later notebooks.
# NOTE(review): the row index is written as an extra unnamed column (default
# index=True), which shows up as "Unnamed: 0" when the file is read back —
# confirm whether downstream readers expect it before switching to index=False.
no_outliers_csv_path = "/home/kylier/python/DS/data/PUF/PUF_RWJF_ALLIP2009_NO.csv"
desynpuf.to_csv(no_outliers_csv_path)