In [8]:
import pandas as pd 
import numpy as np 

In [5]:
df = pd.read_csv('data/pima-indians-diabetes.csv')
df.head()

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
df.describe()


Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [7]:
df.dtypes

#to change types
# df['diabetes'] = df['diabetes'].astype(int)

times_pregnant                    int64
plasma_glucose_concentration      int64
diastolic_blood_pressure          int64
triceps_thickness                 int64
2-hour_serum_insulin              int64
BMI                             float64
diabetes_pedigreen              float64
age                               int64
diabetes                          int64
dtype: object

## find that 0's are used for missing or empty values - convert to NaN
## iterate through columns using numpy to replace 0's with NaN

In [9]:
#create a df copy
df2 = df.copy()
for col in df2.columns[1:7]:
    df2[col] = df2[col].replace(0, np.nan)

df2.head()

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


## Alternate replace - using inplace replacement

In [13]:
df3 = df.copy()
#have to specify the columns and values or else will replace every 0 with Nan and we want to keep the diabetes and times_pregnant 0 
df3.replace({'plasma_glucose_concentration': 0, 'diastolic_blood_pressure': 0, 'triceps_thickness': 0}, np.nan, inplace=True)
df3.head()

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
0,6,148.0,72.0,35.0,0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,0,26.6,0.351,31,0
2,8,183.0,64.0,,0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168,43.1,2.288,33,1


## Outliers
### can say that any value plus or minus some number of standard deviations away from the mean constitutes an outlier
### in this case select 2.5 standard deviations to consider a data point an outlier
What to do with values we find as outliers??
- can remove it and replace with NaN
- can replace it with original mean
    - preserves the original mean
- imputation (not covered in video)
    - runs a model from non-missing data in order to predict the values of missing cells

In [14]:
df2.describe()

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.535641,12.382158,10.476982,118.775855,6.924988,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,64.0,22.0,76.25,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,141.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


### replace outliers with mean

In [18]:
plasma_mean = np.mean(df2['plasma_glucose_concentration'])
plasma_std = np.std(df2['plasma_glucose_concentration'])
print(plasma_mean,plasma_std)

# replace outliers with original mean
df4 = df2.copy()

for (i,val) in enumerate(df3['plasma_glucose_concentration']):
    if (val > plasma_mean + 2.5*plasma_std) or (val < plasma_mean - 2.5*plasma_std):
        df4.at[i, 'plasma_glucose_concentration'] = plasma_mean
        print('replaced outlier: ', i, val)


121.6867627785059 30.515624262345657
replaced outlier:  62 44.0
replaced outlier:  561 198.0
replaced outlier:  661 199.0


In [20]:
df4.describe()

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,3.845052,121.587235,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.149764,12.382158,10.476982,118.775855,6.924988,0.331329,11.760232,0.476951
min,0.0,56.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.5,64.0,22.0,76.25,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,140.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,197.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


## Alternative - more robust approach to replacing outliers

In [21]:
df5 = df2.copy()

df5.loc[df5['plasma_glucose_concentration'] > (plasma_mean + ( plasma_std * 2.5 )), 'plasma_glucose_concentration'] = plasma_mean
df5.loc[df5['plasma_glucose_concentration'] < (plasma_mean - ( plasma_std * 2.5 )), 'plasma_glucose_concentration'] = plasma_mean

df5.describe()


Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,3.845052,121.587235,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.149764,12.382158,10.476982,118.775855,6.924988,0.331329,11.760232,0.476951
min,0.0,56.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.5,64.0,22.0,76.25,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,140.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,197.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


## Alt - using a mask

In [23]:
df6 = df2.copy()

threshold = plasma_std*2.5

mask = (df5['plasma_glucose_concentration'] > ( plasma_mean + threshold)) | (df5['plasma_glucose_concentration'] < ( plasma_mean - threshold))

df6.loc[mask, 'plasma_glucose_concentration'] = plasma_mean

df6.describe()
#broken - didn't remove 

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.535641,12.382158,10.476982,118.775855,6.924988,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,64.0,22.0,76.25,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,141.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0
