### Throughout this exercise, you may need to refer to online documentation.

Pandas: http://pandas.pydata.org/pandas-docs/stable/

Numpy & Scipy: https://docs.scipy.org/doc/

In [1]:
# First, import packages and load some data
import pandas      as pd
import numpy       as np
import scipy       as sp
import scipy.stats as stats
from statistics import mean

# Location of the Pima Indians diabetes CSV, relative to the notebook
datafile = './data/pima-indians-diabetes.csv'

# The first row of the file supplies the column names; no column is used as the index
df = pd.read_csv(
    filepath_or_buffer=datafile,
    header=0,
    index_col=None,
)
# df.head()

## Replace 0 with NaN in the appropriate columns

In [25]:
# Map each column to the value (0) that should be treated as missing in it
replace_dict = dict.fromkeys(
    [
        'plasma_glucose_concentration',
        'diastolic_blood_pressure',
        'triceps_thickness',
        '2-hour_serum_insulin',
        'BMI',
        'diabetes_pedigreen',
        'age',
    ],
    0,
)

# Swap the 0 entries of the listed columns for NaN, modifying df in place
df.replace(to_replace=replace_dict, value=np.nan, inplace=True)

# Check that the min of the selected columns is no longer 0.0
df.describe()

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.535641,12.382158,10.476982,118.775855,6.924988,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,64.0,22.0,76.25,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,141.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [111]:
# Get the count (of all non-NaNs), min, max, mean, median and mode of each column.

# Row labels for the summary table; column labels are read from the data itself
# so they cannot drift out of sync with the DataFrame.
index = ['Mean', 'Median', 'Min', 'Max', 'Non NaN Values']
labels = list(df.columns)

# Per-column statistics using the DataFrame built-ins (NaNs are skipped)
means = df.mean(axis=0, skipna=True)
medians = df.median(axis=0, skipna=True)
mins = df.min(axis=0, skipna=True)
maxs = df.max(axis=0, skipna=True)
not_na_counts = df.notnull().sum()

# One row per statistic, one column per feature
statdf = pd.DataFrame([means, medians, mins, maxs, not_na_counts], index=index)

# Compute the modes from the data instead of hard-coding them.
# DataFrame.mode ignores NaNs (dropna=True) and lists every tied mode in
# ascending order, so row 0 holds the smallest mode of each column.
mode_arr = df.mode(axis=0, dropna=True).iloc[0].to_numpy()
modes = pd.DataFrame([mode_arr], columns=labels, index=['Mode'])

# Display the statistics with the mode row added at the bottom.
# pd.concat is used because DataFrame.append was removed in pandas 2.0;
# like the old append call, this returns a new frame and leaves statdf as is.
pd.concat([statdf, modes])

# Which column has the most missing values?
    # just look at the non-null count and find the min
    # in this case the 2-hour serum insulin column has the most missing values


Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
Mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
Median,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
Min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
Max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0
Non NaN Values,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
Mode,1.0,99.0,70.0,32.0,105.0,32.0,0.254,22.0,0.0


## Calculate z-scores

In [3]:
def centre(col):
    """Return the z-scores of a column as a NumPy array.

    Parameters
    ----------
    col : object exposing ``to_numpy()`` (e.g. a pandas Series)
        The values to standardise.

    Returns
    -------
    The result of ``scipy.stats.zscore`` on the column's values, computed
    with ``nan_policy='omit'`` so NaN entries are left out of the calculation.
    """
    return stats.zscore(col.to_numpy(), nan_policy='omit')


def check_z_transform(values):
    """Print the NaN-ignoring mean and standard deviation of ``values``.

    The mean is rounded to two decimal places before printing; the standard
    deviation is printed unrounded. Nothing is returned.
    """
    data = np.array(values)
    rounded_mean = round(np.nanmean(data), 2)
    spread = np.nanstd(data)
    print('Mean: ', rounded_mean, '\nStd: ', spread)

# Sanity check on a single column: print the mean and std of its z-scores
centered_plasma = centre(df.loc[:, 'plasma_glucose_concentration'])
check_z_transform(values=centered_plasma)


Mean:  -0.0 
Std:  1.0


## Apply the `centre` function to the DataFrame

In [7]:
# Work on a copy of df and run centre over every column of it
cdf = df.copy().apply(centre)
# Rounding to 3 dp first makes it easier to eyeball that the means and stds
# are about 0 and 1 respectively
cdf.round(3).describe()

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,-2e-05,3e-06,-4.6e-05,8.1e-05,-5.5e-05,-1.3e-05,3e-05,4.6e-05,0.000115
std,1.000765,1.000674,1.000727,1.00057,1.000693,1.00062,1.000651,1.000611,1.000644
min,-1.142,-3.784,-3.573,-1.288,-0.693,-4.06,-1.19,-1.042,-0.732
25%,-0.845,-0.685,-0.367,-1.288,-0.693,-0.596,-0.68875,-0.786,-0.732
50%,-0.251,-0.122,0.15,0.155,-0.428,0.001,-0.3005,-0.361,-0.732
75%,0.64,0.60575,0.563,0.719,0.41225,0.585,0.46575,0.66,1.366
max,3.907,2.444,2.735,4.922,6.653,4.456,5.884,4.064,1.366


## Get min and max of centered dataset

In [8]:
# Look for outliers: collect the min, max and mean of every column of the
# centred frame (column-wise, NaNs skipped -- the pandas defaults).
mins, maxs, means = cdf.min(), cdf.max(), cdf.mean()

# One row per statistic, one column per feature
statcdf = pd.DataFrame({'Min': mins, 'Max': maxs, 'Mean': means}).T
statcdf



Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
Min,-1.141852,-3.783654,-3.572597,-1.288212,-0.6928906,-4.060474,-1.189553,-1.041549,-0.7321202
Max,3.906578,2.444478,2.734528,4.921866,6.652839,4.455807,5.883565,4.063716,1.365896
Mean,-6.476301e-17,-9.251859000000001e-18,1.5034270000000003e-17,1.00614e-16,-3.0068540000000005e-17,2.59052e-16,2.451743e-16,1.931325e-16,7.401487e-17


### trick question: What is the mean of cdf['times_pregnant']? Why isn't it zero?

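The mean of times_pregnant is about -6.48e-17, which is zero to within floating-point precision.   
It is not exactly zero because the z-scores are computed in finite-precision floating-point arithmetic, so tiny rounding errors are left over when the centred values are averaged. This is presumably what the question is getting at.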
