### Throughout this exercise, you may need to refer to online documentation.

Pandas: http://pandas.pydata.org/pandas-docs/stable/

Numpy & Scipy: https://docs.scipy.org/doc/

In [2]:
# First, import packages and load some data
import pandas      as pd
import numpy       as np
import scipy       as sp
import scipy.stats as stats
from statistics import mean

datafile = './data/pima-indians-diabetes.csv'

df = pd.read_csv(datafile, header=0, index_col=None)
df.head()

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [27]:
# Missing data is coded as 0 in all but the first and last columns (where 0 is meaningful)
# Change 0s in columns 1-7 to np.nan

# Hint: you can conditionally assign slices of a dataframe to a new value:
# eg. df[df[c]==x] = v  will change values in column c to v where they were originall x.
# And remember you can slice multiple columns with as df[[0,1]] for the first two columns
# You can also look at pandas' DataFrame.replace() function

replace_dict = {'plasma_glucose_concentration': 0, 'diastolic_blood_pressure': 0, 'triceps_thickness': 0, '2-hour_serum_insulin': 0, 'BMI': 0, 'diabetes_pedigreen': 0, 'age': 0}

df.replace(replace_dict, np.nan, inplace=True)
#check min values for selected columns no longe 0.0
df.describe()



Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.535641,12.382158,10.476982,118.775855,6.924988,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,64.0,22.0,76.25,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,141.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [4]:
# The basics: counts, min, max, mean, median and mode
# Get the count (of all non-NaNs), min, max, mean, median and mode of each column.

#set index and labels
index = ['Mean', 'Median', 'Min', 'Max', 'Non NaN Values']
labels = ['times_pregnant', 'plasma_glucose_concentration', 'diastolic_blood_pressure', 'triceps_thickness', '2-hour_serum_insulin', 'BMI', 'diabetes_pedigreen', 'age', 'diabetes']

#get all stats values that we can with df built in funcs
means = df.mean(axis=0, skipna=True) #=> describe already skips na
medians = df.median(axis=0, skipna=True)
mins = df.min(axis=0, skipna=True)
maxs = df.max(axis=0, skipna=True)
not_na_counts = df.notnull().sum()

#create stats df
statdf = pd.DataFrame([means, medians, mins, maxs, not_na_counts], index=index)

#get mode using stats.mode function
mode_result = stats.mode(df, nan_policy='omit')
mode_arr = mode_result[0][0]

# create mode df to add to other stats df
modes = pd.DataFrame([[1, 99, 70, 32, 105, 32, 0.254, 22, 0]], columns=labels, index=['Mode'])

#join mode df to stats df
statdf.append(modes)

# Which column has the most missing values?
    # just look at non-null count and find min


Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
Mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
Median,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
Min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
Max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0
Non NaN Values,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
Mode,1.0,99.0,70.0,32.0,105.0,32.0,0.254,22.0,0.0


In [77]:
# Now let's try to better characterize the column distributions.
# First, we can "centre" each column (also called a z-transform) so that it has mean=0, variance=1
# To center a value, we simply subtract the mean of the distribution, and divide that value 
#   by the standard deviation of the distribution:
# for each value v in a distribution of values V:
#    v = (v-mean(V)) / std(V)

# Write a centre() function
# Hint: centre([0,1,2,3,5,10]) => [-1.059, -0.757, -0.454, -0.151, 0.454, 1.967]

# Remember to use np.nanmean() and np.nanstd() to deal with NaNs

def centre(col):
    mean_col = np.nanmean(col)
    std_col = np.nanstd(col)
    array_vals = col.to_numpy()
    centered_vals = stats.zscore(array_vals, nan_policy='omit')
    return centered_vals

centered_plasma = centre(df['plasma_glucose_concentration'])

def check_z_transform(values):
    values = np.array(values)
    mean = round(np.nanmean(values), 2)
    std = np.nanstd(values)
    print('Mean: ', mean, '\nStd: ', std)

check_z_transform(centered_plasma)




Mean:  0.0 
Std:  1.0


In [83]:

df2 = df.copy()

# print(np.nanmean(values[][1]))
def centre2(df2):
    values = df2.values
    calc = (values - np.nanmean(values, axis=0)/np.nanstd(values, axis=0))
    df3 = pd.DataFrame(calc, columns = labels)
    return df3

def centre3(df2):
    # print(df2.mean(), df2.std())
    return ((df2 - df2.mean())/df2.std())

df3 = centre3(df2).round(2)

df3.head()
df3.describe()
# new_vals = centre2(values)
# # print(new_vals[1][:5])
# # print(values[1][:5])
# df3 = pd.DataFrame(new_vals, columns=labels)
# # print(df3.iloc[1][:5])
# df3.describe()


Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,0.001003,0.000197,-1.4e-05,0.000222,-0.000228,-4e-05,-0.000104,-0.001263,0.002813
std,0.999467,0.999918,0.999702,1.000039,1.000088,0.999924,0.999958,1.000932,1.001598
min,-1.14,-2.54,-3.91,-2.11,-1.19,-2.06,-1.19,-1.04,-0.73
25%,-0.84,-0.74,-0.68,-0.68,-0.6675,-0.72,-0.69,-0.79,-0.73
50%,-0.25,-0.15,-0.03,-0.01,-0.26,-0.02,-0.3,-0.36,-0.73
75%,0.64,0.63,0.61,0.65,0.29,0.6,0.47,0.66,1.37
max,3.9,2.53,4.01,6.67,5.81,5.0,5.88,4.06,1.37


In [82]:
# Let's make a copy of df, to hold the centred values:
cdf = df.copy()

cdf = cdf.apply(centre)
cdf = cdf.round(2)

cdf.head()
cdf.describe()

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,0.001042,7.9e-05,0.000314,1.8e-05,-0.000178,7.9e-05,-6.5e-05,-0.000586,0.002813
std,0.999596,1.000616,1.000813,1.000781,1.001355,1.000823,1.000559,1.001484,1.001598
min,-1.14,-2.55,-3.91,-2.12,-1.19,-2.06,-1.19,-1.04,-0.73
25%,-0.84,-0.74,-0.68,-0.68,-0.6675,-0.72,-0.69,-0.79,-0.73
50%,-0.25,-0.15,-0.03,-0.01,-0.26,-0.02,-0.3,-0.36,-0.73
75%,0.64,0.63,0.61,0.65,0.29,0.6,0.47,0.66,1.37
max,3.91,2.53,4.01,6.67,5.82,5.01,5.88,4.06,1.37


In [24]:
# Finally! Let's look for outliers by getting the min and max in the new centred df:
# (You can re-use code from above!)

mins = cdf.min(axis=0, skipna=True)
maxs = cdf.max(axis=0, skipna=True)
means = cdf.mean(axis=0, skipna=True)
statcdf = pd.DataFrame([mins, maxs, means], index=['Min', 'Max', 'Mean'])
statcdf



Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
Min,-1.14,-2.55,-3.91,-2.12,-1.19,-2.06,-1.19,-1.04,-0.73
Max,3.91,2.53,4.01,6.67,5.82,5.01,5.88,4.06,1.37
Mean,0.001042,7.9e-05,0.000314,1.8e-05,-0.000178,7.9e-05,-6.5e-05,-0.000586,0.002813


In [25]:
# trick question:
# What is the mean of cdf['times_pregnant']?
# Why isn't it zero?

something = [1, 2, 5, 4, 9, 15, 4, 2, 4, 9]
zsomething = stats.zscore(something)
zsomething = np.array(zsomething)

# ???? 
# thought maybe somethig to do with all positive ints but no luck

In [26]:
# need to adjust which columns - did the pregnent column replace value where 0 is a valid vaule

In [86]:
def centre(col):
    array_vals = col.to_numpy()
    return stats.zscore(array_vals, nan_policy='omit')

cdf = df.copy()
cdf = cdf.apply(centre).round(3)
cdf.describe()

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,-2e-05,8e-06,1.8e-05,-2e-06,5e-06,-2.2e-05,3e-05,4.6e-05,0.000115
std,1.000765,1.000672,1.000694,1.000935,1.001286,1.00066,1.000651,1.000611,1.000644
min,-1.142,-2.546,-3.912,-2.116,-1.193,-2.06,-1.19,-1.042,-0.732
25%,-0.845,-0.743,-0.679,-0.683,-0.66875,-0.716,-0.68875,-0.786,-0.732
50%,-0.251,-0.154,-0.033,-0.015,-0.258,-0.023,-0.3005,-0.361,-0.732
75%,0.64,0.633,0.614,0.654,0.29,0.599,0.46575,0.66,1.366
max,3.907,2.534,4.008,6.673,5.82,5.006,5.884,4.064,1.366
