In [335]:
# First, import packages and load some data
import pandas      as pd
import numpy       as np
import scipy       as sp
import scipy.stats as stats

# We can actually load remote data from a URL!
remote_data = 'https://www.doc.gold.ac.uk/~agero001/data_science/lab_data/pima-indians-diabetes.csv'

df = pd.DataFrame.from_csv(remote_data, header=0, index_col=None)
df.head(10)

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [336]:
# Missing data is coded as 0 in all but the first and last columns (where 0 is meaningful)
# Change 0s in columns 1-7 to np.nan

# Hint: you can conditionally assign slices of a dataframe to a new value:
# eg. df[df[c]==x] = v  will change values in column c to v where they were originall x.
# And remember you can slice multiple columns with as df[[0,1]] for the first two columns
# You can also look at pandas' DataFrame.replace() function

In [337]:
df = df.replace({"plasma_glucose_concentration":0, "diastolic_blood_pressure":0, "triceps_thickness":0,"2-hour_serum_insulin":0, "BMI":0, "diabetes_pedigreen":0, "age":0},np.NaN)
df

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
0,6,148,72,35,,33.6,0.627,50,1
1,1,85,66,29,,26.6,0.351,31,0
2,8,183,64,,,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,,,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,,,,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,,,,0.232,54,1


Another way to do it is:

df = df.iloc[:,1:8].replace(0,np.nan)


df

In [338]:
# The basics: counts, min, max, mean, median and mode
# Get the count (of all non-NaNs), min, max, mean, median and mode of each column.

# Hint 1: use numpy's min(), max(), nanmean() and nanmedian() functions.
# Hint 2: scipy.stats has a mode function ... but it returns a complex object.

# Which column has the most missing values?

In [339]:
dfd = df.describe()
dfd.loc[["mean","max","min"],]


Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0


In [340]:
df.median()

times_pregnant                    3.0000
plasma_glucose_concentration    117.0000
diastolic_blood_pressure         72.0000
triceps_thickness                29.0000
2-hour_serum_insulin            125.0000
BMI                              32.3000
diabetes_pedigreen                0.3725
age                              29.0000
diabetes                          0.0000
dtype: float64

In [341]:
df.mode()

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
0,1.0,99,70.0,32.0,105.0,32.0,0.254,22.0,0.0
1,,100,,,,,0.258,,


Here we count the data points without NaN

In [342]:
df.count()

times_pregnant                  768
plasma_glucose_concentration    763
diastolic_blood_pressure        733
triceps_thickness               541
2-hour_serum_insulin            394
BMI                             757
diabetes_pedigreen              768
age                             768
diabetes                        768
dtype: int64

Here we count the NaN, (the sum of both is 768 per column)

In [343]:
df.isnull().sum()

times_pregnant                    0
plasma_glucose_concentration      5
diastolic_blood_pressure         35
triceps_thickness               227
2-hour_serum_insulin            374
BMI                              11
diabetes_pedigreen                0
age                               0
diabetes                          0
dtype: int64

In [344]:
print("Column 5 (2-hour_serum_insulin) has the most NaNs' with a total of 374")

Column 5 (2-hour_serum_insulin) has the most NaNs' with a total of 374


In [345]:
# Now let's try to better characterize the column distributions.
# First, we can "centre" each column (also called a z-transform) so that it has mean=0, variance=1
# To center a value, we simply subtract the mean of the distribution, and divide the value 
#   by its standard deviation of the distribution:
# for each value v in a distribution of values V:
#    v = (v-mean(V)) / std(V)

# Write a centre() function
# V = [0,1,2,3,5,10]
# Example: centre(V) => [-1.059, -0.757, -0.454, -0.151, 0.454, 1.967]


In [346]:
def centre(z):
    mean = np.nanmean(z)
    std = np.nanstd(z)
    term = [(float(y)-mean)/ std for y in z]
    return term
    





Okay, lets test it.

In [347]:
V = [0,1,2,3,5,10]
centre(V)

[-1.0593098718411684,
 -0.75664990845797742,
 -0.45398994507478646,
 -0.15132998169159548,
 0.45398994507478646,
 1.9672897619907412]

Looks like it works, lets see its variance and mean!

In [348]:
a = centre(V)
print(np.mean(a))
print(np.var(a))

-7.40148683083e-17
1.0


We are making the df data set to it normalized way. We will use it later.

In [326]:
normaldf = (df-np.mean(df))/(np.std(df))
normaldf.mean()

times_pregnant                 -7.748432e-17
plasma_glucose_concentration    1.316844e-16
diastolic_blood_pressure       -5.468947e-16
triceps_thickness              -2.021386e-17
2-hour_serum_insulin           -7.185454e-18
BMI                             3.458704e-15
diabetes_pedigreen              1.918604e-15
age                             2.192980e-16
diabetes                        1.069746e-16
dtype: float64

In [327]:
normaldf.var()

times_pregnant                  1.001304
plasma_glucose_concentration    1.001312
diastolic_blood_pressure        1.001366
triceps_thickness               1.001852
2-hour_serum_insulin            1.002545
BMI                             1.001323
diabetes_pedigreen              1.001304
age                             1.001304
diabetes                        1.001304
dtype: float64

It does work!

In [328]:
# You can make a "deep" copy of df, to hold the centred values:
cdf = df.copy()

In [329]:
# Now fill in cdf by applying your centre() function to each column in df

First I will give my columns a variable.

In [330]:
tp = df["times_pregnant"]
pgc = df["plasma_glucose_concentration"]
dbp = df["diastolic_blood_pressure"]
tt = df["triceps_thickness"]
hsi = df["2-hour_serum_insulin"]
BMI = df["BMI"]
dp = df["diabetes_pedigreen"]
age = df["age"]
dia = df["diabetes"]

Now I will display all the centered values of the columns and check if they have var 1 and mean 0.

In [331]:
centre(tp)

[0.63994726015936043,
 -0.84488505344302278,
 1.2338801856003137,
 -0.84488505344302278,
 -1.1418515161634994,
 0.34298079743888377,
 -0.25095212800206951,
 1.8278131110412668,
 -0.54791859072254612,
 1.2338801856003137,
 0.046014334718407143,
 1.8278131110412668,
 1.8278131110412668,
 -0.84488505344302278,
 0.34298079743888377,
 0.9369137228798371,
 -1.1418515161634994,
 0.9369137228798371,
 -0.84488505344302278,
 -0.84488505344302278,
 -0.25095212800206951,
 1.2338801856003137,
 0.9369137228798371,
 1.5308466483207903,
 2.1247795737617436,
 1.8278131110412668,
 0.9369137228798371,
 -0.84488505344302278,
 2.718712499202697,
 0.34298079743888377,
 0.34298079743888377,
 -0.25095212800206951,
 -0.25095212800206951,
 0.63994726015936043,
 1.8278131110412668,
 0.046014334718407143,
 2.1247795737617436,
 1.5308466483207903,
 -0.54791859072254612,
 0.046014334718407143,
 -0.25095212800206951,
 0.9369137228798371,
 0.9369137228798371,
 1.5308466483207903,
 0.9369137228798371,
 -1.141851516163

In [219]:
c1 = centre(tp)
print(np.mean(c1))
print(np.var(c1))

-6.47630097698e-17
1.0


In [220]:
centre(pgc)

[0.86228736450798971,
 -1.2022288144298274,
 2.0092407972512216,
 -1.0711484221163152,
 0.50181628564583125,
 -0.18635577400010783,
 -1.4316195009784738,
 -0.21912587207848588,
 2.4680221703485143,
 0.10857510870529462,
 -0.38297636247037614,
 1.5176893260755508,
 0.56735648180258735,
 2.2058613857214899,
 1.4521491299187947,
 -0.71067734325415666,
 -0.12081557784335173,
 -0.4812866567055103,
 -0.6123670490190225,
 -0.21912587207848588,
 0.14134520678367268,
 -0.74344744133253471,
 2.4352520722701363,
 -0.088045479764973678,
 0.69843687411609956,
 0.10857510870529462,
 0.82951726642961165,
 -0.80898763748929081,
 0.76397707027285566,
 -0.15358567592172978,
 -0.41574646054875419,
 1.1899883452917703,
 -1.1039185201946933,
 -0.97283812788118096,
 0.010264814470160474,
 -0.6123670490190225,
 0.5345863837242093,
 -0.64513714709740055,
 -1.0383783240379372,
 -0.35020626439199809,
 1.9109305030160872,
 0.37073589333231904,
 -0.51405675478388835,
 1.615999620310685,
 1.2227584433701484,
 1.91

In [221]:
c2 = centre(pgc)
print(np.nanmean(c2))
print(np.nanvar(c2))

1.07093597133e-16
1.0


In [222]:
centre(dbp)

[-0.03274557136424161,
 -0.51764463823270079,
 -0.67927766052218719,
 -0.51764463823270079,
 -2.6188739279960238,
 0.12888745092524478,
 -1.8107088165485921,
 nan,
 -0.194378593653728,
 1.9068506961095952,
 1.5835846515306224,
 0.12888745092524478,
 0.61378651779370397,
 -1.0025437051011601,
 -0.03274557136424161,
 nan,
 0.93705256237267676,
 0.12888745092524478,
 -3.4270390394434562,
 -0.194378593653728,
 1.2603186069516497,
 0.93705256237267676,
 1.4219516292411361,
 0.61378651779370397,
 1.7452176738201088,
 -0.194378593653728,
 0.29052047321473118,
 -0.51764463823270079,
 0.77541954008319036,
 1.5835846515306224,
 0.20970396206998798,
 0.29052047321473118,
 -1.1641767273906465,
 1.5835846515306224,
 0.45215349550421757,
 -1.0025437051011601,
 0.29052047321473118,
 0.29052047321473118,
 -0.3560116159432144,
 -0.03274557136424161,
 -0.67927766052218719,
 0.93705256237267676,
 1.5835846515306224,
 3.038281852136,
 -0.67927766052218719,
 -0.51764463823270079,
 -1.3258097496801329,
 -0.

In [223]:
c3 = centre(dbp)
print(np.nanmean(c3))
print(np.nanvar(c3))

-4.87104672196e-16
1.0


In [224]:
centre(tt)

[0.55855695814306927,
 -0.014657043163412858,
 nan,
 -0.58787104446989502,
 0.55855695814306927,
 nan,
 0.2719499574898282,
 nan,
 1.5139136269872062,
 nan,
 nan,
 nan,
 nan,
 -0.58787104446989502,
 -0.97001371200754982,
 nan,
 1.7049849607560337,
 nan,
 0.84516395879631034,
 0.080878623721000839,
 1.1317709594495515,
 nan,
 nan,
 0.55855695814306927,
 0.36748562437424193,
 -0.30126404381665395,
 nan,
 -1.3521563795452045,
 -0.97001371200754982,
 nan,
 -0.30126404381665395,
 0.654092625027483,
 -1.7342990470828592,
 nan,
 0.17641429060541453,
 0.36748562437424193,
 nan,
 0.74962829191189673,
 1.2273066263339651,
 1.7049849607560337,
 -0.39679971070106762,
 nan,
 -1.0655493788919634,
 -0.49233537758548129,
 nan,
 0.94069962568072407,
 nan,
 -0.20572837693224025,
 0.2719499574898282,
 nan,
 -1.7342990470828592,
 -1.3521563795452045,
 -0.77894237823872237,
 0.4630212912586556,
 1.2273066263339651,
 -1.829834713967273,
 0.94069962568072407,
 2.9469486302534116,
 nan,
 1.1317709594495515,
 

In [225]:
c4 = centre(tt)
print(np.nanmean(c4))
print(np.nanvar(c4))

-3.44764266427e-17
1.0


In [226]:
centre(hsi)

[nan,
 nan,
 nan,
 -0.51884685764887517,
 0.10496753334537656,
 nan,
 -0.56942640286462531,
 nan,
 3.2661891093297606,
 nan,
 nan,
 nan,
 nan,
 5.8204561427251429,
 0.16397700276375174,
 nan,
 0.62762283390812801,
 nan,
 -0.61157602387775045,
 -0.50198700924362516,
 0.66977245492125315,
 nan,
 nan,
 nan,
 -0.080490799112373945,
 -0.34181844939374967,
 nan,
 -0.13107034432812409,
 -0.38396807040687481,
 nan,
 nan,
 0.75407169694750342,
 -0.85604382575387616,
 nan,
 nan,
 0.30728571420837714,
 nan,
 nan,
 nan,
 0.4337345772477525,
 -0.72116503851187574,
 nan,
 nan,
 0.71192207593437828,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -0.62000594808037546,
 -1.0077824614011266,
 -1.1173714760352518,
 1.2177175280918797,
 1.5717743446021306,
 nan,
 1.2514372249023797,
 -0.38396807040687481,
 nan,
 -0.11421049592287405,
 nan,
 nan,
 nan,
 -0.23222943475962438,
 nan,
 nan,
 nan,
 nan,
 -0.99092261299587647,
 -0.46826731243312503,
 -0.5525665544593753,
 -0.13107034432812409,
 nan,
 0.964819802013129,
 n

In [227]:
c5 = centre(hsi)
print(np.nanmean(c5))
print(np.nanvar(c5))

1.12712997424e-17
1.0


In [228]:
centre(BMI)

[0.16509655594797426,
 -0.84640379087629736,
 -1.3232539543791684,
 -0.62965371655681057,
 1.5378470266380573,
 -0.99090384042262192,
 -0.2106035728724697,
 0.4107466401767253,
 -0.28285359764563195,
 nan,
 0.74309675413327236,
 0.80089677395180192,
 -0.77415376610313513,
 -0.34065361746416156,
 -0.96200383051335714,
 -0.35510362241879423,
 1.9279971604131327,
 -0.41290364223732379,
 1.5667470365473215,
 0.30959660549429879,
 0.98874683836202337,
 0.42519664513135796,
 1.0609968631351856,
 -0.49960367196511873,
 0.5985967045869478,
 -0.19615356791783703,
 1.003196843316656,
 -1.3377039593338009,
 -1.4822040088801256,
 0.23734658072113651,
 0.51189667485915291,
 -0.12390354314467478,
 -1.1065038800596816,
 -1.8145541228366719,
 -0.7019037413299728,
 -1.2221039196967414,
 0.10729653612944466,
 0.063946521265546688,
 0.82979678386106726,
 0.67084672936011003,
 0.22289657576650385,
 1.1187968829537163,
 -1.4099539841069633,
 1.8701971405946032,
 -0.73080375123923813,
 1.3788969721371001,
 

In [229]:
c6 = centre(BMI)
print(np.nanmean(c6))
print(np.nanvar(c6))

3.66065610233e-16
1.0


In [230]:
centre(dp)

[0.4684919773786535,
 -0.36506077757159927,
 0.6043973178596731,
 -0.9207626142051013,
 5.4849091004669512,
 -0.8180785791749976,
 -0.67613300133926613,
 -1.0204265305578488,
 -0.94794368230130521,
 -0.72445490017696201,
 -0.84827976594855747,
 0.19668129641661461,
 2.9268685807464276,
 -0.22311519973586799,
 0.34768723028441417,
 0.036615006516747081,
 0.23896295789959848,
 -0.65801228927513022,
 -0.87244071536740542,
 0.17252034699776669,
 0.70104111553506476,
 -0.25331638650942789,
 -0.063048909836000464,
 -0.63083122117892632,
 -0.65801228927513022,
 -0.80599810446557363,
 -0.64895193324306222,
 0.045675362548815227,
 -0.68519335737133413,
 -0.40734243905458312,
 0.22386236451281855,
 1.1449985611063949,
 -0.61875074646950234,
 -0.85734012198062548,
 0.12117832948271484,
 1.4923122090023335,
 -0.15667258883403617,
 0.58325648711818112,
 0.093997261386510919,
 2.7728425282012727,
 -0.60667027176007837,
 0.6768801661162166,
 -0.71539454414489401,
 0.75238313305011628,
 -0.53720754218

In [231]:
c7 = centre(dp)
print(np.nanmean(c7))
print(np.nanvar(c7))

2.45174251271e-16
1.0


In [232]:
centre(age)

[1.4259954044228447,
 -0.19067190513648613,
 -0.1055841520017845,
 -1.0415494364835023,
 -0.020496398867082873,
 -0.27575965827118776,
 -0.61611067080999427,
 -0.36084741140588938,
 1.6812586638269496,
 1.7663464169616512,
 -0.27575965827118776,
 0.06459135426761875,
 2.0216096763657561,
 2.1917851826351593,
 1.5110831575575463,
 -0.1055841520017845,
 -0.19067190513648613,
 -0.19067190513648613,
 -0.020496398867082873,
 -0.1055841520017845,
 -0.53102291767529264,
 1.4259954044228447,
 0.66020562621053014,
 -0.36084741140588938,
 1.5110831575575463,
 0.66020562621053014,
 0.83038113247993339,
 -0.95646168334880077,
 2.0216096763657561,
 0.40494236680642526,
 2.2768729357698612,
 -0.44593516454059101,
 -0.95646168334880077,
 -0.44593516454059101,
 1.0005566387493365,
 -0.020496398867082873,
 0.14967910740232038,
 1.0856443918840382,
 -0.53102291767529264,
 1.9365219232310544,
 -0.61611067080999427,
 0.31985461367172363,
 1.2558198981534414,
 1.7663464169616512,
 0.57511787307582851,
 -0.

In [233]:
c8 = centre(age)
print(np.nanmean(c8))
print(np.nanvar(c8))

1.93132546992e-16
1.0


In [234]:
centre(dia)

[1.3658959117703826,
 -0.73212020870892502,
 1.3658959117703826,
 -0.73212020870892502,
 1.3658959117703826,
 -0.73212020870892502,
 1.3658959117703826,
 -0.73212020870892502,
 1.3658959117703826,
 1.3658959117703826,
 -0.73212020870892502,
 1.3658959117703826,
 -0.73212020870892502,
 1.3658959117703826,
 1.3658959117703826,
 1.3658959117703826,
 1.3658959117703826,
 1.3658959117703826,
 -0.73212020870892502,
 1.3658959117703826,
 -0.73212020870892502,
 -0.73212020870892502,
 1.3658959117703826,
 1.3658959117703826,
 1.3658959117703826,
 1.3658959117703826,
 1.3658959117703826,
 -0.73212020870892502,
 -0.73212020870892502,
 -0.73212020870892502,
 -0.73212020870892502,
 1.3658959117703826,
 -0.73212020870892502,
 -0.73212020870892502,
 -0.73212020870892502,
 -0.73212020870892502,
 -0.73212020870892502,
 1.3658959117703826,
 1.3658959117703826,
 1.3658959117703826,
 -0.73212020870892502,
 -0.73212020870892502,
 -0.73212020870892502,
 1.3658959117703826,
 -0.73212020870892502,
 1.36589591

In [235]:
c9 = centre(dia)
print(np.mean(c9))
print(np.var(c9))

7.40148683083e-17
1.0


In [236]:
# Finally! Let's look for outliers by getting the min and max in the new centred df:
# (You can re-use code from above!)

For this point, I will use my normaldf variable I defined before.

In [334]:
a = normaldf.describe()
outliers = a.loc[["min","max"],]
outliers

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,2-hour_serum_insulin,BMI,diabetes_pedigreen,age,diabetes
min,-1.141852,-2.545803,-3.911938,-2.116442,-1.193241,-2.060204,-1.189553,-1.041549,-0.73212
max,3.906578,2.533562,4.00808,6.67284,5.820456,5.005848,5.883565,4.063716,1.365896


In [238]:
# Bonus (trick) question:
# What is the mean of cdf['times_pregnant']
# Why isn't it zero like expected?

In [332]:
cdf['times_pregnant'].mean()

3.8450520833333335

It is not 0 because the values it takes are from the actual (first) database, not the normalized one.