In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

%matplotlib inline
# Use fully-qualified option names. The short forms 'max_rows' / 'max_columns'
# are treated as patterns and, in recent pandas, also match the
# styler.render.* options, which makes set_option raise OptionError
# ("Pattern matched multiple keys").
pd.set_option('display.max_rows', 20000)
pd.set_option('display.max_columns', 11)

from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
# Read test.csv (path is relative to the current working directory) into the `test` DataFrame.
test = pd.read_csv("test.csv")

In [3]:
# Preview the first rows only; display.max_rows is set to 20000 in the first cell,
# so a bare `test` would render the entire frame.
test.head()

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age
0,1882185,9,104,51,7,24,27.369832,1.350472,43
1,1662484,6,73,61,35,24,18.743674,0.158365,23
2,1228510,4,115,50,29,243,34.692154,0.079019,23


In [4]:
# Add AgeLog, the natural logarithm of Age, as a new column on `test`.
# NOTE(review): np.log gives -inf for 0 and NaN for negative input —
# presumably Age is always > 0 in this data; confirm.
test['AgeLog'] = np.log(test['Age'])

In [5]:
# Check that the AgeLog column is present (first rows only, not the full frame).
test.head()

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,AgeLog
0,1882185,9,104,51,7,24,27.369832,1.350472,43,3.7612
1,1662484,6,73,61,35,24,18.743674,0.158365,23,3.135494
2,1228510,4,115,50,29,243,34.692154,0.079019,23,3.135494


In [6]:
# The raw Age column is replaced by AgeLog, so remove it.
test = test.drop(columns='Age')

In [7]:
# Check that Age is gone (first rows only, not the full frame).
test.head()

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,AgeLog
0,1882185,9,104,51,7,24,27.369832,1.350472,3.7612
1,1662484,6,73,61,35,24,18.743674,0.158365,3.135494
2,1228510,4,115,50,29,243,34.692154,0.079019,3.135494


In [8]:
# Columns that will be passed to StandardScaler.
X_scalar = test.loc[:, ['PlasmaGlucose', 'DiastolicBloodPressure', 'TricepsThickness',
                        'SerumInsulin', 'BMI']]

In [9]:
# Shape check on the columns selected for standard scaling: (rows, columns).
X_scalar.shape

(3, 5)

In [10]:
# The same selection viewed as a plain NumPy array.
X_scalar.values

array([[104.        ,  51.        ,   7.        ,  24.        ,
         27.36983156],
       [ 73.        ,  61.        ,  35.        ,  24.        ,
         18.74367404],
       [115.        ,  50.        ,  29.        , 243.        ,
         34.69215364]])

In [11]:
# Columns that will be passed to MinMaxScaler.
X_minmax = test.loc[:, ['Pregnancies', 'DiabetesPedigree', 'AgeLog']]

In [12]:
# Shape check on the columns selected for min-max scaling: (rows, columns).
X_minmax.shape

(3, 3)

In [13]:
# The same selection viewed as a plain NumPy array.
X_minmax.values

array([[9.        , 1.35047205, 3.76120012],
       [6.        , 0.15836498, 3.13549422],
       [4.        , 0.07901857, 3.13549422]])

### Use StandardScaler method

In [14]:
# Create a StandardScaler with default settings (standardizes each column to zero mean, unit variance).
scaler = StandardScaler()

In [15]:
# Fit the scaler on X_scalar and transform it in one step; the result is a NumPy array.
# NOTE(review): the scaling statistics are learned from the data loaded from test.csv itself.
# If a model was trained on features scaled by a scaler fitted on training data, that fitted
# scaler should be reused here with .transform() instead — confirm against the training workflow.
X_scale_new = scaler.fit_transform(X_scalar)

In [16]:
# Inspect the standardized values (at this point still an array without column labels).
X_scale_new

array([[ 0.3748975 , -0.60404045, -1.38462194, -0.70710678,  0.06667678],
       [-1.36837589,  1.40942772,  0.94154292, -0.70710678, -1.25672126],
       [ 0.99347838, -0.80538727,  0.44307902,  1.41421356,  1.19004448]])

In [17]:
# Re-wrap the scaled array as a DataFrame. Reusing X_scalar's index (not only its
# column names) keeps the rows aligned with `test` when the frames are later
# concatenated column-wise, even if `test` does not carry a default 0..n-1 index.
X_scale_new = pd.DataFrame(X_scale_new, columns=X_scalar.columns, index=X_scalar.index)

In [18]:
# Preview the standardized features, now labelled with the original column names.
X_scale_new.head()

Unnamed: 0,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI
0,0.374898,-0.60404,-1.384622,-0.707107,0.066677
1,-1.368376,1.409428,0.941543,-0.707107,-1.256721
2,0.993478,-0.805387,0.443079,1.414214,1.190044


### Use MinMaxScaler method

In [19]:
# Create a MinMaxScaler with default settings (rescales each column to the [0, 1] range).
minmax = MinMaxScaler()

In [20]:
# Fit the scaler on X_minmax and transform it in one step; the result is a NumPy array.
# NOTE(review): the per-column min/max are learned from the data loaded from test.csv itself.
# If a model was trained on features scaled by a scaler fitted on training data, that fitted
# scaler should be reused here with .transform() instead — confirm against the training workflow.
X_mm_new = minmax.fit_transform(X_minmax)

In [21]:
# Inspect the min-max scaled values (at this point still an array without column labels).
X_mm_new

array([[1.        , 1.        , 1.        ],
       [0.4       , 0.06240607, 0.        ],
       [0.        , 0.        , 0.        ]])

In [22]:
# Re-wrap the scaled array as a DataFrame. Reusing X_minmax's index (not only its
# column names) keeps the rows aligned with `test` when the frames are later
# concatenated column-wise, even if `test` does not carry a default 0..n-1 index.
X_mm_new = pd.DataFrame(X_mm_new, columns=X_minmax.columns, index=X_minmax.index)

In [23]:
# Preview the min-max scaled features, now labelled with the original column names.
X_mm_new.head()

Unnamed: 0,Pregnancies,DiabetesPedigree,AgeLog
0,1.0,1.0,1.0
1,0.4,0.062406,0.0
2,0.0,0.0,0.0


### Merge the transformed features back into the existing dataset

In [24]:
# Look at the frame before its unscaled feature columns are removed
# (first rows only, not the full frame).
test.head()

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,AgeLog
0,1882185,9,104,51,7,24,27.369832,1.350472,3.7612
1,1662484,6,73,61,35,24,18.743674,0.158365,3.135494
2,1228510,4,115,50,29,243,34.692154,0.079019,3.135494


In [25]:
# Remove the unscaled feature columns; their scaled versions live in
# X_scale_new and X_mm_new.
test = test.drop(
    columns=[
        'Pregnancies',
        'PlasmaGlucose',
        'DiastolicBloodPressure',
        'TricepsThickness',
        'SerumInsulin',
        'BMI',
        'DiabetesPedigree',
        'AgeLog',
    ]
)

In [26]:
# Check which columns are left after the drop (first rows only, not the full frame).
test.head()

Unnamed: 0,PatientID
0,1882185
1,1662484
2,1228510


In [27]:
# Column-wise concatenation (rows are matched on index): what is left of `test`,
# then the standardized block, then the min-max scaled block.
testnew = pd.concat((test, X_scale_new, X_mm_new), axis='columns')

In [28]:
# Preview the merged frame (first rows only, not the full frame).
testnew.head()

Unnamed: 0,PatientID,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,Pregnancies,DiabetesPedigree,AgeLog
0,1882185,0.374898,-0.60404,-1.384622,-0.707107,0.066677,1.0,1.0,1.0
1,1662484,-1.368376,1.409428,0.941543,-0.707107,-1.256721,0.4,0.062406,0.0
2,1228510,0.993478,-0.805387,0.443079,1.414214,1.190044,0.0,0.0,0.0


In [29]:
# Summary statistics as a sanity check: standardized columns should have mean ~0,
# and min-max scaled columns should have min 0 and max 1.
testnew.describe()

Unnamed: 0,PatientID,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,Pregnancies,DiabetesPedigree,AgeLog
count,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
mean,1591060.0,2.960595e-16,-3.700743e-17,-7.401487e-17,0.0,2.960595e-16,0.466667,0.354135,0.333333
std,332639.2,1.224745,1.224745,1.224745,1.224745,1.224745,0.503322,0.560205,0.57735
min,1228510.0,-1.368376,-0.8053873,-1.384622,-0.707107,-1.256721,0.0,0.0,0.0
25%,1445497.0,-0.4967392,-0.7047139,-0.4707715,-0.707107,-0.5950222,0.2,0.031203,0.0
50%,1662484.0,0.3748975,-0.6040404,0.443079,-0.707107,0.06667678,0.4,0.062406,0.0
75%,1772334.0,0.6841879,0.4026936,0.692311,0.353553,0.6283606,0.7,0.531203,0.5
max,1882185.0,0.9934784,1.409428,0.9415429,1.414214,1.190044,1.0,1.0,1.0


In [30]:
# Drop the PatientID column so that only the scaled feature columns remain.
testnew = testnew.drop(columns='PatientID')

In [31]:
# Check that PatientID is no longer present (first rows only, not the full frame).
testnew.head()

Unnamed: 0,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,Pregnancies,DiabetesPedigree,AgeLog
0,0.374898,-0.60404,-1.384622,-0.707107,0.066677,1.0,1.0,1.0
1,-1.368376,1.409428,0.941543,-0.707107,-1.256721,0.4,0.062406,0.0
2,0.993478,-0.805387,0.443079,1.414214,1.190044,0.0,0.0,0.0


In [32]:
# Reorder the columns to follow the layout of the loaded file
# (with AgeLog in the position Age used to occupy).
testnew = testnew.loc[:, ['Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure',
                          'TricepsThickness', 'SerumInsulin', 'BMI',
                          'DiabetesPedigree', 'AgeLog']]

In [33]:
# Check the final column order (first rows only, not the full frame).
testnew.head()

Unnamed: 0,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,AgeLog
0,1.0,0.374898,-0.60404,-1.384622,-0.707107,0.066677,1.0,1.0
1,0.4,-1.368376,1.409428,0.941543,-0.707107,-1.256721,0.062406,0.0
2,0.0,0.993478,-0.805387,0.443079,1.414214,1.190044,0.0,0.0


In [34]:
# Save to a new CSV file (currently disabled; uncomment the next line to write it)
#testnew.to_csv("testcleaned.csv",index=False)