# Diabetic Dataset

## Importing Dataset

In [56]:
import numpy as np
import pandas as pd

In [57]:
df=pd.read_csv('Diab.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pat_Id    500 non-null    int64  
 1   Gender    500 non-null    object 
 2   OGTT      500 non-null    int64  
 3   DBP       500 non-null    int64  
 4   BMI       500 non-null    float64
 5   Age       500 non-null    int64  
 6   Diabetic  500 non-null    object 
dtypes: float64(1), int64(4), object(2)
memory usage: 27.5+ KB


In [58]:
df.head()

Unnamed: 0,Pat_Id,Gender,OGTT,DBP,BMI,Age,Diabetic
0,101,Male,176,90,33.7,58,Yes
1,102,Male,150,66,34.7,42,No
2,103,Male,73,50,23.0,21,No
3,104,Female,187,68,37.7,41,Yes
4,105,Female,100,88,46.8,31,No


In [59]:
df.shape

(500, 7)

## Transformation of Numerical data 

In [60]:
X_cont = df.iloc[:,2:6] # all row and cloumns 2 to 5 (n-1=5 so n=6)
X_cont

Unnamed: 0,OGTT,DBP,BMI,Age
0,176,90,33.7,58
1,150,66,34.7,42
2,73,50,23.0,21
3,187,68,37.7,41
4,100,88,46.8,31
...,...,...,...,...
495,130,96,22.6,21
496,111,58,29.5,22
497,98,60,34.7,22
498,143,86,30.1,23


In [61]:
# Standardization 
from sklearn.preprocessing import StandardScaler
sc= StandardScaler()
X_cont_sc = sc.fit_transform(X_cont)
X_cont_sc

array([[ 1.80631165,  1.51290352,  0.19122178,  2.13374943],
       [ 0.9576037 , -0.53756432,  0.33411654,  0.76967755],
       [-1.55587752, -1.90454288, -1.33775222, -1.02066678],
       ...,
       [-0.73981219, -1.05018128,  0.33411654, -0.93541229],
       [ 0.72910541,  1.17115888, -0.32319938, -0.8501578 ],
       [ 0.01096792,  0.4876696 , -0.65185734,  0.42865959]])

In [62]:
# since we got in array so we convert it into Data Frame
X_cont_sc=pd.DataFrame(X_cont_sc)
X_cont_sc

Unnamed: 0,0,1,2,3
0,1.806312,1.512904,0.191222,2.133749
1,0.957604,-0.537564,0.334117,0.769678
2,-1.555878,-1.904543,-1.337752,-1.020667
3,2.165380,-0.366692,0.762801,0.684423
4,-0.674527,1.342031,2.063143,-0.168122
...,...,...,...,...
495,0.304751,2.025520,-1.394910,-1.020667
496,-0.315458,-1.221054,-0.408936,-0.935412
497,-0.739812,-1.050181,0.334117,-0.935412
498,0.729105,1.171159,-0.323199,-0.850158


In [63]:
# Here we got data into data frame but dont have column names so add column names we use following codes
list(X_cont)

['OGTT', 'DBP', 'BMI', 'Age']

In [64]:
X_cont_sc.columns = list(X_cont)
X_cont_sc

Unnamed: 0,OGTT,DBP,BMI,Age
0,1.806312,1.512904,0.191222,2.133749
1,0.957604,-0.537564,0.334117,0.769678
2,-1.555878,-1.904543,-1.337752,-1.020667
3,2.165380,-0.366692,0.762801,0.684423
4,-0.674527,1.342031,2.063143,-0.168122
...,...,...,...,...
495,0.304751,2.025520,-1.394910,-1.020667
496,-0.315458,-1.221054,-0.408936,-0.935412
497,-0.739812,-1.050181,0.334117,-0.935412
498,0.729105,1.171159,-0.323199,-0.850158


In [65]:
# ALL in one STANDARDIZATION
# Standardization 
from sklearn.preprocessing import StandardScaler
sc= StandardScaler()
X_cont_sc = sc.fit_transform(X_cont)
X_cont_sc=pd.DataFrame(X_cont_sc)
X_cont_sc.columns = list(X_cont)
X_cont_sc

Unnamed: 0,OGTT,DBP,BMI,Age
0,1.806312,1.512904,0.191222,2.133749
1,0.957604,-0.537564,0.334117,0.769678
2,-1.555878,-1.904543,-1.337752,-1.020667
3,2.165380,-0.366692,0.762801,0.684423
4,-0.674527,1.342031,2.063143,-0.168122
...,...,...,...,...
495,0.304751,2.025520,-1.394910,-1.020667
496,-0.315458,-1.221054,-0.408936,-0.935412
497,-0.739812,-1.050181,0.334117,-0.935412
498,0.729105,1.171159,-0.323199,-0.850158


In [66]:
# ALL in one Normalization
from sklearn.preprocessing import MinMaxScaler
mm= MinMaxScaler()
X_cont_mm = mm.fit_transform(X_cont)
X_cont_mm=pd.DataFrame(X_cont_mm)
X_cont_mm.columns = list(X_cont)
X_cont_mm

Unnamed: 0,OGTT,DBP,BMI,Age
0,0.857143,0.652174,0.316973,0.616667
1,0.688312,0.391304,0.337423,0.350000
2,0.188312,0.217391,0.098160,0.000000
3,0.928571,0.413043,0.398773,0.333333
4,0.363636,0.630435,0.584867,0.166667
...,...,...,...,...
495,0.558442,0.717391,0.089980,0.000000
496,0.435065,0.304348,0.231084,0.016667
497,0.350649,0.326087,0.337423,0.016667
498,0.642857,0.608696,0.243354,0.033333


## Transformation of Categorical Data By Label Encoder

In [67]:
X_cat= df[["Gender","Diabetic"]]
X_cat

Unnamed: 0,Gender,Diabetic
0,Male,Yes
1,Male,No
2,Male,No
3,Female,Yes
4,Female,No
...,...,...
495,Male,No
496,Female,No
497,Female,No
498,Female,No


In [68]:
#from sklearn.preprocessing import LabelEncoder
#le= LabelEncoder()
#X_cat_le= le.fit_transform(X_cat)
#X_cat_le
# The above code will not work as we have taken gender and Diabetic columns# together

In [69]:
from sklearn.preprocessing import LabelEncoder
le= LabelEncoder()
X_cat["Gender"]= le.fit_transform(X_cat["Gender"])
X_cat["Diabetic"]= le.fit_transform(X_cat["Diabetic"])
X_cat

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_cat["Gender"]= le.fit_transform(X_cat["Gender"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_cat["Diabetic"]= le.fit_transform(X_cat["Diabetic"])


Unnamed: 0,Gender,Diabetic
0,1,1
1,1,0
2,1,0
3,0,1
4,0,0
...,...,...
495,1,0
496,0,0
497,0,0
498,0,0


## Transformation of Categorical Data By One Hot Encoder

In [70]:
X_cat2= df[["Gender","Diabetic"]]
X_cat2

Unnamed: 0,Gender,Diabetic
0,Male,Yes
1,Male,No
2,Male,No
3,Female,Yes
4,Female,No
...,...,...
495,Male,No
496,Female,No
497,Female,No
498,Female,No


In [71]:
from sklearn.preprocessing import OneHotEncoder
ohe= OneHotEncoder()
X_cat_ohe= ohe.fit_transform(X_cat2[["Gender"]]).toarray()
X_cat_ohe

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.

In [72]:
X_cat_ohe=pd.DataFrame(X_cat_ohe)
X_cat_ohe.columns= ["G_Female","G_Male"]
X_cat_ohe

Unnamed: 0,G_Female,G_Male
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,1.0,0.0
4,1.0,0.0
...,...,...
495,0.0,1.0
496,1.0,0.0
497,1.0,0.0
498,1.0,0.0


In [73]:
from sklearn.preprocessing import OneHotEncoder
ohe= OneHotEncoder()
X_cat_ohe2= ohe.fit_transform(X_cat2[["Diabetic"]]).toarray()
X_cat_ohe2=pd.DataFrame(X_cat_ohe2)
X_cat_ohe2.columns= ["D_NO","D_Yes"]
X_cat_ohe2

Unnamed: 0,D_NO,D_Yes
0,0.0,1.0
1,1.0,0.0
2,1.0,0.0
3,0.0,1.0
4,1.0,0.0
...,...,...
495,1.0,0.0
496,1.0,0.0
497,1.0,0.0
498,1.0,0.0


## Making New Dataset by Merging transformed datasets

In [74]:
df_new= pd.concat([X_cont_sc, X_cat],axis=1)
df_new

Unnamed: 0,OGTT,DBP,BMI,Age,Gender,Diabetic
0,1.806312,1.512904,0.191222,2.133749,1,1
1,0.957604,-0.537564,0.334117,0.769678,1,0
2,-1.555878,-1.904543,-1.337752,-1.020667,1,0
3,2.165380,-0.366692,0.762801,0.684423,0,1
4,-0.674527,1.342031,2.063143,-0.168122,0,0
...,...,...,...,...,...,...
495,0.304751,2.025520,-1.394910,-1.020667,1,0
496,-0.315458,-1.221054,-0.408936,-0.935412,0,0
497,-0.739812,-1.050181,0.334117,-0.935412,0,0
498,0.729105,1.171159,-0.323199,-0.850158,0,0


In [75]:
df_new2= pd.concat([X_cont_mm, X_cat_ohe,X_cat_ohe2], axis=1 )
df_new2

Unnamed: 0,OGTT,DBP,BMI,Age,G_Female,G_Male,D_NO,D_Yes
0,0.857143,0.652174,0.316973,0.616667,0.0,1.0,0.0,1.0
1,0.688312,0.391304,0.337423,0.350000,0.0,1.0,1.0,0.0
2,0.188312,0.217391,0.098160,0.000000,0.0,1.0,1.0,0.0
3,0.928571,0.413043,0.398773,0.333333,1.0,0.0,0.0,1.0
4,0.363636,0.630435,0.584867,0.166667,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
495,0.558442,0.717391,0.089980,0.000000,0.0,1.0,1.0,0.0
496,0.435065,0.304348,0.231084,0.016667,1.0,0.0,1.0,0.0
497,0.350649,0.326087,0.337423,0.016667,1.0,0.0,1.0,0.0
498,0.642857,0.608696,0.243354,0.033333,1.0,0.0,1.0,0.0
