In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder


In [2]:
# Load Dataset
diabetes = pd.read_csv('/content/diabetes.csv')
diabetes.head()

Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
0,502,17975,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
1,735,34221,M,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,N
2,420,47975,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
3,680,87656,F,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,N
4,504,34223,M,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,N


In [3]:
# Check Missing Values
diabetes.isnull().sum()

Unnamed: 0,0
ID,0
No_Pation,0
Gender,0
AGE,0
Urea,0
Cr,0
HbA1c,0
Chol,0
TG,0
HDL,0


In [10]:
# Handling Categorical Data
cols_with_zero = ['ID', 'No_Pation', 'Gender', 'AGE', 'Urea', 'Cr', 'HbA1c', 'Chol', 'TG', 'HDL', 'LDL', 'VLDL', 'BMI', 'CLASS']

numeric_cols = diabetes.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = diabetes.select_dtypes(include='object').columns

for col in numeric_cols:
    diabetes[col] = diabetes[col].replace(0, np.nan)
    diabetes[col] = diabetes[col].fillna(diabetes[col].mean())
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for col in categorical_cols:
    diabetes[col] = le.fit_transform(diabetes[col])


In [11]:
# Handling Outliers (IQR Method)
Q1 = diabetes.quantile(0.25)
Q3 = diabetes.quantile(0.75)
IQR = Q3 - Q1

diabetes = diabetes[~((diabetes < (Q1 - 1.5 * IQR)) |
                      (diabetes > (Q3 + 1.5 * IQR))).any(axis=1)]


In [24]:
# Data Transformation
# Min-Max Scaling
X = diabetes.drop('CLASS', axis=1)
y = diabetes['CLASS']

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_minmax = scaler.fit_transform(X)
X_minmax_df = pd.DataFrame(X_minmax, columns=X.columns)
X_minmax_df.head()


Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI
0,0.892231,0.452186,0.5,0.193548,0.328947,0.690476,0.230769,0.694915,0.723404,0.2,0.673469,0.761905,0.15
1,0.106516,0.316206,0.0,0.354839,0.381579,0.404762,0.9,0.40678,0.361702,0.4,0.387755,0.380952,0.5
2,0.286967,0.188926,0.0,0.354839,0.381579,0.404762,0.9,0.40678,0.361702,0.4,0.387755,0.380952,0.5
3,0.630326,0.303545,0.0,0.290323,0.381579,0.190476,0.346154,0.40678,0.425532,0.6,0.326531,0.428571,0.2
4,0.006266,0.452318,0.0,0.225806,0.25,0.440476,0.238462,0.627119,0.744681,0.6,0.44898,0.761905,0.25


In [30]:
X_minmax_df.describe()

Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI
count,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0
mean,0.415794,0.393271,0.261691,0.549025,0.459982,0.426173,0.533821,0.483721,0.40372,0.475755,0.399919,0.439277,0.579029
std,0.291815,0.182915,0.251747,0.151754,0.198399,0.204541,0.179632,0.187641,0.215402,0.214884,0.203091,0.217786,0.200534
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.14505,0.3166,0.0,0.483871,0.315789,0.27381,0.4,0.355932,0.255319,0.333333,0.239796,0.285714,0.45
50%,0.386591,0.453378,0.5,0.516129,0.447368,0.404762,0.530769,0.474576,0.382979,0.466667,0.387755,0.380952,0.55
75%,0.641917,0.455672,0.5,0.677419,0.578947,0.571429,0.653846,0.610169,0.510638,0.6,0.55102,0.619048,0.7
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [25]:
# Standard Scaling
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
X_standard = std_scaler.fit_transform(X)
X_standard_df = pd.DataFrame(X_standard, columns=X.columns)
X_standard_df.head()


Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI
0,1.634136,0.322377,0.947475,-2.344564,-0.661056,1.293338,-1.68859,1.126537,1.485468,-1.284433,1.348145,1.482732,-2.141359
1,-1.060798,-0.421699,-1.040434,-1.280766,-0.395535,-0.104775,2.040328,-0.410418,-0.195241,-0.352859,-0.059949,-0.268049,-0.394447
2,-0.441866,-1.118165,-1.040434,-1.280766,-0.395535,-0.104775,2.040328,-0.410418,-0.195241,-0.352859,-0.059949,-0.268049,-0.394447
3,0.735825,-0.490976,-1.040434,-1.706285,-0.395535,-1.153359,-1.045673,-0.410418,0.101355,0.578715,-0.361683,-0.049201,-1.8918
4,-1.40465,0.323102,-1.040434,-2.131805,-1.059337,0.069989,-1.645729,0.7649,1.584333,0.578715,0.241786,1.482732,-1.642241


In [33]:
X_standard_df.describe()


Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI
count,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0
mean,6.709261000000001e-17,1.246006e-16,1.230031e-16,-3.003193e-16,-2.140574e-16,1.40575e-16,5.367409e-16,8.945682000000001e-17,2.044727e-16,-1.150159e-16,2.300318e-16,1.150159e-16,4.920125e-16
std,1.0009,1.0009,1.0009,1.0009,1.0009,1.0009,1.0009,1.0009,1.0009,1.0009,1.0009,1.0009,1.0009
min,-1.42614,-2.151958,-1.040434,-3.621121,-2.320561,-2.085434,-2.974423,-2.580238,-1.87595,-2.216007,-1.970934,-2.01883,-2.890036
25%,-0.9286305,-0.4195427,-1.040434,-0.4297283,-0.7274363,-0.7455761,-0.7456449,-0.6816458,-0.6895674,-0.6633834,-0.7891405,-0.7057443,-0.6440056
50%,-0.1001637,0.3288994,0.9474745,-0.2169688,-0.06363426,-0.1047746,-0.01700569,-0.0487819,-0.09637584,-0.04233412,-0.05994892,-0.268049,-0.1448878
75%,0.7755827,0.341454,0.9474745,0.8468288,0.6001678,0.7107909,0.6687723,0.6744911,0.4968157,0.5787152,0.7446763,0.8261893,0.6037889
max,2.003777,3.319986,2.935383,2.974424,2.724334,2.807959,2.597523,2.753901,2.770716,2.441863,2.957396,2.576971,2.101142


In [28]:
print("Original shape:", X.shape)
print("Min-Max scaled shape:", X_minmax.shape)
print("Standardized shape:", X_standard.shape)


Original shape: (556, 13)
Min-Max scaled shape: (556, 13)
Standardized shape: (556, 13)


In [16]:
# Adult Income Dataset
# Step 1: Load Dataset
adult = pd.read_csv('/content/adult.csv')
adult.head()


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [17]:
#Check Missing Values
adult.isnull().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
educational-num,0
marital-status,0
occupation,0
relationship,0
race,0
gender,0


In [18]:
# Handling Missing Values
adult.replace('?', np.nan, inplace=True)
adult.dropna(inplace=True)


In [19]:
#Handling Categorical Data
# Identify categorical columns
categorical_cols = adult.select_dtypes(include='object').columns
categorical_cols


Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'gender', 'native-country', 'income'],
      dtype='object')

In [20]:
#Encode categorical columns using Label Encoding
le = LabelEncoder()

for col in categorical_cols:
    adult[col] = le.fit_transform(adult[col])


In [21]:
#Handling Outliers
Q1 = adult.quantile(0.25)
Q3 = adult.quantile(0.75)
IQR = Q3 - Q1

adult = adult[~((adult < (Q1 - 1.5 * IQR)) |
                (adult > (Q3 + 1.5 * IQR))).any(axis=1)]


In [27]:
#Data Transformation
#Min-Max Scaling
scaler = MinMaxScaler()
adult_minmax = scaler.fit_transform(adult)
X_minmax_df = pd.DataFrame(X_minmax, columns=X.columns)
X_minmax_df.head()


Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI
0,0.892231,0.452186,0.5,0.193548,0.328947,0.690476,0.230769,0.694915,0.723404,0.2,0.673469,0.761905,0.15
1,0.106516,0.316206,0.0,0.354839,0.381579,0.404762,0.9,0.40678,0.361702,0.4,0.387755,0.380952,0.5
2,0.286967,0.188926,0.0,0.354839,0.381579,0.404762,0.9,0.40678,0.361702,0.4,0.387755,0.380952,0.5
3,0.630326,0.303545,0.0,0.290323,0.381579,0.190476,0.346154,0.40678,0.425532,0.6,0.326531,0.428571,0.2
4,0.006266,0.452318,0.0,0.225806,0.25,0.440476,0.238462,0.627119,0.744681,0.6,0.44898,0.761905,0.25


In [31]:
X_minmax_df.describe()


Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI
count,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0
mean,0.415794,0.393271,0.261691,0.549025,0.459982,0.426173,0.533821,0.483721,0.40372,0.475755,0.399919,0.439277,0.579029
std,0.291815,0.182915,0.251747,0.151754,0.198399,0.204541,0.179632,0.187641,0.215402,0.214884,0.203091,0.217786,0.200534
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.14505,0.3166,0.0,0.483871,0.315789,0.27381,0.4,0.355932,0.255319,0.333333,0.239796,0.285714,0.45
50%,0.386591,0.453378,0.5,0.516129,0.447368,0.404762,0.530769,0.474576,0.382979,0.466667,0.387755,0.380952,0.55
75%,0.641917,0.455672,0.5,0.677419,0.578947,0.571429,0.653846,0.610169,0.510638,0.6,0.55102,0.619048,0.7
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [26]:
#Standard Scaling
std_scaler = StandardScaler()
adult_standard = std_scaler.fit_transform(adult)
X_standard_df = pd.DataFrame(X_standard, columns=X.columns)
X_standard_df.head()


Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI
0,1.634136,0.322377,0.947475,-2.344564,-0.661056,1.293338,-1.68859,1.126537,1.485468,-1.284433,1.348145,1.482732,-2.141359
1,-1.060798,-0.421699,-1.040434,-1.280766,-0.395535,-0.104775,2.040328,-0.410418,-0.195241,-0.352859,-0.059949,-0.268049,-0.394447
2,-0.441866,-1.118165,-1.040434,-1.280766,-0.395535,-0.104775,2.040328,-0.410418,-0.195241,-0.352859,-0.059949,-0.268049,-0.394447
3,0.735825,-0.490976,-1.040434,-1.706285,-0.395535,-1.153359,-1.045673,-0.410418,0.101355,0.578715,-0.361683,-0.049201,-1.8918
4,-1.40465,0.323102,-1.040434,-2.131805,-1.059337,0.069989,-1.645729,0.7649,1.584333,0.578715,0.241786,1.482732,-1.642241


In [32]:
X_standard_df.describe()


Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI
count,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0,556.0
mean,6.709261000000001e-17,1.246006e-16,1.230031e-16,-3.003193e-16,-2.140574e-16,1.40575e-16,5.367409e-16,8.945682000000001e-17,2.044727e-16,-1.150159e-16,2.300318e-16,1.150159e-16,4.920125e-16
std,1.0009,1.0009,1.0009,1.0009,1.0009,1.0009,1.0009,1.0009,1.0009,1.0009,1.0009,1.0009,1.0009
min,-1.42614,-2.151958,-1.040434,-3.621121,-2.320561,-2.085434,-2.974423,-2.580238,-1.87595,-2.216007,-1.970934,-2.01883,-2.890036
25%,-0.9286305,-0.4195427,-1.040434,-0.4297283,-0.7274363,-0.7455761,-0.7456449,-0.6816458,-0.6895674,-0.6633834,-0.7891405,-0.7057443,-0.6440056
50%,-0.1001637,0.3288994,0.9474745,-0.2169688,-0.06363426,-0.1047746,-0.01700569,-0.0487819,-0.09637584,-0.04233412,-0.05994892,-0.268049,-0.1448878
75%,0.7755827,0.341454,0.9474745,0.8468288,0.6001678,0.7107909,0.6687723,0.6744911,0.4968157,0.5787152,0.7446763,0.8261893,0.6037889
max,2.003777,3.319986,2.935383,2.974424,2.724334,2.807959,2.597523,2.753901,2.770716,2.441863,2.957396,2.576971,2.101142


In [29]:
print("Original shape:", X.shape)
print("Min-Max scaled shape:", X_minmax.shape)
print("Standardized shape:", X_standard.shape)


Original shape: (556, 13)
Min-Max scaled shape: (556, 13)
Standardized shape: (556, 13)
