In [3]:
# import
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder


In [12]:
# Load dataset
diabetes_df = pd.read_csv("/content/Dataset of Diabetes .csv")

# Handle missing values
# The original 'cols' list contained columns not present in the dataframe.
# The actual columns in diabetes_df are:
# ID, No_Pation, Gender, AGE, Urea, Cr, HbA1c, Chol, TG, HDL, LDL, VLDL, BMI, CLASS
# Based on the error, 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin' are not found.
# 'BMI' is present, so we will only process 'BMI' from the original list for now.
cols = ["BMI"]
diabetes_df[cols] = diabetes_df[cols].replace(0, np.nan)
diabetes_df.fillna(diabetes_df.mean(numeric_only=True), inplace=True)

# Handle categorical data
encoder = LabelEncoder()
for col in diabetes_df.select_dtypes(include='object').columns:
    diabetes_df[col] = encoder.fit_transform(diabetes_df[col])

# Handle outliers
Q1 = diabetes_df.quantile(0.25, numeric_only=True)
Q3 = diabetes_df.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1

# Get the list of numeric columns for which Q1, Q3, IQR were calculated
numeric_cols_for_outliers = Q1.index

# Apply the outlier condition only to these numeric columns of diabetes_df
outlier_condition = ((diabetes_df[numeric_cols_for_outliers] < (Q1 - 1.5 * IQR)) |
                    (diabetes_df[numeric_cols_for_outliers] > (Q3 + 1.5 * IQR)))

# Filter out rows where ANY of the numeric columns meet the outlier condition
diabetes_df = diabetes_df[~outlier_condition.any(axis=1)]

# Min-Max Normalization
diabetes_minmax = pd.DataFrame(
    MinMaxScaler().fit_transform(diabetes_df),
    columns=diabetes_df.columns
)

# Standard Scaling
diabetes_standard = pd.DataFrame(
    StandardScaler().fit_transform(diabetes_df),
    columns=diabetes_df.columns
)

print("--- Diabetes Min-Max Normalized Data (Head) ---")
print(diabetes_minmax.head())
print("\n--- Diabetes Standard Scaled Data (Head) ---")
print(diabetes_standard.head())

--- Diabetes Min-Max Normalized Data (Head) ---
         ID  No_Pation  Gender       AGE      Urea        Cr     HbA1c  \
0  0.892231   0.452186     0.5  0.193548  0.328947  0.690476  0.230769   
1  0.106516   0.316206     0.0  0.354839  0.381579  0.404762  0.900000   
2  0.286967   0.188926     0.0  0.354839  0.381579  0.404762  0.900000   
3  0.630326   0.303545     0.0  0.290323  0.381579  0.190476  0.346154   
4  0.006266   0.452318     0.0  0.225806  0.250000  0.440476  0.238462   

       Chol        TG  HDL       LDL      VLDL   BMI  CLASS  
0  0.694915  0.723404  0.2  0.673469  0.761905  0.15    0.0  
1  0.406780  0.361702  0.4  0.387755  0.380952  0.50    0.0  
2  0.406780  0.361702  0.4  0.387755  0.380952  0.50    0.0  
3  0.406780  0.425532  0.6  0.326531  0.428571  0.20    0.0  
4  0.627119  0.744681  0.6  0.448980  0.761905  0.25    0.0  

--- Diabetes Standard Scaled Data (Head) ---
         ID  No_Pation    Gender       AGE      Urea        Cr     HbA1c  \
0  1.634136  

In [13]:
# Load dataset
adult_df = pd.read_csv("/content/adult.csv")

# Handle missing values
adult_df.replace(" ?", np.nan, inplace=True)
for col in adult_df.select_dtypes(include="object").columns:
    adult_df[col].fillna(adult_df[col].mode()[0], inplace=True)

# Handle categorical data
encoder = LabelEncoder()
for col in adult_df.select_dtypes(include="object").columns:
    adult_df[col] = encoder.fit_transform(adult_df[col])

# Handle outliers
Q1 = adult_df.quantile(0.25)
Q3 = adult_df.quantile(0.75)
IQR = Q3 - Q1
adult_df = adult_df[~((adult_df < (Q1 - 1.5 * IQR)) |
                        (adult_df > (Q3 + 1.5 * IQR))).any(axis=1)]

# Min-Max Normalization
adult_minmax = pd.DataFrame(
    MinMaxScaler().fit_transform(adult_df),
    columns=adult_df.columns
)

# Standard Scaling
adult_standard = pd.DataFrame(
    StandardScaler().fit_transform(adult_df),
    columns=adult_df.columns
)

print("--- Adult Min-Max Normalized Data (Head) ---")
print(adult_minmax.head())
print("\n--- Adult Standard Scaled Data (Head) ---")
print(adult_standard.head())

--- Adult Min-Max Normalized Data (Head) ---
        age  workclass    fnlwgt  education  educational-num  marital-status  \
0  0.344262        0.0  0.188277   0.555556         0.363636        0.333333   
1  0.114754        0.0  0.881156   1.000000         0.454545        0.666667   
2  0.147541        0.0  0.169156   0.555556         0.363636        0.666667   
3  0.131148        0.0  0.475807   0.333333         0.727273        0.333333   
4  0.606557        0.0  0.212298   0.555556         0.363636        0.333333   

   occupation  relationship  race  gender  capital-gain  capital-loss  \
0    0.307692           0.0   0.0     1.0           0.0           0.0   
1    0.538462           0.8   0.0     0.0           0.0           0.0   
2    0.000000           0.2   0.0     0.0           0.0           0.0   
3    0.692308           0.0   0.0     1.0           0.0           0.0   
4    0.153846           0.0   0.0     1.0           0.0           0.0   

   hours-per-week  native-country  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  adult_df[col].fillna(adult_df[col].mode()[0], inplace=True)
