In [39]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
# NOTE(review): re-running this cell in a live kernel prints the
# "already loaded" notice; `%reload_ext autoreload` is the form that notice
# itself recommends for re-runs.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [43]:
# ===============================
# PHASE 3: DATA TRANSFORMATION
# ===============================

import pandas as pd
import sys
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

In [44]:
# Resolve the parent of the current working directory and register it on the
# module search path (once), so the `src` package can be imported from here.
path = os.path.abspath("../")
if path not in sys.path:
    sys.path.append(path)

In [45]:
from src.Read_Any_File_Type import reading_data

# Location of the cleaned dataset, relative to the notebook's working directory.
file_path = "../data/interim/Diabetes-Cleaned.csv"  # update path if needed

# Build the project's reader for that file, then load it; `df` is the frame
# every later cell works on.
data = reading_data(file_path)
df = data.read_data()

[SUCCESS] File loaded successfully: <class 'pandas.core.frame.DataFrame'>


In [46]:
from src.Data_Transformation import data_transformation

# Single transformer instance shared by the feature-engineering, encoding and
# scaling cells below.
data_transformer = data_transformation()

In [47]:
# =======================================================
# 1. FEATURE ENGINEERING
# =======================================================
# Derive three new columns by passing each value of a source column through
# the matching helper on `data_transformer`.

print("=== FEATURE ENGINEERING ===")

# (new column, source column, per-value helper) — applied in this order.
derived_features = (
    ('Age_Group', 'Age', data_transformer.age_group),
    ('BMI_Category', 'BMI', data_transformer.bmi_category),
    ('Glucose_Category', 'Glucose', data_transformer.glucose_category),
)
for new_col, source_col, categorize in derived_features:
    df[new_col] = df[source_col].apply(categorize)

print("✅ Feature engineering complete.\n")

=== FEATURE ENGINEERING ===
✅ Feature engineering complete.



In [48]:
# =======================================================
# 2. ENCODING
# =======================================================
# Rebinds `df` to whatever `data_transformer.encode_labels` returns for it.
#
# NOTE(review): the recorded run of this cell fails with
#   TypeError: encode_labels() takes 1 positional argument but 2 were given
# Calling through the instance passes the instance plus `df`, so the method
# in src/Data_Transformation presumably declares only one parameter (likely a
# missing `self` or a missing `df`). Fix the signature there, e.g.
# `def encode_labels(self, df)`, then re-run — TODO confirm against the module.

print("=== ENCODING ===")
    
df = data_transformer.encode_labels(df)

print("✅ Encoding complete.\n")

=== ENCODING ===


TypeError: encode_labels() takes 1 positional argument but 2 were given

In [None]:
# =======================================================
# 3. SCALING
# =======================================================
# Compare StandardScaler and MinMaxScaler output ranges on the numeric
# features, then overwrite those columns in `df` with MinMax-scaled values.
#
# NOTE(review): the scaler is fitted on every row of `df`. If a train/test
# split happens downstream, fit on the training rows only to avoid leaking
# test-set statistics — TODO confirm where the split is done.

print("=== SCALING COMPARISON ===")

# Select numeric features for scaling
numeric_cols = ['Pregnant', 'Glucose', 'Diastolic_BP', 'Skin_Fold', 
                'Serum_Insulin', 'BMI', 'Diabetes_Pedigree', 'Age']

standard_scaled, minmax_scaled = data_transformer.scale_and_compare(df, numeric_cols)

# Print scaled value ranges for comparison
print("\n--- StandardScaler Range ---")
print(standard_scaled.describe().loc[['min', 'max']])

print("\n--- MinMaxScaler Range ---")
print(minmax_scaled.describe().loc[['min', 'max']])

# Keep MinMaxScaler for modeling.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df[numeric_cols])

# Carry over df's own index: column assignment aligns on index labels, so a
# frame built with the default 0..n-1 index would write NaN / misaligned rows
# whenever df's index is anything else (e.g. after rows were filtered out).
scaled_df = pd.DataFrame(scaled_data, columns=numeric_cols, index=df.index)

# Replace original numeric columns with scaled versions
df[numeric_cols] = scaled_df

print("\n✅ Scaling applied using MinMaxScaler.\n")

=== SCALING COMPARISON ===

--- StandardScaler Range ---
     Pregnant_std  Glucose_std  Diastolic_BP_std  Skin_Fold_std  \
min     -1.141852    -2.552931         -2.768136      -1.931634   
max      3.906578     2.542658          2.706872       1.833069   

     Serum_Insulin_std   BMI_std  Diabetes_Pedigree_std   Age_std  
min          -1.494110 -2.130134              -1.189553 -1.041549  
max           1.414175  2.679918               5.883565  4.063716  

--- MinMaxScaler Range ---
     Pregnant_minmax  Glucose_minmax  Diastolic_BP_minmax  Skin_Fold_minmax  \
min              0.0             0.0                  0.0               0.0   
max              1.0             1.0                  1.0               1.0   

     Serum_Insulin_minmax  BMI_minmax  Diabetes_Pedigree_minmax  Age_minmax  
min                   0.0         0.0                       0.0         0.0  
max                   1.0         1.0                       1.0         1.0  

✅ Scaling applied using MinMaxScaler

In [None]:
# =======================================================
# FINAL CHECK
# =======================================================
# Show the first rows and the column list of the transformed frame, then
# write it to CSV without the row index.

transformed_csv = "../data/interim/Diabetes-Transformed.csv"

print("=== FINAL DATASET PREVIEW ===")
preview = df.head()
print(preview)
column_names = df.columns.tolist()
print("\nColumns in final dataset:", column_names)

# Persist the transformed dataset
df.to_csv(transformed_csv, index=False)
print("\n✅ Transformed dataset saved as 'Diabetes-Transformed.csv'")

=== FINAL DATASET PREVIEW ===
   Pregnant   Glucose  Diastolic_BP  Skin_Fold  Serum_Insulin       BMI  \
0  0.352941  0.670968       0.50000   0.732143       0.527174  0.480499   
1  0.058824  0.264516       0.40625   0.517857       0.527174  0.262090   
2  0.470588  0.896774       0.37500   0.517857       0.527174  0.159126   
3  0.058824  0.290323       0.40625   0.303571       0.000000  0.308892   
4  0.000000  0.600000       0.00000   0.732143       1.000000  0.776911   

   Diabetes_Pedigree       Age  Class BMI_Category  BMI_Category_Encoded  \
0           0.234415  0.483333      1        Obese                     3   
1           0.116567  0.166667      0   Overweight                     2   
2           0.253629  0.183333      1       Normal                     1   
3           0.038002  0.000000      0   Overweight                     2   
4           0.943638  0.200000      1        Obese                     3   

   Age_Group_Senior  Age_Group_Young  Glucose_Category_Normal 