# Practical 2


In [1]:
''' 
Aim:
    Apply data pre-processing techniques such as standardization/normalization, transformation,
    aggregation, discretization/binarization, sampling etc. on any dataset
'''

' \nAim:\n    Apply data pre-processing techniques such as standardization/normalization, transformation,\n    aggregation, discretization/binarization, sampling etc. on any dataset\n'

In [3]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.datasets import load_iris
import pandas as pd

In [4]:
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target

print("Original Dataset: ")
print(df.head())

Original Dataset: 
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  


In [11]:
# 1. STANDARDIZATION (Z-score normalization: mean=0, SD=1)
# ----------------------------------------------------------------
scaler = StandardScaler()
X_std = scaler.fit_transform(df[iris.feature_names])

print("\nStandardized Sample:")
print(X_std[:5])


Standardized Sample:
[[-0.90068117  1.01900435 -1.34022653 -1.3154443 ]
 [-1.14301691 -0.13197948 -1.34022653 -1.3154443 ]
 [-1.38535265  0.32841405 -1.39706395 -1.3154443 ]
 [-1.50652052  0.09821729 -1.2833891  -1.3154443 ]
 [-1.02184904  1.24920112 -1.34022653 -1.3154443 ]]


In [7]:
# 2. NORMALIZATION (Min-Max scaling → [0,1])
# ----------------------------------------------------------------
mms = MinMaxScaler()
X_norm = mms.fit_transform(df[iris.feature_names])

print("\nNormalized Sample:")
print(X_norm[:5])



Normalized Sample:
[[0.22222222 0.625      0.06779661 0.04166667]
 [0.16666667 0.41666667 0.06779661 0.04166667]
 [0.11111111 0.5        0.05084746 0.04166667]
 [0.08333333 0.45833333 0.08474576 0.04166667]
 [0.19444444 0.66666667 0.06779661 0.04166667]]


In [9]:
# 3. TRANSFORMATION (Yeo-Johnson for variance stabilization)
# ----------------------------------------------------------------
pt = PowerTransformer(method="yeo-johnson")
X_trans = pt.fit_transform(df[iris.feature_names])

print("\nTransformed Sample:")
print(X_trans[:5])


Transformed Sample:
[[-0.89568956  1.02290812 -1.3323059  -1.33226632]
 [-1.18517298 -0.08191725 -1.3323059  -1.33226632]
 [-1.48792061  0.37512615 -1.38596224 -1.33226632]
 [-1.64460908  0.14928268 -1.27844068 -1.33226632]
 [-1.03883758  1.22963765 -1.3323059  -1.33226632]]


In [12]:
# 4. AGGREGATION (Summarizing info by target class)
# ----------------------------------------------------------------
agg = df.groupby("target").agg(['mean', 'median', 'std'])
print("\nAggregated Features per Class:")
print(agg)


Aggregated Features per Class:
       sepal length (cm)                  sepal width (cm)                   \
                    mean median       std             mean median       std   
target                                                                        
0                  5.006    5.0  0.352490            3.428    3.4  0.379064   
1                  5.936    5.9  0.516171            2.770    2.8  0.313798   
2                  6.588    6.5  0.635880            2.974    3.0  0.322497   

       petal length (cm)                  petal width (cm)                   
                    mean median       std             mean median       std  
target                                                                       
0                  1.462   1.50  0.173664            0.246    0.2  0.105386  
1                  4.260   4.35  0.469911            1.326    1.3  0.197753  
2                  5.552   5.55  0.551895            2.026    2.0  0.274650  


In [13]:
# 5. DISCRETIZATION (Binning: turning numeric → categories)
# ----------------------------------------------------------------
bins = [0, 2.5, 4.5, 6.5, 8]
labels = ['XS', 'S', 'M', 'L']

df["sepal_length_bin"] = pd.cut(df["sepal length (cm)"], bins=bins, labels=labels)

print("\nDiscretized sample:")
print(df[["sepal length (cm)", "sepal_length_bin"]].head())


Discretized sample:
   sepal length (cm) sepal_length_bin
0                5.1                M
1                4.9                M
2                4.7                M
3                4.6                M
4                5.0                M


In [14]:
# 6. BINARIZATION (One-Hot Encoding)
# ----------------------------------------------------------------
df_encoded = pd.get_dummies(df, columns=["sepal_length_bin"], dummy_na=True)

print("\nAfter One-Hot Encoding:")
print(df_encoded.head())


After One-Hot Encoding:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  sepal_length_bin_XS  sepal_length_bin_S  sepal_length_bin_M  \
0       0                False               False                True   
1       0                False               False                True   
2       0                False               False                True   
3       0                False               False                True   
4       0                False               False                True   

   sepal_length_bin_L  sepal_length_bin_nan  
0        

In [15]:
# 7. SAMPLING (Making dataset balanced with downsampling)
# ----------------------------------------------------------------
# Make an intentionally imbalanced dataset for demo
df_imbalanced = df[df["target"] != 2]

majority = df_imbalanced[df_imbalanced.target == 0]
minority = df_imbalanced[df_imbalanced.target == 1]

majority_down = resample(majority, 
                         replace=False,
                         n_samples=len(minority),
                         random_state=42)

df_balanced = pd.concat([majority_down, minority])

print("\nBalanced Class Counts:")
print(df_balanced["target"].value_counts())


Balanced Class Counts:
target
0    50
1    50
Name: count, dtype: int64


In [16]:
# 8. STRATIFIED TRAIN-TEST SPLIT
# ---------------------------------------------------------------
X = df[iris.feature_names]
y = df["target"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("\nTrain/Test Split Shapes:")
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)


Train/Test Split Shapes:
X_train: (120, 4)
X_test: (30, 4)
