# Practical 2


In [None]:
''' 
Aim:
    Apply data pre-processing techniques (standardization/normalization, transformation,
    aggregation, discretization/binarization, and sampling) to a dataset.
'''

In [1]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, PowerTransformer, StandardScaler
from sklearn.utils import resample

In [None]:
# STANDARDIZATION
# Rescale each numeric feature column of X to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all of X; if a model is later scored on
# a held-out split, fit on the training rows only -- confirm intended use.
scaler = StandardScaler().fit(X)  # learn per-column mean and std
X_scaled = scaler.transform(X)    # apply (x - mean) / std



In [None]:
# NORMALIZATION
# Min-max scaling: map each feature column of X linearly onto [0, 1].
mms = MinMaxScaler(feature_range=(0, 1)).fit(X)  # (0, 1) is the default range
X_norm = mms.transform(X)



In [None]:
# TRANSFORMATION
# Power transform to stabilize variance. Yeo-Johnson is used here rather than
# Box-Cox because it also accepts zero and negative values.
pt = PowerTransformer(
    method='yeo-johnson',
    standardize=True,  # default: output is rescaled to zero mean, unit variance
)
X_trans = pt.fit_transform(X)



In [None]:
# AGGREGATION
# Collapse the review rows to one row per paper_id, summarising `score` with
# its mean, median, standard deviation and number of reviews.
score_stats = ['mean', 'median', 'std', 'count']
agg = df.groupby('paper_id').agg({'score': score_stats})
# The result has two-level columns such as ('score', 'mean'); flatten them to
# single names such as 'score_mean'.
agg.columns = [f"{column}_{stat}" for column, stat in agg.columns]



In [None]:
# DISCRETIZATION / BINARIZATION
# Bucket the numeric score into four ordered classes:
#   [0, 4] -> poor, (4, 6] -> average, (6, 8] -> good, (8, 10] -> excellent
# include_lowest=True closes the first interval on the left so a score of
# exactly 0 is binned instead of becoming NaN.
bins = [0, 4, 6, 8, 10]
labels = ['poor', 'average', 'good', 'excellent']
df['score_bin'] = pd.cut(
    df['score'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

# One-hot encode the categorical columns for ML. dummy_na=True adds an extra
# indicator column per feature for missing values. Note that `df` is replaced
# by the encoded frame.
df = pd.get_dummies(
    data=df,
    columns=['score_bin', 'recommendation'],
    dummy_na=True,
)



In [None]:
# SAMPLING
# Undersample the majority class so that both classes of the binary target
# column `y` end up with the same number of rows.
#
# imbalanced-learn is an optional alternative and is not used below, so a
# missing installation must not abort the cell.
try:
    from imblearn.under_sampling import RandomUnderSampler  # optional
except ImportError:
    RandomUnderSampler = None

# Plain scikit-learn approach:
major = df[df.y == 0]
minor = df[df.y == 1]
# Let the row counts decide which class is the majority. Sampling without
# replacement cannot draw more rows than exist, so assuming class 0 is always
# the larger one would raise ValueError whenever class 1 dominates.
if len(minor) > len(major):
    major, minor = minor, major
major_down = resample(major, replace=False, n_samples=len(minor), random_state=42)
# Rows stay grouped by class after the concat (downsampled majority first).
df_balanced = pd.concat([major_down, minor])



In [None]:
# STRATIFIED SPLIT
# Hold out 20% of the rows for testing. stratify=y keeps the class proportions
# of y the same in both parts; random_state fixes the shuffle so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)