
# Data Preprocessing Pipeline

This notebook performs the following preprocessing steps on the dataset:
1. One-hot encoding of categorical variables
2. Classification of final grades (G3) into Low, Medium, and High
3. Outlier removal using the IQR method
4. Min-Max Scaling of numeric features
5. Saving the final cleaned dataset

Make sure `final_combined_dataset.csv` is present in the same directory.


In [None]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def classify_grade(g):
    """Map a numeric grade to the label "Low", "Medium" or "High".

    Grades up to and including 9 are "Low", grades up to and including 14
    are "Medium", and everything else is "High". A value that fails both
    ``<=`` comparisons (for example NaN) therefore ends up as "High".
    """
    # Inclusive upper bounds, checked in ascending order.
    upper_bounds = ((9, "Low"), (14, "Medium"))
    for limit, label in upper_bounds:
        if g <= limit:
            return label
    return "High"

def remove_outliers_iqr(df, numeric_cols):
    """Drop rows lying outside the 1.5 * IQR fences of the given columns.

    Columns are processed one after another, and each column's quartiles are
    computed on the rows that survived the previous columns. Rows whose value
    does not compare inside the fences (including NaN) are removed. The input
    frame is not modified; a filtered frame is returned.
    """
    kept = df
    for name in numeric_cols:
        first_q, third_q = kept[name].quantile(0.25), kept[name].quantile(0.75)
        spread = third_q - first_q
        # Fences are inclusive on both ends.
        in_range = kept[name].between(first_q - 1.5 * spread,
                                      third_q + 1.5 * spread)
        kept = kept[in_range]
    return kept

def full_preprocessing_pipeline(input_csv, output_csv):
    """Run the full preprocessing pipeline on a CSV file and save the result.

    Steps, in order: load ``input_csv``; one-hot encode the object-typed
    columns; derive ``G3_Class`` from ``G3`` with ``classify_grade`` (only
    when a ``G3`` column exists); remove IQR outliers on the originally
    numeric columns other than ``G3``; Min-Max scale the features; write the
    scaled features (plus ``G3_Class`` when available) to ``output_csv``
    without the index.

    Args:
        input_csv: Source passed to ``pd.read_csv``.
        output_csv: Destination passed to ``DataFrame.to_csv``.

    Raises:
        ValueError: If outlier removal leaves no rows to scale.
    """
    # Step 1: Load dataset
    df = pd.read_csv(input_csv)

    # Pick the outlier-filter columns from the raw frame, before encoding.
    # Selecting after get_dummies can sweep the 0/1 indicator columns into
    # the IQR filter on pandas versions that emit integer dummies, which
    # drops every row belonging to a minority category.
    numeric_cols = [
        col for col in df.select_dtypes(include=[np.number]).columns
        if col != 'G3'
    ]

    # Step 2: One-hot encode categorical columns
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=False)

    # Step 3: Classify G3 (skipped when the column is absent)
    has_target = 'G3' in df_encoded.columns
    if has_target:
        df_encoded['G3_Class'] = df_encoded['G3'].apply(classify_grade)

    # Step 4: Remove outliers
    df_clean = remove_outliers_iqr(df_encoded, numeric_cols)
    if len(df_clean) == 0:
        raise ValueError(
            "No rows left after outlier removal; nothing to scale or save."
        )

    # Step 5: Scale features (the raw grade and its class label are excluded)
    X = df_clean.drop(columns=['G3', 'G3_Class'], errors='ignore')
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    df_scaled = pd.DataFrame(X_scaled, columns=X.columns)

    # Step 6: Merge features with target. df_scaled has a fresh 0..n-1 index,
    # so the labels are attached positionally via .values. Only done when the
    # target was actually created in Step 3.
    if has_target:
        df_scaled['G3_Class'] = df_clean['G3_Class'].values

    # Step 7: Save to CSV
    df_scaled.to_csv(output_csv, index=False)
    print(f"✅ Dataset processed and saved to: {output_csv}")

# Script entry point: process the combined dataset that sits next to this
# file and write the cleaned result alongside it.
if __name__ == "__main__":
    full_preprocessing_pipeline(
        "final_combined_dataset.csv",
        "full_processed_dataset.csv",
    )
