# Data Preprocessing

## 0. Importing Libraries

In [6]:
from typing import Tuple, Union

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder

#! DO NOT ADD OR MODIFY IMPORTS - YOU NEED TO WORK WITH THE ABOVE IMPORTS !

In [7]:
# Module-level preprocessing objects used by the helper functions in this
# notebook. Both are created with scikit-learn's default settings.
encoder = OneHotEncoder()
scaler = StandardScaler()

## 1. Handling Missing Values

### 1.1 What I've been told to do

> Strategy :
> - Numerical Features : Fill with median
> - Categorical Features : fill with mode

> Edge Cases to Consider :
> - Numerical column with all NaN values: Should fill with 0 or exclude the column
> - Categorical column with all NaN values: Should fill with 'unknown'

### 1.2 What I've learned to do

#### Handling Missing Values in Numerical Features

For a numerical feature whose values are all NaN, whether to fill the column with 0 or to exclude it depends on what the column means in the context of the data.

> **Example 1** : In a dataset about customer purchases, if a column represents a numerical feature (e.g., “number of luxury items bought”) but contains only NaNs, it may be better to remove it as it adds no value.

> **Example 2** : In a dataset where a column represents “number of visits to a website,” and the missing values mean no visits occurred, replacing NaN with 0 makes sense.

#### Handling Missing Values in Categorical Features

In the case of categorical features with all NaN values, the decision to exclude or fill with 'unknown' depends on the context of the data.

> **Example 1** : In a dataset about customer preferences, if a column represents a categorical feature (e.g., “favorite luxury brand”) but contains only NaN, it may be better to remove it as it adds no value.

> **Example 2** : In a dataset where a column represents “preferred contact method,” and the missing values indicate no preference or unknown status, replacing NaN with 'unknown' makes sense.


In [16]:
def handle_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Handle missing values in the dataset.

    Numerical columns are filled with their median and object-dtype
    (categorical) columns with their mode. Columns that contain only
    missing values are handled as follows:

    - 'age': the column is dropped (0 is not a meaningful age here).
    - any other numerical column: filled with 0.
    - categorical columns: filled with "unknown".

    Args:
        df: Input DataFrame. Missing values are filled in place on this
            object.

    Returns:
        DataFrame with handled missing values. This is ``df`` itself unless
        the 'age' column had to be dropped, in which case a new DataFrame
        without that column is returned.
    """
    # Only the column labels are needed, not copies of the data.
    numerical_cols = df.select_dtypes(include=["number"]).columns
    categorical_cols = df.select_dtypes(include=["object"]).columns

    # 'age' is optional: a frame without it is processed like any other.
    drop_age = False
    if "age" in df.columns:
        if df["age"].isnull().all():
            # Dropped at the very end so every fill below still happens on
            # the caller's frame.
            drop_age = True
        else:
            df["age"] = df["age"].fillna(df["age"].median())

    # Remaining numerical columns (e.g. salary, experience): an entirely
    # missing column is taken to mean "none", hence 0.
    for col in numerical_cols:
        if col == "age":
            continue
        if df[col].isnull().all():
            df[col] = 0
        else:
            df[col] = df[col].fillna(df[col].median())

    # Categorical columns (e.g. department, education): keep the column even
    # when it is entirely missing, since it may carry values in future data.
    for col in categorical_cols:
        if col == "age":
            continue
        if df[col].isnull().all():
            df[col] = "unknown"
        else:
            # mode() may return several values on ties; take the first.
            df[col] = df[col].fillna(df[col].mode(dropna=True)[0])

    # `columns=` is required: a bare label list would be looked up in the
    # row index and raise KeyError.
    return df.drop(columns=["age"]) if drop_age else df

## 2. Remove Outliers

### 2.1. What I've been told to do

> - For each column in 'columns', calculate the mean and standard deviation.
> - Compute the z-score for each value in the column.
> - Drop rows where the z-score exceeds 'threshold'.


```text
Z-score calculation (pseudo-code):

z_scores = (column_values - column_mean) / column_std
Use 'np.abs(z_scores) < threshold' to filter rows.
```

### 2.2. What I've learned to do

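Here we calculate the z-scores manually to remove outliers. Alternatively, the `zscore` function from the `scipy.stats` module computes the z-score of every value in a column. Note that `zscore` uses the population standard deviation (`ddof=0`) by default, whereas pandas' `.std()` uses the sample standard deviation (`ddof=1`), so the two approaches give slightly different values.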

```python
from scipy.stats import zscore

# Calculate z-scores for each value in the column
z_scores = zscore(df['column_name'])

# Filter rows based on z-scores
threshold = 3
df = df[(np.abs(z_scores) < threshold)]
```

Careful: Here we consider only one column at a time. If you want to remove outliers from multiple columns, you can calculate z-scores for each column and then combine them to filter rows.
It is shown in the code below.

In [9]:
def remove_outliers(df: pd.DataFrame, columns: list, threshold: float = 3
    ) -> pd.DataFrame:
    """
    Remove outliers from specified numerical columns using z-score method.

    A row is kept only when the absolute z-score of every listed column is
    strictly below ``threshold``. Z-scores use the sample standard deviation
    (pandas' default, ``ddof=1``).

    Args:
        df: Input DataFrame
        columns: List of numerical columns to check for outliers
        threshold: Z-score threshold (default = 3)

    Returns:
        DataFrame with outliers removed. The original index labels of the
        surviving rows are preserved.
    """
    values = df[columns]
    std = values.std(axis=0)

    # A constant column has std == 0 and a single-row frame has std == NaN.
    # Dividing by either gives NaN z-scores, and NaN < threshold is False,
    # which would silently drop every row. Such columns cannot contain
    # outliers, so divide by 1 instead: their z-scores become 0.
    safe_std = std.where(std > 0, 1.0)

    z_scores = (values - values.mean(axis=0)) / safe_std

    # Keep the rows whose z-scores are within the threshold in every column.
    within_threshold = (np.abs(z_scores) < threshold).all(axis=1)
    return df[within_threshold]

## 3. Scale Numerical Features

### 3.1. What I've been told to do

Scale numerical features using StandardScaler. (A bit vague)

### 3.2. What I've learned to do

We can use the `StandardScaler` class from the `sklearn.preprocessing` module to scale numerical features. The `StandardScaler` scales each feature to have a mean of 0 and a standard deviation of 1.

By default, the `StandardScaler` scales each feature (column) independently. The same is true of `MinMaxScaler`, which uses a different formula.

> Use StandardScaler when features have different units or scales but are approximately normally distributed.
> Use MinMaxScaler when you need all features in the same range, especially when the data is not normally distributed or for models that are sensitive to absolute magnitude.

And,
> If you want all features scaled together (globally) based on the entire dataset, you would need to preprocess the data differently (e.g., flattening all features, calculating the global minimum and maximum, and scaling accordingly).

In [12]:
def scale_features(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Scale numerical features using StandardScaler.

    The module-level ``scaler`` is re-fitted on the given columns each time
    this function is called, and the scaled values are written back into
    ``df`` in place.

    Args:
        df: Input DataFrame (modified in place)
        columns: List of numerical columns to scale

    Returns:
        DataFrame with scaled features (the same object as ``df``)
    """
    # Nothing to scale: the scaler rejects input with zero features, so
    # return early instead of raising.
    if len(columns) == 0:
        return df

    df[columns] = scaler.fit_transform(df[columns])
    return df

## 4. Encode Categorical Features

### 4.1. What I've been told to do

> 1. First fills NaN values with a placeholder to ensure consistent encoding.
> 2. Encode categorical variables using one-hot encoding.

### 4.2. What I've learned to do

We can use the `OneHotEncoder` class from the `sklearn.preprocessing` module to encode categorical features using one-hot encoding.

The `OneHotEncoder` class converts categorical features into binary vectors. Each unique category value is mapped to an integer value and then converted into a binary vector with all zero values except the index of the integer, which is marked with a 1.

In [13]:
def encode_categorical(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """
    Encode categorical variables using one-hot encoding.
    First fills NaN values with a placeholder to ensure consistent encoding.

    The module-level ``encoder`` is re-fitted on the given columns each time
    this function is called. The input DataFrame is not modified.

    Args:
        df: Input DataFrame
        columns: List of categorical columns to encode

    Returns:
        DataFrame with encoded categorical variables: the original columns
        are replaced by one indicator column per category, and the row index
        of ``df`` is preserved.
    """
    # Nothing to encode: the encoder rejects input with zero features.
    if len(columns) == 0:
        return df

    # Replace NaN values with 'unknown' on a separate frame so the caller's
    # data is left untouched.
    filled = df[columns].fillna("unknown")

    # Performs one-hot encoding
    encoded_array = encoder.fit_transform(filled).toarray()
    encoded_df = pd.DataFrame(
        encoded_array,
        columns=encoder.get_feature_names_out(columns),
        # Reuse the input's index: pd.concat(axis=1) aligns on index labels,
        # so a fresh 0..n-1 index would misalign the rows (and introduce NaN)
        # whenever df's index is not the default one, e.g. after filtering.
        index=df.index,
    )

    # Drop original columns and concatenate encoded columns
    return pd.concat([df.drop(columns, axis=1), encoded_df], axis=1)

## 5. Preprocess Data

In [14]:
def preprocess_data(df: pd.DataFrame, numerical_columns: list, categorical_columns: list
    ) -> Tuple[pd.DataFrame, dict]:
    """
    Complete preprocessing pipeline.

    Steps, applied in order with each one consuming the previous result:
    1. Handle missing values ('handle_missing_values').
    2. Remove outliers ('remove_outliers').
    3. Scale features ('scale_features').
    4. Encode categorical variables ('encode_categorical').

    The caller's DataFrame is not modified; the pipeline works on a copy.

    Args:
        df: Input DataFrame
        numerical_columns: List of numerical columns
        categorical_columns: List of categorical columns

    Returns:
        Tuple containing:
        - Preprocessed DataFrame, re-indexed 0..n-1 after outlier removal
        - Dictionary with preprocessing statistics: for every numerical
          column still present, the "mean" and "std" of that column in the
          preprocessed (i.e. scaled) DataFrame
    """
    # 1. Handle missing values, on a copy so the caller's frame is untouched.
    preprocessed_df = handle_missing_values(df.copy())

    # Step 1 may remove a column it cannot impute; skip any such column in
    # the later steps instead of failing with a KeyError.
    dropped = set(df.columns) - set(preprocessed_df.columns)
    numerical_columns = [col for col in numerical_columns if col not in dropped]

    if numerical_columns:
        # 2. Remove outliers. The index is reset so later steps see a clean
        #    0..n-1 index and work on an independent frame, not on a
        #    filtered slice of the previous one.
        preprocessed_df = remove_outliers(preprocessed_df, numerical_columns)
        preprocessed_df = preprocessed_df.reset_index(drop=True)

        # 3. Scale features.
        preprocessed_df = scale_features(preprocessed_df, numerical_columns)

    # 4. Encode categorical variables.
    if categorical_columns:
        preprocessed_df = encode_categorical(preprocessed_df, categorical_columns)

    # Statistics are taken from the final frame, i.e. after scaling.
    stats_dict = {
        col: {
            "mean": preprocessed_df[col].mean(),
            "std": preprocessed_df[col].std(),
        }
        for col in numerical_columns
    }

    return preprocessed_df, stats_dict

## 6. Main Code

In [15]:
# Read the sample data from disk
df = pd.read_csv("sample_dataset.csv")

# Column groups handed to the pipeline
numerical_cols = ["age", "salary", "experience"]
categorical_cols = ["department", "education"]

# Run the full pipeline on a copy of the loaded frame
processed_df, stats = preprocess_data(df.copy(), numerical_cols, categorical_cols)

print(processed_df.head())

# Status line, blank line, then one line of statistics per column
print("Preprocessing complete!", "\nPreprocessing statistics:", sep="\n")
for key, value in stats.items():
    print(f"{key}: {value}")

        age    salary  experience  department_Finance  department_HR  \
0  0.489727  0.916849   -0.105625                 0.0            0.0   
1 -0.069833  0.570817    0.299068                 0.0            0.0   
2  0.601640 -0.071815   -0.307972                 0.0            0.0   
3  1.608849 -0.615580    0.096722                 0.0            1.0   
4 -0.293657  0.373084   -1.319705                 0.0            0.0   

   department_IT  department_Marketing  department_Sales  education_Bachelor  \
0            0.0                   1.0               0.0                 0.0   
1            0.0                   1.0               0.0                 1.0   
2            1.0                   0.0               0.0                 1.0   
3            0.0                   0.0               0.0                 1.0   
4            0.0                   0.0               1.0                 1.0   

   education_Master  education_PhD  
0               0.0            1.0  
1           