### Task 1: Handling Missing Values - Simple Imputation
**Description**: Given a dataset with missing values, impute the missing values using the mean for numerical features and the mode for categorical features.

In [2]:
import pandas as pd
import numpy as np
import logging
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def validate_columns(df, required_columns):
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

def preprocess_data(df):
    if df.empty:
        raise ValueError("Input DataFrame is empty.")

    logging.info("Starting preprocessing...")

    # Define column groups
    numeric_features = ['age', 'income']
    categorical_features = ['city']

    # Validate columns
    validate_columns(df, numeric_features + categorical_features)

    # Define transformers
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent'))
    ])

    # Combine transformers
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Fit-transform the data
    processed_data = preprocessor.fit_transform(df)

    # Recreate a DataFrame
    processed_df = pd.DataFrame(
        processed_data,
        columns=numeric_features + categorical_features
    )

    logging.info("Preprocessing completed successfully.")
    return processed_df

# Example usage
if __name__ == "__main__":
    sample_data = pd.DataFrame({
        'age': [25, np.nan, 47, 51],
        'income': [50000, 64000, None, 110000],
        'city': ['New York', 'San Francisco', None, 'Chicago']
    })

    try:
        clean_data = preprocess_data(sample_data)
        print(clean_data)
    except Exception as e:
        logging.error(f"Preprocessing failed: {e}")


2025-05-22 05:11:21,046 - INFO - Starting preprocessing...
2025-05-22 05:11:21,058 - ERROR - Preprocessing failed: '<' not supported between instances of 'NoneType' and 'str'


### Task 2: Feature Scaling - Min-Max Normalization
**Description**: Normalize a numerical feature using Min-Max scaling to a range [0, 1].

In [3]:
# write your code from here
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Sample numerical data
data = pd.DataFrame({
    'age': [25, 32, 47, 51],
    'income': [50000, 64000, 120000, 110000]
})

# Initialize Min-Max Scaler to scale features between 0 and 1
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit scaler on data and transform
scaled_data = scaler.fit_transform(data)

# Convert back to DataFrame for readability
scaled_df = pd.DataFrame(scaled_data, columns=data.columns)

print(scaled_df)


        age    income
0  0.000000  0.000000
1  0.269231  0.200000
2  0.846154  1.000000
3  1.000000  0.857143


### Task 3: Handling Missing Values - Drop Missing Values
**Description**: Remove rows with missing values from a dataset.

In [4]:
# write your code from here

### Task 4: Feature Scaling - Standardization
**Description**: Standardize a numerical feature to have zero mean and unit variance.

In [5]:
# write your code from here