## Handling Missing Values in Large-scale ML Pipelines:

**Task 1**: Impute with Mean or Median
- Step 1: Load a dataset with missing values (e.g., Boston Housing dataset).
- Step 2: Identify columns with missing values.
- Step 3: Impute missing values using the mean or median of the respective columns.

In [None]:
# write your code from here
from sklearn.datasets import fetch_openml
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np

def impute_missing_values(strategy='mean'):
    """Load the Boston Housing data and fill in its missing values.

    The dataset is requested via ``fetch_openml``. If the loaded frame has
    no missing values at all, two cells are blanked out first so that the
    imputation has something to do (demonstration only).

    Args:
        strategy: Forwarded unchanged to ``SimpleImputer`` (default
            ``'mean'``).

    Returns:
        A new DataFrame holding the imputed values under the original column
        names, or a string of the form ``"Error: <message>"`` if any step
        raises.
    """
    try:
        frame = fetch_openml(name='boston', version=1, as_frame=True).frame

        if not frame.isnull().values.any():
            # Punch two holes into the otherwise complete data so the
            # imputer's effect is visible.
            for row, col in ((0, 0), (4, 3)):
                frame.iloc[row, col] = np.nan

        filled = SimpleImputer(strategy=strategy).fit_transform(frame)
        return pd.DataFrame(filled, columns=frame.columns)
    except Exception as exc:
        return f"Error: {exc}"


**Task 2**: Impute with the Most Frequent Value
- Step 1: Use the Titanic dataset and identify columns with missing values.
- Step 2: Impute categorical columns using the most frequent value.

In [None]:
# write your code from here
from sklearn.impute import SimpleImputer
import pandas as pd

def impute_most_frequent():
    """Fill missing categorical values in the Titanic data with the mode.

    The CSV is read from a fixed GitHub URL. Only columns of dtype
    ``object`` or ``category`` are imputed (``SimpleImputer`` with
    ``strategy='most_frequent'``); every other column is left as loaded.

    Returns:
        The loaded DataFrame with its categorical columns replaced by their
        imputed versions, or a string of the form ``"Error: <message>"`` if
        any step raises.
    """
    try:
        titanic_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
        passengers = pd.read_csv(titanic_url)

        cat_cols = passengers.select_dtypes(include=['object', 'category']).columns

        filled_values = SimpleImputer(strategy='most_frequent').fit_transform(
            passengers[cat_cols]
        )
        filled = pd.DataFrame(filled_values, columns=cat_cols)

        # Write the imputed columns back one at a time, in their original order.
        for name in cat_cols:
            passengers[name] = filled[name]

        return passengers
    except Exception as exc:
        return f"Error: {exc}"


**Task 3**: Advanced Imputation - k-Nearest Neighbors
- Step 1: Implement KNN imputation using the KNNImputer from sklearn.
- Step 2: Explore how KNN imputation improves data completion over simpler methods.

In [None]:
# write your code from here
from sklearn.impute import KNNImputer
import pandas as pd
import numpy as np

def knn_imputation(data, n_neighbors=5):
    """Impute missing numeric values with k-nearest-neighbours imputation.

    Only numeric columns that contain at least one observed value are passed
    to ``KNNImputer``. Non-numeric columns, and numeric columns that are
    entirely missing, are carried over unchanged. The input frame is never
    modified.

    Args:
        data: A pandas DataFrame.
        n_neighbors: Number of neighbours handed to ``KNNImputer``.

    Returns:
        A copy of ``data`` with the imputable numeric columns filled in. If
        there is nothing to impute (no numeric column with observed values),
        an unchanged copy is returned. If ``data`` is not a DataFrame, the
        string ``"Input data must be a pandas DataFrame."`` is returned; any
        other failure is reported as ``"Error: <message>"``.
    """
    try:
        if not isinstance(data, pd.DataFrame):
            return "Input data must be a pandas DataFrame."

        data_imputed = data.copy()
        numeric_cols = data.select_dtypes(include=[np.number]).columns

        # By default KNNImputer drops columns that are entirely NaN, so its
        # output would be narrower than the selection and could not be
        # assigned back. Restrict the selection to columns it will keep.
        has_observed = data[numeric_cols].notna().any(axis=0).to_numpy(dtype=bool)
        imputable = numeric_cols[has_observed]
        if imputable.empty:
            return data_imputed

        imputer = KNNImputer(n_neighbors=n_neighbors)
        data_imputed[imputable] = imputer.fit_transform(data[imputable])

        return data_imputed
    except Exception as e:
        return f"Error: {e}"


## Feature Scaling & Normalization Best Practices:

**Task 1**: Standardization
- Step 1: Standardize features using StandardScaler.
- Step 2: Observe how standardization affects data distribution.

In [None]:
# write your code from here
from sklearn.preprocessing import StandardScaler
import pandas as pd

def standardize_features(data):
    """Standardize the numeric columns of a DataFrame with ``StandardScaler``.

    Non-numeric columns are carried over unchanged and the input frame is
    never modified.

    Args:
        data: A pandas DataFrame.

    Returns:
        A copy of ``data`` whose numeric columns hold the scaled values. If
        ``data`` has no numeric columns, an unchanged copy is returned. If
        ``data`` is not a DataFrame, the string
        ``"Input must be a pandas DataFrame."`` is returned; any other
        failure is reported as ``"Error: <message>"``.
    """
    try:
        if not isinstance(data, pd.DataFrame):
            return "Input must be a pandas DataFrame."
        data_scaled = data.copy()
        numeric_cols = data.select_dtypes(include=['number']).columns
        if numeric_cols.empty:
            # Nothing to scale; the scaler rejects a zero-feature input.
            return data_scaled
        scaler = StandardScaler()
        data_scaled[numeric_cols] = scaler.fit_transform(data[numeric_cols])
        return data_scaled
    except Exception as e:
        return f"Error: {e}"


**Task 2**: Min-Max Scaling

- Step 1: Scale features to lie between 0 and 1 using MinMaxScaler.
- Step 2: Compare with standardization.

In [None]:
# write your code from here
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

def min_max_scale_features(data):
    """Rescale the numeric columns of a DataFrame with ``MinMaxScaler``.

    The scaler is used with its default settings. Non-numeric columns are
    carried over unchanged and the input frame is never modified.

    Args:
        data: A pandas DataFrame.

    Returns:
        A copy of ``data`` whose numeric columns hold the scaled values. If
        ``data`` has no numeric columns, an unchanged copy is returned. If
        ``data`` is not a DataFrame, the string
        ``"Input must be a pandas DataFrame."`` is returned; any other
        failure is reported as ``"Error: <message>"``.
    """
    try:
        if not isinstance(data, pd.DataFrame):
            return "Input must be a pandas DataFrame."
        data_scaled = data.copy()
        numeric_cols = data.select_dtypes(include=['number']).columns
        if numeric_cols.empty:
            # Nothing to scale; the scaler rejects a zero-feature input.
            return data_scaled
        scaler = MinMaxScaler()
        data_scaled[numeric_cols] = scaler.fit_transform(data[numeric_cols])
        return data_scaled
    except Exception as e:
        return f"Error: {e}"


**Task 3**: Robust Scaling
- Step 1: Scale features using RobustScaler, which is useful for data with outliers.
- Step 2: Assess changes in data scaling compared to other scaling methods.

In [None]:
# write your code from here
from sklearn.preprocessing import RobustScaler
import pandas as pd

def robust_scale_features(data):
    """Rescale the numeric columns of a DataFrame with ``RobustScaler``.

    The scaler is used with its default settings. Non-numeric columns are
    carried over unchanged and the input frame is never modified.

    Args:
        data: A pandas DataFrame.

    Returns:
        A copy of ``data`` whose numeric columns hold the scaled values. If
        ``data`` has no numeric columns, an unchanged copy is returned. If
        ``data`` is not a DataFrame, the string
        ``"Input must be a pandas DataFrame."`` is returned; any other
        failure is reported as ``"Error: <message>"``.
    """
    try:
        if not isinstance(data, pd.DataFrame):
            return "Input must be a pandas DataFrame."
        data_scaled = data.copy()
        numeric_cols = data.select_dtypes(include=['number']).columns
        if numeric_cols.empty:
            # Nothing to scale; the scaler rejects a zero-feature input.
            return data_scaled
        scaler = RobustScaler()
        data_scaled[numeric_cols] = scaler.fit_transform(data[numeric_cols])
        return data_scaled
    except Exception as e:
        return f"Error: {e}"


## Feature Selection Techniques:
### Removing Highly Correlated Features:

**Task 1**: Correlation Matrix
- Step 1: Compute correlation matrix.
- Step 2: For each pair of features whose absolute correlation exceeds 0.9, remove one of the two features.

In [None]:
# write your code from here
import pandas as pd
import numpy as np

def remove_highly_correlated_features(df, threshold=0.9):
    """Drop one feature from every pair of highly correlated features.

    Correlations are computed on the numeric (and boolean) columns only;
    all other columns are never candidates for removal and are returned
    untouched. For each pair whose absolute correlation exceeds
    ``threshold``, the column that appears later in the frame is dropped.

    Args:
        df: A pandas DataFrame.
        threshold: Absolute-correlation cut-off in the range [0, 1].

    Returns:
        A new DataFrame without the dropped columns. If ``df`` is not a
        DataFrame or ``threshold`` is out of range, a descriptive string is
        returned instead; any other failure is reported as
        ``"Error: <message>"``.
    """
    try:
        if not isinstance(df, pd.DataFrame):
            return "Input must be a pandas DataFrame."
        if not (0 <= threshold <= 1):
            return "Threshold must be between 0 and 1."

        # Recent pandas versions raise when asked to correlate string
        # columns, so restrict the computation to numeric data up front.
        numeric = df.select_dtypes(include=['number', 'bool'])
        corr_matrix = numeric.corr().abs()

        # Keep only the strict upper triangle so that each pair is examined
        # once and a column is never compared with itself.
        above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
        upper = corr_matrix.where(above_diagonal)

        too_correlated = (upper > threshold).any(axis=0).to_numpy(dtype=bool)
        to_drop = upper.columns[too_correlated].tolist()
        return df.drop(columns=to_drop)
    except Exception as e:
        return f"Error: {e}"


### Using Mutual Information & Variance Thresholds:

**Task 2**: Mutual Information
- Step 1: Compute mutual information between features and target.
- Step 2: Retain features with high mutual information scores.

In [None]:
# write your code from here
import pandas as pd
from sklearn.feature_selection import mutual_info_classif

def select_features_by_mutual_info(X, y, threshold=0.1):
    """List the features whose mutual information with ``y`` is high enough.

    Scores come from ``mutual_info_classif`` called with
    ``discrete_features='auto'`` and ``random_state=0``.

    Args:
        X: Feature matrix as a pandas DataFrame.
        y: Target values; must have as many entries as ``X`` has rows.
        threshold: Minimum score (inclusive) a feature needs to be kept.

    Returns:
        A list with the names of the retained columns. If ``X`` is not a
        DataFrame or the lengths differ, a descriptive string is returned
        instead; any other failure is reported as ``"Error: <message>"``.
    """
    try:
        if not isinstance(X, pd.DataFrame):
            return "X must be a pandas DataFrame."

        n_rows, n_targets = len(X), len(y)
        if n_rows != n_targets:
            return "Feature matrix and target vector length mismatch."

        scores = mutual_info_classif(
            X, y, discrete_features='auto', random_state=0
        )
        keep = scores >= threshold
        return X.columns[keep].tolist()
    except Exception as exc:
        return f"Error: {exc}"


**Task 3**: Variance Threshold
- Step 1: Implement VarianceThreshold to remove features with low variance.
- Step 2: Analyze impact on feature space.

In [None]:
# write your code from here
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

def remove_low_variance_features(X, threshold=0.0):
    """Remove features whose variance does not exceed ``threshold``.

    Selection is delegated to ``VarianceThreshold``. The result keeps the
    row index of ``X`` so it stays aligned with the original frame and with
    any target vector indexed the same way.

    Args:
        X: Feature matrix as a pandas DataFrame.
        threshold: Variance cut-off forwarded to ``VarianceThreshold``.

    Returns:
        A new DataFrame containing only the retained columns, with the same
        index as ``X``. If ``X`` is not a DataFrame, the string
        ``"X must be a pandas DataFrame."`` is returned; any other failure is
        reported as ``"Error: <message>"``.
    """
    try:
        if not isinstance(X, pd.DataFrame):
            return "X must be a pandas DataFrame."
        selector = VarianceThreshold(threshold=threshold)
        X_selected = selector.fit_transform(X)
        selected_columns = X.columns[selector.get_support()].tolist()
        # Pass the original index explicitly; otherwise the result would be
        # renumbered from zero and lose row alignment with X.
        return pd.DataFrame(X_selected, columns=selected_columns, index=X.index)
    except Exception as e:
        return f"Error: {e}"
