In [1]:
# Question: Advanced Data Cleaning with Multiple Issues
# Objective: Handle multiple issues in one dataset, including missing values, duplicates, and outliers.
# Description: Given a dataset with various data quality issues, employ multiple data cleaning techniques.
import pandas as pd
import numpy as np

def clean_dataset(df):
    """Return a cleaned copy of ``df``; the input frame is never modified.

    The cleaning steps run in this order:

    1. Missing values: numeric columns are filled with the column median,
       every other column with its most frequent value (``'Unknown'`` when
       the column has no non-missing value to take a mode from).
    2. Exact duplicate rows are dropped (the first occurrence is kept and
       the original index labels are retained).
    3. Outliers: in every numeric column, values outside
       ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` are replaced by the column median.

    Parameters
    ----------
    df : pandas.DataFrame
        The raw data.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    df = df.copy()

    # 1. Handle missing values.
    for col in df.columns:
        series = df[col]
        # Cover every numeric dtype (int32, float32, nullable ints, ...), not
        # only int64/float64. Booleans are treated as categorical.
        if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
            fill_value = series.median()
            if pd.isna(fill_value):
                # Column has no observed values, so there is nothing to fill with.
                continue
        else:
            modes = series.mode()
            fill_value = modes.iloc[0] if not modes.empty else 'Unknown'
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form operates on a temporary and is a no-op
        # under pandas Copy-on-Write.
        df[col] = series.fillna(fill_value)

    # 2. Remove duplicate rows (in place is safe: `df` is our private copy).
    df.drop_duplicates(inplace=True)

    # 3. Handle outliers using the IQR rule, vectorised per column.
    for col in df.select_dtypes(include=[np.number]).columns:
        series = df[col]
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        # Comparisons with NaN are False, so missing values are left alone.
        is_outlier = (series < lower_bound) | (series > upper_bound)
        df[col] = series.mask(is_outlier, series.median())

    return df




In [2]:
# Question: Data Transformation Techniques
# Objective: Transform skewed data using log transformation.
# Description: Perform a log transformation to handle skewness in a dataset, which is particularly useful for
# certain machine learning models.

import pandas as pd
import numpy as np

def log_transform(df, columns):
    """Apply a natural-log transform to the given columns of a copy of ``df``.

    A column whose minimum is zero or negative is first shifted by
    ``abs(min) + 1`` so that its smallest value becomes 1 before the log is
    taken; strictly positive columns are logged as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    columns : iterable
        Labels of the columns to transform.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the selected columns replaced by their logs.
    """
    transformed = df.copy()
    for name in columns:
        values = transformed[name]
        lowest = values.min()
        if lowest <= 0:
            # Shift so log never sees zero or a negative number.
            values = values + abs(lowest) + 1
        transformed[name] = np.log(values)
    return transformed


In [3]:
# Question: Feature Engineering by Creating New Features
# Objective: Create a new feature based on existing features to add predictive power.
# Description: Generate additional features from existing data to potentially improve the performance of
# prediction models.


import pandas as pd

def create_new_feature(df):
    """Return a copy of ``df`` with two derived income/expense features.

    When both an ``'Income'`` and an ``'Expenses'`` column are present, the
    copy gains:

    * ``'income_to_expense_ratio'`` - ``Income / (Expenses + 1e-5)``; the small
      constant keeps the denominator non-zero when expenses are exactly 0.
    * ``'net_savings'`` - ``Income - Expenses``.

    If either column is missing, an unchanged copy is returned.
    """
    enriched = df.copy()

    # Both features need the same two inputs, so check for them once.
    if 'Income' not in enriched.columns or 'Expenses' not in enriched.columns:
        return enriched

    income = enriched['Income']
    expenses = enriched['Expenses']

    enriched['income_to_expense_ratio'] = income / (expenses + 1e-5)
    enriched['net_savings'] = income - expenses
    return enriched


In [4]:
# Question: Handling Complex Outliers with Z-Scores
# Objective: Detect and handle outliers using Z-score method.
# Description: Use the Z-score method to identify outliers which significantly differ from the rest of the data points.

import pandas as pd
import numpy as np

def remove_outliers_zscore(df, threshold=3):
    """Drop rows containing a numeric value whose |z-score| exceeds ``threshold``.

    Z-scores for every numeric column are computed from the *input* data
    (column mean and sample standard deviation), and a row is removed if any
    of its numeric values is an outlier. Because all statistics come from the
    same unfiltered frame, the result does not depend on column order.

    Columns whose standard deviation is zero or undefined (constant column,
    fewer than two observations) cannot contain z-score outliers and are
    skipped. Missing values are not treated as outliers, so rows are never
    dropped merely for containing NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    threshold : float, default 3
        Maximum absolute z-score a value may have and still be kept.

    Returns
    -------
    pandas.DataFrame
        A new frame with the outlier rows removed; index labels of the
        surviving rows are preserved.
    """
    numeric = df.select_dtypes(include=[np.number])
    # Positional boolean mask: immune to duplicate index labels.
    keep = np.ones(len(df), dtype=bool)

    for _, values in numeric.items():
        std = values.std()
        if not std > 0:
            # std is 0 or NaN: dividing by it would turn every z-score into
            # NaN/inf and wrongly discard the whole frame.
            continue
        z_scores = (values - values.mean()) / std
        within = (z_scores.abs() <= threshold) | values.isna()
        keep &= within.to_numpy(dtype=bool)

    return df.loc[keep].copy()



In [5]:
# Question: Data Imputation with K-Nearest Neighbors (KNN)
# Objective: Impute missing numerical values using the KNN method.
# Description: Use the K-nearest neighbors algorithm to fill in missing values, which considers the values of
# nearest neighbors for imputation.

import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

def knn_impute_missing_values(df, n_neighbors=5):
    """Fill missing numeric values with scikit-learn's ``KNNImputer``.

    Only numeric columns that contain at least one observed value are
    imputed. Non-numeric columns, and numeric columns that are entirely
    missing, are returned unchanged. The original column order and index are
    preserved; imputed columns come back as floats.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    n_neighbors : int, default 5
        Number of neighbours passed to ``KNNImputer``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the imputable numeric columns filled in.
    """
    result = df.copy()
    numeric_df = df.select_dtypes(include=[np.number])

    # KNNImputer drops features with no observed values by default, which
    # would make the output narrower than the column list. Leave such columns
    # out of the imputation instead.
    has_observed = numeric_df.notna().any(axis=0).to_numpy()
    imputable_cols = numeric_df.columns[has_observed]
    if len(imputable_cols) == 0:
        # Nothing to impute (no numeric data, or no rows): the imputer would
        # raise on an empty feature matrix.
        return result

    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputed_data = imputer.fit_transform(numeric_df[imputable_cols])

    # Write each imputed column back in place so the column order is kept.
    for position, col in enumerate(imputable_cols):
        result[col] = imputed_data[:, position]

    return result

