# Chapter 4

# 4.3.2. Techniques for handling missing data

A. Removal of incomplete records

Listwise deletion 

In [None]:
# Import necessary libraries
import pandas as pd

# Read the example data; the first CSV column is used as the row index
data = pd.read_csv('example_data/data.csv', index_col=0)
print("Original DataFrame:", data, sep="\n")

# Listwise deletion: keep only the complete cases, i.e. drop every row
# that contains at least one missing value
data_listwise_deleted = data.dropna(axis=0, how='any')

print("\nDataFrame after Listwise Deletion:", data_listwise_deleted, sep="\n")

Pairwise deletion

In [None]:
# Import necessary libraries
import pandas as pd

# Load the dataset
data = pd.read_csv('example_data/data.csv', index_col=0)
print("Original DataFrame:")
print(data)

# Calculate pairwise correlations using pairwise deletion.
# `DataFrame.corr` already excludes NA/null values pair by pair: each coefficient is
# computed from the rows in which both columns are observed, so no row is dropped entirely.
# `min_periods=1` only sets the minimum number of shared observations a pair of columns
# needs in order to get a result; pairs with fewer observations are reported as NaN.
correlation_matrix = data.corr(min_periods=1)

print("\nPairwise Correlation Matrix with Pairwise Deletion:")
print(correlation_matrix)

In [None]:
Threshold-based removal 

In [None]:
# Import necessary library
import pandas as pd

# Read the example data; the first CSV column is used as the row index
data = pd.read_csv('example_data/data.csv', index_col=0)
print("Original DataFrame:", data, data.shape, sep="\n")

# Column-wise removal: keep only the columns whose share of missing values
# does not exceed the threshold (0.2, i.e. 20%)
column_threshold = 0.2
column_missing_share = data.isnull().mean(axis=0)
df_column_threshold = data.loc[:, column_missing_share <= column_threshold]
print(
    "\nDataFrame after Column Threshold-based Removal (20% missing data threshold):",
    df_column_threshold,
    df_column_threshold.shape,
    sep="\n",
)

# Row-wise removal: keep only the rows whose share of missing values
# does not exceed the threshold (0.2, i.e. 20%)
row_threshold = 0.2
row_missing_share = data.isnull().mean(axis=1)
df_row_threshold = data[row_missing_share <= row_threshold]
print(
    "\nDataFrame after Row Threshold-based Removal (20% missing data threshold):",
    df_row_threshold,
    df_row_threshold.shape,
    sep="\n",
)


B. Single imputation

Mean/Median/Mode imputation

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.impute import SimpleImputer

# Load the dataset
data = pd.read_csv('example_data/data2.csv', index_col=0)
print("Original DataFrame:")
print(data)

# Select the numerical columns: mean and median are only defined for numbers
numerical_columns = data.select_dtypes(include=['number']).columns
print(numerical_columns)

# Mean Imputation: Replace missing values with the mean of the respective column (for numerical columns only)
df_mean_imputed = data.copy()
# Initialize SimpleImputer with mean strategy
mean_imputer = SimpleImputer(strategy='mean')
df_mean_imputed[numerical_columns] = mean_imputer.fit_transform(df_mean_imputed[numerical_columns])
print("\nDataFrame after Mean Imputation (Numerical Columns):")
print(df_mean_imputed)

# Median Imputation: Replace missing values with the median of the respective column (for numerical columns only)
df_median_imputed = data.copy()
# Initialize SimpleImputer with median strategy
median_imputer = SimpleImputer(strategy='median')
df_median_imputed[numerical_columns] = median_imputer.fit_transform(df_median_imputed[numerical_columns])
print("\nDataFrame after Median Imputation (Numerical Columns):")
print(df_median_imputed)

# Mode Imputation: Replace missing values with the mode of the respective column (useful for categorical columns)
df_mode_imputed = data.copy()
# Initialize SimpleImputer with mode strategy
mode_imputer = SimpleImputer(strategy='most_frequent')
df_mode_imputed[data.columns] = mode_imputer.fit_transform(df_mode_imputed)
# When numerical and categorical columns are imputed together, the imputer hands
# back a single object-typed array, which would leave every column with dtype
# `object`. Re-infer the dtypes so numerical columns are numerical again.
df_mode_imputed = df_mode_imputed.infer_objects()
print("\nDataFrame after Mode Imputation (Numerical and Categorical Columns):")
print(df_mode_imputed)


Last observation carried forward (LOCF)

In [None]:
# Import necessary library
import pandas as pd

# Read the example data; the first CSV column is used as the row index
data = pd.read_csv('example_data/data.csv', index_col=0)
print("Original DataFrame:", data, sep="\n")

# LOCF (Last Observation Carried Forward): on a copy of the data, forward-fill
# each column so that the last valid observation is propagated into the gaps
df_locf = data.copy().ffill()

print("\nDataFrame after LOCF Imputation:", df_locf, sep="\n")


Hot deck imputation

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Read the second example data set; the first CSV column is used as the row index
data = pd.read_csv('example_data/data2.csv', index_col=0)
print("Original DataFrame:", data, sep="\n")

# Hot deck imputation function using grouping and random sampling
def hot_deck_imputation(df, group_column, random_state=None):
    """
    Perform hot deck imputation by filling missing values within groups using random sampling.

    Each missing value is replaced by a value drawn at random (with replacement)
    from the observed values of the same column among the records that share the
    same value of `group_column` (the donors). A missing value is left untouched
    when its group has no observed donor for that column, or when the record's
    own `group_column` value is missing.

    Parameters:
    df (pd.DataFrame): Dataframe with missing values; it is not modified
    group_column (str): The column used to group similar records (e.g., 'Gender')
    random_state (int or None): Seed that makes the random draws reproducible.
        When None (default), NumPy's global random state is used.

    Returns:
    pd.DataFrame: Copy of `df` with missing values imputed wherever donors exist
    """
    # Both the `np.random` module and a `RandomState` instance expose `.choice`
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Create a copy of the dataframe to avoid modifying the original
    df_imputed = df.copy()

    # A missing group label never compares equal to anything, so it cannot
    # define a group; the group list is the same for every column
    groups = df[group_column].dropna().unique()

    for col in df.columns:
        # Skip the group column itself
        if col == group_column:
            continue

        # Skip columns that have nothing to impute
        col_missing = df[col].isnull()
        if not col_missing.any():
            continue

        # Apply imputation within each group defined by the group_column
        for group in groups:
            group_mask = df[group_column] == group
            missing_mask = col_missing & group_mask
            n_missing = int(missing_mask.sum())
            if n_missing == 0:
                continue

            # Donors: observed values of this column within the same group
            donors = df.loc[group_mask & ~col_missing, col].to_numpy()
            if donors.size == 0:
                continue

            df_imputed.loc[missing_mask, col] = rng.choice(donors, size=n_missing, replace=True)

    return df_imputed

# Impute every column by sampling donors among the records that share the
# same value in the 'groups' column
df_hot_deck_imputed = hot_deck_imputation(df=data, group_column='groups')

print("\nDataFrame after Hot Deck Imputation (Grouped by 'groups'):", df_hot_deck_imputed, sep="\n")


C. Model-based imputation

Regression imputation

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Read the example data; the first CSV column is used as the row index
data = pd.read_csv('example_data/data.csv', index_col=0)
print("Original DataFrame:", data, sep="\n")

# Function for regression imputation
def regression_imputation(df, target_column):
    """
    Perform regression imputation on a column with missing values.

    A linear regression is fitted on the rows where `target_column` is observed,
    using every other column as a predictor, and its predictions replace the
    missing entries. Missing predictor values are replaced by 0 before fitting
    and predicting.

    Parameters:
    df (pd.DataFrame): The input dataframe with missing values. The imputed
        values are written into `df[target_column]` in place.
    target_column (str): The name of the column to perform regression imputation on.

    Returns:
    pd.Series: Column with missing values filled using regression imputation.

    Raises:
    ValueError: If `target_column` has no observed value to train the model on.
    """
    missing_mask = df[target_column].isnull()

    # Nothing to impute: return the column as is instead of asking the model
    # to predict on zero rows, which scikit-learn rejects with an error
    if not missing_mask.any():
        return df[target_column]

    if missing_mask.all():
        raise ValueError(
            f"Cannot impute '{target_column}': the column has no observed values to train on."
        )

    # Define the features (all columns except the target column).
    # NOTE: missing predictor values are crudely replaced by 0 here.
    features = df.drop(columns=[target_column]).fillna(0)
    X_train = features[~missing_mask]
    y_train = df.loc[~missing_mask, target_column]
    X_missing = features[missing_mask]

    # Train the model using the observed rows only
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)

    # Fill missing values with the predicted values
    df.loc[missing_mask, target_column] = regressor.predict(X_missing)

    return df[target_column]

# Impute the listed columns one after the other on a copy of the data, so that
# each model can use the columns already imputed before it
df = data.copy()
for column_name in ('feature1', 'feature3', 'feature4', 'feature5'):
    df[column_name] = regression_imputation(df, column_name)

print("\nDataFrame after Regression Imputation:", df, sep="\n")


Stochastic regression imputation

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Read the example data; the first CSV column is used as the row index
data = pd.read_csv('example_data/data.csv', index_col=0)
print("Original DataFrame:", data, sep="\n")

# Function for stochastic regression imputation
def stochastic_regression_imputation(df, target_column, random_state=None):
    """
    Perform stochastic regression imputation on a column with missing values.

    A linear regression is fitted on the rows where `target_column` is observed,
    using every other column as a predictor. Each missing entry is replaced by
    the model's prediction plus Gaussian noise whose standard deviation is the
    standard deviation of the training residuals. Missing predictor values are
    replaced by 0 before fitting and predicting.

    Parameters:
    df (pd.DataFrame): The input dataframe with missing values. The imputed
        values are written into `df[target_column]` in place.
    target_column (str): The name of the column to perform stochastic regression imputation on.
    random_state (int or None): Seed that makes the added noise reproducible.
        When None (default), NumPy's global random state is used.

    Returns:
    pd.Series: Column with missing values filled using stochastic regression imputation.

    Raises:
    ValueError: If `target_column` has no observed value to train the model on.
    """
    missing_mask = df[target_column].isnull()

    # Nothing to impute: return the column as is instead of asking the model
    # to predict on zero rows, which scikit-learn rejects with an error
    if not missing_mask.any():
        return df[target_column]

    if missing_mask.all():
        raise ValueError(
            f"Cannot impute '{target_column}': the column has no observed values to train on."
        )

    # Both the `np.random` module and a `RandomState` instance expose `.normal`
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Define the features (all columns except the target column).
    # NOTE: missing predictor values are crudely replaced by 0 here.
    features = df.drop(columns=[target_column]).fillna(0)
    X_train = features[~missing_mask]
    y_train = df.loc[~missing_mask, target_column]
    X_missing = features[missing_mask]

    # Train the model using the observed rows only
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    predicted_values = regressor.predict(X_missing)

    # Spread of the residuals (observed minus fitted) on the training rows
    residuals = y_train - regressor.predict(X_train)
    residuals_std = np.std(residuals)

    # Add random noise so the imputed values do not all lie exactly on the regression fit
    stochastic_predictions = predicted_values + rng.normal(0, residuals_std, size=predicted_values.shape)

    # Fill missing values with the stochastic predictions
    df.loc[missing_mask, target_column] = stochastic_predictions

    return df[target_column]

# Impute the listed columns one after the other on a copy of the data, so that
# each model can use the columns already imputed before it
df = data.copy()
for column_name in ('feature1', 'feature3', 'feature4', 'feature5'):
    df[column_name] = stochastic_regression_imputation(df, column_name)

print("\nDataFrame after Stochastic Regression Imputation:", df, sep="\n")


Multiple imputation using Scikit-learn's IterativeImputer.

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Load the dataset
data = pd.read_csv('example_data/data.csv', index_col=0)
print("Original DataFrame:")
print(data)

# Single imputation: with its default settings IterativeImputer returns one
# completed dataset (the predicted value for every missing entry)
imputer = IterativeImputer()
imputed = imputer.fit_transform(data)
print("\nDataFrame after Iterative Imputation:")
df = pd.DataFrame(imputed, index=data.index, columns=data.columns)
print(df)

# Multiple imputation: draw the imputed values from the predictive posterior
# (sample_posterior=True) and repeat with different seeds, so that every run
# yields a different plausible completed dataset. An analysis is then run on
# each dataset and the results are pooled.
n_imputations = 5
multiple_imputations = []
for seed in range(n_imputations):
    posterior_imputer = IterativeImputer(sample_posterior=True, random_state=seed)
    completed = posterior_imputer.fit_transform(data)
    multiple_imputations.append(pd.DataFrame(completed, index=data.index, columns=data.columns))

for number, completed_df in enumerate(multiple_imputations, start=1):
    print(f"\nImputed dataset {number} of {n_imputations}:")
    print(completed_df)

Multiple Imputation by Chained Equations (MICE) implementation from Statsmodels

In [None]:
# Import necessary libraries
import pandas as pd
from statsmodels.imputation import mice
import statsmodels.api as sm

# Load the dataset
data = pd.read_csv('example_data/data.csv', index_col=0)
print("Original DataFrame:")
print(data)

# Wrap the data set in a MICEData object; every MICE run below works on this one object
mice_data = mice.MICEData(data)
# Collect the names of the columns that contain at least one missing value
cols_with_missing = data.columns[data.isnull().any()].tolist()

# For each column with missing data, build a formula that regresses it on all
# the other columns and fit that OLS model with MICE
for column in cols_with_missing:
    other_columns = list(data.columns.drop(column))  # All columns except the current one
    formula = f"{column} ~ " + " + ".join(other_columns)
    mi_model = mice.MICE(formula, sm.OLS, mice_data)
    mi_results = mi_model.fit(10, 10)  # fit(n_burnin, n_imputations): 10 burn-in cycles, then 10 imputations
    print(mi_results.summary())

# All runs above share `mice_data`, so this is its data as left by the last run
imputed_data = mice_data.data
print("\nDataFrame after MICE Imputation:")
print(imputed_data)

K-Nearest Neighbors (KNN) imputation

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.impute import KNNImputer

# Read the example data; the first CSV column is used as the row index
data = pd.read_csv('example_data/data.csv', index_col=0)
print("Original DataFrame:", data, sep="\n")

# Impute with a KNNImputer left at its default settings, then restore the
# original row and column labels on the returned array
imputer = KNNImputer()
imputed = imputer.fit_transform(data)
df = pd.DataFrame(data=imputed, columns=data.columns, index=data.index)
print("\nDataFrame after KNN Imputation:", df, sep="\n")

Random forest imputation

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Read the example data; the first CSV column is used as the row index
data = pd.read_csv('example_data/data.csv', index_col=0)
print("Original DataFrame:", data, sep="\n")

# Iterative imputation in which a random forest regressor is used as the
# estimator, then restore the original row and column labels on the returned array
imputer = IterativeImputer(estimator=RandomForestRegressor())
imputed = imputer.fit_transform(data)

df = pd.DataFrame(data=imputed, columns=data.columns, index=data.index)
print("\nDataFrame after Random Forest Imputation:", df, sep="\n")