<a href="https://colab.research.google.com/github/khosh90/prospect/blob/main/analysis_multiple_imputation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Functions

Installations and functions


In [6]:
# Mount Google Drive into the Colab runtime and switch the working directory
# to the project folder, so that local modules and data files in that folder
# can be found by the cells below.
# NOTE(review): the path is hard-coded to one user's Drive layout — adjust it
# when running this notebook from a different account.
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/MyDrive/Colab Notebooks/Mini_DIVA'

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/Mini_DIVA


In [7]:
# Install statsmodels into the runtime (unpinned version).
# NOTE(review): statsmodels is not imported in the cells visible here — confirm
# it is still required, and consider pinning a version for reproducibility.
!pip install statsmodels



In [8]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from utils import *
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import sklearn
import tensorflow as tf
print("Tensorflow version is: ", tf.__version__)
import seaborn as sns
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from sklearn.preprocessing import StandardScaler
from scipy import stats
from scipy.stats import f
from scipy import stats



Tensorflow version is:  2.15.0


In [9]:
def encode_categorical(df, categorical_cols):
    """
    Label-encode the given columns on a copy of ``df``.

    Parameters:
    - df: DataFrame to encode; it is copied, so the caller's frame is untouched.
    - categorical_cols: Iterable of column names to replace with integer codes.

    Returns:
    - (encoded, mappings): the encoded copy, and a dict
      ``{column: {code: original_category}}`` built from each encoder's
      ``classes_`` so the codes can be translated back later.
    """
    encoded = df.copy()
    mappings = {}

    for name in categorical_cols:
        # One independent encoder per column.
        label_encoder = LabelEncoder()
        encoded[name] = label_encoder.fit_transform(encoded[name])
        # Position in classes_ is the integer code assigned to that category.
        mappings[name] = dict(enumerate(label_encoder.classes_))

    return encoded, mappings


In [10]:
def numerical_backto_categorical(df, encoding_mappings):
    """
    Translate integer-coded columns back to their original category labels.

    Parameters:
    - df: DataFrame whose encoded columns should be decoded. It is modified
          in place (and also returned).
    - encoding_mappings: Dict ``{column: {code: category}}``, e.g. the second
          value returned by ``encode_categorical``. Columns that are not keys
          of this dict are left untouched.

    Returns:
    - df: The same DataFrame object, with mapped columns decoded.

    Notes:
    - Each value is truncated with ``int()`` before the lookup, so an imputed
      value such as 0.49 decodes as code 0.
    - Values that cannot be converted to an integer (NaN, None, +/-inf,
      non-numeric strings) and codes missing from the mapping are kept as-is
      instead of raising.
    """
    def _decode(value, mapping):
        # int() raises on NaN/None/inf/non-numeric text; keep such cells unchanged.
        try:
            code = int(value)
        except (TypeError, ValueError, OverflowError):
            return value
        return mapping.get(code, value)

    for col_name in df.columns:
        if col_name in encoding_mappings:
            mapping = encoding_mappings[col_name]
            # Bind the mapping as a default argument so the lambda does not
            # depend on the loop variable's later value.
            df[col_name] = df[col_name].apply(lambda x, mapping=mapping: _decode(x, mapping))
    return df

In [11]:
def no_impute_variables(df, threshold_identifier=0.9, threshold_cardinality=0.05):
    """
    Identify columns that may not be suitable for imputation.

    Parameters:
    - df: DataFrame representing the dataset.
    - threshold_identifier: A column whose unique-value ratio exceeds this is
      reported as an identifier column (default is 0.9).
    - threshold_cardinality: A column whose unique-value ratio exceeds this is
      reported as a high cardinality column (default is 0.05).

    Returns:
    - result: Dictionary with the keys 'Identifier Columns', 'Datetime Columns',
              'High Cardinality Columns' and 'Free Text Columns', each mapping
              to a list of column names in DataFrame order.

    Notes:
    - The checks are independent, so one column can appear in several lists.
      Both ratio checks use the same unique/row ratio and apply to every
      column regardless of dtype.
    - Every object-dtype column is reported under 'Free Text Columns'.
    - For a DataFrame with zero rows the ratio is treated as 0.0, so no column
      is reported by the ratio-based checks (instead of dividing by zero).
    """
    identifier_columns = []
    datetime_columns = []
    high_cardinality_columns = []
    free_text_columns = []

    n_rows = len(df)

    for col in df.columns:
        column = df[col]

        # Share of distinct (non-null) values; computed once and reused by
        # both ratio-based checks. Guard against an empty frame.
        unique_ratio = column.nunique() / n_rows if n_rows else 0.0

        # Check for Identifier Columns
        if unique_ratio > threshold_identifier:
            identifier_columns.append(col)

        # Check for Datetime Columns
        if pd.api.types.is_datetime64_any_dtype(column):
            datetime_columns.append(col)

        # Check for High Cardinality Columns
        if unique_ratio > threshold_cardinality:
            high_cardinality_columns.append(col)

        # Check for Free Text Columns
        if column.dtype == 'O':
            free_text_columns.append(col)

    return {
        "Identifier Columns": identifier_columns,
        "Datetime Columns": datetime_columns,
        "High Cardinality Columns": high_cardinality_columns,
        "Free Text Columns": free_text_columns
    }

# Assuming 'credit' is your DataFrame
#result = no_impute_variables(credit)

# Display identified columns
#for category, columns in result.items():
#    print(f"{category}: {columns}")


In [12]:
def numerical_rubin(selected_dataframes, columns_of_interest, m=10, alpha=0.05):
    """
    Pool column means across multiply imputed datasets using Rubin's Rules.

    For every column in ``columns_of_interest`` the estimand is the column
    mean. With Q_i the mean in imputed dataset i and U_i its sampling variance
    (sample variance / number of observations):

        pooled estimate   Q_bar = mean_i(Q_i)
        within variance   U_bar = mean_i(U_i)
        between variance  B     = sum_i (Q_i - Q_bar)^2 / (m - 1)
        total variance    T     = U_bar + (1 + 1/m) * B
        degrees of freedom      = (m - 1) / lambda^2,  lambda = (1 + 1/m) * B / T

    Parameters:
    - selected_dataframes: List of DataFrames representing imputed datasets.
    - columns_of_interest: List of numerical column names to pool. Only these
      columns are read, so the frames may also contain non-numeric columns.
    - m: Number of imputed datasets. Kept for backward compatibility; if it
      differs from ``len(selected_dataframes)`` a warning is issued and the
      actual number of datasets is used, because the formulas require it.
    - alpha: Significance level for confidence intervals (default is 0.05).

    Returns:
    - results_summary: DataFrame with one row per variable and the columns
      'Variable', 'Pooled Estimate', 'SE', 't-statistic', 'old df', 'p-value'
      and 'CI' (a (lower, upper) tuple rounded to 2 decimals). The t-statistic
      and p-value test the null hypothesis that the mean is zero.

    Raises:
    - ValueError: if ``selected_dataframes`` is empty.

    Notes:
    - When the between-imputation variance is zero (identical estimates, or a
      single dataset) the degrees of freedom are infinite and the normal
      distribution is used for the p-value and the interval.
    """
    import warnings

    columns = list(columns_of_interest)
    n_imputations = len(selected_dataframes)
    if n_imputations == 0:
        raise ValueError("selected_dataframes must contain at least one imputed dataset.")
    if m != n_imputations:
        warnings.warn(
            f"m={m} does not match the {n_imputations} datasets supplied; "
            f"using m={n_imputations}.",
            stacklevel=2,
        )
        m = n_imputations

    # Step 1: Point estimate (column mean) per imputed dataset -> shape (m, k)
    point_estimates = np.array(
        [frame[columns].mean().to_numpy(dtype=float) for frame in selected_dataframes]
    )

    # Step 2: Sampling variance of each mean per imputed dataset -> shape (m, k)
    within_variances = np.array(
        [
            (frame[columns].var(ddof=1) / frame[columns].count()).to_numpy(dtype=float)
            for frame in selected_dataframes
        ]
    )

    # Step 3: Pooled point estimate and pooled within-imputation variance
    pooled_point_estimate = point_estimates.mean(axis=0)
    pooled_within_var = within_variances.mean(axis=0)

    # Step 4: Between-imputation variance of the point estimates, per variable.
    # It cannot be estimated from a single dataset, so it is taken as zero then.
    if m > 1:
        pooled_between_var = point_estimates.var(axis=0, ddof=1)
    else:
        pooled_between_var = np.zeros_like(pooled_point_estimate)

    # Step 5: Total variance and standard error
    between_component = (1 + 1 / m) * pooled_between_var
    total_variance = pooled_within_var + between_component
    se = np.sqrt(total_variance)

    # Step 6: Degrees of freedom and t-statistic.
    # Divisions by zero (no between variance / zero SE) are expected edge
    # cases and are mapped to inf/nan rather than emitting warnings.
    with np.errstate(divide="ignore", invalid="ignore"):
        df_lambda = between_component / total_variance
        old_df = np.where(df_lambda > 0, (m - 1) / df_lambda ** 2, np.inf)
        t_statistic = pooled_point_estimate / se

    # Step 7: p-value and critical value; fall back to the normal distribution
    # where the degrees of freedom are infinite.
    finite_df = np.isfinite(old_df)
    safe_df = np.where(finite_df, old_df, 1.0)  # placeholder where masked out
    abs_t = np.abs(t_statistic)
    p_value = np.where(
        finite_df,
        2 * stats.t.sf(abs_t, df=safe_df),
        2 * stats.norm.sf(abs_t),
    )
    t_critical = np.where(
        finite_df,
        stats.t.ppf(1 - alpha / 2, df=safe_df),
        stats.norm.ppf(1 - alpha / 2),
    )

    # Step 8: (1 - alpha) confidence interval
    margin_of_error = t_critical * se
    confidence_interval_lower = np.round(pooled_point_estimate - margin_of_error, 2)
    confidence_interval_upper = np.round(pooled_point_estimate + margin_of_error, 2)

    # Step 9: Summarize Results
    results_summary = pd.DataFrame({
        'Variable': columns,
        'Pooled Estimate': pooled_point_estimate,
        'SE': se,
        't-statistic': t_statistic,
        'old df': old_df,
        'p-value': p_value,
        'CI': list(zip(confidence_interval_lower, confidence_interval_upper))
    })
    return results_summary

In [13]:
def categorical_pooling(imputations, categorical_vars, m ):
    """
    Compute a pooled multivariate Wald-type statistic (D_1) for categorical
    variables across multiple imputed datasets, print it, and return it.

    Parameters:
    - imputations: List of DataFrames representing imputed datasets; only the
      first ``m`` entries are read.
    - categorical_vars: List of categorical column names to include.
    - m: Number of imputed datasets to use (must not exceed len(imputations)).

    Returns:
    - Dictionary with the keys 'D_1' (test statistic), 'v_1' (denominator
      degrees of freedom) and 'p_value' (upper tail of F(k, v_1), where k is
      the number of variables).

    Side effects:
    - Prints the statistic, the degrees of freedom and the p-value.

    NOTE(review): several steps below differ from the usual D_1 pooling
    procedure; they are flagged inline and should be confirmed against the
    intended reference before the output is interpreted.
    """
    # Step 1: Calculate Within-Imputation Variance for each categorical variable
    # For each variable and dataset this is p * (1 - p) with p = sum / n.
    # NOTE(review): this is a Bernoulli variance only for 0/1-coded columns;
    # for label-encoded columns with codes above 1, p can exceed 1 and the
    # product turns negative — confirm the expected coding of the inputs.
    within_imputation_var_by_variable = []
    for variable in categorical_vars:
        within_imputation_var_for_variable = [
            (imputations[i][variable].sum() / len(imputations[i][variable])) * (1 - (imputations[i][variable].sum() / len(imputations[i][variable])))
            for i in range(m)
        ]
        within_imputation_var_by_variable.append(within_imputation_var_for_variable)

    # Step 2: Calculate Between-Imputation Variance for each categorical variable
    # Variance of the per-dataset column means.
    # NOTE(review): np.var is called with its default ddof (population
    # variance); a between-imputation variance normally divides by m - 1.
    between_imputation_var_by_variable = [
        np.var([imputations[i][variable].mean() for i in range(m)])
        for variable in categorical_vars
    ]

    # Step 3: Calculate Pooled Within-Imputation Variance for each categorical variable
    # Average over the m datasets -> one value per variable.
    pooled_within_var_by_variable = np.mean(within_imputation_var_by_variable, axis=1)

    # Step 4: Calculate Pooled Between-Imputation Variance for all categorical variables
    # A single scalar: the average over variables.
    pooled_between_var = np.mean(between_imputation_var_by_variable)

    # Calculate r_1
    V_B = pooled_between_var
    V_W = pooled_within_var_by_variable
    k = len(V_W)

    # Multiply the scalar to each element of V_W
    # NOTE(review): this multiplies V_B by V_W; a relative increase in
    # variance is normally built from the ratio V_B / V_W — confirm.
    scaled_V_W = (1 + 1/m) * V_B * V_W

    # Calculate r_1 using the sum of scaled_V_W
    r_1 = np.sum(scaled_V_W) / k

    # Calculate total variation (V_T)
    V_T = (1 + r_1) * pooled_within_var_by_variable

    # Assume null hypothesis (all parameters are zero)
    theta_0 = np.zeros(k)

    # Calculate multivariate Wald statistic (D_1)
    # NOTE(review): theta_bar is the scalar mean of the pooled within
    # variances, broadcast against theta_0; presumably it should be the
    # vector of pooled parameter estimates — confirm.
    theta_bar = np.mean(pooled_within_var_by_variable)
    V_T_matrix = np.diag(V_T)
    D_1 = (1/k) * np.dot((theta_bar - theta_0).T, np.linalg.inv(V_T_matrix))
    D_1 = np.dot(D_1, (theta_bar - theta_0))

    # Degrees of freedom
    # NOTE(review): 1/t divides by zero when m == 1, and 1/r_1 when r_1 == 0.
    t = k * (m - 1)
    t_inv = 1/t
    k_inv = 1/k
    r_1_inv = 1/r_1
    if t >= 4:
      v_1 = 4 + (t - 4) * (1 + (1 - 2 * t_inv) * r_1_inv)**2
    else:
      v_1 = t * (1 + k_inv) * (1 + r_1_inv)**2 / 2
    # Calculate p-value
    p_value = 1 - f.cdf(D_1, k, v_1)

    # Display results
    print("Multivariate Wald Test Results:")
    #print(f"Pooled Within-Imputation Variance: {pooled_within_var_by_variable}")
    #print(f"Pooled Between-Imputation Variance: {pooled_between_var}")
    #print(f"Relative Increase in Variance (r_1): {r_1}")
    #print(f"Pooled Total Variance (V_T): {V_T}")
    print(f"Multivariate Wald Statistic (D_1): {D_1}")
    print(f"Degrees of Freedom (v_1): {v_1}")
    print(f"P-Value: {p_value}")

    return {'D_1': D_1, 'v_1': v_1, 'p_value': p_value}

# Example usage
# Assuming 'imputations' is a list of dataframes and 'categorical_vars' is a list of categorical variable names
#categorical_pooling(imputations, categorical_vars, m=3)

In [14]:
def evaluate_imputed_data(actual_data, imputed_data, confidence_interval=1.96):
    """
    Score an imputed dataset against the original (ground-truth) dataset.

    Parameters:
    - actual_data: DataFrame holding the true values.
    - imputed_data: DataFrame holding the imputed values, same shape as actual_data.
    - confidence_interval: Z-score multiplier for the interval half-width
      (default is 1.96 for a 95% confidence interval).

    Returns:
    - Dictionary with a single key "Results" mapping to a dictionary of
      metrics: 'Raw Bias', 'Coverage Rate', 'Average Width',
      'Root Mean Squared Error', 'Accuracy', 'Mean Squared Error', 'R-squared'.
      All metrics are aggregated over every cell of the arrays.
    """
    truth = actual_data.to_numpy()
    estimate = imputed_data.to_numpy()
    n_rows = estimate.shape[0]

    # Per-column standard error of the imputed values.
    std_error = np.std(estimate, axis=0) / np.sqrt(n_rows)

    # Interval of +/- z * SE placed around every individual imputed value.
    half_width = confidence_interval * std_error
    lower_bound = estimate - half_width
    upper_bound = estimate + half_width
    inside_interval = (truth >= lower_bound) & (truth <= upper_bound)

    # Error terms shared by bias, MSE and RMSE.
    residuals = estimate - truth
    mse = np.mean(residuals ** 2)

    metrics = {
        'Raw Bias': np.mean(residuals),
        'Coverage Rate': np.mean(inside_interval),
        'Average Width': np.mean(upper_bound - lower_bound),
        'Root Mean Squared Error': np.sqrt(mse),
        # Share of cells whose rounded imputed value equals the true value.
        'Accuracy': np.mean(np.round(estimate) == truth),
        'Mean Squared Error': mse,
        'R-squared': 1 - mse / np.var(truth),
    }

    return {"Results": metrics}

# Implementation


In [16]:
# Load the "credit" dataset through read_dataset (presumably provided by the
# star import from utils — verify there).
# NOTE(review): the exact contents of data / Xinds / yinds are defined by
# read_dataset and are not visible in this notebook; X is used below as the
# feature table.
data, X, y, Xinds, yinds = read_dataset(dataset="credit")

In [18]:
# Split the feature names of X into two lists; the names suggest numerical and
# categorical columns (get_num_cat_vars is presumably a utils helper — verify).
credit_num, credit_cat = get_num_cat_vars(X)
# Label-encode the categorical columns on a copy of X and keep the
# code -> category mappings so the codes can be decoded again later.
credit_encoded, credit_mappings = encode_categorical(X, credit_cat)


We can generate imputed datasets with different models. Here a random forest is used as the imputation estimator:

In [19]:
# Assuming credit_encoded is your original dataset
# Mask a fraction of the encoded data so the imputations can later be compared
# with the known true values.
# NOTE(review): `data` is rebound here and no longer refers to the object
# returned by read_dataset above.
fraction = 0.25
data, data_ind = set_fraction_missing(credit_encoded, fraction=fraction, random_state=42)

# Number of imputed datasets to generate.
n_imputations = 3
base_seed = 42

# Initialize an empty list to store the imputed datasets
imputed_datasets = []

# Build a fresh imputer per imputation and seed both the imputer and the
# random forest. An unseeded RandomForestRegressor makes every run of this
# cell produce different imputations; distinct seeds keep the datasets
# different from each other while making them reproducible.
for imputation_idx in range(n_imputations):
    seed = base_seed + imputation_idx
    rf_imputer = IterativeImputer(
        estimator=RandomForestRegressor(random_state=seed),
        random_state=seed,
    )
    imputed_data = pd.DataFrame(rf_imputer.fit_transform(data), columns=credit_encoded.columns)
    imputed_datasets.append(imputed_data)

In [21]:
# Display the list of imputed DataFrames.
# NOTE(review): this prints the full repr of every frame; showing
# imputed_datasets[0].head() would keep the output short.
imputed_datasets

[       A1       A2        A3   A4   A5     A6    A7       A8    A9  A10  A11  \
 0    1.00  30.8300   0.00000  1.0  0.0  12.00  7.00  2.51915  0.64  1.0  1.0   
 1    0.49  58.6700   4.46000  1.0  0.0  10.00  3.00  3.04000  1.00  1.0  6.0   
 2    0.00  24.5000   0.50000  1.0  0.0   4.55  3.00  1.50000  1.00  0.0  0.0   
 3    1.00  27.8300   1.54000  1.0  0.0  12.00  7.00  3.75000  1.00  1.0  5.0   
 4    0.86  20.1700   5.15535  1.0  0.0   6.10  7.00  1.71000  0.27  0.0  0.0   
 ..    ...      ...       ...  ...  ...    ...   ...      ...   ...  ...  ...   
 648  1.00  26.1187  10.08500  2.0  2.0   4.00  3.00  1.25000  0.22  0.0  0.0   
 649  0.00  22.6700   3.58405  1.0  0.0   5.44  5.69  2.00000  0.70  1.0  2.0   
 650  0.67  25.2500   4.37105  2.0  2.0   5.00  2.00  2.00000  0.00  1.0  1.0   
 651  1.00  28.2645   0.20500  1.0  0.0   0.00  5.05  0.04000  0.00  0.0  0.0   
 652  1.00  35.7235   3.37500  1.0  0.0   1.00  2.55  8.29000  0.00  0.0  0.0   
 
      A12   A13     A14   

Evaluation function:

In [22]:
# Evaluate every imputed dataset on the numerical columns only.
columns_of_interest = credit_num
# Maps 'Imputation_<n>' (1-based) to the dict returned by evaluate_imputed_data
all_evaluation_results = {}
# Z-score multiplier passed to evaluate_imputed_data (1.96 ~ 95% interval)
confidence_interval = 1.96
# Loop over each imputed dataset
for i, imputed_data in enumerate(imputed_datasets):
    # Extract the columns of interest from the imputed dataset
    imputed_data_subset = imputed_data[columns_of_interest]

    # Compare against the same columns of the original feature table X
    evaluation_results = evaluate_imputed_data(X[columns_of_interest], imputed_data_subset, confidence_interval)

    # Store the results in the dictionary
    all_evaluation_results[f'Imputation_{i + 1}'] = evaluation_results

# Print one block of metrics per imputed dataset
for imputation_name, results in all_evaluation_results.items():
    print(f"\nResults for {imputation_name}:")
    for key, value in results["Results"].items():
        print(f"{key}: {value}")


Results for Imputation_1:
Raw Bias: -8.98461336906585
Coverage Rate: 0.7883614088820827
Average Width: 160.21338720722738
Root Mean Squared Error: 1285.3310834365232
Accuracy: 0.38315467075038284
Mean Squared Error: 1652075.9940481067
R-squared: 0.7085307287860814

Results for Imputation_2:
Raw Bias: -2.5787945022970886
Coverage Rate: 0.7895865237366003
Average Width: 162.8323583774413
Root Mean Squared Error: 1346.6964272889738
Accuracy: 0.38192955589586525
Mean Squared Error: 1813591.2672728861
R-squared: 0.6800352242533929

Results for Imputation_3:
Raw Bias: -0.5119541500765681
Coverage Rate: 0.7895865237366003
Average Width: 159.97334552766785
Root Mean Squared Error: 1276.317701666271
Accuracy: 0.38529862174578866
Mean Squared Error: 1628986.8755866725
R-squared: 0.7126042511634848


In [23]:
# Show the categorical column names that are passed to the Wald test below.
credit_cat

['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A11', 'A12', 'A13']

Wald test for categorical variables:

In [24]:
# Run the pooled Wald-type test over the categorical columns; m=3 must match
# the number of imputed datasets generated above. categorical_pooling prints
# its own summary and returns the same values as a dict.
wald_results = categorical_pooling(imputed_datasets, credit_cat, m=3)

# Print the returned dictionary as well
print("\nWald Test Results:")
print(wald_results)


Multivariate Wald Test Results:
Multivariate Wald Statistic (D_1): 66.19364452783535
Degrees of Freedom (v_1): 80509776.43010186
P-Value: 1.1102230246251565e-16

Wald Test Results:
{'D_1': 66.19364452783535, 'v_1': 80509776.43010186, 'p_value': 1.1102230246251565e-16}


Different models can be generated, and we can assess our models using these functions.