# MICE: Multiple Imputation by Chained Equations

In [1]:
import pandas as pd
import numpy as np
from statsmodels.imputation.mice import MICEData

# Sample data with missing values
data = {
    'A': [1, 2, np.nan, 4.2, 5],
    'B': [np.nan, 2, 3.1, 4, 5],
    'C': [1, np.nan, 3, 4, 5]
}
df = pd.DataFrame(data)

# Initialize MICEData object
mice_data = MICEData(df)

# Perform MICE imputation
imputed_data = mice_data.data

print("Original Data:")
print(df)
print("\nImputed Data:")
print(imputed_data)

Original Data:
     A    B    C
0  1.0  NaN  1.0
1  2.0  2.0  NaN
2  NaN  3.1  3.0
3  4.2  4.0  4.0
4  5.0  5.0  5.0

Imputed Data:
     A    B    C
0  1.0  3.1  1.0
1  2.0  2.0  3.0
2  2.0  3.1  3.0
3  4.2  4.0  4.0
4  5.0  5.0  5.0


In [2]:
mice_data.update_all(10)
print(mice_data.data)

     A    B    C
0  1.0  2.0  1.0
1  2.0  2.0  1.0
2  1.0  3.1  3.0
3  4.2  4.0  4.0
4  5.0  5.0  5.0


Load dataset

In [4]:
df = pd.read_csv('dataset.csv', parse_dates=True, index_col=0)

df = df.rename(columns = lambda x: 'S'+x)

   
# Calculate the percentage of non-missing data for each study site
non_missing_percentage = df.notna().mean() * 100

# Filter study sites with at least 90% non-missing data
selected_sites = non_missing_percentage[non_missing_percentage >= 90].index
df_selected = df[selected_sites]

Introduce artificial gaps

In [5]:
q = 24
p = 56

# artifical gaps 
df = df_selected.copy()

np.random.seed(4152)

gaps = {}
# randomly set a n-day contiguous segment as missing for each column
random_columns = np.random.choice(df.columns, size=q, replace=False)

N = len(df.values.flatten())
m = df.isnull().values.flatten().sum()
missing_data = m / N * 100

for col in random_columns:
    # Randomly select the start of the n-day segment
    start_idx = np.random.randint(0, len(df) - p)
    end_idx = start_idx + p

    gaps[col] = [start_idx, end_idx]

    # Set the values in this range to NaN
    df.iloc[start_idx:end_idx, df.columns.get_loc(col)] = np.nan

m = df.isnull().values.flatten().sum()

missing_data = float(m / N * 100)

ValueError: Cannot take a larger sample than population when 'replace=False'

Impute with MICE

In [37]:
# Initialize MICEData object
mice_data = MICEData(df, k_pmm=7)

In [2]:
mice_data.update_all()

NameError: name 'mice_data' is not defined

In [1]:
mice_data.data

NameError: name 'mice_data' is not defined