# MICE: Multiple Imputation by Chained Equations

In [25]:
import pandas as pd
import numpy as np
from statsmodels.imputation.mice import MICEData

# Toy frame: three numeric columns, each with exactly one missing entry.
data = {
    'A': [1, 2, np.nan, 4.2, 5],
    'B': [np.nan, 2, 3.1, 4, 5],
    'C': [1, np.nan, 3, 4, 5]
}
df = pd.DataFrame(data)

# Wrap the frame for MICE. The original `df` is not modified by this call
# (it still prints its NaN entries below).
mice_data = MICEData(df)

# No chained-equation update cycle has been run at this point:
# mice_data.data only holds the starting values that MICEData fills in on
# construction. The actual MICE iterations happen in mice_data.update_all().
# Take a snapshot with .copy() so that later updates to mice_data.data
# (assumed to happen in place -- confirm against statsmodels) cannot silently
# change what `imputed_data` shows.
imputed_data = mice_data.data.copy()

print("Original Data:")
print(df)
print("\nImputed Data:")
print(imputed_data)

Original Data:
     A    B    C
0  1.0  NaN  1.0
1  2.0  2.0  NaN
2  NaN  3.1  3.0
3  4.2  4.0  4.0
4  5.0  5.0  5.0

Imputed Data:
     A    B    C
0  1.0  3.1  1.0
1  2.0  2.0  3.0
2  2.0  3.1  3.0
3  4.2  4.0  4.0
4  5.0  5.0  5.0


In [28]:
# Seed NumPy's global generator before the stochastic update so this cell
# prints the same values on every Restart & Run All. The seed value matches
# the one used by the gap-insertion cell further down in this notebook.
# NOTE(review): assumes MICEData draws from np.random's global state -- confirm
# for the installed statsmodels version.
np.random.seed(4152)

# Run update_all with 10 iterations on the toy data, then show the result.
mice_data.update_all(10)
print(mice_data.data)

     A    B    C
0  1.0  2.0  1.0
1  2.0  2.0  1.0
2  1.0  3.1  3.0
3  4.2  4.0  4.0
4  5.0  5.0  5.0


Load dataset

In [33]:
# Read the table (first CSV column becomes the date-parsed index) and prefix
# every column label with 'S'.
df = (
    pd.read_csv('tempdata.csv', index_col=0, parse_dates=True)
    .rename(columns=lambda label: 'S'+label)
)

# Lower bound of the analysis window (inclusive): keep rows on or after it.
cutoff_date = "2020-09-01"
on_or_after_start = df.index >= pd.Timestamp(cutoff_date)
df = df[on_or_after_start]

# Upper bound of the analysis window (inclusive): keep rows on or before it.
cutoff_date = "2024-08-31"
on_or_before_end = df.index <= pd.Timestamp(cutoff_date)
df = df[on_or_before_end]

# Percentage of non-missing values in each column (one column per study site).
non_missing_percentage = df.notna().mean() * 100

# Keep only the sites whose columns are at least 90% observed.
is_well_observed = non_missing_percentage >= 90
selected_sites = non_missing_percentage[is_well_observed].index
df_selected = df[selected_sites]

Introduce artificial gaps

In [34]:
q = 24  # number of columns that receive one artificial gap
p = 56  # length of each gap, in consecutive rows

# Artificial gaps are punched into a copy so df_selected keeps the
# original observations for later comparison.
df = df_selected.copy()

np.random.seed(4152)

# Total number of cells; masking values does not change it, so compute once.
N = df.size

# Percentage of cells already missing before any artificial gap is added.
# Kept under its own name so it is not lost when missing_data is computed
# again after masking.
missing_data_before = float(df.isnull().values.sum() / N * 100)

# Pick q distinct columns; each one gets a single contiguous gap of p rows.
random_columns = np.random.choice(df.columns, size=q, replace=False)

# Maps column label -> [start_idx, end_idx], the positional half-open row
# range [start_idx, end_idx) that was set to NaN in that column.
gaps = {}

for col in random_columns:
    # np.random.randint excludes its upper bound, so the bound must be
    # len(df) - p + 1 for the last valid start (len(df) - p) to be eligible.
    # Without the +1 the final row could never fall inside a gap.
    start_idx = np.random.randint(0, len(df) - p + 1)
    end_idx = start_idx + p

    gaps[col] = [start_idx, end_idx]

    # Blank out the chosen row range in this column.
    df.iloc[start_idx:end_idx, df.columns.get_loc(col)] = np.nan

# Missing cells after masking (pre-existing NaNs plus the artificial gaps).
m = df.isnull().values.sum()

# Percentage of missing cells after the artificial gaps were introduced.
missing_data = float(m / N * 100)

Impute with MICE

In [37]:
# Wrap the gapped frame `df` in a fresh MICEData object. This rebinds
# `mice_data`, replacing the toy-example object created earlier.
# k_pmm=7 overrides the library default for that argument.
# NOTE(review): k_pmm is presumably the number of predictive-mean-matching
# candidates per imputed value -- confirm against the statsmodels docs.
mice_data = MICEData(df, k_pmm=7)

In [38]:
# Run the MICE update step on the gapped data.
# NOTE(review): no iteration count is passed here, whereas the toy example
# earlier in this notebook used update_all(10); presumably this falls back to
# the library default -- confirm that is enough cycles for this data set.
mice_data.update_all()

In [39]:
# Display the working frame held by mice_data after the update above.
mice_data.data

Unnamed: 0,S27,S28,S29,S31,S33,S34,S35,S41,S42,S43,...,S242,S243,S279,S280,S281,S282,S299,S300,S301,S302
0,8.377135,10.383819,11.967052,12.482950,7.437136,6.298104,4.062500,15.722677,14.941286,8.278365,...,16.699931,16.753056,8.158264,7.471042,8.898400,6.645833,16.156389,17.536771,14.812760,13.087500
1,10.766979,12.728958,13.259438,11.359929,8.442284,7.443816,4.625000,15.737292,12.931460,8.189573,...,16.574236,16.614861,9.804097,8.797615,10.536747,7.282609,17.228542,17.554063,16.135479,13.712500
2,13.929063,14.476042,14.703469,12.373958,11.464854,9.440135,6.154167,15.844896,11.847958,8.272979,...,16.428750,16.456458,11.285571,9.942789,12.118219,8.500000,17.824167,17.928417,17.551073,14.829167
3,15.341677,15.885139,16.113990,13.077292,12.257677,10.744917,6.928571,15.011742,11.362153,8.342177,...,16.438403,16.478889,10.912361,9.460875,12.155083,8.134783,18.145625,18.242229,17.849125,15.845833
4,14.484104,16.310833,16.632979,13.161806,12.023083,10.979385,7.911111,15.311208,12.768582,8.411229,...,16.236458,16.343056,10.075362,8.798500,11.435574,7.404167,18.181319,18.594635,16.461719,14.479167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,8.583646,11.199028,13.617948,18.968312,10.182927,8.261677,7.433333,18.782187,15.254028,8.719344,...,14.960903,15.240139,8.336111,7.353823,10.088302,6.058333,18.459028,19.837760,15.854031,11.891667
1457,8.526490,11.366875,13.786531,17.341115,10.818698,8.557333,7.454167,17.078937,13.569028,8.514896,...,14.166389,14.504375,8.427222,7.344500,10.701726,5.662500,19.974524,20.096156,17.188425,12.541667
1458,8.505552,9.896667,12.179844,15.794111,8.952813,7.866052,7.345833,15.649677,14.875694,8.771615,...,15.910625,15.851458,8.940417,8.289385,9.417740,7.300000,13.158652,17.092011,13.031135,11.575000
1459,8.383802,9.667917,11.220344,12.090330,7.579521,6.433875,6.533333,16.707812,16.389444,8.685021,...,14.755069,15.056667,9.071528,8.176292,9.903365,6.537500,14.067847,14.858937,13.199604,11.033333
