## Multivariate Imputation by chained equation (MICE)

In [77]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [78]:
# Read specific columns from the 50_startups dataset and scale them by dividing by 10,000, then round the values
df = np.round(pd.read_csv('../Dataset/50_startups.csv')[['R&D Spend', 'Administration', 'Marketing Spend', 'Profit']]/10000)

# Set random seed for reproducibility
np.random.seed(9)

# Randomly sample 5 rows from the DataFrame
df = df.sample(5)

# Display the sampled DataFrame
df


Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
21,8.0,15.0,30.0,11.0
37,4.0,5.0,20.0,9.0
2,15.0,10.0,41.0,19.0
14,12.0,16.0,26.0,13.0
44,2.0,15.0,3.0,7.0


In [79]:
# Drop the last column (Profit) from the DataFrame
df = df.iloc[:,0:-1]
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,4.0,5.0,20.0
2,15.0,10.0,41.0
14,12.0,16.0,26.0
44,2.0,15.0,3.0


In [80]:
# Introduce missing values (NaN) at specific positions in the DataFrame
df.iloc[1, 0] = np.nan        # Set the value in row 1, column 0 to NaN
df.iloc[3, 1] = np.nan        # Set the value in row 3, column 1 to NaN
df.iloc[-1, -1] = np.nan      # Set the value in the last row, last column to NaN


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[1, 0] = np.nan        # Set the value in row 1, column 0 to NaN
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[3, 1] = np.nan        # Set the value in row 3, column 1 to NaN
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[-1, -1] = np.nan      # Set the value in the last row, last column to NaN


In [81]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,


In [82]:
# step 1 impute all missing values wiwth mean of respective col
df0 = pd.DataFrame()
df0['R&D Spend'] = df['R&D Spend'].fillna(df['R&D Spend'].mean())
df0['Administration'] = df['Administration'].fillna(df['Administration'].mean())
df0['Marketing Spend'] = df['Marketing Spend'].fillna(df['Marketing Spend'].mean())


In [83]:
# Create a copy of df0 to df1
df1 = df0.copy()

# Introduce a missing value (NaN) in row 1, column 0
df1.iloc[1, 0] = np.nan

# Display the updated DataFrame
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


In [84]:
# use first 3 rows to build a model and use the last for prediction
X = df1.iloc[[0,2,3,4],1:3]
X

Unnamed: 0,Administration,Marketing Spend
21,15.0,30.0
2,10.0,41.0
14,11.25,26.0
44,15.0,29.25


In [85]:
# Select specific rows (0, 2, 3, 4) from the first column of df1 as the target variable
y = df1.iloc[[0, 2, 3, 4], 0]

# Display the selected values
y


21     8.0
2     15.0
14    12.0
44     2.0
Name: R&D Spend, dtype: float64

In [86]:
# Build and train a Linear Regression model
lr = LinearRegression()
lr.fit(X,y)
# Predict the missing value in row 1 using the trained model
lr.predict(df1.iloc[1,1:].values.reshape(1,2))



array([23.14158651])

In [87]:
# Update the missing value in df1 with the predicted value
df1.iloc[1,0] = 23.14

In [88]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.25,26.0
44,2.0,15.0,29.25


In [89]:
# Remove the col2 imputed value
df1.iloc[3,1] = np.nan

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,,26.0
44,2.0,15.0,29.25


In [90]:
# Use last 3 rows to build a model and use the first for prediction
X = df1.iloc[[0,1,2,4],[0,2]]
X

Unnamed: 0,R&D Spend,Marketing Spend
21,8.0,30.0
37,23.14,20.0
2,15.0,41.0
44,2.0,29.25


In [91]:
# target variable
y = df1.iloc[[0,1,2,4],1]
y

21    15.0
37     5.0
2     10.0
44    15.0
Name: Administration, dtype: float64

In [92]:
# Build and train a Linear Regression model
lr = LinearRegression()
# fit the model
lr.fit(X, y)
# predict 
lr.predict(df1.iloc[3,[0,2]].values.reshape(1,2))



array([11.06331285])

In [93]:
# Update the missing value in df1 with the predicted value
df1.iloc[3,1] = 11.06

In [94]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,29.25


In [95]:
# Remove the col3 imputed value
df1.iloc[4,-1] = np.nan

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,


In [96]:
# Use last 3 rows to build a model and use the first for prediction
X = df1.iloc[0:4,0:2]
X

Unnamed: 0,R&D Spend,Administration
21,8.0,15.0
37,23.14,5.0
2,15.0,10.0
14,12.0,11.06


In [97]:
y = df1.iloc[0:4,-1]
y

21    30.0
37    20.0
2     41.0
14    26.0
Name: Marketing Spend, dtype: float64

In [98]:
# Build and train a Linear Regression model
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df1.iloc[4,0:2].values.reshape(1,2))



array([31.56351448])

In [99]:
df1.iloc[4,-1] = 31.56

In [100]:
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.14,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.06,26.0
44,2.0,15.0,31.56


In [101]:
df2 = df1.copy()

df2.iloc[1,0] = np.nan

In [102]:
# Use first 3 rows to build a model and use the last for prediction
X = df2.iloc[[0,2,3,4],1:3]
y = df2.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[1,1:].values.reshape(1,2))



array([23.78627207])

In [103]:
df2.iloc[1,0] = 23.78

In [104]:
df2.iloc[3,1] = np.nan
X = df2.iloc[[0,1,2,4],[0,2]]
y = df2.iloc[[0,1,2,4],1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[3,[0,2]].values.reshape(1,2))



array([11.22020174])

In [105]:
df2.iloc[3,1] = 11.22

In [106]:
# Remove the col3 imputed value
df2.iloc[4,-1] = np.nan

X = df2.iloc[0:4,0:2]
y = df2.iloc[0:4,-1]

lr = LinearRegression()
lr.fit(X,y)
lr.predict(df2.iloc[4,0:2].values.reshape(1,2))



array([38.87979054])

In [107]:
df2.iloc[4,-1] = 31.56

In [108]:
df2


Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,23.78,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.22,26.0
44,2.0,15.0,31.56


In [109]:
df2 - df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,0.64,0.0,0.0
2,0.0,0.0,0.0
14,0.0,0.16,0.0
44,0.0,0.0,0.0


In [110]:
df3 = df2.copy()

df3.iloc[1,0] = np.nan

In [111]:
# Use first 3 rows to build a model and use the last for prediction
X = df3.iloc[[0,2,3,4],1:3]
y = df3.iloc[[0,2,3,4],0]

# Build and train a Linear Regression model
lr = LinearRegression()
lr.fit(X,y)
lr.predict(df3.iloc[1,1:].values.reshape(1,2))



array([24.57698058])

In [112]:
# Update the missing value in df3 with the predicted value
df3.iloc[1,0] = 24.57

In [113]:
# Introduce a missing value at row 3, column 1 in df3
df3.iloc[3, 1] = np.nan

# Select features (columns 0 and 2) for rows 0,1,2,4
X = df3.iloc[[0, 1, 2, 4], [0, 2]]

# Select target variable (column 1) for the same rows
y = df3.iloc[[0, 1, 2, 4], 1]

# Initialize and train Linear Regression model
lr = LinearRegression()
lr.fit(X, y)

# Predict the missing value at row 3 using the trained model
lr.predict(df3.iloc[3, [0, 2]].values.reshape(1, 2))



array([11.37282844])

In [114]:
df3.iloc[3,1] = 11.37

In [115]:
# Introduce a missing value at row 4, last column in df3
df3.iloc[4, -1] = np.nan

# Select features (columns 0 and 1) for rows 0 to 3
X = df3.iloc[0:4, 0:2]

# Select target variable (last column) for rows 0 to 3
y = df3.iloc[0:4, -1]

# Initialize and train Linear Regression model
lr = LinearRegression()
lr.fit(X, y)

# Predict the missing value at row 4 using the trained model
lr.predict(df3.iloc[4, 0:2].values.reshape(1, 2))




array([45.53976417])

In [123]:
# Update the missing value in df3 with the predicted value
df3.iloc[4,-1] = 45.53

In [124]:
# Update the missing value in df2 with the predicted value
df2.iloc[3,1] = 11.22

In [118]:
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,8.0,15.0,30.0
37,24.57,5.0,20.0
2,15.0,10.0,41.0
14,12.0,11.37,26.0
44,2.0,15.0,45.53


In [125]:
df3-df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
21,0.0,0.0,0.0
37,0.79,0.0,0.0
2,0.0,0.0,0.0
14,0.0,0.15,0.0
44,0.0,0.0,13.97


# Using Sklearn

In [126]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Example dataset
df = pd.DataFrame({
    'Age': [25, np.nan, 35, 40, np.nan],
    'Salary': [50000, 60000, np.nan, 80000, 75000],
    'Experience': [1, 3, 5, np.nan, 8]
})

# Create MICE imputer
imputer = IterativeImputer(max_iter=10, random_state=0)

# Fit + transform
df_imputed = imputer.fit_transform(df)

# Back to DataFrame
df_imputed = pd.DataFrame(df_imputed, columns=df.columns)

print(df_imputed)


         Age        Salary  Experience
0  25.000000  50000.000000    1.000000
1  30.000546  60000.000000    3.000000
2  35.000000  69997.109363    5.000000
3  40.000000  80000.000000    8.489023
4  37.499849  75000.000000    8.000000


In [121]:
import pandas as pd
import numpy as np

# Create a small dataset
data = {
    'Age': [25, 30, 35, 40, np.nan],
    'Experience': [1, 3, 5, 7, 9],
    'Salary': [2000, 3000, np.nan, 5000, 6000]
}

df = pd.DataFrame(data)
print("Original Data:\n", df)


Original Data:
     Age  Experience  Salary
0  25.0           1  2000.0
1  30.0           3  3000.0
2  35.0           5     NaN
3  40.0           7  5000.0
4   NaN           9  6000.0


In [122]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

mice_imputer = IterativeImputer(random_state=0)
df_mice = pd.DataFrame(mice_imputer.fit_transform(df), columns=df.columns)

print("\nAfter MICE Imputation:\n", df_mice)



After MICE Imputation:
     Age  Experience       Salary
0  25.0         1.0  2000.000000
1  30.0         3.0  3000.000000
2  35.0         5.0  4000.000003
3  40.0         7.0  5000.000000
4  45.0         9.0  6000.000000
