In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from missforest import MissForest

## Load data

In [2]:
iris = datasets.load_iris()
df = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
                     columns= iris['feature_names'] + ['target'])

## Fit Logistic Regression with the complete data set

In [32]:
X = df.drop('target', axis=1).values
y = df['target'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

logreg = LogisticRegression(penalty='l1', solver='liblinear', multi_class='ovr', random_state=1)
logreg.fit(X_train, y_train)
y_pred=logreg.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9666666666666667


## Missing Completely at Random (MCAR)

In [52]:
missing_proportion = 0.1

df_train_complete, df_test_compuete = train_test_split(df, test_size=0.2, random_state=1)

df_train_mcar = df_train_complete.copy()
df_test_mcar = df_test_compuete.copy()

for col in df_train_complete.columns[:-1]:
    df_train_mcar.loc[df_train_complete.sample(frac=missing_proportion).index, col] = np.nan

for col in df_test_mcar.columns[:-1]:
    df_test_mcar.loc[df_test_mcar.sample(frac=missing_proportion).index, col] = np.nan

In [53]:
mf_mcar = MissForest()
mf_mcar.fit(df_train_mcar.drop('target', axis=1))

In [54]:
dfs_train_mcar_imputed = mf_mcar.transform(df_train_mcar.drop('target', axis=1))
dfs_test_mcar_imputed = mf_mcar.transform(df_test_mcar.drop('target', axis=1))

In [63]:
for i, df_train_mcar_imputed in enumerate(dfs_train_mcar_imputed):
    print(f"{i}: MSE", metrics.mean_squared_error(df_train_mcar_imputed.values, df_train_complete.drop('target', axis=1).values))

0: MSE 0.009832852635263139
1: MSE 0.010552756141628893
2: MSE 0.009885863595698704
3: MSE 0.010252217782986135
4: MSE 0.010568395078561998
5: MSE 0.009723132536024326
6: MSE 0.00976471304629631
7: MSE 0.009945531946180563
8: MSE 0.009561384403380125
9: MSE 0.009640346654667441


In [81]:
X_train_complete = df_train_complete.drop('target', axis=1).values
coefs = []
for i, (df_train_mcar_imputed, df_test_mcar_imputed) in enumerate(zip(dfs_train_mcar_imputed, dfs_test_mcar_imputed)):
    
    X_train_mcar = df_train_mcar_imputed.values
    y_train_mcar = df_train_mcar['target'].values

    X_test_mcar = df_test_mcar_imputed.values
    y_test_mcar = df_test_mcar['target'].values

    logreg = LogisticRegression(penalty='l1', solver='liblinear', multi_class='ovr', random_state=1)
    logreg.fit(X_train_mcar, y_train_mcar)

    coefs.append(logreg.coef_)
    y_pred_mcar=logreg.predict(X_test_mcar)

    print(f"{i}: MSE {metrics.mean_squared_error(X_train_mcar, X_train_complete):.05f}, ACC {metrics.accuracy_score(y_test_mcar, y_pred_mcar):.05f}")

    plt.imshow(logreg.)

from copy import deepcopy
logreg_mean = deepcopy(logreg)
logreg_mean.coef_ = np.array(coefs).mean(axis=0)

for i, df_test_mcar_imputed in enumerate(dfs_test_mcar_imputed):
    X_test_mcar = df_test_mcar_imputed.values
    y_test_mcar = df_test_mcar['target'].values

    y_pred_mcar = logreg_mean.predict(X_test_mcar)

    print(f"{i}: ACC {metrics.accuracy_score(y_test_mcar, y_pred_mcar):.05f}")


0: MSE 0.00983, ACC 0.86667
1: MSE 0.01055, ACC 0.86667
2: MSE 0.00989, ACC 0.86667
3: MSE 0.01025, ACC 0.86667
4: MSE 0.01057, ACC 0.86667
5: MSE 0.00972, ACC 0.86667
6: MSE 0.00976, ACC 0.86667
7: MSE 0.00995, ACC 0.86667
8: MSE 0.00956, ACC 0.86667
9: MSE 0.00964, ACC 0.86667
0: ACC 0.86667
1: ACC 0.86667
2: ACC 0.86667
3: ACC 0.86667
4: ACC 0.86667
5: ACC 0.86667
6: ACC 0.86667
7: ACC 0.86667
8: ACC 0.86667
9: ACC 0.86667


In [69]:
df_train_mcar_imputed.values - df_train.drop('target', axis=1).values

array([[ 0.        ,  0.        ,  0.        ,  0.        ],
       [-0.523     ,  0.134     ,  0.        ,  0.        ],
       [ 0.        , -0.00307143,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.094     ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        , -0.138     ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.

In [36]:
y_train

array([2., 0., 2., 0., 0., 2., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0.,
       2., 0., 0., 0., 2., 0., 2., 0., 0., 1., 1., 2., 0., 0., 0., 1., 0.,
       2., 2., 1., 1., 0., 0., 1., 1., 1., 0., 2., 2., 2., 2., 1., 0., 2.,
       2., 1., 2., 2., 1., 1., 0., 2., 0., 1., 1., 0., 0., 2., 2., 2., 2.,
       2., 0., 2., 1., 2., 2., 0., 2., 1., 1., 1., 2., 0., 2., 0., 0., 2.,
       2., 1., 0., 1., 1., 0., 1., 2., 0., 1., 2., 2., 1., 1., 0., 0., 0.,
       1., 2., 1., 1., 1., 2., 0., 0., 2., 0., 0., 0., 1., 0., 2., 1., 1.,
       0.])

In [37]:
y_train_mcar

array([0.  , 0.49, 1.  , 1.  , 1.  , 1.  , 2.  , 0.  , 2.  , 1.  , 1.  ,
       2.  , 0.  , 2.  , 2.  , 0.  , 2.  , 1.  , 2.  , 1.  , 0.  , 2.  ,
       0.  , 1.  , 0.  , 1.99, 1.  , 0.  , 0.  , 1.  , 0.  , 1.  , 2.  ,
       2.  , 0.  , 2.  , 2.  , 2.  , 0.  , 1.  , 1.  , 0.  , 0.  , 1.  ,
       2.  , 1.  , 0.  , 2.  , 1.  , 2.  , 1.  , 1.  , 0.  , 1.  , 1.  ,
       0.  , 1.  , 2.  , 1.  , 0.  , 2.  , 2.  , 1.  , 0.  , 2.  , 2.  ,
       2.  , 2.  , 2.  , 2.  , 1.  , 0.  , 2.  , 0.  , 0.  , 2.  , 1.  ,
       0.  , 1.  , 1.  , 0.  , 0.  , 2.  , 0.  , 1.  , 2.  , 1.  , 1.  ,
       1.74, 1.  , 0.  , 2.  , 1.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       2.  , 0.  , 2.  , 1.  , 0.  , 1.  , 0.  , 2.  , 2.  , 0.  , 2.  ,
       1.  , 1.  , 0.  , 2.  , 2.  , 2.  , 0.  , 1.  , 0.  , 1.  ])