In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import BayesianRidge, Ridge
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
import sys

# Number of cross-validation folds; passed as `cv` to every cross_val_score call below.
N_SPLITS = 5

# Put the parent directory on the module search path.
# NOTE(review): no import in the visible cells relies on this — confirm it is still needed.
sys.path.append("../")
# Seeded legacy NumPy generator, used later in the notebook via rng.choice(...).
rng = np.random.RandomState(0)

def introduce_missingness(data, missing_rate=0.1, mechanism='MCAR', random_state=None):
    """Return a copy of ``data`` with a fraction of its cells replaced by NaN.

    Cells are chosen uniformly at random *without replacement*, so exactly
    ``floor(missing_rate * n_rows * n_columns)`` distinct cells are blanked.

    Parameters
    ----------
    data : pandas.DataFrame
        Complete table to corrupt. It is not modified.
    missing_rate : float, default=0.1
        Fraction of all cells to set to NaN; must lie in [0, 1].
    mechanism : str, default='MCAR'
        Missingness mechanism. Only ``'MCAR'`` (missing completely at
        random) is implemented.
    random_state : None, int, numpy.random.RandomState or numpy.random.Generator
        Source of randomness. ``None`` draws from NumPy's global random
        state (so ``np.random.seed`` still applies); an int seeds a fresh
        ``RandomState``; a generator object is used as-is.

    Returns
    -------
    pandas.DataFrame
        New frame with the same shape, index and columns as ``data``.

    Raises
    ------
    ValueError
        If ``mechanism`` is not supported or ``missing_rate`` is outside [0, 1].
    """
    if mechanism != 'MCAR':
        raise ValueError(
            f"Unsupported missingness mechanism {mechanism!r}; only 'MCAR' is implemented."
        )
    if not 0 <= missing_rate <= 1:
        raise ValueError(f"missing_rate must be in [0, 1], got {missing_rate!r}.")

    n_samples, n_features = data.shape
    n_cells = n_samples * n_features
    n_missing = int(np.floor(missing_rate * n_cells))
    if n_missing == 0:
        return data.copy()

    # The np.random module, RandomState and Generator all expose `choice`.
    if random_state is None:
        sampler = np.random
    elif isinstance(random_state, (int, np.integer)):
        sampler = np.random.RandomState(random_state)
    else:
        sampler = random_state

    # Sample flat cell positions without replacement so no cell is picked twice.
    flat_positions = sampler.choice(n_cells, size=n_missing, replace=False)
    missing_mask = np.zeros(n_cells, dtype=bool)
    missing_mask[flat_positions] = True

    # DataFrame.mask returns a new frame with NaN wherever the mask is True.
    return data.mask(missing_mask.reshape(n_samples, n_features))


In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

def evaluate_imputation(true_values, imputed_values):
    """Score imputed values against the ground truth.

    Parameters
    ----------
    true_values : array-like
        Reference values.
    imputed_values : array-like
        Values produced by an imputer, aligned with ``true_values``.

    Returns
    -------
    tuple
        ``(mae, rmse)``: the mean absolute error and the square root of the
        mean squared error, in that order.
    """
    absolute_error = mean_absolute_error(true_values, imputed_values)
    squared_error = mean_squared_error(true_values, imputed_values)
    # RMSE is the square root of the MSE.
    return absolute_error, np.sqrt(squared_error)


In [None]:
# Load the four CSV files, in order, into dataset_v1 .. dataset_v4.
# Paths are relative to the notebook's working directory.
dataset_v1, dataset_v2, dataset_v3, dataset_v4 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'production/models/data/latest_cgpa.csv',
        'production/models/data/student-mat.csv',
        'production/models/data/student-por.csv',
        'production/models/data/StudentsPerformance.csv',
    )
)

In [15]:
# Show (n_rows, n_columns) of the first dataset as a quick check after loading.
dataset_v1.shape

(143, 34)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Fit three imputers on the same frame so their outputs can be compared.
# NOTE(review): all three expect numeric input — confirm dataset_v1 has no
# string/categorical columns, or encode them before this cell.

# Column-wise mean replacement.
mean_imputer = SimpleImputer(strategy='mean')
X_mean_imputed = mean_imputer.fit_transform(dataset_v1)

# Replacement from the 5 nearest rows.
knn_imputer = KNNImputer(n_neighbors=5)
X_knn_imputed = knn_imputer.fit_transform(dataset_v1)

# Iterative (MICE-style) imputation; seeded so repeated runs give the same result.
mice_imputer = IterativeImputer(random_state=0)
X_mice_imputed = mice_imputer.fit_transform(dataset_v1)

In [None]:
# Split dataset_v1 into features and target. The target is selected by name
# so it is always the same column that is dropped from the features.
# NOTE(review): the previous code took the target by position (column 33);
# this assumes that column is "sem_8_sgpa" — confirm.
TARGET_COLUMN = "sem_8_sgpa"
X_full = dataset_v1.drop(columns=[TARGET_COLUMN]).to_numpy(dtype=float)
y_full = dataset_v1[TARGET_COLUMN].to_numpy(dtype=float)
n_samples, n_features = X_full.shape

# Score the regressor on the complete data (no missing values) as a reference.
br_estimator = BayesianRidge()
score_full_data = pd.DataFrame(
    cross_val_score(
        br_estimator, X_full, y_full, scoring="neg_mean_squared_error", cv=N_SPLITS
    ),
    columns=["Full Data"],
)

# Blank out exactly one randomly chosen feature in every row.
X_missing = X_full.copy()
y_missing = y_full
missing_samples = np.arange(n_samples)
missing_features = rng.choice(n_features, n_samples, replace=True)
X_missing[missing_samples, missing_features] = np.nan

# Cross-validated scores for SimpleImputer followed by the regressor,
# one column per imputation strategy.
simple_imputer_scores = {}
for strategy in ("mean", "median"):
    estimator = make_pipeline(
        SimpleImputer(missing_values=np.nan, strategy=strategy),
        br_estimator,
    )
    simple_imputer_scores[strategy] = cross_val_score(
        estimator,
        X_missing,
        y_missing,
        scoring="neg_mean_squared_error",
        cv=N_SPLITS,
    )
# Build the frame once from the collected per-fold score arrays.
score_simple_imputer = pd.DataFrame(simple_imputer_scores)

# Regressors to plug into IterativeImputer as the per-feature estimator.
estimators = [
    BayesianRidge(),
    RandomForestRegressor(
        n_estimators=4,
        max_depth=10,
        bootstrap=True,
        max_samples=0.5,
        n_jobs=2,
        random_state=0,
    ),
    make_pipeline(
        Nystroem(kernel="polynomial", degree=2, random_state=0),
        Ridge(alpha=1e3),
    ),
    KNeighborsRegressor(n_neighbors=15),
]
# Convergence tolerance for each estimator above, matched by position.
tolerances = (1e-3, 1e-1, 1e-1, 1e-2)

# Cross-validated scores for IterativeImputer followed by the regressor,
# one column per imputing estimator, keyed by the estimator's class name.
iterative_imputer_scores = {}
for impute_estimator, tol in zip(estimators, tolerances):
    estimator = make_pipeline(
        IterativeImputer(
            estimator=impute_estimator, max_iter=25, tol=tol, random_state=0
        ),
        br_estimator,
    )
    iterative_imputer_scores[type(impute_estimator).__name__] = cross_val_score(
        estimator,
        X_missing,
        y_missing,
        scoring="neg_mean_squared_error",
        cv=N_SPLITS,
    )
score_iterative_imputer = pd.DataFrame(iterative_imputer_scores)

# Put the three score tables side by side. `keys` adds an outer column level,
# so every column is addressed by a (group, method) pair.
scores = pd.concat(
    [score_full_data, score_simple_imputer, score_iterative_imputer],
    keys=["Original", "SimpleImputer", "IterativeImputer"],
    axis=1,
)

# Horizontal bar chart with one bar per (group, method) column.
fig, ax = plt.subplots(figsize=(13, 6))
# The scores were computed with scoring="neg_mean_squared_error", so negate
# the per-column mean to plot a positive MSE.
means = -scores.mean()
# Spread of the scores across CV folds, drawn as error bars.
errors = scores.std()
means.plot.barh(xerr=errors, ax=ax)
ax.set_xlabel("MSE")
ax.set_yticks(np.arange(means.shape[0]))
# Each index entry is a (group, method) tuple; join it into a single tick label.
ax.set_yticklabels([" w/ ".join(label) for label in means.index.tolist()])
plt.tight_layout(pad=1)
plt.show()