In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.datasets import load_diabetes
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler

In [None]:
# Notebook-wide configuration.
SEED: int = 42   # seed for the NaN masking and the train/test split
FOLDS: int = 5   # number of cross-validation folds passed as `cv`

### Load data
We use the Diabetes dataset, where the task is to predict a quantitative measure of disease progression one year after baseline from the given features.

In [None]:
# return_X_y=True unpacks straight into (features, target);
# as_frame=True requests pandas objects so .describe()/.corr() work below.
X, y = load_diabetes(return_X_y=True, as_frame=True)

print("Data frame shape:", X.shape)
# Last expression of the cell: rendered as a rich summary table.
X.describe()

### Check for multicollinearity
We can see that features *s1* and *s2* are highly correlated, and that *s3* and *s4* are fairly strongly negatively correlated.
This would normally call for feature selection or feature engineering.
However, since the feature dimensionality is already small, we instead rely on regularization techniques or tree-based models, which are more robust to multicollinearity.

In [None]:
# Annotated heatmap of the pairwise feature correlations.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(X.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

### Manipulate and split data
We intentionally remove some entries to be able to use data imputation later.

In [None]:
# Seed the global NumPy RNG so the same cells are blanked out on every run.
np.random.seed(SEED)
nan_mask = np.random.random(X.shape) < 1e-2 # 1% NaN
X = X.mask(nan_mask)
# Per-column count of the artificially introduced missing values.
print(X.isna().sum())

# Hold out 25% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
  X, y, random_state=SEED, test_size=.25
)

### Choose model, scaler, impute strategies

In [None]:
# Candidate imputers, keyed by the label used in the printed scores and the
# plot legend of the comparison below.
impute_strategies = {
  'mean': SimpleImputer(strategy='mean'),
  'median': SimpleImputer(strategy='median'),
  'most_frequent': SimpleImputer(strategy='most_frequent'),
  'knn': KNNImputer(n_neighbors=3),
}
scaler = StandardScaler()
# Seed the forest explicitly: otherwise its randomness comes from the global
# NumPy RNG state, so scores would depend on which cells ran before and the
# imputer comparison would mix imputer effects with forest noise.
model = RandomForestRegressor(n_jobs=-1, random_state=SEED)

### Impute data
To deal with incomplete data, we need to impute the missing values.
We use cross-validation to compare the imputation strategies and pick a good one.

In [None]:
# Cross-validate one pipeline per imputer and draw the per-fold scores of all
# candidates on a single set of axes.
fig, ax = plt.subplots(figsize=(6, 4))
fold_ids = range(FOLDS)
for name, imputer in impute_strategies.items():
  # Scaler and model are the same for every candidate; only the imputer varies.
  pipe = Pipeline(steps=[('imputer', imputer), ('scaler', scaler), ('model', model)])
  scores = cross_val_score(
    pipe, X_train, y_train, scoring='neg_mean_squared_error', cv=FOLDS
  )
  print(f"Negative mean squared error for {name}:", scores.mean())
  ax.plot(fold_ids, scores, label=name)
ax.set_title("Negative mean squared error of each fold across different impute strategies")
ax.set_xlabel("Folds")
ax.set_ylabel("Negative mean squared error")
ax.legend()
ax.grid(True)
plt.show()

### Hyperparameter optimization
From the previous results, we decide to continue with the *knn* imputer.
We use a cross-validated grid search on the training set to find good hyper-parameters; the best model is then evaluated on the held-out test set.

In [None]:
# Search grid: 3 x 3 x 3 = 27 combinations, each evaluated with FOLDS-fold CV.
# Keys use the `<step name>__<parameter>` form to address pipeline steps.
parameters = {
  'imputer__n_neighbors': [1, 3, 5],
  'model__n_estimators': [10, 100, 500],
  'model__max_depth': [None, 10, 30],
}
pipe = Pipeline([
  ('imputer', impute_strategies['knn']),
  # Explicit seed so the selected hyper-parameters and the final score do not
  # depend on the global NumPy RNG state / cell execution order.
  ('model', RandomForestRegressor(random_state=SEED))
])
# n_jobs=-1 runs the candidate fits in parallel; it does not change the
# results. The forest itself is left serial to avoid oversubscribing the cores.
regr = GridSearchCV(pipe, parameters, cv=FOLDS, scoring='r2', n_jobs=-1)

In [None]:
# Run the grid search on the training split, then predict the held-out split
# with the fitted search object (fit() returns the estimator itself).
y_pred = regr.fit(X_train, y_train).predict(X_test)
print(regr.best_params_)
test_rmse = root_mean_squared_error(y_test, y_pred)
print("RMSE", test_rmse)

In [None]:
# Overlay the true targets and the model predictions for every test sample.
num_dp = y_test.shape[0]
sample_idx = range(num_dp)

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(sample_idx, y_test, label='True', color='red')
ax.plot(sample_idx, y_pred, label='Predicted', color='blue')
# Axis labels so the figure can be read on its own; the x-axis is just the
# position of the sample within the test split, not a meaningful ordering.
ax.set_xlabel('Test sample')
ax.set_ylabel('Disease progression')
ax.set_title('True vs Predicted')
ax.legend()
ax.grid(True)
plt.show()