In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, RBF, ConstantKernel
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Load data
df = pd.read_csv('sensor_data.csv')

# Create X and y arrays
X = df.drop(['humidity1', 'humidity2', 'humidity3', 'humidity4', 'humidity5', 'temp1', 'temp2', 'temp3', 'temp4', 'temp5'], axis=1).values
y = df[['humidity1', 'humidity2', 'humidity3', 'humidity4', 'humidity5', 'temp1', 'temp2', 'temp3', 'temp4', 'temp5']].values

# Split data into observed and missing
observed_mask = ~np.isnan(y)
observed_indices = np.where(observed_mask)
missing_indices = np.where(~observed_mask)

observed_X = X[observed_mask[:, 0], :]
observed_y = y[observed_mask]
missing_X = X[~observed_mask[:, 0], :]

# Define kernel
kernel = ConstantKernel() * RBF() + WhiteKernel()

# Define Gaussian Process model
model = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, alpha=0.1, normalize_y=True)

# Define hyperparameter grid
param_grid = {
    'alpha': [0.01, 0.1, 1.0, 10.0],
    'kernel__k1__constant_value': [0.1, 1.0, 10.0],
    'kernel__k1__length_scale': [0.1, 1.0, 10.0],
    'kernel__k2__noise_level': [0.01, 0.1, 1.0],
}

# Perform hyperparameter optimization
from sklearn.model_selection import GridSearchCV

search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', n_jobs=-1)
search.fit(observed_X, observed_y)

print("Best hyperparameters:", search.best_params_)
model = search.best_estimator_

# Fit model to observed data
model.fit(observed_X, observed_y)

# Predict missing values
missing_y, std = model.predict(missing_X, return_std=True)

# Insert predicted values into original array
y_imputed = y.copy()
y_imputed[missing_indices] = missing_y

# Compute metrics
mse = mean_squared_error(y[observed_mask], y_imputed[observed_mask])
mae = mean_absolute_error(y[observed_mask], y_imputed[observed_mask])
corr = np.corrcoef(y[observed_mask], y_imputed[observed_mask], rowvar=False)[0, 1]

print("Mean squared error: ", mse)
print("Mean absolute error: ", mae)
print("Correlation coefficient: ", corr)

# Plot results
plt.figure(figsize=(10, 5))

plt.scatter(y[observed_mask], y_imputed[observed_mask], c=X[observed_mask[:, 0], :])
plt.colorbar()
plt.plot([0, 100], [0, 100], 'k--')
plt.xlabel('Observed')
plt.ylabel('Imputed')
plt.title('Imputed vs Observed Sensor Measurements')
plt.show()


KeyError: "['humidity1', 'humidity2', 'humidity3', 'humidity4', 'humidity5', 'temp1', 'temp2', 'temp3', 'temp4', 'temp5'] not found in axis"