In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

# Load data into pandas dataframe
df = pd.read_csv('sensor_data.csv')
df.set_index('timestamp', inplace=True)

# Check for missing or invalid values in data
print(df.isna().sum())
print(df.describe())

# Remove rows with missing or invalid values
df.dropna(inplace=True)
df = df[(df >= 0) & (df <= 100)] # Assuming humidity and temperature values are between 0 and 100

# Create boolean mask for missing values
mask = df.isna()

# Split data into training and test sets
train = df[~mask]
test = df[mask]

# Define kernel for GP regression
kernel = RBF(length_scale=1.0, length_scale_bounds=(1e-1, 10.0)) + WhiteKernel(noise_level=1.0, noise_level_bounds=(1e-5, 1e+1))

# Fit GP regression model with observed data
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10)
X_train = train.index.values.reshape(-1, 1)
y_train = train.values
gp.fit(X_train, y_train)

# Predict missing values in test set
X_test = test.index.values.reshape(-1, 1)
y_pred, y_std = gp.predict(X_test, return_std=True)

# Replace missing values with predicted values
test_imputed = test.copy()
test_imputed.iloc[:, :] = y_pred

# Evaluate model performance
mse = np.mean((y_pred - test.values)**2)
mae = np.mean(np.abs(y_pred - test.values))
corr = np.corrcoef(y_pred.T, test.values.T)[0, 1]

print("Mean squared error: ", mse)
print("Mean absolute error: ", mae)
print("Correlation coefficient: ", corr)

# Plot predicted values with uncertainty bounds
plt.plot(train.index, train, 'ro', markersize=5, label='Observations')
plt.plot(test.index, y_pred, 'b-', label='Predictions')
plt.fill_between(test.index, y_pred - 2*y_std, y_pred + 2*y_std, alpha=0.2)
plt.legend(loc='best')
plt.show()


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').