In [93]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

# Load data
data = pd.read_csv('sensor_data.csv')

# Split data into training and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


In [94]:
train_data.count()

ID             5266
timestamp      5266
latitude       5266
longitude      5266
temperature    5076
humidity       5049
dtype: int64

In [95]:
test_data.count()

ID             1317
timestamp      1317
latitude       1317
longitude      1317
temperature    1269
humidity       1272
dtype: int64

In [96]:
# Define kernel
kernel = RBF() + WhiteKernel()

In [97]:
# Fit GPR model to training data for temperature
train_temp = train_data.dropna(subset=['temperature'])

In [106]:
train_temp = train_temp.interpolate()


In [107]:
train_temp.count()

ID             5076
timestamp      5076
latitude       5076
longitude      5076
temperature    5076
humidity       5076
dtype: int64

In [108]:
train_X_temp = train_temp[['latitude', 'longitude', 'humidity', 'timestamp']]
train_y_temp = train_temp['temperature']

In [108]:
test_X_temp = test_temp[['latitude', 'longitude', 'humidity', 'timestamp']]
test_y_temp = test_temp['temperature']

In [109]:
type(train_X_temp)

pandas.core.frame.DataFrame

In [110]:
np.shape(train_X_temp)

(5076, 4)

In [111]:
np.shape(train_y_temp)

(5076,)

In [112]:
model_temp = GaussianProcessRegressor(kernel=kernel)
model_temp.fit(train_X_temp, train_y_temp)



GaussianProcessRegressor(kernel=RBF(length_scale=1) + WhiteKernel(noise_level=1))

In [113]:
# Impute missing temperature values in test data
test_temp = test_data.copy()
test_temp.loc[test_temp['temperature'].isna(), 'temperature'] = model_temp.predict(test_temp.loc[test_temp['temperature'].isna(), ['latitude', 'longitude', 'humidity', 'timestamp']].values)



Unnamed: 0,ID,timestamp,latitude,longitude,temperature,humidity
611,612,1419036780,-37.814808,144.980999,21.93,39.150
4987,4988,1419276240,-37.814922,144.982258,22.90,50.027
1419,1420,1418095440,-37.814610,144.979018,22.90,31.420
2471,2472,1420470300,-37.814610,144.979018,18.70,
3096,3097,1418835720,-37.813073,144.980406,21.29,32.840
...,...,...,...,...,...,...
3707,3708,1419329100,-37.813073,144.980406,16.12,51.447
5914,5915,1419051780,-37.813408,144.979492,17.09,45.610
110,111,1418131320,-37.814808,144.980999,22.58,37.100
2574,2575,1420733280,-37.814610,144.979018,15.48,72.100


In [77]:
# Fit GPR model to training data for humidity
train_humid = train_data.dropna(subset=['humidity'])
train_X_humid = train_humid[['latitude', 'longitude', 'temperature', 'timestamp']]
train_y_humid = train_humid['humidity']

In [78]:
train_X_humid

Unnamed: 0,latitude,longitude,temperature,timestamp
3221,-37.813073,144.980406,25.16,1419029760
1862,-37.814610,144.979018,15.48,1418868900
4308,-37.814922,144.982258,11.61,1418740320
3527,-37.813073,144.980406,22.25,1419218220
2252,-37.814610,144.979018,18.70,1419257520
...,...,...,...,...
3092,-37.813073,144.980406,19.03,1418833260
3772,-37.813073,144.980406,18.38,1420433580
5191,-37.814922,144.982258,17.09,1420467000
5226,-37.814922,144.982258,23.22,1420688160


In [79]:
train_X_humid = train_X_temp.interpolate()

In [80]:
np.shape(train_X_humid)

(5076, 4)

In [81]:
train_X_humid.count()

latitude     5076
longitude    5076
humidity     5076
timestamp    5076
dtype: int64

In [83]:
np.shape(train_y_humid)

(5049,)

In [75]:
model_humid = GaussianProcessRegressor(kernel=kernel)
model_humid.fit(train_X_humid, train_y_humid)

ValueError: Found input variables with inconsistent numbers of samples: [5076, 5049]

In [57]:
# Impute missing humidity values in test data
test_humid = test_data.copy()
test_humid.loc[test_humid['humidity'].isna(), 'humidity'] = model_humid.predict(test_humid.loc[test_humid['humidity'].isna(), ['latitude', 'longitude', 'temperature', 'timestamp']].values)

# Concatenate imputed temperature and humidity values into a single DataFrame
imputed_data = pd.concat([test_temp, test_humid[['humidity']]], axis=1)

# Evaluate model on test data
test_rmse_temp = ((test_data['temperature'] - imputed_data['temperature'])**2).mean()**0.5
test_rmse_humid = ((test_data['humidity'] - imputed_data['humidity'])**2).mean()**0.5

print(f"Test RMSE for temperature: {test_rmse_temp:.2f}")
print(f"Test RMSE for humidity: {test_rmse_humid:.2f}")



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').