In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel


In [2]:
# Load data
data = pd.read_csv('sensor_data.csv')

In [3]:
data.count()

ID             6583
timestamp      6583
latitude       6583
longitude      6583
temperature    6345
humidity       6321
dtype: int64

In [4]:
#preparing data for temperature mode
temp_data = data.copy()
temp_data.dropna(subset=['temperature'],inplace = True)

In [5]:
temp_data.interpolate(inplace=True)

In [6]:
temp_data.count()

ID             6345
timestamp      6345
latitude       6345
longitude      6345
temperature    6345
humidity       6345
dtype: int64

In [7]:
data_X_temp = temp_data[['latitude', 'longitude', 'humidity', 'timestamp']]
data_y_temp = temp_data['temperature']

In [8]:
data_X_temp.count()

latitude     6345
longitude    6345
humidity     6345
timestamp    6345
dtype: int64

In [10]:
# Define kernel functions
kernel_temp = ConstantKernel(1.0) * RBF(length_scale=1.0) + WhiteKernel(noise_level=1.0)


In [11]:
model_temp = GaussianProcessRegressor(kernel=kernel_temp)
model_temp.fit(data_X_temp, data_y_temp)



GaussianProcessRegressor(kernel=1**2 * RBF(length_scale=1) + WhiteKernel(noise_level=1))

In [12]:
#preparing data for humidity mode
humid_data = data.copy()
humid_data.dropna(subset=['humidity'],inplace = True)

In [13]:
humid_min = humid_data.loc[humid_data['humidity']>0,'humidity'].min()
print(humid_min)
humid_data.loc[humid_data['humidity']<0,'humidity'] = humid_min

1.781


In [14]:
humid_data.interpolate(inplace=True)

In [15]:
humid_data.count()

ID             6321
timestamp      6321
latitude       6321
longitude      6321
temperature    6321
humidity       6321
dtype: int64

In [18]:
data_x_humid = humid_data[['latitude', 'longitude', 'temperature', 'timestamp']]
data_y_humid = humid_data['humidity']

In [19]:
# Define kernel functions
kernel_humid = ConstantKernel(1.0) * RBF(length_scale=1.0) + WhiteKernel(noise_level=1.0)

#kernel_humid = RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e5)) + WhiteKernel(noise_level=1.0, noise_level_bounds=(1e-10, 1e+1))



In [20]:
model_humid = GaussianProcessRegressor(kernel=kernel_humid)
model_humid.fit(data_x_humid, data_y_humid)

GaussianProcessRegressor(kernel=1**2 * RBF(length_scale=1) + WhiteKernel(noise_level=1))

In [21]:
# final prediction for submission
pred_data = data.copy()
pred_data = pred_data[pred_data['temperature'].isna() | pred_data['humidity'].isna()]

In [22]:
pred_data.count()

ID             500
timestamp      500
latitude       500
longitude      500
temperature    262
humidity       238
dtype: int64

In [23]:
pred_temp = pred_data[pred_data['temperature'].isna()]

In [24]:
pred_temp.count()

ID             238
timestamp      238
latitude       238
longitude      238
temperature      0
humidity       238
dtype: int64

In [25]:
pred_humid = pred_data[pred_data['humidity'].isna()]

In [26]:
pred_humid.count()

ID             262
timestamp      262
latitude       262
longitude      262
temperature    262
humidity         0
dtype: int64

In [27]:
pred_temp.drop(['temperature'], axis=1,inplace=True)
pred_humid.drop(['humidity'], axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_temp.drop(['temperature'], axis=1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_humid.drop(['humidity'], axis=1,inplace=True)


In [28]:
pred_temp.count()

ID           238
timestamp    238
latitude     238
longitude    238
humidity     238
dtype: int64

In [29]:
pred_humid.count()

ID             262
timestamp      262
latitude       262
longitude      262
temperature    262
dtype: int64

In [30]:
pred_temp['prediction'] = model_temp.predict(pred_temp[['latitude', 'longitude', 'humidity', 'timestamp']])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_temp['prediction'] = model_temp.predict(pred_temp[['latitude', 'longitude', 'humidity', 'timestamp']])


In [31]:
pred_temp.count()


ID            238
timestamp     238
latitude      238
longitude     238
humidity      238
prediction    238
dtype: int64

In [32]:
pred_humid['prediction'] = model_temp.predict(pred_humid[['latitude', 'longitude', 'temperature', 'timestamp']])



Feature names unseen at fit time:
- temperature
Feature names seen at fit time, yet now missing:
- humidity

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_humid['prediction'] = model_temp.predict(pred_humid[['latitude', 'longitude', 'temperature', 'timestamp']])


In [33]:
pred_humid.count()

ID             262
timestamp      262
latitude       262
longitude      262
temperature    262
prediction     262
dtype: int64

In [35]:
submission_df = pd.concat([pred_temp[['ID','prediction']],pred_humid[['ID','prediction']]],axis=0).sort_values('ID')

In [36]:
submission_df.count()

ID            500
prediction    500
dtype: int64

In [37]:
submission_df.to_csv('submission_file.csv',index=False)