In [41]:
import numpy as np
import pandas as pd
import pymc3 as pm
from sklearn.preprocessing import StandardScaler

# Load data
data = pd.read_csv("sensor_data.csv")

# Define variables
n_nodes = 5
n_obs = data.shape[0]
n_missing = data[['humidity', 'temperature']].isna().sum()

In [42]:
# Split data into training and test sets
train_data = data.dropna().reset_index(drop=True)
test_data = data.loc[data['humidity'].isna() | data['temperature'].isna() ].reset_index(drop=True)

In [43]:
test_data.count()

ID             500
timestamp      500
latitude       500
longitude      500
temperature    262
humidity       238
dtype: int64

In [44]:
train_data.count()


ID             6083
timestamp      6083
latitude       6083
longitude      6083
temperature    6083
humidity       6083
dtype: int64

In [45]:
data.count()

ID             6583
timestamp      6583
latitude       6583
longitude      6583
temperature    6345
humidity       6321
dtype: int64

In [46]:
# Normalize data
scaler = StandardScaler()
train_data[['humidity', 'temperature']] = scaler.fit_transform(train_data[['humidity', 'temperature']])
test_data[['humidity', 'temperature']] = scaler.transform(test_data[['humidity', 'temperature']])

In [47]:
train_data.to_csv('train_data.csv')

In [48]:
test_data.to_csv('test_data.csv')

In [52]:
# Define Bayesian linear regression model
with pm.Model() as model:
    # Priors for regression coefficients
    alpha = pm.Normal('alpha', mu=0, sd=10)
    beta_humidity = pm.Normal('beta_humidity', mu=0, sd=10)
    beta_temperature = pm.Normal('beta_temperature', mu=0, sd=10)
    # Prior for residual standard deviation
    sigma = pm.HalfNormal('sigma', sd=1)
    
    # Regression model
    mu = alpha + beta_humidity * train_data['humidity'] + beta_temperature * train_data['temperature']
    
    # Likelihood function
    y_obs = pm.Normal('y_obs', mu=mu, sd=sigma, observed=train_data['humidity'])
    
    # Imputation for missing values
    mu_missing = alpha + beta_humidity * test_data['humidity'] + beta_temperature * test_data['temperature']
    y_missing = pm.Normal('y_missing', mu=mu_missing, sd=sigma, shape=n_missing)
    
    # Sample from posterior distribution using Markov chain Monte Carlo (MCMC) method
    trace = pm.sample(draws=1000, tune=1000)
    

AttributeError: [unnamed] ~ Normal has no finite default value to use, checked: ('median', 'mean', 'mode'). Pass testval argument or adjust so value is finite.

In [40]:
    
# Predict missing values using posterior distribution
posterior_samples = pm.sample_posterior_predictive(trace, samples=1000, var_names=['y_missing'])['y_missing']
imputed_values = np.mean(posterior_samples, axis=0)

# Inverse transform normalization
imputed_values = scaler.inverse_transform(imputed_values.reshape(-1, 1)).flatten()

NameError: name 'trace' is not defined

In [None]:
# Update test data with imputed values
test_data['value'] = imputed_values

# Concatenate train and test data
imputed_data = pd.concat([train_data, test_data]).sort_index()

# Calculate training error
y_train = train_data['value']
y_pred_train = trace['alpha'] + trace['beta_humidity'] * train_data['humidity'] + trace['beta_temperature'] * train_data['temperature']
train_error = np.sqrt(np.mean((y_train - np.mean(y_pred_train, axis=0))**2))

print('Training error:', train_error)