In [3]:
import arviz as az
import numpy as np
import pandas as pd
import polars as ps
import pymc as pm
from sklearn.model_selection import train_test_split
# from theano import shared

AttributeError: partially initialized module 'theano' has no attribute 'compile' (most likely due to a circular import)

In [None]:
# Render matplotlib figures inline in the notebook output area.
%matplotlib inline
# Ask the inline backend for 'retina' (high-DPI) figure output.
%config InlineBackend.figure_format = 'retina'

In [None]:
# Positional column dtypes handed to the CSV reader (23 entries in total):
# 8 integer columns, 11 float columns, 1 integer column, 3 float columns.
type_map: list = (
    [ps.Int32] * 8
    + [ps.Float32] * 11
    + [ps.Int32]
    + [ps.Float32] * 3
)

In [None]:
# Load the training CSV with the explicit per-column dtypes from `type_map`.
dataset: ps.DataFrame = ps.read_csv(
    'full_dataset_train.csv',
    dtypes=type_map,
    encoding='utf8',
    n_threads=8,
    use_pyarrow=True,
)

In [None]:
# Work on the first 1000 rows only.
dataset = dataset.head(1000)

In [None]:
# Preview the first five rows of the (truncated) frame.
dataset.head(n=5)

# Features

In [None]:
# Column that the model is fitted to predict.
target_feature: str = 'PM2.5'

# Predictor columns, in the order used to build the design matrix.
features: list = [
    'AQI',
    'Status',
    'PM10',
    'PM10_AVG',
    'PM2.5_AVG',
]

In [None]:
# Design matrix: one row per observation, one column per entry of `features`.
feature_columns = [dataset[name].to_numpy() for name in features]
variables: np.ndarray = np.stack(feature_columns, axis=1)

In [None]:
# Target as a single-column 2-D array (one row per observation).
target: np.ndarray = dataset[target_feature].to_numpy().reshape(-1, 1)

In [None]:
# 60/40 shuffled train/test split with a fixed seed so the split is repeatable.
(train_variables, test_variables,
 train_target, test_target) = train_test_split(
    variables,
    target,
    test_size=0.4,
    shuffle=True,
    random_state=37710,
)

In [None]:
# Flip every split so that observations run along the last axis; the model
# cell computes dot(coefficients, train_variables), which needs this layout.
# NOTE: this cell is not idempotent -- running it twice flips the arrays back.
train_variables, test_variables = train_variables.T, test_variables.T
train_target, test_target = train_target.T, test_target.T

# Model

In [None]:
# Bayesian linear regression of the target on the selected features.
# `train_variables` is laid out as (features, observations) and `train_target`
# as (1, observations) by the preceding transpose cell.
with pm.Model() as predictor:
	# Priors. `sigma` is the keyword the `pymc` package expects; the legacy
	# PyMC3 `sd` alias is deprecated and not accepted by current releases.
	intercept = pm.Normal(name='intercept', mu=0, sigma=10)
	# One coefficient per row of the design matrix instead of a hard-coded 5.
	coefficients = pm.Normal(name='coefficients', mu=0, sigma=1, shape=train_variables.shape[0])
	error = pm.HalfCauchy(name='error', beta=5)

	# Linear predictor, stored in the trace under the name 'model'.
	model = pm.Deterministic('model', intercept + pm.math.dot(coefficients, train_variables))

	# Gaussian likelihood around the linear predictor.
	prediction = pm.Normal(name='prediction', mu=model, sigma=error, observed=train_target)

	# Seeded so that Restart & Run All reproduces the same draws.
	trace = pm.sample(tune=2000, chains=2, cores=8, random_seed=37710)
	# The old `samples=` keyword is no longer part of this function; one
	# posterior-predictive draw is generated per posterior draw in `trace`.
	ppc = pm.sample_posterior_predictive(trace, random_seed=37710)

# Performance Analysis

In [None]:
# Trace plots for the regression parameters only (the 'model' deterministic
# is left out).
az.plot_trace(
    trace,
    var_names=['intercept', 'coefficients', 'error'],
)

In [None]:
# Posterior summary table; rows follow the order of `var_names`
# (intercept, then the coefficients, then error).
summary: pd.DataFrame = az.summary(
    trace,
    var_names=['intercept', 'coefficients', 'error'],
)
summary

## MSE

In [None]:
# Mean squared error of the posterior-mean linear formula on the held-out set.
#
# Point estimates are read positionally from the summary table, whose rows
# follow the requested var_names order: intercept first, then one row per
# coefficient.
posterior_means: np.ndarray = summary['mean'].to_numpy()
formula_intercept: float = float(posterior_means[0])
formula_coefficients: np.ndarray = posterior_means[1:1 + len(features)]

# After the transpose cell, test_variables is (features, observations) and
# test_target is (1, observations); flatten the target to one value per
# observation so it lines up with the predictions.
observed_values: np.ndarray = np.asarray(test_target).ravel()
number_of_test_data: int = observed_values.size

# Same linear predictor as in the model: intercept + dot(coefficients, X).
predicted_values: np.ndarray = formula_coefficients @ np.asarray(test_variables) + formula_intercept

# Plain Python float so it can be formatted with '{:.2f}'.
mean_squared_error: float = float(np.mean((predicted_values - observed_values) ** 2))

In [None]:
# Report the test-set MSE rounded to two decimals.
print(f'MSE:{mean_squared_error:.2f}')