In [1]:
# Optional one-time setup: uncomment the line below if `pymc` is not already
# installed in the environment this notebook runs in.
#!pip install pymc

In [2]:
import pandas as pd
import numpy as np
import pymc as pm

# Read the training set (has `x` and `y`) and the test set (has `x` only)
# from CSV files located in the current working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train_reg.csv', 'test_reg.csv')
)

# Final expression of the cell: lets the notebook render the training frame.
train_data

Unnamed: 0,x,y
0,0.415709,2.267527
1,2.078132,5.761260
2,6.152402,17.237352
3,7.064143,18.693992
4,8.140977,23.252819
...,...,...
95,7.213907,19.271893
96,-0.173887,0.688316
97,3.423655,9.814922
98,9.486892,29.445184


In [3]:
# Preview the first five rows of the test set.
test_data.head(n=5)

Unnamed: 0,x
0,5.913738
1,2.657985
2,-0.666239
3,9.891288
4,7.224223


In [4]:
# Bayesian linear regression: y ~ Normal(Intercept + Coef * x, sigma=1).
with pm.Model() as model:

    # Normal(0, 10) priors on the intercept and on the slope.
    intercept = pm.Normal('Intercept', mu=0, sigma=10)
    # `shape=1` is kept so the posterior variable `Coef` retains the extra
    # length-1 dimension it had before this change.
    coef = pm.Normal('Coef', mu=0, sigma=10, shape=1)

    # Linear predictor evaluated on the training inputs.
    y_pred = intercept + coef * train_data['x']

    # Gaussian likelihood with the noise scale fixed at 1 (it is not inferred).
    likelihood = pm.Normal('y', mu=y_pred, sigma=1, observed=train_data['y'])

    # 1000 tuning steps followed by 1000 kept draws per chain; the number of
    # chains is left at PyMC's default. The fixed seed makes the draws -- and
    # everything computed from them later -- repeatable across
    # "Restart Kernel -> Run All".
    trace = pm.sample(1000, tune=1000, random_seed=42)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (3 chains in 3 jobs)
NUTS: [Intercept, Coef]


Sampling 3 chains for 1_000 tune and 1_000 draw iterations (3_000 + 3_000 draws total) took 11 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics


In [5]:
# Print the plain-text summary of the posterior group of the sampling result
# (its dimensions, coordinates, and the sampled variables).
print(trace.posterior)

<xarray.Dataset>
Dimensions:     (chain: 3, draw: 1000, Coef_dim_0: 1)
Coordinates:
  * chain       (chain) int64 0 1 2
  * draw        (draw) int64 0 1 2 3 4 5 6 7 ... 992 993 994 995 996 997 998 999
  * Coef_dim_0  (Coef_dim_0) int64 0
Data variables:
    Intercept   (chain, draw) float64 -0.9238 -0.6532 ... -0.6396 -0.5661
    Coef        (chain, draw, Coef_dim_0) float64 3.131 3.082 ... 3.089 3.098
Attributes:
    created_at:                 2023-11-14T10:39:56.885525
    arviz_version:              0.16.0
    inference_library:          pymc
    inference_library_version:  5.7.2
    sampling_time:              11.468213081359863
    tuning_steps:               1000


In [6]:
# Pull the raw posterior draws of the slope and the intercept out of the
# trace as plain arrays.
coefs, inter = (
    trace.posterior[var_name].values for var_name in ("Coef", "Intercept")
)

In [7]:
# Collapse all posterior draws of each parameter into a single point estimate.
coef_mean = coefs.mean()
intercept_mean = inter.mean()

# Point prediction for every test input using the posterior-mean line.
y_pred_test = test_data['x'] * coef_mean + intercept_mean
print(y_pred_test)

0     17.538740
1      7.486136
2     -2.777881
3     29.819996
4     21.585051
        ...    
95    27.665781
96    31.964309
97     4.975009
98     0.940090
99    19.059985
Name: x, Length: 100, dtype: float64


In [8]:
from sklearn.metrics import mean_squared_error
# Load the held-out targets for the test set from the working directory.
y_true = pd.read_csv('test_Y_true_reg.csv')

# Mean squared error of the point predictions; as the cell's last expression
# it is shown as the cell output.
mean_squared_error(y_true=y_true, y_pred=y_pred_test)

2.3647319728491563