

### Main Objectives
    1. Linear and Polynomial Model
    2. Model Comparison wrt Predictive Accuracy
    3. Model Comparison wrt Complexity

In [None]:
# Import all necessary libraries
import pymc as pm
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import arviz as az
az.style.use('arviz-darkgrid')

## Import Fish Dataset

In [None]:
# Read the fish measurements from the CSV file (relative path) into a DataFrame
# and preview its first five rows.
fish_data = pd.read_csv("fish.csv")
fish_data.head(5)

Unnamed: 0,Species,Weight,Length,Diagonal,Height,Width
0,Bream,242.0,25.4,30.0,11.52,4.02
1,Bream,290.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,26.5,31.1,12.3778,4.6961
3,Bream,363.0,29.0,33.5,12.73,4.4555
4,Bream,430.0,29.0,34.0,12.444,5.134


#### Independent Variable: Fish Width
#### Dependent Variable: Fish Height

In [None]:
# Pull the two measurement columns out of the table as standalone NumPy arrays:
# fish height (dependent variable) and fish width (independent variable).
# copy=True keeps the arrays independent of the DataFrame's own storage.
f_height = fish_data['Height'].to_numpy(copy=True)
f_width = fish_data['Width'].to_numpy(copy=True)

In [None]:
## Standardize both variables: subtract the mean and divide by the
## (population) standard deviation, giving zero mean and unit variance.
f_height = (f_height - np.mean(f_height)) / np.std(f_height)
f_width = (f_width - np.mean(f_width)) / np.std(f_width)

In [None]:
## Linear Model Definition
# Height is modelled as a straight-line function of width:
#   μ = α + β * width,   height ~ Normal(μ, ϵ)

with pm.Model() as model_l:
    # Priors for the intercept, the slope and the observation noise scale
    α = pm.Normal('α', mu=0, sigma=1)
    β = pm.Normal('β', mu=0, sigma=10)
    ϵ = pm.HalfNormal('ϵ', 5)

    # Expected (standardized) height for each fish
    μ = α + β * f_width

    # Likelihood of the observed heights
    y_pred = pm.Normal('y_pred', mu=μ, sigma=ϵ, observed=f_height)

    # A fixed seed makes the draws reproducible under "Restart & Run All".
    # The pointwise log-likelihood is stored because the LOO-based model
    # comparison later in the notebook requires it.
    idata_l = pm.sample(
        2000,
        random_seed=42,
        idata_kwargs={'log_likelihood': True},
    )

## Polynomial Model Definition
# Height is modelled as a quadratic function of width:
#   μ = α + β[0] * width + β[1] * width²,   height ~ Normal(μ, ϵ)

with pm.Model() as model_p:
    # Priors for the intercept, the two polynomial coefficients and the
    # observation noise scale
    α = pm.Normal('α', mu=0, sigma=1)
    β = pm.Normal('β', mu=0, sigma=10, shape=2)
    ϵ = pm.HalfNormal('ϵ', 5)

    # Expected (standardized) height: linear term plus squared term
    μ = α + β[0]*f_width + β[1]*(np.square(f_width))

    # Likelihood of the observed heights
    y_pred = pm.Normal('y_pred', mu=μ, sigma=ϵ, observed=f_height)

    # A fixed seed makes the draws reproducible under "Restart & Run All".
    # The pointwise log-likelihood is stored because the LOO-based model
    # comparison later in the notebook requires it.
    idata_p = pm.sample(
        2000,
        random_seed=42,
        idata_kwargs={'log_likelihood': True},
    )


### <font color='Green'> Comparison here </font>

In [None]:
# Rank the two models by leave-one-out expected log predictive density
# (log scale: a higher elpd_loo is better). BB-pseudo-BMA relies on a Bayesian
# bootstrap, so the seed is fixed to keep the bootstrap-derived columns
# (e.g. the model weights) reproducible between runs.
cmp_df = az.compare(
    {'model_l': idata_l, 'model_p': idata_p},
    ic="loo",
    method='BB-pseudo-BMA',
    seed=42,
    scale="log",
)
cmp_df

Unnamed: 0,rank,elpd_loo,p_loo,elpd_diff,weight,se,dse,warning,scale
model_p,0,-145.376065,2.997476,0.0,0.920087,7.220625,0.0,False,log
model_l,1,-149.730877,2.596228,4.354812,0.079913,7.065858,2.540225,False,log


<a href="https://python.arviz.org/en/latest/api/generated/arviz.compare.html">For more detail</a>

## <font color='Green'> Question 1 </font>

##### <font color='Green'>Based on the above table, which model do you think is better from predictive accuracy point of view and why? </font>

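On the log scale, a larger (less negative) **elpd_loo** means better out-of-sample predictive accuracy. Comparing the two models, model\_p has the larger elpd\_loo (-145.38 vs. -149.73 for model\_l), so model\_p is better from a predictive-accuracy point of view. Note, however, that the difference (elpd\_diff ≈ 4.35) is less than twice its standard error (dse ≈ 2.54), so the advantage is modest.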

If the scale were "negative_log" or "deviance" instead, smaller values would indicate the better model; the ranking itself would not change, so model\_p is better in every case.

## <font color='Green'> Question 2    </font>

Comparing **p\_loo** (the estimated effective number of parameters) for the two models, model\_l has the smaller value (≈2.60 vs. ≈3.00 for model\_p), so model\_l is the less complex model.