In [2]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to local .py files take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np

Our likelihood function is

$$p(y | X, \beta ,\sigma^2) \propto (\sigma^2)^{-n/2} \exp \left[-\frac{1}{2\sigma^2} (y-X\beta)^T (y-X\beta) \right] $$

Taking our priors as $\pi(\sigma^2) \propto \frac{1}{\sigma^2}$ and $\pi(\beta) \propto 1$ gives the posterior:

$$p(\sigma^2, \beta | X, y) \propto  (\sigma^2)^{-n/2 - 1} \exp \left[-\frac{1}{2\sigma^2} (y-X\beta)^T (y-X\beta) \right] $$

The full conditional distributions are then:

$$p(\beta | y, X, \sigma^2) \propto \exp \left[-\frac{1}{2\sigma^2} (y-X\beta)^T(y-X\beta) \right] $$

$$p(\sigma^2 | y, X, \beta) \propto (\sigma^2)^{-n/2 - 1} \exp \left[-\frac{1}{2\sigma^2} (y-X\beta)^T (y-X\beta) \right] $$

$$\sigma^2 | y, X, \beta \sim IG \left(\frac{n}{2}, \frac{(y-X\beta)^T (y-X\beta)}{2} \right)$$


In [142]:
# Load the raw dataset from a path relative to the notebook's working directory;
# the file is parsed with ';' as the field separator instead of the default ','.
df = pd.read_csv('../student/student-mat.csv', sep=";")

In [143]:
# Display the loaded frame as the cell output to inspect its columns and rows.
df

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


In [144]:
from preprocessing import prepare_data

In [145]:
# Split the raw frame into predictors (`features`) and response (`target`) using the
# project-local prepare_data helper.
# NOTE(review): the helper's body is not visible here. The `intercept` and dummy-style
# columns (e.g. `school_MS`, `guardian_mother`) that appear in later output are
# presumably created by it — confirm, and check whether every level of a categorical
# is encoded alongside the intercept.
features, target = prepare_data(df)

In [146]:
# Convert the prepared data into the objects passed to the sampler.
# NOTE(review): NumPy's documentation discourages np.matrix in favour of regular
# ndarrays. model_code may rely on matrix semantics (e.g. `*` as matrix product),
# so confirm against that module before switching to np.asarray.
X = np.matrix(features)
# `.values` extracts the underlying array from `target` (presumably a pandas object).
y = target.values

In [149]:
from model_code import MH_regression, Gibbs_regression

In [154]:
# Seed NumPy's global RNG before sampling so the run is repeatable
# (assumes MH_regression draws from np.random — TODO confirm).
np.random.seed(seed=870)
# Run the Metropolis-Hastings sampler from model_code.
# NOTE(review): the positional arguments 20000 and 0.00004 are presumably the number
# of iterations and the proposal step size — confirm against MH_regression's signature.
betas, accept = MH_regression(X, y, 20000, 0.00004)
# Print the mean of the second return value (presumably the acceptance rate).
print(np.mean(accept))

0.33345


In [155]:
# Thin the chain by keeping every 10th column of `betas`; the transpose below puts one
# kept draw per row and one coefficient per column, labelled with the feature names.
# NOTE(review): the slice starts at draw 0, so no burn-in is discarded here — confirm
# whether MH_regression already drops it.
thinned_betas = betas[:, 0::10]
results = pd.DataFrame(data=thinned_betas.T, columns=features.columns)

In [160]:
# Per-coefficient 5% / 50% / 95% quantiles of the thinned draws; the result is indexed
# by the quantile level, with one column per feature.
quants = results.quantile(q=[0.05, 0.5, 0.95])

In [159]:
# Write the quantile table to a CSV file in the current working directory.
# NOTE(review): this cell's execution count (159) is lower than that of the cell that
# defines `quants` (160), so the file on disk may predate the `quants` shown in the
# notebook — re-run top to bottom before relying on the saved file.
quants.to_csv('mh_feature_quantiles.csv')

In [172]:
# Rank coefficients by the magnitude of their median (0.5 quantile) and show the
# 30 largest; the displayed values are absolute, so the sign is not shown.
quants.loc[0.5].abs().nlargest(30)

intercept            5.289151
G2                   3.596550
guardian_mother      1.891200
guardian_other       1.703173
guardian_father      1.694790
reason_other         1.572532
Fjob_health          1.426147
reason_reputation    1.394707
reason_course        1.265818
Mjob_other           1.142159
Mjob_services        1.115642
Fjob_at_home         1.095822
Mjob_at_home         1.067695
reason_home          1.056614
Mjob_teacher         1.042070
Fjob_other           1.011235
Fjob_teacher         0.982179
Mjob_health          0.921796
Fjob_services        0.773277
G1                   0.625854
school_MS            0.480825
schoolsup_yes        0.455821
absences             0.366815
activities_yes       0.345869
famrel               0.319316
romantic_yes         0.271787
Walc                 0.227236
higher_yes           0.226731
nursery_yes          0.222781
age                  0.220785
Name: 0.5, dtype: float64

In [171]:
# Display the full quantile table (rows: 0.05 / 0.5 / 0.95, columns: features).
quants

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,guardian_mother,guardian_other,schoolsup_yes,famsup_yes,paid_yes,activities_yes,nursery_yes,higher_yes,internet_yes,romantic_yes
0.05,-0.225039,0.136568,-0.150169,0.063714,-0.091636,-0.123433,0.315937,0.042795,0.009439,-0.169349,...,1.884392,1.691628,0.445588,0.169186,0.068141,-0.353038,-0.230606,0.211793,-0.154501,-0.280607
0.5,-0.220785,0.141979,-0.145612,0.067607,-0.087938,-0.119266,0.319316,0.046608,0.01325,-0.164476,...,1.8912,1.703173,0.455821,0.177003,0.075386,-0.345869,-0.222781,0.226731,-0.144025,-0.271787
0.95,-0.216889,0.147526,-0.140829,0.071315,-0.084029,-0.115364,0.322628,0.050496,0.017244,-0.159804,...,1.897628,1.714672,0.466968,0.184922,0.082638,-0.338093,-0.214627,0.241685,-0.134262,-0.263968
