In [96]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from statsmodels import api
from scipy import stats
from scipy.optimize import minimize 

In [97]:
# Seed NumPy's global generator so every random draw in the notebook is reproducible.
np.random.seed(seed=123)

In [98]:
# generate two independent variables; x has shape (2, 200), one row per feature
x = np.random.rand(2, 200)

# generate the additive noise term; note its mean is 10 (not 0), so it also
# plays the role of the true intercept of the data-generating process
e = np.random.normal(10, 5, 200)

# draw the ground-truth slopes and keep them, so that the fitted estimates
# can later be compared against the values that actually generated y
true_coefs = np.random.uniform(1.0, 15.0, size=2)

# generate the response: linear signal plus noise
y = np.dot(x.T, true_coefs) + e

# collect the features and the response in a single frame
df = pd.DataFrame({'x1': x[0], 'x2': x[1], 'y': y})
df

Unnamed: 0,x1,x2,y
0,0.696469,0.542636,9.461113
1,0.286139,0.066774,10.052146
2,0.226851,0.653365,16.571117
3,0.551315,0.996086,24.532979
4,0.719469,0.769397,19.908888
...,...,...,...
195,0.635900,0.360424,12.516907
196,0.032198,0.210653,9.317869
197,0.744781,0.421200,15.450712
198,0.472913,0.218035,11.820892


In [99]:
# design matrix: the two regressors plus a constant column for the intercept
features = api.add_constant(df.loc[:, ['x1', 'x2']])

# ordinary least squares fit of y on the design matrix
model = api.OLS(endog=y, exog=features).fit()
model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.33
Model:,OLS,Adj. R-squared:,0.324
Method:,Least Squares,F-statistic:,48.58
Date:,"Thu, 06 Apr 2023",Prob (F-statistic):,7.07e-18
Time:,22:41:28,Log-Likelihood:,-601.16
No. Observations:,200,AIC:,1208.0
Df Residuals:,197,BIC:,1218.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,9.0629,0.953,9.507,0.000,7.183,10.943
x1,4.3478,1.311,3.316,0.001,1.762,6.933
x2,11.0557,1.182,9.350,0.000,8.724,13.388

0,1,2,3
Omnibus:,0.413,Durbin-Watson:,1.936
Prob(Omnibus):,0.813,Jarque-Bera (JB):,0.558
Skew:,-0.069,Prob(JB):,0.757
Kurtosis:,2.781,Cond. No.,5.56


In [100]:
# residuals of the OLS fit and their standard deviation; ddof=0 (NumPy's
# default) divides by the number of observations
res = model.resid
standard_dev = np.std(res, ddof=0)
standard_dev

4.888384545924403

Suppose we have $n$ observations and $p$ features (i.e., $p$ coefficients) and denote $X_i = (x_{i1}, \dots, x_{ip})^T$

$$f(y_i|\beta, \sigma^2) = N (X_i^T\beta, \sigma^2), \text{ where } \beta = (\beta_1, \dots, \beta_j, \dots, \beta_p)^T$$
$$ \beta_j \sim N(\mu_j, \eta_j^2), \text{ where } \mu = (\mu_1, \dots, \mu_p), \text{ and } \eta = (\eta_1, \dots, \eta_p)$$

Then the joint density of the data and the coefficients is
$$L(\beta, \sigma, \mu, \eta) = (\sqrt{2\pi}\sigma)^{-n}\exp\left\{-\frac{\sum_{i=1}^n(y_i - X_i^T\beta)^2}{2\sigma^2}\right\} \times \prod_{j=1}^p (\sqrt{2\pi}\eta_j)^{-1} \exp\left\{-\frac{(\beta_j - \mu_j)^2}{2\eta_j^2}\right\}$$

In [101]:
# MLE function
def MLE_Norm(parameters):
    """Negative Gaussian log-likelihood of the notebook's ``y`` given ``x``.

    Reads the module-level arrays ``x`` (one row per feature, one column per
    observation) and ``y`` (one value per observation).

    Parameters
    ----------
    parameters : sequence of float
        Either ``(beta_1, ..., beta_p, std_dev)`` for a model without an
        intercept, or ``(intercept, beta_1, ..., beta_p, std_dev)`` for a
        model with one, where ``p == x.shape[0]``. The last entry is always
        the standard deviation of the noise.

    Returns
    -------
    float
        ``-sum(log N(y_i | prediction_i, std_dev))``. ``inf`` is returned when
        ``std_dev`` is not strictly positive (or is NaN), so that an optimizer
        treats such points as infeasible instead of receiving NaN.

    Raises
    ------
    ValueError
        If ``parameters`` has neither ``p + 1`` nor ``p + 2`` entries.
    """
    *coefs, std_dev = parameters
    coefs = np.asarray(coefs, dtype=float)

    n_features = x.shape[0]
    has_intercept = coefs.size == n_features + 1
    if not has_intercept and coefs.size != n_features:
        raise ValueError(
            "expected %d or %d parameters, got %d"
            % (n_features + 1, n_features + 2, coefs.size + 1)
        )

    # A normal density is only defined for a positive scale; scipy returns NaN
    # otherwise, which would derail the optimizer. `not (> 0)` also catches NaN.
    if not std_dev > 0:
        return np.inf

    # predict the output
    if has_intercept:
        pred = coefs[0] + np.dot(x.T, coefs[1:])
    else:
        pred = np.dot(x.T, coefs)

    # Calculate the log-likelihood for normal distribution
    LL = np.sum(stats.norm.logpdf(y, pred, std_dev))

    # Calculate the negative log-likelihood
    neg_LL = -1*LL
    return neg_LL

In [102]:
 # minimize arguments: objective, initial guess of the parameters, method, bounds.
 # The last parameter is the noise standard deviation, which must stay strictly
 # positive for the normal log-density to be defined, so it gets a lower bound;
 # the regression coefficients are left unbounded.
 mle_model = minimize(
     MLE_Norm,
     np.array([2, 2, 2]),
     method='L-BFGS-B',
     bounds=[(None, None), (None, None), (1e-6, None)],
 )
 mle_model.x

array([16.66478861, 18.78203099,  7.66490432])