# Statistical Models for Data Science HW 3
### Question 03: Multiple Linear Regression - Optimization

In [45]:
import numpy as np
import pandas as pd

np.random.seed(42)

# Number of observations
n = 100

# independent variables
x1 = np.random.normal(0, 1, n)
x2 = np.random.normal(0, 1, n)

# error term
e = np.random.normal(0, 1, n)

# dependent variable using the linear regression model
y = 1 + 2 * x1 - 0.5 * x2 + e

# dataframe
df = pd.DataFrame({
    'x1': x1,
    'x2': x2,
    'y': y
})

#### 0.  Estimate the parameter of the model using statsmodels in Python

In [47]:
import statsmodels.api as sm 

X = sm.add_constant(df[['x1', 'x2']])
model = sm.OLS(df['y'], X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.802
Model:                            OLS   Adj. R-squared:                  0.798
Method:                 Least Squares   F-statistic:                     196.8
Date:                Tue, 22 Oct 2024   Prob (F-statistic):           7.27e-35
Time:                        15:59:14   Log-Likelihood:                -147.62
No. Observations:                 100   AIC:                             301.2
Df Residuals:                      97   BIC:                             309.1
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0886      0.108     10.058      0.0

#### 1. Create a table that includes the observations for the dependent variable, the prediction, the intercept, each slope coefficient multiplied by their associated independent variable observations, and the residuals.

In [52]:
df['y_hat'] = model.predict(X)
df['b0'] = model.params['const']
df['b1_x1'] = model.params['x1'] * df['x1']
df['b2_x2'] = model.params['x2'] * df['x2']
df['e_hat'] = df['y'] - df['y_hat']
df

Unnamed: 0,x1,x2,y,y_hat,b0,b1_x1,b2_x2,e_hat
0,0.496714,-1.415371,3.058901,2.919440,1.088649,1.105730,0.725062,0.139461
1,-0.138264,-0.420645,1.494579,0.996347,1.088649,-0.307789,0.215487,0.498232
2,0.647689,-0.342715,3.549786,2.706026,1.088649,1.441813,0.175565,0.843760
3,1.523030,-0.802277,5.501000,4.890037,1.088649,3.390401,0.410988,0.610963
4,-0.234153,-0.161286,-0.765333,0.650025,1.088649,-0.521246,0.082623,-1.415359
...,...,...,...,...,...,...,...,...
95,-1.463515,0.385317,-2.812598,-2.366656,1.088649,-3.257915,-0.197389,-0.445943
96,0.296120,-0.883857,2.933769,2.200619,1.088649,0.659190,0.452780,0.733151
97,0.261055,0.153725,1.752548,1.591031,1.088649,0.581132,-0.078750,0.161516
98,0.005113,0.058209,1.793985,1.070213,1.088649,0.011383,-0.029819,0.723772


#### 2. Verify that the predictions plus the residuals sum to the dependent variable for each observation.

In [56]:
df['y_hat + e_hat'] = df['y_hat'] + df['e_hat']
df

Unnamed: 0,x1,x2,y,y_hat,b0,b1_x1,b2_x2,e_hat,y_hat + e_hat
0,0.496714,-1.415371,3.058901,2.919440,1.088649,1.105730,0.725062,0.139461,3.058901
1,-0.138264,-0.420645,1.494579,0.996347,1.088649,-0.307789,0.215487,0.498232,1.494579
2,0.647689,-0.342715,3.549786,2.706026,1.088649,1.441813,0.175565,0.843760,3.549786
3,1.523030,-0.802277,5.501000,4.890037,1.088649,3.390401,0.410988,0.610963,5.501000
4,-0.234153,-0.161286,-0.765333,0.650025,1.088649,-0.521246,0.082623,-1.415359,-0.765333
...,...,...,...,...,...,...,...,...,...
95,-1.463515,0.385317,-2.812598,-2.366656,1.088649,-3.257915,-0.197389,-0.445943,-2.812598
96,0.296120,-0.883857,2.933769,2.200619,1.088649,0.659190,0.452780,0.733151,2.933769
97,0.261055,0.153725,1.752548,1.591031,1.088649,0.581132,-0.078750,0.161516,1.752548
98,0.005113,0.058209,1.793985,1.070213,1.088649,0.011383,-0.029819,0.723772,1.793985


#### 2. Verify that the sum of the residuals equal zero.

In [112]:
np.sum(df['e_hat'])

-1.9539925233402755e-14

In [68]:
np.sum(df['y']), np.sum(df['y_hat'])

(84.60509247872987, 84.6050924787299)

#### 3. Verify that the sample covariance of the residuals and the observation is zero for each independent variable.

In [76]:
df['e_hat_x1'] = df['e_hat'] * df['x1']
df['e_hat_x2'] = df['e_hat'] * df['x2']

# Sample covariance: Mean of the observation covariance
df['e_hat_x1'].mean(), df['e_hat_x2'].mean()

(-9.348077867343818e-16, 8.926193117986258e-16)

#### 4. Verify that the average of the dependent variable observations equal the average of the predictions within the sample.

In [115]:
df['y'].mean(), df['y_hat'].mean()

(0.8460509247872987, 0.846050924787299)

#### 5. Compute SSE, SST, SSR. Verify that SST = SSE + SSR.

In [83]:
# SSE: Sum of Squares Error
SSE = np.sum((df['e_hat']) **2)

# SSR: Sum of Squares Regression
SSR = np.sum((df['y_hat'] - np.mean(df['y'])) ** 2)

# SST: Total Sum of Squares
SST = np.sum((df['y'] - np.mean(df['y'])) ** 2)

SSE, SSR, SST, SSE * SSR

(112.13899436528182, 454.93265048917874, 567.07164485446, 51015.689929788736)

#### 6. Compute the $R^2$ manually. Verify that it matches the value from the model output.

In [117]:
R_squared = 1 - (SSE / SST)
R_squared, model.rsquared

(0.8022489832055304, 0.8022489832055304)

### Question 04: Gauss-Markov Assumptions / Theorem
#### 0. In the context of a linear regression. What is an estimator?
A
#### 1. What is a linear estimator?
C
#### 2. What is an unbiased estimator?
D
#### 3. What is a best estimator?
A
#### 4. True or False. Under the Gauss-Markov assumptions 1-4, the OLS is an unbiased estimator.
True
#### 5. True or False. Under the Gauss-Markov assumptions 1-5, the OLS is an unbiased linear estimator with the lowest variance among all estimators.
True

### Question 05: Gauss-Markov Assumptions / Theorem
#### 0. Which statement below is true?
D