# Housing Example: Build a simple Linear Regression to predict housing prices. Use whatever library you like

In [23]:
import statsmodels.api as sm
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import sklearn 

In [4]:
# Fetch the 'Guerry' dataset from the R package 'HistData' and keep only the
# DataFrame carried on the returned dataset object.
data = sm.datasets.get_rdataset(dataname='Guerry', package='HistData').data

In [11]:
# Display the loaded DataFrame; the rendered output elides the middle rows.
data

Unnamed: 0,dept,Region,Department,Crime_pers,Crime_prop,Literacy,Donations,Infants,Suicides,MainCity,Wealth,Commerce,Clergy,Crime_parents,Infanticide,Donation_clergy,Lottery,Desertion,Instruction,Prostitutes,Distance,Area,Pop1831
0,1,E,Ain,28870,15890,37,5098,33120,35039,2:Med,73,58,11,71,60,69,41,55,46,13,218.372,5762,346.03
1,2,N,Aisne,26226,5521,51,8901,14572,12831,2:Med,22,10,82,4,82,36,38,82,24,327,65.945,7369,513.00
2,3,C,Allier,26747,7925,13,10973,17044,114121,2:Med,61,66,68,46,42,76,66,16,85,34,161.927,7340,298.26
3,4,E,Basses-Alpes,12935,7289,46,2733,23018,14238,1:Sm,76,49,5,70,12,37,80,32,29,2,351.399,6925,155.90
4,5,E,Hautes-Alpes,17488,8174,69,6962,23076,16171,1:Sm,83,65,10,22,23,64,79,35,7,1,320.280,5549,129.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,86,W,Vienne,15010,4710,25,8922,35224,21851,2:Med,68,43,71,20,1,44,40,38,65,18,170.523,6990,282.73
82,87,C,Haute-Vienne,16256,6402,13,13817,19940,33497,2:Med,67,63,76,68,6,78,55,11,84,7,198.874,5520,285.13
83,88,E,Vosges,18835,9044,62,4040,14978,33029,2:Med,82,42,51,58,34,5,14,85,11,43,174.477,5874,397.99
84,89,C,Yonne,18006,6516,47,4276,16616,12789,2:Med,30,15,55,32,22,35,51,66,27,272,81.797,7427,352.49


In [14]:
# First attempt: regress Literacy on Lottery and log(Pop1831) through the
# R-style formula interface, then print the text summary of the fit.
relationship = smf.ols(
    formula='Literacy ~ Lottery + np.log(Pop1831)',
    data=data,
).fit()
print(relationship.summary())

                            OLS Regression Results                            
Dep. Variable:               Literacy   R-squared:                       0.153
Model:                            OLS   Adj. R-squared:                  0.132
Method:                 Least Squares   F-statistic:                     7.481
Date:                Thu, 30 Nov 2023   Prob (F-statistic):            0.00103
Time:                        13:22:09   Log-Likelihood:                -359.88
No. Observations:                  86   AIC:                             725.8
Df Residuals:                      83   BIC:                             733.1
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          96.6935     33.587     

In [26]:
# NOTE(review): none of the cells visible above assigns `X` (it is first
# assigned in a later cell, as a 2-D array), so on a fresh Restart & Run All
# this cell raises NameError. The 86-element Series shown below came from
# earlier kernel state — TODO confirm its origin or remove this cell.
X

0     42.846525
1     57.240276
2     18.697966
3     51.049215
4     73.860587
        ...    
81    30.644492
82    18.652945
83    67.986427
84    52.865022
85    54.275100
Length: 86, dtype: float64

In [31]:
from sklearn.linear_model import LinearRegression

# Predictors: Literacy and log(Pop1831), stacked column-wise into a 2-column
# array. Response: Lottery.
x1 = data['Literacy']
x2 = np.log(data['Pop1831'])
X = np.column_stack((x1, x2))
y1 = data['Lottery']

# No constant column is added here: the estimator is built with its default
# settings (fit_intercept=True), so it fits the intercept itself.
result = LinearRegression().fit(X=X, y=y1)



In [35]:
# Inspect the estimator's constructor settings (hyperparameters); this does
# not show the fitted coefficients.
result.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}

In [39]:
# Same regression as the formula version, built from explicit arrays.
x_1 = data['Literacy']
x_2 = np.log(data['Pop1831'])

# Stack the two predictors and add the constant column in one step so that
# sm.OLS estimates an intercept (shown as `const` in the summary).
X = sm.add_constant(np.column_stack((x_1, x_2)))
y = data['Lottery']

results = sm.OLS(endog=y, exog=X).fit()
print(results.summary())

# In-sample error: mean of the squared residuals, and its square root.
mse = (results.resid ** 2).mean()
rmse = np.sqrt(mse)

print(mse)
print(rmse)

                            OLS Regression Results                            
Dep. Variable:                Lottery   R-squared:                       0.348
Model:                            OLS   Adj. R-squared:                  0.333
Method:                 Least Squares   F-statistic:                     22.20
Date:                Thu, 30 Nov 2023   Prob (F-statistic):           1.90e-08
Time:                        13:45:50   Log-Likelihood:                -379.82
No. Observations:                  86   AIC:                             765.6
Df Residuals:                      83   BIC:                             773.0
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        246.4341     35.233      6.995      0.0

- F-statistic: tests the overall significance of the regression model. A high F-statistic with a low p-value == better

- AIC: Akaike Information Criterion, useful for comparing different models on the same dataset; balances model fit and complexity, lower value == better

- BIC: Bayesian Information Criterion, useful for comparing different models on the same dataset. Lower values == better

## Solution

In [9]:
# ols = ordinary least squares.
# The formula is R-style: 'dependent_variable ~ independent_variable', with
# `+` joining multiple independent variables.
# np.log(...) inside the formula log-transforms the Pop1831 column.
results = smf.ols(
    formula='Lottery ~ Literacy + np.log(Pop1831)',
    data=data,
).fit()

In [10]:
# Print the plain-text regression summary table for the fitted model.
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                Lottery   R-squared:                       0.348
Model:                            OLS   Adj. R-squared:                  0.333
Method:                 Least Squares   F-statistic:                     22.20
Date:                Thu, 30 Nov 2023   Prob (F-statistic):           1.90e-08
Time:                        12:53:01   Log-Likelihood:                -379.82
No. Observations:                  86   AIC:                             765.6
Df Residuals:                      83   BIC:                             773.0
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept         246.4341     35.233     

In [17]:
# With numpy: build a synthetic design matrix for an OLS demonstration.
num_obs = 100

# Seeded generator so the random draws (and every number derived from them)
# are identical on each Restart & Run All.
rng = np.random.default_rng(42)

# Two independent variables, drawn uniformly from [0, 1).
X = rng.random((num_obs, 2))

# Prepend a column of ones so the model can estimate an intercept, i.e. the
# value of y when every independent variable is zero.
X = sm.add_constant(X)

# 1 is the true intercept; .1 and .5 are the true coefficients of the two
# independent variables.
beta = [1, .1, .5]

# e = rng.random(num_obs)

# # Take dot-product
# y = np.dot(X, beta) + e

# results = sm.OLS(y, X).fit()

# print(results.summary())



In [20]:
# Inspect the design matrix; the leading column of ones is the constant
# added by sm.add_constant.
X

array([[1.        , 0.11588308, 0.97016346],
       [1.        , 0.78912564, 0.95594132],
       [1.        , 0.11275949, 0.71380319],
       [1.        , 0.30358975, 0.40196255],
       [1.        , 0.38540916, 0.90067565],
       [1.        , 0.83730409, 0.06889008],
       [1.        , 0.86477314, 0.30695177],
       [1.        , 0.52807534, 0.73648519],
       [1.        , 0.46392528, 0.26969661],
       [1.        , 0.62238017, 0.88775321],
       [1.        , 0.04352956, 0.90345902],
       [1.        , 0.59192748, 0.98307733],
       [1.        , 0.65601057, 0.54391297],
       [1.        , 0.55320403, 0.01562237],
       [1.        , 0.68214965, 0.43810785],
       [1.        , 0.52013006, 0.24733944],
       [1.        , 0.10297945, 0.01963864],
       [1.        , 0.83403836, 0.9195652 ],
       [1.        , 0.74896279, 0.8853528 ],
       [1.        , 0.40801985, 0.04051185],
       [1.        , 0.20159823, 0.51563668],
       [1.        , 0.15748995, 0.83658271],
       [1.