# Estimation of linear regression coefficients using linear algebra.



In [3]:
import pandas as pd
import numpy as np

In [1]:
# Location of the input CSV file.
# Alternative data source, kept for reference:
#   path = 'drive/MyDrive/CarData/'
#   filename = 'ford.csv'
path, filename = '', 'merc.csv'
fullpath = f'{path}{filename}'
fullpath

'merc.csv'

In [5]:
# Read the CSV file at `fullpath` into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=fullpath)
df.head(n=5)

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,SLK,2005,5200,Automatic,63000,Petrol,325,32.1,1.8
1,S Class,2017,34948,Automatic,27000,Hybrid,20,61.4,2.1
2,SL CLASS,2016,49948,Automatic,6200,Petrol,555,28.0,5.5
3,G Class,2016,61948,Automatic,16000,Petrol,325,30.4,4.0
4,G Class,2016,73948,Automatic,4000,Petrol,325,30.1,4.0


In [6]:
# Size of the loaded data as (rows, columns).
df.shape

(13119, 9)

In [7]:
# Distinct values of the categorical 'model' column.
# As the output shows, most labels carry a leading space (e.g. ' SLK'); that space
# also ends up in the dummy-variable column names created later (e.g. 'model_ B Class').
df['model'].unique()

array([' SLK', ' S Class', ' SL CLASS', ' G Class', ' GLE Class',
       ' GLA Class', ' A Class', ' B Class', ' GLC Class', ' C Class',
       ' E Class', ' GL Class', ' CLS Class', ' CLC Class', ' CLA Class',
       ' V Class', ' M Class', ' CL Class', ' GLS Class', ' GLB Class',
       ' X-CLASS', '180', ' CLK', ' R Class', '230', '220', '200'],
      dtype=object)

In [8]:
# Distinct values of the categorical 'transmission' column.
df['transmission'].unique()

array(['Automatic', 'Manual', 'Semi-Auto', 'Other'], dtype=object)

In [9]:
# Distinct values of the categorical 'fuelType' column.
df['fuelType'].unique()

array(['Petrol', 'Hybrid', 'Diesel', 'Other'], dtype=object)

In [10]:
# Model, transmission and fuel type are categorical variables.
# Categorical variables cannot enter the regression model directly, so each one is
# first encoded as a set of binary (0/1) dummy variables.
# Each category becomes a column and the first category is dropped (drop_first=True);
# the dropped category serves as the baseline and keeping it would make the dummies
# perfectly collinear with the intercept. Hence n categories yield n-1 dummy variables.
# dtype=int forces numeric 0/1 columns: newer pandas releases otherwise emit boolean
# dummies, which turn the design matrix into an object array once it is combined with
# the numeric columns and break the matrix algebra and the model fit further down.
df_encoded = pd.get_dummies(data=df, drop_first=True, dtype=int)
df_encoded.head()

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,model_ B Class,model_ C Class,model_ CL Class,model_ CLA Class,...,model_180,model_200,model_220,model_230,transmission_Manual,transmission_Other,transmission_Semi-Auto,fuelType_Hybrid,fuelType_Other,fuelType_Petrol
0,2005,5200,63000,325,32.1,1.8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2017,34948,27000,20,61.4,2.1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,2016,49948,6200,555,28.0,5.5,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2016,61948,16000,325,30.4,4.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2016,73948,4000,325,30.1,4.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [11]:
# Size after encoding: the row count is unchanged, while the categorical columns
# have been replaced by their dummy-variable columns.
df_encoded.shape

(13119, 38)

In [12]:
# Dependent variable (the quantity to predict): the 'price' column.
Y = df_encoded.loc[:, 'price']

In [13]:
# All other variables are independent variables (regressors).
# drop() returns a new, independent DataFrame, so adding the intercept column to X
# afterwards neither touches df_encoded nor triggers the SettingWithCopyWarning
# that assigning into a .loc slice of df_encoded can raise.
X = df_encoded.drop(columns='price')

In [14]:
# Augment the X matrix by adding a column of ones for the intercept term 'a'.
# The column is appended as the last column, so the intercept is the last
# entry of the estimated coefficient vector.
X['CONST'] = 1

In [15]:
X.head()

Unnamed: 0,year,mileage,tax,mpg,engineSize,model_ B Class,model_ C Class,model_ CL Class,model_ CLA Class,model_ CLC Class,...,model_200,model_220,model_230,transmission_Manual,transmission_Other,transmission_Semi-Auto,fuelType_Hybrid,fuelType_Other,fuelType_Petrol,CONST
0,2005,63000,325,32.1,1.8,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,2017,27000,20,61.4,2.1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
2,2016,6200,555,28.0,5.5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
3,2016,16000,325,30.4,4.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
4,2016,4000,325,30.1,4.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [16]:
# Keep the column names (a pandas Index, not a plain Python list) so that the
# estimated coefficients can be labelled later; the labels are lost once X is
# converted to a NumPy array.
variable_list = X.columns
variable_list

Index(['year', 'mileage', 'tax', 'mpg', 'engineSize', 'model_ B Class',
       'model_ C Class', 'model_ CL Class', 'model_ CLA Class',
       'model_ CLC Class', 'model_ CLK', 'model_ CLS Class', 'model_ E Class',
       'model_ G Class', 'model_ GL Class', 'model_ GLA Class',
       'model_ GLB Class', 'model_ GLC Class', 'model_ GLE Class',
       'model_ GLS Class', 'model_ M Class', 'model_ R Class',
       'model_ S Class', 'model_ SL CLASS', 'model_ SLK', 'model_ V Class',
       'model_ X-CLASS', 'model_180', 'model_200', 'model_220', 'model_230',
       'transmission_Manual', 'transmission_Other', 'transmission_Semi-Auto',
       'fuelType_Hybrid', 'fuelType_Other', 'fuelType_Petrol', 'CONST'],
      dtype='object')

In [17]:
# Convert everything to NumPy arrays for the matrix algebra.
# np.asarray is used instead of .to_numpy() for two reasons:
#   * dtype=float guarantees a numeric array; a frame that mixes numeric and boolean
#     columns would otherwise become an object array, which np.linalg cannot handle;
#   * the cell stays re-runnable, because np.asarray also accepts an input that is
#     already an ndarray (an ndarray has no .to_numpy method).
Y = np.asarray(Y, dtype=float)
X = np.asarray(X, dtype=float)

In [18]:
# Sanity check: Y has one entry per row of X.
print(Y.shape, X.shape, sep='\n')

(13119,)
(13119, 38)


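We estimate the ordinary least squares (OLS) regression coefficients using the following formula, where $X$ is the matrix of independent variables (including the constant column) and $Y$ is the vector of prices.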

$$\hat{\beta} = (X^{T}X)^{-1}X^{T}Y$$

In [19]:
# Solve the normal equations (X'X) beta = X'Y for beta.
# This is the formula above, but np.linalg.solve avoids forming the explicit inverse
# of X'X, which costs more and loses more precision when X'X is ill-conditioned
# (the columns here span very different scales, e.g. mileage versus 0/1 dummies).
beta_est = np.linalg.solve(X.T @ X, X.T @ Y)
beta_est

array([ 1.97795373e+03, -1.38178451e-01, -2.05318544e+01, -1.68314991e+02,
        9.07601471e+03, -2.58050306e+03, -5.20049612e+02, -8.85427934e+02,
       -2.89750415e+02,  3.39381590e+03,  8.93163856e+03, -4.69276285e+02,
        1.12180408e+03,  5.60547918e+04,  8.98976021e+02, -2.14367435e+03,
        5.30803067e+03,  4.41548003e+03,  7.28326264e+03,  1.02508194e+04,
        4.74917980e+03,  2.77144817e+03,  1.04469896e+04,  3.30174747e+02,
        2.69040613e+03,  5.53091740e+03, -2.57833420e+03,  4.18363667e+03,
       -3.53292118e+03, -4.64122487e+03,  3.21184931e+04, -2.22001705e+03,
       -2.75416097e+02,  3.59107363e+01,  1.18158887e+04,  1.45491987e+04,
        1.60085921e+03, -3.97035870e+06])

In [20]:
# Pair every estimated coefficient with the name of its variable.
coeffs = {name: estimate for name, estimate in zip(variable_list, beta_est)}
coeffs

{'year': 1977.9537346198194,
 'mileage': -0.13817845075542173,
 'tax': -20.531854371841973,
 'mpg': -168.31499130520024,
 'engineSize': 9076.014710874166,
 'model_ B Class': -2580.503061901225,
 'model_ C Class': -520.0496120025637,
 'model_ CL Class': -885.4279341650836,
 'model_ CLA Class': -289.750414665672,
 'model_ CLC Class': 3393.815895110223,
 'model_ CLK': 8931.638556877966,
 'model_ CLS Class': -469.2762851706357,
 'model_ E Class': 1121.8040814396518,
 'model_ G Class': 56054.791763985704,
 'model_ GL Class': 898.9760214923299,
 'model_ GLA Class': -2143.6743498000433,
 'model_ GLB Class': 5308.030666642706,
 'model_ GLC Class': 4415.480034965789,
 'model_ GLE Class': 7283.262642026297,
 'model_ GLS Class': 10250.81944656081,
 'model_ M Class': 4749.179803099716,
 'model_ R Class': 2771.4481712363777,
 'model_ S Class': 10446.989617409417,
 'model_ SL CLASS': 330.17474685490015,
 'model_ SLK': 2690.4061265556666,
 'model_ V Class': 5530.91739801981,
 'model_ X-CLASS': -2578.

# Verification of the results using the Statsmodels library

In [21]:
import statsmodels.api as sm

In [22]:
# Rebuild the inputs as labelled pandas objects (X and Y were converted to NumPy
# arrays earlier), this time letting statsmodels add the intercept column.
Y = df_encoded['price']
X = sm.add_constant(
    df_encoded.loc[:, df_encoded.columns != 'price'],
    prepend=True,  # the constant (intercept) term 'a' becomes the first column
)

In [23]:
# Set up the OLS model: price (Y) is the response, X holds the regressors and the constant.
model = sm.OLS(endog=Y, exog=X)

In [24]:
# Fit the model; the fitted results object is kept in `result`.
result = model.fit()

In [25]:
# Print the regression summary table; its 'coef' column can be compared with the
# coefficients estimated manually above.
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.769
Model:                            OLS   Adj. R-squared:                  0.768
Method:                 Least Squares   F-statistic:                     1174.
Date:                Thu, 28 Oct 2021   Prob (F-statistic):               0.00
Time:                        15:38:18   Log-Likelihood:            -1.3206e+05
No. Observations:               13119   AIC:                         2.642e+05
Df Residuals:                   13081   BIC:                         2.645e+05
Df Model:                          37                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                   -3.9