# Multiple Linear Regression

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from numpy.linalg import inv
import matplotlib.pyplot as plt

## Importing the DataSet


In [2]:
dataset = pd.read_csv('G:/Jupyter Sketch/Regression/Multiple Linear Regression/Data/wine quality-red.csv')
dataset.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
X_train = dataset.iloc[ : int((3/4)*(len(dataset)+1)),: -1].values
y_train = dataset.iloc[ : int((3/4)*(len(dataset)+1)), -1].values

In [4]:
X_test = dataset.iloc[len(X_train): , :-1].values
y_test = dataset.iloc[len(y_train): , -1].values

In [5]:
len(y_test)

399

In [6]:
cols = list(dataset.columns)

In [7]:
cols.remove('quality')

In [8]:
dataset = dataset[cols]

In [9]:
dataset.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


## Normalising the training and test dataset

In [10]:
X_train = (X_train -np.min(X_train))/(np.max(X_train) - np.min(X_train))
y_train = (y_train -np.min(y_train))/(np.max(y_train) - np.min(y_train))

In [11]:
X_test = (X_test -np.min(X_test))/ (np.max(X_test) - np.min(X_test))
y_test = (y_test -np.min(y_test))/ (np.max(y_test) - np.min(y_test))

In [12]:
train_dataset = pd.DataFrame(X_train)

In [13]:
test_dataset = pd.DataFrame(X_test)

In [14]:
train_dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.025606,0.002422,0.0,0.006574,0.000263,0.038062,0.117647,0.003453,0.012145,0.001938,0.032526
1,0.02699,0.003045,0.0,0.008997,0.000339,0.086505,0.231834,0.003449,0.011073,0.002353,0.03391
2,0.02699,0.00263,0.000138,0.007958,0.000318,0.051903,0.186851,0.00345,0.01128,0.002249,0.03391
3,0.038754,0.000969,0.001938,0.006574,0.00026,0.058824,0.207612,0.003453,0.010934,0.002007,0.03391
4,0.025606,0.002422,0.0,0.006574,0.000263,0.038062,0.117647,0.003453,0.012145,0.001938,0.032526


In [15]:
X_train

array([[0.02560554, 0.00242215, 0.        , ..., 0.01214533, 0.00193772,
        0.03252595],
       [0.02698962, 0.00304498, 0.        , ..., 0.01107266, 0.00235294,
        0.03391003],
       [0.02698962, 0.00262976, 0.00013841, ..., 0.01128028, 0.00224913,
        0.03391003],
       ...,
       [0.0266436 , 0.00197232, 0.00072664, ..., 0.01093426, 0.00186851,
        0.03391003],
       [0.0266436 , 0.00089965, 0.00089965, ..., 0.01089965, 0.00273356,
        0.03771626],
       [0.02733564, 0.00200692, 0.00079585, ..., 0.01110727, 0.00200692,
        0.03287197]])

In [16]:
test_dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.048125,0.003562,0.001312,0.009375,0.000431,0.025,0.05625,0.006216,0.01975,0.003375,0.06125
1,0.049375,0.002125,0.00225,0.011875,0.000406,0.03125,0.0625,0.006214,0.020438,0.003375,0.07
2,0.05375,0.002625,0.002437,0.01125,0.000425,0.0375,0.075,0.00622,0.020938,0.004312,0.073125
3,0.061875,0.004625,0.001187,0.03625,0.000694,0.20625,0.475,0.006242,0.019625,0.003438,0.05875
4,0.045,0.00225,0.002875,0.013125,0.000463,0.15,0.275,0.006221,0.02125,0.005312,0.06875


## Normal Equation θ = Inv(X.XT).XT.Y

In [17]:
X0 = np.ones(shape=(len(X_train),1)) 

In [18]:
X_train = np.concatenate((X0,X_train), axis =1)

In [19]:
Xtrain_XT = np.dot(X_train.T, X_train) # Generally X.XT but np linear algebra is giving me an error of singular matrix

In [20]:
Xtrain_XT.shape

(12, 12)

In [21]:
y_train.shape

(1200,)

In [22]:
X_train.T.shape

(12, 1200)

In [23]:
Xtrain_XT_Inv = np.linalg.inv(Xtrain_XT)

In [24]:
Xtrain_XT_Inv_XT = np.dot(Xtrain_XT_Inv, X_train.T)

In [25]:
Xtrain_XT_Inv_XT.shape

(12, 1200)

In [26]:
thetas_Neq = np.dot(Xtrain_XT_Inv_XT,y_train)

In [27]:
thetas_Neq.shape

(12,)

In [28]:
thetas_Neq

array([ 7.92296708e+00,  2.70635193e+00, -6.17644133e+01, -1.55527126e+01,
        2.00203731e+00, -9.28035957e+01,  2.01557391e-01, -2.18139544e-01,
       -2.26175928e+03, -1.46716669e+01,  4.46143565e+01,  1.55746276e+01])

In [29]:
X_test.shape

(399, 11)

In [30]:
X0_test = np.ones(shape=(len(X_test),1))

In [31]:
X0_test.shape

(399, 1)

In [32]:
X_test.shape

(399, 11)

In [33]:
X_test = np.concatenate((X0_test,X_test), axis =1)

In [34]:
def pred(X, theta):
    pred = np.dot(X, theta)
    return pred

In [35]:
y_pred = pred(X_test, thetas_Neq)

In [36]:
y_pred.shape

(399,)

In [37]:
y_pred = (y_pred -np.min(y_pred))/ (np.max(y_pred) - np.min(y_pred))

In [38]:
err =  np.sum((y_pred - y_test)**2)/len(y_test)
print('MSE = ',err)

MSE =  0.02889363966749071


In [39]:
y_pred

array([0.42885148, 0.68149471, 0.73170787, 0.25904511, 0.66326389,
       0.66326389, 0.66326389, 0.36874757, 0.66326389, 0.69246336,
       0.46285929, 0.3666301 , 0.46285929, 0.64346198, 0.58561019,
       0.70143947, 0.28414376, 0.83054936, 0.59070145, 0.70540255,
       0.704192  , 0.704192  , 0.27738265, 0.82541808, 0.65173964,
       0.28864753, 0.34822969, 0.38865046, 0.87257136, 0.34682992,
       0.82835459, 0.38083265, 0.34682992, 0.49700248, 0.67779521,
       0.67083358, 0.400828  , 0.67779521, 0.25450165, 0.62343326,
       0.27024473, 0.46556142, 0.78295997, 0.24285223, 0.54496832,
       0.46344305, 0.30799131, 0.46344305, 0.64108463, 0.52874935,
       0.52874935, 0.36087695, 0.29018882, 0.32466119, 0.49053703,
       0.46425185, 0.27899808, 0.49862622, 0.51884188, 0.51884188,
       0.21759253, 0.33513676, 0.43484658, 0.22648583, 0.75671079,
       0.45066537, 0.45066537, 0.79409266, 0.33386421, 0.96850056,
       1.        , 0.65739133, 0.57373229, 0.32244375, 0.54814

In [40]:
y_test

array([0.6, 0.8, 1. , 0.4, 0.8, 0.8, 0.8, 0.4, 0.8, 0.8, 0.6, 0.4, 0.6,
       0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.4,
       0.4, 0.4, 0.8, 0.4, 0.6, 0.4, 0.4, 0.2, 0.6, 0.2, 0.6, 0.6, 0.2,
       0.2, 0.4, 0.4, 0.6, 0.4, 0.6, 0.4, 0.4, 0.4, 0.6, 0.6, 0.6, 0.4,
       0.4, 0.4, 0.4, 0.4, 0.4, 0.6, 0.6, 0.6, 0.4, 0.2, 0.4, 0.2, 0.6,
       0.6, 0.6, 0.6, 0.6, 1. , 0.6, 0.6, 0.4, 0.4, 0.6, 0.6, 0.2, 0.6,
       0.6, 0.8, 0.6, 0.6, 0.6, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.4, 0.4,
       0.6, 0.6, 0.2, 0.6, 0.4, 0.4, 0.6, 0.6, 0. , 0.6, 0.6, 0.6, 0.4,
       0.4, 0.4, 0.4, 0.2, 0.4, 0.4, 0.4, 0.6, 0.4, 0.6, 0.6, 0.6, 0.6,
       0.6, 0.6, 0.6, 0.4, 0.6, 0.4, 0.8, 0.6, 0.6, 0.6, 0.6, 0.4, 0.6,
       0.6, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.4, 0.6, 0.6, 0.6, 0.6,
       0.6, 0.4, 0.6, 0.4, 0.4, 0.4, 0.4, 0.4, 0.6, 0.4, 0.4, 0.4, 0.4,
       0.4, 0.6, 0.4, 0.6, 0.4, 0.4, 0.6, 0.2, 0.6, 0.4, 0.4, 0.6, 0.6,
       0.2, 0.4, 0.6, 0.4, 0.4, 0. , 0.4, 0.4, 0.6, 0.6, 0.6, 0.

## Gradient Descent

In [42]:
def gradient_descent(X, y, thetas_gd, alpha, itrs):
    for i in range(itrs):
        for j in range(thetas_gd.shape[1]):
            thetas_gd[: j] = thetas_gd - (alpha/len(X))*(np.sum((np.dot(thetas_gd, X.T)-y)*X[:,j]))
    return thetas_gd

In [43]:
thetas_gd = np.random.rand(1, len(cols)+1) 

In [44]:
thetas_gd = gradient_descent(X_train, y_train, thetas_gd, 0.01, 100000)

In [45]:
thetas_gd

array([[ 0.42832709,  0.02397622,  0.47948805, -0.19939299, -0.22055117,
         0.17088701, -0.3274814 ,  0.56530041, -0.27423385, -0.23118019,
        -0.41104155, -0.11725463]])

In [46]:
thetas_gd.shape

(1, 12)

In [47]:
X_test.shape

(399, 12)

In [51]:
y_pred_gd = pred(thetas_gd,X_test.T)

In [52]:
y_pred_gd

array([[0.43770497, 0.43660522, 0.44120399, 0.61033926, 0.51656785,
        0.51656785, 0.51656785, 0.54739189, 0.51656785, 0.46621659,
        0.44932047, 0.55494949, 0.44932047, 0.44765362, 0.43221495,
        0.47247868, 0.67161307, 0.58375057, 0.48032856, 0.53763958,
        0.53173951, 0.53173951, 0.66017531, 0.47315997, 0.46589793,
        0.64805714, 0.51882687, 0.47095843, 0.68313067, 0.60664869,
        0.5829399 , 0.63205938, 0.60664869, 0.51120148, 0.45273149,
        0.78271392, 0.45966015, 0.45273149, 0.47379171, 0.45681457,
        0.5530445 , 0.58442805, 0.52145382, 0.69600053, 0.81384676,
        0.50168636, 0.55342865, 0.50168636, 0.49270279, 0.49352084,
        0.49352084, 0.5696451 , 0.45101787, 0.4481971 , 0.49532779,
        0.52127379, 0.53852271, 0.62709998, 0.49698943, 0.49698943,
        0.57174634, 0.46314762, 0.57597516, 0.47941286, 0.53778839,
        0.50673259, 0.50673259, 0.45119918, 0.54964155, 0.65909934,
        0.56758951, 0.58871005, 0.51601169, 0.50

In [53]:
err =  np.sum((y_pred_gd - y_test)**2)/len(y_test)
print('MSE_GD = ',err)

MSE_GD =  0.03621543682304282
