# Multivariate Linear Regression 

### *Name: Ishita Gupta*
### *Roll No: J018*

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
np.random.seed(123)

In [2]:
data=pd.read_csv('ex1data2.txt',header=None)
data.head()

Unnamed: 0,0,1,2
0,2104,3,399900
1,1600,3,329900
2,2400,3,369000
3,1416,2,232000
4,3000,4,539900


In [3]:
data.columns=['Size of House','Number of Bedrooms', 'Price of House']
data.head()

Unnamed: 0,Size of House,Number of Bedrooms,Price of House
0,2104,3,399900
1,1600,3,329900
2,2400,3,369000
3,1416,2,232000
4,3000,4,539900


In [4]:
data.describe()

Unnamed: 0,Size of House,Number of Bedrooms,Price of House
count,47.0,47.0,47.0
mean,2000.680851,3.170213,340412.659574
std,794.702354,0.760982,125039.899586
min,852.0,1.0,169900.0
25%,1432.0,3.0,249900.0
50%,1888.0,3.0,299900.0
75%,2269.0,4.0,384450.0
max,4478.0,5.0,699900.0


In [5]:
data.isnull().sum()

Size of House         0
Number of Bedrooms    0
Price of House        0
dtype: int64

In [6]:
def normalize(dataframe):
    df=dataframe.copy()
    for col in df.columns:
        df[col]=(df[col]-df[col].mean())/df[col].std()
        return df

In [7]:
normalized = normalize(data)
normalized.head()

Unnamed: 0,Size of House,Number of Bedrooms,Price of House
0,0.13001,3,399900
1,-0.50419,3,329900
2,0.502476,3,369000
3,-0.735723,2,232000
4,1.257476,4,539900


In [8]:
X = normalized.iloc[:,:-1].values
y = normalized.iloc[:,-1].values

In [9]:
print(X.shape)
print(y.shape)

(47, 2)
(47,)


In [10]:
m = y.size

In [11]:
ones = np.ones((m,1))
X1=np.concatenate((ones,X),axis=1)
X1[:5]

array([[ 1.        ,  0.13000987,  3.        ],
       [ 1.        , -0.50418984,  3.        ],
       [ 1.        ,  0.50247636,  3.        ],
       [ 1.        , -0.73572306,  2.        ],
       [ 1.        ,  1.25747602,  4.        ]])

In [12]:
n = X1.shape[1]

In [13]:
y = y.reshape(m,1)
y.shape

(47, 1)

In [14]:
# Initialize hyperparameters 
alpha=0.01
theta = np.random.rand(n,1)
epoch = 10000

In [15]:
theta.shape

(3, 1)

In [16]:
def GD(X1,y,theta,epoch, alpha,dec=5):
    past_cost=[]
    past_theta=[theta]
    m=y.size
    n=X1.shape[1]
    for i in range(epoch): 
        h_theta=np.dot(X1,theta) 
        error = h_theta-y 
        cost = np.dot(error.T,error)/(2*m)
        past_cost.append(cost[0][0])
        differentiation = np.dot(X1.T,error)/m
        theta = theta-(alpha*differentiation)
        past_theta.append(theta)
        
        if np.equal(np.round(past_theta[i],dec),np.round(past_theta[i+1],dec)).sum()== n:
            break
            
    return past_cost, past_theta, i+1

In [17]:
pastCost,pastTheta,stopEpoch = GD(X1=X1, y=y, theta=theta, epoch=epoch, alpha=alpha)

In [18]:
print(f'The model performed {stopEpoch} epochs out of {epoch} epochs before the previous theta was equal to current theta rounded up to 5 decimal places')

The model performed 10000 epochs out of 10000 epochs before the previous theta was equal to current theta rounded up to 5 decimal places


In [1]:
plt.plot(pastCost)

NameError: name 'plt' is not defined

In [20]:
best_theta = np.array(pastTheta[-1]).reshape(n,)
print(best_theta)

[356033.87814738 109006.0652713   -5055.43828572]


In [21]:
print('Parameters from StatsModels')
sm.OLS(y,X).fit().params

Parameters from StatsModels


array([ 62994.94685809, 103051.04427073])

In [22]:
print(f'Parameters from StatsModels are: {sm.OLS(y,X1).fit().params}')
print(f'Parameters from SciKitLearn are: {LinearRegression().fit(X1,y).coef_}')

Parameters from StatsModels are: [368114.03931355 110631.05027885  -8738.01911233]
Parameters from SciKitLearn are: [[     0.         110631.05027885  -8738.01911233]]
