### Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Read Dataset

In [None]:
# Load the training set into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv('data/train.csv')

In [None]:
# Show the trailing rows of the DataFrame as a quick check of the load.
data.tail()

In [None]:
# (rows, columns) of the loaded DataFrame.
data.shape

### Feature Selection

In [None]:
# Single-feature regression: 'rm' is the input column and 'medv' is the
# target column, both pulled out of the DataFrame as NumPy arrays.
X, y = (data[column].values for column in ('rm', 'medv'))

### Data Visualization

In [None]:
# Dump the raw target values.
print(y)

In [None]:
# Scatter the raw feature against the target (red dots, no connecting line).
ax = plt.gca()
ax.plot(X, y, 'r.')
ax.set_xlabel('Average number of rooms per dwelling')
ax.set_ylabel('Median value of homes in $1000s')
plt.show()

In [None]:
# Build the design matrix: a leading column of ones lets theta[0] act as the
# intercept, followed by the feature column.
bias_column = np.ones(len(X))
X = np.column_stack((bias_column, X))

# Targets as a column vector so they line up with the (m, 1) predictions.
y = y.reshape(-1, 1)

# Random starting point for the two parameters (intercept, slope).
theta = np.random.randn(2, 1)

In [None]:
# First five rows of the design matrix.
print(X[:5])

In [10]:
# First five targets, now shaped as a column vector.
print(y[:5])

[[24. ]
 [21.6]
 [33.4]
 [36.2]
 [22.9]]


In [11]:
# Initial (untrained) parameter values.
print(theta)

[[ 0.34065409]
 [-0.0300971 ]]


### Train-Test Split

In [12]:
# Shuffle the row positions so the split is random rather than file-ordered.
index = np.arange(len(X))
np.random.shuffle(index)

# The first 70% of the shuffled positions train the model; the rest are held out.
train_split = int(0.7*len(X))
train_rows, test_rows = index[:train_split], index[train_split:]

X_train, X_test = X[train_rows], X[test_rows]
y_train, y_test = y[train_rows], y[test_rows]

# Report the shape of every partition as a sanity check.
for label, part in (
    ("X_train \t: ", X_train),
    ("X_test  \t: ", X_test),
    ("y_train \t: ", y_train),
    ("y_test  \t: ", y_test),
):
    print(label, part.shape)

X_train 	:  (233, 2)
X_test  	:  (100, 2)
y_train 	:  (233, 1)
y_test  	:  (100, 1)


### Hypothesis
<img src='../images/hypothesis.png'></img>

In [13]:
def hypothesis(X, theta):
    """Return the linear model's predictions, ``X`` dotted with ``theta``.

    Args:
        X: Design matrix, one row per sample.
        theta: Parameter vector whose length matches the columns of ``X``.

    Returns:
        The product ``np.dot(X, theta)``, one prediction per row of ``X``.
    """
    return np.dot(X, theta)

### Cost Function
<img src='../images/cost function.png'></img>

In [14]:
def cost(X, y, theta):
    """Return half the mean squared error of the model on ``(X, y)``.

    Args:
        X: Design matrix, one row per sample.
        y: Targets, shaped to match the output of ``hypothesis(X, theta)``.
        theta: Parameter vector.

    Returns:
        ``mean((h - y) ** 2) / 2`` where ``h = hypothesis(X, theta)``.
    """
    residual = hypothesis(X, theta) - y
    # Halving keeps the gradient free of a stray factor of two.
    return np.mean(residual ** 2) / 2

### Gradient Descent
<img src='../images/gradient descent.png'></img>
<img src='../images/gradient descent formula2.png'></img>

In [None]:
def plot_hypothesis(X, y, theta):
    """Plot the samples as red dots with the fitted line drawn over them.

    Args:
        X: Design matrix; column 1 is used as the x-axis values.
        y: Targets, plotted against column 1 of ``X``.
        theta: Parameters passed to ``hypothesis`` to draw the line.
    """
    # Column 1 holds the feature; reshape to a column to match y and the predictions.
    feature = X[:, 1].reshape(-1, 1)
    predictions = hypothesis(X, theta)

    plt.plot(feature, y, 'r.')
    plt.plot(feature, predictions)
    plt.xlabel('Average number of rooms per dwelling')
    plt.ylabel('Median value of homes in $1000s')
    plt.show()

In [None]:
def optimize(X, y, theta, lr, n_iter):
    """Fit ``theta`` by batch gradient descent, updating it in place.

    Every parameter is updated simultaneously from the same residual, so the
    routine works for a design matrix with any number of columns, not only
    the intercept-plus-one-feature case.

    Args:
        X: Design matrix of shape (m, n), one row per sample.
        y: Targets of shape (m, 1).
        theta: Parameter column vector of shape (n, 1). It is modified in
            place, so the caller's array holds the fitted values afterwards.
        lr: Learning rate (step size) applied to the gradient.
        n_iter: Number of gradient steps to run.

    Returns:
        List of cost values, one recorded after every 50th iteration.
    """
    n_samples = X.shape[0]
    cost_hist = []
    for i in range(1, n_iter + 1):
        residual = hypothesis(X, theta) - y
        # Gradient of the half-MSE cost for all parameters at once; row j is
        # mean(residual * X[:, j]), the same quantity as a per-column update.
        gradient = np.dot(X.T, residual) / n_samples
        # In-place update: callers read the trained values from their own theta.
        theta -= lr * gradient

        if i % 50 == 0:
            J = cost(X, y, theta)
            cost_hist.append(J)
            print('\nIteration : ', i, ',\tCost : ', J)

    return cost_hist

In [None]:
# Train on the training split; theta is updated in place and the returned
# list holds the cost recorded at regular intervals during training.
cost_hist = optimize(X_train, y_train, theta, lr=0.04, n_iter=10000)

In [None]:
# Learning curve: recorded cost values in order. Ticks are hidden, so only
# the shape of the curve is shown.
ax = plt.gca()
ax.plot(cost_hist)
ax.set_xlabel('Iteration')
ax.set_ylabel('Cost')
ax.set_xticks([])
ax.set_yticks([])
plt.show()

In [None]:
# Draw the fitted line over the held-out test samples.
plot_hypothesis(X_test, y_test, theta)