In [405]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


1. Напишіть функцію гіпотези лінійної регресії у векторному вигляді;

In [373]:
def h(x, w):
    """Evaluate the linear-regression hypothesis for a single sample.

    A constant ``1`` is placed in front of the feature values so that the
    first entry of ``w`` plays the role of the intercept.

    Args:
        x: Feature values of one sample, without the bias term.
        w: Weights with the intercept first; one entry longer than ``x``.

    Returns:
        The dot product of ``w`` and the bias-augmented feature vector.
    """
    return np.dot(w, np.append([1], x))

2. Створіть функцію для обчислення функції втрат у векторному вигляді;

In [374]:
def loss_function(w, df):
    """Half mean squared error of the linear model on ``df``, in vector form.

    Computes ``J(w) = 1 / (2n) * sum((X @ w - y) ** 2)`` where ``X`` is a
    column of ones followed by the ``area``, ``bedrooms`` and ``bathrooms``
    columns of ``df`` and ``y`` is its ``price`` column.

    Args:
        w: Four weights ordered as (intercept, area, bedrooms, bathrooms).
        df: DataFrame with ``area``, ``bedrooms``, ``bathrooms`` and
            ``price`` columns.

    Returns:
        The scalar loss value.

    Raises:
        ZeroDivisionError: If ``df`` has no rows.
        ValueError: If ``w`` does not have exactly four entries.
    """
    n = df.shape[0]
    if n == 0:
        # An empty frame has no mean error; fail loudly instead of
        # letting numpy return NaN.
        raise ZeroDivisionError("cannot compute the loss of an empty DataFrame")

    features = df[["area", "bedrooms", "bathrooms"]].to_numpy(dtype=float)
    target = df["price"].to_numpy(dtype=float)

    # A leading column of ones lets the intercept take part in the same
    # matrix product as the other weights.
    design = np.column_stack((np.ones(n), features))
    residuals = design @ np.asarray(w, dtype=float) - target

    return np.dot(residuals, residuals) / (2 * n)

3. Реалізуйте один крок градієнтного спуску;

In [375]:
def grad_step(weights, grads, learning_rate=0.01):
    """Perform one gradient-descent update: ``w_i <- w_i - learning_rate * g_i``.

    Works for any number of weights, as long as there is exactly one
    gradient component per weight.

    Args:
        weights: Iterable of current weight values.
        grads: Iterable of gradient components, in the same order as
            ``weights``.
        learning_rate: Step size applied to every component.

    Returns:
        A tuple with the updated weights, in the same order as ``weights``.

    Raises:
        ValueError: If ``weights`` and ``grads`` differ in length.
    """
    # Materialise first so generators are accepted and lengths can be compared.
    weights = tuple(weights)
    grads = tuple(grads)

    if len(weights) != len(grads):
        raise ValueError(
            f"got {len(weights)} weights but {len(grads)} gradient components"
        )

    return tuple(w - learning_rate * g for w, g in zip(weights, grads))

4. Знайдіть найкращі параметри $\vec{w}$ для датасету, прогнозуючи ціну на будинок залежно від площі, кількості ванних кімнат та кількості спалень;

In [376]:
def normalize(data):
    """Mean-centre ``data`` and scale it by its value range.

    Each value becomes ``(value - mean) / (max - min)``.

    Args:
        data: Object providing ``mean()``, ``max()``, ``min()`` and
            element-wise arithmetic (e.g. a pandas Series or numpy array).

    Returns:
        The rescaled values, in the same container type as ``data``.
    """
    spread = data.max() - data.min()
    centred = data - data.mean()
    return centred / spread

In [377]:
def grad_w_0(w, df):
    """Partial derivative of the loss with respect to the intercept ``w_0``.

    Equals the mean residual ``1 / n * sum(X @ w - y)`` where ``X`` is a
    column of ones followed by the ``area``, ``bedrooms`` and ``bathrooms``
    columns of ``df`` and ``y`` is its ``price`` column.

    Args:
        w: Four weights ordered as (intercept, area, bedrooms, bathrooms).
        df: DataFrame with ``area``, ``bedrooms``, ``bathrooms`` and
            ``price`` columns.

    Returns:
        The scalar gradient component for ``w_0``.

    Raises:
        ZeroDivisionError: If ``df`` has no rows.
        ValueError: If ``w`` does not have exactly four entries.
    """
    n = df.shape[0]
    if n == 0:
        # An empty frame has no mean residual; fail loudly instead of
        # letting numpy return NaN.
        raise ZeroDivisionError("cannot compute a gradient on an empty DataFrame")

    features = df[["area", "bedrooms", "bathrooms"]].to_numpy(dtype=float)
    target = df["price"].to_numpy(dtype=float)

    # Leading column of ones carries the intercept through the matrix product.
    design = np.column_stack((np.ones(n), features))
    residuals = design @ np.asarray(w, dtype=float) - target

    return residuals.sum() / n

In [378]:
def grad_w_1(w, df):
    """Partial derivative of the loss with respect to the ``area`` weight.

    Equals ``1 / n * sum((X @ w - y) * area)`` where ``X`` is a column of
    ones followed by the ``area``, ``bedrooms`` and ``bathrooms`` columns of
    ``df`` and ``y`` is its ``price`` column.

    Args:
        w: Four weights ordered as (intercept, area, bedrooms, bathrooms).
        df: DataFrame with ``area``, ``bedrooms``, ``bathrooms`` and
            ``price`` columns.

    Returns:
        The scalar gradient component for ``w_1``.

    Raises:
        ZeroDivisionError: If ``df`` has no rows.
        ValueError: If ``w`` does not have exactly four entries.
    """
    n = df.shape[0]
    if n == 0:
        # An empty frame has no mean; fail loudly instead of returning NaN.
        raise ZeroDivisionError("cannot compute a gradient on an empty DataFrame")

    features = df[["area", "bedrooms", "bathrooms"]].to_numpy(dtype=float)
    target = df["price"].to_numpy(dtype=float)

    # Leading column of ones carries the intercept through the matrix product.
    design = np.column_stack((np.ones(n), features))
    residuals = design @ np.asarray(w, dtype=float) - target

    # Column 0 of ``features`` is ``area``.
    return np.dot(residuals, features[:, 0]) / n


def grad_w_2(w, df):
    """Partial derivative of the loss with respect to the ``bedrooms`` weight.

    Equals ``1 / n * sum((X @ w - y) * bedrooms)`` where ``X`` is a column
    of ones followed by the ``area``, ``bedrooms`` and ``bathrooms`` columns
    of ``df`` and ``y`` is its ``price`` column.

    Args:
        w: Four weights ordered as (intercept, area, bedrooms, bathrooms).
        df: DataFrame with ``area``, ``bedrooms``, ``bathrooms`` and
            ``price`` columns.

    Returns:
        The scalar gradient component for ``w_2``.

    Raises:
        ZeroDivisionError: If ``df`` has no rows.
        ValueError: If ``w`` does not have exactly four entries.
    """
    n = df.shape[0]
    if n == 0:
        # An empty frame has no mean; fail loudly instead of returning NaN.
        raise ZeroDivisionError("cannot compute a gradient on an empty DataFrame")

    features = df[["area", "bedrooms", "bathrooms"]].to_numpy(dtype=float)
    target = df["price"].to_numpy(dtype=float)

    # Leading column of ones carries the intercept through the matrix product.
    design = np.column_stack((np.ones(n), features))
    residuals = design @ np.asarray(w, dtype=float) - target

    # Column 1 of ``features`` is ``bedrooms``.
    return np.dot(residuals, features[:, 1]) / n


def grad_w_3(w, df):
    """Partial derivative of the loss with respect to the ``bathrooms`` weight.

    Equals ``1 / n * sum((X @ w - y) * bathrooms)`` where ``X`` is a column
    of ones followed by the ``area``, ``bedrooms`` and ``bathrooms`` columns
    of ``df`` and ``y`` is its ``price`` column.

    Args:
        w: Four weights ordered as (intercept, area, bedrooms, bathrooms).
        df: DataFrame with ``area``, ``bedrooms``, ``bathrooms`` and
            ``price`` columns.

    Returns:
        The scalar gradient component for ``w_3``.

    Raises:
        ZeroDivisionError: If ``df`` has no rows.
        ValueError: If ``w`` does not have exactly four entries.
    """
    n = df.shape[0]
    if n == 0:
        # An empty frame has no mean; fail loudly instead of returning NaN.
        raise ZeroDivisionError("cannot compute a gradient on an empty DataFrame")

    features = df[["area", "bedrooms", "bathrooms"]].to_numpy(dtype=float)
    target = df["price"].to_numpy(dtype=float)

    # Leading column of ones carries the intercept through the matrix product.
    design = np.column_stack((np.ones(n), features))
    residuals = design @ np.asarray(w, dtype=float) - target

    # Column 2 of ``features`` is ``bathrooms``.
    return np.dot(residuals, features[:, 2]) / n

In [379]:
def grad_descent(weights, df, num_iter, learning_rate = 0.001, epsilon = 0.01):
    """Fit the four model weights with batch gradient descent.

    Each iteration evaluates the four partial derivatives on ``df``, takes
    one ``grad_step`` and records the new loss. Iteration stops early once
    the loss changes by no more than ``epsilon`` between consecutive steps.

    Args:
        weights: Initial (w_0, w_1, w_2, w_3).
        df: DataFrame passed on to the loss and gradient functions.
        num_iter: Maximum number of update steps.
        learning_rate: Step size handed to ``grad_step``.
        epsilon: Convergence threshold on the absolute loss change.

    Returns:
        ``(w_0, w_1, w_2, w_3, loss_history)`` where ``loss_history`` starts
        with the loss of the initial weights and gains one entry per step.
    """
    # Unpacking up front also rejects a weight sequence that is not length 4.
    bias, w_area, w_bedrooms, w_bathrooms = weights
    loss_history = [loss_function(weights, df)]

    for _ in range(num_iter):
        grads = tuple(
            gradient(weights, df)
            for gradient in (grad_w_0, grad_w_1, grad_w_2, grad_w_3)
        )

        bias, w_area, w_bedrooms, w_bathrooms = grad_step(weights, grads, learning_rate)
        weights = bias, w_area, w_bedrooms, w_bathrooms

        previous_loss = loss_history[-1]
        current_loss = loss_function(weights, df)
        loss_history.append(current_loss)

        # The converging step is still recorded before leaving the loop.
        if abs(current_loss - previous_loss) <= epsilon:
            break

    return bias, w_area, w_bedrooms, w_bathrooms, loss_history

In [380]:
# Load the housing data set; the relative path is resolved against the
# notebook's working directory.
data = pd.read_csv(filepath_or_buffer="Housing.csv")

In [381]:
# Mean-normalize the three features and the target into a fresh frame,
# leaving the raw ``data`` untouched.
norm_data = pd.DataFrame(
    {
        column: normalize(data[column])
        for column in ("area", "bedrooms", "bathrooms", "price")
    }
)

In [387]:
# Fit on the normalized data starting from all-zero weights, then report
# the fitted weights together with the last recorded loss.
w_0, w_1, w_2, w_3, history = grad_descent(
    weights=(0, 0, 0, 0),
    df=norm_data,
    num_iter=1000,
    learning_rate=0.1,
    epsilon=1e-7,
)

print(
    f"Coefitients: {w_0}, {w_1}, {w_2}, {w_3}",
    f"Loss function: {history[-1]}",
    sep="\n",
)

Coefitients: 5.020448888419632e-17, 0.4334673916275597, 0.18904886881067132, 0.3536652641284381
Loss function: 0.006735664105912874


5. Знайдіть ці ж параметри за допомогою аналітичного рішення

In [402]:
# Reference fit with scikit-learn on the same normalized data, scored with
# the notebook's own loss function for a like-for-like comparison.
X = norm_data[["area", "bedrooms", "bathrooms"]]
y = norm_data["price"]

regressor = LinearRegression()
regressor.fit(X, y)

w_a_0 = regressor.intercept_
# Coefficients come back in the column order of ``X``.
w_a_1, w_a_2, w_a_3 = regressor.coef_

print(
    f"Coefitients: {w_a_0}, {w_a_1}, {w_a_2}, {w_a_3}",
    f"Loss function: {loss_function((w_a_0, w_a_1, w_a_2, w_a_3), norm_data)}",
    sep="\n",
)

Coefitients: 7.014204135081997e-17, 0.47714268958123457, 0.17611256873124786, 0.3600128565690898
Loss function: 0.006713405108514903


6. Порівняйте отримані результати.

In [404]:
# Put the predictions of both fits next to the normalized price.
# ``assign`` returns a new frame, so ``norm_data`` itself is not modified.
norm_copy = norm_data.assign(
    prediction_math=(
        w_0
        + norm_data["area"] * w_1
        + norm_data["bedrooms"] * w_2
        + norm_data["bathrooms"] * w_3
    ),
    prediction_analitic=(
        regressor.intercept_
        + norm_data["area"] * regressor.coef_[0]
        + norm_data["bedrooms"] * regressor.coef_[1]
        + norm_data["bathrooms"] * regressor.coef_[2]
    ),
)

norm_copy

Unnamed: 0,area,bedrooms,bathrooms,price,prediction_math,prediction_analitic
0,0.155977,0.206972,0.237920,0.738811,0.190883,0.196528
1,0.261818,0.206972,0.904587,0.647902,0.472539,0.487038
2,0.330547,0.006972,0.237920,0.647902,0.228744,0.244600
3,0.161475,0.206972,0.237920,0.644872,0.193266,0.199151
4,0.155977,0.206972,-0.095413,0.575175,0.072994,0.076524
...,...,...,...,...,...,...
540,-0.147804,-0.193028,-0.095413,-0.255128,-0.134304,-0.138868
541,-0.189041,0.006972,-0.095413,-0.259704,-0.114369,-0.123321
542,-0.105192,-0.193028,-0.095413,-0.261189,-0.115833,-0.118536
543,-0.153989,0.006972,-0.095413,-0.261189,-0.099175,-0.106597


In [406]:
# Mean squared error of the gradient-descent predictions against the
# normalized price.
mean_squared_error(y_true=norm_copy["price"], y_pred=norm_copy["prediction_math"])

0.013471328211825749

In [407]:
# Mean squared error of the scikit-learn predictions against the
# normalized price.
mean_squared_error(y_true=norm_copy["price"], y_pred=norm_copy["prediction_analitic"])

0.01342681021702981