In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
# index_col=0: the first column of the CSV is an unnamed row counter (it otherwise
# shows up as a spurious `Unnamed: 0` data column), so use it as the index.
df = pd.read_csv('../datasets/dataset_house_price.csv', index_col=0)

In [3]:
df.head()

Unnamed: 0,value,area,distance_beach,distance_supmarket
0,4600000,280,0.240925,0.793637
1,900000,208,0.904136,0.134494
2,2550000,170,0.059525,0.423318
3,550000,100,2.883181,0.525064
4,2200000,164,0.239758,0.192374


## Log-transforming data

In [4]:
# Guarded so that re-running this cell does not take the log of columns that were
# already log-transformed. The flag lives on the frame itself, so reloading `df`
# from disk (a fresh frame with empty attrs) re-enables the transform.
if not df.attrs.get('log_transformed', False):
    df['value'] = np.log(df['value'])
    df['area'] = np.log(df['area'])
    # +1 before the log keeps a distance of 0 finite (log(1) == 0).
    df['distance_beach'] = np.log(df['distance_beach'] + 1)
    df['distance_supmarket'] = np.log(df['distance_supmarket'] + 1)
    df.attrs['log_transformed'] = True

In [5]:
def predict(x, theta1, theta0):
    """Evaluate the univariate linear model ``x * theta1 + theta0``.

    Parameters
    ----------
    x : scalar or array-like supporting ``*`` and ``+``
        Input value(s).
    theta1 : slope.
    theta0 : intercept.

    Returns
    -------
    The prediction(s), with whatever type/shape ``x * theta1 + theta0`` yields.
    """
    scaled = x * theta1
    return scaled + theta0

In [43]:
def predict_multivariate(X, betas, intercept=True):
    """Evaluate a multivariate linear model as the product of ``X`` and ``betas``.

    Parameters
    ----------
    X : array-like with a ``shape`` attribute
        Feature values, one row per sample.
    betas : array-like
        Coefficients. When ``intercept`` is true the first entry multiplies the
        prepended column of ones, i.e. it is the intercept term.
    intercept : bool, default True
        If true, a column of ones is prepended to ``X`` before the product.

    Returns
    -------
    ``np.dot(X, betas)`` computed on the (possibly augmented) design matrix.
    """
    if intercept:
        ones_column = np.ones(X.shape[0])
        X = np.c_[ones_column, X]

    return np.dot(X, betas)

In [71]:
def fit(X, y, mode='ols', learning_rate=0.01, steps=10000, seed=0, verbosity=0, intercept=True, cost_func='mse'):
    """Fit a linear model using the solver selected by ``mode``.

    Every argument is forwarded to the solver as a keyword argument; each solver
    accepts ``**kwargs`` and ignores the ones it does not use.

    Parameters
    ----------
    X, y : feature values and targets, passed through unchanged.
    mode : str
        ``'ols'``                           -> ``ordinary_least_square``
        ``'gradient_descent'``              -> ``gradient_descent``
        ``'ols_2'``                         -> ``ordinary_least_square_multivariate``
        ``'gradient_descent_multivariate'`` -> ``gradient_descent_multivariate``
    learning_rate, steps, seed, verbosity : gradient-descent settings.
    intercept : whether the multivariate solvers prepend a column of ones.
    cost_func : cost name handed to the multivariate gradient descent.

    Returns
    -------
    Whatever the selected solver returns: the univariate solvers return the
    tuple ``(theta1, theta0)``; the multivariate solvers return a coefficient array.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the names listed above.
    """
    modes = {
        'ols': ordinary_least_square,
        'gradient_descent': gradient_descent,
        'ols_2': ordinary_least_square_multivariate,
        'gradient_descent_multivariate': gradient_descent_multivariate
    }

    # Fail loudly with the list of valid names instead of dispatching to a
    # placeholder callable that cannot accept the keyword arguments below.
    if mode not in modes:
        raise ValueError(
            'Unknown mode {!r}; expected one of: {}'.format(mode, ', '.join(sorted(modes)))
        )

    return modes[mode](X=X,
                       y=y,
                       intercept=intercept,
                       learning_rate=learning_rate,
                       steps=steps,
                       seed=seed,
                       verbosity=verbosity,
                       cost_func=cost_func)

In [93]:
def mean_squared_error(pred_y, expec_y):
    """Return the mean of the element-wise squared errors.

    Delegates the per-element computation to ``squared_error`` and averages the
    result with its ``.mean()`` method, so the inputs must produce an object
    that has one (e.g. numpy arrays or pandas Series).
    """
    per_sample = squared_error(pred_y, expec_y)
    return per_sample.mean()

def mean_absolute_error(pred_y, expec_y):
    """Return the mean absolute difference between predictions and targets.

    Parameters
    ----------
    pred_y, expec_y : array-like objects supporting element-wise subtraction
        (e.g. numpy arrays or pandas Series).

    Returns
    -------
    The mean of ``|pred_y - expec_y|``.
    """
    # Computed inline rather than through absolute_error(), which returns the
    # signed difference and would let positive and negative errors cancel out.
    return np.abs(pred_y - expec_y).mean()

In [94]:
def squared_error(pred_y, expec_y):
    """Return the element-wise squared difference ``(pred_y - expec_y) ** 2``."""
    difference = pred_y - expec_y
    return difference ** 2

def absolute_error(pred_y, expec_y):
    """Return the element-wise difference ``pred_y - expec_y``.

    Note: despite the name, no absolute value is applied; the sign of each
    difference is preserved in the result.
    """
    difference = pred_y - expec_y
    return difference

In [10]:
def r2(pred_y, expec_y):
    """Coefficient of determination: ``1 - SS_res / SS_tot``.

    Parameters
    ----------
    pred_y : array-like of predicted values.
    expec_y : array-like of observed values.

    Both inputs are converted to flat float arrays and paired by position, so
    lists, numpy arrays and pandas Series are all accepted. Inputs of different
    lengths raise a numpy shape error instead of being silently truncated
    (a length-1 input is still broadcast).

    Returns
    -------
    ``1 - sum((pred - expec)^2) / sum((expec - mean(expec))^2)``.
    """
    predicted = np.asarray(pred_y, dtype=float).ravel()
    expected = np.asarray(expec_y, dtype=float).ravel()

    # The mean is evaluated once here; it does not depend on the element.
    residual_ss = np.sum((predicted - expected) ** 2)
    total_ss = np.sum((expected - expected.mean()) ** 2)

    return 1 - (residual_ss / total_ss)

In [16]:
def ordinary_least_square(X, y, **kwargs):
    """Closed-form simple (one-feature) linear regression.

    The slope is the correlation coefficient of ``X`` and ``y`` scaled by the
    ratio of their standard deviations; the intercept makes the line pass
    through the point of means.

    Parameters
    ----------
    X, y : 1-D objects exposing ``.std()`` and ``.mean()``.
    **kwargs : accepted and ignored so all solvers share one call signature.

    Returns
    -------
    tuple ``(theta1, theta0)`` -- slope first, then intercept.
    """
    correlation = np.corrcoef(X, y)[0, 1]
    spread_ratio = y.std() / X.std()

    theta1 = correlation * spread_ratio
    theta0 = y.mean() - theta1 * X.mean()

    return theta1, theta0

In [50]:
def ordinary_least_square_multivariate(X, y, intercept=True, **kwargs):
    """Least-squares fit of a multivariate linear model.

    Parameters
    ----------
    X : array-like with a ``shape`` attribute, one row per sample.
    y : array-like of targets.
    intercept : bool, default True
        If true, a column of ones is prepended to ``X`` so the first returned
        coefficient is the intercept.
    **kwargs : accepted and ignored so all solvers share one call signature.

    Returns
    -------
    numpy array of coefficients minimising ``||X @ betas - y||``.
    """
    X = np.c_[np.ones(X.shape[0]), X] if intercept else X

    # Solve the least-squares problem directly instead of inverting X^T X:
    # forming the explicit inverse is numerically ill-conditioned and fails on
    # collinear features, where lstsq returns the minimum-norm solution.
    betas, _residuals, _rank, _singular_values = np.linalg.lstsq(X, y, rcond=None)

    return betas

In [13]:
def derivative(theta0, theta1, X, y):
    """Gradient of the mean squared error of the univariate model.

    Accumulates, sample by sample, the residual ``predict(x) - y`` (for the
    intercept) and the residual times ``x`` (for the slope), then divides both
    sums by ``0.5 * len(X)``, i.e. multiplies them by ``2 / n``.

    Parameters
    ----------
    theta0, theta1 : current intercept and slope.
    X, y : iterables of inputs and targets, paired by position; ``X`` must
        support ``len``.

    Returns
    -------
    tuple ``(dtheta0, dtheta1)`` -- intercept gradient first, then slope gradient.
    """
    grad_intercept = 0
    grad_slope = 0

    for sample, target in zip(X, y):
        residual = predict(sample, theta1, theta0) - target
        grad_intercept += residual
        grad_slope += residual * sample

    half_count = 0.5 * len(X)

    return grad_intercept / half_count, grad_slope / half_count

In [14]:
def gradient_descent(X, y, learning_rate, steps, seed, verbosity, **kwargs):
    """Fit the univariate model ``x * theta1 + theta0`` by gradient descent.

    Parameters
    ----------
    X, y : inputs and targets.
    learning_rate : step size applied to each gradient component.
    steps : number of update iterations.
    seed : value passed to ``random.seed`` before drawing the two initial
        parameters (this reseeds the module-level ``random`` generator).
    verbosity : when non-zero, the current mean squared error is printed on
        every iteration whose index is a multiple of this value.
    **kwargs : accepted and ignored so all solvers share one call signature.

    Returns
    -------
    tuple ``(theta1, theta0)`` -- slope first, then intercept.
    """
    random.seed(seed)

    # Draw order matters for reproducibility: intercept first, then slope.
    theta0, theta1 = random.random(), random.random()

    for step in range(steps):
        should_report = verbosity != 0 and step % verbosity == 0
        if should_report:
            current_error = mean_squared_error(predict(X, theta1, theta0), y)
            print('Step: {} --- Error: {}'.format(step, current_error))

        grad_intercept, grad_slope = derivative(theta0, theta1, X, y)

        theta0 = theta0 - learning_rate * grad_intercept
        theta1 = theta1 - learning_rate * grad_slope

    return theta1, theta0

In [112]:
def gradient_descent_multivariate(X, y, learning_rate, steps, seed, verbosity, intercept=True, cost_func='mse', **kwargs):
    """Fit a multivariate linear model by (sub)gradient descent.

    Parameters
    ----------
    X : array-like with a ``shape`` attribute, one row per sample.
    y : array-like of targets.
    learning_rate : step size applied to the gradient.
    steps : number of update iterations.
    seed : value passed to ``np.random.seed`` before the initial coefficients
        are drawn (this reseeds numpy's global generator).
    verbosity : when non-zero, the current cost is printed on every iteration
        whose index is a multiple of this value.
    intercept : bool, default True
        If true, a column of ones is prepended to ``X`` so the first returned
        coefficient is the intercept.
    cost_func : ``'mse'`` (mean squared error) or ``'mae'`` (mean absolute
        error). The selected cost is both the value reported and the function
        whose gradient drives the updates; for ``'mae'`` the subgradient
        ``sign(residual)`` is used.
    **kwargs : accepted and ignored so all solvers share one call signature.

    Returns
    -------
    numpy array of fitted coefficients.

    Raises
    ------
    ValueError
        If ``cost_func`` is not a supported cost name.
    """
    # name -> (cost of the residual vector, d cost / d prediction without the 1/n factor)
    cost_functions = {
        'mse': (lambda residual: (residual ** 2).mean(), lambda residual: 2 * residual),
        'mae': (lambda residual: np.abs(residual).mean(), np.sign),
    }

    if cost_func not in cost_functions:
        raise ValueError(
            'Unknown cost_func {!r}; expected one of: {}'.format(
                cost_func, ', '.join(sorted(cost_functions)))
        )
    cost, cost_slope = cost_functions[cost_func]

    np.random.seed(seed)

    X = np.c_[np.ones(X.shape[0]), X] if intercept else X

    betas = np.random.rand(X.shape[1])

    for i in range(steps):
        # Signed residual (prediction minus target); the bias column is already in X.
        residual = np.dot(X, betas) - y

        if verbosity != 0 and i % verbosity == 0:
            print('Step: {} --- Error: {}'.format(i, cost(residual)))

        gradient = np.dot(X.T, cost_slope(residual)) / len(X)
        betas -= learning_rate * gradient

    return betas

In [17]:
ordinary_least_square(df['area'], df['value'])

(1.273136797389059, 7.806465474455767)

In [18]:
theta1, theta0 = fit(df['area'], df['value'])

In [19]:
print(theta1, theta0)

1.273136797389059 7.806465474455767


In [20]:
predicted = predict(df['area'], theta1, theta0)
print(mean_squared_error(predicted, df['value']))
print(r2(predicted, df['value']))

0.2922746317391325
0.641975652487156


In [23]:
betas = fit(df[['area']].values, df['value'].values, mode='ols_2', intercept=True)

In [24]:
betas

array([7.80646547, 1.2731368 ])

In [32]:
predicted = predict_multivariate(df[['area']].values, betas, intercept=True)
print(mean_squared_error(predicted, df['value']))
print(r2(predicted, df['value']))

0.2922746317391325
0.6419756524871565


In [39]:
np.array([np.log(72)])

array([4.27666612])

In [42]:
np.exp(predict(np.log(72), theta1, theta0))

568778.569868878

In [40]:
np.exp(predict_multivariate(np.array([np.log(72)]), betas, intercept=True))

array([568778.56986887])

In [16]:
theta1, theta0 = fit(df['area'], df['value'], mode='gradient_descent', verbosity=1000)

Step: 0 --- Error: 87.70010488446381
Step: 1000 --- Error: 0.6541337197535424
Step: 2000 --- Error: 0.4969364109075487
Step: 3000 --- Error: 0.4080280983225491
Step: 4000 --- Error: 0.35774296236989567
Step: 5000 --- Error: 0.32930248448590266
Step: 6000 --- Error: 0.31321699984027107
Step: 7000 --- Error: 0.30411930403711884
Step: 8000 --- Error: 0.2989737911259385
Step: 9000 --- Error: 0.2960635703742233


In [113]:
betas = fit(df['area'], df['value'], mode='gradient_descent_multivariate', verbosity=1000)

Step: 0 --- Error: 97.17937234750998
Step: 1000 --- Error: 0.6850250629817578
Step: 2000 --- Error: 0.514408066243553
Step: 3000 --- Error: 0.41790979035273856
Step: 4000 --- Error: 0.36333189115206443
Step: 5000 --- Error: 0.33246349424381877
Step: 6000 --- Error: 0.31500481698360694
Step: 7000 --- Error: 0.30513046514790004
Step: 8000 --- Error: 0.2995456878636821
Step: 9000 --- Error: 0.2963870261273715


In [17]:
predicted = predict(df['area'], theta1, theta0)
print(mean_squared_error(predicted, df['value']))
print(r2(predicted, df['value']))

0.29441759552780694
0.639350610390212


In [18]:
np.exp(predict(np.log(72), theta1, theta0))

550014.4752076697

### Multivariable regression

In [21]:
X = df[['area', 'distance_beach', 'distance_supmarket']].values
y = df['value'].values

In [34]:
x = np.array([[1., 2.], [3., 4.]])

In [36]:
# NOTE(review): this rebinds `y`, which the first cell of this section set to
# df['value'].values; re-run that cell before using `y` as the regression target again.
y = np.array([2, 4])

In [39]:
np.dot(x, y)

array([10., 22.])

In [41]:
np.dot(np.dot(np.linalg.inv(np.dot(x.T, x)), x.T), y)

array([8.8817842e-15, 1.0000000e+00])

In [32]:
np.linalg.inv(np.array([[[1., 2.], [3., 4.]], [[1, 3], [3, 5]]]))

array([[[-2.  ,  1.  ],
        [ 1.5 , -0.5 ]],

       [[-1.25,  0.75],
        [ 0.75, -0.25]]])

In [29]:
# Preview of the design matrix with the bias column prepended. `intercept` is only a
# parameter of the fitting functions and is not defined at notebook scope, so the
# column of ones is added unconditionally here.
np.c_[np.ones(X.shape[0]), X]

array([[1.        , 1.72895981, 0.21585741, 0.5842454 ],
       [1.        , 1.67476451, 0.64402824, 0.12618674],
       [1.        , 1.63623532, 0.05782096, 0.35299077],
       ...,
       [1.        , 1.44656486, 0.225651  , 0.35164723],
       [1.        , 1.38812598, 2.29709264, 0.51936684],
       [1.        , 1.51365247, 0.57348707, 0.02753345]])