In [138]:
import numpy as np
import warnings
from metrics import *

class SimpleLinearRegressor:
    """Linear regression fitted by least squares or batch gradient descent.

    Parameters
    ----------
    intercept : bool, default True
        When true, a leading column of ones is added to the feature matrix in
        ``fit`` (and, by default, in ``predict``) so that ``betas[0]`` is the
        intercept term.

    Attributes
    ----------
    betas : numpy.ndarray or None
        Fitted coefficients; ``None`` until ``fit`` has succeeded.
    is_fitted : bool
        ``True`` once either fitting routine has completed.
    """

    def __init__(self, intercept=True):
        self.intercept = intercept
        self.betas = None
        self.is_fitted = False

    def predict(self, X, intercept=None):
        """Return ``X @ betas`` for the fitted coefficients.

        Parameters
        ----------
        X : object exposing ``.shape`` (NumPy array, pandas Series/DataFrame)
            Feature values, one row per sample.
        intercept : bool or None, default None
            Whether to prepend the column of ones. ``None`` follows the
            ``intercept`` setting given to the constructor, so predictions
            always match the layout used by ``fit``. Pass ``False`` when ``X``
            already contains that column.

        Raises
        ------
        Exception
            If the model has not been fitted yet.
        """
        # Check the state directly instead of waiting for np.dot to choke on None.
        if self.betas is None:
            raise Exception('Model is not fitted. Call fit before predict')

        add_intercept = self.intercept if intercept is None else intercept
        design = np.c_[np.ones(X.shape[0]), X] if add_intercept else X
        return np.dot(design, self.betas)

    def __ordinary_least_square(self, X, y, **kwargs):
        """Solve the least-squares problem in closed form.

        ``X`` is the final design matrix (intercept column already added by
        ``fit`` when requested). Extra keyword arguments are accepted and
        ignored so that ``fit`` can call every solver with the same arguments.
        """
        design = np.asarray(X, dtype=float)
        target = np.asarray(y, dtype=float)
        # lstsq avoids forming and inverting X.T @ X explicitly; it also returns
        # the minimum-norm solution when columns are collinear instead of
        # raising on a singular matrix.
        self.betas = np.linalg.lstsq(design, target, rcond=None)[0]
        self.is_fitted = True

    def __gradient_descent(self, X, y, learning_rate, steps, seed, verbosity, cost_func='mse', **kwargs):
        """Fit the coefficients with batch gradient descent on the squared error.

        Parameters
        ----------
        X : array-like, 2-D
            Final design matrix (intercept column already added by ``fit``).
        y : array-like
            Target values, one per row of ``X``.
        learning_rate : float
            Step size applied to the gradient.
        steps : int
            Number of update steps.
        seed : int or None
            Seed for the random initial coefficients.
        verbosity : int
            ``0`` is silent; otherwise the cost is printed every ``verbosity``
            steps.
        cost_func : {'mse', 'mae'}
            Metric used for the printed progress only; the update itself always
            follows the gradient of the mean squared error.

        Raises
        ------
        ValueError
            If ``cost_func`` is not a supported name.
        Exception
            If ``X`` and ``y`` have a different number of rows.
        """
        if cost_func not in ('mse', 'mae'):
            # Fail loudly: previously an unknown name surfaced as a TypeError
            # that fit() mistook for an invalid mode and answered with OLS.
            raise ValueError(
                "Invalid cost_func: {!r}. Expected 'mse' or 'mae'.".format(cost_func))

        design = np.asarray(X, dtype=float)
        target = np.asarray(y, dtype=float)
        if design.shape[:1] != target.shape[:1]:
            raise Exception('X and y have different lenght')

        # Only resolve the reporting metric when something will be printed, so a
        # silent run never evaluates the cost at all.
        report_cost = None
        if verbosity != 0:
            report_cost = {
                'mse': mean_squared_error,
                'mae': mean_absolute_error,
            }[cost_func]

        # A private generator yields the same initial values as seeding the
        # global one, without disturbing other users of np.random.
        rng = np.random.RandomState(seed)
        self.betas = rng.rand(design.shape[1])
        n_samples = design.shape[0]

        for step in range(steps):
            predicted = np.dot(design, self.betas)

            if report_cost is not None and step % verbosity == 0:
                print('Step: {} --- Error: {}'.format(step, report_cost(predicted, target)))

            # The MSE gradient needs the signed residual; compute it here so
            # its sign does not depend on an external helper.
            residual = predicted - target
            gradient = 2 * np.dot(design.T, residual) / n_samples
            self.betas -= learning_rate * gradient

        self.is_fitted = True

    def fit(self, X, y, mode='ols', learning_rate=0.01, steps=10000, seed=0, verbosity=0, cost_func='mse'):
        """Estimate the coefficients and store them in ``self.betas``.

        Parameters
        ----------
        X : object exposing ``.shape`` (NumPy array, pandas Series/DataFrame)
            Feature values, one row per sample.
        y : array-like
            Target values.
        mode : {'ols', 'gradient_descent'}, default 'ols'
            Solver to use. Any other value triggers a warning and falls back
            to ``'ols'``.
        learning_rate, steps, seed, verbosity, cost_func
            Gradient-descent settings; they have no effect in ``'ols'`` mode.

        Raises
        ------
        Exception
            If ``X`` has no ``shape`` attribute.
        """
        modes = {
            'ols': self.__ordinary_least_square,
            'gradient_descent': self.__gradient_descent
        }

        try:
            design = np.c_[np.ones(X.shape[0]), X] if self.intercept else X
        except AttributeError as e:
            raise Exception('X must a be an numpy array').with_traceback(e.__traceback__)

        # Resolve the solver up front. Only an unknown (or unhashable) mode
        # falls back to OLS; errors raised while fitting now propagate instead
        # of being mistaken for a bad mode.
        try:
            solver = modes.get(mode)
        except TypeError:
            solver = None
        if solver is None:
            warnings.warn('Invalid parameter: {}. Using ols.'.format(mode))
            solver = modes['ols']

        solver(X=design,
               y=y,
               learning_rate=learning_rate,
               steps=steps,
               seed=seed,
               verbosity=verbosity,
               cost_func=cost_func)


In [3]:
import pandas as pd

In [4]:
# Load the house-price dataset from a CSV path relative to the notebook,
# then display the first rows as a quick sanity check.
df = pd.read_csv('../datasets/dataset_house_price.csv')
df.head()

Unnamed: 0,value,area,distance_beach,distance_supmarket
0,4600000,280,0.240925,0.793637
1,900000,208,0.904136,0.134494
2,2550000,170,0.059525,0.423318
3,550000,100,2.883181,0.525064
4,2200000,164,0.239758,0.192374


In [5]:
# Log-transform the price and the area; the two distance columns use
# log(x + 1) instead, which stays finite when a distance is 0.
# NOTE(review): the columns are overwritten in place, so re-running this cell
# applies the logarithm a second time — reload df before re-executing.
df['value'] = np.log(df['value'])
df['area'] = np.log(df['area'])
df['distance_beach'] = np.log(df['distance_beach'] + 1)
df['distance_supmarket'] = np.log(df['distance_supmarket'] + 1)

In [139]:
# Regressor with the default configuration (intercept=True).
clf = SimpleLinearRegressor()

In [140]:
# Fit value on area by gradient descent; verbosity=1000 asks fit to print the
# cost on every 1000th step. Learning rate, steps and seed keep their defaults.
clf.fit(df['area'], df['value'], mode='gradient_descent', verbosity=1000)

Exception: X must a be an numpy array

In [104]:
# In-sample predictions for the area column.
predicted = clf.predict(df['area'])

In [105]:
# Print the mean squared error and then r2 of the predictions against the
# value column (predictions first, observed values second).
# NOTE(review): r2 is presumably provided by `from metrics import *` — confirm.
print(mean_squared_error(predicted, df['value']))
print(r2(predicted, df['value']))

0.2922746317391325
0.6419756524871565


In [32]:
# Refit the same regressor with the closed-form least-squares solver.
# NOTE(review): verbosity is accepted but has no effect in 'ols' mode.
clf.fit(df['area'], df['value'], mode='ols', verbosity=1000)

In [33]:
# In-sample predictions for the area column; this overwrites the earlier
# `predicted` variable.
predicted = clf.predict(df['area'])

In [34]:
# Print the mean squared error and then r2 of the current predictions against
# the value column (predictions first, observed values second).
# NOTE(review): r2 is presumably provided by `from metrics import *` — confirm.
print(mean_squared_error(predicted, df['value']))
print(r2(predicted, df['value']))

0.2922746317391325
0.6419756524871565
