In [1]:
import pandas as pd
import numpy as np
import random

In [57]:
df = pd.read_csv('../datasets/dataset_house_price_classification.csv')

In [58]:
df.head()

Unnamed: 0,value,area,distance_beach,dist_supmarket,should_buy
0,4600000,280,0.240925,0.793637,1
1,900000,208,0.904136,0.134494,1
2,2550000,170,0.059525,0.423318,1
3,550000,100,2.883181,0.525064,0
4,2200000,164,0.239758,0.192374,1


In [62]:
df['value'] = np.log(df['value'])
df['area'] = np.log(df['area'])
df['distance_beach'] = np.log(df['distance_beach'] + 1)
df['dist_supmarket'] = np.log(df['dist_supmarket'] + 1)

In [63]:
df.head()

Unnamed: 0,value,area,distance_beach,dist_supmarket,should_buy
0,15.341567,5.63479,0.215857,0.584245,1
1,13.71015,5.337538,0.644028,0.126187,1
2,14.751604,5.135798,0.057821,0.352991,1
3,13.217674,4.60517,1.356655,0.422036,0
4,14.603968,5.099866,0.214916,0.175946,1


In [35]:
def fit(X, y, mode='gradient_descent', learning_rate=0.01, steps=10000, seed=0, verbosity=0, cost_func='cross_entropy'):
    modes = {
        'gradient_descent': gradient_descent
    }
    
    return modes.get(mode, lambda: 'Invalid')(X=X,
                                              y=y,
                                              learning_rate=learning_rate,
                                              steps=steps,
                                              seed=seed,
                                              verbosity=verbosity,
                                              cost_func=cost_func)

In [27]:
def absolute_error(pred_y, expec_y):
    return (pred_y - expec_y)

In [71]:
def predict_proba(X, betas, intercept=True):
    X = np.c_[np.ones(X.shape[0]), X] if intercept else X
    
    z = np.dot(X, betas)
    return 1 / (1 + np.exp(-z))

In [70]:
def predict(y, betas, threshold = 0.5):
    return (predict_proba(y, betas) > 0.5)*1

In [44]:
def cross_entropy(y_pred, y_expec):
    return (-y_expec*np.log(y_pred) - (1 - y_expec)*np.log(1 - y_pred)).mean()

In [60]:
def gradient_descent(X, y, learning_rate, steps, seed, verbosity, cost_func='cross_entropy', intercept=True, **kwargs):
    np.random.seed(seed)
    
    X = np.c_[np.ones(X.shape[0]), X] if intercept else X
    
    minimize = {
        'cross_entropy': cross_entropy
    }
    
    betas = np.random.rand(X.shape[1])
    
    for i in range(steps):
        predicted = predict_proba(X, betas, intercept=False)
        error = minimize.get(cost_func, lambda: 'Invalid')(predicted, y)
            
        if verbosity != 0 and i % verbosity == 0 :
            print('Step: {} --- Error: {}'.format(i, error))
        
        error = predicted - y
        gradient = np.dot(X.T, error)/(len(X))
        betas -= learning_rate*gradient
    
    return betas

In [64]:
betas = fit(df.drop('should_buy', axis=1).values, df['should_buy'].values, seed=42, learning_rate=0.1, verbosity=1000)

Step: 0 --- Error: 8.64039360110891
Step: 1000 --- Error: 0.3068509524080482
Step: 2000 --- Error: 0.29882549538137126
Step: 3000 --- Error: 0.2983581074563666
Step: 4000 --- Error: 0.29806465608454114
Step: 5000 --- Error: 0.29780855042822774
Step: 6000 --- Error: 0.2975664466572868
Step: 7000 --- Error: 0.2973333549433567
Step: 8000 --- Error: 0.2971078250585845
Step: 9000 --- Error: 0.29688923704683573


In [65]:
np.array([[np.log(72), np.log(0.5 + 1), np.log(0.1 +1), np.log(850000)]])

array([[ 4.27666612,  0.40546511,  0.09531018, 13.65299163]])

In [66]:
predict_proba(np.array([[np.log(72), np.log(0.5 + 1), np.log(0.1 +1), np.log(850000)]]), betas)

array([0.69807295])

In [72]:
predict(np.array([[np.log(72), np.log(0.5 + 1), np.log(0.1 +1), np.log(850000)]]), betas)

array([1])

In [67]:
def accuracy(y_pred, y_expec):
    return (y_pred == y_expec).mean()