In [2]:
import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# from src.linear.models.svm import SVM
# from src.linear.models.linear import LinearRegression
from src.tree.factory import IdentificationTreeFactory
from src.ensembles.bagging.factory import BaggingFactory
from src.neighborhood.factory import NearestNeighborFactory
# from src.ensembles.boosting.gradient import GradientBoostedRegressionTree

from src.linear.factory import LinearModelFactory

## **1) Dataset Preprocessing**

In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_csv('../data/Boston.csv')
df

Unnamed: 0.1,Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,502,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,503,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,504,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,505,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [5]:
scaled_df = StandardScaler().fit_transform(df)
x_df = scaled_df[:, :-1]
y_df = scaled_df[:, -1]

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((404, 14), (404,), (102, 14), (102,))

## **Helper Function**

In [7]:
def evaluation(y_pred, y_test):
    return {
        "mse": ((y_test - y_pred) ** 2).mean(),
        "mae": abs(y_test - y_pred).mean(),
        "r2": 1 - ((y_test - y_pred) ** 2).sum() / ((y_test - y_test.mean()) ** 2).sum()
    }

## **2) Machine Learning Model Experiments**

### **2.1) Linear Models**

In [10]:
model = (
    LinearModelFactory.create("linear_regression")
    .compile(optimizer="sgd", loss='huber')
    .build()
)

model.fit(x_train, y_train, epochs=200)
y_pred = model.predict(x_test)

evaluation(y_pred.reshape(-1, ), y_test)

-------------------[EPOCH 1/200]---------------------
Error: 1.6372549822640121
-------------------[EPOCH 2/200]---------------------
Error: 1.3872041783182048
-------------------[EPOCH 3/200]---------------------
Error: 1.2068609811086208
-------------------[EPOCH 4/200]---------------------
Error: 1.0762823488147808
-------------------[EPOCH 5/200]---------------------
Error: 0.9785910445704042
-------------------[EPOCH 6/200]---------------------
Error: 0.9035195428625158
-------------------[EPOCH 7/200]---------------------
Error: 0.8431678062095844
-------------------[EPOCH 8/200]---------------------
Error: 0.7917592942376314
-------------------[EPOCH 9/200]---------------------
Error: 0.7461014964289681
-------------------[EPOCH 10/200]---------------------
Error: 0.7042795817308061
-------------------[EPOCH 11/200]---------------------
Error: 0.6653104671440715
-------------------[EPOCH 12/200]---------------------
Error: 0.6285678028402334
-------------------[EPOCH 13/200]----

{'mse': np.float64(0.35622612701582995),
 'mae': np.float64(0.3704610222906671),
 'r2': np.float64(0.5810787282236443)}

In [12]:
model = GradientBoostedRegressionTree()
model.compile(n_estimators=10, learning_rate=0.1, max_depth=3, impurity_type="huber")

model.fit(x_train, y_train)
y_pred = model.predict(x_test)

evaluation(y_pred, y_test)

-------------------[Iteration 0/10]---------------------
Error : 0.3844905493645985
-------------------[Iteration 1/10]---------------------
Error : 0.35076804382331167
-------------------[Iteration 2/10]---------------------
Error : 0.3202994307140073
-------------------[Iteration 3/10]---------------------
Error : 0.29244992452779683
-------------------[Iteration 4/10]---------------------
Error : 0.2672993149312308
-------------------[Iteration 5/10]---------------------
Error : 0.2451150494289032
-------------------[Iteration 6/10]---------------------
Error : 0.22536536622580386
-------------------[Iteration 7/10]---------------------
Error : 0.20778695485764595
-------------------[Iteration 8/10]---------------------
Error : 0.19066823600004298
-------------------[Iteration 9/10]---------------------
Error : 0.17537899847913607


{'mse': np.float64(0.24711754974615802),
 'mae': np.float64(0.3367737861228029),
 'r2': np.float64(0.6010884853480787)}

In [9]:
model = (
    NearestNeighborFactory.create("regressor")
    .compile(k=4, metrics="manhattan")
    .build()
)

model.fit(x_train, y_train)
y_pred = model.predict(x_test)

evaluation(y_pred, y_test)

{'mse': np.float64(0.1294042867362617),
 'mae': np.float64(0.2723604523198698),
 'r2': np.float64(0.800622099435147)}

In [None]:
model = (
    IdentificationTreeFactory.create("regressor")
    .compile(max_depth=100, impurity_type="mse")
    .build()
)

model.fit(x_train, y_train)
y_pred = model.predict(x_test)

evaluation(y_pred, y_test)

{'mse': np.float64(0.2393147282115792),
 'mae': np.float64(0.3182962700372857),
 'r2': np.float64(0.6924019977107047)}