In [1]:
import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))


from src.linear.factory import LinearModelFactory
from src.tree.factory import IdentificationTreeFactory
from src.ensembles.bagging.factory import BaggingFactory
from src.ensembles.boosting.factory import BoostingFactory
from src.neighborhood.factory import NearestNeighborFactory

## **1) Dataset Preprocessing**

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('../data/Boston.csv')
df

Unnamed: 0.1,Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,502,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,503,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,504,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,505,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [4]:
scaled_df = StandardScaler().fit_transform(df)
x_df = scaled_df[:, :-1]
y_df = scaled_df[:, -1]

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((404, 14), (404,), (102, 14), (102,))

## **Helper Function**

In [6]:
def evaluation(y_pred, y_test):
    return {
        "mse": ((y_test - y_pred) ** 2).mean(),
        "mae": abs(y_test - y_pred).mean(),
        "r2": 1 - ((y_test - y_pred) ** 2).sum() / ((y_test - y_test.mean()) ** 2).sum()
    }

## **2) Machine Learning Model Experiments**

### **2.1) Linear Models**

In [7]:
model = (
    LinearModelFactory.create("linear_regression")
    .compile(optimizer="sgd", loss='huber')
    .build()
)

model.fit(x_train, y_train, epochs=200)
y_pred = model.predict(x_test)

evaluation(y_pred.reshape(-1, ), y_test)

-------------------[EPOCH 1/200]---------------------
Error: 1.262807141884912
-------------------[EPOCH 2/200]---------------------
Error: 1.16868458723037
-------------------[EPOCH 3/200]---------------------
Error: 1.0843422243547913
-------------------[EPOCH 4/200]---------------------
Error: 1.0076630319620241
-------------------[EPOCH 5/200]---------------------
Error: 0.9375377504982582
-------------------[EPOCH 6/200]---------------------
Error: 0.873196548509985
-------------------[EPOCH 7/200]---------------------
Error: 0.8138617438188572
-------------------[EPOCH 8/200]---------------------
Error: 0.7588595734619578
-------------------[EPOCH 9/200]---------------------
Error: 0.7076804795846346
-------------------[EPOCH 10/200]---------------------
Error: 0.659931013372768
-------------------[EPOCH 11/200]---------------------
Error: 0.615548597304791
-------------------[EPOCH 12/200]---------------------
Error: 0.5743699482629634
-------------------[EPOCH 13/200]----------

{'mse': np.float64(0.3487954753517179),
 'mae': np.float64(0.38272241411454844),
 'r2': np.float64(0.6776352825785101)}

In [8]:
model = (
    BoostingFactory.create("regressor")
    .compile(n_estimators=10, learning_rate=0.1, max_depth=5, impurity_type="squared_loss")
    .build()
)

model.fit(x_train, y_train)
y_pred = model.predict(x_test)

evaluation(y_pred, y_test)

-------------------[Iteration 0/10]---------------------
Error : 0.6501683782149524
-------------------[Iteration 1/10]---------------------
Error : 0.4392855595321931
-------------------[Iteration 2/10]---------------------
Error : 0.3040970445759601
-------------------[Iteration 3/10]---------------------
Error : 0.21286330635516665
-------------------[Iteration 4/10]---------------------
Error : 0.15121356327752422
-------------------[Iteration 5/10]---------------------
Error : 0.10922861496272179
-------------------[Iteration 6/10]---------------------
Error : 0.08176271374149606
-------------------[Iteration 7/10]---------------------
Error : 0.06265158100353259
-------------------[Iteration 8/10]---------------------
Error : 0.04850211870978394
-------------------[Iteration 9/10]---------------------
Error : 0.03953270665115571


{'mse': np.float64(0.13347036966310763),
 'mae': np.float64(0.2608726868774021),
 'r2': np.float64(0.8766436463741313)}

In [9]:
model = (
    NearestNeighborFactory.create("regressor")
    .compile(k=10, metrics="manhattan")
    .build()
)

model.fit(x_train, y_train)
y_pred = model.predict(x_test)

evaluation(y_pred, y_test)

{'mse': np.float64(0.21297396557972456),
 'mae': np.float64(0.30283494602508226),
 'r2': np.float64(0.8031646134084409)}

In [10]:
model = (
    IdentificationTreeFactory.create("regressor")
    .compile(max_depth=100, impurity_type="mse")
    .build()
)

# bagged_model = (
#     BaggingFactory.create("regressor")
#     .compile(n_estimators=10, estimator=model)
#     .build()

# )

model.fit(x_train, y_train)
y_pred = model.predict(x_test)

evaluation(y_pred, y_test)

{'mse': np.float64(0.1985507788792941),
 'mae': np.float64(0.32629902573718395),
 'r2': np.float64(0.8164948508500625)}