In [1]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.cluster import KMeans

from collections import defaultdict

import numpy as np

from models.linear_model import LinearRegression as MyLinearRegression
from models.tree import DecisionTreeRegressor as MyDecisionTreeRegressor
from models.neighbors import KNeighborsRegressor as MyKNeighborsRegressor
from models.svm import SVR as MySVR
from models.cluster import KMeansRegressor as MyKMeansRegressor

from models.evaluate import evaluate_model_and_print

In [2]:
data = fetch_california_housing()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
model = LinearRegression()
model.fit(X_train, y_train)

evaluate_model_and_print(model, "LinearRegression", X_train, y_train, X_test, y_test)

Training model
{'MSE': 0.5179331255246697, 'MAE': 0.5286283596582387, 'R2': 0.6125511913966952, 'Runtime (s)': 0.0}
Testing model
{'MSE': 0.5558915986952425, 'MAE': 0.5332001304956989, 'R2': 0.5757877060324521, 'Runtime (s)': 0.0}


In [7]:
model = MyLinearRegression()
model.fit(X_train, y_train)

evaluate_model_and_print(model, "LinearRegression", X_train, y_train, X_test, y_test)

Training model
{'MSE': 0.5179331255246699, 'MAE': 0.5286283595868713, 'R2': 0.6125511913966952, 'Runtime (s)': 0.0012273788452148438}
Testing model
{'MSE': 0.5558915986635711, 'MAE': 0.5332001304516896, 'R2': 0.5757877060566213, 'Runtime (s)': 0.0}


In [8]:
model = KNeighborsRegressor()
model.fit(X_train, y_train)

evaluate_model_and_print(model, "KNeighborsRegressor", X_train, y_train, X_test, y_test)

Training model
{'MSE': 0.5565297600425818, 'MAE': 0.5661103900193799, 'R2': 0.5836783131753738, 'Runtime (s)': 0.03767132759094238}
Testing model
{'MSE': 1.1694144088518572, 'MAE': 0.8304867425710595, 'R2': 0.10759585116572867, 'Runtime (s)': 0.01002359390258789}


In [9]:
model = MyKNeighborsRegressor()
model.fit(X_train, y_train)

evaluate_model_and_print(model, "KNeighborsRegressor", X_train, y_train, X_test, y_test)

Training model
{'MSE': 0.5565297600425818, 'MAE': 0.5661103900193799, 'R2': 0.5836783131753738, 'Runtime (s)': 22.726046323776245}
Testing model
{'MSE': 1.1694144088518572, 'MAE': 0.8304867425710595, 'R2': 0.10759585116572867, 'Runtime (s)': 5.631476402282715}


In [3]:
model = KNeighborsRegressor(weights="distance")
model.fit(X_train, y_train)

evaluate_model_and_print(model, "KNeighborsRegressor", X_train, y_train, X_test, y_test)

Training model
{'MSE': 0.0, 'MAE': 0.0, 'R2': 1.0, 'Runtime (s)': 0.08110833168029785}
Testing model
{'MSE': 1.0853068711974831, 'MAE': 0.7989658482565546, 'R2': 0.1717800402631512, 'Runtime (s)': 0.01402735710144043}


In [4]:
model = MyKNeighborsRegressor(weights="distance")
model.fit(X_train, y_train)

evaluate_model_and_print(model, "KNeighborsRegressor", X_train, y_train, X_test, y_test)

Training model
{'MSE': 4.3878329171171916e-17, 'MAE': 4.7830276118123784e-09, 'R2': 1.0, 'Runtime (s)': 26.903007984161377}
Testing model
{'MSE': 1.1442531948011332, 'MAE': 0.8202660546786413, 'R2': 0.1267968902829213, 'Runtime (s)': 6.086496591567993}


In [10]:
model = MyKNeighborsRegressor(distance_metric="manhattan")
model.fit(X_train, y_train)

evaluate_model_and_print(model, "KNeighborsRegressor", X_train, y_train, X_test, y_test)

Training model
{'MSE': 0.5110898704215525, 'MAE': 0.5371306974725452, 'R2': 0.6176704064188767, 'Runtime (s)': 30.201469898223877}
Testing model
{'MSE': 1.0538158181481643, 'MAE': 0.7806350775193798, 'R2': 0.19581150950078763, 'Runtime (s)': 7.889655828475952}


In [11]:
model = DecisionTreeRegressor(max_depth=5)
model.fit(X_train, y_train)

evaluate_model_and_print(model, "DecisionTreeRegressor", X_train, y_train, X_test, y_test)

Training model
{'MSE': 0.4843432765009063, 'MAE': 0.5063471123358747, 'R2': 0.6376786573648903, 'Runtime (s)': 0.002016782760620117}
Testing model
{'MSE': 0.5245146178314735, 'MAE': 0.5222592972077787, 'R2': 0.5997321244428706, 'Runtime (s)': 0.0009872913360595703}


In [12]:
model = MyDecisionTreeRegressor(max_depth=5)
model.fit(X_train, y_train)

evaluate_model_and_print(model, "DecisionTreeRegressor", X_train, y_train, X_test, y_test)

Training model
{'MSE': 0.4843432765009063, 'MAE': 0.5063471123358744, 'R2': 0.6376786573648903, 'Runtime (s)': 0.018975019454956055}
Testing model
{'MSE': 0.5245146178314738, 'MAE': 0.5222592972077785, 'R2': 0.5997321244428704, 'Runtime (s)': 0.003584623336791992}


In [13]:
model = MyDecisionTreeRegressor(max_depth=5, use_all_thresholds=True)
model.fit(X_train, y_train)

evaluate_model_and_print(model, "DecisionTreeRegressor", X_train, y_train, X_test, y_test)

Training model
{'MSE': 0.4843432765009063, 'MAE': 0.5063471123358744, 'R2': 0.6376786573648903, 'Runtime (s)': 0.024107933044433594}
Testing model
{'MSE': 0.5245146178314738, 'MAE': 0.5222592972077785, 'R2': 0.5997321244428704, 'Runtime (s)': 0.005521535873413086}


In [14]:
model = SVR()
model.fit(X_train, y_train)

evaluate_model_and_print(model, "SVR", X_train, y_train, X_test, y_test)

Training model
{'MSE': 1.3685019409034658, 'MAE': 0.8715010413485207, 'R2': -0.023731482780208424, 'Runtime (s)': 15.211580276489258}
Testing model
{'MSE': 1.3320115421348784, 'MAE': 0.8599506583445775, 'R2': -0.016485360107176605, 'Runtime (s)': 4.0692925453186035}


In [15]:
model = MySVR(epsilon=10)
model.fit(X_train, y_train)

evaluate_model_and_print(model, "SVR", X_train, y_train, X_test, y_test)

Training model
{'MSE': 5.629742323103131, 'MAE': 2.071946937378876, 'R2': -3.21142585468018, 'Runtime (s)': 0.0}
Testing model
{'MSE': 5.53344670252374, 'MAE': 2.0550030959302323, 'R2': -3.2226868057267666, 'Runtime (s)': 0.0}


In [16]:
model = KMeans()
model.fit(X_train)

train_clusters = model.predict(X_train)

cluster_to_value = defaultdict(float)

for i in range(model.n_clusters):
    mask = (train_clusters == i)
    if np.sum(mask) > 0:
        cluster_to_value[i] = np.mean(y_train[mask])
    else:
        cluster_to_value[i] = 0.0

test_clusters = model.predict(X_test)
y_pred = np.array([cluster_to_value[int(cid)] for cid in test_clusters])

print("MSE:", mean_squared_error(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))


MSE: 1.309622168398441
MAE: 0.9058271467807946
R2 Score: 0.0006004307475936388


In [3]:
model = MyKMeansRegressor()
model.fit(X_train, y_train)
evaluate_model_and_print(model, "KMeansRegressor", X_train, y_train, X_test, y_test)

[3.821 1.726 0.934 ... 3.081 1.635 2.292]
[1.03  2.648 1.594 ... 2.221 2.835 3.25 ]
[3.156   1.581   2.865   1.807   1.837   1.429   2.185   1.219   3.491
 2.389   1.546   2.542   1.279   0.797   1.911   1.633   2.21    1.455
 1.289   1.162   4.025   1.694   1.739   1.43    0.618   2.62    1.126
 1.687   1.174   1.537   0.68    1.822   2.611   1.985   2.374   2.953
 2.107   2.165   0.694   2.609   2.905   2.194   2.252   2.13    2.275
 1.679   1.621   0.479   3.115   2.184   1.194   2.813   3.225   1.932
 1.11    1.673   2.621   3.304   1.868   1.6     1.661   0.225   1.782
 1.791   1.001   2.25    2.621   3.397   3.489   0.74    1.693   1.266
 2.403   1.005   2.347   2.596   0.883   1.188   1.345   2.109   1.983
 3.209   1.857   2.25    1.215   2.399   2.842   2.735   3.53    1.867
 2.148   3.107   2.362   1.234   1.668   3.1     1.298   2.534   3.258
 2.018   2.203   3.587   2.579   2.409   1.319   3.543   1.278   2.67
 1.411   1.879   1.568   1.83    2.713   1.591   5.00001 2.54    

In [None]:
model = MyKMeansRegressor(n_clusters=10000)
model.fit(X_train, y_train)
evaluate_model_and_print(model, "KMeansRegressor", X_train, y_train, X_test, y_test)