## scikit learn

In [59]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import (
    HistGradientBoostingRegressor,
    GradientBoostingRegressor, 
    RandomForestRegressor
)
import joblib

In [11]:
housing = datasets.fetch_california_housing()

In [12]:
print(housing.feature_names)

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [13]:
print(housing.target_names)

['MedHouseVal']


In [14]:
x = housing.data
y = housing.target

In [15]:
print("nuber of rows:", len(x))
print("nuber of columns:", len(housing.feature_names))

nuber of rows: 20640
nuber of columns: 8


In [16]:
print(x[0])
print(y[0])

[   8.3252       41.            6.98412698    1.02380952  322.
    2.55555556   37.88       -122.23      ]
4.526


In [27]:
x_train, x_test, y_train, y_test = train_test_split(
    x, 
    y, 
    test_size=0.2,
    random_state=432
    )

In [20]:
print("number of train samples:", len(x_train))
print("number of test samples:", len(x_test))

number of train samples: 16512
number of test samples: 4128


In [35]:
print(x_train[0])
print(y_train[0])

[   2.1442       52.            3.94886364    1.03693182  921.
    2.61647727   37.34       -121.88      ]
1.889


In [37]:
model = LinearRegression()
model.fit(x_train, y_train)

In [39]:
y_pred = model.predict(x_test)

r2 = r2_score(y_test, y_pred)

print('r2 score:', r2)

r2 score: 0.6080229586580355


In [None]:
# baseline
# r2 score: 0.6080229586580355

### Optimize Features

In [41]:
print("old number of features:", len(x_train[0]))

old number of features: 8


In [42]:
poly = PolynomialFeatures()
x_train = poly.fit_transform(x_train)
x_test = poly.fit_transform(x_test)

In [43]:
print("new number of features:", len(x_train[0]))

new number of features: 45


In [44]:
model.fit(x_train, y_train) # train
y_pred = model.predict(x_test) # test
r2 = r2_score(y_test, y_pred) # evaluate

print("new R2 score:", r2)

new R2 score: 0.6610240205031894


### Optimize Algorithms

In [47]:
# initialize models
LR = LinearRegression()
GBR = GradientBoostingRegressor()
RFR = RandomForestRegressor()

In [48]:
for model in [LR, GBR, RFR]:
    model.fit(x_train, y_train) # train
    y_pred = model.predict(x_test) # test
    r2 = r2_score(y_test, y_pred) # evaluate
    print("Model:", model)
    print("R2 score:", r2)
    print("-------------")

Model: LinearRegression()
R2 score: 0.6610240205031894
-------------
Model: GradientBoostingRegressor()
R2 score: 0.7918940977930871
-------------
Model: RandomForestRegressor()
R2 score: 0.805058980507783
-------------


### Optimize Algorithm Speed

In [53]:
GBR = HistGradientBoostingRegressor()
RFR = RandomForestRegressor(
    n_jobs=-1
)

In [54]:
for model in [GBR, RFR]:
    model.fit(x_train, y_train) # train
    y_pred = model.predict(x_test) # test
    r2 = r2_score(y_test, y_pred) # evaluate
    print("Model:", model)
    print("R2 score:", r2)
    print("-------------")

Model: HistGradientBoostingRegressor()
R2 score: 0.8350267333970385
-------------
Model: RandomForestRegressor(n_jobs=-1)
R2 score: 0.8042353824844476
-------------


### Optimize Hyperparameters

In [55]:
for i in [100, 200, 300, 400, 500]:
    model = HistGradientBoostingRegressor(
        max_iter=i
    )
    model.fit(x_train, y_train) # train
    y_pred = model.predict(x_test) # test
    r2 = r2_score(y_test, y_pred) # evaluate
    print("Number of trees:", i)
    print("R2 score:", r2)
    print("-------------")

Number of trees: 100
R2 score: 0.8393682862670675
-------------
Number of trees: 200
R2 score: 0.8421601365332462
-------------
Number of trees: 300
R2 score: 0.8461833863508974
-------------
Number of trees: 400
R2 score: 0.8418404699062381
-------------
Number of trees: 500
R2 score: 0.8442198486363686
-------------


In [57]:
for j in [0.1, 0.05, 0.001]:   
    for i in [100, 200, 300, 400, 500]:
        model = HistGradientBoostingRegressor(
            max_iter=i,
            learning_rate=j
        )
        model.fit(x_train, y_train) # train
        y_pred = model.predict(x_test) # test
        r2 = r2_score(y_test, y_pred) # evaluate
        print("Learning rate:", j)
        print("Number of trees:", i)
        print("R2 score:", r2)
        print("-------------")

Learning rate: 0.1
Number of trees: 100
R2 score: 0.8348523315182537
-------------
Learning rate: 0.1
Number of trees: 200
R2 score: 0.8450420970360444
-------------
Learning rate: 0.1
Number of trees: 300
R2 score: 0.8469678822655831
-------------
Learning rate: 0.1
Number of trees: 400
R2 score: 0.84821232549939
-------------
Learning rate: 0.1
Number of trees: 500
R2 score: 0.8440657754368279
-------------
Learning rate: 0.05
Number of trees: 100
R2 score: 0.8245385051352947
-------------
Learning rate: 0.05
Number of trees: 200
R2 score: 0.8410221601784034
-------------
Learning rate: 0.05
Number of trees: 300
R2 score: 0.8442352747021293
-------------
Learning rate: 0.05
Number of trees: 400
R2 score: 0.8456317670695327
-------------
Learning rate: 0.05
Number of trees: 500
R2 score: 0.8462042782511342
-------------
Learning rate: 0.001
Number of trees: 100
R2 score: 0.12057808544764603
-------------
Learning rate: 0.001
Number of trees: 200
R2 score: 0.22002238519569495
---------

### Model saving

Best parameters:

max_iter=500

learning_rate=0.05

In [60]:
model = HistGradientBoostingRegressor(
    max_iter=500,
    learning_rate=0.05
)
model.fit(x_train, y_train)

joblib.dump(model, "my_model.joblib")

['my_model.joblib']

In [61]:
# evaluate existing model
y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print("existing model score:", r2)

existing model score: 0.8476683780037483


In [62]:
saved_model = joblib.load("my_model.joblib")

y_pred = saved_model.predict(x_test)
r2 = r2_score(y_test, y_pred)
print("saved model score:", r2)

saved model score: 0.8476683780037483


### TODOs:
- try optimizing more than 2 hyperparameters, reaching scores above 86%
- instead of simply printing the R2 scores, try visualising them with Pandas or Matplotlib
- Find even better algorithms that outperform HistGradientBoostingRegressor
- Find a different dataset and analyze it like we've analyzed this one.