In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNetCV

In [2]:
# Evaluation metrics and reporting helpers
def mse(y, y_pred):
    """Return the mean squared error between targets and predictions.

    The squared residuals are summed over every element and the total is
    divided by the length of ``y`` along its first axis.

    Args:
        y: Ground-truth values; must expose ``.shape`` (e.g. a NumPy array).
        y_pred: Predicted values, subtracted element-wise from ``y``.

    Returns:
        Sum of squared residuals divided by ``y.shape[0]``.
    """
    residual = y - y_pred
    n_rows = y.shape[0]
    return np.square(residual).sum() / n_rows

def rmsle(y, y_pred):
    """Return the square root of ``mse(y, y_pred)``.

    Note: despite the name, no logarithm is applied inside this function; the
    result is the root-mean-squared error of the inputs exactly as given.

    Args:
        y: Ground-truth values.
        y_pred: Predicted values.

    Returns:
        ``sqrt(mse(y, y_pred))``.
    """
    mean_sq_err = mse(y, y_pred)
    return np.sqrt(mean_sq_err)

def _split_metrics(model, x, y, normalize, y_mean, y_std):
    """Predict on one data split and return its ``(rmsle, mae)`` pair."""
    pred = model.predict(x)

    if normalize:
        # Map predictions from the standardized target space back to the
        # original target scale before comparing against ``y``.
        pred = pred * y_std + y_mean

    mae = np.sum(np.abs(y - pred)) / y.shape[0]
    return rmsle(y, pred), mae


def report(
    model, x_train, x_test, y_train, y_test, normalize=False, y_mean=None, y_std=None
):
    """Print error metrics of a fitted model on the training and test splits.

    For each split the model's predictions are compared against the targets
    and two numbers are printed: the value returned by ``rmsle`` (shown under
    the ``RMSLE`` label) and the mean absolute error.

    Args:
        model: Fitted estimator exposing ``predict(x)``.
        x_train: Features of the training split.
        x_test: Features of the test split.
        y_train: Targets of the training split, on the original scale.
        y_test: Targets of the test split, on the original scale.
        normalize: If True, predictions are treated as standardized and are
            transformed with ``pred * y_std + y_mean`` before scoring.
        y_mean: Target mean used for de-standardization; required when
            ``normalize`` is True.
        y_std: Target standard deviation used for de-standardization;
            required when ``normalize`` is True.

    Raises:
        ValueError: If ``normalize`` is True but ``y_mean`` or ``y_std`` is
            missing.
    """
    # Fail early with a clear message instead of an opaque TypeError raised
    # from ``pred * None`` after the model has already been run.
    if normalize and (y_mean is None or y_std is None):
        raise ValueError('y_mean and y_std are required when normalize=True')

    # ------- Evaluation -------
    train_rmsle, train_mae = _split_metrics(
        model, x_train, y_train, normalize, y_mean, y_std
    )
    test_rmsle, test_mae = _split_metrics(
        model, x_test, y_test, normalize, y_mean, y_std
    )

    print('Training:')
    print(f'RMSLE: {train_rmsle:.4f}, MAE: {train_mae:.4f}\n')
    print('Testing:')
    print(f'RMSLE: {test_rmsle:.4f}, MAE: {test_mae:.4f}\n')

In [11]:
# Load the pre-split feature matrices and target vectors from disk.
x_train, y_train, x_test, y_test = map(
    np.load, ('x_train.npy', 'y_train.npy', 'x_test.npy', 'y_test.npy')
)

# Show the shape of every split as a quick sanity check.
print(*(split.shape for split in (x_train, y_train, x_test, y_test)))

(1095, 90) (1095,) (365, 90) (365,)


In [4]:
# Standardize the features with statistics estimated on the training split
# only; the same statistics are then applied to the test split.
x_mean = np.mean(x_train, axis=0)
x_std = np.std(x_train, axis=0)
# A feature that is constant in the training split has zero std; dividing by
# it would fill that column with NaN/inf. Scale such columns by 1 instead.
x_scale = np.where(x_std == 0, 1.0, x_std)
x_train_norm = (x_train - x_mean) / x_scale
x_test_norm = (x_test - x_mean) / x_scale

# Standardize the targets the same way; y_mean and y_std are kept so that
# predictions can be mapped back to the original scale later.
y_mean = np.mean(y_train)
y_std = np.std(y_train)
y_train_norm = (y_train - y_mean) / y_std
y_test_norm = (y_test - y_mean) / y_std

# Machine learning modeling
Two regressors are trained and evaluated below:

- [RandomForestRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)
- [ElasticNetCV](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNetCV.html#sklearn.linear_model.ElasticNetCV)

## 機器學習建模

In [14]:
# Training
# The random forest is fitted on the raw (unscaled) features and targets.
# `criterion` is left at its default (squared error): the 'mse' spelling was
# removed in scikit-learn 1.2, while the default is accepted by every version.
# A fixed `random_state` makes the fitted forest repeatable across runs.
# NOTE(review): the recorded run of this cell failed because the input held
# NaN/inf values - check the loaded .npy arrays before fitting.
RF = RandomForestRegressor(
    n_estimators=150, max_depth=8,
    min_samples_split=3, min_samples_leaf=4,
    max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.2,
    n_jobs=4, ccp_alpha=0.5, max_samples=0.85,
    random_state=42,
)
RF.fit(x_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [6]:
# Training
# Elastic net with the regularization strength chosen by cross-validation,
# fitted on the standardized features and standardized targets.
# `normalize` is no longer passed: it was removed in scikit-learn 1.2 and
# passing it raises a TypeError there. The alpha-grid arguments are likewise
# left at their defaults; the recorded repr `ElasticNetCV()` shows that every
# value given here was already the library default.
EN = ElasticNetCV(
    l1_ratio=0.5, eps=0.001, fit_intercept=True,
    precompute='auto', max_iter=1000, tol=0.0001, cv=None, copy_X=True,
    verbose=0, n_jobs=None, positive=False, random_state=None,
    selection='cyclic',
)
EN.fit(x_train_norm, y_train_norm)

ElasticNetCV()

In [7]:
# Score the random forest; it is trained on raw features and targets, so the
# predictions are compared against the targets without any rescaling.
report(model=RF, x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test)

NotFittedError: This RandomForestRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [8]:
# Score the elastic net on the original target scale: it is trained on
# standardized targets, so predictions are de-standardized with y_mean / y_std.
report(
    EN, x_train_norm, x_test_norm, y_train, y_test,
    normalize=True, y_mean=y_mean, y_std=y_std,
)

Training:
RMSLE: 30151.6206, MAE: 19627.6487

Testing:
RMSLE: 41570.0056, MAE: 23912.3402

