# Import librerie e training set

In [None]:
#import
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

#partitioning & grid
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, KFold
from scipy.stats import uniform
from sklearn.metrics import make_scorer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, mutual_info_regression, f_regression, r_regression
from sklearn.model_selection import learning_curve

#models
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

#evaluation
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [None]:
# Load the training set: the file is read without a header row, its first
# 7 lines are skipped, and the first column is used as the row index.
df = pd.read_csv('ML-CUP22-TR.csv', header=None, skiprows=7, index_col=0) #, skipinitialspace=True)
# Throw that index away and fall back to the default 0..n-1 row index.
df.reset_index(drop=True, inplace=True)
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
# Feature matrix: every column except the ones labelled 10 and 11.
X = df.drop(columns=[10,11]).values
X

In [None]:
# Target matrix: the two columns labelled 10 and 11, one column per output.
y = df[[10,11]].values

# Partitioning

In [None]:
def splitting(X, y, test_size=0.15):
    """Hold out a test set and separate the two target columns.

    The split is shuffled with a fixed seed (``random_state=0``).

    Args:
        X: feature rows.
        y: target rows; entries 0 and 1 of every row are extracted.
        test_size: share of the rows reserved for the test set.

    Returns:
        ``(X_train, X_test, y_train1, y_train2, y_test1, y_test2)`` where the
        ``1``/``2`` suffix denotes entry 0/1 of each target row.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=True, random_state=0
    )

    def _column(rows, position):
        # Collect one entry per target row into a fresh 1-D array.
        return np.array([row[position] for row in rows])

    return (
        X_train,
        X_test,
        _column(y_train, 0),
        _column(y_train, 1),
        _column(y_test, 0),
        _column(y_test, 1),
    )


X_train, X_test, y_train1, y_train2, y_test1, y_test2 = splitting(X, y)


In [None]:
(y_train1 == y_train2).sum(), y_train1.shape

In [None]:
X_train.shape, X_test.shape, y_train1.shape, y_train2.shape, y_test1.shape, y_test2.shape

In [None]:
df.iloc[:, 0:9]

# Support functions

In [None]:
def mean_euclidean_error(T, O):
    """Return the mean Euclidean distance between targets and outputs.

    Each sample contributes the Euclidean norm of ``t - o``; the result is the
    average of those norms over all samples.

    Args:
        T: array-like of targets, one sample per entry along the first axis.
            A 1-D input is treated as one scalar target per sample.
        O: array-like of outputs with the same number of samples as ``T``.

    Returns:
        The mean per-sample Euclidean error as a float, or ``0.0`` when there
        are no samples.

    Raises:
        ValueError: if ``T`` and ``O`` hold a different number of samples.
    """
    targets = np.atleast_1d(np.asarray(T))
    outputs = np.atleast_1d(np.asarray(O))

    n_samples = targets.shape[0]
    if outputs.shape[0] != n_samples:
        # Pairing rows with zip() would silently drop the surplus samples
        # while still dividing by the number of targets.
        raise ValueError(
            "T and O must contain the same number of samples "
            f"(got {n_samples} and {outputs.shape[0]})"
        )
    if n_samples == 0:
        return 0.0

    # Flatten everything after the sample axis so that 1-D inputs (one scalar
    # per sample) and multi-output inputs go through the same row-wise norm.
    residuals = targets.reshape(n_samples, -1) - outputs.reshape(n_samples, -1)
    return float(np.linalg.norm(residuals, axis=1).mean())



In [None]:
def get_evaluations(model, X_test, y_test):
    """Predict on ``X_test`` and score the predictions against ``y_test``.

    Returns:
        ``(y_pred, r2, mse, mae, mee)``: the predictions followed by the R2,
        mean squared error, mean absolute error and mean Euclidean error, each
        computed as ``metric(y_test, y_pred)``.
    """
    y_pred = model.predict(X_test)
    metrics = (
        r2_score,
        mean_squared_error,
        mean_absolute_error,
        mean_euclidean_error,
    )
    return (y_pred, *(metric(y_test, y_pred) for metric in metrics))

def print_evaluations(r2, mse, mae, mee):
    """Print the four metrics, one per line, rounded to three decimals."""
    report = (
        ('R2: %.3f', r2),
        ('MSE: %.3f', mse),
        ('MAE: %.3f', mae),
        ('MEE: %.3f', mee),
    )
    for template, value in report:
        print(template % value)

# Model selection

## Decision Tree

In [None]:
dt = DecisionTreeRegressor(random_state=42)

In [None]:
def get_alphas(y_train, vals=50):
    """Build an evenly spaced grid of candidate ``ccp_alpha`` values.

    The cost-complexity pruning path of the module-level tree ``dt`` is
    computed on the module-level ``X_train`` and the given ``y_train``; the
    grid then spans the smallest to the largest alpha found on that path.

    Args:
        y_train: training targets paired with ``X_train``.
        vals: number of grid points to generate.

    Returns:
        1-D array of ``vals`` linearly spaced, non-negative alphas.
    """
    path = dt.cost_complexity_pruning_path(X_train, y_train)
    alphas = path.ccp_alphas
    # ccp_alpha has to be non-negative, so clamp the lower bound at zero in
    # case rounding error leaves a tiny negative value at the start of the path.
    lowest = max(float(alphas.min()), 0.0)
    return np.linspace(lowest, float(alphas.max()), vals)


ccp_alphas1 = get_alphas(y_train1)
ccp_alphas2 = get_alphas(y_train2)

In [None]:
ccp_alphas1

In [None]:
ccp_alphas2

In [None]:
# Two sub-grids that differ only in the ccp_alpha candidates they use:
# the first takes ccp_alphas1, the second ccp_alphas2.
param_grid = [
    {
        "criterion": ["squared_error", "friedman_mse", "absolute_error"],
        "ccp_alpha": alphas,
        "max_depth": list(range(4, 9)),
    }
    for alphas in (ccp_alphas1, ccp_alphas2)
]

In [None]:
# Exhaustive search over param_grid for the decision tree, using shuffled
# 5-fold cross-validation with a fixed seed.
# The scorer is built with greater_is_better=False, so the scores it reports
# are the negated MEE (values closer to zero are better).
grid = GridSearchCV(
    dt,
    param_grid=param_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring= make_scorer(mean_euclidean_error, greater_is_better=False),
    refit=True
    )

In [None]:
grid.fit(X_train, y_train1)

In [None]:
dt1 = grid.best_estimator_
grid.best_estimator_

In [None]:
grid.best_params_, grid.best_score_

In [None]:
y_pred_dt1, r2_dt1, mse_dt1, mae_dt1, mee_dt1 = get_evaluations(dt1, X_test, y_test1)

print('Model for y1')
print_evaluations(r2_dt1, mse_dt1, mae_dt1, mee_dt1)

In [None]:
grid.fit(X_train, y_train2)

In [None]:
dt2 = grid.best_estimator_

In [None]:
grid.best_params_, grid.best_score_

In [None]:
y_pred_dt2, r2_dt2, mse_dt2, mae_dt2, mee_dt2 = get_evaluations(dt2, X_test, y_test2)

In [None]:
y2_pred = dt2.predict(X_test)
print(r2_score(y_test2, y2_pred))

## Model assessment

In [None]:
#y_pred_dt1, r2_dt1, mse_dt1, mae_dt1, mee_dt1 = get_evaluations(dt1, X_test, y_test1)


print('Model for y1')
print_evaluations(r2_dt1, mse_dt1, mae_dt1, mee_dt1)
print()
print('Model for y2')
print_evaluations(r2_dt2, mse_dt2, mae_dt2, mee_dt2)



In [None]:
# Joint MEE of the two decision trees over both outputs.
# The 1-D arrays are stacked as columns: passing two arrays positionally to
# pd.DataFrame would use the second one as the index and keep only a single
# data column, so only one output would be compared.
mean_euclidean_error(np.column_stack((y_test1, y_test2)),
                     np.column_stack((y_pred_dt1, y_pred_dt2)))

In [None]:

# Joint MEE of the two decision trees: predictions and targets are each
# arranged as a two-column array, one column per output.
mean_euclidean_error(
    np.column_stack((y_pred_dt1, y_pred_dt2)),
    np.column_stack((y_test1, y_test2)),
)

In [None]:
# Score the two single-output decision trees jointly, as if they were one
# two-output model. The stacked arrays are built once and reused.
Y_test_dt = np.column_stack((y_test1, y_test2))
Y_pred_dt = np.column_stack((y_pred_dt1, y_pred_dt2))

# scikit-learn metrics expect (y_true, y_pred). The order matters for R2,
# whose denominator is the variance of the true targets.
r2_ensambled_dt = r2_score(Y_test_dt, Y_pred_dt)
mse_ensambled_dt = mean_squared_error(Y_test_dt, Y_pred_dt)
mae_ensambled_dt = mean_absolute_error(Y_test_dt, Y_pred_dt)
mee_ensambled_dt = mean_euclidean_error(Y_test_dt, Y_pred_dt)

print('DT Ensemble models')
print_evaluations(r2_ensambled_dt, mse_ensambled_dt, mae_ensambled_dt, mee_ensambled_dt)

## Random Forest

In [None]:
# Seeded so that the grid search and the refitted forests are reproducible
# from run to run, like the seeded DecisionTreeRegressor in this notebook.
rf = RandomForestRegressor(random_state=42)

In [None]:
# Search space for the random forest.
param_list = {
    'n_estimators': [10, 50, 100, 250],          # number of trees in the forest
    'max_depth': np.arange(4, 12, 2),            # max depth of each tree
    # Number of features to consider at each split. 1.0 means "all features",
    # which is what 'auto' stood for in the regressor; 'auto' itself was
    # deprecated in scikit-learn 1.1 and is rejected from 1.3 onwards.
    'max_features': [1.0, 'sqrt', 'log2'],
    'min_samples_split': np.arange(6, 16, 2),    # samples required to split a node
    #'min_samples_leaf': np.arange(4, 9, 2),     # samples required at leaf nodes
    'bootstrap': [True, False],                  # whether each tree is fit on a bootstrap sample
}

In [None]:
# Exhaustive search over param_list for the random forest, using shuffled
# 5-fold cross-validation with a fixed seed.
# The scorer is built with greater_is_better=False, so the scores it reports
# are the negated MEE (values closer to zero are better).
grid = GridSearchCV(
    rf,
    param_grid=param_list,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring= make_scorer(mean_euclidean_error, greater_is_better=False),
    refit=True
    )

In [None]:
grid.fit(X_train, y_train1)

In [None]:
rf1 = grid.best_estimator_

In [None]:
grid.best_params_, grid.best_score_

In [None]:
grid.fit(X_train, y_train2)

In [None]:
rf2 = grid.best_estimator_

In [None]:
grid.best_params_, grid.best_score_

## Model Assessment

In [None]:
# Evaluate each selected forest on the held-out test set, against the target
# it was selected for, and print the per-target metrics.
y_pred_rf1, r2_rf1, mse_rf1, mae_rf1, mee_rf1 = get_evaluations(rf1, X_test, y_test1)
y_pred_rf2, r2_rf2, mse_rf2,mae_rf2, mee_rf2 = get_evaluations(rf2, X_test, y_test2)
print('Model for y1')
print_evaluations(r2_rf1, mse_rf1, mae_rf1, mee_rf1)
print()
print('Model for y2')
print_evaluations(r2_rf2, mse_rf2, mae_rf2, mee_rf2)

In [None]:
# Joint MEE of the two random forests: predictions and targets are each
# arranged as a two-column array, one column per output.
mean_euclidean_error(
    np.column_stack((y_pred_rf1, y_pred_rf2)),
    np.column_stack((y_test1, y_test2)),
)

In [None]:
# Score the two single-output random forests jointly, as if they were one
# two-output model. The stacked arrays are built once and reused.
Y_test_rf = np.column_stack((y_test1, y_test2))
Y_pred_rf = np.column_stack((y_pred_rf1, y_pred_rf2))

# scikit-learn metrics expect (y_true, y_pred). The order matters for R2,
# whose denominator is the variance of the true targets.
r2_ensambled_rf = r2_score(Y_test_rf, Y_pred_rf)
mse_ensambled_rf = mean_squared_error(Y_test_rf, Y_pred_rf)
mae_ensambled_rf = mean_absolute_error(Y_test_rf, Y_pred_rf)
mee_ensambled_rf = mean_euclidean_error(Y_test_rf, Y_pred_rf)

print('RF Ensemble models')
print_evaluations(r2_ensambled_rf, mse_ensambled_rf, mae_ensambled_rf, mee_ensambled_rf)

# Plotting results

In [None]:
def get_learning_curve(model, plot=True, savefig=False, model_name='', plot_title='',
                       X_data=None, y_data=None):
    """Compute, and optionally draw or save, the MEE learning curve of ``model``.

    The curve is estimated with 5-fold cross-validation on 50 training-set
    sizes ranging from 1% to 100% of the available training data.

    Args:
        model: estimator to evaluate.
        plot: show the figure when True.
        savefig: write the figure to ``plots/{model_name}_learning_curve.pdf``
            when True; the ``plots`` directory is created if it is missing.
        model_name: prefix of the saved file name.
        plot_title: title drawn above the axes.
        X_data: features to use; defaults to the module-level ``X``.
        y_data: targets to use; defaults to the module-level ``y``. Pass the
            matching target here when ``model`` is meant for a single output.

    Returns:
        None.
    """
    import os

    if X_data is None:
        X_data = X
    if y_data is None:
        y_data = y

    train_sizes, train_scores, test_scores = learning_curve(
        model,
        X_data,
        y_data,
        cv=5,
        train_sizes=np.linspace(0.01, 1.0, 50),
        scoring=make_scorer(mean_euclidean_error, greater_is_better=False),
    )

    # The scorer reports the negated MEE (greater_is_better=False); flip the
    # sign back so the plotted values agree with the "MEE" axis label.
    train_mean = -np.mean(train_scores, axis=1)
    test_mean = -np.mean(test_scores, axis=1)

    if not (plot or savefig):
        return

    fig, ax = plt.subplots(1, figsize=(8, 6))
    ax.plot(train_sizes, train_mean, '--', label="Training score")
    ax.plot(train_sizes, test_mean, label="Cross-validation score")
    ax.set_title(plot_title)
    ax.set_xlabel("Training Set Size", fontsize=12)
    ax.set_ylabel("MEE", fontsize=12)
    ax.legend(loc="best", fontsize=12)
    fig.tight_layout()

    if savefig:
        # Saving into a missing directory raises, so make sure it exists.
        os.makedirs('plots', exist_ok=True)
        fig.savefig(f'plots/{model_name}_learning_curve.pdf', dpi=300, bbox_inches='tight')

    if plot:
        plt.show()
    else:
        # Saved but not displayed: release the figure.
        plt.close(fig)

In [None]:
get_learning_curve(dt1, savefig=True, model_name='DT1')

In [None]:
get_learning_curve(dt2, savefig=True, model_name='DT2')

In [None]:
get_learning_curve(rf1, savefig=True, model_name='RF1')

In [None]:
get_learning_curve(rf2, savefig=True, model_name='RF2')