In [2]:
# Ask the shell which executable the name `python3` resolves to.
!type python3
# Install xgboost with pip, run through that interpreter via its absolute path.
# NOTE(review): the version is not pinned, so a re-run may install a different release.
!/home/ec2-user/anaconda3/envs/python3/bin/python3 -m pip install  xgboost

python3 is /home/ec2-user/anaconda3/envs/python3/bin/python3
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting xgboost
  Downloading xgboost-1.7.5-py3-none-manylinux2014_x86_64.whl (200.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.3/200.3 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.7.5


In [5]:
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
import xgboost as xgb
# from sklearn.model_selection import cross_val_score
from sklearn import ensemble
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import TheilSenRegressor
print('Imports finished...')

Imports finished...


In [6]:
# Basic functions for evaluations

def get_y_pred(model, x_test):
    """
    Call ``model.predict`` on x_test and return whatever it produces.
    The model is expected to have been fitted beforehand.
    """
    predictions = model.predict(x_test)
    return predictions

def get_rmse_test_data(y_test, y_pred):
    """
    Return the root mean squared error (RMSE) of y_pred measured against y_test.
    """
    # RMSE is the square root of the mean squared error.
    return np.sqrt(mean_squared_error(y_test, y_pred))

def get_r2_score(y_test, y_pred):
    """
    Return the R2 score of the predictions (y_pred) against the actual values (y_test).
    """
    return r2_score(y_test, y_pred)

def get_mae(y_test, y_pred):
    """
    Return the mean absolute error (MAE) of the predictions (y_pred)
    against the actual values (y_test).
    """
    return mean_absolute_error(y_test, y_pred)

def generate_result_df(model_name, parameters, rmse, r2score, mae, importance_list):
    """
    Build, display and return a one-row DataFrame summarising a model run.

    The three metrics (rmse, r2score, mae) are rounded to two decimals and
    stored as strings; model_name, parameters and importance_list are stored
    exactly as received.
    """
    def two_decimals(value):
        # Metrics are kept as text in the results table.
        return str(round(value, 2))

    row = {
        "Model": model_name,
        "Parameters": parameters,
        "RMSE": two_decimals(rmse),
        "R2 Score": two_decimals(r2score),
        "MAE": two_decimals(mae),
        "Feature Importance": importance_list,
    }
    summary = pd.DataFrame([row])
    display(summary)
    return summary

def get_feature_importance(model, model_name, x_column_names):
    """
    Pair every feature name with the weight the fitted model assigns to it.

    Parameters:
        model: a fitted estimator.
        model_name: the display name used by the caller; it selects which
            attribute is read from the model.
        x_column_names: feature names, indexable by position, in the same
            order as the columns the model was fitted on.

    Returns:
        A list of (feature_name, value) tuples, one per entry of the model's
        importance / coefficient vector.

    Raises:
        AttributeError: if the model exposes none of the expected attributes.
    """
    # These names must match the model_name strings used by the callers
    # (including the "DecissionTree" spelling).
    tree_based_names = {
        'DecissionTree Regressor',
        'RandomForest Regressor',
        'XGBoost',
        'Gradient Boost',
    }
    if model_name in tree_based_names:
        importance = model.feature_importances_
    elif hasattr(model, 'coef_'):
        importance = model.coef_
    else:
        # Meta-estimators such as RANSACRegressor have no coef_ of their own;
        # the fitted coefficients live on the wrapped estimator_. If there is
        # no estimator_ either, this raises the same AttributeError as before.
        importance = getattr(model, 'estimator_', model).coef_
    return [(x_column_names[i], v) for i, v in enumerate(importance)]
    
def graph_pred_real_data_comparison(y_test, y_pred, model_name):
    """
    Draw y_test ("original") and y_pred ("predicted") as two lines over the
    sample position on one chart titled with model_name, then show it.
    """
    positions = range(len(y_test))
    for series, legend_label in ((y_test, "original"), (y_pred, "predicted")):
        plt.plot(positions, series, label=legend_label)
    plt.title(model_name)
    plt.xlabel('X-axis')
    plt.ylabel('Y-axis')
    plt.legend(loc='best', fancybox=True, shadow=True)
    plt.grid(True)
    plt.show()

In [7]:
def linear_regression(x_train, y_train, x_test, y_test, x_column_names):
    '''
    Fit a default LinearRegression on the training split, score it on the
    test split (RMSE, R2, MAE) and return the one-row results DF.
    '''
    model_name = "Lineal Regression"
    fitted = LinearRegression()
    fitted.fit(x_train, y_train)
    y_pred = get_y_pred(fitted, x_test)
    # To plot predictions against actuals:
    # graph_pred_real_data_comparison(y_test, y_pred, model_name)
    return generate_result_df(
        model_name,
        'NA',
        get_rmse_test_data(y_test, y_pred),
        get_r2_score(y_test, y_pred),
        get_mae(y_test, y_pred),
        get_feature_importance(fitted, model_name, x_column_names),
    )

In [8]:
def ransac(x_train, y_train, x_test, y_test, x_column_names):
    '''
    Model for RANSAC regression. Returns DF with results.

    Fits a default RANSACRegressor on the training split, predicts the test
    split, computes RMSE, R2 and MAE, and reports the coefficients of the
    estimator that RANSAC fitted internally.

    NOTE(review): no random_state is passed to RANSACRegressor, so results
    may differ between runs - confirm whether that is intended.
    '''
    model_name = "RANSAC"
    ransac_model = RANSACRegressor()
    ransac_model.fit(x_train, y_train)
    y_pred = get_y_pred(ransac_model, x_test)
    rmse = get_rmse_test_data(y_test,y_pred)
    r2score = get_r2_score(y_test,y_pred)
    mae = get_mae(y_test,y_pred)
    # RANSACRegressor itself has no coef_; the coefficients belong to the
    # final inner estimator it exposes as estimator_, so pass that one.
    importance_list = get_feature_importance(ransac_model.estimator_, model_name, x_column_names)
    # graph_pred_real_data_comparison(y_test, y_pred, model_name)
    df = generate_result_df(model_name, 'NA', rmse, r2score, mae, importance_list)
    return df

In [9]:
def ridge(x_train, y_train, x_test, y_test, x_column_names):
    '''
    Fit a default Ridge regressor on the training split, score it on the
    test split (RMSE, R2, MAE) and return the one-row results DF.
    '''
    model_name = "Ridge"
    fitted = Ridge()
    fitted.fit(x_train, y_train)
    y_pred = get_y_pred(fitted, x_test)
    # To plot predictions against actuals:
    # graph_pred_real_data_comparison(y_test, y_pred, model_name)
    return generate_result_df(
        model_name,
        'NA',
        get_rmse_test_data(y_test, y_pred),
        get_r2_score(y_test, y_pred),
        get_mae(y_test, y_pred),
        get_feature_importance(fitted, model_name, x_column_names),
    )

In [10]:
def elasticnet(x_train, y_train, x_test, y_test, x_column_names):
    '''
    Fit a default ElasticNet regressor on the training split, score it on
    the test split (RMSE, R2, MAE) and return the one-row results DF.
    '''
    model_name = "ElasticNet"
    fitted = ElasticNet()
    fitted.fit(x_train, y_train)
    y_pred = get_y_pred(fitted, x_test)
    # To plot predictions against actuals:
    # graph_pred_real_data_comparison(y_test, y_pred, model_name)
    return generate_result_df(
        model_name,
        'NA',
        get_rmse_test_data(y_test, y_pred),
        get_r2_score(y_test, y_pred),
        get_mae(y_test, y_pred),
        get_feature_importance(fitted, model_name, x_column_names),
    )

In [11]:
def linearsvr(x_train, y_train, x_test, y_test, x_column_names):
    '''
    Grid-search a LinearSVR (2-fold CV, negative MSE scoring), refit a fresh
    LinearSVR with the winning parameters on the training split, score it on
    the test split (RMSE, R2, MAE) and return the one-row results DF.
    '''
    model_name = "Linear SVR"
    search_space = {
        'C': [1, 10],                    # wider option: [1, 10, 100, 1000]
        'max_iter': [10000, 20000],      # previously: [1000, 2000]
        'loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'],
        'epsilon': [0, 0.01, 0.1, 0.3],
    }
    search = GridSearchCV(
        estimator=LinearSVR(),
        param_grid=search_space,
        scoring='neg_mean_squared_error',
        cv=2,
        verbose=2,
    )
    search.fit(x_train, y_train)
    best_parameters = search.best_params_
    # best_parameters holds exactly the four searched keys: C, max_iter, loss, epsilon.
    tuned = LinearSVR(**best_parameters)
    tuned.fit(x_train, y_train)
    y_pred = get_y_pred(tuned, x_test)
    # To plot predictions against actuals:
    # graph_pred_real_data_comparison(y_test, y_pred, model_name)
    return generate_result_df(
        model_name,
        best_parameters,
        get_rmse_test_data(y_test, y_pred),
        get_r2_score(y_test, y_pred),
        get_mae(y_test, y_pred),
        get_feature_importance(tuned, model_name, x_column_names),
    )

In [12]:
def bayesianridge(x_train, y_train, x_test, y_test, x_column_names):
    '''
    Fit a default BayesianRidge regressor on the training split, score it on
    the test split (RMSE, R2, MAE) and return the one-row results DF.
    '''
    model_name = "Bayesian Ridge"
    fitted = BayesianRidge()
    fitted.fit(x_train, y_train)
    y_pred = get_y_pred(fitted, x_test)
    # To plot predictions against actuals:
    # graph_pred_real_data_comparison(y_test, y_pred, model_name)
    return generate_result_df(
        model_name,
        'NA',
        get_rmse_test_data(y_test, y_pred),
        get_r2_score(y_test, y_pred),
        get_mae(y_test, y_pred),
        get_feature_importance(fitted, model_name, x_column_names),
    )

In [13]:
def lasso(x_train, y_train, x_test, y_test, x_column_names):
    '''
    Fit a default Lasso regressor on the training split, score it on the
    test split (RMSE, R2, MAE) and return the one-row results DF.
    '''
    model_name = "Lasso"
    fitted = Lasso()
    fitted.fit(x_train, y_train)
    y_pred = get_y_pred(fitted, x_test)
    # To plot predictions against actuals:
    # graph_pred_real_data_comparison(y_test, y_pred, model_name)
    return generate_result_df(
        model_name,
        'NA',
        get_rmse_test_data(y_test, y_pred),
        get_r2_score(y_test, y_pred),
        get_mae(y_test, y_pred),
        get_feature_importance(fitted, model_name, x_column_names),
    )

In [14]:
def decisiontree_regressor(x_train, y_train, x_test, y_test, x_column_names):
    '''
    Grid-search a DecisionTreeRegressor (2-fold CV, negative MSE scoring),
    refit a fresh tree with the winning parameters on the training split,
    score it on the test split (RMSE, R2, MAE) and return the results DF.
    '''
    model_name = "DecissionTree Regressor"
    search_space = {
        'splitter': ['best', 'random'],
        'max_features': [1, 'sqrt'],     # previously: ['auto', 'sqrt', 'log2']
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [5],
    }
    search = GridSearchCV(
        estimator=DecisionTreeRegressor(),
        param_grid=search_space,
        scoring='neg_mean_squared_error',
        cv=2,
        verbose=2,
    )
    search.fit(x_train, y_train)
    best_parameters = search.best_params_
    # best_parameters holds exactly the four searched keys, all of which are applied.
    tuned = DecisionTreeRegressor(**best_parameters)
    tuned.fit(x_train, y_train)
    y_pred = get_y_pred(tuned, x_test)
    # To plot predictions against actuals:
    # graph_pred_real_data_comparison(y_test, y_pred, model_name)
    return generate_result_df(
        model_name,
        best_parameters,
        get_rmse_test_data(y_test, y_pred),
        get_r2_score(y_test, y_pred),
        get_mae(y_test, y_pred),
        get_feature_importance(tuned, model_name, x_column_names),
    )

In [15]:
def randomforest_regressor(x_train, y_train, x_test, y_test, x_column_names):
    '''
    Model for RandomForest regressor. Returns DF with results.

    Runs a grid search over the forest hyper-parameters (2-fold CV, negative
    MSE scoring), rebuilds a forest with the winning combination, fits it on
    the training split and scores it on the test split (RMSE, R2, MAE).
    The reported "Parameters" are the ones the evaluated model actually uses.
    '''
    model_name = "RandomForest Regressor"
    param_grid_gb = {'n_estimators': [600, 900, 1200],
                     # 'max_features': [0.3, 0.6, 1],
                     # 'max_features': [0.3, 0.6],
                     'max_features': [0.6, 0.8],
                     # 'min_samples_split': [1, 3, 5, 10, 15, 100],
                     'min_samples_split': [5, 10],
                     # 'min_samples_leaf': [1, 2, 5, 10]
                     'min_samples_leaf': [1]
                     # 'min_samples_leaf': [1, 2]
                    }
    rfg_model = RandomForestRegressor()
    # cv must be at least 2: an integer cv is turned into a K-fold splitter,
    # which rejects a single split. 2 matches the other grid searches here.
    mse_grid = GridSearchCV(estimator = rfg_model, param_grid = param_grid_gb, scoring = 'neg_mean_squared_error', cv = 2, verbose = 2)
    mse_grid.fit(x_train, y_train)
    best_parameters = mse_grid.best_params_
    # Apply every tuned parameter (including min_samples_split and
    # min_samples_leaf) so the evaluated model matches what is reported.
    best_rf_regresor_model = RandomForestRegressor(**best_parameters)
    best_rf_regresor_model.fit(x_train, y_train)
    y_pred = get_y_pred(best_rf_regresor_model, x_test)
    rmse = get_rmse_test_data(y_test,y_pred)
    r2score = get_r2_score(y_test,y_pred)
    mae = get_mae(y_test,y_pred)
    importance_list = get_feature_importance(best_rf_regresor_model, model_name, x_column_names)
    # graph_pred_real_data_comparison(y_test, y_pred, model_name)
    df = generate_result_df(model_name, best_parameters, rmse, r2score, mae, importance_list)
    return df

In [16]:
def xgboost(x_train, y_train, x_test, y_test, x_column_names):
    '''
    Grid-search an XGBRegressor (2-fold CV, negative MSE scoring), refit a
    fresh regressor with the winning parameters on the training split, score
    it on the test split (RMSE, R2, MAE) and return the one-row results DF.
    '''
    model_name = "XGBoost"
    search_space = {
        'learning_rate': [0.01],         # wider option: [0.3, 0.01, 0.1, 0.6, 0.9]
        'n_estimators': [600],           # wider option: [200, 300, 600]
        'subsample': [0.6],              # wider option: [1, 0.1, 0.6]
        'min_split_loss': [0, 10, 50],
        'max_depth': [10, 100],          # previously: [6, 10, 100]
    }
    search = GridSearchCV(
        estimator=xgb.XGBRegressor(),
        param_grid=search_space,
        scoring='neg_mean_squared_error',
        cv=2,
        verbose=2,
    )
    search.fit(x_train, y_train)
    best_parameters = search.best_params_
    # best_parameters holds exactly the five searched keys, all of which are applied.
    tuned = xgb.XGBRegressor(**best_parameters)
    tuned.fit(x_train, y_train)
    y_pred = get_y_pred(tuned, x_test)
    # To plot predictions against actuals:
    # graph_pred_real_data_comparison(y_test, y_pred, model_name)
    return generate_result_df(
        model_name,
        best_parameters,
        get_rmse_test_data(y_test, y_pred),
        get_r2_score(y_test, y_pred),
        get_mae(y_test, y_pred),
        get_feature_importance(tuned, model_name, x_column_names),
    )

In [17]:
def gradientboost_regressor(x_train, y_train, x_test, y_test, x_column_names):
    '''
    Model for gradient boost regressor from sklearn. Returns DF with results.

    Runs a grid search over GradientBoostingRegressor hyper-parameters with
    the Huber loss (2-fold CV, negative MSE scoring), rebuilds a regressor
    with the winning combination, fits it on the training split and scores it
    on the test split (RMSE, R2, MAE). The reported "Parameters" are the ones
    the evaluated model actually uses.
    '''
    model_name = "Gradient Boost"
    param_grid_gb = {
                     # 'learning_rate': [0.3, 0.1, 0.6, 0.9],
                     'learning_rate': [0.1],
                     # 'n_estimators' : [200, 400, 600, 800],
                     'n_estimators' : [400, 600, 800],
                     'min_samples_split': [5, 15],
                     # Grid values must be sequences; a bare string is rejected,
                     # so the single loss option is wrapped in a list.
                     'loss': ['huber']
                    }
    gradientBoost_model = ensemble.GradientBoostingRegressor()
    mse_grid = GridSearchCV(estimator = gradientBoost_model, param_grid = param_grid_gb, scoring = 'neg_mean_squared_error', cv = 2, verbose = 2)
    mse_grid.fit(x_train, y_train)
    best_parameters = mse_grid.best_params_
    # Apply every tuned parameter (including loss) so the evaluated model
    # matches what was searched and what is reported.
    best_gradientboost_model = ensemble.GradientBoostingRegressor(**best_parameters)
    best_gradientboost_model.fit(x_train, y_train)
    y_pred = get_y_pred(best_gradientboost_model, x_test)
    rmse = get_rmse_test_data(y_test,y_pred)
    r2score = get_r2_score(y_test,y_pred)
    mae = get_mae(y_test,y_pred)
    importance_list = get_feature_importance(best_gradientboost_model, model_name, x_column_names)
    # graph_pred_real_data_comparison(y_test, y_pred, model_name)
    df = generate_result_df(model_name, best_parameters, rmse, r2score, mae, importance_list)
    return df

In [18]:
def sgd_regressor(x_train, y_train, x_test, y_test, x_column_names):
    '''
    Grid-search an SGDRegressor (2-fold CV, negative MSE scoring), refit a
    fresh regressor with the winning parameters on the training split, score
    it on the test split (RMSE, R2, MAE) and return the one-row results DF.
    '''
    model_name = "SGD"
    search_space = {
        'penalty': ['l2', 'l1', 'elasticnet'],
        'alpha': [0.001, 0.1],           # wider option: [0.0001, 0.001, 0.01, 0.1]
        'epsilon': [0.1, 0.05],
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
        'eta0': [0.01, 0.1],
        'power_t': [0.25, 0.1],
    }
    search = GridSearchCV(
        estimator=SGDRegressor(),
        param_grid=search_space,
        scoring='neg_mean_squared_error',
        cv=2,
        verbose=2,
    )
    search.fit(x_train, y_train)
    best_parameters = search.best_params_
    # best_parameters holds exactly the six searched keys, all of which are applied.
    tuned = SGDRegressor(**best_parameters)
    tuned.fit(x_train, y_train)
    y_pred = get_y_pred(tuned, x_test)
    # To plot predictions against actuals:
    # graph_pred_real_data_comparison(y_test, y_pred, model_name)
    return generate_result_df(
        model_name,
        best_parameters,
        get_rmse_test_data(y_test, y_pred),
        get_r2_score(y_test, y_pred),
        get_mae(y_test, y_pred),
        get_feature_importance(tuned, model_name, x_column_names),
    )

In [19]:
def huber_regressor(x_train, y_train, x_test, y_test, x_column_names):
    '''
    Grid-search a HuberRegressor (2-fold CV, negative MSE scoring), refit a
    fresh regressor with the winning parameters on the training split, score
    it on the test split (RMSE, R2, MAE) and return the one-row results DF.
    '''
    model_name = "Huber"
    search_space = {
        'alpha': [0.001, 0.01],
        'epsilon': [1.8, 3],             # wider option: [1, 1.35, 1.8, 3]
        'max_iter': [100, 300],
    }
    search = GridSearchCV(
        estimator=HuberRegressor(),
        param_grid=search_space,
        scoring='neg_mean_squared_error',
        cv=2,
        verbose=2,
    )
    search.fit(x_train, y_train)
    best_parameters = search.best_params_
    # best_parameters holds exactly the three searched keys, all of which are applied.
    tuned = HuberRegressor(**best_parameters)
    tuned.fit(x_train, y_train)
    y_pred = get_y_pred(tuned, x_test)
    # To plot predictions against actuals:
    # graph_pred_real_data_comparison(y_test, y_pred, model_name)
    return generate_result_df(
        model_name,
        best_parameters,
        get_rmse_test_data(y_test, y_pred),
        get_r2_score(y_test, y_pred),
        get_mae(y_test, y_pred),
        get_feature_importance(tuned, model_name, x_column_names),
    )

In [None]:
def theilsen_regressor(x_train, y_train, x_test, y_test, x_column_names):
    '''
    Grid-search a TheilSenRegressor over max_iter (2-fold CV, negative MSE
    scoring), refit a fresh regressor with the winning value on the training
    split, score it on the test split (RMSE, R2, MAE) and return the one-row
    results DF.
    '''
    model_name = "TheilSen"
    search_space = {'max_iter': [400, 800]}
    search = GridSearchCV(
        estimator=TheilSenRegressor(),
        param_grid=search_space,
        scoring='neg_mean_squared_error',
        cv=2,
        verbose=2,
    )
    search.fit(x_train, y_train)
    best_parameters = search.best_params_
    # max_iter is the only searched key, so unpacking applies just that value.
    tuned = TheilSenRegressor(**best_parameters)
    tuned.fit(x_train, y_train)
    y_pred = get_y_pred(tuned, x_test)
    # To plot predictions against actuals:
    # graph_pred_real_data_comparison(y_test, y_pred, model_name)
    return generate_result_df(
        model_name,
        best_parameters,
        get_rmse_test_data(y_test, y_pred),
        get_r2_score(y_test, y_pred),
        get_mae(y_test, y_pred),
        get_feature_importance(tuned, model_name, x_column_names),
    )