In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.io as pio
template = 'simple_white'

import seaborn as sns
sns.set()

from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')

plt.rcParams['axes.labelsize'] = 15
plt.rcParams['axes.titlesize'] = 20

#Importing Regression algorithms

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.linear_model import (
    LinearRegression,
    RidgeCV,
    SGDRegressor,
    ElasticNetCV,
    LarsCV,
    LassoLarsCV,
    BayesianRidge,
    ARDRegression,
    HuberRegressor,
    PassiveAggressiveRegressor
)

from sklearn.svm import (
    LinearSVR,
    NuSVR,
    SVR
)

from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.experimental import enable_hist_gradient_boosting

from sklearn.ensemble import (
    RandomForestRegressor,
    AdaBoostRegressor, 
    BaggingRegressor, 
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    HistGradientBoostingRegressor
)

from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

#Progress Bar
from numpy import interp

class Progress:
    """Single-line text progress bar that draws itself when instantiated.

    Constructing the object prints the bar immediately, so the usual pattern
    is ``Progress(i, total)`` inside a ``for i in range(total)`` loop.

    Parameters
    ----------
    value : int
        Zero-based index of the step that has just finished.
    end : int
        Total number of steps.
    title : str, default 'Progress'
        Label printed in front of the bar.
    buffer : int, default 100
        Width of the bar in characters.
    """

    def __init__(self, value, end, title='Progress',buffer=100):
        self.title = title
        self.end = end
        self.buffer = buffer
        # Loop indices are zero-based; shift by one so the last iteration
        # is reported as end/end rather than (end - 1)/end.
        self.value = value + 1
        self.progress()

    def progress(self):
        """Print the bar, ending with a carriage return so the next call overwrites it."""
        if self.end > 0:
            fraction = self.value / self.end
            # interp clamps to [0, buffer], so an overshooting value cannot
            # make the bar longer than its configured width.
            filled = int(interp(self.value, [0, self.end], [0, self.buffer]))
        else:
            # Nothing to process: draw a full bar instead of dividing by zero.
            fraction = 1.0
            filled = self.buffer

        bar = '#' * filled + '-' * (self.buffer - filled)
        print(f'{self.title}: [{bar}]{self.value}/{self.end} {fraction * 100:.2f}%', end='\r')

from sklearn import set_config
set_config(display='diagram')        

seed = 42  # random seed shared by every estimator below; change it to vary the run

In [None]:
# Class for comparing candidate regression models via cross-validation

class regression_models:
    """Cross-validate a fixed catalogue of regressors and rank them by RMSE.

    Two groups of estimators are benchmarked: non-tree models, which are
    chained after ``nontree_preprocessor``, and tree/ensemble models, which
    are chained after ``tree_preprocessor``.

    Parameters
    ----------
    Xtrain, ytrain
        Training features and target, passed unchanged to ``cross_validate``.
    nontree_preprocessor
        First pipeline step used for the non-tree models.
    tree_preprocessor
        First pipeline step used for the tree/ensemble models.
    cv
        Cross-validation strategy forwarded to ``cross_validate``.

    Notes
    -----
    Relies on the module-level ``seed`` and ``Progress`` defined earlier in
    the notebook.
    """

    # Scorers requested from cross_validate. Each one is returned under the
    # key 'test_<name>', which is how _summarise_scores looks them up.
    _SCORING = [
        'neg_mean_absolute_error',
        'neg_mean_squared_error',
        'neg_root_mean_squared_error',
        'r2',
        'neg_mean_absolute_percentage_error',
    ]

    def __init__(self, Xtrain, ytrain, nontree_preprocessor, tree_preprocessor, cv):
        self.Xtrain = Xtrain
        self.ytrain = ytrain
        self.nontree_preprocessor = nontree_preprocessor
        self.tree_preprocessor = tree_preprocessor
        self.cv = cv

    @staticmethod
    def _summarise_scores(model_name, scores):
        """Collapse one ``cross_validate`` result into a single table row.

        Scores are read by key rather than by position so the row stays
        correct regardless of the ordering of the returned dictionary.
        The ``neg_*`` scorers are sign-flipped back to positive errors and
        R2 is expressed as a percentage.
        """
        return {
            'Models': model_name,
            'MAE': -scores['test_neg_mean_absolute_error'].mean(),
            'MSE': -scores['test_neg_mean_squared_error'].mean(),
            'RMSE': -scores['test_neg_root_mean_squared_error'].mean(),
            'R2 %': scores['test_r2'].mean() * 100,
            'MAPE': -scores['test_neg_mean_absolute_percentage_error'].mean(),
        }

    def _benchmark(self, models, preprocessor):
        """Cross-validate every model behind ``preprocessor``; return one row per model."""
        records = []
        for i, model in enumerate(models):
            model_name = type(model).__name__
            pipe = Pipeline(steps = [('preprocessor', preprocessor),
                                     (model_name, model)])

            scores = cross_validate(pipe,
                                    self.Xtrain,
                                    self.ytrain,
                                    scoring = self._SCORING,
                                    cv = self.cv,
                                    n_jobs = -1)

            records.append(self._summarise_scores(model_name, scores))
            Progress(i, len(models))

        return records

    def check_rgr_models(self):
        """Benchmark all models and return the ranked, styled comparison table.

        Returns
        -------
        pandas.io.formats.style.Styler
            One row per model, sorted by ascending RMSE. The best value of
            each metric is highlighted in green: the minimum for the error
            metrics and the maximum for ``R2 %``. The same object is stored
            on ``self.df``.
        """
        models = [
            LinearRegression(),
            RidgeCV(cv = 10),
            SGDRegressor(random_state = seed),
            ElasticNetCV(cv = 10, random_state = seed),
            LarsCV(cv = 10),
            LassoLarsCV(cv = 10),
            BayesianRidge(),
            ARDRegression(),
            HuberRegressor(),
            PassiveAggressiveRegressor(random_state = seed),
            KNeighborsRegressor(),
            MLPRegressor(random_state = seed),
            LinearSVR(random_state = seed),
            NuSVR(kernel = 'rbf'),
            SVR(kernel = 'rbf')
        ]

        tree_models = [
            DecisionTreeRegressor(max_depth = 6, random_state = seed),

            RandomForestRegressor(max_depth = 6, random_state = seed),

            AdaBoostRegressor(DecisionTreeRegressor(max_depth = 6),
                             n_estimators = 100, learning_rate = 0.01, random_state = seed),

            BaggingRegressor(DecisionTreeRegressor(max_depth = 6), n_estimators = 100,
                            random_state = seed),

            ExtraTreesRegressor(max_depth = 6, bootstrap = True, random_state = seed),

            GradientBoostingRegressor(max_depth = 6, random_state = seed),

            HistGradientBoostingRegressor(max_depth = 6, l2_regularization = 0.1, random_state = seed),

            XGBRegressor(max_depth = 6, random_state = seed),

            CatBoostRegressor(learning_rate = 0.1, depth = 6, random_seed = seed, silent = True),

            LGBMRegressor(max_depth = 6, random_state = seed)
        ]

        print('Training using Non-Tree based models...')
        records = self._benchmark(models, self.nontree_preprocessor)
        # The progress bar ends with '\r'; an empty print moves to a fresh line.
        print()

        print()
        print('Training Using Ensemble models...')
        records += self._benchmark(tree_models, self.tree_preprocessor)
        print()

        summary = pd.DataFrame(
            records,
            columns = ['Models', 'MAE', 'MSE', 'RMSE', 'R2 %', 'MAPE']
        ).sort_values('RMSE', ignore_index = True)

        # Lower is better for the error metrics, higher is better for R2, so
        # the two groups need opposite highlighting.
        self.df = (
            summary.style
            .highlight_min(subset = ['MAE', 'MSE', 'RMSE', 'MAPE'], color = 'green')
            .highlight_max(subset = ['R2 %'], color = 'green')
        )

        return self.df
    
# regression_models(Xtrain, ytrain, nontree_preprocessor, tree_preprocessor, cv)
#.check_rgr_models()

In [None]:
#for evaluating model

class validate_on_test:
    """Cross-validate one model on the training data, then score it on a hold-out set.

    Parameters
    ----------
    model
        Estimator or ``Pipeline`` exposing ``fit`` and ``predict``. It is
        fitted in place.
    Xtrain, Xtest
        Feature tables; ``Xtrain`` is sliced with ``.iloc``.
    ytrain, ytest
        Target tables sliced with ``.iloc`` and ``.columns``; the first
        column is used as the target.
    cv
        Splitter exposing ``split(X, y)``.
    """

    def __init__(self, model, Xtrain, ytrain, Xtest, ytest, cv):
        self.model = model
        self.Xtrain = Xtrain
        self.ytrain = ytrain
        self.Xtest = Xtest
        self.ytest = ytest
        self.cv = cv

    def _model_name(self):
        """Return the class name of the final estimator (last step if a pipeline)."""
        named_steps = getattr(self.model, 'named_steps', None)
        if named_steps is None:
            return type(self.model).__name__
        return type(list(named_steps.values())[-1]).__name__

    @staticmethod
    def _regression_metrics(y_true, y_pred):
        """Compute MAE, MSE, RMSE, R2 (as a percentage) and MAPE for one prediction set."""
        mse = metrics.mean_squared_error(y_true, y_pred)
        return {
            'mae': metrics.mean_absolute_error(y_true, y_pred),
            'mse': mse,
            'rmse': np.sqrt(mse),
            'r2': metrics.r2_score(y_true, y_pred) * 100,
            'mape': metrics.mean_absolute_percentage_error(y_true, y_pred),
        }

    def evaluate_model(self):
        """Print per-fold and hold-out metrics and draw diagnostic scatter plots.

        The model is fitted once per CV fold to produce the fold table, then
        refitted on the complete training set before predicting the hold-out
        set, so the hold-out scores do not depend on which fold happened to
        be last. Hold-out predictions are stored on ``self.y_pred``.
        """
        from texttable import Texttable
        t = Texttable()

        model_name = self._model_name()
        banner = f' {model_name} '

        print('+' * len(banner))
        print(banner)
        print('+' * len(banner))
        print()

        print('Performing Cross-Validation...')
        print('-' * 63)
        t.set_deco(t.VLINES)

        t.add_rows([[
            'CV#',
            '   MAE   ',
            '   MSE   ',
            '   RMSE   ',
            '   R2 %   ',
            '   MAPE   '
        ]], header = False)

        print(t.draw())
        print('-' * 63)
        t.reset()
        t.set_deco(t.HLINES)

        col_name = self.ytrain.columns[0]
        fold_scores = []

        splits = self.cv.split(self.Xtrain, self.ytrain)
        for fold_no, (train_index, val_index) in enumerate(splits, start = 1):
            X_train_kfold = self.Xtrain.iloc[train_index, :]
            X_val_kfold = self.Xtrain.iloc[val_index, :]
            y_train_kfold = self.ytrain.iloc[train_index, :][col_name]
            y_val_kfold = self.ytrain.iloc[val_index, :][col_name]

            self.model.fit(X_train_kfold, y_train_kfold)
            fold_pred = self.model.predict(X_val_kfold)

            # Every metric is rounded to two decimals for the table.
            scores = {
                name: np.round(value, 2)
                for name, value in self._regression_metrics(y_val_kfold, fold_pred).items()
            }
            fold_scores.append(scores)

            # Each fold is drawn as its own one-row table and the table is
            # reset afterwards, so the alignment is set again on every pass.
            t.set_cols_align(["c", "c", "c", "c", "c", "c"])
            t.add_row([
                fold_no,
                scores['mae'],
                scores['mse'],
                scores['rmse'],
                scores['r2'],
                scores['mape']
            ])

            print(t.draw())
            t.reset()

        mae = [s['mae'] for s in fold_scores]
        mse = [s['mse'] for s in fold_scores]
        rmse = [s['rmse'] for s in fold_scores]
        r2 = [s['r2'] for s in fold_scores]
        mape = [s['mape'] for s in fold_scores]

        cv_heading = f'*** {model_name} Mean CV Scores ***'
        print()
        print(cv_heading)
        print('=' * len(cv_heading))
        print(f'MAE   : {np.mean(mae):.2f} ± {np.std(mae):.1f}')
        print(f'MSE   : {np.mean(mse):.2f} ± {np.std(mse):.1f}')
        print(f'RMSE  : {np.mean(rmse):.2f} ± {np.std(rmse):.1f}')
        print(f'R2    : {np.mean(r2):.2f} ± {np.std(r2):.1f} %')
        print(f'MAPE  : {np.mean(mape):.2f} ± {np.std(mape):.1f}')
        print()
        print("---" * 40)

        # Refit on all training rows; otherwise the hold-out score would come
        # from a model that only saw the last fold's training subset.
        self.model.fit(self.Xtrain, self.ytrain[col_name])
        y_pred = self.model.predict(self.Xtest)
        self.y_pred = y_pred

        y_true = self.ytest[col_name].values
        test_scores = self._regression_metrics(y_true, y_pred)

        test_heading = f'*** {model_name} scores on Validation set ***'
        print(test_heading)
        print('=' * len(test_heading))
        print(f'MAE   : {test_scores["mae"]:.2f}')
        print(f'MSE   : {test_scores["mse"]:.2f}')
        print(f'RMSE  : {test_scores["rmse"]:.2f}')
        print(f'R2    : {test_scores["r2"]:.2f} %')
        print(f'MAPE  : {test_scores["mape"]:.2f}')
        print()

        # Relative gap between the mean CV RMSE and the hold-out RMSE.
        train_metric = np.mean(rmse)
        val_metric = test_scores['rmse']
        diff = abs(100*(val_metric - train_metric)/train_metric)

        print(f'Train and Validation RMSE diffrence: {diff:.2f} %')
        print()

        df = pd.DataFrame({
            "Predictions" : y_pred,
            "Actual" : y_true,
            "Residuals" : y_true - y_pred
        })

        f, ax = plt.subplots(2, 1, figsize=(8, 13))

        sns.scatterplot(data = df, x = 'Predictions', y = 'Actual', edgecolor = 'black', ax = ax[0])
        sns.scatterplot(data = df, x = 'Predictions', y = 'Residuals', edgecolor = 'black', ax = ax[1])

        # Diagonal of the axes box on the first panel, zero line on the second.
        ax[0].plot([0, 1], [0, 1], transform=ax[0].transAxes, color='r', ls='--')
        ax[1].axhline(0, color='r', ls='--')

        ax[0].set_title("Actual Vs Prediction")
        ax[1].set_title("Residuals Vs Predictions")
        
    
    # validate_on_test(model, Xtrain, ytrain, Xtest, ytest, cv)
    # .evaluate_model()