## Project Functions

### Description

Below are the different functions used in the analysis.

In [2]:
# Import the necessary packages
import pandas as pd
import numpy as np

from sklearn.metrics import *
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold,cross_val_score

from scipy import stats
import math as ma

import matplotlib.pyplot as plt
import seaborn as sns

### Regression Test Metrics Function

In [3]:
# Prints regression metrics (r-squared, MAE, MSE, RMSE)
def regression_test_metrics(y, y_pred):
    """Print r-squared, MAE, MSE and RMSE for a set of predictions.

    Parameters
    ----------
    y : array-like
        Observed target values; passed unchanged to the metric functions.
    y_pred : array-like
        Predicted target values; passed unchanged to the metric functions.

    Returns
    -------
    None
        The four metrics are printed, each rounded to two decimals.
    """
    r2 = round(r2_score(y, y_pred), 2)
    mae = round(mean_absolute_error(y, y_pred), 2)

    # Keep the unrounded MSE around so the RMSE is derived from the exact
    # value. Taking the square root of an already-rounded MSE distorts small
    # errors (an MSE of 0.004 rounds to 0.0 and would report an RMSE of 0.0).
    mse_exact = mean_squared_error(y, y_pred)
    mse = round(mse_exact, 2)
    rmse = round(ma.sqrt(mse_exact), 2)

    results = [('r-squared:', r2),
               ('mean absolute error:', mae),
               ('mean squared error:', mse),
               ('root Mean squared error:', rmse)]

    print('\n' + 'model metrics:' + '\n')
    for label, value in results:
        # Label left-aligned in 35 columns, value right-aligned in 20 with dot fill.
        print(f"{label:{35}} {value:.>{20}}")

### K-Fold Cross Validation Regression Metrics Function

In [2]:
# K-fold cross validation regression metrics
def regression_cross_val(regressor, X_train, y_train, cv):
    """Cross-validate a regressor and report r2, MAE, MSE and RMSE per fold.

    Every metric is scored with ``cross_val_score`` using the same ``cv``
    argument. For each metric the per-fold scores are plotted against the fold
    index, then the mean and standard deviation (rounded to two decimals) are
    printed.

    Parameters
    ----------
    regressor : estimator
        Passed to ``cross_val_score`` as ``estimator``.
    X_train, y_train :
        Passed to ``cross_val_score`` as ``X`` and ``y``.
    cv :
        Passed to ``cross_val_score`` as ``cv``. The fold axis of the plots is
        taken from the number of scores returned, so ``cv`` does not have to
        be an integer.

    Returns
    -------
    None
        Output is produced through matplotlib figures and ``print``.
    """
    # (scoring name, short name used in plots, label used in the printed
    #  summary, whether sklearn returns the metric negated, ddof of the std)
    # NOTE(review): the r2 std is a population std (ddof=0) while the error
    # metrics use a sample std (ddof=1). This mirrors the previous behaviour
    # of this function; confirm which convention is intended.
    metric_specs = [
        ('r2', 'r2', 'r-squared', False, 0),
        ('neg_mean_absolute_error', 'mae', 'mean absolute error', True, 1),
        ('neg_mean_squared_error', 'mse', 'mean squared error', True, 1),
        ('neg_root_mean_squared_error', 'rmse', 'root mean squared error', True, 1),
    ]

    # Score every metric before producing any output.
    fold_scores = {}
    for scoring, short_name, _, negated, _ in metric_specs:
        fold_scores[short_name] = _cross_val_fold_scores(
            regressor, X_train, y_train, cv, scoring, negated)

    print('\n' + 'test model k-fold metrics:' + '\n')

    for position, (_, short_name, report_label, _, ddof) in enumerate(metric_specs):
        scores = fold_scores[short_name]
        # r2 uses a fixed upper limit; error metrics get 25% headroom above
        # the worst fold.
        y_upper = 1.1 if short_name == 'r2' else scores.max() * 1.25
        _plot_fold_scores(scores, short_name, y_upper)

        # A blank separator precedes every summary except the first one.
        if position > 0:
            print("\n")
        _print_fold_summary(scores, report_label, ddof)
    print("\n")


def _cross_val_fold_scores(regressor, X_train, y_train, cv, scoring, negated):
    """Return the per-fold scores of one metric as a 1-D float array.

    Metrics that sklearn reports negated (``neg_*``) are flipped back so that
    the returned values are positive errors.
    """
    scores = np.asarray(
        cross_val_score(estimator=regressor, X=X_train, y=y_train, cv=cv,
                        n_jobs=-1, scoring=scoring),
        dtype=float)
    return -scores if negated else scores


def _plot_fold_scores(scores, short_name, y_upper):
    """Plot one metric's per-fold scores against a 1-based fold index."""
    plt.figure(figsize=(10, 3))
    # The x axis is derived from the scores themselves so that it always
    # matches the number of folds that were actually evaluated.
    plt.plot(range(1, len(scores) + 1), scores, ls='-', marker='o')
    plt.title(short_name + ' for kfold')
    plt.xlabel('kfold index')
    plt.ylabel(short_name)
    plt.ylim(0, y_upper)
    plt.show()


def _print_fold_summary(scores, report_label, ddof):
    """Print the rounded mean and standard deviation of one metric's scores."""
    summary = [(report_label + ' k-fold cross validation: ', round(scores.mean(), 2)),
               (report_label + ' std: ', round(scores.std(ddof=ddof), 2))]

    for label, value in summary:
        print(f"{label:{50}} {value:.>{20}}")

### Feature Importance Bar Chart and Coefficients

In [1]:
# Feature importance bar chart followed by supporting stats
def regression_feature_importance(model, X_cols, font, length, width):
    """Plot a fitted model's coefficients and display them as a table.

    The coefficients and the intercept are drawn as a horizontal bar chart
    ordered by absolute value (blue for positive, red otherwise), followed by
    a table of the same values ordered from largest to smallest magnitude.

    Parameters
    ----------
    model :
        Fitted estimator exposing ``coef_`` and ``intercept_``.
    X_cols : pandas.DataFrame
        Frame whose ``columns`` supply the feature names, in the same order
        as ``model.coef_``.
    font : float
        Passed to ``sns.set`` as ``font_scale``.
    length, width : float
        Figure height and width; used as ``figsize=(width, length)``.

    Raises
    ------
    ValueError
        If the number of coefficients plus intercepts does not match the
        number of feature names plus one.

    Returns
    -------
    None
    """
    names = list(X_cols.columns) + ['intercept']

    # Flatten both attributes so that a (1, n_features) ``coef_`` or a
    # one-element ``intercept_`` array is handled like the plain 1-D / scalar
    # case.
    values = np.concatenate([np.ravel(model.coef_),
                             np.ravel(model.intercept_)]).astype(float)
    if len(values) != len(names):
        raise ValueError(
            f"expected {len(names)} values (features + intercept) but the "
            f"model provides {len(values)}; multi-output models are not supported")

    # Feature names live in the index (named '' as in the displayed table) so
    # the bar chart is labelled with them, and 'coef' stays a numeric column
    # instead of being coerced to objects alongside the names.
    coefficients = pd.DataFrame({'coef': values}, index=pd.Index(names, name=''))
    coefficients['positive'] = coefficients['coef'] > 0

    # Smallest magnitude first: barh draws the first row at the bottom, so the
    # largest coefficient ends up at the top of the chart.
    ascending_order = np.argsort(coefficients['coef'].abs().to_numpy(), kind='stable')
    by_magnitude = coefficients.iloc[ascending_order]

    sns.set(font_scale=font, style='white')
    by_magnitude['coef'].plot(kind='barh',
                              figsize=(width, length),
                              color=by_magnitude['positive'].map({True: 'b', False: 'r'}).tolist())
    plt.title('feature importance (features scaled)')
    plt.xlabel('coefficient units')
    plt.ylabel('features')
    plt.show()

    # Table view: largest magnitude first.
    # NOTE: ``display`` is not imported in this file; it is assumed to be the
    # builtin provided by the IPython/Jupyter session.
    display(by_magnitude.iloc[::-1])

### Residual Means and Counts Plot

In [None]:
# Residual means and counts plot
def residual_means_counts_plot(df, X, res, ymin1, ymax1, ymin2, ymax2):
    """Plot the mean residual and the row count per bin of a column.

    ``df[X]`` is cut into equal-width bins (one more than its rounded value
    range). The mean of ``df[res]`` per bin is drawn as a line on the left
    axis and the number of rows per bin as bars on a twin right axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``X`` and ``res``.
    X : str
        Name of the column to bin.
    res : str
        Name of the column that is averaged and counted per bin.
    ymin1, ymax1 : float
        Offsets added to the smallest / largest bin mean to set the left
        axis limits.
    ymin2, ymax2 : float
        Offsets added to the smallest / largest bin count to set the right
        axis limits.

    Returns
    -------
    None
    """
    n_bins = int(round(df[X].max() - df[X].min(), 0) + 1)
    bins = pd.cut(df[X], n_bins)

    # observed=False is pinned explicitly: empty bins must stay on the axis
    # (mean NaN, count 0) regardless of the pandas default for categorical
    # groupers.
    mean_count_res = (df.groupby(bins, observed=False)[res]
                        .agg(['mean', 'count'])
                        .reset_index())
    # Interval labels are turned into strings so matplotlib treats the bins
    # as categories.
    mean_count_res[X] = mean_count_res[X].astype(str)

    # Plot the results
    sns.set(font_scale = 1, style = 'white')
    _, ax1 = plt.subplots(figsize = (15, 3))
    ax2 = ax1.twinx()

    ax1.plot(mean_count_res[X], mean_count_res['mean'], ls = '-', marker = 'o')
    # Rotate the existing category labels instead of overwriting them with
    # set_xticklabels, which requires the tick positions to be fixed first.
    ax1.tick_params(axis = 'x', labelrotation = 90)
    ax1.axhline(y = 0, color = 'r', linestyle = '--')
    ax1.set(xlabel = X + ' bins', ylabel = 'average residuals per bin', title = 'average residuals per binned ' + X)
    ax1.set_ylim(mean_count_res['mean'].min() + ymin1, mean_count_res['mean'].max() + ymax1)

    ax2.bar(mean_count_res[X], mean_count_res['count'], color = 'darkgreen')
    ax2.set(ylabel = 'count per bin')
    ax2.set_ylim(mean_count_res['count'].min() + ymin2, mean_count_res['count'].max() + ymax2)
    plt.show()