In [12]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import RFE
from sklearn.inspection import permutation_importance

from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_log_error

from statsmodels.api import OLS, add_constant

# Apply seaborn's default theme so every figure drawn after this cell shares it.
sns.set()

In [2]:
def scale_all_columns(df):
    """Return a copy of ``df`` in which every column has been standardized.

    Each column is passed on its own through ``StandardScaler.fit_transform``,
    so the columns are scaled independently of one another. The input frame
    is left untouched.
    """
    standardized = df.copy()
    standardizer = StandardScaler()
    column_names = list(standardized.columns)
    for name in column_names:
        # Double brackets keep the column two-dimensional, as the scaler expects.
        single_column = standardized[[name]]
        standardized[name] = standardizer.fit_transform(single_column)
    return standardized

In [3]:
def scale_one_col(df, column_name):
    """Return a copy of ``df`` with only ``column_name`` standardized.

    The column is run through a fresh ``StandardScaler``; all other columns
    and the input frame itself are left unchanged.
    """
    result = df.copy()
    # Double brackets keep the selection two-dimensional, as the scaler expects.
    column_as_frame = result[[column_name]]
    result[column_name] = StandardScaler().fit_transform(column_as_frame)
    return result



In [6]:
def polynomial_single(df, column_name, depth):
    """Replace ``column_name`` with its polynomial expansion up to ``depth``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column_name : str
        Column to expand. It is removed from the result.
    depth : int
        Polynomial degree passed to ``PolynomialFeatures``.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``df`` followed by the generated columns,
        named ``column_name + "x0"``, ``column_name + "x0^2"``, ... The
        constant (bias) column is dropped. The index of ``df`` is kept
        exactly as it is (named, duplicated or multi-level indexes included).
    """
    transformer = PolynomialFeatures(degree=depth)

    # Fit on a bare array so the generated names stay positional ("x0", "x0^2")
    # regardless of the scikit-learn version.
    expanded = transformer.fit_transform(df[[column_name]].to_numpy())

    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
    # and fall back to the old method on releases that predate it.
    if hasattr(transformer, "get_feature_names_out"):
        suffixes = transformer.get_feature_names_out()
    else:
        suffixes = transformer.get_feature_names()

    # Reusing df's own index lets the two frames line up row for row without
    # the reset_index/set_index round trip, which broke on named indexes or an
    # existing "index" column.
    poly_df = pd.DataFrame(
        expanded,
        columns=[column_name + suffix for suffix in suffixes],
        index=df.index,
    )

    # Column 0 is the constant term, which carries no information.
    return pd.concat([df.drop(columns=column_name), poly_df.iloc[:, 1:]], axis=1)



In [7]:
def poly_interaction(df, column_name_1, column_name_2, depth):
    """Replace two columns with their joint polynomial/interaction expansion.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column_name_1, column_name_2 : str
        Columns to expand together. Both are removed from the result.
    depth : int
        Polynomial degree passed to ``PolynomialFeatures``.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``df`` followed by the generated columns,
        each named ``column_name_1 + "-" + column_name_2`` plus the feature
        suffix (``"x0"``, ``"x1"``, ``"x0 x1"``, ...). The constant (bias)
        column is dropped. The index of ``df`` is kept exactly as it is
        (named, duplicated or multi-level indexes included).
    """
    transformer = PolynomialFeatures(degree=depth)

    # Fit on a bare array so the generated names stay positional ("x0", "x1")
    # regardless of the scikit-learn version.
    expanded = transformer.fit_transform(df[[column_name_1, column_name_2]].to_numpy())

    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
    # and fall back to the old method on releases that predate it.
    if hasattr(transformer, "get_feature_names_out"):
        suffixes = transformer.get_feature_names_out()
    else:
        suffixes = transformer.get_feature_names()

    prefix = column_name_1 + "-" + column_name_2

    # Reusing df's own index lets the two frames line up row for row without
    # the reset_index/set_index round trip, which broke on named indexes or an
    # existing "index" column.
    poly_df = pd.DataFrame(
        expanded,
        columns=[prefix + suffix for suffix in suffixes],
        index=df.index,
    )

    # Column 0 is the constant term, which carries no information.
    remaining = df.drop(columns=[column_name_1, column_name_2])
    return pd.concat([remaining, poly_df.iloc[:, 1:]], axis=1)



In [8]:
def one_hot_enc(df, column_name):
    """One-hot encode ``column_name``, leaving out the last category.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column_name : str
        Column to encode. Its values are converted to strings first.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``column_name``, followed by one indicator column per
        category named ``"<column_name>-<value>"``. The last indicator column
        is omitted, so that category is represented by all-zero indicators.
        Row count and index are the same as in ``df``.
    """
    # Prefix each value with the column name so indicator columns are
    # self-describing and unlikely to clash with existing columns.
    labels = column_name + "-" + df[column_name].astype(str)
    indicators = pd.get_dummies(labels)

    # concat lines the frames up on their identical index; DataFrame.join would
    # multiply rows whenever the index holds duplicate labels.
    return pd.concat(
        [df.drop(columns=column_name), indicators.iloc[:, :-1]],
        axis=1,
    )

In [9]:


def mse(p, a):
    """Return the squared error between ``p`` and ``a``.

    The result is element-wise (same shape as ``p - a``); callers average it
    themselves to obtain the mean squared error.
    """
    residual = p - a
    # Dividing by 1 forces a float result even for integer inputs.
    return residual ** 2 / 1

def rmse(p, a):
    """Return the root mean squared error between ``p`` and ``a``.

    The squared errors are averaged *before* the square root is taken. Taking
    the root of every squared error first and averaging afterwards, as this
    function used to do, produces the mean absolute error instead.

    The result is a numpy scalar, so existing ``rmse(p, a).mean()`` calls keep
    working and simply return the same value.
    """
    squared_errors = (p - a) ** 2
    return np.sqrt(np.mean(squared_errors))

def rmsle(p, a):
    """Return the square root of ``mean_squared_log_error(p, a)``."""
    mean_squared_log = mean_squared_log_error(p, a)
    return np.sqrt(mean_squared_log)



In [10]:
def RandForestReg(X_train_df, y_train_data, X_test_df, y_test_data, maxdepth):
    """Fit a depth-tuned ``RandomForestRegressor`` and print its metrics.

    Every ``max_depth`` from 2 to ``maxdepth`` (inclusive) is tried and the
    depth with the highest ``score`` on the test split is kept. A model with
    that depth is then refitted and its train score, 5-fold cross-validation
    scores, test score and test errors are printed.

    NOTE(review): the depth is picked using the test split, so the test
    metrics printed afterwards are optimistic. Consider tuning on a separate
    validation split.

    Parameters
    ----------
    X_train_df, y_train_data
        Training features and targets.
    X_test_df, y_test_data
        Held-out features and targets.
    maxdepth : int
        Largest ``max_depth`` to try. Below 2 the search is skipped and
        depth 2 is used.

    Returns
    -------
    None
        All results are printed.
    """
    best_depth = 2
    # Start below every attainable score: a regressor's score can be negative,
    # and a starting value of 0 would silently ignore all such candidates.
    best_score = float("-inf")

    for depth in range(2, maxdepth + 1):
        candidate = RandomForestRegressor(max_depth=depth)
        candidate.fit(X_train_df, y_train_data)
        candidate_score = candidate.score(X_test_df, y_test_data)

        print(depth, candidate_score)

        if candidate_score > best_score:
            best_score = candidate_score
            best_depth = depth

    print("RandForReg, best depth between 2 and " + str(maxdepth) + " is " + str(best_depth))

    model = RandomForestRegressor(max_depth=best_depth)
    model.fit(X_train_df, y_train_data)

    # np.round works whether score() hands back a numpy scalar or a plain float.
    train_score = model.score(X_train_df, y_train_data)
    print("RandForest Train Score", np.round(train_score * 100, 2))

    # Cross-validate on the targets passed in, not on a notebook-level global.
    cv_scores = cross_val_score(model, X_train_df, y_train_data, cv=5)
    print("cross-validation score", np.round(cv_scores * 100, 2))
    print("cross-validation Average", np.round(cv_scores.mean() * 100, 2))

    test_score = model.score(X_test_df, y_test_data)
    print("\nRandForest Test Score", np.round(test_score * 100, 2))

    y_pred = model.predict(X_test_df)
    mean_squared_error = mse(y_pred, y_test_data).mean()

    print("RandForestReg on Test\nMean Squarred Error: ", mean_squared_error)
    # RMSE is the root of the *mean* squared error; averaging per-element roots
    # would report the mean absolute error.
    print("Root Mean Squarred Error: ", np.sqrt(mean_squared_error))
    print("Root Mean Squarred Log Error: ", rmsle(y_pred, y_test_data))

In [11]:
def linreg(X_train_df, y_train_data, X_test_df, y_test_data):
    """Fit an ordinary least squares model and print its metrics.

    Prints the coefficients, intercept, first coefficient ("slope"), train
    score, 5-fold cross-validation scores, test score and test errors.

    Parameters
    ----------
    X_train_df, y_train_data
        Training features and targets.
    X_test_df, y_test_data
        Held-out features and targets.

    Returns
    -------
    None
        All results are printed.
    """
    # The ``normalize`` argument was removed in scikit-learn 1.2. For plain
    # least squares it only rescaled the features internally and reported the
    # coefficients on the original scale, so dropping it leaves the fitted
    # predictions unchanged.
    model = LinearRegression()
    model.fit(X_train_df, y_train_data)

    print("LinearRegression Coefficients", model.coef_)
    print("LinearRegression Intercept", model.intercept_)
    print("LinearRegression Slope", model.coef_[0])

    print("\nLinearRegression Train Score", model.score(X_train_df, y_train_data))
    cv_scores = cross_val_score(model, X_train_df, y_train_data, cv=5)
    print("cross-validation score", cv_scores)
    print("cross-validation Average", cv_scores.mean())

    test_score = model.score(X_test_df, y_test_data)
    print("\nLinearRegression Test Score", test_score)

    y_pred = model.predict(X_test_df)
    # Negative predictions are clipped to zero before any error is computed,
    # presumably because the log-based error below cannot take negative values.
    y_pred[y_pred < 0] = 0

    mean_squared_error = mse(y_pred, y_test_data).mean()

    print("LinearReg on Test\nMean Squarred Error: ", mean_squared_error)
    # RMSE is the root of the *mean* squared error; averaging per-element roots
    # would report the mean absolute error.
    print("Root Mean Squarred Error: ", np.sqrt(mean_squared_error))
    print("Root Mean Squarred Log Error: ", rmsle(y_pred, y_test_data))



In [None]:
def RandForestClassif(X_train_df, y_train_data, X_test_df, y_test_data, maxdepth):
    """Fit a depth-tuned ``RandomForestClassifier`` and print its metrics.

    Every ``max_depth`` from 2 to ``maxdepth`` (inclusive) is tried and the
    depth with the highest ``score`` on the test split is kept. A model with
    that depth is then refitted and its train score, 5-fold cross-validation
    scores, test score and test errors are printed.

    NOTE(review): the depth is picked using the test split, so the test
    metrics printed afterwards are optimistic. Consider tuning on a separate
    validation split.

    NOTE(review): the error figures are computed arithmetically on the
    predicted labels, which only works if the class labels are non-negative
    numbers - confirm against the data this is used with.

    Parameters
    ----------
    X_train_df, y_train_data
        Training features and class labels.
    X_test_df, y_test_data
        Held-out features and class labels.
    maxdepth : int
        Largest ``max_depth`` to try. Below 2 the search is skipped and
        depth 2 is used.

    Returns
    -------
    None
        All results are printed.
    """
    best_depth = 2
    # Start below every attainable score so the first candidate always counts.
    best_score = float("-inf")

    for depth in range(2, maxdepth + 1):
        candidate = RandomForestClassifier(max_depth=depth)
        candidate.fit(X_train_df, y_train_data)
        candidate_score = candidate.score(X_test_df, y_test_data)

        print(depth, candidate_score)

        if candidate_score > best_score:
            best_score = candidate_score
            best_depth = depth

    print("RandForClass, best depth between 2 and " + str(maxdepth) + " is " + str(best_depth))

    model = RandomForestClassifier(max_depth=best_depth)
    model.fit(X_train_df, y_train_data)

    # np.round works whether score() hands back a numpy scalar or a plain float.
    train_score = model.score(X_train_df, y_train_data)
    print("RandForestClass Train Score", np.round(train_score * 100, 2))

    # Cross-validate on the labels passed in, not on a notebook-level global.
    cv_scores = cross_val_score(model, X_train_df, y_train_data, cv=5)
    print("cross-validation score", np.round(cv_scores * 100, 2))
    print("cross-validation Average", np.round(cv_scores.mean() * 100, 2))

    test_score = model.score(X_test_df, y_test_data)
    print("\nRandForestClass Test Score", np.round(test_score * 100, 2))

    y_pred = model.predict(X_test_df)
    mean_squared_error = mse(y_pred, y_test_data).mean()

    print("RandForestClass on Test\nMean Squarred Error: ", mean_squared_error)
    # RMSE is the root of the *mean* squared error; averaging per-element roots
    # would report the mean absolute error.
    print("Root Mean Squarred Error: ", np.sqrt(mean_squared_error))
    print("Root Mean Squarred Log Error: ", rmsle(y_pred, y_test_data))

In [None]:
# Scratch cell: a bare class name does nothing except echo the class as the cell's output.
GradientBoostingRegressor

In [13]:
def GradientBoostRegif(X_train_df, y_train_data, X_test_df, y_test_data, maxdepth):
    """Fit a depth-tuned ``GradientBoostingRegressor`` and print its metrics.

    Every ``max_depth`` from 2 to ``maxdepth`` (inclusive) is tried and the
    depth with the highest ``score`` on the test split is kept. A
    ``GradientBoostingRegressor`` with that depth is then refitted and its
    train score, 5-fold cross-validation scores, test score and test errors
    are printed.

    NOTE(review): the depth is picked using the test split, so the test
    metrics printed afterwards are optimistic. Consider tuning on a separate
    validation split.

    Parameters
    ----------
    X_train_df, y_train_data
        Training features and targets.
    X_test_df, y_test_data
        Held-out features and targets.
    maxdepth : int
        Largest ``max_depth`` to try. Below 2 the search is skipped and
        depth 2 is used.

    Returns
    -------
    None
        All results are printed.
    """
    best_depth = 2
    # Start below every attainable score: a regressor's score can be negative,
    # and a starting value of 0 would silently ignore all such candidates.
    best_score = float("-inf")

    for depth in range(2, maxdepth + 1):
        candidate = GradientBoostingRegressor(max_depth=depth)
        candidate.fit(X_train_df, y_train_data)
        candidate_score = candidate.score(X_test_df, y_test_data)

        print(depth, candidate_score)

        if candidate_score > best_score:
            best_score = candidate_score
            best_depth = depth

    print("GradientBoostingRegressor, best depth between 2 and " + str(maxdepth) + " is " + str(best_depth))

    # Refit the same estimator type that was tuned above. This used to build a
    # RandomForestClassifier, so every figure below described the wrong model.
    model = GradientBoostingRegressor(max_depth=best_depth)
    model.fit(X_train_df, y_train_data)

    # np.round works whether score() hands back a numpy scalar or a plain float.
    train_score = model.score(X_train_df, y_train_data)
    print("GradientBoostingRegressor Train Score", np.round(train_score * 100, 2))

    # Cross-validate on the targets passed in, not on a notebook-level global.
    cv_scores = cross_val_score(model, X_train_df, y_train_data, cv=5)
    print("cross-validation score", np.round(cv_scores * 100, 2))
    print("cross-validation Average", np.round(cv_scores.mean() * 100, 2))

    test_score = model.score(X_test_df, y_test_data)
    print("\nGradientBoostingRegressor Test Score", np.round(test_score * 100, 2))

    y_pred = model.predict(X_test_df)
    mean_squared_error = mse(y_pred, y_test_data).mean()

    print("GradientBoostingRegressor on Test\nMean Squarred Error: ", mean_squared_error)
    # RMSE is the root of the *mean* squared error; averaging per-element roots
    # would report the mean absolute error.
    print("Root Mean Squarred Error: ", np.sqrt(mean_squared_error))
    print("Root Mean Squarred Log Error: ", rmsle(y_pred, y_test_data))