## Model Selection and Modeling

- [Linear Regression: OLS](#lm)
- [Random Forest Regression](#rfr)
- [Light Gradient Boosting Model With Bayesian Optimization](#lgbm)

### Import Necessary Modules and Datasets

In [1]:
# Import tools to get datasets
from zipfile import ZipFile
import urllib
import requests
import io

# Import modules for data reading and plots
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Import all for model selections and modelling
from bayes_opt import BayesianOptimization
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor
import time

In [2]:
start = time.time() # Timer starts

Xtrainurl = 'https://raw.githubusercontent.com/jonahwinninghoff/Springboard_Capstone_Project/main/Assets/X_train'
Xvalidurl = 'https://raw.githubusercontent.com/jonahwinninghoff/Springboard_Capstone_Project/main/Assets/X_valid'
Xtestiurl = 'https://raw.githubusercontent.com/jonahwinninghoff/Springboard_Capstone_Project/main/Assets/X_test'

ytrainurl = 'https://github.com/jonahwinninghoff/Springboard_Capstone_Project/blob/main/Assets/y_train?raw=true'
yvalidurl = 'https://github.com/jonahwinninghoff/Springboard_Capstone_Project/blob/main/Assets/y_valid?raw=true'
ytestiurl = 'https://github.com/jonahwinninghoff/Springboard_Capstone_Project/blob/main/Assets/y_test?raw=true'

In [3]:
# Read X_train dataset
url = urllib.request.urlopen(Xtrainurl)
file = io.BytesIO(url.read())
X_train = pd.read_csv(file, encoding='cp1252').drop('Unnamed: 0', 
                                                    axis=1)

# Read X_valid dataset
url = urllib.request.urlopen(Xvalidurl)
file = io.BytesIO(url.read())
X_valid = pd.read_csv(file, encoding='cp1252').drop('Unnamed: 0', 
                                                    axis=1)

# Read X_test dataset
url = urllib.request.urlopen(Xtestiurl)
file = io.BytesIO(url.read())
X_test = pd.read_csv(file, encoding='cp1252').drop('Unnamed: 0', 
                                                    axis=1)

In [4]:
# Read y_train dataset
response = requests.get(ytrainurl)
response.raise_for_status()
y_train = np.load(io.BytesIO(response.content))

# Read y_valid dataset
response = requests.get(yvalidurl)
response.raise_for_status()
y_valid = np.load(io.BytesIO(response.content))

# Read y_test dataset
response = requests.get(ytestiurl)
response.raise_for_status()
y_test = np.load(io.BytesIO(response.content))

In [5]:
# Read X_TFIDF_train
url = urllib.request.urlopen('https://github.com/jonahwinninghoff/Springboard_Capstone_Project/blob/main/Assets/X_TFIDF_train.zip?raw=true')
unzipfile = ZipFile(io.BytesIO(url.read()))
file = unzipfile.open("X_TFIDF_train")
X_TFIDF_train = pd.read_csv(file, encoding='cp1252').drop('Unnamed: 0', 
                                                    axis=1)
file.close()

# Read X_TFIDF_valid
url = urllib.request.urlopen('https://github.com/jonahwinninghoff/Springboard_Capstone_Project/blob/main/Assets/X_TFIDF_valid.zip?raw=true')
unzipfile = ZipFile(io.BytesIO(url.read()))
file = unzipfile.open("X_TFIDF_valid")
X_TFIDF_valid = pd.read_csv(file, encoding='cp1252').drop('Unnamed: 0', 
                                                    axis=1)
file.close()

# Read X_TFIDF_test
url = urllib.request.urlopen('https://github.com/jonahwinninghoff/Springboard_Capstone_Project/blob/main/Assets/X_TFIDF_test.zip?raw=true')
unzipfile = ZipFile(io.BytesIO(url.read()))
file = unzipfile.open("X_TFIDF_test")
X_TFIDF_test = pd.read_csv(file, encoding='cp1252').drop('Unnamed: 0', 
                                                    axis=1)
file.close()

### Linear Regression: OLS <a id ='lm'></a>

In [6]:
X_TFIDF_train_wconstant = sm.add_constant(X_TFIDF_train)
model = sm.OLS(y_train,X_TFIDF_train_wconstant)
TFIDF_fitted_lm = model.fit()

display(TFIDF_fitted_lm.summary())

0,1,2,3
Dep. Variable:,y,R-squared:,0.909
Model:,OLS,Adj. R-squared:,0.909
Method:,Least Squares,F-statistic:,83400.0
Date:,"Fri, 03 Sep 2021",Prob (F-statistic):,0.0
Time:,13:36:25,Log-Likelihood:,110040.0
No. Observations:,141159,AIC:,-220000.0
Df Residuals:,141141,BIC:,-219900.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,6.1332,0.346,17.721,0.000,5.455,6.812
ability,-1.745e+10,2.37e+10,-0.737,0.461,-6.39e+10,2.9e+10
activities,-2.058e+10,5.1e+10,-0.403,0.687,-1.21e+11,7.94e+10
antianxiety,4.197e+10,3e+10,1.401,0.161,-1.67e+10,1.01e+11
antipsychotic,1.241e+10,3.49e+10,0.356,0.722,-5.6e+10,8.08e+10
appropriately,-1.489e+10,2.41e+10,-0.618,0.537,-6.22e+10,3.24e+10
assessed,-6.569e+09,2.66e+10,-0.247,0.805,-5.86e+10,4.55e+10
bladder,-5.572e+09,1.39e+11,-0.040,0.968,-2.79e+11,2.68e+11
bowels,1.404e+11,7.8e+10,1.799,0.072,-1.26e+10,2.93e+11

0,1,2,3
Omnibus:,39867.364,Durbin-Watson:,1.995
Prob(Omnibus):,0.0,Jarque-Bera (JB):,602595.2
Skew:,-0.95,Prob(JB):,0.0
Kurtosis:,12.942,Cond. No.,5.37e+16


In [7]:
X_train_wconstant = sm.add_constant(X_train)
model = sm.OLS(y_train,X_train_wconstant)
fitted_lm = model.fit()

display(fitted_lm.summary())

0,1,2,3
Dep. Variable:,y,R-squared:,0.696
Model:,OLS,Adj. R-squared:,0.696
Method:,Least Squares,F-statistic:,64640.0
Date:,"Fri, 03 Sep 2021",Prob (F-statistic):,0.0
Time:,13:36:25,Log-Likelihood:,24551.0
No. Observations:,141159,AIC:,-49090.0
Df Residuals:,141153,BIC:,-49030.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.1596,0.007,23.924,0.000,0.146,0.173
cosine median,-0.0452,0.002,-19.139,0.000,-0.050,-0.041
cosine mean,0.7183,0.015,47.088,0.000,0.688,0.748
cosine minimum,-0.1735,0.009,-20.246,0.000,-0.190,-0.157
cosine maximum,0.6937,0.003,198.503,0.000,0.687,0.701
cosine std,-0.6243,0.035,-18.031,0.000,-0.692,-0.556

0,1,2,3
Omnibus:,16213.742,Durbin-Watson:,1.995
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28286.985
Skew:,0.786,Prob(JB):,0.0
Kurtosis:,4.53,Cond. No.,80.1


The OLS with TFIDF vectorization shows that a few variables are statistically significant at two-sided 5% alpha level. The Akaike and Bayesian Information Criterions (AIC and BIC) are −0.98 and −0.977. The R-squared score is 90.9%. The adjusted R-squared score shows that the number of parameters has little impact on the percentage of the target variable variation that this model explains. When the number of features reduces using cosine similarities, each variable is satistically significant at two-sided 1% alpha level, and AIC and BIC are lower. However, the R-squared score is 69.9, and the adjusted R-squared score is same. 

The kurtosis score lowers but it is still leptokurtotic. The skewness for OLS with TFIDF vectorization is extremely negative. When the cosine similarites are applied, the skewness shifts to postive or right skewness.

In [8]:
X_valid_wconstant = sm.add_constant(X_valid)
y_predicted = fitted_lm.predict(X_valid_wconstant)

X_TFIDF_valid_wconstant = sm.add_constant(X_TFIDF_valid)
y_TFIDF_predicted = TFIDF_fitted_lm.predict(X_TFIDF_valid_wconstant)

In [9]:
def r2_calculator(predict, true):
    merged = pd.concat([pd.DataFrame(list(predict)).rename(columns={0:'predicted'}), 
           pd.DataFrame(list(true)).rename(columns={0:'true'})],axis=1)
    
    # 1 - (predict - true)^2/(true - mean(true))^2 = 1 - RSS/TSS = r_2
    r2 = 1 - sum((merged['predicted'] - merged['true'])**2)/sum((merged['true'] - np.mean(merged['true']))**2)
    
    return r2

def adjusted_r2_calculator(predict, true):
    r2 = r2_calculator(predict, true) # r_2
    n = len(true)                     # number of rows
    k = len(fitted_lm.params)         # number of parameters
    
    adj_r2 = 1 - ((1-r2)*(n-1))/(n-k-1)    # 1 - [(1 - r_2)*(n - 1)/(n - k - 1)] = adjusted r^2
    
    return adj_r2

def mae_and_mse_calculator(predict, true):
    merged = pd.concat([pd.DataFrame(list(predict)).rename(columns={0:'predicted'}), 
           pd.DataFrame(list(true)).rename(columns={0:'true'})],axis=1)
    mae = sum(np.abs(merged['true'] - merged['predicted']))/len(true) # 1/n ∑ |true - predict|   = MAE
    mse = sum((merged['true'] - merged['predicted'])**2)/len(true)    # 1/n ∑ (true - predict)^2 = MSE
    
    return mae, mse

In [10]:
print('TFIDF Validation Set')
print('R2: '+str(round(r2_calculator(y_TFIDF_predicted, y_valid),4)))
print('Adjusted R2: '+str(round(adjusted_r2_calculator(y_TFIDF_predicted,y_valid),4)))
print('MAE: '+ str(round(mae_and_mse_calculator(y_TFIDF_predicted,y_valid)[0],4)))
print('RMSE: '+ str(round(mae_and_mse_calculator(y_TFIDF_predicted,y_valid)[1]**0.5,4)))

TFIDF Validation Set
R2: 0.9091
Adjusted R2: 0.9091
MAE: 0.0687
RMSE: 0.1108


In [11]:
print('Cos Similarity Validation Set')
print('R2: '+str(round(r2_calculator(y_predicted, y_valid),4)))
print('Adjusted R2: '+str(round(adjusted_r2_calculator(y_predicted,y_valid),4)))
print('MAE: '+ str(round(mae_and_mse_calculator(y_predicted,y_valid)[0],4)))
print('RMSE: '+ str(round(mae_and_mse_calculator(y_predicted,y_valid)[1]**0.5,4)))

Cos Similarity Validation Set
R2: 0.6928
Adjusted R2: 0.6928
MAE: 0.1531
RMSE: 0.2036


When the model with TFIDF vectorization is being generalized using validation set, it is already at highest accuracy and lowest loss. The model with cosine similarities still performs well when it generalizes. The problem with TFIDF vectorization is that the number of parameters is too wide.

## Random Forest Regression <a id='rfr'></a>

In [12]:
TFIDF_rfr_model = RandomForestRegressor()
TFIDF_rfr_model.fit(X_TFIDF_train, y_train.ravel())

rfr_model = RandomForestRegressor()
rfr_model.fit(X_train, y_train.ravel())

RandomForestRegressor()

In [13]:
print('TFIDF Similarity Validation Set')
print('R2: ' + str(round(r2_calculator(TFIDF_rfr_model.predict(X_TFIDF_valid),y_valid),4)))
print('Adjusted R2: ' + str(round(adjusted_r2_calculator(TFIDF_rfr_model.predict(X_TFIDF_valid),y_valid),4)))
print('MAE: ' + str(round(mae_and_mse_calculator(TFIDF_rfr_model.predict(X_TFIDF_valid),y_valid)[0],4)))
print('RMSE: ' + str(round(mae_and_mse_calculator(TFIDF_rfr_model.predict(X_TFIDF_valid),y_valid)[1]**0.5,4)))

TFIDF Similarity Validation Set
R2: 0.9091
Adjusted R2: 0.9091
MAE: 0.0687
RMSE: 0.1107


In [14]:
print('Cos Similarity Validation Set')
print('R2: ' + str(round(r2_calculator(rfr_model.predict(X_valid),y_valid),4)))
print('Adjusted R2: ' + str(round(adjusted_r2_calculator(rfr_model.predict(X_valid),y_valid),4)))
print('MAE: ' + str(round(mae_and_mse_calculator(rfr_model.predict(X_valid),y_valid)[0],4)))
print('RMSE: ' + str(round(mae_and_mse_calculator(rfr_model.predict(X_valid),y_valid)[1]**0.5,4)))

Cos Similarity Validation Set
R2: 0.9091
Adjusted R2: 0.9091
MAE: 0.0687
RMSE: 0.1107


As a result, the random forest regressor proves to be capable of performing the model at highest level even if the number of parameters reduce using cosine similarities. 

## Light Gradient Boosting Model With Bayesian Optimization <a id='lgbm'></a>

In [15]:
def lgbm_eval(lambda_l2,lambda_l1,max_depth,learning_rate,n_estimators):
    lgbm_model = LGBMRegressor(lambda_l2 = lambda_l2, lambda_l1 = lambda_l1, 
                               max_depth = int(round(max_depth,0)),
                               learning_rate = learning_rate, 
                               n_estimators = int(round(n_estimators,0)))
    
    lgbm_model.fit(X_train, y_train.ravel())
    return r2_calculator(lgbm_model.predict(X_valid),y_valid)

In [16]:
lgbmBO = BayesianOptimization(lgbm_eval, {'lambda_l2':(0, 0.5),
                                          'lambda_l1':(0,0.5),
                                          'max_depth':(-1,6),
                                          'learning_rate':(0.01,0.5),
                                          'n_estimators':(10,200)})

lgbmBO.maximize(n_iter=20, init_points=2)

|   iter    |  target   | lambda_l1 | lambda_l2 | learni... | max_depth | n_esti... |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.9091  [0m | [0m 0.09487 [0m | [0m 0.1745  [0m | [0m 0.2014  [0m | [0m-0.8501  [0m | [0m 80.65   [0m |
| [0m 2       [0m | [0m 0.9091  [0m | [0m 0.3941  [0m | [0m 0.1887  [0m | [0m 0.1184  [0m | [0m 2.559   [0m | [0m 104.9   [0m |
| [0m 3       [0m | [0m 0.9091  [0m | [0m 0.001538[0m | [0m 0.2456  [0m | [0m 0.4179  [0m | [0m 0.2545  [0m | [0m 80.02   [0m |
| [95m 4       [0m | [95m 0.9091  [0m | [95m 0.2373  [0m | [95m 0.1865  [0m | [95m 0.3677  [0m | [95m 4.104   [0m | [95m 197.4   [0m |
| [95m 5       [0m | [95m 0.9091  [0m | [95m 0.05774 [0m | [95m 0.07863 [0m | [95m 0.1766  [0m | [95m 4.936   [0m | [95m 44.38   [0m |
| [95m 6       [0m | [95m 0.9091  [0m | [95m 0.3401  [0m | [95m 0.1508  [0m | [95m 0.1808  [0m

In [17]:
def TFIDF_lgbm_eval(lambda_l2,lambda_l1,max_depth,learning_rate,n_estimators):
    lgbm_model = LGBMRegressor(lambda_l2 = lambda_l2, lambda_l1 = lambda_l1, 
                               max_depth = int(round(max_depth,0)),
                               learning_rate = learning_rate, 
                               n_estimators = int(round(n_estimators,0)))
    
    lgbm_model.fit(X_TFIDF_train, y_train.ravel())
    return r2_calculator(lgbm_model.predict(X_TFIDF_valid),y_valid)

In [18]:
TFIDF_lgbmBO = BayesianOptimization(TFIDF_lgbm_eval, {'lambda_l2':(0, 0.5),
                                          'lambda_l1':(0,0.5),
                                          'max_depth':(-1,6),
                                          'learning_rate':(0.01,0.5),
                                          'n_estimators':(10,200)})

TFIDF_lgbmBO.maximize(n_iter=20, init_points=2)

|   iter    |  target   | lambda_l1 | lambda_l2 | learni... | max_depth | n_esti... |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.9091  [0m | [0m 0.4742  [0m | [0m 0.0881  [0m | [0m 0.259   [0m | [0m 4.581   [0m | [0m 135.8   [0m |
| [0m 2       [0m | [0m 0.906   [0m | [0m 0.3256  [0m | [0m 0.3623  [0m | [0m 0.3262  [0m | [0m 0.7226  [0m | [0m 61.46   [0m |
| [95m 3       [0m | [95m 0.9091  [0m | [95m 0.4992  [0m | [95m 0.1555  [0m | [95m 0.4905  [0m | [95m 5.784   [0m | [95m 136.3   [0m |
| [0m 4       [0m | [0m 0.9082  [0m | [0m 0.4417  [0m | [0m 0.1758  [0m | [0m 0.01701 [0m | [0m-0.8859  [0m | [0m 199.8   [0m |
| [0m 5       [0m | [0m 0.8736  [0m | [0m 0.0     [0m | [0m 0.2158  [0m | [0m 0.01    [0m | [0m-1.0     [0m | [0m 161.1   [0m |
| [0m 6       [0m | [0m 0.9091  [0m | [0m 0.5     [0m | [0m 0.4773  [0m | [0m 0.5     [0m | [0m 6.0 

In [19]:
print(lgbmBO.max['params'])
print(TFIDF_lgbmBO.max['params'])

{'lambda_l1': 0.340093247157719, 'lambda_l2': 0.15080299601720903, 'learning_rate': 0.1808274612032137, 'max_depth': 4.572273005043445, 'n_estimators': 39.323025009117075}
{'lambda_l1': 0.053083311782776144, 'lambda_l2': 0.028736780795506844, 'learning_rate': 0.3752551731221079, 'max_depth': -0.9437298530160639, 'n_estimators': 14.950985278511745}


In [20]:
lgbm_model = LGBMRegressor(lambda_l2 = lgbmBO.max['params']['lambda_l1'],
                           lambda_l1 = lgbmBO.max['params']['lambda_l2'],
                           max_depth = int(round(lgbmBO.max['params']['max_depth'],0)),
                           learning_rate = lgbmBO.max['params']['learning_rate'],
                           n_estimators = int(round(lgbmBO.max['params']['n_estimators'],0)))
lgbm_model.fit(X_train, y_train.ravel())

LGBMRegressor(lambda_l1=0.15080299601720903, lambda_l2=0.340093247157719,
              learning_rate=0.1808274612032137, max_depth=5, n_estimators=39)

In [21]:
TFIDF_lgbm_model = LGBMRegressor(lambda_l2 = TFIDF_lgbmBO.max['params']['lambda_l1'],
                           lambda_l1 = TFIDF_lgbmBO.max['params']['lambda_l2'],
                           max_depth = int(round(TFIDF_lgbmBO.max['params']['max_depth'],0)),
                           learning_rate = TFIDF_lgbmBO.max['params']['learning_rate'],
                           n_estimators = int(round(TFIDF_lgbmBO.max['params']['n_estimators'],0)))
TFIDF_lgbm_model.fit(X_TFIDF_train, y_train.ravel())

LGBMRegressor(lambda_l1=0.028736780795506844, lambda_l2=0.053083311782776144,
              learning_rate=0.3752551731221079, n_estimators=15)

In [22]:
print('TFIDF Validation Set')
print('R2: ' + str(round(r2_calculator(TFIDF_lgbm_model.predict(X_TFIDF_valid),y_valid),4)))
print('Adjusted R2: ' + str(round(adjusted_r2_calculator(TFIDF_lgbm_model.predict(X_TFIDF_valid),y_valid),4)))
print('MAE: ' + str(round(mae_and_mse_calculator(TFIDF_lgbm_model.predict(X_TFIDF_valid),y_valid)[0],4)))
print('RMSE: ' + str(round(mae_and_mse_calculator(TFIDF_lgbm_model.predict(X_TFIDF_valid),y_valid)[1]**0.5,4)))

TFIDF Validation Set
R2: 0.9091
Adjusted R2: 0.9091
MAE: 0.0688
RMSE: 0.1107


In [23]:
print('Cosine Similarity Validation Set')
print('R2: ' + str(round(r2_calculator(lgbm_model.predict(X_valid),y_valid),4)))
print('Adjusted R2: ' + str(round(adjusted_r2_calculator(lgbm_model.predict(X_valid),y_valid),4)))
print('MAE: ' + str(round(mae_and_mse_calculator(lgbm_model.predict(X_valid),y_valid)[0],4)))
print('RMSE: ' + str(round(mae_and_mse_calculator(lgbm_model.predict(X_valid),y_valid)[1]**0.5,4)))

Cosine Similarity Validation Set
R2: 0.9091
Adjusted R2: 0.9091
MAE: 0.0688
RMSE: 0.1107


When the hyperparameter search completes, the result indicates that the error is irreducible. Two ways to reduce this error are feature engineering and data wrangling. Besides this, the random forest model is top choice for this problem because this model is more resistant to overfitting. 

In [24]:
print('Cos Similarity Testing Set')
print('R2: ' + str(round(r2_calculator(rfr_model.predict(X_test),y_test),4)))
print('Adjusted R2: ' + str(round(adjusted_r2_calculator(rfr_model.predict(X_test),y_test),4)))
print('MAE: ' + str(round(mae_and_mse_calculator(rfr_model.predict(X_test),y_test)[0],4)))
print('RMSE: ' + str(round(mae_and_mse_calculator(rfr_model.predict(X_test),y_test)[1]**0.5,4)))

Cos Similarity Testing Set
R2: 0.9097
Adjusted R2: 0.9097
MAE: 0.0686
RMSE: 0.1106


In [25]:
print(f'Time taken to run: {time.time() - start} seconds')

Time taken to run: 77.93327593803406 seconds
