### 5a. Non-Text Features

#### Import Required Modules and Load Data

In [1]:
import pickle
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

from catboost import CatBoostRegressor, Pool

from catboost import EShapCalcType, EFeaturesSelectionAlgorithm

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

from keras.regularizers import l2
from keras.layers import Dropout, Dense, Input
from keras import Model
from tensorflow import keras
from keras import losses

from scikeras.wrappers import KerasRegressor

from keras.callbacks import EarlyStopping

import model_evaluation as me

import warnings
warnings.filterwarnings("ignore")

  from pandas import MultiIndex, Int64Index


In [2]:
with open('X_tr_1_nontext.pkl', 'rb') as file_name:
    X_tr_1_nontext = pickle.load(file_name)
    
with open('X_val_1_nontext.pkl', 'rb') as file_name:
    X_val_1_nontext = pickle.load(file_name)
    
with open('y_tr_1.pkl', 'rb') as file_name:
    y_tr_1 = pickle.load(file_name)
    
with open('y_val_1.pkl', 'rb') as file_name:
    y_val_1 = pickle.load(file_name)
    
with open('X_tr_full_nontext.pkl', 'rb') as file_name:
    X_tr_full_nontext = pickle.load(file_name)
    
with open('X_tr_full_text.pkl', 'rb') as file_name:
    X_tr_full_text = pickle.load(file_name)
    
with open('y_train.pkl', 'rb') as file_name:
    y_train = pickle.load(file_name)

In [3]:
with open('X_test_nontext.pkl', 'rb') as file_name:
    X_test_nontext = pickle.load(file_name)

#### Evaluation Metrics

In [4]:
cv = KFold(n_splits=10, shuffle=True, random_state=777)

In [5]:
scoring = {'negRMSE': 'neg_root_mean_squared_error', 'negMAE': 'neg_mean_absolute_error', 'R_squared': 'r2'}

In [6]:
model_name = []
cv_RMSE = []
cv_MAE = []
cv_R2 = []

#### 1. Ridge Regression

In [7]:
ridge = Ridge()

In [8]:
score = cross_validate(ridge, X_tr_full_nontext, y_train, scoring=scoring, cv=cv)

In [9]:
model_name.append('Ridge Regression')
cv_RMSE.append(-score['test_negRMSE'].mean())
cv_MAE.append(-score['test_negMAE'].mean())
cv_R2.append(score['test_R_squared'].mean())

#### 2. Random Forest Regressor

In [10]:
rf = RandomForestRegressor()

parameters = {"n_estimators" : [750],
             "max_features" : ["sqrt"],
             "min_samples_split" : [3, 4, 5],
             "min_samples_leaf" : [3, 5],
             "random_state" : [123]
             }

rf_grid = GridSearchCV(rf,
                        parameters,
                        cv = 2,
                        scoring='neg_root_mean_squared_error',
                        verbose=False)

rf_grid.fit(X_tr_full_nontext, y_train)

GridSearchCV(cv=2, estimator=RandomForestRegressor(),
             param_grid={'max_features': ['sqrt'], 'min_samples_leaf': [3, 5],
                         'min_samples_split': [3, 4, 5], 'n_estimators': [750],
                         'random_state': [123]},
             scoring='neg_root_mean_squared_error', verbose=False)

In [11]:
best_params = rf_grid.best_params_

rf_reg = RandomForestRegressor(**best_params)

In [12]:
score = cross_validate(rf_reg, X_tr_full_nontext, y_train, scoring=scoring, cv=cv)

In [13]:
model_name.append('Random Forest Regressor')
cv_RMSE.append(-score['test_negRMSE'].mean())
cv_MAE.append(-score['test_negMAE'].mean())
cv_R2.append(score['test_R_squared'].mean())

#### 3. XGB Regressor

In [14]:
xgb_reg = XGBRegressor()

parameters = {'learning_rate': [0.03, 0.04, 0.05],
              'max_depth': [8],
              'min_child_weight': [4, 6],
              'colsample_bytree': [0.7],
              'n_estimators': [800],
              'random_state': [123]}

xgb_grid = GridSearchCV(xgb_reg,
                        parameters,
                        cv = 2,
                        n_jobs = -1,
                        scoring='neg_root_mean_squared_error',
                        verbose=False)

xgb_grid.fit(X_tr_full_nontext, y_train)

GridSearchCV(cv=2,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n...
                                    random_state=None, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
                                    subsample=None, tree_method=None,
                                    val

In [15]:
xgb_reg = XGBRegressor(**xgb_grid.best_params_)

In [16]:
score = cross_validate(xgb_reg, X_tr_full_nontext, y_train, scoring=scoring, cv=cv)

In [17]:
model_name.append('XGB Regressor')
cv_RMSE.append(-score['test_negRMSE'].mean())
cv_MAE.append(-score['test_negMAE'].mean())
cv_R2.append(score['test_R_squared'].mean())

#### 4. CatBoost Regressor

In [18]:
catboost_params = {
    'iterations': [2000],
    'depth': [6, 8, 10],
    'learning_rate': [0.02, 0.03, 0.04],
    'l2_leaf_reg': [2, 3, 4],
    'early_stopping_rounds': [20],
    'verbose': [0]
}


cbr = CatBoostRegressor()

cbr_grid = GridSearchCV(cbr,
                        catboost_params,
                        cv = 2,
                        n_jobs = -1,
                        scoring='neg_root_mean_squared_error',
                        verbose=False)

cbr_grid.fit(X_tr_full_nontext, y_train)

GridSearchCV(cv=2,
             estimator=<catboost.core.CatBoostRegressor object at 0x17116f850>,
             n_jobs=-1,
             param_grid={'depth': [6, 8, 10], 'early_stopping_rounds': [20],
                         'iterations': [2000], 'l2_leaf_reg': [2, 3, 4],
                         'learning_rate': [0.02, 0.03, 0.04], 'verbose': [0]},
             scoring='neg_root_mean_squared_error', verbose=False)

In [19]:
best_params = cbr_grid.best_params_
best_params['iterations'] = 1500
best_params['verbose'] = 0

In [20]:
cbr_reg = CatBoostRegressor(**best_params)

In [21]:
score = cross_validate(cbr_reg, X_tr_full_nontext, y_train, scoring=scoring, cv=cv)

In [22]:
model_name.append('CatBoostRegressor')
cv_RMSE.append(-score['test_negRMSE'].mean())
cv_MAE.append(-score['test_negMAE'].mean())
cv_R2.append(score['test_R_squared'].mean())

### 5. Feature Selection from CatBoost Regressor 

In [23]:
train_pool = Pool(X_tr_1_nontext, y_tr_1)
test_pool = Pool(X_val_1_nontext, y_val_1)

In [24]:
summary = cbr_reg.select_features(
    train_pool,
    eval_set=test_pool,
    features_for_select=['host_response_rate-flag_missing_description'],
    num_features_to_select=70,
    steps=3,
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    shap_calc_type=EShapCalcType.Regular,
    train_final_model=True,
    logging_level='Silent',
    plot=False
)

In [25]:
X_tr_1_nontext_sel = X_tr_1_nontext.iloc[:,summary['selected_features']]
X_val_1_nontext_sel = X_val_1_nontext.iloc[:,summary['selected_features']]

In [26]:
X_tr_full_nontext_sel = X_tr_full_nontext.iloc[:,summary['selected_features']]

In [27]:
X_test_nontext_sel = X_test_nontext.iloc[:,summary['selected_features']]

We save the pruned datasets.

In [28]:
with open('X_tr_1_nontext_sel.pkl', 'wb') as file_name:
    pickle.dump(X_tr_1_nontext_sel, file_name)
    
with open('X_val_1_nontext_sel.pkl', 'wb') as file_name:
    pickle.dump(X_val_1_nontext_sel, file_name)
    
with open('X_tr_full_nontext_sel.pkl', 'wb') as file_name:
    pickle.dump(X_tr_full_nontext_sel, file_name)
    
with open('X_test_nontext_sel.pkl', 'wb') as file_name:
    pickle.dump(X_test_nontext_sel, file_name)

#### 5.1 XGB with selected features

In [29]:
score = cross_validate(xgb_reg, X_tr_full_nontext_sel, y_train, scoring=scoring, cv=cv)

model_name.append('XGB Regressor (Feat. Sel.)')
cv_RMSE.append(-score['test_negRMSE'].mean())
cv_MAE.append(-score['test_negMAE'].mean())
cv_R2.append(score['test_R_squared'].mean())

#### 5.2 CBR with selected features

In [30]:
catboost_params = {
    'iterations': [2000],
    'depth': [10],
    'learning_rate': [0.03, 0.035, 0.04],
    'l2_leaf_reg': [3, 4, 5],
    'early_stopping_rounds': [20],
    'verbose': [0]
}


cbr = CatBoostRegressor()

cbr_grid = GridSearchCV(cbr,
                        catboost_params,
                        cv = 2,
                        n_jobs = -1,
                        scoring='neg_root_mean_squared_error',
                        verbose=False)

cbr_grid.fit(X_tr_1_nontext_sel, y_tr_1, eval_set=(X_val_1_nontext_sel, y_val_1))

GridSearchCV(cv=2,
             estimator=<catboost.core.CatBoostRegressor object at 0x17ce7ab20>,
             n_jobs=-1,
             param_grid={'depth': [10], 'early_stopping_rounds': [20],
                         'iterations': [2000], 'l2_leaf_reg': [3, 4, 5],
                         'learning_rate': [0.03, 0.035, 0.04], 'verbose': [0]},
             scoring='neg_root_mean_squared_error', verbose=False)

In [31]:
cbr_reg = CatBoostRegressor(**cbr_grid.best_params_)

In [32]:
score = cross_validate(cbr_reg, X_tr_full_nontext_sel, y_train, scoring=scoring, cv=cv)

In [33]:
model_name.append('CatBoostRegressor (Feat. Sel.)')
cv_RMSE.append(-score['test_negRMSE'].mean())
cv_MAE.append(-score['test_negMAE'].mean())
cv_R2.append(score['test_R_squared'].mean())

#### 5.3 Neural Network with Dropout Regularization
Based on [this source](https://www.adriangb.com/scikeras/stable/notebooks/Basic_Usage.html#3.-Training-a-regressor).

In [34]:
callbacks = [EarlyStopping(monitor="val_loss", patience=3, verbose=1, min_delta=0.1, restore_best_weights=True)]

In [35]:
def get_reg_nn(n_hidden=[256, 128, 64], dropout=0.025, reg=0.025, N=X_tr_1_nontext_sel.shape[1]):
    
    n_epochs = 15
    batch_size = 128
    
    inputs = Input(shape=(N,))
    inter = Dropout(dropout)(inputs)
    inter = Dense(n_hidden[0], activation='relu', kernel_initializer=keras.initializers.he_normal(seed=123), kernel_regularizer=l2(reg))(inter)
    for i in range(len(n_hidden) - 1):
        inter = Dropout(dropout)(inter)
        inter = Dense(n_hidden[i+1], activation='relu', kernel_initializer=keras.initializers.he_normal(seed=123), kernel_regularizer=l2(reg))(inter)
    inter = Dropout(dropout)(inter)
    outputs = Dense(1)(inter)
    model = Model(inputs, outputs)
    
    return model

In [43]:
reg = KerasRegressor(
    model=get_reg_nn(N=X_tr_full_nontext_sel.shape[1]),
    optimizer='adam',
    loss='mean_squared_error',
    metrics=[me.root_mean_squared_error],
    verbose=0
)

In [44]:
score = cross_validate(reg, X_tr_full_nontext_sel, y_train, scoring=scoring, cv=cv)

INFO:tensorflow:Assets written to: ram:///var/folders/ry/rzz66hz547v4ph_l8p5sc_d00000gn/T/tmpdicw7pvp/assets
INFO:tensorflow:Assets written to: ram:///var/folders/ry/rzz66hz547v4ph_l8p5sc_d00000gn/T/tmpqz85kkvx/assets
INFO:tensorflow:Assets written to: ram:///var/folders/ry/rzz66hz547v4ph_l8p5sc_d00000gn/T/tmpubv_1_rz/assets
INFO:tensorflow:Assets written to: ram:///var/folders/ry/rzz66hz547v4ph_l8p5sc_d00000gn/T/tmpb9dkhm83/assets
INFO:tensorflow:Assets written to: ram:///var/folders/ry/rzz66hz547v4ph_l8p5sc_d00000gn/T/tmp6ctlpsye/assets
INFO:tensorflow:Assets written to: ram:///var/folders/ry/rzz66hz547v4ph_l8p5sc_d00000gn/T/tmpsb456dh2/assets
INFO:tensorflow:Assets written to: ram:///var/folders/ry/rzz66hz547v4ph_l8p5sc_d00000gn/T/tmp3liifmxg/assets
INFO:tensorflow:Assets written to: ram:///var/folders/ry/rzz66hz547v4ph_l8p5sc_d00000gn/T/tmpspipppf3/assets
INFO:tensorflow:Assets written to: ram:///var/folders/ry/rzz66hz547v4ph_l8p5sc_d00000gn/T/tmpkh39xavt/assets
INFO:tensorflow:Ass

In [45]:
model_name.append('Deep Neural Network (Feat. Sel.)')
cv_RMSE.append(-score['test_negRMSE'].mean())
cv_MAE.append(-score['test_negMAE'].mean())
cv_R2.append(score['test_R_squared'].mean())

#### Export Results Table

In [46]:
results = pd.DataFrame({'Model Name': model_name, 'RMSE': cv_RMSE, 'MAE': cv_MAE, 'R-squared': cv_R2})

In [47]:
results.to_csv('results_models_nontext.csv')

In [48]:
results

Unnamed: 0,Model Name,RMSE,MAE,R-squared
0,Ridge Regression,54.554011,34.767277,0.575427
1,Random Forest Regressor,48.996181,29.725004,0.657523
2,XGB Regressor,45.370005,27.335242,0.706233
3,CatBoostRegressor,45.562294,27.495034,0.703754
4,XGB Regressor (Feat. Sel.),45.457703,27.379079,0.705108
5,CatBoostRegressor (Feat. Sel.),45.074265,27.108501,0.710033
6,Deep Neural Network (Feat. Sel.),51.983582,31.575575,0.614442
