# Train Final Models Using LightGBM
After comparing the performance of different models fitted by the team, we decided to use LightGBM for our final models.  
The code below trains 3 models, each with its own hyperparameter values:
1. LGBM trained on all data
2. LGBM trained on CA data only
3. LGBM trained on GA data only

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score as auc
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as rmse_score

from scipy.stats import randint
from scipy.stats import loguniform
from scipy.stats import uniform

from xgboost.sklearn import XGBRegressor
from lightgbm.sklearn import LGBMRegressor

from hyperopt import tpe, hp, fmin, STATUS_OK,Trials
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample


### Code to save and load models

In [None]:
# save models to local drive
import pickle

def save_obj(obj, filename):
    """Pickle ``obj`` into the file at ``filename``.

    The file is opened in binary write mode and the object is serialised
    with ``pickle.HIGHEST_PROTOCOL``.

    This is a best-effort helper: any exception raised while opening the
    file or pickling is caught and reported on stdout as ``Error: <exc>``.
    Nothing is re-raised, so the function returns ``None`` whether or not
    the write succeeded.
    """
    try:
        with open(filename, mode="wb") as sink:
            pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL).dump(obj)
    except Exception as err:
        print("Error:", err)


def load_obj(filename):
    """Return the object unpickled from the file at ``filename``.

    The file is opened in binary read mode. Any exception raised while
    opening or unpickling is caught and printed as ``Error: <exc>``; in
    that case the function falls through and returns ``None``.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    try:
        with open(filename, mode="rb") as source:
            restored = pickle.Unpickler(source).load()
        return restored
    except Exception as err:
        print("Error:", err)


## Train LGBM on updated data - All Data

In [None]:
# load data and clean
# Two sets of CSVs are read: the train/validation split used while tuning
# hyperparameters, and the full train/test split used for the final fit.
tuning_dir = '../large_data_files/Train and Valid Data for model comparison/'
all_data_dir = '../large_data_files/All Data/'

X_train = pd.read_csv(tuning_dir + 'X_train_model2.csv')
y_train = pd.read_csv(tuning_dir + 'y_train_model2.csv')
X_validate = pd.read_csv(tuning_dir + 'X_valid_model2.csv')
y_validate = pd.read_csv(tuning_dir + 'y_valid_model2.csv')

X_train_all = pd.read_csv(all_data_dir + 'X_train_all.csv')
y_train_all = pd.read_csv(all_data_dir + 'y_train_all.csv')
X_test = pd.read_csv(all_data_dir + 'X_test_all.csv')
y_test = pd.read_csv(all_data_dir + 'y_test_all.csv')

# Every file carries an 'Unnamed: 0' column (presumably the index written
# out when the CSV was saved -- TODO confirm); drop it in place everywhere.
for frame in (
    X_train, y_train, X_validate, y_validate,
    X_train_all, y_train_all, X_test, y_test,
):
    frame.drop(columns=['Unnamed: 0'], inplace=True)

# rename this column which gives an error in LGBM because its name has quotation marks
for features in (X_train, X_validate, X_train_all, X_test):
    features.rename(columns={'city_"oneals"': 'city_oneals'}, inplace=True)


In [None]:
# use hyperopt to tune hyperparameter values
# Search space for the all-data model. Notes on how values are sampled:
#   * scope.int(hp.quniform(label, low, high, q)) -> integers on a grid of step q
#   * hp.loguniform(label, low, high) samples exp(uniform(low, high)), so
#     learning_rate ranges over roughly [0.27, 1.0]
#   * each lambda is either exactly 0 or drawn from the nested loguniform
# NOTE(review): bagging_fraction is tuned without bagging_freq; the LightGBM
# docs say bagging is only enabled when bagging_freq is non-zero -- confirm
# whether this parameter has any effect here.
space = dict(
    boosting='dart',
    n_estimators=scope.int(hp.quniform('n_estimators', 255, 290, 3)),
    num_leaves=scope.int(hp.quniform("num_leaves", 140, 170, 2)),
    max_depth=scope.int(hp.quniform("max_depth", 0, 4, 1)),
    learning_rate=hp.loguniform('learning_rate', -1.3, 0),
    min_data_in_leaf=scope.int(hp.quniform('min_data_in_leaf', 0, 4, 1)),
    feature_fraction=hp.uniform('feature_fraction', 0.73, 0.83),
    bagging_fraction=hp.uniform('bagging_fraction', 0.5, 0.6),
    lambda_l1=hp.choice(
        'lambda_l1', [0, hp.loguniform('lambda_l1_value', -10, -6)]
    ),
    lambda_l2=hp.choice(
        'lambda_l2', [0, hp.loguniform('lambda_l2_value', -4, -2.5)]
    ),
    min_child_weight=hp.loguniform('min_child_weight', -12, -10),
)

def lgbm_tuning(params):
    """Hyperopt objective for the all-data model.

    Builds an ``LGBMRegressor`` from ``params``, fits it on the notebook-level
    ``X_train`` / ``y_train`` and scores it on ``X_validate`` / ``y_validate``.

    Prints the validation RMSE and returns the hyperopt result dict
    ``{'loss': rmse, 'status': STATUS_OK}``.
    """
    model = LGBMRegressor(**params)
    model.fit(X_train, y_train)
    validation_pred = model.predict(X_validate)
    # rmse_score is the name mean_squared_error was imported under in this
    # notebook, so the square root is taken here to obtain the RMSE.
    squared_error = rmse_score(y_true=y_validate, y_pred=validation_pred)
    rmse = squared_error ** 0.5
    print("RMSE:", rmse)
    return dict(loss=rmse, status=STATUS_OK)


# Collects the result of every evaluation; handed to fmin in the next cell.
trials = Trials()



In [None]:
# Minimise lgbm_tuning over `space` with the TPE algorithm for 150
# evaluations; per-trial results are recorded in `trials`.
best = fmin(lgbm_tuning, space, algo=tpe.suggest, max_evals=150, trials=trials)

In [None]:
# Display the best trial found by fmin.
# For the hp.choice entries ('lambda_l1', 'lambda_l2') the value shown is the
# index of the selected option, not the option itself. In the recorded output
# the nested '*_value' key appears only for the choice whose index is 1.
# Quantised parameters come back as floats (e.g. 260.0).
best

{'bagging_fraction': 0.552936314269197,
 'feature_fraction': 0.7733644348291246,
 'lambda_l1': 0,
 'lambda_l2': 1,
 'lambda_l2_value': 0.05349775487562866,
 'learning_rate': 0.5567627566029032,
 'max_depth': 2.0,
 'min_child_weight': 2.607851332618328e-05,
 'min_data_in_leaf': 2.0,
 'n_estimators': 260.0,
 'num_leaves': 160.0}

In [None]:
# fit on all training data
# Refit with the tuned hyperparameters on the full training set, then report
# R^2 on the held-out test set (the last expression is the cell's output).
#
# `best` stores only the *index* chosen by each hp.choice ('lambda_l1',
# 'lambda_l2'). The nested 'lambda_l*_value' key is present only when the
# loguniform branch was selected. Falling back to 0 (the other branch of the
# choice) keeps this cell correct whichever branch a tuning run picks,
# instead of raising KeyError or silently discarding a tuned L1 penalty.
lgbm_all_data = LGBMRegressor(
    boosting='DART',
    bagging_fraction=best['bagging_fraction'],
    feature_fraction=best['feature_fraction'],
    reg_alpha=best.get('lambda_l1_value', 0),
    reg_lambda=best.get('lambda_l2_value', 0),
    # quantised parameters are returned as floats (e.g. 260.0), hence int()
    max_depth=int(best['max_depth']),
    min_child_weight=best['min_child_weight'],
    min_data_in_leaf=int(best['min_data_in_leaf']),
    n_estimators=int(best['n_estimators']),
    num_leaves=int(best['num_leaves']),
    learning_rate=best['learning_rate'],
)
lgbm_all_data.fit(X_train_all, y_train_all)
r2_score(y_test, lgbm_all_data.predict(X_test))



0.5475287443196867

In [None]:
# save model
# Pickles the fitted all-data regressor. save_obj prints "Error: ..." instead
# of raising when the write fails, so check the cell output before relying
# on the file.
save_obj(lgbm_all_data, '../models/lgbm_all_data.pickle')

## Train on GA data only

In [None]:
# load data and clean
def _load_state_split(features_csv, target_csv):
    """Read one feature/target CSV pair and remove the columns not used here.

    From the feature frame this drops every column whose name contains
    'state_', 'city_' or 'county_', plus the 'Unnamed: 0' column. From the
    target frame it drops 'Unnamed: 0' only.

    The former rename of 'city_"oneals"' is intentionally gone: that column
    contains 'city_' and is therefore always dropped, so the rename could
    never match anything.

    Returns a ``(features, target)`` tuple of DataFrames.
    """
    features = pd.read_csv(features_csv)
    target = pd.read_csv(target_csv)

    # Substring test matches what DataFrame.filter(regex=...) selected for
    # these three plain-text patterns.
    location_cols = [
        name
        for name in features.columns
        if any(tag in name for tag in ('state_', 'city_', 'county_'))
    ]
    features = features.drop(columns=location_cols + ['Unnamed: 0'])
    target = target.drop(columns=['Unnamed: 0'])
    return features, target


# training data
X_train_GA, y_train_GA = _load_state_split(
    '../large_data_files/GA Data/X_train_GA.csv',
    '../large_data_files/GA Data/y_train_GA.csv',
)

# test data
X_test_GA, y_test_GA = _load_state_split(
    '../large_data_files/GA Data/X_test_GA.csv',
    '../large_data_files/GA Data/y_test_GA.csv',
)


In [None]:
# train validate split
# 70% of the GA training rows are used to fit during tuning and the rest
# to validate; random_state=42 fixes the shuffle.
(
    X_train_GA_train,
    X_validate_GA,
    y_train_GA_train,
    y_validate_GA,
) = train_test_split(
    X_train_GA,
    y_train_GA,
    train_size=0.7,
    random_state=42,
)

In [None]:
# use hyperopt to tune hyperparameters
# Search space for the GA-only model. Notes on how values are sampled:
#   * scope.int(hp.quniform(label, low, high, q)) -> integers on a grid of step q
#   * hp.loguniform(label, low, high) samples exp(uniform(low, high)), so
#     learning_rate ranges over roughly [0.03, 0.61]
#   * lambda_l1 has a single option (0); lambda_l2 is either 0 or drawn from
#     the nested loguniform
# NOTE(review): bagging_fraction is tuned without bagging_freq; the LightGBM
# docs say bagging is only enabled when bagging_freq is non-zero -- confirm
# whether this parameter has any effect here.
space = dict(
    boosting='dart',
    n_estimators=scope.int(hp.quniform('n_estimators', 235, 265, 3)),
    num_leaves=scope.int(hp.quniform("num_leaves", 195, 215, 3)),
    max_depth=scope.int(hp.quniform("max_depth", 2, 6, 1)),
    learning_rate=hp.loguniform('learning_rate', -3.5, -0.5),
    min_data_in_leaf=scope.int(hp.quniform('min_data_in_leaf', 0, 4, 1)),
    feature_fraction=hp.uniform('feature_fraction', 0.70, 0.77),
    bagging_fraction=hp.uniform('bagging_fraction', 0.62, 0.68),
    lambda_l1=hp.choice('lambda_l1', [0]),
    lambda_l2=hp.choice(
        'lambda_l2', [0, hp.loguniform('lambda_l2_value', -7, -5.5)]
    ),
    min_child_weight=hp.loguniform('min_child_weight', -9.5, -7.5),
)

def lgbm_tuning(params):
    """Hyperopt objective for the GA-only model.

    Builds an ``LGBMRegressor`` from ``params``, fits it on the notebook-level
    ``X_train_GA_train`` / ``y_train_GA_train`` and scores it on
    ``X_validate_GA`` / ``y_validate_GA``.

    Prints the validation RMSE and returns the hyperopt result dict
    ``{'loss': rmse, 'status': STATUS_OK}``.
    """
    model = LGBMRegressor(**params)
    model.fit(X_train_GA_train, y_train_GA_train)
    validation_pred = model.predict(X_validate_GA)
    # rmse_score is the name mean_squared_error was imported under in this
    # notebook, so the square root is taken here to obtain the RMSE.
    squared_error = rmse_score(y_true=y_validate_GA, y_pred=validation_pred)
    rmse = squared_error ** 0.5
    print("RMSE:", rmse)
    return dict(loss=rmse, status=STATUS_OK)

# A new Trials object records the evaluations of this search.
trials = Trials()

# Minimise lgbm_tuning over `space` with the TPE algorithm for 150 evaluations.
best = fmin(lgbm_tuning, space, algo=tpe.suggest, max_evals=150, trials=trials)


In [None]:
# Display the best trial found by fmin for the GA search.
# For the hp.choice entries ('lambda_l1', 'lambda_l2') the value shown is the
# index of the selected option, not the option itself. In the recorded output
# 'lambda_l2_value' is present alongside 'lambda_l2': 1.
# Quantised parameters come back as floats (e.g. 252.0).
best

{'bagging_fraction': 0.655632696730533,
 'feature_fraction': 0.7665910276504753,
 'lambda_l1': 0,
 'lambda_l2': 1,
 'lambda_l2_value': 0.002716867464021161,
 'learning_rate': 0.44406926645007505,
 'max_depth': 2.0,
 'min_child_weight': 9.559323995283971e-05,
 'min_data_in_leaf': 0.0,
 'n_estimators': 252.0,
 'num_leaves': 210.0}

In [None]:
# fit on all GA training data
# Refit with the tuned hyperparameters on the full GA training set, then
# report R^2 on the GA test set (the last expression is the cell's output).
#
# `best` stores only the *index* chosen by hp.choice for 'lambda_l2'. The
# nested 'lambda_l2_value' key is present only when the loguniform branch
# was selected. Falling back to 0 (the other option of the choice) avoids a
# KeyError when a tuning run settles on the "no L2 penalty" branch.
# lambda_l1 has a single option (0) in the GA search space, so reg_alpha
# stays 0.
lgbm_using_GA = LGBMRegressor(
    boosting='DART',
    bagging_fraction=best['bagging_fraction'],
    feature_fraction=best['feature_fraction'],
    reg_alpha=0,
    reg_lambda=best.get('lambda_l2_value', 0),
    # quantised parameters are returned as floats (e.g. 252.0), hence int()
    max_depth=int(best['max_depth']),
    min_child_weight=best['min_child_weight'],
    min_data_in_leaf=int(best['min_data_in_leaf']),
    n_estimators=int(best['n_estimators']),
    num_leaves=int(best['num_leaves']),
    learning_rate=best['learning_rate'],
)
lgbm_using_GA.fit(X_train_GA, y_train_GA)
r2_score(y_test_GA, lgbm_using_GA.predict(X_test_GA))



0.5433452719912426

In [None]:
# save model
# Pickles the fitted GA-only regressor. save_obj prints "Error: ..." instead
# of raising when the write fails, so check the cell output before relying
# on the file.
save_obj(lgbm_using_GA, '../models/lgbm_using_GA.pickle')

## Train on CA data only

In [None]:
# load data and clean
def _load_state_split(features_csv, target_csv):
    """Read one feature/target CSV pair and remove the columns not used here.

    From the feature frame this drops every column whose name contains
    'state_', 'city_' or 'county_', plus the 'Unnamed: 0' column. From the
    target frame it drops 'Unnamed: 0' only.

    The former rename of 'city_"oneals"' is intentionally gone: that column
    contains 'city_' and is therefore always dropped, so the rename could
    never match anything.

    Returns a ``(features, target)`` tuple of DataFrames.
    """
    features = pd.read_csv(features_csv)
    target = pd.read_csv(target_csv)

    # Substring test matches what DataFrame.filter(regex=...) selected for
    # these three plain-text patterns.
    location_cols = [
        name
        for name in features.columns
        if any(tag in name for tag in ('state_', 'city_', 'county_'))
    ]
    features = features.drop(columns=location_cols + ['Unnamed: 0'])
    target = target.drop(columns=['Unnamed: 0'])
    return features, target


# training data
X_train_CA, y_train_CA = _load_state_split(
    '../large_data_files/CA Data/X_train_CA.csv',
    '../large_data_files/CA Data/y_train_CA.csv',
)

# test data
X_test_CA, y_test_CA = _load_state_split(
    '../large_data_files/CA Data/X_test_CA.csv',
    '../large_data_files/CA Data/y_test_CA.csv',
)


In [None]:
# train validate split
# 70% of the CA training rows are used to fit during tuning and the rest
# to validate; random_state=42 fixes the shuffle.
(
    X_train_CA_train,
    X_validate_CA,
    y_train_CA_train,
    y_validate_CA,
) = train_test_split(
    X_train_CA,
    y_train_CA,
    train_size=0.7,
    random_state=42,
)

In [None]:
# use hyperopt to tune hyperparameters
# Search space for the CA-only model. Notes on how values are sampled:
#   * scope.int(hp.quniform(label, low, high, q)) -> integers on a grid of step q
#   * hp.loguniform(label, low, high) samples exp(uniform(low, high)), so
#     learning_rate ranges over roughly [0.05, 1.0]
#   * lambda_l1 and lambda_l2 each have a single option (0), i.e. no
#     L1/L2 penalty is searched for this model
# NOTE(review): bagging_fraction is tuned without bagging_freq; the LightGBM
# docs say bagging is only enabled when bagging_freq is non-zero -- confirm
# whether this parameter has any effect here.
space = dict(
    boosting='dart',
    n_estimators=scope.int(hp.quniform('n_estimators', 130, 170, 2)),
    num_leaves=scope.int(hp.quniform("num_leaves", 250, 275, 2)),
    max_depth=scope.int(hp.quniform("max_depth", 2, 7, 1)),
    learning_rate=hp.loguniform('learning_rate', -3, 0),
    min_data_in_leaf=scope.int(hp.quniform('min_data_in_leaf', 2, 6, 1)),
    feature_fraction=hp.uniform('feature_fraction', 0.55, 0.75),
    bagging_fraction=hp.uniform('bagging_fraction', 0.55, 0.75),
    lambda_l1=hp.choice('lambda_l1', [0]),
    lambda_l2=hp.choice('lambda_l2', [0]),
    min_child_weight=hp.loguniform('min_child_weight', -8, -4),
)

def lgbm_tuning(params):
    """Hyperopt objective for the CA-only model.

    Builds an ``LGBMRegressor`` from ``params``, fits it on the notebook-level
    ``X_train_CA_train`` / ``y_train_CA_train`` and scores it on
    ``X_validate_CA`` / ``y_validate_CA``.

    Prints the validation RMSE and returns the hyperopt result dict
    ``{'loss': rmse, 'status': STATUS_OK}``.
    """
    model = LGBMRegressor(**params)
    model.fit(X_train_CA_train, y_train_CA_train)
    validation_pred = model.predict(X_validate_CA)
    # rmse_score is the name mean_squared_error was imported under in this
    # notebook, so the square root is taken here to obtain the RMSE.
    squared_error = rmse_score(y_true=y_validate_CA, y_pred=validation_pred)
    rmse = squared_error ** 0.5
    print("RMSE:", rmse)
    return dict(loss=rmse, status=STATUS_OK)

# A new Trials object records the evaluations of this search.
trials = Trials()

# Minimise lgbm_tuning over `space` with the TPE algorithm for 250 evaluations.
best = fmin(lgbm_tuning, space, algo=tpe.suggest, max_evals=250, trials=trials)


RMSE:                                                  
1435272.1712212753                                     
RMSE:                                                                            
1432459.8467993482                                                               
RMSE:                                                                            
1523669.7827525772                                                               
RMSE:                                                                            
1548727.0982058162                                                               
RMSE:                                                                            
1474096.6967393067                                                               
RMSE:                                                                            
1506450.5410535256                                                               
RMSE:                                                               

In [None]:
# fit on all CA training data
# Collect the tuned hyperparameters first, then build, fit and score.
# Both regularisation terms are fixed at 0, matching the CA search space
# where lambda_l1 and lambda_l2 each had the single option 0.
tuned_params = dict(
    boosting='DART',
    bagging_fraction=best['bagging_fraction'],
    feature_fraction=best['feature_fraction'],
    reg_alpha=0,
    reg_lambda=0,
    # these are cast with int() because the model expects integer values
    max_depth=int(best['max_depth']),
    min_data_in_leaf=int(best['min_data_in_leaf']),
    n_estimators=int(best['n_estimators']),
    num_leaves=int(best['num_leaves']),
    min_child_weight=best['min_child_weight'],
    learning_rate=best['learning_rate'],
)
lgbm_using_CA = LGBMRegressor(**tuned_params)
lgbm_using_CA.fit(X_train_CA, y_train_CA)

# R^2 on the held-out CA test set; as the last expression it is the cell output.
ca_test_predictions = lgbm_using_CA.predict(X_test_CA)
r2_score(y_test_CA, ca_test_predictions)



0.5071921270247036

In [None]:
# save model
# Pickles the fitted CA-only regressor. save_obj prints "Error: ..." instead
# of raising when the write fails, so check the cell output before relying
# on the file.
save_obj(lgbm_using_CA, '../models/lgbm_using_CA.pickle')