<a href="https://colab.research.google.com/github/jphall663/GWU_ML/blob/main/notebook/lecture_12/Assignment_12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# License 
***
Copyright (C) 2017 -- 2022 J. Patrick Hall, jphall@gwu.edu

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

## Installs and Imports

1. Standard Python imports

In [None]:
import pandas as pd # import pandas for easy data manipulation using data frames
import numpy as np  # basic array and matrix handling

# to upload local files
import io  # io.StringIO wraps the decoded upload so pandas can read it as a file
from google.colab import files  # provides files.upload(), called in the upload cell below

SEED = 12345           # seed for better reproducibility; default value of glm_grid's seed_ argument

2. Install Java

In [None]:
# install Java for h2o backend
!apt-get install default-jre
!java -version

3. Install H2O

In [None]:
# install h2o
!pip install h2o 

4. Import h2o package and required classes

In [None]:
# import h2o and required classes
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.grid.grid_search import H2OGridSearch

## Load example data

5. Upload class example data

In [None]:
# special Google Colab command to upload a file from the local computer
# the result is kept in `uploaded` and indexed by file name in the cells below
uploaded = files.upload() # REQUIRES STUDENT INPUT

In [None]:
# 6
# list the keys of the upload result; the CSV-loading cell below indexes `uploaded`
# by one of these keys, so the name used there must appear in this output
uploaded.keys() # what is stored in that Python object?

7. Convert to Pandas DataFrame

In [None]:
# read the uploaded CSV into a Pandas DataFrame:
# decode the uploaded bytes as UTF-8, wrap the text in a file-like buffer, parse it
raw = pd.read_csv(
    io.StringIO(
        # name in quotes here must match a name shown by uploaded.keys() directly above
        uploaded['loan_clean.csv'].decode('utf-8')
    )
)

## Split training data into three partitions for improved model selection

8. Add partition marker to raw data and split into train, valid, and test data

In [None]:
# ALWAYS set a random seed when working with randomness
# seeding NumPy's global generator makes the partition assignment below repeatable
np.random.seed(SEED)

# tag every row with a random partition id (0, 1 or 2); this adds a column to `raw` itself
raw['partition'] = np.random.choice(3, raw.shape[0])

# independent copies, so later edits to one partition never touch `raw`
train = raw[raw['partition'] == 0].copy(deep=True)
valid = raw[raw['partition'] == 1].copy(deep=True)
test = raw[raw['partition'] == 2].copy(deep=True)

print('Training data rows: %d,training data columns: %d' % (train.shape[0], train.shape[1]))
print('Validation data rows: %d,validation data columns: %d' % (valid.shape[0], valid.shape[1]))
print('Test data rows: %d,test data columns: %d' % (test.shape[0], test.shape[1]))

# test that splits sum to original data size
assert raw.shape[0] == train.shape[0] + valid.shape[0] + test.shape[0], \
    'train/valid/test partitions do not add up to the original row count'

9. Add cross-validation marker to validation set

In [None]:
# give each validation row a random fold id drawn from 0..4
valid.loc[:, 'fold'] = np.random.choice(5, size=len(valid))
valid.head()  # preview the first rows, including the new 'fold' column

## Train penalized GLM model to predict loan default with validation data

10. Assign global constants

In [None]:
# model inputs, one column name per line -- REQUIRES STUDENT INPUT
x_names = [
    "GRP_REP_home_ownership",
    "GRP_addr_state",
    "GRP_purpose",
    "GRP_verification_status",
    "STD_IMP_REP_annual_inc",
    "STD_IMP_REP_delinq_2yrs",
    "STD_IMP_REP_dti",
    "STD_IMP_REP_emp_length",
    "STD_IMP_REP_int_rate",
    "STD_IMP_REP_loan_amnt",
    "STD_IMP_REP_longest_credit_lengt",
    "STD_IMP_REP_revol_util",
    "STD_IMP_REP_term_length",
    "STD_IMP_REP_total_acc",
]

# name of the target column -- REQUIRES STUDENT INPUT
y_name = 'bad_loan'

11. Start h2o server

In [None]:
# start h2o
# must run before the cells below that build H2OFrames and train the grid search
h2o.init()

12. Function for penalized GLM training that selects good alpha and lambda

In [None]:
def glm_grid(x_names, y_name, htrain, hvalid=None, seed_=SEED, alpha_opts=None):

    """ Wrapper function for penalized GLM with alpha and lambda search.

    :param x_names: List of inputs.
    :param y_name: Name of target variable.
    :param htrain: Training H2OFrame.
    :param hvalid: Validation H2OFrame, default None.
    :param seed_: Random seed for better reproducibility, default SEED.
    :param alpha_opts: Optional iterable of alpha values to search over;
                       default None searches [0.01, 0.25, 0.5, 0.99].
    :return: Trained H2OGridSearch holding the candidate GLMs
             (the entire grid, not a single best model).
    :raises ValueError: If alpha_opts is supplied but contains no values.
    """

    # fall back to the alpha values this notebook has always searched
    if alpha_opts is None:
        alpha_opts = [0.01, 0.25, 0.5, 0.99]

    # materialize once: accepts any iterable and keeps the caller's object untouched
    alpha_opts = list(alpha_opts)
    if not alpha_opts:
        raise ValueError('alpha_opts must contain at least one alpha value')

    # define search criteria
    # i.e., over alpha
    # lambda search handled by lambda_search param below
    hyper_parameters = {'alpha': alpha_opts}

    # initialize grid search
    grid = H2OGridSearch(
        H2OGeneralizedLinearEstimator(family="binomial",
                                      lambda_search=True,
                                      seed=seed_),
        hyper_params=hyper_parameters)

    # execute training w/ grid search
    # standardize=False: presumably because the STD_-prefixed inputs are already
    # standardized -- NOTE(review): confirm against the data preparation
    grid.train(y=y_name,
               x=x_names,
               training_frame=htrain,
               validation_frame=hvalid,
               standardize=False,
               seed=seed_)

    # return entire grid of models
    return grid


13. Convert from Pandas DataFrames to H2OFrames

In [None]:
# load each Pandas partition into an H2OFrame
htrain = h2o.H2OFrame(train)  # training data
hvalid = h2o.H2OFrame(valid)  # validation data

# cast the target to a factor in both frames:
# ensures h2o treats y/target as categorical and not numeric
htrain[y_name] = htrain[y_name].asfactor()
hvalid[y_name] = hvalid[y_name].asfactor()

14. Train model using `glm_grid` function with validation data

In [None]:
# run the alpha/lambda search on the training frame, scored against the validation frame
loan_grid = glm_grid(x_names=x_names,
                     y_name=y_name,
                     htrain=htrain,
                     hvalid=hvalid)


15. View validation AUC for models in grid search

In [None]:
# cycle through grid search results and print valid AUC
# iterate over the models the grid actually holds instead of a hard-coded count,
# so the loop stays correct if the number of alpha values changes
for i, candidate_glm in enumerate(loan_grid.get_grid().models):
  print('Model %d validation AUC: %.4f' % (i, candidate_glm.auc(valid=True)))
  print(candidate_glm.summary())

16. Extract best model from grid search results

In [None]:
# keep the first model in the grid as the "best" one
# NOTE(review): get_grid() is called without a sort metric, so "best" relies on
# h2o's default grid ordering -- confirm that matches the intended selection metric
best_glm = loan_grid.get_grid()[0]

17. Display coefficients

In [None]:
import operator

# list the selected model's coefficients in ascending order of value
print('GLM Coefficients:')
for c_name, c_val in sorted(best_glm.coef().items(), key=operator.itemgetter(1)):
    # left-justify "name:" in a 25-character column so the values line up
    print('%-25s %s' % (c_name + ':', c_val))

18. Generate Java POJO representation of the best model

In [None]:
# export the selected GLM as a Java POJO
# NOTE(review): called with no arguments, so the output destination is h2o's default -- confirm
best_glm.download_pojo()

19. Shutdown h2o

In [None]:
# shutdown h2o
# run this last: presumably h2o.init() must be called again before any further h2o work
h2o.cluster().shutdown()