<a href="https://colab.research.google.com/github/jphall663/GWU_ML/blob/main/notebook/lecture_3/Assignment_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# License 
***
Copyright (C) 2017 -- 2022 J. Patrick Hall, jphall@gwu.edu

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

## Installs and Imports

1. Standard Python imports

In [None]:
import pandas as pd # import pandas for easy data manipulation using data frames

from matplotlib import pyplot as plt # plotting
import numpy as np                   # basic array and matrix handling
import seaborn as sns                # slightly better plotting

# for model eval
from sklearn.metrics import accuracy_score, f1_score, log_loss, mean_squared_error, roc_auc_score

# to upload local files
import io
from google.colab import files

ROUND = 3              # decimal places kept when rounding evaluation metrics; insane precision is not needed
SEED = 12345           # seed for better reproducibility

2. Install Java

In [None]:
# install Java for h2o backend
!apt-get install default-jre
!java -version

3. Install H2O

In [None]:
# install h2o
!pip install h2o 

4. Import h2o package and required classes

In [None]:
# import h2o and required classes
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.grid.grid_search import H2OGridSearch

## Load example data

5. Upload class example data

In [None]:
# special Google Colab command to upload a file from computer
uploaded = files.upload() # REQUIRES STUDENT INPUT

In [None]:
# 6
uploaded.keys() # what is stored in that Python object?

7. Convert to Pandas DataFrame

In [None]:
# convert data to Pandas DataFrame
raw = pd.read_csv(io.StringIO(uploaded['loan_clean.csv'].decode('utf-8'))) # name in quotes here must match name in quotes directly above 

## Split raw data into three partitions for improved model selection

8. Add partition marker to raw data and split into train, valid, and test data

In [None]:
# ALWAYS set a random seed when working with randomness
np.random.seed() # REQUIRES STUDENT INPUT

# label every row with a random partition id: 0 = train, 1 = valid, 2 = test
raw['partition'] = np.random.choice(3, raw.shape[0])
train = raw[raw['partition'] == 0].copy(deep=True)
valid = raw[raw['partition'] == 1].copy(deep=True)
test = raw[raw['partition'] == 2].copy(deep=True)

# report the shape of each partition; each message names the partition it describes
print('Training data rows: %d,training data columns: %d' % (train.shape[0], train.shape[1]))
print('Validation data rows: %d,validation data columns: %d' % (valid.shape[0], valid.shape[1]))
print('Test data rows: %d,test data columns: %d' % (test.shape[0], test.shape[1]))

# test that splits sum to original data size
assert raw.shape[0] == train.shape[0] + valid.shape[0] + test.shape[0], \
    'partition sizes do not sum to the number of rows in raw'

9. Add cross-validation fold marker to validation set

In [None]:
# tag each validation row with a random fold id in 0-4;
# the folds are used later to evaluate models fold-by-fold on the validation data
valid.loc[:, 'fold'] = np.random.choice(5, valid.shape[0])
valid.head()

## Train penalized GLM model to predict loan default with validation data

10. Assign global constants

In [None]:
x_names = [] # REQUIRES STUDENT INPUT
y_name = ''# REQUIRES STUDENT INPUT

11. Start h2o server

In [None]:
# start h2o
h2o.init()

12. Function for penalized GLM training that selects good alpha and lambda

In [None]:
def glm_grid(x_names, y_name, htrain, hvalid=None, seed_=SEED):

    """ Run a grid search over alpha for a penalized binomial GLM.

    Lambda is not part of the grid; it is handled by the estimator's
    lambda_search option.

    :param x_names: List of inputs.
    :param y_name: Name of target variable.
    :param htrain: Training H2OFrame.
    :param hvalid: Validation H2OFrame, default None.
    :param seed_: Random seed for better reproducibility, default SEED.
    :return: Trained H2OGridSearch holding one GLM per alpha value.
    """

    # candidate alpha values to search over
    alpha_opts = [0.01, 0.25, 0.5, 0.99]  # REQUIRES STUDENT INPUT

    # base estimator shared by every member of the grid
    base_glm = H2OGeneralizedLinearEstimator(family="binomial",
                                             lambda_search=True,
                                             seed=seed_)

    # the grid varies alpha only
    grid = H2OGridSearch(base_glm, hyper_params={'alpha': alpha_opts})

    # fit every grid member on the training frame
    grid.train(x=x_names,
               y=y_name,
               training_frame=htrain,
               validation_frame=hvalid,
               seed=seed_)

    # hand back the whole grid so callers can compare its models
    return grid


13. Convert from Pandas DataFrames to H2OFrames

In [None]:
# training data
htrain = h2o.H2OFrame(train) # load Pandas DataFrame in H2OFrame
htrain[y_name] = htrain[y_name].asfactor() # ensures h2o treats y/target as categorical and not numeric

# validation data -- same conversion as the training data
hvalid = h2o.H2OFrame(valid) 
hvalid[y_name] = hvalid[y_name].asfactor() # target must be categorical here too

14. Train model using `glm_grid` function with validation data

In [None]:
loan_grid =  # REQUIRES STUDENT INPUT

15. View validation AUC for models in grid search

In [None]:
# cycle through grid search results and print valid AUC
# fetch the grid once and loop over however many models it holds,
# instead of assuming exactly four alpha values were searched
sorted_grid = loan_grid.get_grid()
for i in range(len(sorted_grid.model_ids)):
    candidate_glm = sorted_grid[i]
    print('Model %d validation AUC: %.4f' % (i, candidate_glm.auc(valid=True)))
    print(candidate_glm.summary())

16. Score grid search models on validation set

In [None]:
# start from the known target and fold marker of each validation row
scores_frame = pd.DataFrame(valid[[y_name, 'fold']].copy(deep=True))

# drop the original row labels in favor of a 0..n-1 index
# NOTE(review): presumably so the frames returned by as_data_frame() align row-for-row -- confirm
scores_frame = scores_frame.reset_index(drop=True)

# fetch the grid once and score with every model it holds,
# instead of assuming exactly four alpha values were searched
sorted_grid = loan_grid.get_grid()
for i in range(len(sorted_grid.model_ids)):
    model_name = 'glm_' + str(i)
    # keep the 'p1' column of the predictions for model i
    scores_frame[model_name] = sorted_grid[i].predict(hvalid)['p1'].as_data_frame()

17. Utility function for max. accuracy

In [None]:
def max_acc(y, phat, res=0.01): 

    """ Utility function for finding max. accuracy at some cutoff. 
    
        Scores strictly greater than a cutoff are treated as a decision of 1,
        all others as 0. Cutoffs run from 0 in steps of res.

        :param y: Known y values.
        :param phat: Model scores.
        :param res: Resolution over which to search for max. accuracy, default 0.01.
        :return: Max. accuracy for model scores.
    
    """
    
    # copy known y and score values into a temporary frame (aligns them on index)
    temp_df = pd.concat([y, phat], axis=1)
    y_true = temp_df.iloc[:, 0]
    scores = temp_df.iloc[:, 1]
    
    # find accuracy at different cutoffs
    # collected in a plain list because DataFrame.append was removed in pandas 2.0
    acc_list = [accuracy_score(y_true, np.where(scores > cut, 1, 0))
                for cut in np.arange(0, 1 + res, res)]

    # max accuracy across all cutoffs
    return np.max(acc_list)

18. Utility function for max. F1

In [None]:
def max_f1(y, phat, res=0.01): 
    
    """ Utility function for finding max. F1 at some cutoff. 
    
        Scores strictly greater than a cutoff are treated as a decision of 1,
        all others as 0. Cutoffs run from 0 in steps of res.

        :param y: Known y values.
        :param phat: Model scores.
        :param res: Resolution over which to search for max. F1, default 0.01.
        :return: Max. F1 for model scores.
    
    """
    
    # copy known y and score values into a temporary frame (aligns them on index)
    temp_df = pd.concat([y, phat], axis=1)
    y_true = temp_df.iloc[:, 0]
    scores = temp_df.iloc[:, 1]
    
    # find f1 at different cutoffs
    # collected in a plain list because DataFrame.append was removed in pandas 2.0
    f1_list = [f1_score(y_true, np.where(scores > cut, 1, 0))
               for cut in np.arange(0, 1 + res, res)]
        
    # max f1 across all cutoffs
    return np.max(f1_list)

19. Apply Caruana et al. 2004 cross-validated ranking model selection 

In [None]:
metric_list = ['acc', 'auc', 'f1', 'logloss', 'mse'] # metric to use for evaluation

# metric name -> function taking (known y, model scores)
metric_funcs = {'acc': max_acc,
                'auc': roc_auc_score,
                'f1': max_f1,
                'logloss': log_loss,
                'mse': mean_squared_error}

# model score columns follow the target and fold columns; sorted for a stable column order
model_names = sorted(scores_frame.columns[2:])

# collect one dict per (fold, metric) pair and build the frame once at the end
# (DataFrame.append was removed in pandas 2.0)
eval_rows = []
for fold in sorted(scores_frame['fold'].unique()): # loop through folds

    # cache fold membership and known y values for fold
    in_fold = scores_frame['fold'] == fold
    fold_y = scores_frame.loc[in_fold, y_name]

    for metric_name in metric_list: # loop through metrics

        # init row dict to hold each row's values
        row_dict = {'fold': fold,
                    'metric': metric_name}

        for col_name in model_names:

            # cache fold scores
            fold_scores = scores_frame.loc[in_fold, col_name]

            # calculate evaluation metric for fold
            # with reasonable precision
            row_dict[col_name] = np.round(metric_funcs[metric_name](fold_y, fold_scores), ROUND)

        eval_rows.append(row_dict)

# fold and metric first, then one column per model
eval_frame = pd.DataFrame(eval_rows, columns=['fold', 'metric'] + model_names)

# rank column names, derived from the same ordering as the score columns
# so ranks can never be attached to the wrong model
rank_names = [name + '_rank' for name in model_names]

# determine score ranks row-by-row
rank_rows = []
for metric_name, (_, row_scores) in zip(eval_frame['metric'], eval_frame[model_names].iterrows()):

    # for logloss and mse the smallest value gets rank 1;
    # for all other metrics the largest value gets rank 1
    lower_is_better = metric_name in ['logloss', 'mse']
    rank_rows.append(row_scores.rank(ascending=lower_is_better).values)

rank_frame = pd.DataFrame(rank_rows, columns=rank_names)

# merge ranks onto eval_frame
eval_frame = pd.concat([eval_frame, rank_frame], axis=1)

# house keeping
del rank_frame, rank_rows, eval_rows

eval_frame

20. Display simple ranked score list

In [None]:
# average model ranks across folds and metrics
# lower is better
eval_frame[[name for name in eval_frame.columns if name.endswith('rank')]].mean().sort_values()

21. Select best model

In [None]:
best_glm_index =  # REQUIRES STUDENT INPUT: SELECT MODEL 0, 1, 2 or 3
best_glm = loan_grid.get_grid()[best_glm_index]

22. Determine honest estimate of test AUC

In [None]:
htest = h2o.H2OFrame() # REQUIRES STUDENT INPUT: SELECT CORRECT PARTITION
auc = best_glm.model_performance(htest).auc()
print('Best GLM test AUC: %.4f' % auc)

23. Generate a prediction for a new customer

In [None]:
# predict on new data -- REQUIRES STUDENT INPUT
new_row = h2o.H2OFrame({
  "GRP_REP_home_ownership": ,
  "GRP_addr_state": ,
  "GRP_purpose": ,
  "GRP_verification_status": ,
  "STD_IMP_REP_annual_inc": ,
  "STD_IMP_REP_delinq_2yrs": ,
  "STD_IMP_REP_dti": ,
  "STD_IMP_REP_emp_length": ,
  "STD_IMP_REP_int_rate": ,
  "STD_IMP_REP_loan_amnt": ,
  "STD_IMP_REP_longest_credit_lengt": ,
  "STD_IMP_REP_revol_util": ,
  "STD_IMP_REP_term_length": ,
  "STD_IMP_REP_total_acc": 
}) 

# generate prediction -- REQUIRES STUDENT INPUT


24. Shutdown h2o

In [None]:
# shutdown h2o
h2o.cluster().shutdown()