In [1]:
import pandas as pd # import pandas for easy data manipulation using data frames

from matplotlib import pyplot as plt # plotting
import numpy as np                   # basic array and matrix handling
import seaborn as sns                # slightly better plotting 

# for model eval
from sklearn.metrics import accuracy_score, f1_score, log_loss, mean_squared_error, roc_auc_score

# to upload local files
import io
from google.colab import files  

ROUND = 3              # decimal places kept when rounding evaluation metrics; generally, insane precision is not needed 
SEED = 12345           # seed for better reproducibility

In [2]:
# install Java for h2o backend
!apt-get install default-jre
!java -version

Reading package lists... Done
Building dependency tree       
Reading state information... Done
default-jre is already the newest version (2:1.11-68ubuntu1~18.04.1).
default-jre set to manually installed.
The following packages were automatically installed and are no longer required:
  cuda-command-line-tools-10-0 cuda-command-line-tools-10-1
  cuda-command-line-tools-11-0 cuda-compiler-10-0 cuda-compiler-10-1
  cuda-compiler-11-0 cuda-cuobjdump-10-0 cuda-cuobjdump-10-1
  cuda-cuobjdump-11-0 cuda-cupti-10-0 cuda-cupti-10-1 cuda-cupti-11-0
  cuda-cupti-dev-11-0 cuda-documentation-10-0 cuda-documentation-10-1
  cuda-documentation-11-0 cuda-documentation-11-1 cuda-gdb-10-0 cuda-gdb-10-1
  cuda-gdb-11-0 cuda-gpu-library-advisor-10-0 cuda-gpu-library-advisor-10-1
  cuda-libraries-10-0 cuda-libraries-10-1 cuda-libraries-11-0
  cuda-memcheck-10-0 cuda-memcheck-10-1 cuda-memcheck-11-0 cuda-nsight-10-0
  cuda-nsight-10-1 cuda-nsight-11-0 cuda-nsight-11-1 cuda-nsight-compute-10-0
  cuda-nsight-c

In [3]:
# install h2o
!pip install h2o 

Collecting h2o
  Downloading h2o-3.36.0.2.tar.gz (176.0 MB)
[K     |████████████████████████████████| 176.0 MB 5.1 kB/s 
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.36.0.2-py2.py3-none-any.whl size=176112127 sha256=903c432b4f79ba88cb68dbdb50585440913d069d0c5f94b9ed76614febf1840e
  Stored in directory: /root/.cache/pip/wheels/0b/83/bd/343a6140119b4098103e725e1cfe9f198568d13cf103acbc7c
Successfully built h2o
Installing collected packages: h2o
Successfully installed h2o-3.36.0.2


In [4]:
# import h2o and required classes
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.grid.grid_search import H2OGridSearch

In [5]:
# special Google Colab command to upload a file from the local computer
# the result is kept in `uploaded` and indexed by file name in the cells below
uploaded = files.upload() 

Saving loan_clean(HW4).csv to loan_clean(HW4).csv


In [6]:
uploaded.keys() # show the keys of the upload result, i.e., the stored file name(s)

dict_keys(['loan_clean(HW4).csv'])

In [7]:
# convert data to Pandas DataFrame
csv_name = 'loan_clean(HW4).csv' # expected key in `uploaded`

# if the upload was stored under a different name, fall back to the single
# uploaded file rather than failing with a KeyError on the literal name
if csv_name not in uploaded:
    assert len(uploaded) == 1, 'expected exactly one uploaded file, got: %s' % list(uploaded)
    csv_name = next(iter(uploaded))

raw = pd.read_csv(io.StringIO(uploaded[csv_name].decode('utf-8')))

In [8]:
# ALWAYS set a random seed when working with randomness
# uses the notebook-wide SEED constant so the seed is defined in exactly one place
np.random.seed(SEED) # REQUIRES STUDENT INPUT

# label every row 0, 1 or 2 at random, then split on that label
raw['partition'] = np.random.choice(3, raw.shape[0])
train = raw[raw['partition'] == 0].copy(deep=True)
valid = raw[raw['partition'] == 1].copy(deep=True)
test = raw[raw['partition'] == 2].copy(deep=True)

# each message names the partition whose shape it reports
print('Training data rows: %d,training data columns: %d' % (train.shape[0], train.shape[1]))
print('Validation data rows: %d,validation data columns: %d' % (valid.shape[0], valid.shape[1]))
print('Test data rows: %d,test data columns: %d' % (test.shape[0], test.shape[1]))

# test that splits sum to original data size
assert raw.shape[0] == train.shape[0] + valid.shape[0] + test.shape[0], \
    'partition row counts do not sum to the original row count'

Training data rows: 55033,training data columns: 19
Validation data rows: 54827,training data columns: 19
Test data rows: 54127,training data columns: 19


In [9]:
# give every validation row a random fold label drawn from 0-4
# NOTE: this draws from numpy's global random state, so the labels depend on the
# seed call and on every random draw made before this line; re-running this cell
# on its own produces different folds
valid.loc[:, 'fold'] = np.random.choice(5, valid.shape[0])
valid.head()

Unnamed: 0,id,bad_loan,GRP_REP_home_ownership,GRP_addr_state,GRP_home_ownership,GRP_purpose,GRP_verification_status,_WARN_,STD_IMP_REP_annual_inc,STD_IMP_REP_delinq_2yrs,STD_IMP_REP_dti,STD_IMP_REP_emp_length,STD_IMP_REP_int_rate,STD_IMP_REP_loan_amnt,STD_IMP_REP_longest_credit_lengt,STD_IMP_REP_revol_util,STD_IMP_REP_term_length,STD_IMP_REP_total_acc,partition,fold
1,10002,1,3,10,3,8,2,,-1.045077,-0.392196,-1.986153,-1.62139,0.357273,-1.334708,-0.420596,-1.78827,1.971879,-1.796518,1,3
2,10003,0,3,7,3,7,3,,-1.501267,-0.392196,-0.955642,1.228817,0.515891,-1.347329,-0.721238,1.778298,-0.516496,-1.27183,1,0
3,10004,0,3,2,3,4,2,,-0.303921,-0.392196,0.550079,1.228817,-0.051913,-0.38813,0.030368,0.032565,-0.516496,1.089264,1,1
5,10006,0,3,2,3,8,2,,-0.582409,-0.392196,-1.40549,0.943796,1.131969,-1.271603,-1.623166,1.337981,-0.516496,-1.796518,1,1
8,10009,0,4,14,4,2,3,,0.034481,-0.392196,0.032153,-0.196287,0.214748,-0.829866,-0.270274,-1.339947,1.971879,-0.135007,1,4


In [10]:
# model inputs carrying the GRP_ prefix
grp_inputs = [
    "GRP_REP_home_ownership",
    "GRP_addr_state",
    "GRP_purpose",
    "GRP_verification_status",
]

# model inputs carrying the STD_IMP_REP_ prefix
std_inputs = [
    "STD_IMP_REP_annual_inc",
    "STD_IMP_REP_delinq_2yrs",
    "STD_IMP_REP_dti",
    "STD_IMP_REP_emp_length",
    "STD_IMP_REP_int_rate",
    "STD_IMP_REP_loan_amnt",
    "STD_IMP_REP_longest_credit_lengt",
    "STD_IMP_REP_revol_util",
    "STD_IMP_REP_term_length",
    "STD_IMP_REP_total_acc",
]

# full input list, GRP_ columns first, and the name of the target column
x_names = grp_inputs + std_inputs
y_name = 'bad_loan'

In [11]:
# start h2o
# checks for an H2O instance at the default local address and, as the output
# below shows, starts a local server when none is found
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.13" 2021-10-19; OpenJDK Runtime Environment (build 11.0.13+8-Ubuntu-0ubuntu1.18.04); OpenJDK 64-Bit Server VM (build 11.0.13+8-Ubuntu-0ubuntu1.18.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.7/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp_gq7309w
  JVM stdout: /tmp/tmp_gq7309w/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp_gq7309w/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,04 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.0.2
H2O_cluster_version_age:,12 days
H2O_cluster_name:,H2O_from_python_unknownUser_uxx3kr
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.172 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [12]:
def glm_grid(x_names, y_name, htrain, hvalid=None, seed_=SEED, alpha_opts=None):

    """ Wrapper function for penalized GLM with alpha and lambda search.

    Trains one binomial GLM per alpha value; lambda is searched
    automatically for each of them via lambda_search.

    :param x_names: List of inputs.
    :param y_name: Name of target variable.
    :param htrain: Training H2OFrame.
    :param hvalid: Validation H2OFrame, default None.
    :param seed_: Random seed for better reproducibility, default SEED.
    :param alpha_opts: Iterable of alpha values to search over, default None,
                       which uses [0.01, 0.25, 0.5, 0.99].
    :return: Trained H2OGridSearch holding every candidate GLM
             (the whole grid, not only the best model).
    """

    # None sentinel instead of a list default so the default is never shared/mutated
    if alpha_opts is None:
        alpha_opts = [0.01, 0.25, 0.5, 0.99]  # REQUIRES STUDENT INPUT

    # define search criteria
    # i.e., over alpha
    # lambda search handled by lambda_search param below
    hyper_parameters = {'alpha': list(alpha_opts)}

    # initialize grid search
    grid = H2OGridSearch(
        H2OGeneralizedLinearEstimator(family="binomial",
                                      lambda_search=True,
                                      seed=seed_),
        hyper_params=hyper_parameters)

    # execute training w/ grid search
    grid.train(y=y_name,
               x=x_names,
               training_frame=htrain,
               validation_frame=hvalid,
               seed=seed_)

    # return entire grid of models
    return grid

In [13]:
# load the training and validation Pandas DataFrames into H2OFrames
htrain = h2o.H2OFrame(train)
hvalid = h2o.H2OFrame(valid)

# ensures h2o treats y/target as categorical and not numeric in both frames
for hframe in (htrain, hvalid):
    hframe[y_name] = hframe[y_name].asfactor()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [14]:
# train the grid of candidate GLMs on the training frame, scored against the validation frame
loan_grid=glm_grid(x_names, y_name, htrain=htrain, hvalid=hvalid, seed_=SEED)

glm Grid Build progress: |███████████████████████████████████████████████████████| (done) 100%




In [15]:
# cycle through grid search results and print valid AUC
# the grid is fetched once and the loop covers every model it holds, so the cell
# keeps working when the number of alpha values (and therefore models) changes
sorted_grid = loan_grid.get_grid()
for i in range(len(sorted_grid.model_ids)):
  candidate_glm = sorted_grid[i]
  print('Model %d validation AUC: %.4f' % (i, candidate_glm.auc(valid=True)))
  print(candidate_glm.summary())

Model 0 validation AUC: 0.6779

GLM Model: summary


Unnamed: 0,Unnamed: 1,family,link,regularization,lambda_search,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
0,,binomial,logit,"Elastic Net (alpha = 0.01, lambda = 0.01066 )","nlambda = 100, lambda.max = 8.648, lambda.min = 0.01066, lambda.1s...",14,14,73,py_1_sid_b806



Model 1 validation AUC: 0.6778

GLM Model: summary


Unnamed: 0,Unnamed: 1,family,link,regularization,lambda_search,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
0,,binomial,logit,"Elastic Net (alpha = 0.25, lambda = 0.004365 )","nlambda = 100, lambda.max = 0.3459, lambda.min = 0.004365, lambda....",14,13,64,py_1_sid_b806



Model 2 validation AUC: 0.6777

GLM Model: summary


Unnamed: 0,Unnamed: 1,family,link,regularization,lambda_search,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
0,,binomial,logit,"Elastic Net (alpha = 0.99, lambda = 0.001599 )","nlambda = 100, lambda.max = 0.08735, lambda.min = 0.001599, lambda...",14,13,55,py_1_sid_b806



Model 3 validation AUC: 0.6776

GLM Model: summary


Unnamed: 0,Unnamed: 1,family,link,regularization,lambda_search,number_of_predictors_total,number_of_active_predictors,number_of_iterations,training_frame
0,,binomial,logit,"Elastic Net (alpha = 0.5, lambda = 0.003166 )","nlambda = 100, lambda.max = 0.173, lambda.min = 0.003166, lambda.1...",14,13,58,py_1_sid_b806





In [16]:
# known target and fold label for every validation row, re-indexed 0..n-1
scores_frame = valid[[y_name, 'fold']].copy(deep=True).reset_index(drop=True)

# add one column of predicted probabilities per model in the grid
# the grid is fetched once and every model is scored, whatever the grid size
sorted_grid = loan_grid.get_grid()
for i in range(len(sorted_grid.model_ids)):
  model_name = 'glm_' + str(i)
  p1_frame = sorted_grid[i].predict(hvalid)['p1'].as_data_frame()

  # predictions are matched to rows by position, so the lengths must agree
  assert len(p1_frame) == len(scores_frame), 'prediction count does not match validation rows'
  scores_frame[model_name] = p1_frame['p1'].values

glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


In [17]:
def max_acc(y, phat, res=0.01): 

    """ Utility function for finding max. accuracy at some cutoff. 
    
        Scores strictly greater than a cutoff are decided as 1, all others
        as 0; accuracy is evaluated at every cutoff from 0 up to about 1 in 
        steps of res and the largest value is returned.
    
        :param y: Known y values.
        :param phat: Model scores.
        :param res: Resolution over which to search for max. accuracy, default 0.01.
        :return: Max. accuracy for model scores.
    
    """
    
    # pair known y and score values in a temporary frame
    # (concat lines the two inputs up on their index)
    temp_df = pd.concat([y, phat], axis=1)
    known_y = temp_df.iloc[:, 0]
    scores = temp_df.iloc[:, 1]
    
    # accuracy at each cutoff, collected in a plain list
    # (DataFrame.append was removed in pandas 2.0 and was quadratic in a loop)
    acc_by_cut = [accuracy_score(known_y, np.where(scores > cut, 1, 0))
                  for cut in np.arange(0, 1 + res, res)]

    # max accuracy across all cutoffs
    return max(acc_by_cut)

In [18]:
def max_f1(y, phat, res=0.01): 
    
    """ Utility function for finding max. F1 at some cutoff. 
    
        Scores strictly greater than a cutoff are decided as 1, all others
        as 0; F1 is evaluated at every cutoff from 0 up to about 1 in steps 
        of res and the largest value is returned.
    
        :param y: Known y values.
        :param phat: Model scores.
        :param res: Resolution over which to search for max. F1, default 0.01.
        :return: Max. F1 for model scores.
    
    """
    
    # pair known y and score values in a temporary frame
    # (concat lines the two inputs up on their index)
    temp_df = pd.concat([y, phat], axis=1)
    known_y = temp_df.iloc[:, 0]
    scores = temp_df.iloc[:, 1]
    
    # f1 at each cutoff, collected in a plain list
    # (DataFrame.append was removed in pandas 2.0 and was quadratic in a loop)
    f1_by_cut = [f1_score(known_y, np.where(scores > cut, 1, 0))
                 for cut in np.arange(0, 1 + res, res)]
        
    # max f1 across all cutoffs
    return max(f1_by_cut)

In [19]:
metric_list = ['acc', 'auc', 'f1', 'logloss', 'mse'] # metric to use for evaluation

# function that scores one model's fold predictions for each metric name
metric_funcs = {'acc': max_acc,
                'auc': roc_auc_score,
                'f1': max_f1,
                'logloss': log_loss,
                'mse': mean_squared_error}

# metrics where a smaller value is better; every other metric ranks larger values first
lower_is_better = ['logloss', 'mse']

# model score columns (everything after y and fold), in the sorted order used
# for both the metric columns and the rank columns so the two can never disagree
model_names = sorted(scores_frame.columns[2:])

# collect one record per (fold, metric) and build the frame once at the end
# (DataFrame.append was removed in pandas 2.0 and was quadratic in a loop)
eval_rows = []
for fold in sorted(scores_frame['fold'].unique()): # loop through folds 

    # cache the fold's row mask and known y values
    in_fold = scores_frame['fold'] == fold
    fold_y = scores_frame.loc[in_fold, y_name]

    for metric_name in metric_list: # loop through metrics

        # init row dict to hold each rows values
        row_dict = {'fold': fold,
                    'metric': metric_name}

        for col_name in model_names:

            # calculate evaluation metric for fold
            # with reasonable precision 
            fold_scores = scores_frame.loc[in_fold, col_name]
            row_dict[col_name] = np.round(metric_funcs[metric_name](fold_y, fold_scores), ROUND)

        eval_rows.append(row_dict)

# frame holding the metric values, columns in necessary order
eval_frame = pd.DataFrame(eval_rows, columns=['fold', 'metric'] + model_names)

# determine score ranks within each row; rank 1 is the best model
# rows of higher-is-better metrics are negated so one ascending rank serves both cases
rank_names = [name + '_rank' for name in model_names]
direction = np.where(eval_frame['metric'].isin(lower_is_better), 1, -1)
rank_frame = eval_frame[model_names].mul(direction, axis=0).rank(axis=1)
rank_frame.columns = rank_names

# merge ranks onto eval_frame
eval_frame = pd.concat([eval_frame, rank_frame], axis=1)

# house keeping
del rank_frame
        
eval_frame

Unnamed: 0,fold,metric,glm_0,glm_1,glm_2,glm_3,glm_0_rank,glm_1_rank,glm_2_rank,glm_3_rank
0,0.0,acc,0.805,0.805,0.805,0.805,2.5,2.5,2.5,2.5
1,0.0,auc,0.683,0.683,0.683,0.683,2.5,2.5,2.5,2.5
2,0.0,f1,0.409,0.409,0.407,0.407,1.5,1.5,3.5,3.5
3,0.0,logloss,0.461,0.461,0.461,0.461,2.5,2.5,2.5,2.5
4,0.0,mse,0.147,0.147,0.147,0.147,2.5,2.5,2.5,2.5
5,1.0,acc,0.806,0.806,0.806,0.806,2.5,2.5,2.5,2.5
6,1.0,auc,0.676,0.675,0.675,0.675,1.0,3.0,3.0,3.0
7,1.0,f1,0.399,0.398,0.397,0.397,1.0,2.0,3.5,3.5
8,1.0,logloss,0.463,0.463,0.463,0.463,2.5,2.5,2.5,2.5
9,1.0,mse,0.147,0.147,0.147,0.147,2.5,2.5,2.5,2.5


In [20]:
# mean rank of each model over all fold/metric rows, best (lowest) first
is_rank_col = eval_frame.columns.str.endswith('rank')
eval_frame.loc[:, is_rank_col].mean().sort_values()

glm_0_rank    2.26
glm_1_rank    2.46
glm_3_rank    2.62
glm_2_rank    2.66
dtype: float64

In [21]:
# pick the model to carry forward by its position in the grid returned by get_grid()
# (the same positions used for the glm_0 ... glm_3 columns above)
best_glm_index =0 # REQUIRES STUDENT INPUT: SELECT MODEL 0, 1, 2 or 3
best_glm = loan_grid.get_grid()[best_glm_index]

In [22]:
htest = h2o.H2OFrame(test) # REQUIRES STUDENT INPUT: SELECT CORRECT PARTITION

# treat the target as categorical, consistent with the training and validation frames
htest[y_name] = htest[y_name].asfactor()

# score the selected model once on the held-out test partition
auc = best_glm.model_performance(htest).auc()
print('Best GLM test AUC: %.4f' % auc)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Best GLM test AUC: 0.6780


In [25]:
# predict on new data -- REQUIRES STUDENT INPUT
# one value per model input, keyed by input name
new_loan = {
  "GRP_REP_home_ownership": 1,
  "GRP_addr_state": 20,
  "GRP_purpose": 1,
  "GRP_verification_status": 1,
  "STD_IMP_REP_annual_inc": 4,
  "STD_IMP_REP_delinq_2yrs": 1,
  "STD_IMP_REP_dti": 1,
  "STD_IMP_REP_emp_length": 2,
  "STD_IMP_REP_int_rate": 1,
  "STD_IMP_REP_loan_amnt": 1,
  "STD_IMP_REP_longest_credit_lengt": 1,
  "STD_IMP_REP_revol_util": 1,
  "STD_IMP_REP_term_length": 1,
  "STD_IMP_REP_total_acc": 0,
}

# load the single row into h2o and score it with the selected model
new_row = h2o.H2OFrame(new_loan)
best_glm.predict(new_row)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
glm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


predict,p0,p1
0,0.848162,0.151838




In [26]:
# shutdown h2o
# every cell above that uses h2o needs h2o.init() to be run again after this
h2o.cluster().shutdown()

H2O session _sid_b806 closed.
