# Scoring Function.
#### Project 2- Submitted by - Sai Lahari Korsipati SXK230101


## Initial Data preparation and cleanup

In [1]:

#Import the necessary modules
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
import pandas as pd
import numpy as np
import os
import pickle
import math

from h2o.grid.grid_search import H2OGridSearch

# Show (up to) 1500 columns when displaying wide frames.
pd.set_option('display.max_columns', 1500)

import warnings
# Silences every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

#Extend cell width
# `display` is imported from IPython.display; importing it from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

#Start the H2o Cluster
# Best-effort shutdown of any cluster this session is already attached to.
# Only ordinary errors are ignored (e.g. nothing to shut down); a bare
# `except:` would also swallow KeyboardInterrupt / SystemExit.
try:
    h2o.cluster().shutdown()
except Exception:
    pass
import psutil

# Initialize H2O with a fixed budget: 8 threads and max_mem_size=8.
# NOTE(review): h2o documents an integer max_mem_size as gigabytes - confirm
# these values suit the machine; they are not derived from the available
# cores/memory.
num_cores = 8
mem_size = 8
h2o.init(nthreads=num_cores, max_mem_size=mem_size)

Checking whether there is an H2O instance running at http://localhost:54321. connected.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,1 hour 39 mins
H2O_cluster_timezone:,America/Chicago
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.3
H2O_cluster_version_age:,4 months and 13 days
H2O_cluster_name:,H2O_from_python_Lahari_Reddy_x5p82z
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.511 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


In [2]:
import pickle

def load_and_print_artifacts_dict(path):
    """Load the pickled artifacts dictionary at ``path`` and print a summary.

    Prints the entries of ``artifacts_dict["trg_encoders"].mapping`` and of
    ``artifacts_dict["training_columns"]``, each as a list.

    Args:
        path: Location of the pickle file holding the artifacts dictionary.

    Returns:
        None. The summary is written to standard output.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at artifact files you produced yourself.
    # The context manager guarantees the file handle is closed after loading.
    with open(path, "rb") as artifacts_file:
        artifacts_dict = pickle.load(artifacts_file)

    print("Target encoder mapping:")
    print(list(artifacts_dict["trg_encoders"].mapping))

    print("Columns to train:")
    print(list(artifacts_dict["training_columns"]))

# Print the artifact summary only when this code runs as the main module
# (as it does in a notebook kernel), not when it is imported.
if __name__ == "__main__":
    load_and_print_artifacts_dict("../artifacts/artifacts_dict_file.pkl")

Target encoder mapping:
['City', 'State', 'Bank', 'BankState', 'RevLineCr', 'LowDoc', 'NewExist', 'UrbanRural']
Columns to train:
['City_trg', 'State_trg', 'Zip_trg', 'Bank_trg', 'BankState_trg', 'NAICS_trg', 'NoEmp_trg', 'NewExist_trg', 'CreateJob_trg', 'RetainedJob_trg', 'FranchiseCode_trg', 'UrbanRural_trg', 'RevLineCr_trg', 'LowDoc_trg', 'DisbursementGross_trg', 'BalanceGross_trg', 'GrAppv_trg', 'SBA_Appv_trg', 'Zip', 'NAICS', 'NoEmp', 'CreateJob', 'RetainedJob', 'FranchiseCode', 'DisbursementGross', 'BalanceGross', 'GrAppv', 'SBA_Appv', 'Log_DisbursementGross', 'Log_NoEmp', 'Log_GrAppv', 'Log_SBA_Appv', 'Log_BalanceGross', 'Disbursement_Bins', 'Loan_Efficiency', 'Guarantee_Ratio', 'Loan_Guarantee_Interaction', 'Disbursement_Squared']


In [3]:
def project_2_scoring(data,
                      model_path='../artifacts/final_grid_model_17',
                      artifacts_path="../artifacts/artifacts_dict_file.pkl"):
    """
    Score an input dataset with the saved H2O model.

    Input:
        data: Pandas DataFrame of raw loan records. It must contain an
            "index" column, which is returned as the record ID. A
            "MIS_Status" column, if present, is ignored. The caller's frame
            is not modified.
        model_path: path of the saved H2O model to load (defaults to the
            model shipped in the artifacts folder).
        artifacts_path: path of the pickled artifacts dictionary. The keys
            read from it are "trg_encoders", "trg_enc_columns",
            "training_columns" and "threshold".

    Output: Pandas DataFrame, one row per input record in input order, with
        columns "ID", "label" (1 when p1 > threshold, else 0),
        "probability_0" and "probability_1".

    Flow:
        - Load artifacts
        - Transform dataset
        - Score dataset
        - Return labels

    """
    import pickle

    # `drop` returns a new frame, so every later assignment touches our own
    # copy and never the caller's DataFrame.
    data = data.drop(columns=["MIS_Status"], errors="ignore")

    # ---- Loading best model ----
    best_model = h2o.load_model(model_path)

    # ---- Loading artifacts ----
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # artifact files you trust.
    with open(artifacts_path, "rb") as artifacts_dict_file:
        artifacts_dict = pickle.load(file=artifacts_dict_file)

    encoder = artifacts_dict["trg_encoders"]
    cat_enc_columns = artifacts_dict["trg_enc_columns"]
    training_columns = artifacts_dict["training_columns"]
    f1ThresholdBest = artifacts_dict["threshold"]

    # ---- Data cleaning ----
    # Anything other than 'Y'/'N' (including missing values) becomes 'N'.
    # One vectorized pass replaces the former per-row `replace` calls.
    for flag_col in ('RevLineCr', 'LowDoc'):
        is_valid = data[flag_col].isin(['Y', 'N'])
        data[flag_col] = data[flag_col].where(is_valid, 'N')

    # NewExist values other than 1/2 are replaced with None. The same
    # `replace(value, None)` call as before is used (once per distinct
    # invalid value instead of once per row) so the resulting dtype, and
    # therefore which imputation branch below handles the column, is
    # unchanged.
    invalid_new_exist = data['NewExist'][~data['NewExist'].isin([1, 2])].unique()
    for bad_value in invalid_new_exist:
        data['NewExist'] = data['NewExist'].replace(bad_value, None)

    # Numeric features: fill missing values with the mean of this batch.
    # Results are assigned back instead of using chained `inplace=True`,
    # which does not update the frame under pandas Copy-on-Write.
    for col in data.select_dtypes(include=['float64', 'int64']).columns:
        data[col] = data[col].fillna(data[col].mean())
    # For categorical features, fill missing values with the mode
    for col in data.select_dtypes(include=['object']).columns:
        modes = data[col].mode()
        # An all-missing column has no mode; leave it untouched rather than crash.
        if not modes.empty:
            data[col] = data[col].fillna(modes.iloc[0])

    if 'Zip' in data.columns:
        data['Zip'] = data['Zip'].astype(str)

    # ---- Encoding ----
    # Encoded copies get a "_trg" suffix and sit next to the raw columns.
    HO_encoded = encoder.transform(data).add_suffix('_trg')
    HO_encoded = pd.concat([HO_encoded, data], axis=1)
    for column in cat_enc_columns:
        trg_column = column + "_trg"
        HO_encoded[trg_column] = HO_encoded[trg_column].fillna(HO_encoded[trg_column].mean())

    # The raw categorical columns are not needed once encoded.
    HO_encoded = HO_encoded.drop(columns=cat_enc_columns)

    # ---- Adding engineered features ----
    # Creating log-based features for the test dataset
    HO_encoded['Log_DisbursementGross'] = np.log1p(HO_encoded['DisbursementGross'])
    HO_encoded['Log_NoEmp'] = np.log1p(HO_encoded['NoEmp'])
    HO_encoded['Log_GrAppv'] = np.log1p(HO_encoded['GrAppv'])
    HO_encoded['Log_SBA_Appv'] = np.log1p(HO_encoded['SBA_Appv'])
    HO_encoded['Log_BalanceGross'] = np.log1p(HO_encoded['BalanceGross'])

    # Binning
    HO_encoded['Disbursement_Bins'] = pd.cut(HO_encoded['DisbursementGross'],
                                             bins=[-np.inf, 50000, 150000, np.inf],
                                             labels=['Low', 'Medium', 'High'])

    # Loan Efficiency
    HO_encoded['Loan_Efficiency'] = HO_encoded['DisbursementGross'] / (
        HO_encoded['CreateJob'] + HO_encoded['RetainedJob'] + 1)  # Adding 1 to avoid division by zero

    # Guarantee Ratio (a GrAppv of 0 yields inf/NaN here; left as-is)
    HO_encoded['Guarantee_Ratio'] = HO_encoded['SBA_Appv'] / HO_encoded['GrAppv']

    # Loan Guarantee Interaction
    HO_encoded['Loan_Guarantee_Interaction'] = HO_encoded['SBA_Appv'] * HO_encoded['GrAppv']

    # Disbursement Squared
    HO_encoded['Disbursement_Squared'] = HO_encoded['DisbursementGross'] ** 2

    data_hf = h2o.H2OFrame(HO_encoded)

    # ---- Prediction ----
    y_pred_probability = best_model.predict(data_hf[training_columns]).as_data_frame()

    # Label is 1 when the positive-class probability exceeds the stored threshold.
    y_pred = (y_pred_probability['p1'] > f1ThresholdBest).astype(int)

    index_df = data_hf["index"].as_data_frame()

    # Create the dictionary of results
    d = {"ID": index_df["index"],
         "label": y_pred,
         "probability_0": y_pred_probability["p0"],
         "probability_1": y_pred_probability["p1"]}

    return pd.DataFrame(d)

In [4]:
# Smoke-test project_2_scoring on the student holdout file.

data = pd.read_csv('../SBA_loans_project_2_holdout_students_valid.csv')
# Score a copy so the frame read from disk is kept as loaded.
data_input = data.copy()
print(f"Data shape: {data_input.shape}")
result = project_2_scoring(data_input)
# Last expression of the cell: rendered as the cell output.
result

Data shape: (99808, 19)
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,ID,label,probability_0,probability_1
0,0,0,0.943796,0.056204
1,1,0,0.805917,0.194083
2,2,0,0.933051,0.066949
3,3,0,0.945429,0.054571
4,4,0,0.885328,0.114672
...,...,...,...,...
99803,99803,1,0.690781,0.309219
99804,99804,0,0.896284,0.103716
99805,99805,0,0.955100,0.044900
99806,99806,0,0.860355,0.139645


In [5]:
# Write the ID and positive-class probability of each holdout record to a CSV
# file for checking on Kaggle.
(
    result[['ID', 'probability_1']]
    .to_csv('checkonkaggle.csv', sep=',', index=False, encoding='utf-8')
)