In [3]:
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder
from sklearn.preprocessing import StandardScaler
from copy import deepcopy
import numpy as np
import pandas as pd
import pickle
import os

import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including pandas/scikit-learn deprecation and chained-assignment warnings.
warnings.filterwarnings('ignore')

# Directory the kernel was started in; all project paths are built from it.
cwd = os.getcwd()

# Project folder, expected to sit directly under the working directory.
# Later cells join file names for the artifact pickle and the input CSV onto it.
dirname = os.path.join(cwd, 'Project 1')

In [24]:
def _make_dense_one_hot_encoder():
    """Return a dense-output ``OneHotEncoder`` on old and new scikit-learn.

    scikit-learn 1.2 renamed the ``sparse`` keyword to ``sparse_output`` and
    1.4 removed the old spelling, so the new keyword is tried first and the
    legacy one is used only when the constructor rejects it.
    """
    try:
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


def project_1_scoring(data):
    """Score loan records with the pickled logistic-regression model.

    Steps, in order:

    1. Load ``model``, ``threshold`` and ``feature_names`` from
       ``<dirname>/artifacts_dict_file.pkl``.
    2. Clean a private copy of ``data``: drop the ``index`` column if present,
       fill missing categoricals with the column mode and missing numerics
       with 0, normalise ``RevLineCr`` / ``LowDoc`` to ``'N'`` / ``'Y'``, keep
       only rows with valid ``RevLineCr``, ``LowDoc`` and ``NewExist`` values,
       and drop any row that still has a missing value.
    3. Add ten engineered features; infinities and NaNs produced by the
       ratios are replaced with 999.
    4. Take the 60% "train" partition (two seeded splits), then one-hot encode
       categoricals with fewer than 10 distinct values and WOE-encode the
       rest, fitting every encoder on that partition.
    5. Standard-scale the seven original numeric columns. Each fitted scaler
       is also pickled to ``<col>_sc_pre_processing.p`` (a relative path, so
       it lands in the process working directory).
    6. Run ``predict_proba`` on the columns named in ``feature_names`` and
       label a row 1 when its class-0 probability is below ``threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw loan records. Must contain the categorical and numerical columns
        listed in the body plus ``MIS_Status`` (used to fit the WOE encoders).
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``label``, ``probability_0`` and ``probability_1`` with a
        fresh ``RangeIndex``; one row per record of the train partition.

    Raises
    ------
    KeyError
        If a required column or artifact entry is missing, or if one of the
        expected reference dummy columns was not produced by the encoding.

    Notes
    -----
    NOTE(review): only the 60% train partition is scored, in shuffled order,
    and the returned index no longer identifies the input rows. The encoders
    and scalers are re-fit on the data being scored (the WOE encoders on its
    ``MIS_Status`` column) instead of reusing fitted preprocessors, and the
    artifact's ``"woe_encoder"`` entry is never applied. This function cannot
    score unlabeled data until that is redesigned.
    """
    filepath = os.path.join(dirname, 'artifacts_dict_file.pkl')
    # SECURITY: pickle.load can execute arbitrary code; only load artifact
    # files produced by a trusted training run.
    with open(filepath, "rb") as artifacts_dict_file:
        artifacts_dict = pickle.load(file=artifacts_dict_file)
    logreg = artifacts_dict["model"]
    threshold = artifacts_dict["threshold"]
    f_names = artifacts_dict["feature_names"]

    # Work on an independent copy so the caller's frame survives repeated
    # calls; tolerate inputs that do not carry an "index" column.
    data = data.drop(columns="index", errors="ignore").copy()

    categorical_cols = ['City', 'State', 'Bank', 'BankState', 'NewExist', 'UrbanRural', 'RevLineCr', 'LowDoc', 'Zip', 'NAICS', 'FranchiseCode']
    numerical_cols = ['NoEmp', 'CreateJob', 'RetainedJob', 'DisbursementGross', 'BalanceGross', 'GrAppv', 'SBA_Appv']

    # Replace NA/null values: mode for categoricals, 0 for numerics.
    for column in categorical_cols:
        data[column] = data[column].fillna(data[column].mode()[0])
    for column in numerical_cols:
        data[column] = data[column].fillna(0)

    # Invalid data handling: values outside the mappings become NaN and are
    # removed by the validity mask below.
    data['RevLineCr'] = data['RevLineCr'].map({'N':'N','Y':'Y','0':'N','1':'Y','T':'Y'})
    data['LowDoc'] = data['LowDoc'].map({'N':'N','Y':'Y','0':'N','1':'Y'})
    valid_rows = (
        data['RevLineCr'].isin(['N','Y'])
        & data['LowDoc'].isin(['N','Y'])
        & data['NewExist'].isin([1, 2])
    )
    # .copy() makes the later column assignments target a real frame rather
    # than a slice of the unfiltered data.
    data = data[valid_rows].copy()
    data = data.fillna(value=np.nan).dropna()

    # Adding engineered features

    # 1. Ratio of retained jobs to created jobs (+1 avoids division by zero)
    data['Retained_Created_Job_Ratio'] = data['RetainedJob'] / (data['CreateJob'] + 1)

    # 2. SBA's guaranteed portion of the approved loan
    data['Guaren_SBA_Appv'] = data['SBA_Appv'] / data['GrAppv']

    # 3. Ratio of the amount disbursed to the gross amount approved
    data['Loan_Gross_Ratio'] = data['DisbursementGross'] / data['GrAppv']

    # 4. Ratio of the SBA loan amount to the gross disbursement
    data['SBA_Loan_Gross_Ratio'] = data['SBA_Appv'] / data['DisbursementGross']

    # 5. Ratio of 'NoEmp' to 'SBA_Appv'
    data['EmployeesToLoanRatio'] = data['NoEmp'] / data['SBA_Appv']

    # 6. Log transformation of the gross disbursement
    data['LogDisbursementGross'] = np.log(data['DisbursementGross'] + 1)

    # 7. Log transformation of the SBA approval amount
    data['LogSBAApprovalAmount'] = np.log(data['SBA_Appv'] + 1)

    # 8. Flag: borrower state equals bank state
    data['BankOriginatedLoan'] = np.where(data['State'] == data['BankState'], 1, 0)

    # 9. Disbursement amount per employee (+1 avoids division by zero)
    data['LoanToIncomeRatio'] = data['DisbursementGross'] / (data['NoEmp'] + 1)

    # 10. Total jobs: created plus retained
    data['TotalJobs'] = data['CreateJob'] + data['RetainedJob']

    # The unguarded ratios above can yield inf/NaN; both become the 999 sentinel.
    data = data.replace([np.inf, -np.inf], np.nan).fillna(999)

    # Same two seeded splits as before (80/20, then 75/25 of the remainder), so
    # the identical 60% train partition is selected. The held-out partitions
    # were never scored, so they are no longer encoded or scaled.
    train_val, _ = train_test_split(data, test_size=0.2, random_state=182)
    train, _ = train_test_split(train_val, test_size=0.25, random_state=182)
    train = train.copy()

    for col in categorical_cols:
        if train[col].nunique() < 10:
            print("One-hot encoding of ", col)
            enc = _make_dense_one_hot_encoder()
            enc.fit(train[[col]])
            ohe_columns = [col + "_" + str(x) for x in enc.categories_[0]]
            encoded = pd.DataFrame(
                enc.transform(train[[col]]), columns=ohe_columns, index=train.index
            )
            # Indexes are identical, so a plain column-wise concat lines up row by row.
            train = pd.concat([train.drop(columns=[col]), encoded], axis=1)
        else:
            print("WOE encoding of ", col)
            woe_encoder = ce.WOEEncoder(cols=[col])
            woe_encoder.fit(train[col], train['MIS_Status'])
            train[col + '_woe'] = woe_encoder.transform(train[col])
            train = train.drop(columns=[col])

    # Drop one dummy per binary categorical. A missing column raises KeyError,
    # as the original pop() calls did.
    train = train.drop(columns=["RevLineCr_Y", "LowDoc_Y", "UrbanRural_1", "NewExist_1.0"])

    # Scaling only the original numerical columns
    for col in numerical_cols:
        print("Standard scale of ", col)
        scaler = StandardScaler()
        scaler.fit(train[[col]])
        # Persist the fitted scaler; the context manager closes the file handle.
        with open(col + '_sc_' + 'pre_processing.p', "wb") as scaler_file:
            pickle.dump(scaler, scaler_file)
        train[col + "_sc"] = scaler.transform(train[[col]])
        train = train.drop(columns=[col])

    # Select and order columns exactly as the model saw them during fit.
    X_train = train[f_names]

    y_pred_prob = logreg.predict_proba(X_train)
    # Label 1 when the class-0 probability falls below the stored threshold.
    y_pred = (y_pred_prob[:, 0] < threshold).astype(np.int16)

    return pd.DataFrame({
        "label": y_pred,
        "probability_0": y_pred_prob[:, 0],
        "probability_1": y_pred_prob[:, 1],
    })

In [25]:
# Read the input CSV from the project folder (`dirname`); the data file is
# expected to sit in the same folder as the artifact pickle.
datafilepath = os.path.join(dirname, 'SBA_loans_project_1.csv')
data = pd.read_csv(datafilepath)
# Run the scoring function on the loaded frame; the result is a DataFrame with
# the columns label, probability_0 and probability_1.
answer=project_1_scoring(data)
# Print the scored table (pandas truncates long frames in the text output).
print(answer)

WOE encoding of  City
WOE encoding of  State
WOE encoding of  Bank
WOE encoding of  BankState
One-hot encoding of  NewExist
One-hot encoding of  UrbanRural
One-hot encoding of  RevLineCr
One-hot encoding of  LowDoc
WOE encoding of  Zip
WOE encoding of  NAICS
WOE encoding of  FranchiseCode
Standard scale of  NoEmp
Standard scale of  CreateJob
Standard scale of  RetainedJob
Standard scale of  DisbursementGross
Standard scale of  BalanceGross
Standard scale of  GrAppv
Standard scale of  SBA_Appv
        label  probability_0  probability_1
0           0       0.981217       0.018783
1           1       0.585139       0.414861
2           0       0.968295       0.031705
3           1       0.369174       0.630826
4           0       0.985633       0.014367
...       ...            ...            ...
478547      0       0.926648       0.073352
478548      0       0.972130       0.027870
478549      0       0.995350       0.004650
478550      0       0.943083       0.056917
478551      1     

In [8]:
import pandas as pd
# Load the holdout CSV. The path is relative to the kernel's working directory,
# not to `dirname` like the earlier load — NOTE(review): confirm the file location.
ef = pd.read_csv("SBA_loans_project_1_holdout_students_valid.csv")
# Keep `ef` as loaded and hand an independent copy to the scoring function.
ef1 = ef.copy()

In [None]:
# Since the 'MIS_Status' column won't be present in new data, the scoring function needs to be modified slightly.

In [None]:
# Score the holdout copy and print the resulting label/probability table.
# NOTE(review): this cell has no execution count, so it appears not to have been
# run yet; project_1_scoring reads a 'MIS_Status' column from its input —
# confirm the holdout file provides it.
print(project_1_scoring(ef1))