In [17]:
# Mount google drive
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [58]:
import pandas as pd 
import numpy as np
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [117]:
# reading in the data 
df = pd.read_csv('/gdrive/MyDrive/PatientFI/data.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [118]:
# deleting single observation where home ownership = ANY before test-train split
df = df[df['home_ownership'] != 'ANY']

In [119]:
X = df.drop(['loan_status'],axis=1)   # independant features
y = df['loan_status']					# dependant variable

In [120]:
# Choose your test size to split between training and testing sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [121]:
# drop cols we know we don't need 
X_test = X_test.drop(['id',
               'member_id',
               'desc',
               'sec_app_fico_range_low',
               'sec_app_fico_range_high',
               'sec_app_earliest_cr_line',
               'sec_app_inq_last_6mths',
               'sec_app_mort_acc',
               'sec_app_open_acc',
               'sec_app_revol_util',
               'sec_app_open_act_il',
               'sec_app_num_rev_accts',
               'sec_app_chargeoff_within_12_mths',
               'sec_app_collections_12_mths_ex_med',
               'sec_app_mths_since_last_major_derog',
               'revol_bal_joint',
               'annual_inc_joint',
               'dti_joint',
               'verification_status_joint',
               'disbursement_method',
               'emp_title'],
              axis=1)

X_train = X_train.drop(['id',
               'member_id',
               'desc',
               'sec_app_fico_range_low',
               'sec_app_fico_range_high',
               'sec_app_earliest_cr_line',
               'sec_app_inq_last_6mths',
               'sec_app_mort_acc',
               'sec_app_open_acc',
               'sec_app_revol_util',
               'sec_app_open_act_il',
               'sec_app_num_rev_accts',
               'sec_app_chargeoff_within_12_mths',
               'sec_app_collections_12_mths_ex_med',
               'sec_app_mths_since_last_major_derog',
               'revol_bal_joint',
               'annual_inc_joint',
               'dti_joint',
               'verification_status_joint',
               'disbursement_method',
               'emp_title'],
              axis=1)

In [122]:
str_extract = ['term','emp_length','zip_code','earliest_cr_line']

# function to extract strings 
def string_extractor(df, str_extract):
  for column in str_extract:
    df[column] = df[column].str.extract('(\d+)', expand=False)
    df[column] = pd.to_numeric(df[column], errors='coerce')
    print(df[column].head(n=5))
  return df


In [None]:
# extracting strings from train and test 
X_train = string_extractor(df=X_train, str_extract=str_extract)
X_test = string_extractor(df=X_test, str_extract=str_extract)

In [126]:
# loop to deal with hardships nan values before encoding 
# for commented out cols, we will let xgboost deal with missing values 
hardship_cols = ['hardship_flag',
                 'hardship_type',
                 'hardship_reason',
                 'hardship_status',
                 'deferral_term',
                 #'hardship_amount',
                 'hardship_start_date',
                 'hardship_end_date',
                 'payment_plan_start_date',
                 #'hardship_length',
                 #'hardship_dpd',
                 'hardship_loan_status',
                 #'hardship_payoff_balance_amount',
                 #'hardship_last_payment_amount'
                 ]

for column in hardship_cols:
  X_train[column].fillna('No Hardship').astype(str)

for column in hardship_cols:
  X_test[column].fillna('No Hardship').astype(str)

In [127]:
# loop to deal with settlment nan values before encoding 
# for commented out cols (float dtype), we will let xgboost deal with missing values 
settlement_cols = ['debt_settlement_flag',
                   'debt_settlement_flag_date',
                   'settlement_status',
                   'settlement_date',
                   #'settlement_amount',
                   #'settlement_percentage',
                   #'settlement_term'
                   ]

for column in settlement_cols:
  X_train[column].fillna('No Settlement').astype(str)

for column in settlement_cols:
  X_test[column].fillna('No Settlement').astype(str)

In [128]:
# preparing for ord encoding 
from sklearn.preprocessing import OrdinalEncoder
# define ordinal encoding
encoder = OrdinalEncoder()

# function to encode ordinal cat vars 
ord_cols = ['grade',
            'sub_grade',
            'issue_d',
            'earliest_cr_line',
            'last_pymnt_d',
            'next_pymnt_d',
            'last_credit_pull_d',
            'hardship_start_date',
            'hardship_end_date',
            'payment_plan_start_date',
            'settlement_date',
            'debt_settlement_flag_date']

def ord_encoder(df, ord_cols):
  for column in ord_cols:
    # assign column to reshaped numpy array
    array = df[column].to_numpy()
    shaped = array.reshape(-1,1)
    #transform data 
    df[column] = encoder.fit_transform(shaped)
    print(df[column].head(n=5))
  return df

In [None]:
# encoding ordinal vars 
X_train = ord_encoder(df=X_train, ord_cols=ord_cols)
X_test = ord_encoder(df=X_test, ord_cols=ord_cols)

In [131]:
# loop to encode dummy cat vars 
one_hot_cols = ['home_ownership',
                'verification_status',
                'addr_state',
                'initial_list_status',
                'application_type',
                'hardship_flag',
                'hardship_type',
                'hardship_reason',
                'hardship_status',
                'deferral_term',
                'hardship_loan_status',
                'debt_settlement_flag',
                'settlement_status',
                'pymnt_plan',
]

def one_hot_encoder(df, one_hot_cols):
  for column in one_hot_cols:
      tempdf = pd.get_dummies(df[column], prefix=column)
      df = pd.merge(
          left=df,
          right=tempdf,
          left_index=True,
          right_index=True,
      )
      df = df.drop(columns=column)
  return df

In [132]:
# encoding dummies 
X_train = one_hot_encoder(df=X_train, one_hot_cols=one_hot_cols)
X_test = one_hot_encoder(df=X_test, one_hot_cols=one_hot_cols)

In [134]:
# fitting the model 
RANDOM_STATE = 1234
xgb = XGBClassifier(objective='binary:logistic',
                    random_state=RANDOM_STATE,
                    tree_method='hist',
                    verbosity=2)
xgb.fit(X_train, y_train)

[05:50:28] INFO: /workspace/src/learner.cc:215: Tree method is selected to be 'hist', which uses a single updater grow_quantile_histmaker.
[05:50:31] INFO: /workspace/src/tree/updater_quantile_hist.cc:63: Generating gmat: 2.14046 sec
[05:50:31] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 8 extra nodes, 0 pruned nodes, max_depth=3
[05:50:31] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 8 extra nodes, 0 pruned nodes, max_depth=3
[05:50:31] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 4 extra nodes, 0 pruned nodes, max_depth=2
[05:50:31] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=3
[05:50:31] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[05:50:31] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_dept

XGBClassifier(objective='multi:softprob', random_state=1234, tree_method='hist',
              verbosity=2)

In [135]:
# predicting on the test set 
y_pred = xgb.predict(X_test)

In [136]:
# checking acc 
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" %(acc*100.0))

Accuracy: 99.22%
