# JPMorgan Quantitative Research
## Task 3: Credit risk analysis

In [25]:
# import relevant packages
import pandas as pd
from google.colab import drive

# mount Google Drive (forcing a remount if it is already mounted) so the CSV is reachable
drive.mount('/content/drive', force_remount=True)

# load the loan book into a DataFrame and preview its first rows
file_path = '/content/drive/My Drive/Forage/Task 3 and 4_Loan_Data.csv'
loan_data = pd.read_csv(file_path)
loan_data.head()

Mounted at /content/drive


Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


In [26]:
# summary statistics (count, mean, std, min, quartiles, max) for each numeric column
loan_data.describe()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4974577.0,1.4612,4159.677034,8718.916797,70039.901401,4.5528,637.5577,0.1851
std,2293890.0,1.743846,1421.399078,6627.164762,20072.214143,1.566862,60.657906,0.388398
min,1000324.0,0.0,46.783973,31.652732,1000.0,0.0,408.0,0.0
25%,2977661.0,0.0,3154.235371,4199.83602,56539.867903,3.0,597.0,0.0
50%,4989502.0,1.0,4052.377228,6732.407217,70085.82633,5.0,638.0,0.0
75%,6967210.0,2.0,5052.898103,11272.26374,83429.166133,6.0,679.0,0.0
max,8999789.0,5.0,10750.67781,43688.7841,148412.1805,10.0,850.0,1.0


In [27]:
# import relevant ML packages
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# predictor columns shared by both candidate models, and the 0/1 default flag as target
features = [
  'credit_lines_outstanding',
  'total_debt_outstanding',
  'income',
  'years_employed',
  'fico_score',
]
X = loan_data[features]
y = loan_data['default']

# hold out 20% of the rows so the candidates are compared on data they were not fitted on
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=1)

# candidate 1: logistic regression, scored by ROC AUC on the validation rows
log_model = LogisticRegression(max_iter=1000).fit(train_X, train_y)
log_probs = log_model.predict_proba(val_X)[:, 1]
log_auc = roc_auc_score(val_y, log_probs)

# candidate 2: random forest, scored the same way
rf_model = RandomForestClassifier(random_state=1).fit(train_X, train_y)
rf_probs = rf_model.predict_proba(val_X)[:, 1]
rf_auc = roc_auc_score(val_y, rf_probs)

# keep the model type with the strictly higher AUC (a tie goes to logistic regression)
# and refit a fresh instance of it on the full data set
best_model = (
  RandomForestClassifier(random_state=1)
  if rf_auc > log_auc
  else LogisticRegression(max_iter=1000)
)
best_model.fit(X, y)

# read a non-negative number from the user, re-prompting until one is given
def user_input(prompt, cast=int):
  """Prompt on stdin until the user enters a valid non-negative number.

  Args:
    prompt: Text passed to input().
    cast: Callable that parses the typed text and raises ValueError on bad
      input. The default, int, accepts whole numbers only; pass float for
      quantities that can be fractional (e.g. money amounts).

  Returns:
    The parsed value, which is finite and >= 0.
  """
  while True:
    try:
      value = cast(input(prompt))
    except ValueError:
      value = None
    # the chained comparison is False for negatives, NaN and infinity, so
    # float('nan') / float('inf') cannot slip through when cast is float
    if value is not None and 0 <= value < float('inf'):
      return value
    print('This is not a valid number.')


# model the expected loss on a loan
def expected_loss(best_model, recovery_rate=0.1):
  """Ask for a borrower's details and estimate the expected loss on the loan.

  Expected loss = PD * (1 - recovery_rate) * loan amount outstanding, where
  PD is taken from the second column of best_model.predict_proba.

  Args:
    best_model: Fitted classifier exposing predict_proba, trained on the
      columns listed in the module-level `features`.
    recovery_rate: Fraction of the outstanding amount assumed to be recovered
      after a default, between 0 and 1. Defaults to 0.1 (10%).

  Returns:
    Tuple (PD, EL): the probability of default and the expected loss.

  Raises:
    ValueError: If recovery_rate is outside [0, 1].
  """
  # validate before prompting so a bad argument fails fast
  if not 0 <= recovery_rate <= 1:
    raise ValueError('recovery_rate must be between 0 and 1.')

  # money amounts may be fractional, so they are read as floats; counts stay integers
  loan_amt_outstanding = user_input('Please enter how much of the loan amount is outstanding: ', float)
  borrower = {
    'credit_lines_outstanding': user_input('Please enter how many credit lines are outstanding: '),
    'total_debt_outstanding': user_input('Please enter how much of your total debt is outstanding: ', float),
    'income': user_input('Please enter how much income you earn: ', float),
    'years_employed': user_input('Please enter how many years you have been employed for: '),
    'fico_score': user_input('Please enter your FICO score: '),
  }

  # select by column name so the row always follows the training column order,
  # even if `features` is reordered later
  borrower_df = pd.DataFrame([borrower])[features]

  # probability of default
  PD = best_model.predict_proba(borrower_df)[:, 1][0]

  # expected loss: only the unrecovered share of the outstanding amount is lost
  EL = (1 - recovery_rate) * PD * loan_amt_outstanding
  return PD, EL

# run the interactive estimate and report it; the loss is a money amount, so it
# is printed in fixed-point notation -- '{:.3g}' would switch to scientific
# notation (e.g. 1.23e+03) as soon as the loss reaches 1000
PD, EL = expected_loss(best_model)
print('The probability of default is {:.3g} and the expected loss on this loan is {:,.2f}.'.format(PD, EL))

Please enter how much of the loan amount is outstanding: 1000
Please enter how many credit lines are outstanding: 5
Please enter how much of your total debt is outstanding: 500
Please enter how much income you earn: 50000
Please enter how many years you have been employed for: 5
Please enter your FICO score: 600
The probability of default is 0.144 and the expected loss on this loan is 130.
