# Loan Defaulters

## Imports

In [1]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

## setting paths

In [2]:
root_dir = 'ML_dataset'
train_file = os.path.join(root_dir,'train.csv')
test_file = os.path.join(root_dir,'test.csv')
sample_submission = os.path.join(root_dir,'sample_submission.csv')
save_path = os.path.join(root_dir,'my_submission.csv')

## loading train and test data

In [3]:
df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

In [4]:
df=df.drop(['member_id'], axis=1)
test_member_id = test_df['member_id'].values
test_df=test_df.drop(['member_id'], axis=1)

## a glance at training data

In [5]:
df.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,batch_enrolled,int_rate,grade,sub_grade,emp_title,emp_length,...,collections_12_mths_ex_med,mths_since_last_major_derog,application_type,verification_status_joint,last_week_pay,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,loan_status
0,14350,14350,14350.0,36 months,,19.19,E,E3,clerk,9 years,...,0.0,74.0,INDIVIDUAL,,26th week,0.0,0.0,28699.0,30800.0,0
1,4800,4800,4800.0,36 months,BAT1586599,10.99,B,B4,Human Resources Specialist,< 1 year,...,0.0,,INDIVIDUAL,,9th week,0.0,0.0,9974.0,32900.0,0
2,10000,10000,10000.0,36 months,BAT1586599,7.26,A,A4,Driver,2 years,...,0.0,,INDIVIDUAL,,9th week,0.0,65.0,38295.0,34900.0,0
3,15000,15000,15000.0,36 months,BAT4808022,19.72,D,D5,Us office of Personnel Management,10+ years,...,0.0,,INDIVIDUAL,,135th week,0.0,0.0,55564.0,24700.0,0
4,16000,16000,16000.0,36 months,BAT2833642,10.64,B,B2,LAUSD-HOLLYWOOD HIGH SCHOOL,10+ years,...,0.0,,INDIVIDUAL,,96th week,0.0,0.0,47159.0,47033.0,0


In [6]:
df.describe()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,annual_inc,dti,delinq_2yrs,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,...,total_rec_late_fee,recoveries,collection_recovery_fee,collections_12_mths_ex_med,mths_since_last_major_derog,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,loan_status
count,532428.0,532428.0,532428.0,532428.0,532425.0,532428.0,532412.0,532412.0,259874.0,82123.0,...,532428.0,532428.0,532428.0,532333.0,132980.0,532412.0,490424.0,490424.0,490424.0,532428.0
mean,14757.595722,14744.271291,14704.926696,13.242969,75029.84,18.138767,0.314448,0.694603,34.055735,70.093068,...,0.394954,45.717832,4.859221,0.014299,44.121462,0.005015,213.562222,139554.1,32080.57,0.236327
std,8434.42008,8429.139277,8441.290381,4.379611,65199.85,8.369074,0.860045,0.997025,21.884797,28.139219,...,4.091546,409.647467,63.123361,0.133005,22.19841,0.079117,1958.571538,153914.9,38053.04,0.424826
min,500.0,500.0,0.0,5.32,1200.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8000.0,8000.0,8000.0,9.99,45000.0,11.93,0.0,0.0,15.0,51.0,...,0.0,0.0,0.0,0.0,27.0,0.0,0.0,29839.75,14000.0,0.0
50%,13000.0,13000.0,13000.0,12.99,65000.0,17.65,0.0,0.0,31.0,70.0,...,0.0,0.0,0.0,0.0,44.0,0.0,0.0,80669.5,23700.0,0.0
75%,20000.0,20000.0,20000.0,16.2,90000.0,23.95,0.0,1.0,50.0,92.0,...,0.0,0.0,0.0,0.0,61.0,0.0,0.0,208479.2,39800.0,0.0
max,35000.0,35000.0,35000.0,28.99,9500000.0,672.52,30.0,31.0,180.0,121.0,...,358.68,33520.27,7002.19,16.0,180.0,14.0,496651.0,8000078.0,9999999.0,1.0


## checking null values

In [7]:
print(len(df))
print(df.isnull().sum())

532428
loan_amnt                           0
funded_amnt                         0
funded_amnt_inv                     0
term                                0
batch_enrolled                  85149
int_rate                            0
grade                               0
sub_grade                           0
emp_title                       30833
emp_length                      26891
home_ownership                      0
annual_inc                          3
verification_status                 0
pymnt_plan                          0
desc                           456829
purpose                             0
title                              90
zip_code                            0
addr_state                          0
dti                                 0
delinq_2yrs                        16
inq_last_6mths                     16
mths_since_last_delinq         272554
mths_since_last_record         450305
open_acc                           16
pub_rec                            16
revol

## checking labels ratio

In [8]:
print("Total number of labels: {}".format(df.shape[0]))
print("Number of negatives: {}".format(df[df.loan_status == 0].shape[0]))
print("Number of positives: {}".format(df[df.loan_status == 1].shape[0]))

Total number of labels: 532428
Number of negatives: 406601
Number of positives: 125827


## removing columns with more than 20% null values

In [9]:
print('Removing the following columns')
for col_name in df.columns:
    if 100*(df[col_name].isnull().sum())//len(df)>20:
        print('column',col_name,'has',100*(df[col_name].isnull().sum())//len(df),'% null values')
        df=df.drop([col_name], axis=1)
        test_df=test_df.drop([col_name], axis=1)

Removing the following columns
column desc has 85 % null values
column mths_since_last_delinq has 51 % null values
column mths_since_last_record has 84 % null values
column mths_since_last_major_derog has 75 % null values
column verification_status_joint has 99 % null values


## removing columns with more than 2000 unique values

In [10]:
print('Removing the following columns')
for col_name in df.columns:
    if df[col_name].dtypes in ['O'] and df[col_name].nunique()>2000:
        print('column',col_name,'has',100*(df[col_name].nunique())/len(df),'% uniques values')
        df=df.drop([col_name], axis=1)
        test_df=test_df.drop([col_name], axis=1)

Removing the following columns
column emp_title has 35.70886579969498 % uniques values
column title has 7.45509251955194 % uniques values


## gathering column names with less than 2000 uniques values and str data type for encoding

In [11]:
col_names=[]
for col_name in df.columns:
    if df[col_name].dtypes in ['O']:
        if df[col_name].nunique()<2000:
            print('column',col_name,'has',df[col_name].dtype,df[col_name].nunique(),'uniques values')
            col_names+=[col_name]
print(col_names)

column term has object 2 uniques values
column batch_enrolled has object 104 uniques values
column grade has object 7 uniques values
column sub_grade has object 35 uniques values
column emp_length has object 11 uniques values
column home_ownership has object 6 uniques values
column verification_status has object 3 uniques values
column pymnt_plan has object 2 uniques values
column purpose has object 14 uniques values
column zip_code has object 917 uniques values
column addr_state has object 51 uniques values
column initial_list_status has object 2 uniques values
column application_type has object 2 uniques values
column last_week_pay has object 98 uniques values
['term', 'batch_enrolled', 'grade', 'sub_grade', 'emp_length', 'home_ownership', 'verification_status', 'pymnt_plan', 'purpose', 'zip_code', 'addr_state', 'initial_list_status', 'application_type', 'last_week_pay']


## performing encoding on the gethered columns

In [12]:
encoder = LabelEncoder()
for col_name in col_names:
    print(col_name,df[col_name].nunique())
    df[col_name] = encoder.fit_transform(df[col_name])
    test_df[col_name] = encoder.fit_transform(test_df[col_name])

term 2
batch_enrolled 104
grade 7
sub_grade 35
emp_length 11
home_ownership 6
verification_status 3
pymnt_plan 2
purpose 14
zip_code 917
addr_state 51
initial_list_status 2
application_type 2
last_week_pay 98


## filling in the null values with mean

In [13]:
for col_name in df.columns[:-1]:
    mean_value = df[col_name].mean()
    if df[col_name].isnull().sum():
        print(col_name,100*(df[col_name].isnull().sum())//len(df))
        df[col_name].fillna(value=mean_value, inplace=True)
    if test_df[col_name].isnull().sum():
        test_df[col_name].fillna(value=mean_value, inplace=True)

annual_inc 0
delinq_2yrs 0
inq_last_6mths 0
open_acc 0
pub_rec 0
revol_util 0
total_acc 0
collections_12_mths_ex_med 0
acc_now_delinq 0
tot_coll_amt 7
tot_cur_bal 7
total_rev_hi_lim 7


## saperating features and class label

In [14]:
X=df.iloc[:, :-1]
X.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,batch_enrolled,int_rate,grade,sub_grade,emp_length,home_ownership,...,total_rec_late_fee,recoveries,collection_recovery_fee,collections_12_mths_ex_med,application_type,last_week_pay,acc_now_delinq,tot_coll_amt,tot_cur_bal,total_rev_hi_lim
0,14350,14350,14350.0,0,0,19.19,4,22,9,4,...,0.0,0.0,0.0,0.0,0,62,0.0,0.0,28699.0,30800.0
1,4800,4800,4800.0,0,9,10.99,1,8,10,1,...,0.0,0.0,0.0,0.0,0,96,0.0,0.0,9974.0,32900.0
2,10000,10000,10000.0,0,9,7.26,0,3,2,4,...,0.0,0.0,0.0,0.0,0,96,0.0,65.0,38295.0,34900.0
3,15000,15000,15000.0,0,80,19.72,3,19,1,5,...,0.0,0.0,0.0,0.0,0,13,0.0,0.0,55564.0,24700.0
4,16000,16000,16000.0,0,35,10.64,1,6,1,5,...,0.0,0.0,0.0,0.0,0,95,0.0,0.0,47159.0,47033.0


In [15]:
Y=df.iloc[:,-1]
Y.head()

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

## scaling the data

In [16]:
#scalar = QuantileTransformer(random_state=13,output_distribution='normal')
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

test_df = scaler.transform(test_df)

## spliting the data for training

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.001, random_state=41)

## loading the model

In [18]:
#from sklearn.naive_bayes import GaussianNB
#model = GaussianNB()

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
model = LogisticRegression(max_iter=1000)

## fitting the model

In [19]:
model.fit(X_train,y_train)
y_pred_proba=model.predict_proba(X_test)

## checking accuracy and probability of class label

In [20]:
print(model.score(X_test, y_test))

0.7879924953095685


In [21]:
ray = model.predict(X_test)
roc_auc_score(y_test, ray)

0.6184561965811965

In [22]:
ray = [round(i[1],2) for i in y_pred_proba]
roc_auc_score(y_test, ray)

0.7781681459566075

In [23]:
#here first element of array represents probability for class label 0 
#and second element of array represents probability for class label 1. 

In [24]:
y_pred_proba

array([[5.41523641e-01, 4.58476359e-01],
       [6.84478519e-01, 3.15521481e-01],
       [6.82416131e-01, 3.17583869e-01],
       ...,
       [9.13351512e-01, 8.66484876e-02],
       [9.99410345e-01, 5.89654676e-04],
       [9.08220583e-01, 9.17794171e-02]])

## running prediction on test data

In [25]:
results = model.predict_proba(test_df)

## converting to the requested format and saving to csv

In [26]:
results = [round(i[1],2) for i in results]
sub_df = pd.DataFrame()
sub_df['member_id'] = test_member_id
sub_df['loan_status'] = results
sub_df.to_csv(save_path, index = False)