In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw Lending Club loan table.
# The path is a raw string because it contains backslashes that are not valid
# escape sequences in a normal string literal; the resulting path is unchanged.
# NOTE(review): this is an absolute path on one machine - consider a configurable data directory.
dataset = pd.read_csv(r'D:\Academic\Data Science\Lending-Club\data\lending-club-loan-data/loan.csv')

In [4]:
# Preview the first five rows of the frame as loaded.
dataset.head(5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
0,1077501,1296599,5000.0,5000.0,4975.0,36 months,10.65,162.87,B,B2,...,,,,,,,,,,
1,1077430,1314167,2500.0,2500.0,2500.0,60 months,15.27,59.83,C,C4,...,,,,,,,,,,
2,1077175,1313524,2400.0,2400.0,2400.0,36 months,15.96,84.33,C,C5,...,,,,,,,,,,
3,1076863,1277178,10000.0,10000.0,10000.0,36 months,13.49,339.31,C,C1,...,,,,,,,,,,
4,1075358,1311748,3000.0,3000.0,3000.0,60 months,12.69,67.79,B,B5,...,,,,,,,,,,


# Basic Feature Removal
1. Remove any column that has a null value. **TODO:** this is not ideal; fill missing values per feature instead of dropping the whole column.
2. Convert all non-numeric columns to numeric representations.

In [5]:
# Labels (y_true): take the interest rate out of the feature frame.
y = dataset.pop("int_rate")
# Keep only the columns that have no null value at all.
# Filling the gaps with the mean or another aggregate is usually preferable,
# but each feature has to be analysed before choosing how to fill it.
dataset = dataset.dropna(axis="columns", how="any")

In [6]:
# Function to remap non-numeric values to a number value.
def remap(df, col) :
  """Replace the values of ``df[col]`` with integer codes, modifying ``df``.

  The distinct values of the column are sorted and numbered from 1, so the
  smallest value becomes 1, the next one 2, and so on. The mapping is printed
  so the encoding can be read back from the cell output.

  Args:
    df: DataFrame to modify; the encoded column is assigned back onto it.
    col: Name of the column to encode.

  Returns:
    None.

  Raises:
    KeyError: if ``col`` is not a column of ``df``.
    TypeError: if the distinct values cannot be ordered against each other.
  """
  # Distinct values, sorted, numbered from 1.
  col_set_map = {val: i + 1 for i, val in enumerate(sorted(set(df[col])))}
  # Assign the encoded column back through ``df[col] = ...``. Calling
  # ``df[col].replace(..., inplace=True)`` acts on an intermediate Series
  # (chained assignment), which does not update ``df`` under Copy-on-Write.
  # Every value of the column is a key of the map, so ``map`` leaves no gaps.
  df[col] = df[col].map(col_set_map)
  print("Remaped", col, "with", col_set_map); # Log
  

In [7]:
# Convert non-numeric columns to numeric
# `term` has two values in this data; the logged mapping is ' 36 months' -> 1, ' 60 months' -> 2.
remap(dataset, "term")

Remaped term with {' 36 months': 1, ' 60 months': 2}


In [8]:
# Letter grades sort alphabetically, so A..G become 1..7 and keep the order of the letters.
remap(dataset, "grade")

Remaped grade with {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}


In [9]:
# Sub-grades A1..G5 become 1..35 in sorted order (see the logged mapping).
remap(dataset, "sub_grade")

Remaped sub_grade with {'A1': 1, 'A2': 2, 'A3': 3, 'A4': 4, 'A5': 5, 'B1': 6, 'B2': 7, 'B3': 8, 'B4': 9, 'B5': 10, 'C1': 11, 'C2': 12, 'C3': 13, 'C4': 14, 'C5': 15, 'D1': 16, 'D2': 17, 'D3': 18, 'D4': 19, 'D5': 20, 'E1': 21, 'E2': 22, 'E3': 23, 'E4': 24, 'E5': 25, 'F1': 26, 'F2': 27, 'F3': 28, 'F4': 29, 'F5': 30, 'G1': 31, 'G2': 32, 'G3': 33, 'G4': 34, 'G5': 35}


In [10]:
# Codes follow alphabetical order: 'Not Verified' 1, 'Source Verified' 2, 'Verified' 3.
# NOTE(review): confirm this order is meaningful for the model; otherwise prefer one-hot encoding.
remap(dataset, "verification_status")

Remaped verification_status with {'Not Verified': 1, 'Source Verified': 2, 'Verified': 3}


In [11]:
# The ten status strings are numbered alphabetically (e.g. 'Charged Off' 1, 'Fully Paid' 6).
# NOTE(review): the integer order looks arbitrary for these categories - consider one-hot encoding.
remap(dataset, "loan_status")

Remaped loan_status with {'Charged Off': 1, 'Current': 2, 'Default': 3, 'Does not meet the credit policy. Status:Charged Off': 4, 'Does not meet the credit policy. Status:Fully Paid': 5, 'Fully Paid': 6, 'In Grace Period': 7, 'Issued': 8, 'Late (16-30 days)': 9, 'Late (31-120 days)': 10}


In [12]:
# Six ownership categories numbered alphabetically ('ANY' 1 ... 'RENT' 6).
# NOTE(review): the integer order looks arbitrary for these categories - consider one-hot encoding.
remap(dataset, "home_ownership")

Remaped home_ownership with {'ANY': 1, 'MORTGAGE': 2, 'NONE': 3, 'OTHER': 4, 'OWN': 5, 'RENT': 6}


In [13]:
# Two values in this data: 'INDIVIDUAL' -> 1, 'JOINT' -> 2.
remap(dataset, "application_type")

Remaped application_type with {'INDIVIDUAL': 1, 'JOINT': 2}


In [14]:
# Two values in this data: 'f' -> 1, 'w' -> 2.
remap(dataset, "initial_list_status")

Remaped initial_list_status with {'f': 1, 'w': 2}


In [16]:
# Two values in this data: 'n' -> 1, 'y' -> 2.
remap(dataset, "pymnt_plan")

Remaped pymnt_plan with {'n': 1, 'y': 2}


In [17]:
# Columns that are not needed (dates, ids, urls, address, etc.).
# Extend this list with any further column that should be dropped.
cols = [
  "member_id",
  "id",
  "issue_d",
  "url",
  "purpose",
  "zip_code",
  "addr_state",
]
for column in cols :
  del dataset[column] # Remove the column from the frame in place
print("Dropped the following columns:", ', '.join(cols)) # Log

Dropped the following columns: member_id, id, issue_d, url, purpose, zip_code, addr_state


In [18]:
# Preview the first five rows after encoding and dropping columns.
dataset.head(5)

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,installment,grade,sub_grade,home_ownership,verification_status,loan_status,...,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,policy_code,application_type
0,5000.0,5000.0,4975.0,1,162.87,2,7,6,3,6,...,5861.071414,5831.78,5000.0,861.07,0.0,0.0,0.0,171.62,1.0,1
1,2500.0,2500.0,2500.0,2,59.83,3,14,6,2,1,...,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,119.66,1.0,1
2,2400.0,2400.0,2400.0,1,84.33,3,15,6,1,6,...,3003.653644,3003.65,2400.0,603.65,0.0,0.0,0.0,649.91,1.0,1
3,10000.0,10000.0,10000.0,1,339.31,3,11,6,2,6,...,12226.302212,12226.3,10000.0,2209.33,16.97,0.0,0.0,357.48,1.0,1
4,3000.0,3000.0,3000.0,2,67.79,2,10,6,2,2,...,3242.17,3242.17,2233.1,1009.07,0.0,0.0,0.0,67.79,1.0,1


In [19]:
# Column labels that remain in the frame.
dataset.columns

Index(['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'installment',
       'grade', 'sub_grade', 'home_ownership', 'verification_status',
       'loan_status', 'pymnt_plan', 'dti', 'revol_bal', 'initial_list_status',
       'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv',
       'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_amnt', 'policy_code',
       'application_type'],
      dtype='object')

Confirm that no value in the data frame is of a non-numeric type, because non-numeric values cannot be used in training.

In [20]:
# Collect the columns that still contain at least one Python string value.
# A column with a numeric dtype cannot hold strings, so only the remaining
# columns are scanned value by value; this avoids a Python-level loop over
# every cell of the frame while flagging the same columns.
columnsWithStringType = [
  k for k in dataset.keys()
  if not pd.api.types.is_numeric_dtype(dataset[k])
  and any(type(v) is str for v in dataset[k])
]

# Stop the notebook here if any string value survived the encoding steps.
assert len(columnsWithStringType) == 0, "Some columns have string type."
print("All columns are number type.");

All columns are number type.


In [21]:
# Final look at the first rows of the numeric feature frame.
dataset.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,installment,grade,sub_grade,home_ownership,verification_status,loan_status,...,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_amnt,policy_code,application_type
0,5000.0,5000.0,4975.0,1,162.87,2,7,6,3,6,...,5861.071414,5831.78,5000.0,861.07,0.0,0.0,0.0,171.62,1.0,1
1,2500.0,2500.0,2500.0,2,59.83,3,14,6,2,1,...,1008.71,1008.71,456.46,435.17,0.0,117.08,1.11,119.66,1.0,1
2,2400.0,2400.0,2400.0,1,84.33,3,15,6,1,6,...,3003.653644,3003.65,2400.0,603.65,0.0,0.0,0.0,649.91,1.0,1
3,10000.0,10000.0,10000.0,1,339.31,3,11,6,2,6,...,12226.302212,12226.3,10000.0,2209.33,16.97,0.0,0.0,357.48,1.0,1
4,3000.0,3000.0,3000.0,2,67.79,2,10,6,2,2,...,3242.17,3242.17,2233.1,1009.07,0.0,0.0,0.0,67.79,1.0,1
