In [84]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [85]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from skimpy import skim

In [3]:
pd.set_option('display.max_columns', None)

In [83]:
# Load the data
data = pd.read_csv("../data/accepted_2007_to_2018Q4.csv", encoding = "ISO-8859-1")

  data = pd.read_csv("../data/accepted_2007_to_2018Q4.csv", encoding = "ISO-8859-1")


In [4]:
data['grade']

0            C
1            C
2            B
3            C
4            F
          ... 
2260696      B
2260697      C
2260698      C
2260699    NaN
2260700    NaN
Name: grade, Length: 2260701, dtype: object

In [5]:
print(data.columns)

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       ...
       'hardship_payoff_balance_amount', 'hardship_last_payment_amount',
       'disbursement_method', 'debt_settlement_flag',
       'debt_settlement_flag_date', 'settlement_status', 'settlement_date',
       'settlement_amount', 'settlement_percentage', 'settlement_term'],
      dtype='object', length=151)


For each of these features, we check the description in the Data Dictionary and only keep the features that would have been available to investors considering an investment in the loan. These include features in the loan application, and any features added by LendingClub when the loan listing was accepted, such as the loan grade and interest rate.

I'm using my best available knowledge to determine which loan features are known to potential investors. I am not an investor on LendingClub, so my knowledge of the LendingClub investment process is not exact. When in doubt, I err on the side of dropping the feature.

In [6]:
keep_list = ['addr_state', 'annual_inc', 'application_type', 'dti', 'earliest_cr_line', 'emp_length', 'emp_title', 'fico_range_high', 'fico_range_low', 'grade', 'home_ownership', 'id', 'initial_list_status', 'installment', 'int_rate', 'issue_d', 'loan_amnt', 'loan_status', 'mort_acc', 'open_acc', 'pub_rec', 'pub_rec_bankruptcies', 'purpose', 'revol_bal', 'revol_util', 'sub_grade', 'term', 'title', 'total_acc', 'verification_status', 'zip_code']

In [7]:
df = data.copy()

In [8]:
df = df[[col for col in df.columns if col in keep_list]]

In [9]:
df.head()

Unnamed: 0,id,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,purpose,title,zip_code,addr_state,dti,earliest_cr_line,fico_range_low,fico_range_high,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,application_type,mort_acc,pub_rec_bankruptcies
0,68407277,3600.0,36 months,13.99,123.03,C,C4,leadman,10+ years,MORTGAGE,55000.0,Not Verified,Dec-2015,Fully Paid,debt_consolidation,Debt consolidation,190xx,PA,5.91,Aug-2003,675.0,679.0,7.0,0.0,2765.0,29.7,13.0,w,Individual,1.0,0.0
1,68355089,24700.0,36 months,11.99,820.28,C,C1,Engineer,10+ years,MORTGAGE,65000.0,Not Verified,Dec-2015,Fully Paid,small_business,Business,577xx,SD,16.06,Dec-1999,715.0,719.0,22.0,0.0,21470.0,19.2,38.0,w,Individual,4.0,0.0
2,68341763,20000.0,60 months,10.78,432.66,B,B4,truck driver,10+ years,MORTGAGE,63000.0,Not Verified,Dec-2015,Fully Paid,home_improvement,,605xx,IL,10.78,Aug-2000,695.0,699.0,6.0,0.0,7869.0,56.2,18.0,w,Joint App,5.0,0.0
3,66310712,35000.0,60 months,14.85,829.9,C,C5,Information Systems Officer,10+ years,MORTGAGE,110000.0,Source Verified,Dec-2015,Current,debt_consolidation,Debt consolidation,076xx,NJ,17.06,Sep-2008,785.0,789.0,13.0,0.0,7802.0,11.6,17.0,w,Individual,1.0,0.0
4,68476807,10400.0,60 months,22.45,289.91,F,F1,Contract Specialist,3 years,MORTGAGE,104433.0,Source Verified,Dec-2015,Fully Paid,major_purchase,Major purchase,174xx,PA,25.37,Jun-1998,695.0,699.0,12.0,0.0,21929.0,64.5,35.0,w,Individual,6.0,0.0


In [10]:
len(df.columns)

31

In [11]:
# check for missing values
df.isnull().sum()

id                           0
loan_amnt                   33
term                        33
int_rate                    33
installment                 33
grade                       33
sub_grade                   33
emp_title               167002
emp_length              146940
home_ownership              33
annual_inc                  37
verification_status         33
issue_d                     33
loan_status                 33
purpose                     33
title                    23359
zip_code                    34
addr_state                  33
dti                       1744
earliest_cr_line            62
fico_range_low              33
fico_range_high             33
open_acc                    62
pub_rec                     62
revol_bal                   33
revol_util                1835
total_acc                   62
initial_list_status         33
application_type            33
mort_acc                 50063
pub_rec_bankruptcies      1398
dtype: int64

In [12]:
df.columns

Index(['id', 'loan_amnt', 'term', 'int_rate', 'installment', 'grade',
       'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'annual_inc',
       'verification_status', 'issue_d', 'loan_status', 'purpose', 'title',
       'zip_code', 'addr_state', 'dti', 'earliest_cr_line', 'fico_range_low',
       'fico_range_high', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util',
       'total_acc', 'initial_list_status', 'application_type', 'mort_acc',
       'pub_rec_bankruptcies'],
      dtype='object')

# Data Cleaning - should be updated
| Variable    | Description                                                                |
|-------------|----------------------------------------------------------------------------|
| amount      | Loan amount in dollars                                                     |
| term        | Loan term is 36 or 60 months                                              |
| rate        | Interest rate as a decimal                                                 |
| payment     | Monthly payment amount                                                     |
| grade       | Grade of loan: A is least risk, G is most risk                            |
| employment  | Job title of applicant                                                     |
| length      | Time continuously employed                                                 |
| home        | Home ownership: rent, own, mortgage                                        |
| income      | Annual income in dollars                                                   |
| verified    | Verification status of annual income                                       |
| status      | Loan status: DEFAULT, CURRENT, CHARGED OFF, etc.                           |
| reason      | Applicant's purpose for the loan                                           |
| state       | Two-letter state code of applicant                                          |
| debtIncRat  | Ratio monthly non-mortgage debt payment to monthly income                  |
| delinq2yr   | Number of 30+ day late payments in the last two years                      |
| inq6mth     | Number of credit checks in the past 6 months                                |
| openAcc     | Number of open credit lines                                                |
| pubRec      | Number of derogatory public records including bankruptcy filings, tax liens, etc. |
| revolRatio  | Proportion of revolving credit in use                                       |
| totalAcc    | Total number of credit lines in file, includes both open and closed accounts |
| totalPaid   | Total amount repaid to bank (THIS IS NOT A PREDICTOR SINCE IT CAN ONLY BE DETERMINED AFTER A LOAN IS


In [13]:

# Identifying columns with multiple data types
columns_with_multiple_datatypes = df.columns[df.map(type).nunique() > 1]

# Displaying the names
print("Columns with Multiple Data Types and Their Data Types:")
for column in columns_with_multiple_datatypes:
    unique_datatypes = df[column].apply(type).unique()
    print(f"{column}: {', '.join(str(dt) for dt in unique_datatypes)}")

Columns with Multiple Data Types and Their Data Types:
id: <class 'int'>, <class 'str'>
term: <class 'str'>, <class 'float'>
grade: <class 'str'>, <class 'float'>
sub_grade: <class 'str'>, <class 'float'>
emp_title: <class 'str'>, <class 'float'>
emp_length: <class 'str'>, <class 'float'>
home_ownership: <class 'str'>, <class 'float'>
verification_status: <class 'str'>, <class 'float'>
issue_d: <class 'str'>, <class 'float'>
loan_status: <class 'str'>, <class 'float'>
purpose: <class 'str'>, <class 'float'>
title: <class 'str'>, <class 'float'>
zip_code: <class 'str'>, <class 'float'>
addr_state: <class 'str'>, <class 'float'>
earliest_cr_line: <class 'str'>, <class 'float'>
initial_list_status: <class 'str'>, <class 'float'>
application_type: <class 'str'>, <class 'float'>


### 1. id - loan id

In [14]:
#check for non numeric rows in 
df['id'] = pd.to_numeric(df['id'], errors='coerce')
print(df['id'].isna().sum(),  df['id'].isnull().sum())# can be dropped during analysis as it is not add any value

33 33


In [15]:
print(df[df['id'].isna()]) # records dont have any values so dropping them ( Inspection)

         id  loan_amnt term  int_rate  installment grade sub_grade emp_title  \
421095  NaN        NaN  NaN       NaN          NaN   NaN       NaN       NaN   
421096  NaN        NaN  NaN       NaN          NaN   NaN       NaN       NaN   
528961  NaN        NaN  NaN       NaN          NaN   NaN       NaN       NaN   
528962  NaN        NaN  NaN       NaN          NaN   NaN       NaN       NaN   
651664  NaN        NaN  NaN       NaN          NaN   NaN       NaN       NaN   
651665  NaN        NaN  NaN       NaN          NaN   NaN       NaN       NaN   
749520  NaN        NaN  NaN       NaN          NaN   NaN       NaN       NaN   
749521  NaN        NaN  NaN       NaN          NaN   NaN       NaN       NaN   
877716  NaN        NaN  NaN       NaN          NaN   NaN       NaN       NaN   
877717  NaN        NaN  NaN       NaN          NaN   NaN       NaN       NaN   
983169  NaN        NaN  NaN       NaN          NaN   NaN       NaN       NaN   
983170  NaN        NaN  NaN       NaN   

In [16]:
df = df.dropna(subset=['id'])

In [17]:
print(df['id'].isna().sum(),  df['id'].isnull().sum())

0 0


### 2. loan_amt

In [18]:
# convert amount to numeric
df['loan_amnt'] = pd.to_numeric(df['loan_amnt'], errors='coerce')
# check for null or non numeric values
print(df['loan_amnt'].isna().sum(),  df['loan_amnt'].isnull().sum())

0 0


### 3. term 

In [19]:
df['term'].value_counts() #No cleaning required

term
36 months    1609754
60 months     650914
Name: count, dtype: int64

### 4. int_rate 

In [20]:
# convert the rate to numeric
df['int_rate'] = pd.to_numeric(df['int_rate'], errors='coerce')
# check for null or non numeric values
print(df['int_rate'].isna().sum(),  df['int_rate'].isnull().sum())

0 0


### 5. installment

In [21]:
# convert the rate to numeric
df['installment'] = pd.to_numeric(df['installment'], errors='coerce')
# check for null or non numeric values
print(df['installment'].isna().sum(),  df['installment'].isnull().sum())

0 0


### 6. grade

In [22]:
df['grade'].value_counts(dropna=False).sort_index() #No cleaning required

grade
A    433027
B    663557
C    650053
D    324424
E    135639
F     41800
G     12168
Name: count, dtype: int64

In [23]:
print(df['grade'].isna().sum(),  df['grade'].isnull().sum())#No cleaning required

0 0


We can safely remove the 'grade' feature since the information it contains is already captured within the more detailed 'sub_grade' feature. 

In [24]:
df.drop('grade', axis=1, inplace=True)

### 7. sub_grade

In [25]:
df['sub_grade'].value_counts(dropna=False).sort_index()

sub_grade
A1     86790
A2     69562
A3     73184
A4     95874
A5    107617
B1    125341
B2    126621
B3    131514
B4    139793
B5    140288
C1    145903
C2    131116
C3    129193
C4    127115
C5    116726
D1     81787
D2     72899
D3     64819
D4     56896
D5     48023
E1     33573
E2     29924
E3     26708
E4     22763
E5     22671
F1     13413
F2      9305
F3      7791
F4      6124
F5      5167
G1      4106
G2      2688
G3      2094
G4      1712
G5      1568
Name: count, dtype: int64

In [26]:
print(df['sub_grade'].isna().sum(),  df['sub_grade'].isnull().sum())#No cleaning required for sub_grade

0 0


### 8. emp_title

In [27]:
#records with null employment
print(df[df['emp_title'].isna()] )# 2784 records data not available for employment


                 id  loan_amnt        term  int_rate  installment sub_grade  \
55       68366663.0    24000.0   60 months      9.80       507.58        B3   
75       68416953.0     1500.0   36 months      6.49        45.97        A2   
93       68377020.0    15000.0   36 months      5.32       451.73        A1   
139      68516545.0    25000.0   36 months      7.49       777.55        A4   
141      68446591.0     2500.0   36 months     11.48        82.42        B5   
...             ...        ...         ...       ...          ...       ...   
2260650  89818207.0    16000.0   60 months     12.79       362.34        C1   
2260679  89976312.0    22000.0   60 months     13.99       511.79        C3   
2260683  88878506.0    30000.0   60 months      9.49       629.91        B2   
2260685  89007204.0    36400.0   60 months     14.49       856.24        C4   
2260688  89905081.0    18000.0   60 months      9.49       377.95        B2   

        emp_title emp_length home_ownership  annual

On inspection of above records we find that though the emp_title is empty most of these employees have their annual incomes submitted. Therefore we create new category called self employed to handle the cases where emp_title is not available but annual income is present.

In [28]:
df['emp_title'] = df['emp_title'].fillna('Self Employed')

In [29]:
df['emp_title'].value_counts(dropna=False).sort_index()

emp_title
\tCFO                                 1
\tMultimedia Supervisor               1
\tSlot technician                     1
\tVP - Operations                     1
                                      3
                                     ..
âAssociate Tech Support Analyst     1
âFinancial Analyst                  1
âLicense Compliance Investigator    1
âSenior IT Field Support            1
ð¨âð³                           1
Name: count, Length: 512694, dtype: int64

In [30]:
# too many emp_titles, dropping column
df.drop('emp_title', axis=1, inplace=True)

### 9. emp_length
Employment length in years. Possible values are between 0 and 10 where 0 means less than one year and 10 means ten or more years.


In [31]:
df['emp_length'].value_counts(dropna=False).sort_index()

emp_length
1 year       148403
10+ years    748005
2 years      203677
3 years      180753
4 years      136605
5 years      139698
6 years      102628
7 years       92695
8 years       91914
9 years       79395
< 1 year     189988
NaN          146907
Name: count, dtype: int64

1. There are 146907 records with no info on emp_length ( Should decide while processing data if we want to retain these records or not)
2. Data does not follow the descritpion in data dictionary


In [32]:
#Get emp_length to consistent format as per data dictionary
df['emp_length'] = df['emp_length'].replace(to_replace='10+ years', value='10 years') # represent 10 or 10+ years
df['emp_length'] = df['emp_length'].replace('< 1 year', '0 years')

In [33]:
#Convert it to integer categories
df['emp_length'] = df['emp_length'].apply(lambda s: np.nan if pd.isnull(s) else int(s.split()[0]))


In [34]:
df['emp_length'].value_counts(dropna=False).sort_index()

emp_length
0.0     189988
1.0     148403
2.0     203677
3.0     180753
4.0     136605
5.0     139698
6.0     102628
7.0      92695
8.0      91914
9.0      79395
10.0    748005
NaN     146907
Name: count, dtype: int64

In [35]:
df['emp_length'].isna().sum()

146907

In [36]:
df = df.dropna(subset= ['emp_length'])

### 10. home_ownership

In [37]:
df['home_ownership'].value_counts(dropna=False).sort_index()

home_ownership
ANY             844
MORTGAGE    1045455
NONE             51
OTHER           179
OWN          222000
RENT         845232
Name: count, dtype: int64

As any and none dont give any particular information about home_ownership we are gonna merge these categories into OTHER

In [38]:
df['home_ownership'] = df['home_ownership'].replace(['NONE', 'ANY'], 'OTHER')

In [39]:
df['home_ownership'].value_counts(dropna=False).sort_index()

home_ownership
MORTGAGE    1045455
OTHER          1074
OWN          222000
RENT         845232
Name: count, dtype: int64

### 11. annual_inc

In [40]:
# convert the rate to numeric
df['annual_inc'] = pd.to_numeric(df['annual_inc'], errors='coerce')
# check for null or non numeric values
print(df['annual_inc'].isna().sum(),  df['annual_inc'].isnull().sum())

4 4


In [41]:
print(df[df['annual_inc'].isna()])

              id  loan_amnt        term  int_rate  installment sub_grade  \
1654329  79967.0     5000.0   36 months      7.43       155.38        A2   
1654330  79924.0     7000.0   36 months      7.75       218.55        A3   
1654360  79893.0     6700.0   36 months      7.75       209.18        A3   
1654413  71623.0     6500.0   36 months      8.38       204.84        A5   

         emp_length home_ownership  annual_inc verification_status   issue_d  \
1654329         0.0          OTHER         NaN        Not Verified  Aug-2007   
1654330         0.0          OTHER         NaN        Not Verified  Aug-2007   
1654360         0.0          OTHER         NaN        Not Verified  Jul-2007   
1654413         0.0          OTHER         NaN        Not Verified  Jun-2007   

                                               loan_status purpose  \
1654329  Does not meet the credit policy. Status:Fully ...   other   
1654330  Does not meet the credit policy. Status:Fully ...   other   
1654360 

 We are gonna remove these records as emp_title ( was not available originally nor annual income) and lot of pther columns are missing

In [42]:
df = df.dropna(subset=['annual_inc'])

In [43]:
print(df[df['annual_inc'].isna()])

Empty DataFrame
Columns: [id, loan_amnt, term, int_rate, installment, sub_grade, emp_length, home_ownership, annual_inc, verification_status, issue_d, loan_status, purpose, title, zip_code, addr_state, dti, earliest_cr_line, fico_range_low, fico_range_high, open_acc, pub_rec, revol_bal, revol_util, total_acc, initial_list_status, application_type, mort_acc, pub_rec_bankruptcies]
Index: []


### 12. verification_status

In [44]:
df['verification_status'].value_counts(dropna=False).sort_index()

verification_status
Not Verified       700488
Source Verified    858543
Verified           554726
Name: count, dtype: int64

Nothing to clean

### 13. issue_d

In [45]:
print(df[df['issue_d'].isnull()])

Empty DataFrame
Columns: [id, loan_amnt, term, int_rate, installment, sub_grade, emp_length, home_ownership, annual_inc, verification_status, issue_d, loan_status, purpose, title, zip_code, addr_state, dti, earliest_cr_line, fico_range_low, fico_range_high, open_acc, pub_rec, revol_bal, revol_util, total_acc, initial_list_status, application_type, mort_acc, pub_rec_bankruptcies]
Index: []


Nothing to clean

### 14. loan_status

In [46]:
df['loan_status'].value_counts(dropna=False).sort_index()

loan_status
Charged Off                                             247429
Current                                                 812847
Default                                                     35
Does not meet the credit policy. Status:Charged Off        746
Does not meet the credit policy. Status:Fully Paid        1965
Fully Paid                                             1019370
In Grace Period                                           7849
Late (16-30 days)                                         3900
Late (31-120 days)                                       19616
Name: count, dtype: int64

In our project, we're investigating what sets apart loans that were successfully repaid from those that ended up in default. We're excluding loans that are still ongoing, don't meet our credit standards, are already in default, or have incomplete status information. Our focus is solely on loans that have either been fully paid off or written off as losses. This approach helps us pinpoint the factors that influence whether a loan will be repaid in full or not

In [47]:
df = df.loc[df['loan_status'].isin(['Fully Paid', 'Charged Off'])]

### 15. purpose

In [48]:
df['purpose'].value_counts(dropna=False).sort_index()

purpose
car                    13780
credit_card           277408
debt_consolidation    737571
educational              318
home_improvement       81253
house                   6823
major_purchase         27773
medical                14253
moving                  8884
other                  72249
renewable_energy         876
small_business         14977
vacation                8372
wedding                 2262
Name: count, dtype: int64

### 16. title

In [49]:
df['title'].value_counts(dropna=False).head()

title
Debt consolidation         623195
Credit card refinancing    234346
Home improvement            69490
Other                       61766
Major purchase              22621
Name: count, dtype: int64

In [50]:
df['title'].value_counts(dropna=False)

title
Debt consolidation             623195
Credit card refinancing        234346
Home improvement                69490
Other                           61766
Major purchase                  22621
                                ...  
advancement1                        1
Quest for Financial Freedom         1
Help Reducing Debt                  1
CCARD LOAN                          1
debt reduction/hone updates         1
Name: count, Length: 59498, dtype: int64

There are 61k titles , based on top values this info is already present in purpose so we drop this column

In [51]:
df.drop('title', axis=1, inplace=True)

### 17. zip_code

In [52]:
df['zip_code'].value_counts(dropna=False)

zip_code
945xx    14252
750xx    14086
112xx    13181
606xx    11832
300xx    11563
         ...  
872xx        1
849xx        1
429xx        1
938xx        1
525xx        1
Name: count, Length: 942, dtype: int64

900+ zip codes, too much for a categorical varibale. there are 51 states whcih is comparatively smaller we drop zip codes and keep states

In [53]:
df.drop('zip_code', axis=1, inplace=True)

### 18. addr_state

In [54]:
len(df['addr_state'].unique())

51

In [55]:
# there are 51 states

In [56]:
df['addr_state'].value_counts(dropna=False)

addr_state
CA    186323
TX    105117
NY    103549
FL     88248
IL     49012
NJ     46238
PA     43015
OH     40996
GA     40765
VA     36255
NC     35192
MI     32139
AZ     30221
MA     29814
MD     29721
CO     28295
WA     27543
MN     22974
IN     20404
MO     19888
TN     18982
CT     18781
NV     18524
WI     16697
AL     15361
OR     15218
SC     14716
LA     14668
KY     11956
OK     11512
KS     10619
UT      9579
AR      9169
NM      6768
HI      6472
NH      6123
MS      6054
RI      5586
WV      4470
MT      3589
DE      3512
NE      3407
DC      3365
AK      3057
WY      2800
SD      2657
VT      2502
ME      1856
ND      1553
ID      1530
IA         7
Name: count, dtype: int64

### 19. dti

A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower’s self-reported monthly income.

In [57]:
# convert the rate to numeric
df['dti'] = pd.to_numeric(df['dti'], errors='coerce')
# check for null or non numeric values
print(df['dti'].isna().sum(),  df['dti'].isnull().sum())

17 17


In [58]:
print(df[df['dti'].isna()])

                  id  loan_amnt        term  int_rate  installment sub_grade  \
510144   127050332.0     2400.0   36 months     10.91        78.48        B4   
513293   126955615.0    13000.0   60 months     19.03       337.45        D3   
539478   119336653.0    25000.0   60 months     12.62       563.98        C1   
546518   119270588.0    15000.0   60 months     25.82       447.51        E4   
555158   118313801.0    25850.0   36 months     14.08       884.50        C3   
918499   109647185.0     7500.0   36 months     10.91       245.23        B4   
1407293  143845908.0    30000.0   60 months     12.98       682.29        B5   
2071206  125224506.0    25000.0   60 months     13.59       576.41        C2   
2087634  124779875.0    15000.0   36 months     10.91       490.45        B4   
2097065  123951342.0    35000.0   60 months     21.45       955.75        D5   
2130051  121289792.0    28000.0   36 months      9.44       896.14        B1   
2150868  120050520.0    40000.0   36 mon

In [59]:
#drop records where dti is not available
df = df.dropna(subset=['dti'])

### 20. earliest_cr_line
 "The month the borrower's earliest reported credit line was opened."

In [60]:
df['earliest_cr_line'].isnull().sum()

0

In [61]:
df['earliest_cr_line'].sample(5)

1309631    May-1988
155372     Sep-2000
88742      Oct-2004
304952     Jul-2002
1024557    Dec-2000
Name: earliest_cr_line, dtype: object

In [62]:
# For simplicity purpose we keep year only

In [63]:
df['earliest_cr_line'] = df['earliest_cr_line'].apply(lambda s: int(s[-4:]))

### 21. fico_range_low
The lower boundary range the borrower’s FICO at loan origination belongs to.

In [64]:
# convert the rate to numeric
df['fico_range_low'] = pd.to_numeric(df['fico_range_low'], errors='coerce')
# check for null or non numeric values
print(df['fico_range_low'].isna().sum(),  df['fico_range_low'].isnull().sum())

0 0


### 22. fico_range_high
The upper boundary range the borrower’s FICO at loan origination belongs to

In [65]:
# convert the rate to numeric
df['fico_range_high'] = pd.to_numeric(df['fico_range_high'], errors='coerce')
# check for null or non numeric values
print(df['fico_range_high'].isna().sum(),  df['fico_range_high'].isnull().sum())

0 0


### 23. open_acc
The number of open credit lines in the borrower's credit file.

In [66]:
# convert the rate to numeric
df['open_acc'] = pd.to_numeric(df['open_acc'], errors='coerce')
# check for null or non numeric values
print(df['open_acc'].isna().sum(),  df['open_acc'].isnull().sum())

0 0


### 24. pub_rec
Number of derogatory public records.

In [67]:
# convert the rate to numeric
df['pub_rec'] = pd.to_numeric(df['pub_rec'], errors='coerce')
# check for null or non numeric values
print(df['pub_rec'].isna().sum(),  df['pub_rec'].isnull().sum())

0 0


### 25. revol_bal
Total credit revolving balance.

In [68]:
# convert the rate to numeric
df['revol_bal'] = pd.to_numeric(df['revol_bal'], errors='coerce')
# check for null or non numeric values
print(df['revol_bal'].isna().sum(),  df['revol_bal'].isnull().sum())

0 0


### 26. revol_util
Revolving line utilization rate, or the amount of credit the borrower is using relative to all available revolving credit

In [69]:
# convert the rate to numeric
df['revol_util'] = pd.to_numeric(df['revol_util'], errors='coerce')
# check for null or non numeric values
print(df['revol_util'].isna().sum(),  df['revol_util'].isnull().sum())

806 806


In [70]:
# drop records where na
df = df.dropna(subset=['revol_util'])

### 27. total_acc
The total number of credit lines currently in the borrower's credit file."

In [71]:
# convert the rate to numeric
df['total_acc'] = pd.to_numeric(df['total_acc'], errors='coerce')
# check for null or non numeric values
print(df['total_acc'].isna().sum(),  df['total_acc'].isnull().sum())

0 0


### 28. initial_list_status
The initial listing status of the loan

In [72]:
df['initial_list_status'].value_counts(dropna=False).sort_index()

initial_list_status
f    529279
w    736697
Name: count, dtype: int64

### 29. application_type

In [73]:
df['application_type'].value_counts(dropna=False).sort_index()

application_type
Individual    1243975
Joint App       22001
Name: count, dtype: int64

### 30. mort_acc
Number of mortgage accounts

In [74]:
# convert the rate to numeric
df['mort_acc'] = pd.to_numeric(df['mort_acc'], errors='coerce')
# check for null or non numeric values
print(df['mort_acc'].isna().sum(),  df['mort_acc'].isnull().sum())

45884 45884


In [75]:
df['mort_acc'].value_counts(dropna=False).sort_index()

mort_acc
0.0     493618
1.0     213537
2.0     177771
3.0     130359
4.0      88522
5.0      53720
6.0      30313
7.0      15637
8.0       7820
9.0       4003
10.0      2049
11.0      1118
12.0       600
13.0       325
14.0       232
15.0       127
16.0        91
17.0        59
18.0        46
19.0        27
20.0        24
21.0        14
22.0        14
23.0        10
24.0        12
25.0         8
26.0         6
27.0         8
28.0         3
29.0         4
30.0         2
31.0         2
32.0         2
34.0         3
35.0         1
36.0         1
37.0         2
47.0         1
51.0         1
NaN      45884
Name: count, dtype: int64

In [76]:
# drop records weere mortgage accounts in not availbale
df = df.dropna(subset= ['mort_acc'])

### 31. pub_rec_bankruptcies
Number of public record bankruptcies.

In [77]:
# convert the rate to numeric
df['pub_rec_bankruptcies'] = pd.to_numeric(df['pub_rec_bankruptcies'], errors='coerce')
# check for null or non numeric values
print(df['pub_rec_bankruptcies'].isna().sum(),  df['pub_rec_bankruptcies'].isnull().sum())

0 0


In [78]:
df['pub_rec_bankruptcies'].value_counts(dropna=False).sort_index()

pub_rec_bankruptcies
0.0     1070080
1.0      140858
2.0        7130
3.0        1461
4.0         361
5.0         133
6.0          42
7.0          14
8.0           8
9.0           3
11.0          1
12.0          1
Name: count, dtype: int64

In [79]:
# drop records weere pub_rec_bankruptcies accounts in not availbale
df = df.dropna(subset= ['pub_rec_bankruptcies'])

In [80]:
total_missing_values = df.isnull().sum().sum()

# Determine the total number of cells in the dataset
total_cells = df.size

# Calculate the percentage of missing data
percentage_missing_data = (total_missing_values / total_cells) * 100

print("Percentage of missing data in the entire dataset:", percentage_missing_data)


Percentage of missing data in the entire dataset: 0.0


## Final Data Distribution

In [81]:
 df['loan_status'].value_counts(normalize=True)

loan_status
Fully Paid     0.802742
Charged Off    0.197258
Name: proportion, dtype: float64

Final Number of Records

In [82]:
len(df)

1220092

In [87]:
#Store cleaned data
df.to_csv("../data/cleaned_data.csv", index=False)