# Bank Loan Term Prediction
---

## Import packages & read data.

In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

# Visualization imports
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Modeling imports
from sklearn.model_selection import train_test_split


In [2]:
df1 = pd.read_csv('credit_train.csv')
df2 = pd.read_csv('credit_test.csv')
df= pd.concat([df1,df2], axis=0, ignore_index=True)   # concat two dataset
df

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,14dd8831-6af5-400b-83ec-68e61888a048,981165ec-3274-42f5-a3b4-d104041a9ca9,Fully Paid,445412.0,Short Term,709.0,1167493.0,8 years,Home Mortgage,Home Improvements,5214.74,17.2,,6.0,1.0,228190.0,416746.0,1.0,0.0
1,4771cc26-131a-45db-b5aa-537ea4ba5342,2de017a3-2e01-49cb-a581-08169e83be29,Fully Paid,262328.0,Short Term,,,10+ years,Home Mortgage,Debt Consolidation,33295.98,21.1,8.0,35.0,0.0,229976.0,850784.0,0.0,0.0
2,4eed4e6a-aa2f-4c91-8651-ce984ee8fb26,5efb2b2b-bf11-4dfd-a572-3761a2694725,Fully Paid,99999999.0,Short Term,741.0,2231892.0,8 years,Own Home,Debt Consolidation,29200.53,14.9,29.0,18.0,1.0,297996.0,750090.0,0.0,0.0
3,77598f7b-32e7-4e3b-a6e5-06ba0d98fe8a,e777faab-98ae-45af-9a86-7ce5b33b1011,Fully Paid,347666.0,Long Term,721.0,806949.0,3 years,Own Home,Debt Consolidation,8741.90,12.0,,9.0,0.0,256329.0,386958.0,0.0,0.0
4,d4062e70-befa-4995-8643-a0de73938182,81536ad9-5ccf-4eb8-befb-47a4d608658e,Fully Paid,176220.0,Short Term,,,5 years,Rent,Debt Consolidation,20639.70,6.1,,15.0,0.0,253460.0,427174.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110862,,,,,,,,,,,,,,,,,,,
110863,,,,,,,,,,,,,,,,,,,
110864,,,,,,,,,,,,,,,,,,,
110865,,,,,,,,,,,,,,,,,,,


In [3]:
df.shape

(110867, 19)

In [4]:
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110867 entries, 0 to 110866
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Loan ID                       110000 non-null  object 
 1   Customer ID                   110000 non-null  object 
 2   Loan Status                   100000 non-null  object 
 3   Current Loan Amount           110000 non-null  float64
 4   Term                          110000 non-null  object 
 5   Credit Score                  88865 non-null   float64
 6   Annual Income                 88865 non-null   float64
 7   Years in current job          105351 non-null  object 
 8   Home Ownership                110000 non-null  object 
 9   Purpose                       110000 non-null  object 
 10  Monthly Debt                  110000 non-null  float64
 11  Years of Credit History       110000 non-null  float64
 12  Months since last delinquent  51553 non-null

In [5]:
duplicate = df1.duplicated()
print(f'Duplicate in df1 :', duplicate.sum())
duplicate = df2.duplicated()
print(f'Duplicate in df2 :', duplicate.sum())
duplicate = df.duplicated()
print(f'Duplicate in df :', duplicate.sum())

Duplicate in df1 : 10728
Duplicate in df2 : 352
Duplicate in df : 11081


**Rename columns for easer code writing**

In [6]:
df.columns

Index(['Loan ID', 'Customer ID', 'Loan Status', 'Current Loan Amount', 'Term',
       'Credit Score', 'Annual Income', 'Years in current job',
       'Home Ownership', 'Purpose', 'Monthly Debt', 'Years of Credit History',
       'Months since last delinquent', 'Number of Open Accounts',
       'Number of Credit Problems', 'Current Credit Balance',
       'Maximum Open Credit', 'Bankruptcies', 'Tax Liens'],
      dtype='object')

In [7]:
df.columns = df.columns.str.replace(' ','_')

In [8]:
df.columns

Index(['Loan_ID', 'Customer_ID', 'Loan_Status', 'Current_Loan_Amount', 'Term',
       'Credit_Score', 'Annual_Income', 'Years_in_current_job',
       'Home_Ownership', 'Purpose', 'Monthly_Debt', 'Years_of_Credit_History',
       'Months_since_last_delinquent', 'Number_of_Open_Accounts',
       'Number_of_Credit_Problems', 'Current_Credit_Balance',
       'Maximum_Open_Credit', 'Bankruptcies', 'Tax_Liens'],
      dtype='object')

## Split the data for train, validation and test

In [9]:
# split the data for train and test
df_Train, df_test = train_test_split(df, test_size = 0.2, random_state = 30 )

# split the train for train and val
df_train, df_val = train_test_split(df_Train, test_size = 0.2, random_state = 30 )

In [10]:
print(f'Shape of train:', df_train.shape)
print(f'Shape of validation:', df_val.shape)
print(f'Shape of test:', df_test.shape)

Shape of train: (70954, 19)
Shape of validation: (17739, 19)
Shape of test: (22174, 19)


## Data Pre-processing

### Cleaning data

In [11]:
# reset index for train
df_train = df_train.reset_index(drop=True)

# reset index for val
df_val = df_val.reset_index(drop=True)

# reset index for val
df_test = df_test.reset_index(drop=True)

In [12]:
df_train.sample(20)

Unnamed: 0,Loan_ID,Customer_ID,Loan_Status,Current_Loan_Amount,Term,Credit_Score,Annual_Income,Years_in_current_job,Home_Ownership,Purpose,Monthly_Debt,Years_of_Credit_History,Months_since_last_delinquent,Number_of_Open_Accounts,Number_of_Credit_Problems,Current_Credit_Balance,Maximum_Open_Credit,Bankruptcies,Tax_Liens
28889,c5464d82-22c1-4a5b-9e4c-32cee8f879e9,d703c620-873a-490b-b877-2278195de408,Fully Paid,282040.0,Long Term,660.0,1937221.0,5 years,Home Mortgage,Debt Consolidation,11849.35,21.8,21.0,10.0,0.0,67260.0,171534.0,0.0,0.0
44621,db0c0ac5-6b27-4294-9b38-22a0d4a8f616,b9d61c58-4161-4545-bd41-e9974c8e859e,Charged Off,262922.0,Long Term,6390.0,2605736.0,10+ years,Home Mortgage,Debt Consolidation,28663.02,22.2,36.0,22.0,0.0,301530.0,416152.0,0.0,0.0
3853,c8c2dd1e-d13f-4271-bfd5-78143fd1b03c,8eb5dc89-99e5-4310-a6c1-68392e30e87c,Fully Paid,379874.0,Long Term,713.0,1018495.0,10+ years,Home Mortgage,Debt Consolidation,19520.98,22.8,27.0,17.0,0.0,227164.0,426998.0,0.0,0.0
46374,d404a49b-55a3-44d4-a26f-b72d77d48344,b3c4b7d8-b4ed-4da0-82be-739312f08000,,320694.0,Short Term,7260.0,769880.0,9 years,Own Home,Debt Consolidation,15269.54,10.6,65.0,4.0,0.0,43776.0,54384.0,0.0,0.0
41436,a1c4532d-6c0c-40f5-97e6-eb5ca8125507,a27a1738-ca9a-48d1-b6d2-0fcb548f6888,Fully Paid,778316.0,Short Term,680.0,2496676.0,7 years,Home Mortgage,Debt Consolidation,65745.7,30.1,43.0,19.0,0.0,933736.0,1127412.0,0.0,0.0
41984,c4603ec2-de7d-469e-bea2-4cc67dbf7a6d,dee05845-c164-4409-ad78-206311a8d0a5,Fully Paid,220132.0,Short Term,733.0,988589.0,8 years,Own Home,Debt Consolidation,13428.25,22.7,18.0,9.0,1.0,184699.0,587532.0,0.0,0.0
31106,755ac05d-bad8-4c30-acf2-1f4aa9c309c5,1a243590-97cb-4fd4-824b-1364bc156a30,Fully Paid,33264.0,Short Term,742.0,1662234.0,10+ years,Rent,other,12632.91,20.2,,9.0,0.0,137465.0,168608.0,0.0,0.0
60727,9b688614-7c15-4db0-a3a2-3e0c8cf2c8ce,39d6784d-9e81-4b38-ba75-ada58bdd68e5,Fully Paid,306856.0,Long Term,,,2 years,Home Mortgage,other,22074.2,9.0,,13.0,0.0,217303.0,357918.0,0.0,0.0
45470,d05a8a87-cf99-44e5-a5c3-306ccea9b194,bcea09a8-1099-43d4-b2da-8c8500357df3,Fully Paid,100342.0,Short Term,733.0,962825.0,< 1 year,Home Mortgage,other,14201.74,19.2,45.0,10.0,0.0,287052.0,588280.0,0.0,0.0
1902,eceae391-315c-4516-bbd7-956b9eb4cae1,7f9d174e-efd8-44be-8d15-73a3ad69d444,Fully Paid,538252.0,Long Term,,,10+ years,Home Mortgage,Debt Consolidation,43831.1,25.9,23.0,9.0,0.0,286767.0,617188.0,0.0,0.0


In [13]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70954 entries, 0 to 70953
Data columns (total 19 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Loan_ID                       70374 non-null  object 
 1   Customer_ID                   70374 non-null  object 
 2   Loan_Status                   63977 non-null  object 
 3   Current_Loan_Amount           70374 non-null  float64
 4   Term                          70374 non-null  object 
 5   Credit_Score                  56789 non-null  float64
 6   Annual_Income                 56789 non-null  float64
 7   Years_in_current_job          67446 non-null  object 
 8   Home_Ownership                70374 non-null  object 
 9   Purpose                       70374 non-null  object 
 10  Monthly_Debt                  70374 non-null  float64
 11  Years_of_Credit_History       70374 non-null  float64
 12  Months_since_last_delinquent  33027 non-null  float64
 13  N

In [14]:
df_train.dtypes

Loan_ID                          object
Customer_ID                      object
Loan_Status                      object
Current_Loan_Amount             float64
Term                             object
Credit_Score                    float64
Annual_Income                   float64
Years_in_current_job             object
Home_Ownership                   object
Purpose                          object
Monthly_Debt                    float64
Years_of_Credit_History         float64
Months_since_last_delinquent    float64
Number_of_Open_Accounts         float64
Number_of_Credit_Problems       float64
Current_Credit_Balance          float64
Maximum_Open_Credit             float64
Bankruptcies                    float64
Tax_Liens                       float64
dtype: object

In [15]:
df_train.isnull().sum()

Loan_ID                           580
Customer_ID                       580
Loan_Status                      6977
Current_Loan_Amount               580
Term                              580
Credit_Score                    14165
Annual_Income                   14165
Years_in_current_job             3508
Home_Ownership                    580
Purpose                           580
Monthly_Debt                      580
Years_of_Credit_History           580
Months_since_last_delinquent    37927
Number_of_Open_Accounts           580
Number_of_Credit_Problems         580
Current_Credit_Balance            580
Maximum_Open_Credit               582
Bankruptcies                      722
Tax_Liens                         584
dtype: int64

In [16]:
# check for dublicate

# for train
duplicate = df_train.duplicated()
print(f'Duplicate in train :', duplicate.sum())

# for val
duplicate = df_val.duplicated()
print(f'Duplicate in validation :', duplicate.sum())

# for test
duplicate = df_test.duplicated()
print(f'Duplicate in test :', duplicate.sum())

Duplicate in train : 4757
Duplicate in validation : 385
Duplicate in test : 546


In [17]:
print(f'The duplicate in Loan ID in train:',df_train.Loan_ID.duplicated().sum())
print(f'The duplicate in Customer ID in train:',df_train.Customer_ID.duplicated().sum())

print(f'The duplicate in Loan ID in val:',df_val.Loan_ID.duplicated().sum())
print(f'The duplicate in Customer ID in train:',df_val.Customer_ID.duplicated().sum())

print(f'The duplicate in Loan ID in taes:',df_test.Loan_ID.duplicated().sum())
print(f'The duplicate in Customer ID in train:',df_test.Customer_ID.duplicated().sum())

The duplicate in Loan ID in train: 9395
The duplicate in Customer ID in train: 9395
The duplicate in Loan ID in val: 701
The duplicate in Customer ID in train: 701
The duplicate in Loan ID in taes: 1028
The duplicate in Customer ID in train: 1028


In [18]:
df_train['Loan_ID'].value_counts().sort_values(ascending=False)

09841ac1-c9b0-463e-bc4f-7069f0e0e2a7    2
eceb365e-ee2e-4097-9b35-f83f35bac81d    2
26642e91-7173-46df-8367-bdf9d4229b45    2
83e2491b-b4e5-45d0-bca4-94fba0f89c4c    2
6082e6cd-7d9b-411b-b3c1-6ada63ca11c0    2
                                       ..
753425fa-b2a7-4e08-91ca-630983f49e2c    1
642a2363-c22f-45b2-a8ce-8c3215a1f33a    1
55af76f6-e747-426f-8925-f109625beaf3    1
e5020fa0-f9e8-4d0f-9870-59973cc83372    1
7e2065fa-ae16-4ede-a3d1-263bc94f4585    1
Name: Loan_ID, Length: 61558, dtype: int64

In [19]:
df_train[df_train['Loan_Status'].isna()]

Unnamed: 0,Loan_ID,Customer_ID,Loan_Status,Current_Loan_Amount,Term,Credit_Score,Annual_Income,Years_in_current_job,Home_Ownership,Purpose,Monthly_Debt,Years_of_Credit_History,Months_since_last_delinquent,Number_of_Open_Accounts,Number_of_Credit_Problems,Current_Credit_Balance,Maximum_Open_Credit,Bankruptcies,Tax_Liens
21,ab6ec219-d9c4-42dc-b26b-263eeb0b3130,6cbb45c2-c9b7-4608-86ba-b9872ce257b7,,268070.0,Long Term,715.0,1678460.0,4 years,Rent,Debt Consolidation,22799.05,30.6,41.0,10.0,0.0,1006088.0,1199726.0,0.0,0.0
28,c1e7d305-0b59-4859-baf5-d923f7379c76,9892c197-194c-4af1-8c6f-4a5111cecdab,,70796.0,Short Term,718.0,487749.0,1 year,Rent,Debt Consolidation,7560.10,8.1,,11.0,0.0,69141.0,119834.0,0.0,0.0
48,dc4b78d3-8b20-4eb9-901e-ed94a72d3797,35c69687-d0e9-4005-93e8-734eae4e7610,,358644.0,Short Term,747.0,1548728.0,10+ years,Home Mortgage,Debt Consolidation,28135.20,18.4,30.0,20.0,0.0,268356.0,1204368.0,0.0,0.0
67,b2df4e19-c4ba-4765-ac4c-5d35d09edb9a,ed6d3a88-2d13-47a9-a780-6a82d4046e37,,430672.0,Long Term,677.0,1208818.0,2 years,Rent,Debt Consolidation,21053.52,12.9,,8.0,0.0,416024.0,700172.0,0.0,0.0
69,88912a4b-06bf-4e1d-8c6e-0af2a8b9d3d8,8eb08da1-bc26-4720-a620-578cd0116ade,,779834.0,Short Term,705.0,1790199.0,5 years,Rent,Debt Consolidation,17305.20,13.4,,14.0,0.0,717554.0,1308406.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70922,b4a6e31b-102b-4627-bc54-2bdf3e75f130,89983a2e-55cd-4126-a1c6-18d4701c91a6,,223520.0,Short Term,,,4 years,Rent,Debt Consolidation,12312.76,5.0,,12.0,0.0,189164.0,538142.0,0.0,0.0
70927,d3c0b600-2ad5-4515-b81b-a9aee31ccc25,08a8961c-a21c-4b8a-ba1d-2ee9b908afe0,,178332.0,Short Term,702.0,1270682.0,10+ years,Home Mortgage,Debt Consolidation,36849.74,18.7,,20.0,0.0,1071182.0,1789788.0,0.0,0.0
70934,793b9f2c-9422-49a8-98c3-993018ab52d9,9545d44d-7301-4547-bab1-07c1485fc89d,,216106.0,Short Term,727.0,802541.0,2 years,Rent,Debt Consolidation,16385.22,15.9,20.0,16.0,0.0,190247.0,490600.0,0.0,0.0
70941,853645ee-e763-4d37-99b0-cb130af90ae0,cc1f3bec-4f9d-4b71-9b0f-740f4ff06536,,99999999.0,Long Term,677.0,549385.0,6 years,Rent,Buy a Car,3355.78,9.5,,4.0,0.0,100662.0,143550.0,0.0,0.0


In [20]:
df_train[df_train['Loan_ID'] == '53bf4fc0-8951-4329-9965-a81a852df395']

Unnamed: 0,Loan_ID,Customer_ID,Loan_Status,Current_Loan_Amount,Term,Credit_Score,Annual_Income,Years_in_current_job,Home_Ownership,Purpose,Monthly_Debt,Years_of_Credit_History,Months_since_last_delinquent,Number_of_Open_Accounts,Number_of_Credit_Problems,Current_Credit_Balance,Maximum_Open_Credit,Bankruptcies,Tax_Liens
18849,53bf4fc0-8951-4329-9965-a81a852df395,0d58dc49-dd92-4887-8117-0094d2e8ea56,,62304.0,Short Term,722.0,999172.0,10+ years,Rent,Debt Consolidation,8184.82,18.7,9.0,4.0,1.0,46474.0,79134.0,1.0,0.0
54765,53bf4fc0-8951-4329-9965-a81a852df395,0d58dc49-dd92-4887-8117-0094d2e8ea56,Fully Paid,62304.0,Short Term,722.0,999172.0,10+ years,Rent,Debt Consolidation,8184.82,18.7,9.0,4.0,1.0,46474.0,79134.0,1.0,0.0


**The reasone for the repetition becouse the loan status has been changed from charged off to fully paid.**
Dropping the duplicat loan id and keeping only the fully paid loans.

In [21]:
df_train['Loan_Status'].unique()

array(['Fully Paid', 'Charged Off', nan], dtype=object)

In [22]:
df_train = df_train[~((df_train.Loan_ID.duplicated())
                      & (df_train['Loan_Status'] != 'Fully Paid' ) 
                      & (df_train['Loan_Status'] != 'Charged Off' ))]
df_val = df_val[~((df_val.Loan_ID.duplicated())
                  & (df_val['Loan_Status'] != 'Fully Paid' ) 
                  & (df_val['Loan_Status'] != 'Charged Off' ))]
df_test = df_test[~((df_test.Loan_ID.duplicated()) 
                    & (df_test['Loan_Status'] != 'Fully Paid' )
                    & (df_test['Loan_Status'] != 'Charged Off' ))]

In [23]:
df_train[df_train['Loan_ID'] == '53bf4fc0-8951-4329-9965-a81a852df395']

Unnamed: 0,Loan_ID,Customer_ID,Loan_Status,Current_Loan_Amount,Term,Credit_Score,Annual_Income,Years_in_current_job,Home_Ownership,Purpose,Monthly_Debt,Years_of_Credit_History,Months_since_last_delinquent,Number_of_Open_Accounts,Number_of_Credit_Problems,Current_Credit_Balance,Maximum_Open_Credit,Bankruptcies,Tax_Liens
18849,53bf4fc0-8951-4329-9965-a81a852df395,0d58dc49-dd92-4887-8117-0094d2e8ea56,,62304.0,Short Term,722.0,999172.0,10+ years,Rent,Debt Consolidation,8184.82,18.7,9.0,4.0,1.0,46474.0,79134.0,1.0,0.0
54765,53bf4fc0-8951-4329-9965-a81a852df395,0d58dc49-dd92-4887-8117-0094d2e8ea56,Fully Paid,62304.0,Short Term,722.0,999172.0,10+ years,Rent,Debt Consolidation,8184.82,18.7,9.0,4.0,1.0,46474.0,79134.0,1.0,0.0


In [24]:
print(f'The duplicate in Loan ID in train:',df_train.Loan_ID.duplicated().sum())
print(f'The duplicate in Customer ID in train:',df_train.Customer_ID.duplicated().sum())

print(f'The duplicate in Loan ID in val:',df_val.Loan_ID.duplicated().sum())
print(f'The duplicate in Customer ID in train:',df_val.Customer_ID.duplicated().sum())

print(f'The duplicate in Loan ID in taes:',df_test.Loan_ID.duplicated().sum())
print(f'The duplicate in Customer ID in train:',df_test.Customer_ID.duplicated().sum())

The duplicate in Loan ID in train: 8057
The duplicate in Customer ID in train: 8057
The duplicate in Loan ID in val: 522
The duplicate in Customer ID in train: 522
The duplicate in Loan ID in taes: 804
The duplicate in Customer ID in train: 804


In [25]:
# dope nulls in Loan_ID & Loan_Status

# for train
df_train = df_train.dropna(subset = ['Loan_ID'])
# df_train = df_train.dropna(subset = ['Loan_Status'])

# for val
df_val = df_val.dropna(subset = ['Loan_ID'])
# df_val = df_val.dropna(subset = ['Loan_Status'])

# for test
df_test = df_test.dropna(subset = ['Loan_ID'])
# df_test = df_test.dropna(subset = ['Loan_Status'])

In [26]:
print(f'Shape of train:', df_train.shape)
print(f'Shape of validation:', df_val.shape)
print(f'Shape of test:', df_test.shape)

Shape of train: (69615, 19)
Shape of validation: (17559, 19)
Shape of test: (21949, 19)


In [27]:
# check again for dublicate

# for train
duplicate = df_train.duplicated()
print(f'Duplicate in train :', duplicate.sum())

# for val
duplicate = df_val.duplicated()
print(f'Duplicate in validation :', duplicate.sum())

# for test
duplicate = df_test.duplicated()
print(f'Duplicate in test :', duplicate.sum())

Duplicate in train : 4178
Duplicate in validation : 254
Duplicate in test : 392


In [28]:
df_train.isnull().sum() # train

Loan_ID                             0
Customer_ID                         0
Loan_Status                      5638
Current_Loan_Amount                 0
Term                                0
Credit_Score                    13490
Annual_Income                   13490
Years_in_current_job             2899
Home_Ownership                      0
Purpose                             0
Monthly_Debt                        0
Years_of_Credit_History             0
Months_since_last_delinquent    36991
Number_of_Open_Accounts             0
Number_of_Credit_Problems           0
Current_Credit_Balance              0
Maximum_Open_Credit                 2
Bankruptcies                      142
Tax_Liens                           4
dtype: int64

In [29]:
# drop nulls in Annual_Income

# for train
df_train = df_train[~df_train.Annual_Income.isna()]

# for val
df_val = df_val[~df_val.Annual_Income.isna()]

# for test
df_test = df_test[~df_test.Annual_Income.isna()]

In [30]:
print(f'Shape of train:', df_train.shape)
print(f'Shape of validation:', df_val.shape)
print(f'Shape of test:', df_test.shape)

Shape of train: (56125, 19)
Shape of validation: (14177, 19)
Shape of test: (17795, 19)


In [31]:
df_train.isnull().sum() # train

Loan_ID                             0
Customer_ID                         0
Loan_Status                      4462
Current_Loan_Amount                 0
Term                                0
Credit_Score                        0
Annual_Income                       0
Years_in_current_job             2354
Home_Ownership                      0
Purpose                             0
Monthly_Debt                        0
Years_of_Credit_History             0
Months_since_last_delinquent    29657
Number_of_Open_Accounts             0
Number_of_Credit_Problems           0
Current_Credit_Balance              0
Maximum_Open_Credit                 1
Bankruptcies                      114
Tax_Liens                           2
dtype: int64

In [32]:
df_train['Credit_Score'].nunique()

323

In [None]:
# # fill nulls in Credit_Score with median

# # for train
# median = df_train['Credit_Score'].median()
# df_train['Credit_Score'].fillna(median, inplace=True)

# # for val
# median = df_val['Credit_Score'].median()
# df_val['Credit_Score'].fillna(median, inplace=True)

# # for test
# median = df_test['Credit_Score'].median()
# df_test['Credit_Score'].fillna(median, inplace=True)



# # fill nulls in Annual_Income with median

# # for train
# median = df_train['Annual_Income'].median()
# df_train['Annual_Income'].fillna(median, inplace=True)

# # for val
# median = df_val['Annual_Income'].median()
# df_val['Annual_Income'].fillna(median, inplace=True)

# # for test
# median = df_test['Annual_Income'].median()
# df_test['Annual_Income'].fillna(median, inplace=True)

In [33]:
# fill nulls in Years_in_current_job & Months_since_last_delinquent

# for train
df_train['Years_in_current_job'] = df_train['Years_in_current_job'].fillna(0)
df_train['Months_since_last_delinquent'] = df_train['Months_since_last_delinquent'].fillna(0)


# for val
df_val['Years_in_current_job'] = df_val['Years_in_current_job'].fillna(0)
df_val['Months_since_last_delinquent'] = df_val['Months_since_last_delinquent'].fillna(0)


# for test
df_test['Years_in_current_job'] = df_test['Years_in_current_job'].fillna(0)
df_test['Months_since_last_delinquent'] = df_test['Months_since_last_delinquent'].fillna(0)

In [34]:
df_train.isnull().sum()

Loan_ID                            0
Customer_ID                        0
Loan_Status                     4462
Current_Loan_Amount                0
Term                               0
Credit_Score                       0
Annual_Income                      0
Years_in_current_job               0
Home_Ownership                     0
Purpose                            0
Monthly_Debt                       0
Years_of_Credit_History            0
Months_since_last_delinquent       0
Number_of_Open_Accounts            0
Number_of_Credit_Problems          0
Current_Credit_Balance             0
Maximum_Open_Credit                1
Bankruptcies                     114
Tax_Liens                          2
dtype: int64

In [35]:
# drop nulls 

# for train
df_train = df_train.dropna()

# for val
df_val = df_val.dropna()

# for test
df_test = df_test.dropna()

In [36]:
df_train.isnull().sum()

Loan_ID                         0
Customer_ID                     0
Loan_Status                     0
Current_Loan_Amount             0
Term                            0
Credit_Score                    0
Annual_Income                   0
Years_in_current_job            0
Home_Ownership                  0
Purpose                         0
Monthly_Debt                    0
Years_of_Credit_History         0
Months_since_last_delinquent    0
Number_of_Open_Accounts         0
Number_of_Credit_Problems       0
Current_Credit_Balance          0
Maximum_Open_Credit             0
Bankruptcies                    0
Tax_Liens                       0
dtype: int64