In [187]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
%matplotlib inline
pd.set_option('future.no_silent_downcasting', True)

In [188]:
# Read the training split and preview its first rows.
train_csv = 'ressources/datasets/train_loans.csv'
df_train = pd.read_csv(train_csv)
display(df_train.head())

# Read the test split and preview its first rows.
test_csv = 'ressources/datasets/test_loans.csv'
df_test = pd.read_csv(test_csv)
display(df_test.head())

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [189]:
# Column dtypes and non-null counts of the training set.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [190]:
# Number of missing values per column in the training set.
df_train.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [191]:
# Flag every training row holding at least one NaN, then count the flagged rows.
has_missing = df_train.isna().any(axis='columns')
nb_row = has_missing.sum()
print(f"There is {nb_row} rows with at least one missing value")

There is 134 rows with at least one missing value


Some values are missing in the training dataset, especially for the credit history (`Credit_History`, 50 missing values).

In [192]:
# Number of missing values per column in the test set.
df_test.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [None]:
def fill_clean_variables(dataset):
    """Impute, encode and reshape a loans dataset into an all-numeric frame.

    Steps:
      * numeric columns (int64/float64): NaN is replaced by the column mean;
      * ``ApplicantIncome`` and ``CoapplicantIncome`` are summed into
        ``Total_Income`` and the two source columns are dropped;
      * object columns: ``Loan_ID`` is discarded (if present), NaN is replaced
        by the most frequent value, then each column is integer-encoded
        following the sorted order of its labels
        (e.g. Rural: 0, Semiurban: 1, Urban: 2);
      * ``Loan_Status``, when present with only 'Y'/'N' labels, is always
        encoded as Y -> 1, N -> 0.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw data; must contain ``ApplicantIncome`` and ``CoapplicantIncome``.
        The frame itself is left untouched.

    Returns
    -------
    pandas.DataFrame
        The numeric columns followed by the encoded categorical columns.

    Raises
    ------
    KeyError
        If ``ApplicantIncome`` or ``CoapplicantIncome`` is missing.

    Notes
    -----
    Means, modes and category codes are derived from ``dataset`` alone, so the
    codes of two separately processed frames only line up when both frames
    contain the same set of labels in each column.
    """
    # Split numerical and categorical data
    numeric = dataset.select_dtypes(include=['int64', 'float64'])
    categorical = dataset.select_dtypes(include=['object'])

    # Remove the useless 'Loan_ID' identifier column (tolerate its absence)
    categorical = categorical.drop(columns='Loan_ID', errors='ignore')

    # Fill NaN numerical values with the column mean
    numeric = numeric.fillna(numeric.mean())

    # Merge both income variables into a single column. The result of drop()
    # must be re-assigned: drop() returns a new frame and does not act in place.
    numeric['Total_Income'] = numeric['ApplicantIncome'] + numeric['CoapplicantIncome']
    numeric = numeric.drop(columns=['ApplicantIncome', 'CoapplicantIncome'])

    for column in categorical.columns:
        values = categorical[column]

        # Fill NaN categorical values with the most frequent one; an all-NaN
        # column has no mode, in which case there is nothing to fill with.
        modes = values.mode()
        if not modes.empty:
            values = values.fillna(modes.iloc[0])
        values = values.astype(str)

        # Pin the target mapping explicitly. A generic sorted encoding only
        # yields N: 0 / Y: 1 when both classes are present in the data.
        if column == 'Loan_Status' and values.isin(['Y', 'N']).all():
            categorical[column] = values.map({'Y': 1, 'N': 0}).astype('int64')
            continue

        # Integer codes following the sorted label order
        codes, _ = pd.factorize(values, sort=True)
        categorical[column] = codes.astype('int64')

    # Concatenate the numerical and categorical sub-datasets
    return pd.concat([numeric, categorical], axis=1)


In [217]:
# Run the cleaning/encoding routine on each split independently.
df_train_final = df_train.pipe(fill_clean_variables)
df_test_final = df_test.pipe(fill_clean_variables)

In [None]:
# TODO: standardize the numerical values?


Unnamed: 0,ApplicantIncome,CoapplicantIncome
0,5849,0.0
1,4583,1508.0
2,3000,0.0


0    5849.0
1    6091.0
2    3000.0
Name: Total_Income, dtype: float64

In [210]:
# Show the raw applicant income next to the 'ApplicantIncomeLog' column.
# NOTE(review): 'ApplicantIncomeLog' is not created by any cell visible here;
# presumably a log transform of ApplicantIncome — confirm the cell that adds it
# to df_train runs before this one on a fresh kernel.
display(df_train[['ApplicantIncome', 'ApplicantIncomeLog']])

Unnamed: 0,ApplicantIncome,ApplicantIncomeLog
0,5849,8.674197
1,4583,8.430327
2,3000,8.006701
3,2583,7.857094
4,6000,8.699681
...,...,...
609,2900,7.972811
610,4106,8.320448
611,8072,8.996280
612,7583,8.933796
