In [1]:
# email: sengtian@yahoo.com
#
# dataset:
#    1) dataset/loan_prediction_train.csv
#    2) https://datahack.analyticsvidhya.com/contest/practice-problem-loan-prediction-iii/?utm_source=auto-email
#
# Course at Analytics Vidhya:
#    https://courses.analyticsvidhya.com/courses/take/loan-prediction-practice-problem-using-python/texts/6119745-model-building-part-ii
#
# This notebook uses PyCaret to iterate over all available/supported models

In [2]:
%config IPCompleter.greedy=True

import pandas as pd
import os

# Lift pandas' display limits (rows, columns, overall width, column width)
# so dataframes are shown in full in the notebook.
for display_option in ('display.max_rows', 'display.max_columns',
                       'display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)


In [3]:
# Print the installed PyCaret version.
# NOTE: this cell needs the `pycaret` package in the active kernel environment;
# the saved run of this cell ended in ModuleNotFoundError, and the cells after
# it carry no execution count.
from pycaret.utils import version

version()


ModuleNotFoundError: No module named 'pycaret'

In [None]:
# Locate the training CSV: <current working directory>/dataset/<file name>.
dataset_dir = 'dataset'
dataset_filename = 'loan_prediction_train.csv'
dataset_fullname = os.path.join(os.getcwd(), dataset_dir, dataset_filename)

print(dataset_fullname)

# Read the CSV into a dataframe and preview its first 10 rows.
df = pd.read_csv(dataset_fullname)
df.head(10)


In [None]:
# Show the shape of the training dataframe as loaded, before Loan_ID is dropped.
print(df.shape)


In [None]:
# Drop Loan_ID since it will not be needed in training and prediction.
# errors='ignore' keeps the cell safe to re-run: once the column is gone,
# dropping it again is a no-op instead of raising KeyError.
df = df.drop(columns='Loan_ID', errors='ignore')

print(df.shape)


In [None]:
# Pull PyCaret's classification API into the notebook namespace
# (the later cells call its functions unqualified).
from pycaret.classification import *

# Columns grouped by how they are declared to setup().
categorical_columns = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Credit_History',
    'Property_Area',
]
numeric_columns = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
]

# Initialise the experiment on the training dataframe with Loan_Status as target.
experiment = setup(
    data=df,
    target='Loan_Status',
    train_size=0.75,
    session_id=123,
    categorical_features=categorical_columns,
    categorical_imputation='constant',
    numeric_features=numeric_columns,
    numeric_imputation='median',
    feature_selection=True,
    silent=True,
)


In [None]:
# Compare the candidate classifiers. An empty blacklist is passed and turbo is
# switched off (presumably so that no estimator is skipped; confirm against the
# documentation of the installed PyCaret version).
compare_models(blacklist=[], turbo=False)

In [None]:
# From the comparison above, Ridge Classifier and Linear Discriminant Analysis reach the same accuracy

# Ridge Classifier

In [None]:
# Create the model registered under the 'ridge' ID (the Ridge Classifier of this section).
ridge = create_model('ridge')

In [None]:
# Tune the 'ridge' model; the result is reused by the predict_model calls further down.
# NOTE(review): the model ID string is passed here, not the `ridge` object
# created in the previous cell. Confirm this matches the tune_model signature
# of the installed PyCaret version.
tuned_ridge = tune_model('ridge')

In [None]:
# Run predict_model on the tuned Ridge model without passing new data
# (presumably this scores the hold-out split created by setup; confirm in the PyCaret docs).
predict_model(tuned_ridge)

# Linear Discriminant Analysis

In [None]:
# Create the model registered under the 'lda' ID (Linear Discriminant Analysis).
lda = create_model('lda')

In [None]:
# Tune the 'lda' model; the result is evaluated in the next cell.
# NOTE(review): the model ID string is passed here, not the `lda` object
# created in the previous cell. Confirm this matches the tune_model signature
# of the installed PyCaret version.
tuned_lda = tune_model('lda')

In [None]:
# Run predict_model on the tuned LDA model without passing new data
# (presumably this scores the hold-out split created by setup; confirm in the PyCaret docs).
predict_model(tuned_lda)

In [None]:
# Ridge Classifier has a higher score than Linear Discriminant Analysis

# Predict on Unseen Data

In [None]:
# Locate the test CSV: <current working directory>/dataset/<file name>.
dataset_dir = 'dataset'
dataset_filename = 'loan_prediction_test.csv'
dataset_fullname = os.path.join(os.getcwd(), dataset_dir, dataset_filename)

print(dataset_fullname)

# Read the unseen data into its own dataframe and preview its first 10 rows.
t_df = pd.read_csv(dataset_fullname)
t_df.head(10)


In [None]:
# Apply the tuned Ridge model to the unseen test dataframe and preview the first 10 rows.
unseen_predictions = predict_model(tuned_ridge, data=t_df)
unseen_predictions.head(10)

In [None]:
# Give the 'Label' column (presumably the predicted class added by predict_model)
# the 'Loan_Status' header, then preview the first 10 rows.
column_renames = {'Label': 'Loan_Status'}
unseen_predictions = unseen_predictions.rename(columns=column_renames)
unseen_predictions.head(10)

In [None]:
# Translate the numeric labels 1/0 into the 'Y'/'N' status codes.
# 'Y' and 'N' map to themselves so that re-running this cell leaves the column
# intact; with only {1: 'Y', 0: 'N'} a second run would turn every value into
# NaN. Any other value still becomes NaN, exactly as before.
status_codes = {1: 'Y', 0: 'N', 'Y': 'Y', 'N': 'N'}
unseen_predictions['Loan_Status'] = unseen_predictions['Loan_Status'].map(status_codes)
unseen_predictions.head(10)

In [None]:
# Keep only the two columns that go into the saved predictions:
# the loan identifier and its predicted status.
submission_columns = ['Loan_ID', 'Loan_Status']
save_unseen_predictions = unseen_predictions.filter(items=submission_columns)
save_unseen_predictions.head(10)

In [None]:
# Write the predictions to <current working directory>/dataset/<submission file>.
t_dataset_dir = 'dataset'
t_dataset_filename = 'loan_prediction_submission_pycaret.csv'

# Use this cell's own t_dataset_dir (not the `dataset_dir` global left behind by
# the loading cells) so the output path does not depend on earlier cell state.
t_dataset_fullname = os.path.join(os.getcwd(), t_dataset_dir, t_dataset_filename)

# index=False keeps the dataframe index out of the CSV.
save_unseen_predictions.to_csv(t_dataset_fullname, index=False)

print('Prediction result is written to:\n{}'.format(t_dataset_fullname))

In [None]:
# Achieved a score of 0.7778

<div style="width: 512px; height: 224px;">
    <img src="02.png" width="70%" height="70%" align="left">
</div>