## Loading and Analysis of data

In [65]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [66]:
# Load both splits with Loan_ID as the index.
# The test frame is bound to `test_raw` because the later cells that combine
# the frames and restore the column names read that name; `test_main` is kept
# as an alias so any code still using the old name keeps working.
train_raw = pd.read_csv('train.csv', index_col='Loan_ID')
test_raw = pd.read_csv('test.csv', index_col='Loan_ID')
test_main = test_raw
train_raw.info()


<class 'pandas.core.frame.DataFrame'>
Index: 614 entries, LP001002 to LP002990
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 62.4+ KB


## Separating the output column and processing data
After separating the output column, the train and test data are appended into a single frame so that preprocessing is applied to both in the same way.

In [67]:
# Map the Loan_Status labels to integers ('Y' -> 1, 'N' -> 0) and keep them in y,
# then remove the column from the feature frame.
y = train_raw['Loan_Status'].map({'Y': 1, 'N': 0})
train_raw.drop(columns=['Loan_Status'], inplace=True)

In [68]:
# Remember how many rows belong to the training set so the combined frame can
# be split back by position after preprocessing.
rown_train_data = train_raw.shape[0]
# Stack train on top of test. DataFrame.append was removed in pandas 2.0;
# pd.concat produces the same row-wise combination.
X = pd.concat([train_raw, test_raw])
X.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban


In [69]:
# Print the distinct values (NaN included) of every object-dtype column
objList = X.select_dtypes(include="object").columns
for column_label in objList:
    distinct_values = X[column_label].unique()
    print(distinct_values)


['Male' 'Female' nan]
['No' 'Yes' nan]
['0' '1' '2' '3+' nan]
['Graduate' 'Not Graduate']
['No' 'Yes' nan]
['Urban' 'Rural' 'Semiurban']


## Label encoding the categorical features
Because the values also contain NaN, only the non-null entries are label encoded; the missing values are imputed afterwards.

In [72]:
# LabelEncoder is used below but is not imported by any earlier cell
from sklearn.preprocessing import LabelEncoder

# One fitted encoder per categorical column, kept so the integer codes can be
# mapped back to the original labels later.
encoders = dict()

# Encode only the object-dtype (categorical) columns. Encoding every column
# would also replace numeric features such as ApplicantIncome with rank codes.
categorical_columns = X.select_dtypes(include="object").columns
for col_name in categorical_columns:
    series = X[col_name]
    present = series.notnull()
    label_encoder = LabelEncoder()
    # Fit on the non-null values only. Rows that were NaN are absent from the
    # new Series, so they stay NaN after the index-aligned assignment and are
    # left for the imputation step.
    X[col_name] = pd.Series(
        label_encoder.fit_transform(series[present]),
        index=series[present].index,
    )
    encoders[col_name] = label_encoder

X.info()


          Gender  Married  Dependents  Education  Self_Employed  \
Loan_ID                                                           
LP001002     1.0      0.0         0.0          0            0.0   
LP001003     1.0      1.0         1.0          0            0.0   
LP001005     1.0      1.0         0.0          0            1.0   
LP001006     1.0      1.0         0.0          1            0.0   
LP001008     1.0      0.0         0.0          0            0.0   
...          ...      ...         ...        ...            ...   
LP002971     1.0      1.0         3.0          1            1.0   
LP002975     1.0      1.0         0.0          0            0.0   
LP002980     1.0      0.0         0.0          0            0.0   
LP002986     1.0      1.0         0.0          0            0.0   
LP002989     1.0      0.0         0.0          0            1.0   

          ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
Loan_ID                                           

In [75]:
from sklearn.impute import SimpleImputer

# Fill the remaining missing values with each column's median.
imputer = SimpleImputer(strategy='median')
# Learn the medians from the training rows only, so the test rows do not
# influence the fill values.
imputer.fit(X.iloc[:rown_train_data])

# Converting again to a DataFrame: transform() returns a bare ndarray, so the
# column names and the Loan_ID index are restored explicitly.
X = pd.DataFrame(imputer.transform(X), columns=X.columns, index=X.index)


## Splitting the data into train and validation sets 

In [80]:
from sklearn.model_selection import train_test_split

seed=7

# getting columns back: both partitions receive the feature names, so the
# frame kept for final predictions is labelled the same way as the training
# frame.
feature_names = test_raw.columns

# Split the combined frame back by position. The copies make the two
# partitions independent frames before their columns are relabelled.
train_X = X.iloc[:rown_train_data].copy()
final_testing_data = X.iloc[rown_train_data:].copy()
train_X.columns = feature_names
final_testing_data.columns = feature_names

# Hold out part of the labelled rows for validation; train_X is rebound to the
# training portion of the split.
train_X,test_X,train_y,test_y = train_test_split(train_X,y,random_state=seed)

## Checking which algorithm performs best, using cross-validation

In [107]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold 
from sklearn.model_selection import cross_val_score

# Candidate classifiers as (label, estimator) pairs. The tree-based models are
# seeded with the notebook's `seed` so their scores are reproducible.
models = [
    ("logreg", LogisticRegression()),
    ("tree", DecisionTreeClassifier(random_state=seed)),
    ("svc", SVC()),
    ("rndf", RandomForestClassifier(random_state=seed)),
]

# Accumulators for the evaluation cell: mean CV accuracy per model and the
# matching model label. `names` is the list the evaluation loop appends to.
result = []
names = []

In [None]:

# Start from empty accumulators so that re-running this cell does not append
# duplicate entries.
result = []
names = []

# 10-fold cross-validated accuracy for every candidate model. The loop
# variables use their own names so they do not overwrite the accumulators.
for model_name, model in models:
    cv_result = cross_val_score(model, train_X, train_y, cv=10, scoring='accuracy')
    result.append(cv_result.mean())
    names.append(model_name)

#printing all the results
for model_name, mean_accuracy in zip(names, result):
    print(model_name)
    print(mean_accuracy)

## Conclusion: Logistic Regression and Random Forest perform equally well
Hence AdaBoost is tried next, with a decision tree as the base estimator and the number of estimators tuned over a range of values.

In [166]:
from sklearn.ensemble import AdaBoostClassifier

# Ensemble sizes to try
check = [20,30,40,45,50,55,60,70,100,150]
for estimators in check:
    # random_state pins the boosting run so the printed scores are reproducible.
    # NOTE(review): "SAMME.R" is deprecated in recent scikit-learn releases;
    # confirm against the installed version.
    ada_clf = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=10,min_samples_leaf=28), n_estimators=estimators,
        algorithm="SAMME.R", learning_rate=0.25, random_state=seed)
    ada_clf.fit(train_X,train_y)
    # Report accuracy on the training split and on the held-out split
    print("Score for estimator {} train data{}".format(estimators,ada_clf.score(train_X,train_y)))
    print("Test data {}".format(ada_clf.score(test_X,test_y)))
# After the loop, ada_clf is the model fitted with the last value in `check`.


Score for estimator 20 train data0.8956521739130435
Test data 0.8116883116883117
Score for estimator 30 train data0.9347826086956522
Test data 0.8181818181818182
Score for estimator 40 train data0.9695652173913043
Test data 0.7857142857142857
Score for estimator 45 train data0.9717391304347827
Test data 0.8051948051948052
Score for estimator 50 train data0.9804347826086957
Test data 0.8051948051948052
Score for estimator 55 train data0.9847826086956522
Test data 0.8116883116883117
Score for estimator 60 train data0.9869565217391304
Test data 0.7987012987012987
Score for estimator 70 train data0.991304347826087
Test data 0.8051948051948052
Score for estimator 100 train data0.9978260869565218
Test data 0.8051948051948052
Score for estimator 150 train data1.0
Test data 0.8116883116883117


## Measuring the accuracy of the model

In [168]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Predict on the held-out split with the current ada_clf, then print the
# overall accuracy followed by the confusion matrix.
pred = ada_clf.predict(test_X)
for metric in (accuracy_score, confusion_matrix):
    print(metric(test_y, pred))

0.8116883116883117
[[28 20]
 [ 9 97]]
