## This is a Loan Prediction Dataset from AV Hackathon
- Here we need to predict the loan status of the customer 
- We'll use the Classification techniqur for this problem

In [1]:
#importing the libraries
import pandas as pd 
import numpy as np

In [2]:
train = pd.read_csv('train.csv', index_col=0)

In [3]:
#size of the dataset
train.shape

(614, 12)

In [4]:
test = pd.read_csv('test.csv', index_col=0)

In [5]:
#test size
test.shape

(367, 11)

In [6]:
#seperating the target variable
target = train.Loan_Status

In [7]:
train.drop(['Loan_Status'], 1, inplace=True)

In [8]:
#combining the train and the test dataset in order to prevent duplication of opertions on the features
#we'll analyze the training set and if we apply the any operations , then we'll apply it on the data 
#as then both the train and test dataset will get manipulated 
data = train.append(test)

In [10]:
train.head()

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban


In [11]:
#shape of the combined dataset
data.shape

(981, 11)

In [12]:
train.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
dtype: int64

        Let's apply feature engineering on all the features one by one
- Gender

In [16]:
#get dummies will convert the feature into categorical
#also this is a faster method than normal categorization
data.Gender.fillna('Male', inplace=True)
dummy = pd.get_dummies(data['Gender'], prefix='Gender')
data = pd.concat([data, dummy], axis=1)

- Married

In [17]:
#filling the null values first
data.Married.fillna('Yes', inplace=True)
dummy = pd.get_dummies(data['Married'], prefix='Married')
data = pd.concat([data, dummy], axis=1)

- Education 

In [18]:
train.groupby([target])['Education'].value_counts(dropna=False)

Loan_Status  Education   
N            Graduate        140
             Not Graduate     52
Y            Graduate        340
             Not Graduate     82
Name: Education, dtype: int64

- clearly graduated person are more likely to get the loan than non-graduated

In [19]:
dummy = pd.get_dummies(data['Education'], prefix='Education')
data = pd.concat([data, dummy], axis=1)

- Dependents

In [20]:
train.groupby(target)['Dependents'].value_counts(dropna=False)

Loan_Status  Dependents
N            0             107
             1              36
             2              25
             3+             18
             NaN             6
Y            0             238
             2              76
             1              66
             3+             33
             NaN             9
Name: Dependents, dtype: int64

In [21]:
data.replace('3+', 4, inplace=True)

In [22]:
dummy = pd.get_dummies(data['Dependents'], prefix='Dependents')
data = pd.concat([data, dummy], axis=1)

- Self Employed

In [23]:
train.Self_Employed.value_counts(dropna=False)

No     500
Yes     82
NaN     32
Name: Self_Employed, dtype: int64

     not very clear distributed categories , however mostly are no, hence not of much significance

In [24]:
data.drop(['Self_Employed'], 1, inplace=True)

- Credit History

In [25]:
train.groupby(target)['Credit_History'].value_counts(dropna=False)

Loan_Status  Credit_History
N            1.0                97
             0.0                82
             NaN                13
Y            1.0               378
             NaN                37
             0.0                 7
Name: Credit_History, dtype: int64

- It's very clear that people having credit history has claimed the loan 
- Hence, there is not much this feature can contribute

In [26]:
data.drop(['Credit_History'], 1, inplace=True)

In [27]:
data.fillna(1, inplace=True)

- Property Area

In [28]:
train.groupby(target)['Property_Area'].value_counts(dropna=False)

Loan_Status  Property_Area
N            Rural             69
             Urban             69
             Semiurban         54
Y            Semiurban        179
             Urban            133
             Rural            110
Name: Property_Area, dtype: int64

- Rural area people are less prone of getting the loan
- Both semiurban and urban people have good chances of getting the loan , hence we combine the both

In [29]:
#merging the two above categories
data.Property_Area.replace('Semiurban', 'Urban', inplace=True)

In [30]:
data.Property_Area.unique()

array(['Urban', 'Rural'], dtype=object)

In [31]:
dummy = pd.get_dummies(data['Property_Area'], prefix='Property_Area')
data = pd.concat([data, dummy], axis=1)

- Now let's drop the processed features of which we have creates dummies of

In [32]:
data.drop(['Gender', 'Married', 'Education', 'Dependents', 'Property_Area'], 1, inplace=True)

- Applicant's Income doesn't need to be altered as there's no null value in it
- Co-Applicant's Income

In [33]:
#calculating the no of 0 values in the training set
train[train.CoapplicantIncome == 0].CoapplicantIncome.count()

273

- As you can see that more tha one third of the records are nil of the training data
- Filling these value with the mean or the median value won't be much appreciating considering we already have the applocantIncome feature
- So it'll be wise to drop this feature

In [34]:
data.drop(['CoapplicantIncome'], 1, inplace=True)

- Loan Amount

In [35]:
data.LoanAmount.fillna(data.LoanAmount.mean(), inplace=True)

In [36]:
data.LoanAmount.isnull().sum()

0

- Loan Amount Term
- We'll look over the data rather than only the train data 

In [37]:
data.groupby(target)['Loan_Amount_Term'].value_counts(dropna=False)

Loan_Status  Loan_Amount_Term
N            360.0               153
             180.0                15
             480.0                 9
             1.0                   6
             300.0                 5
             36.0                  2
             84.0                  1
             240.0                 1
Y            360.0               359
             180.0                29
             1.0                   8
             300.0                 8
             480.0                 6
             84.0                  3
             120.0                 3
             240.0                 3
             60.0                  2
             12.0                  1
Name: Loan_Amount_Term, dtype: int64

In [38]:
data.Loan_Amount_Term.fillna(360, inplace=True)

In [39]:
#we'll need to minimize the categorize
data[data.Loan_Amount_Term == 360].Loan_Amount_Term = 1
data[data.Loan_Amount_Term == 180].Loan_Amount_Term = 2
data[data.Loan_Amount_Term == 1].Loan_Amount_Term = 3
data[data.Loan_Amount_Term == 300].Loan_Amount_Term = 3
data[data.Loan_Amount_Term == 480].Loan_Amount_Term = 3
data[data.Loan_Amount_Term == 240].Loan_Amount_Term = 4
data[data.Loan_Amount_Term == 84].Loan_Amount_Term = 4
data[data.Loan_Amount_Term == 120].Loan_Amount_Term = 4
data[data.Loan_Amount_Term == 36].Loan_Amount_Term = 5
data[data.Loan_Amount_Term == 60].Loan_Amount_Term = 5
data[data.Loan_Amount_Term == 12].Loan_Amount_Term = 5
data[data.Loan_Amount_Term == 6].Loan_Amount_Term = 5
data[data.Loan_Amount_Term == 350].Loan_Amount_Term = 5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [40]:
#now we'll create dummies for this feature
dummy = pd.get_dummies(data['Loan_Amount_Term'], prefix='Loan_Amount_Term')
data = pd.concat([data, dummy], axis=1)

In [41]:
#now we'll drop this feature
data.drop(['Loan_Amount_Term'], 1, inplace=True)

In [42]:
data.head()

Unnamed: 0,ApplicantIncome,LoanAmount,Gender_Female,Gender_Male,Married_No,Married_Yes,Education_Graduate,Education_Not Graduate,Dependents_4,Dependents_0,...,Loan_Amount_Term_36.0,Loan_Amount_Term_60.0,Loan_Amount_Term_84.0,Loan_Amount_Term_120.0,Loan_Amount_Term_180.0,Loan_Amount_Term_240.0,Loan_Amount_Term_300.0,Loan_Amount_Term_350.0,Loan_Amount_Term_360.0,Loan_Amount_Term_480.0
0,5849,1.0,0,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,4583,128.0,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,3000,66.0,0,1,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,2583,120.0,0,1,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,1,0
4,6000,141.0,0,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [43]:
#making the train and test partition from data
train_data = data.iloc[:614]
test_data = data.iloc[614:]

In [44]:
print(train_data.shape)
test_data.shape

(614, 27)


(367, 27)

### Now we'll implement learning algorithms to train our data model

- Logistic Regression

In [45]:
from sklearn.linear_model import LogisticRegression

In [46]:
log = LogisticRegression()

In [47]:
log.fit(train_data, target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [77]:
log.score(train_data, target)

0.6856677524429967

In [48]:
pred = log.predict(test_data)

In [49]:
file = pd.DataFrame(test.index)

In [50]:
file['Loan_Status'] = pred

In [51]:
file.to_csv('log.csv', index=False)

- K Nearest Neighbours

In [52]:
from sklearn.neighbors import KNeighborsClassifier

In [53]:
knn = KNeighborsClassifier(n_neighbors=3)

In [54]:
knn.fit(train_data, target)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [76]:
knn.score(train_data, target)

0.7915309446254072

In [55]:
pred = knn.predict(test_data)

In [56]:
file = pd.DataFrame(test.index)

In [57]:
file['Loan_Status'] = pred

In [58]:
file.to_csv('knn.csv', index=False)

- Decision Tree

In [59]:
from sklearn.tree import DecisionTreeClassifier

In [108]:
tree = DecisionTreeClassifier(max_depth=3)

In [109]:
tree.fit(train_data, target)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [110]:
tree.score(train_data, target)

0.7166123778501629

In [111]:
pred = tree.predict(test_data)

In [112]:
file = pd.DataFrame(test.index)

In [113]:
file['Loan_Status'] = pred

In [114]:
file.to_csv('tree.csv', index=False)

- Random Forest Classifier

In [68]:
from sklearn.ensemble import RandomForestClassifier

In [69]:
forest = RandomForestClassifier()

In [70]:
forest.fit(train_data, target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [79]:
forest.score(train_data, target)

0.9853420195439739

In [71]:
pred = forest.predict(test_data)

In [72]:
file = pd.DataFrame(test.index)

In [73]:
file['Loan_Status'] = pred

In [74]:
file.to_csv('forest.csv', index=False)

### Hence we complete this dataset by applying the required techniques of machine learning and comapring them based on the accuracy score 
- As this is a hackthon competion dataset we can evaluate the test_data score
- However we found that the decision tree and the forest classifier has produced the best training accuracy, so they are likely to perform well on the test dataset as well !!