# Loan Data
This dataset consists of more than 9,500 loans with information on the loan structure, the borrower, and whether the loan was pain back in full. This data was extracted from LendingClub.com, which is a company that connects borrowers with investors.

In [3]:
import pandas as pd
loan_data = pd.read_csv('datasets/loan_data.csv')
print('Shape of DataFrame:', loan_data.shape)
loan_data.head(5)

Shape of DataFrame: (9578, 14)


Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


## Data dictionary

|    | Variable          | Explanation                                                                                                             |
|---:|:------------------|:------------------------------------------------------------------------------------------------------------------------|
|  0 | credit_policy     | 1 if the customer meets the credit underwriting criteria; 0 otherwise.                                                  |
|  1 | purpose           | The purpose of the loan.                                                                                                |
|  2 | int_rate          | The interest rate of the loan (more risky borrowers are assigned higher interest rates).                                |
|  3 | installment       | The monthly installments owed by the borrower if the loan is funded.                                                    |
|  4 | log_annual_inc    | The natural log of the self-reported annual income of the borrower.                                                     |
|  5 | dti               | The debt-to-income ratio of the borrower (amount of debt divided by annual income).                                     |
|  6 | fico              | The FICO credit score of the borrower.                                                                                  |
|  7 | days_with_cr_line | The number of days the borrower has had a credit line.                                                                  |
|  8 | revol_bal         | The borrower's revolving balance (amount unpaid at the end of the credit card billing cycle).                           |
|  9 | revol_util        | The borrower's revolving line utilization rate (the amount of the credit line used relative to total credit available). |
| 10 | inq_last_6mths    | The borrower's number of inquiries by creditors in the last 6 months.                                                   |
| 11 | delinq_2yrs       | The number of times the borrower had been 30+ days past due on a payment in the past 2 years.                           |
| 12 | pub_rec           | The borrower's number of derogatory public records.                                                                     |
| 13 | not_fully_paid    | 1 if the loan is not fully paid; 0 otherwise.   

[Source](https://www.kaggle.com/itssuru/loan-data) of dataset.

In [5]:
# Goal: Inspecting the data closely.

# Extracting the summary statistics using the describe() method.
loan_apps_description = loan_data.describe()
#print(loan_apps_description)

# Using the info() method to get more information about the DataFrame.
loan_apps_info = loan_data.info()
print(loan_apps_info)

print("\n")

# Printing the last few rows to identify any missing entries.
loan_data.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9578 entries, 0 to 9577
Data columns (total 14 columns):
credit.policy        9578 non-null int64
purpose              9578 non-null object
int.rate             9578 non-null float64
installment          9578 non-null float64
log.annual.inc       9578 non-null float64
dti                  9578 non-null float64
fico                 9578 non-null int64
days.with.cr.line    9578 non-null float64
revol.bal            9578 non-null int64
revol.util           9578 non-null float64
inq.last.6mths       9578 non-null int64
delinq.2yrs          9578 non-null int64
pub.rec              9578 non-null int64
not.fully.paid       9578 non-null int64
dtypes: float64(6), int64(7), object(1)
memory usage: 1.0+ MB
None




Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
9573,0,all_other,0.1461,344.76,12.180755,10.39,672,10474.0,215372,82.1,2,0,0,1
9574,0,all_other,0.1253,257.7,11.141862,0.21,722,4380.0,184,1.1,5,0,0,1
9575,0,debt_consolidation,0.1071,97.81,10.596635,13.09,687,3450.041667,10036,82.9,8,0,0,1
9576,0,home_improvement,0.16,351.58,10.819778,19.18,692,1800.0,0,3.2,5,0,0,1
9577,0,debt_consolidation,0.1392,853.43,11.264464,16.28,732,4740.0,37879,57.0,6,0,0,1


In [6]:
# Goal: Splitting the data into train and test sets.

# Importing the train_test_split() method from sklearn.
from sklearn.model_selection import train_test_split

# Split the data into train and test sets, using a split ratio 0.33 and a random state of 42.
loan_data_train, loan_data_test = train_test_split(loan_data, test_size=0.33, random_state=42)

# Printing the first few rows of the loan applications DataFrame.
loan_data.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


In [7]:
pd.isnull(loan_data).sum()

credit.policy        0
purpose              0
int.rate             0
installment          0
log.annual.inc       0
dti                  0
fico                 0
days.with.cr.line    0
revol.bal            0
revol.util           0
inq.last.6mths       0
delinq.2yrs          0
pub.rec              0
not.fully.paid       0
dtype: int64

In [8]:
loan_data_train = pd.get_dummies(loan_data_train)
loan_data_test = pd.get_dummies(loan_data_test)

loan_data_test = loan_data_test.reindex(columns = loan_data_train.columns, fill_value=0)

In [9]:
# Goal: Rescale the features of the data.

# Importing the MinMaxScaler from sklearn's preprocessing module.
from sklearn.preprocessing import MinMaxScaler

# Segregating the features and labels into separate variables, using iloc.
X_train, y_train = loan_data_train.iloc[:, :-1].values, loan_data_train.iloc[:, [-1]].values
X_test, y_test = loan_data_test.iloc[:, :-1].values, loan_data_test.iloc[:, [-1]].values

# Instantiating the MinMaxScaler  and setting the feature_range to (0,1).
scaler = MinMaxScaler((0,1))

# Fitting the scaler to the training data.
rescaledX_train = scaler.fit_transform(X_train)

# Using the scaler to transform the test data.
rescaledX_test = scaler.transform(X_test)

In [10]:
# Goal: Fitting a logistic regression model to the scaled training set.

# Importing the LogisticRegression module from sklearn's linear_model section.
from sklearn.linear_model import LogisticRegression

# Instantiating a LogisticRegression classifier with default parameter values.
logreg = LogisticRegression()

# Fitting logreg to the train set.
logreg.fit(rescaledX_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
# Goal: Making predictions and evaluating performance.

# Importing the confusion_matrix() function from sklearn's metrics module.
from sklearn.metrics import confusion_matrix

# Using logreg to predict instances from the test set and store it.
y_pred = logreg.predict(rescaledX_test)

# Getting the accuracy score of logreg model and print it.
print("Accuracy of logistic regression classifier: ", logreg.score(rescaledX_test, y_test))

# Printing the confusion matrix of the logreg model.
print(confusion_matrix(y_test, y_pred))

Accuracy of logistic regression classifier:  1.0
[[2956    0]
 [   0  205]]


In [12]:
# Goal: Define the grid of parameter values for which grid searching is to be performed.

# Importing GridSearchCV from sklearn's model_selection module.
from sklearn.model_selection import GridSearchCV

# Defining the grid of values for tol and max_iter.
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]

# Creating a dictionary where tol and max_iter are keys and the lists of their values are corresponding values.
param_grid = dict(tol=tol, max_iter=max_iter)

In [13]:
# Goal: Find the best score and parameters for the model using GridSearchCV.

# Instantiating GridSearchCV with the required parameters.
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

# Fitting the rescaled training data into the grid_model.
grid_model_result = grid_model.fit(rescaledX_train, y_train)

# Summarizing the best score and parameters within their respective variables.
best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_

# Printing the best score and parameters.
print("Best: %f using %s" % (best_score, best_params))

# Extracting the best model using grid_model_result.best_estimator_.
best_model = grid_model_result.best_estimator_
print("Accuracy of best model: ", best_model.score(rescaledX_test, y_test))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Best: 1.000000 using {'max_iter': 100, 'tol': 0.01}
Accuracy of best model:  1.0


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
