# ROBOLOAN

## 1. Data

In [None]:
# import needed libraries
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk

### 1.1 Data description

In [None]:
# Load the first data set (accepted loans) and preview its first rows.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference within a column.
original_data_1 = pd.read_csv('accepted_2007_to_2018_100T.csv', low_memory=False)
original_data_1.head()

In [None]:
# Load the second data set (rejected loans) and preview its first rows.
# The file is decoded as ISO-8859-1 instead of the default UTF-8.

original_data_2 = pd.read_csv('rejected_2007_to_2018_100T.csv', low_memory=False, encoding="ISO-8859-1")
original_data_2.head()

### 1.2 Attributes' Explanation 

In [None]:
# Keep only the attributes needed for the analysis, then drop every row that
# has a NaN in any of the kept columns

data_accepted = original_data_1[['loan_amnt', 'term', 'int_rate', 'installment', 'grade', 'emp_length', 'home_ownership', 
                     'annual_inc', 'verification_status', 'issue_d', 'loan_status', 'purpose', 'open_acc', 'dti', 'total_acc', 
                     'fico_range_low', 'fico_range_high', 'addr_state', 'policy_code']]
data_accepted = data_accepted.dropna()

data_accepted.head(5)

In [None]:
# List the attributes of the accepted loans data set
# (column names, non-null counts and dtypes)

data_accepted.info()

In [None]:
# function for converting ordinal values to numeric values
def ordinal_feature(data, columns):
    """
        Replace ordinal labels by their integer rank, modifying ``data`` in place.

        For every requested column the distinct non-missing values are ordered
        and each value is replaced by its position in that order:
        - labels starting with "<" (e.g. "< 1 year") get the lowest ranks
        - labels containing "+" (e.g. "10+ years") get the highest ranks
        - every other label sits in between, in sorted order

        String labels are sorted with a natural key, so numbers embedded in the
        text compare numerically ("2 years" before "10 years"). Non-string
        values are sorted by their own ordering. Missing values stay missing.

        - param data: Dataset (pandas DataFrame); its columns are overwritten
        - param columns: features, which have to be converted to ordinals
        - return: the same ``data`` object
    """
    import re  # local import keeps the helper independent of file-level imports

    def natural_key(label):
        # re.split with a capturing group alternates text and digit runs
        # (text first), so odd positions are always the numeric pieces and
        # two keys are comparable position by position.
        pieces = re.split(r"(\d+)", label)
        return [int(piece) if position % 2 else piece
                for position, piece in enumerate(pieces)]

    for column in columns:
        values = list(data[column].dropna().unique())
        if all(isinstance(value, str) for value in values):
            values.sort(key=natural_key)
        else:
            values.sort()

        # Partition instead of juggling insert positions: "<" labels first,
        # "+" labels last, everything else in the middle.
        lowest, middle, highest = [], [], []
        for value in values:
            label = str(value)
            if label.startswith("<"):
                lowest.append(value)
            elif "+" in label:
                highest.append(value)
            else:
                middle.append(value)

        # Key the lookup on the original values (not their string form) so
        # numeric columns are found as well; a dict gives O(1) lookups per row.
        rank = {value: position
                for position, value in enumerate(lowest + middle + highest)}
        data[column] = data[column].map(rank)
    return data

In [None]:
# Convert the ordinal employment length labels to integer ranks.
# ordinal_feature overwrites the column of data_accepted in place, so the
# return value does not need to be assigned.
ordinals = ["emp_length"]
ordinal_feature(data_accepted, ordinals)

data_accepted.head(5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns of the accepted loans, rounded to 4 decimals
data_accepted.describe().round(4)

In [None]:
# Choose the needed features from the original rejected data set and drop
# every row that has a NaN in any of them
data_rejected = original_data_2[['Amount Requested', 'Application Date', 'Loan Title', 'Risk_Score', 'Debt-To-Income Ratio', 
                                 'Employment Length','State', 'Policy Code;;']]
data_rejected = data_rejected.dropna()

# remove the percentage sign and the trailing semicolons from the values,
# then convert the cleaned strings to floats
data_rejected['Debt-To-Income Ratio'] =  data_rejected['Debt-To-Income Ratio'].str.strip('%').astype('float')
data_rejected['Policy Code;;'] = data_rejected['Policy Code;;'].str.strip(';;').astype('float')

# convert the remaining numeric attributes to float
data_rejected['Amount Requested'] = data_rejected['Amount Requested'].astype('float')
data_rejected['Risk_Score'] = data_rejected['Risk_Score'].astype('float')

In [None]:
# Convert the ordinal employment length labels of the rejected loan data set
# to integer ranks (the column is overwritten in place)

ordinals = ["Employment Length"]
ordinal_feature(data_rejected, ordinals)

data_rejected.head(10)

In [None]:
# Summary statistics of the numeric attributes of the rejected loans,
# rounded to 2 decimals

data_rejected.describe().round(2)

In [None]:
# Column names, non-null counts and dtypes of the rejected loans data set
data_rejected.info()

### 1.3 Merge both data sets

In [None]:
# Generalize the common features of both data sets: map each data set's own
# column names onto one shared set of names
common_features = {"loan_amnt":"Loan Amount", "Amount Requested":"Loan Amount", "issue_d": "Application Date", 
                   "purpose": "Loan Title", "dti": "Debt-To-Income Ratio", "emp_length": "Employment Length",
                   "addr_state": "State", "Policy Code;;": "Policy Code", "policy_code": "Policy Code"}

# The risk score has a different form in the accepted data: it is split into a
# low and a high FICO bound. We take the mean of both as a single score.
data_accepted["Risk_Score"] = data_accepted[['fico_range_low', 'fico_range_high']].mean(axis=1)
data_accepted = data_accepted.drop(['fico_range_low', 'fico_range_high'], axis=1)

data_accepted_m = data_accepted.rename(columns=common_features)
data_rejected_m = data_rejected.rename(columns=common_features)

# Keep only the accepted-loan columns that also exist for rejected loans.
# All other columns are dropped in one call instead of one copy per column.
data_accepted_m = data_accepted_m.drop(
    columns=data_accepted_m.columns.difference(data_rejected_m.columns))

# Stack accepted rows first, rejected rows second, with a fresh 0..n-1 index
merged_data = pd.concat([data_accepted_m, data_rejected_m], ignore_index=True, sort=True)
merged_data.head(10)

In [None]:
# Last rows of the merged frame (the rejected loans were concatenated last)
merged_data.tail(10)

In [None]:
# Column names, non-null counts and dtypes of the merged data set
merged_data.info()

## 2. Descriptive statistics

In [None]:
# Style, font and formats for plotting descriptive statistics

# Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and
# later releases removed the old name; use whichever this installation has.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
mpl.rcParams['font.family'] = 'Arial'
mpl.rcParams['font.size'] = '14'

### 2.1 Loan and interest rate

In [None]:
# One tall figure; only the first two rows of a 4x1 grid are used
plt.figure(figsize=(13, 16))
sns.set_theme()
sns.set_context("paper")

# Loan Amount: histogram with a kernel density estimate on top
plt.subplot(4, 1, 1, facecolor='#eeefff')
sns.histplot(x='loan_amnt', data=data_accepted, color='#21AFC3', kde=True)
plt.xlabel('Amount')
plt.ylabel('Frequency')
plt.title('Loan Amount', fontsize=16)
plt.tight_layout(pad=2)

# Interest Rate: the sorted unique rates as a line, and the same unique rates
# in their order of first appearance as blue dots ('bo')
plt.subplot(4, 1, 2, facecolor='#eeefff')
sns.lineplot(data=sorted(data_accepted['int_rate'].unique()), color='#EA2B6B')
plt.plot(data_accepted['int_rate'].unique(), 'bo')
plt.xlabel("Unique values")
plt.ylabel('Rate')
plt.title('Interest rate', fontsize=16)
# leave some headroom above the highest rate; x spans all unique values
plt.ylim(0, data_accepted['int_rate'].max() + 5)
plt.xlim(0, len(data_accepted['int_rate'].unique()))
plt.tight_layout(pad=2)


### 2.2 Distributions of other features

In [None]:
# Bar/count plots of the categorical features on a 4x2 grid (7 cells used)
plt.figure(figsize=(13, 20))
sns.set_context("paper")

# Purpose of taking loan 
plt.subplot(4, 2, 1, facecolor = '#eeefff')
data_accepted['purpose'].value_counts().plot(kind='bar', color='#03B9FD')
plt.ylabel('Frequency')
plt.title('Purpose', fontsize=16)

# Duration of loan
plt.subplot(4, 2, 2, facecolor='#eeefff')
data_accepted['term'].value_counts().plot(kind='bar', color='#EA2B6B')
plt.title('Duration of loan', fontsize=16)

# Information about home ownership of applicants
plt.subplot(4, 2, 3, facecolor = '#eeefff')
data_accepted['home_ownership'].value_counts().plot(kind='bar', color='#FCBF8C')
plt.ylabel('Frequency')
plt.title('Home ownership', fontsize=16)
plt.tight_layout(pad=2)

# Information about grades frequency
plt.subplot(4, 2, 4, facecolor='#eeefff')
sns.countplot(x='grade', data=data_accepted, palette="rocket")
plt.xlabel(None)
plt.title('Grades', fontsize=16)

# Verification status
plt.subplot(4, 2, 5, facecolor='#eeefff')
data_accepted['verification_status'].value_counts().plot(kind='bar', color='#21AFC3')
plt.ylabel('Frequency')
plt.xlabel(None)
plt.title('Varification Status', fontsize=16)

# Loan status
plt.subplot(4, 2, 6, facecolor='#eeefff')
data_accepted['loan_status'].value_counts().plot(kind='bar', color='#FDA603')
plt.xlabel(None)
plt.title('Loan Status', fontsize=16)

# Employment length (already converted to integer ranks at this point)
plt.subplot(4, 2, 7, facecolor = '#eeefff')
sns.countplot(x='emp_length', data=data_accepted, palette="GnBu_d")
plt.ylabel('Frequency')
plt.title('Employment Length', fontsize=16)

### 2.3 Contingency tables

In [None]:
# Loan status vs grade: contingency table of counts, shaded by magnitude

pd.crosstab(data_accepted['loan_status'], data_accepted['grade']).style.background_gradient(cmap = "Purples")

In [None]:
# Loan status vs home ownership: contingency table of counts, shaded by magnitude

pd.crosstab(data_accepted['loan_status'], data_accepted['home_ownership']).style.background_gradient(cmap = "Reds")

In [None]:
# Loan status vs employment length (integer ranks): contingency table of counts

pd.crosstab(data_accepted['loan_status'], data_accepted['emp_length']).style.background_gradient(cmap = "Blues")

In [None]:
# Loan status vs loan amount: summary statistics of the amount per status

data_accepted.groupby(by='loan_status')['loan_amnt'].describe()

In [None]:
plt.figure(figsize=(14, 4))
sns.set_theme(style="ticks")
sns.set_context("paper")

# Plot the distribution of loan amounts (20 bins), with the loan statuses
# stacked on top of each other in every bin
sns.histplot(data=data_accepted, x="loan_amnt", hue="loan_status", stat='count', multiple="stack", palette="Set1", bins=20)

In [None]:
# Grade vs employment length (integer ranks): contingency table of counts

pd.crosstab(data_accepted['grade'], data_accepted['emp_length']).style.background_gradient(cmap = "Blues")

In [None]:
# Grade vs home ownership: contingency table of counts, shaded by magnitude

pd.crosstab(data_accepted['grade'], data_accepted['home_ownership']).style.background_gradient(cmap = "Reds")

In [None]:
plt.figure(figsize=(15, 5))
sns.set_theme(style="ticks")
sns.set_context("paper")

# Distribution of loan amounts, with the home ownership categories stacked
# on top of each other in every bin
sns.histplot(data=data_accepted, x="loan_amnt", hue="home_ownership", multiple="stack", palette="deep")

In [None]:
# Loan amount and interest rate distributions per home ownership and grade
plt.figure(figsize=(15, 12))
sns.set_context("paper")

# Loan amount distribution per home ownership category
# NOTE(review): split=True is passed without a hue variable here; confirm the
# installed seaborn version renders this as intended
plt.subplot(2, 2, 1, facecolor = '#eeefff')
sns.violinplot(data=data_accepted, x="home_ownership", y="loan_amnt", split=True, palette='coolwarm')
plt.ylabel("Loan Amount")
plt.xlabel(None)
plt.title("Homer Ownership with Loan Amount Distribuition", fontsize=16)

# Interest rate distribution per home ownership category
plt.subplot(2, 2, 2, facecolor = '#eeefff')
sns.violinplot(data=data_accepted, x="home_ownership", y="int_rate", split=True, palette='Spectral')
plt.ylabel("Interest Rate")
plt.xlabel(None)
plt.title("Homer Ownership with Interest Rate", fontsize=16)

# Interest rate per home ownership category, split by loan duration
plt.subplot(2, 2, 3, facecolor = '#eeefff')
sns.violinplot(data=data_accepted, x="home_ownership", y="int_rate", hue="term", split=True, palette='coolwarm')
plt.ylabel("Interest Rate")
plt.xlabel(None)
plt.title("Homer Ownership with Interest Rate", fontsize=16)

# Interest rate per grade
plt.subplot(2, 2, 4, facecolor = '#eeefff')
sns.boxplot(data=data_accepted, x="grade", y="int_rate", palette='pastel')
plt.ylabel("Interest Rate")
plt.xlabel(None)
plt.title("Grades with Interest Rate", fontsize=16)

In [None]:
plt.figure(figsize=(13, 15))
# set_theme() also resets the plotting context, so the "paper" context has to
# be applied after it (same order as in the other plotting cells)
sns.set_theme(style="ticks", palette="pastel")
sns.set_context("paper")

# Home ownership and duration of loan
plt.subplot(3, 2, 1)
sns.countplot(x='term', data=data_accepted, hue='home_ownership')
plt.legend(loc=1)
plt.title("Homer Ownership with Duration", fontsize=16)

# Verification status and duration of loan
plt.subplot(3, 2, 2)
sns.countplot(x='term', data=data_accepted, hue='verification_status')
plt.legend(loc=1)
plt.title("Verification Status with Duration", fontsize=16)
plt.tight_layout(pad=2)

# Home ownership, interest rate and verification status
plt.subplot(3, 2, 3, facecolor = '#eeefff')
sns.boxplot(data=data_accepted, x="home_ownership", y="int_rate", hue="verification_status", palette='pastel')
plt.ylabel("Interest Rate")
plt.xlabel(None)
plt.legend(loc=1)
plt.title("Homer Ownership with Interest Rate", fontsize=16)

# Home ownership, interest rate and duration of loan
plt.subplot(3, 2, 4, facecolor = '#eeefff')
sns.boxplot(data=data_accepted, x="home_ownership", y="int_rate", hue="term", palette='pastel')
plt.ylabel("Interest Rate")
plt.xlabel(None)
plt.legend(loc=1)
plt.title("Homer Ownership with Interest Rate", fontsize=16)

# Grade and duration of loan
plt.subplot(3, 2, 5)
sns.countplot(x='term', data=data_accepted, hue='grade')
plt.legend(loc=1)
plt.title("Grades with Duration", fontsize=16)

# Annual income per grade. The grade goes on the categorical axis: older
# seaborn releases reject `hue` when only one of x / y is given.
plt.subplot(3, 2, 6, facecolor = '#eeefff')
sns.boxplot(data=data_accepted, x="annual_inc", y="grade",
            order=sorted(data_accepted['grade'].unique()), palette='pastel')
plt.xlabel("Annual Income")
plt.ylabel(None)
plt.title("Grades with Annual Income", fontsize=16)


### 2.4 Correlation matrix

In [None]:
# function for plotting correlation matrices
def correlation_matrix(data):
    """ 
        Plot a correlation table as an annotated heat map with a colour bar.

        - param data: table of correlation coefficients (e.g. the result of
          ``DataFrame.corr()``). Its column names label both axes, so it is
          expected to be square.
        - return: None; the figure is drawn with matplotlib
    """
    labels_cr = data.columns
    corr_table = np.array(data.round(3))

    fig, ax = plt.subplots(figsize=(10, 12))
    # keep the image handle so the colour bar can reuse it instead of drawing
    # the same image a second time
    image = ax.imshow(corr_table, cmap='twilight_shifted')

    tick_positions = np.arange(len(labels_cr))
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(labels_cr)
    ax.set_yticklabels(labels_cr)
    ax.grid(False)

    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor", fontsize=14)
    plt.setp(ax.get_yticklabels(), fontsize=14)

    # write every coefficient into its cell (x is the column, y is the row)
    for (row, col), value in np.ndenumerate(corr_table):
        ax.text(col, row, value, ha='center', va='center', color='w', fontsize=14)

    ax.set_title("Correlation of features".upper(), fontsize=14)
    fig.tight_layout()

    fig.colorbar(image, ax=ax, orientation='horizontal', label='Correlation')

In [None]:
# Correlation of the numeric features of the accepted loan data set.
# Text columns are filtered out explicitly: since pandas 2.0 corr() no longer
# skips them silently and raises instead. policy_code is dropped beforehand.
data_accepted.drop(["policy_code"], axis=1).select_dtypes(include="number").corr().round(4)

In [None]:
# Correlation matrix plot of the accepted loan data set.
# Only numeric columns are passed to corr(); since pandas 2.0 it raises on
# text columns instead of skipping them.
correlation_matrix(data_accepted.drop(["policy_code"], axis=1).select_dtypes(include="number").corr())

In [None]:
# Correlation of the numeric features of the rejected loan data set.
# Only numeric columns are passed to corr(); since pandas 2.0 it raises on
# text columns instead of skipping them.
data_rejected.drop(["Policy Code;;"], axis=1).select_dtypes(include="number").corr()

In [None]:
# Correlation matrix plot of the rejected loan data set (numeric columns only,
# because corr() raises on text columns since pandas 2.0)
correlation_matrix(data_rejected.drop(["Policy Code;;"], axis=1).select_dtypes(include="number").corr())

## 3. Implementing Machine learning models 


### 3.1 Data preparation for ML models

In [None]:
# Convert the loan duration to a numerical value (number of months).
# str.strip('months') removes any of the characters m/o/n/t/h/s from both ends
# rather than the word itself, so the digits are extracted explicitly instead.
data_accepted['term'] = data_accepted['term'].str.extract(r'(\d+)', expand=False).astype('int')

# drop issued date
data_accepted = data_accepted.drop(['issue_d'], axis=1)

In [None]:
# Convert the grades to integer ranks using the function ordinal_feature
# (the column of data_accepted is overwritten in place)
ordinals = ["grade"]
ordinal_feature(data_accepted, ordinals)

data_accepted.head(10)

In [None]:
# Columns that remain in the accepted loans data set after the preparation
data_accepted.columns

In [None]:
# function for converting categorical variables to dummy values
def onehot_encode(data, columns):
    """
        Replace categorical columns by dummy (indicator) columns.

        - param data: Dataset; it is copied, the caller's frame is untouched
        - param columns: features to encode; each one is removed and its
          dummy columns, prefixed with the feature name, are appended
        - return: the encoded copy
    """
    encoded = data.copy()
    for name in columns:
        indicators = pd.get_dummies(encoded[name], prefix=name)
        # append the indicators, then discard the source column
        encoded = pd.concat([encoded, indicators], axis=1).drop(name, axis=1)
    return encoded

In [None]:
# Preview of the dummy encoding of the categorical features.
# onehot_encode works on a copy and the result is not assigned here, so
# data_accepted itself is left unchanged by this cell.
onehot_encode(data_accepted, columns=['home_ownership', 'verification_status', 'purpose', 'addr_state'])

### 3.2 Unsupervised Machine Learning model - K-means



In [None]:
# Importing necessery libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelEncoder

In [None]:
# Convert the categorical features to dummy values.
# The columns are passed as a flat list, one name per feature, exactly as in
# the preview call above (the extra pair of brackets made it a nested list).
X = onehot_encode(data_accepted, columns=['home_ownership', 'verification_status', 'purpose', 'addr_state'])

In [None]:
# Separate the target variable (loan status) from the predictor variables
target_column = data_accepted['loan_status'] 
X = X.drop(['loan_status'], axis=1)

In [None]:
# creating instance of labelencoder
labelencoder = LabelEncoder()

# Assign an integer code to every loan status label; the encoded array
# replaces the original labels in target_column
target_column = labelencoder.fit_transform(target_column)

In [None]:
# Importing model K-means from scikit library
from sklearn.cluster import KMeans

# K-means starts from random centroids; a fixed seed keeps the clusters (and
# the comparisons in the following cells) reproducible between runs
kmeans = KMeans(n_clusters=7, random_state=42)
KModel = kmeans.fit(X)

In [None]:
# Cluster id assigned to every row of X
KModel.labels_

In [None]:
# Coordinates of the cluster centers (one row per cluster)
KModel.cluster_centers_

In [None]:
# Comparison of the predicted clusters (columns) with the predefined given
# categories, i.e. the encoded target column (rows)
pd.crosstab(target_column, KModel.labels_)

In [None]:
# Check the accuracy of the model.
# K-means numbers its clusters arbitrarily, so the cluster ids cannot be
# compared with the encoded loan statuses directly. Each cluster is first
# mapped to the loan status that occurs most often inside it.
from sklearn.metrics import accuracy_score
cluster_majority = pd.crosstab(KModel.labels_, target_column).idxmax(axis=1)
accuracy_score(target_column, pd.Series(KModel.labels_).map(cluster_majority))

### 3.3 K-Nearest Neighbors (Supervised model)

1. Find K records that have similar features (i.e., similar predictor values).
2. For classification, find out what the majority class is among those similar records and assign that class to the new record.
3. For prediction (also called KNN regression), find the average among those similar records, and predict that average for the new record.

In [None]:
# Remove the text attributes; they are not among the predictors used below
merged_data = merged_data.drop(['Application Date', 'Loan Title', "State"], axis=1)

In [None]:
# For K-Nearest Neighbors we take the concatenated data set, just to show how
# the model works. The outcome is the attribute "Policy Code"; the risk score
# is one of the predictors.
merged_data.tail()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

# define the features for prediction
predictors = ["Debt-To-Income Ratio", "Employment Length", "Loan Amount", "Risk_Score"]
outcome = "Policy Code"

# define the prediction variables and outcome variables
X = merged_data.loc[:, predictors].values
y = merged_data.loc[:, outcome].values

# Split data into training data and testing data (30% of data).
# A fixed seed makes the split, and so the reported scores, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# standardize values: the scaler is fitted on the training part only, so no
# information from the test part leaks into the model
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# fit the model with certain number of neighbors (Ks)
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

In [None]:
# Predict the outcome for the test part (the 30% of the data held out from
# training); the true outcomes y_test are not shown to the model
y_pred = knn.predict(X_test)

In [None]:
# Check the quality of the model with a classification report
# (precision, recall, f1-score and support per class)
print(classification_report(y_test, y_pred))

In [None]:
# Check the quality of the model: share of correctly predicted test rows.
# accuracy_score is imported in the K-means section of this notebook.
accuracy_score(y_test, y_pred)

### References

- https://learning.oreilly.com/library/view/machine-learning-with/9780134845708/ch03.xhtml#ch03
- https://learning.oreilly.com/library/view/practical-statistics-for/9781492072935/ch06.html#StatisticalML
- https://learning.oreilly.com/library/view/machine-learning-for/9781789136364/ch02s02.html
- https://learning.oreilly.com/library/view/python-for-finance/9781492024323/ch13.html
- https://pandas.pydata.org
- https://matplotlib.org
- https://www.kaggle.com/wordsforthewise
- https://scikit-learn.org
- https://towardsdatascience.com/categorical-encoding-using-label-encoding-and-one-hot-encoder-911ef77fb5bd