Load the required packages

In [None]:
import os
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import seaborn as sns

Chapter 2 - Data analysis

Load Zopa loan dataset into dataframe from local directory (this can be modified as necessary)

In [None]:
# Folder holding the CSV extracts used throughout this notebook. Set the
# ZOPA_DATA_DIR environment variable to point somewhere else without editing
# the notebook; the fallback is the original local path.
DATA_DIR = os.environ.get('ZOPA_DATA_DIR', 'C:/Users/einat/Documents/Matteo/Msc/Project/')
# Later cells open 'uk_libor_on.csv' and 'lending_rates.csv' by bare file name,
# so the working directory still has to be the data folder.
os.chdir(DATA_DIR)
ld = pd.read_csv('loanbook_extract.csv')

Display dataframe dimensions and the number of rows left after excluding duplicates

In [None]:
# Column names, dtypes and non-null counts.
ld.info()
# Number of rows that remain once exact duplicate rows are removed.
print(ld.drop_duplicates().shape[0])

Display sample set of data

In [None]:
# First five rows as a quick sanity check of the loaded columns.
ld.head(n=5)

Summary statistics of the numerical dimensions

In [None]:
# Descriptive statistics of the numerical columns, rounded to one decimal place.
ld.describe().round(decimals=1)

Plot univariate distribution of the loan amount variable

In [None]:
# White grid background and a square 5x5 inch default canvas.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (5, 5)})
# Histogram with density overlay of the original loan amounts.
sns.distplot(a=ld["Original Loan Amount"], label="Density Plot of Loan Amount")

Plot histogram of the loan term observations

In [None]:
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (5, 5)})
# Pure histogram (no density curve) in 15 bins, with a rug mark per observation.
sns.distplot(a=ld["Term"], bins=15, kde=False, rug=True)

Breakdown of loans by "Term"

In [None]:
# Number of loans per distinct term, most frequent term first.
ld["Term"].value_counts(sort=True, ascending=False)

Bin "Disbursal date" dimension observations into calendar Years and append new dimension

In [None]:
# Parse the disbursal dates once and store the calendar year of each loan
# in a new 'Year' column.
dt_series = pd.to_datetime(ld['Disbursal date'])
ld['Year'] = dt_series.dt.year

Plot histogram of loans distribution by "Postcode"

In [None]:
# Group the loans by postcode; the group sizes are the loan counts per postcode.
postcode = ld.groupby("PostCode")

plt.figure(figsize=(15,10))
# Sort descending so the bars run from the most to the least frequent postcode.
postcode.size().sort_values(ascending=False).plot.bar()
plt.xticks(rotation=50)
plt.xlabel("Postcode")
plt.ylabel("Number of loans")
plt.show()

Plot average original loan amount by "Year of issuance" (the line plot aggregates the loans of each year to their mean)

In [None]:
# x and y are passed by keyword: recent seaborn releases no longer accept them
# positionally. Loans sharing a year are aggregated by lineplot (mean by default).
sns.lineplot(x='Year', y='Original Loan Amount', data=ld)

Plot Interest collected by "Year of issuance"

In [None]:
# x and y are passed by keyword: recent seaborn releases no longer accept them
# positionally. Loans sharing a year are aggregated by lineplot (mean by default).
sns.lineplot(x='Year', y='Interest Collected', data=ld)

Chapter 3 - Credit risk

Display loan status variable and plot histogram and pie chart of associated distribution

In [None]:
# Number of loans in each status category, most frequent first.
ld["Latest Status"].value_counts(sort=True, ascending=False)

In [None]:
plt.rcParams['figure.figsize'] = (5,5)
# Name the column through x= and pass the frame as data=: a bare positional
# Series is bound to `data` (not `x`) in current seaborn releases.
sns.countplot(x="Latest Status", data=ld)

In [None]:
plt.rcParams.update({'figure.figsize': (5, 5)})
# One pie slice per status, sized by the number of loans in that status.
status_sizes = ld.groupby("Latest Status").size()
status_sizes.plot.pie()

Calculate the percentage of loans whose latest status is "Default"

In [None]:
# Percentage of all loans whose latest status is "Default".
# A boolean mask is summed in a single vectorized pass instead of visiting
# every row in a Python loop.
is_default = ld['Latest Status'] == "Default"
def_counter = int(is_default.sum())
print((def_counter / (len(ld))*100))

Calculate the percentage of loans whose latest status is "Late"

In [None]:
# Percentage of all loans whose latest status is "Late".
# A boolean mask is summed in a single vectorized pass instead of visiting
# every row in a Python loop.
is_late = ld['Latest Status'] == "Late"
late_counter = int(is_late.sum())
print((late_counter / (len(ld))*100))

Generate correlation matrix for all numerical dimensions

In [None]:
plt.rcParams['figure.figsize'] = (8,8)
# Select the numeric/boolean columns explicitly: DataFrame.corr() in recent
# pandas releases raises on text columns instead of silently skipping them.
numeric_ld = ld.select_dtypes(include=['number', 'bool'])
corr = numeric_ld.corr()
# Annotated square heatmap with every pairwise correlation shown to two decimals.
sns.heatmap(corr, cbar = True,  square = True, annot=True, fmt= '.2f',
            annot_kws={'size': 10}, 
            xticklabels=corr.columns.values, 
            yticklabels=corr.columns.values).set_title('Correlation Matrix')


Clean and prepare data for classification: remove less useful dimensions, replace full disbursal and last payment date variables with month, categorize status as binary variable (default/no default)

In [None]:
# Columns the classification step treats as less useful and therefore removes.
unused_columns = ["Snapshot Date", "Encrypted Loan ID", "Encrypted Borrower ID",
                  "Date of Default", "PostCode"]
ld = ld.drop(columns=unused_columns)

# Replace each full date with its calendar month number.
for date_column in ('Disbursal date', 'Last payment date'):
    ld[date_column] = pd.to_datetime(ld[date_column]).dt.month

def default_status(text):
    """Map a loan status label to a binary default flag.

    Returns 1 when ``text`` is "Default" or "Late", and 0 for any other value.
    """
    return 1 if text in ("Default", "Late") else 0
# Recode the status column as the binary target (1 = Default/Late, 0 = anything else).
ld['Latest Status']=ld['Latest Status'].apply(default_status)

# DataFrame.dropna() returns a new frame; assign it back so that rows with
# missing values are actually removed before the data is split and modelled.
ld = ld.dropna()
ld.info()
ld.head()

Create training input and target datasets using dedicated Scikit function

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so that the 80/20 split, and every score derived from it, is
# identical after a kernel restart.
train, test = train_test_split(ld, test_size=0.2, random_state=1)
# Columns are picked by position.
# NOTE(review): confirm that positions 0-4, 6 and 7 are the intended inputs and
# position 8 is the target once the cleaning cell has dropped its columns.
x_train = train.iloc[:,[0,1,2,3,4,6,7]]
y_train = train.iloc[:,8].dropna()

Create testing input and target datasets

In [None]:
# Same positional selection as for the training set: seven input columns and
# the column at position 8 as the target.
feature_positions = [0, 1, 2, 3, 4, 6, 7]
x_test = test.iloc[:, feature_positions]
y_test = test.iloc[:, 8].dropna()

Scale training and testing input datasets

In [None]:
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
# Learn the scaling parameters from the training inputs only, then apply the
# same transformation to the test inputs.
x_train = scale.fit_transform(x_train)
x_test = scale.transform(x_test)

Logistic regression

Initiate and calibrate logistic regression - Liblinear solver

In [None]:
from sklearn.linear_model import LogisticRegression
# Request liblinear explicitly: this section and its confusion matrix are
# labelled "Liblinear", but scikit-learn's default solver is 'lbfgs' (since 0.22).
lr = LogisticRegression(solver='liblinear')
lr.fit(x_train, y_train)

Calculate accuracy score of prediction

In [None]:
from sklearn.metrics import accuracy_score
# Predict the test set and report the fraction of correctly classified loans.
y_pred_lr = lr.predict(x_test)
lr_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
print(lr_accuracy)

Generate confusion matrix to evaluate the accuracy of classification (Liblinear) on test dataset

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are the true classes, columns the predicted classes.
cf_lr = confusion_matrix(y_true=y_test, y_pred=y_pred_lr)
print(cf_lr)

Initiate and calibrate logistic regression - SAGA solver

In [None]:
from sklearn.linear_model import LogisticRegression
# Same model as before, fitted with the SAGA solver.
lr_1 = LogisticRegression(solver='saga')
lr_1.fit(X=x_train, y=y_train)

Calculate accuracy score of prediction

In [None]:
from sklearn.metrics import accuracy_score
# Predict the test set with the SAGA-fitted model and report its accuracy.
y_pred_lr_1 = lr_1.predict(x_test)
lr_1_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_lr_1)
print(lr_1_accuracy)

Generate confusion matrix to evaluate the accuracy of classification (SAGA solver) on test dataset

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are the true classes, columns the predicted classes.
cf_lr_1 = confusion_matrix(y_true=y_test, y_pred=y_pred_lr_1)
print(cf_lr_1)

MLP model

Initiate and calibrate MLP using Stochastic Gradient Descent (SGD) 

In [None]:
from sklearn.neural_network import MLPClassifier
# Two hidden layers (5 and 20 units) trained with stochastic gradient descent;
# the seed fixes the weight initialisation.
mlp = MLPClassifier(
    hidden_layer_sizes=(5, 20),
    solver='sgd',
    alpha=1e-5,
    random_state=1,
)
mlp.fit(X=x_train, y=y_train)

Calculate mean accuracy on the training data and labels.

In [None]:
# Mean accuracy of the SGD-trained MLP on the training set.
mlp.score(X=x_train, y=y_train)

Calculate class probability estimates for the test data, followed by the mean accuracy on the test set

In [None]:
# Per-class probability estimates for every test observation.
mlp.predict_proba(X=x_test)

In [None]:
# Mean accuracy of the SGD-trained MLP on the held-out test set.
mlp.score(X=x_test, y=y_test)

Generate prediction, confusion matrix and accuracy score using MLP (SGD) for test dataset

In [None]:
# Class predictions of the SGD-trained MLP and the resulting confusion matrix
# (rows: true classes, columns: predicted classes).
y_pred_mlp = mlp.predict(X=x_test)
cf_mlp = confusion_matrix(y_true=y_test, y_pred=y_pred_mlp)
print(cf_mlp)

In [None]:
# Fraction of test loans the SGD-trained MLP classified correctly.
print(accuracy_score(y_true=y_test, y_pred=y_pred_mlp))

Initiate and calibrate MLP using ADAM algo

In [None]:
# Same architecture as the SGD model (hidden layers of 5 and 20 units), trained
# with the Adam optimiser.
mlp_1 = MLPClassifier(
    hidden_layer_sizes=(5, 20),
    solver='adam',
    alpha=1e-5,
    random_state=1,
)
mlp_1.fit(X=x_train, y=y_train)

Calculate mean accuracy on the training data, then class probability estimates and mean accuracy on the test data.

In [None]:
# Mean accuracy of the Adam-trained MLP on the training set.
mlp_1.score(X=x_train, y=y_train)

In [None]:
# Per-class probability estimates for every test observation.
mlp_1.predict_proba(X=x_test)

In [None]:
# Mean accuracy of the Adam-trained MLP on the held-out test set.
mlp_1.score(X=x_test, y=y_test)

Generate prediction, confusion matrix and accuracy score using MLP (ADAM algo) for test dataset

In [None]:
# Class predictions of the Adam-trained MLP and the resulting confusion matrix
# (rows: true classes, columns: predicted classes).
y_pred_mlp1 = mlp_1.predict(X=x_test)
cf_mlp1 = confusion_matrix(y_true=y_test, y_pred=y_pred_mlp1)
print(cf_mlp1)

In [None]:
# Fraction of test loans the Adam-trained MLP classified correctly.
print(accuracy_score(y_true=y_test, y_pred=y_pred_mlp1))

Chapter 4 - Cross analysis

Create subset dataframe with dates and Zopa lending rates only

In [None]:
# Re-read the two columns from the raw extract instead of slicing `ld`: by this
# point ld['Disbursal date'] holds month numbers, which pd.to_datetime would
# read as nanosecond offsets from 1970-01-01, so every derived date and year
# would collapse to 1970.
ld2 = pd.read_csv('loanbook_extract.csv', usecols=['Disbursal date', 'Lending rate'])
# usecols keeps the file's column order; fix the order explicitly.
ld2 = ld2.loc[:, ['Disbursal date', 'Lending rate']]
ld2.info()

Transform the "Disbursal date" dimension into a datetime variable type and extract key statistics

In [None]:
# Convert the disbursal date column to datetime values, letting pandas infer
# the date format from the data.
parsed_dates = pd.to_datetime(ld2["Disbursal date"], infer_datetime_format=True)
ld2["Disbursal date"] = parsed_dates

ld2.info()
ld2.head()

In [None]:
# Descriptive statistics of the date / lending-rate subset.
ld2.describe(percentiles=None)

Sort "Lending rates" by date

In [None]:
# Chronological order, earliest disbursal first.
ld2 = ld2.sort_values(by="Disbursal date")

Plot Zopa P2P lending rate distribution

In [None]:
plt.rcParams.update({'figure.figsize': (5, 5)})
# Histogram with density overlay of the lending rates.
sns.distplot(a=ld2['Lending rate'])

Plot Zopa P2P lending rates by year of issuance

In [None]:
# Calendar year of each disbursal, used as the x axis below.
ld2['Year'] = ld2["Disbursal date"].dt.year
# Loans sharing a year are aggregated by lineplot (mean by default).
sns.lineplot(x = 'Year', y = 'Lending rate', data = ld2)

Generate log returns of Zopa P2P rates data variable and plot distribution

In [None]:
# Lending rates in the current row order of ld2.
loan_rate = (ld2[ 'Lending rate'])
# Log of every rate relative to the first observation, i.e. log(rate_t) - log(rate_0).
# This is a cumulative log change from the first row, not a period-on-period difference.
loan_rate_log = np.log(loan_rate) - np.log(loan_rate.iloc[0])
loan_rate_log.head()


In [None]:
# Distribution of the log-transformed lending rates in 100 bins.
sns.distplot(a=loan_rate_log, bins=100)

Load Libor overnight (O/N) unsecured rates (source Bloomberg)

In [None]:
# Relative file name: resolved against the working directory set by os.chdir
# in the data-loading cell near the top of the notebook.
ois_rate = pd.read_csv('uk_libor_on.csv')

Summary statistics for Libor (O/N) rates

In [None]:
# Descriptive statistics of the columns pandas treats as numeric in the Libor file.
ois_rate.describe()

Group Libor (O/N) by year and plot rates

In [None]:
# Parse the quote dates and attach the calendar year of each quote as 'Year'.
dt_series = pd.to_datetime(ois_rate.loc[:, 'Date'])
ois_rate = ois_rate.assign(Year=dt_series.dt.year)
ois_rate.head()

In [None]:
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (5, 5)})
# Quotes sharing a year are aggregated by lineplot (mean by default).
sns.lineplot(data=ois_rate, x='Year', y='Libor O/N')

Plot Libor (O/N) rates distribution

In [None]:
# Histogram with density overlay of the Libor O/N rates.
sns.distplot(a=ois_rate['Libor O/N'])

Generate log returns of Libor (O/N) rates 

In [None]:
# Log of every Libor O/N rate relative to the first observation,
# i.e. log(rate_t) - log(rate_0), followed by its distribution plot.
libor_on = ois_rate['Libor O/N']
ois_rate_log = np.log(libor_on) - np.log(libor_on.iloc[0])
sns.distplot(a=ois_rate_log)

In [None]:
# Log-transformed Libor O/N series plotted against its index.
sns.lineplot(data = ois_rate_log)

Load joint rates dataset

In [None]:
lending_rates1 = pd.read_csv('lending_rates.csv')
# DataFrame.dropna() returns a new frame; assign it back so that the rows with
# missing values are actually removed.
lending_rates1 = lending_rates1.dropna()
lending_rates1.info()
lending_rates1.head()

Investigate cross correlation between Libor O/N and Zopa P2P lending rates

In [None]:
# x and y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
sns.scatterplot(x=ois_rate_log, y=loan_rate_log)

In [None]:
# pairplot takes a single DataFrame (its second positional parameter is `hue`),
# so the two series are first combined into a two-column frame.
# NOTE(review): the series are paired by position and cut to the shorter length;
# confirm this is how the two rate series should be aligned.
n_common = min(len(ois_rate_log), len(loan_rate_log))
log_rates = pd.DataFrame({
    'Libor O/N (log)': np.asarray(ois_rate_log)[:n_common],
    'Zopa lending rate (log)': np.asarray(loan_rate_log)[:n_common],
})
sns.pairplot(log_rates)

Load the signal package from SciPy to perform time series analysis

In [None]:
from scipy import signal

Generate the cross correlation between Libor O/N and Zopa P2P lending rates and report its length and maximum value

In [None]:
# Full cross-correlation of the two log-rate series (every possible overlap).
xcorr = signal.correlate(ois_rate_log, loan_rate_log, mode='full')
# Length of the correlation sequence, then its largest value.
print(xcorr.shape[0])
xcorr.max()

Plot cross correlation function

In [None]:
# Cross-correlation values plotted against their position in the xcorr array.
sns.lineplot(data = xcorr, legend = 'full')

Generate signal coherence

In [None]:
# Coherence estimate between the two log-rate series as a function of frequency.
f, cxy = signal.coherence(x=ois_rate_log, y=loan_rate_log)
# Logarithmic y axis so that small coherence values remain visible.
plt.semilogy(f, cxy)
plt.xlabel('frequency [Hz]')
plt.ylabel('Coherence')
plt.show()

Generate FFT convolution

In [None]:
# FFT-based full convolution of the two log-rate series, then its mean value.
conv = signal.fftconvolve(in1=ois_rate_log, in2=loan_rate_log, mode='full')
conv.mean()

In [None]:
 # Ask SciPy which convolution method it estimates to be fastest for these inputs.
 sp.signal.choose_conv_method(ois_rate_log,loan_rate_log, mode='full')

Plot periodogram for Zopa P2P rates

In [None]:
# Power spectral density estimate of the series, drawn on a logarithmic y axis.
# NOTE(review): the section heading says "Zopa P2P rates", but the series used
# here is ois_rate_log (Libor O/N); confirm whether loan_rate_log was intended.
f, pxx_den = sp.signal.periodogram(ois_rate_log)
plt.semilogy(f, pxx_den)