In [125]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow import keras
from keras import layers
import tensorflow as tf

In [126]:
# loading and preprocessing data
df = pd.read_excel('tmp.xlsx', header=1,)

# removing the ruble symbol and non-printing characters
df['Loan issued'] = pd.to_numeric(df['Loan issued'].str.replace('₽', '').str.replace('\u00A0', ''))
df['Earned interest'] = pd.to_numeric(df['Earned interest'].str.replace('₽', '').str.replace('\u00A0', ''))
df['Unpaid,  full amount'] = pd.to_numeric(df['Unpaid,  full amount'].str.replace('₽', '').str.replace('\u00A0', ''))

# casting some columns to percentage format
df['Comission, %'] = df['Comission, %'] * 100
df['EL'] = df['EL'] * 100
df = df.drop(columns=['Unnamed: 6'], axis=1)

# column definition
df = df[['Comission, %', 'Rating', 'Loan issued', 'Earned interest', 'Unpaid,  full amount', 'EL']]

In [127]:
# calculation of required values
df['loss'] = df['Unpaid,  full amount'] * df['EL']
df['InvestorProfit'] = df['Earned interest'] - df['loss']

df['Commission'] = df['Loan issued'] * df['Comission, %'] / 100  # calculation of the absolute value of the commission

df['Profit'] = df['InvestorProfit'] + df['Commission']
df['Profit%'] = df['Profit'] / df['Loan issued']


In [128]:
# defining labels for classification

label1 = np.where(df['Profit%'] > 0)
df['label'] = 0
df['label'].iloc[label1] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'].iloc[label1] = 1


In [129]:
# Dataset making
X, Y_reg, Y_class = [], [], []

for i in df.index:
    X.append(df[['Comission, %', 'Rating', 'Loan issued', 'Earned interest', 'Unpaid,  full amount', 'EL']].iloc[i].values)
    Y_reg.append(df['Profit%'][i])
    Y_class.append(df['label'][i])
    

In [130]:
# Data scaling
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Regression. Forecasting expected profit value.

In [131]:
# Slicing original dataset to train and test samples
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y_reg, test_size=0.2, random_state=42)

### Neural network (CNN for regression)

In [132]:
# Defining model
inputs = keras.Input(shape=(6,1))
x = layers.BatchNormalization()(inputs)
x = layers.Conv1D(filters=20, kernel_size=(2), activation='relu')(x)
x = layers.Conv1D(filters=15, kernel_size=(2), activation='relu')(x)
x = layers.Conv1D(filters=10, kernel_size=(1), activation='relu')(x)
x = layers.Dense(250, activation='relu')(x)
x = layers.Dense(200, activation='relu')(x)
x = layers.Flatten()(x)
x = layers.Dropout(0.2)(x)
outputs = layers.Dense(1, activation='linear')(x)

model = keras.Model(inputs=inputs, outputs=outputs, name="regression-model")
model.summary()

Model: "regression-model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_24 (InputLayer)       [(None, 6, 1)]            0         
                                                                 
 batch_normalization_23 (Ba  (None, 6, 1)              4         
 tchNormalization)                                               
                                                                 
 conv1d_19 (Conv1D)          (None, 5, 20)             60        
                                                                 
 conv1d_20 (Conv1D)          (None, 4, 15)             615       
                                                                 
 conv1d_21 (Conv1D)          (None, 4, 10)             160       
                                                                 
 dense_90 (Dense)            (None, 4, 250)            2750      
                                                  

In [133]:
model.compile(loss=tf.keras.losses.LogCosh(),
              optimizer=tf.keras.optimizers.legacy.Adam(0.001),
              metrics=['mse', 'mae',])

In [134]:
# Model training
ephs = 28

history = model.fit(X_train, np.array(y_train), batch_size=8 , epochs=ephs, validation_split=0.3)
test_scores = model.evaluate(X_test, np.array(y_test), verbose=1)

print("Test loss:", test_scores[0])
print("Test mse:", test_scores[1])
print("Test mae:", test_scores[2])

Epoch 1/28
Epoch 2/28
Epoch 3/28
Epoch 4/28
Epoch 5/28
Epoch 6/28
Epoch 7/28
Epoch 8/28
Epoch 9/28
Epoch 10/28
Epoch 11/28
Epoch 12/28
Epoch 13/28
Epoch 14/28
Epoch 15/28
Epoch 16/28
Epoch 17/28
Epoch 18/28
Epoch 19/28
Epoch 20/28
Epoch 21/28
Epoch 22/28
Epoch 23/28
Epoch 24/28
Epoch 25/28
Epoch 26/28
Epoch 27/28
Epoch 28/28
Test loss: 0.9557790160179138
Test mse: 16.87584686279297
Test mae: 1.2410218715667725


### Classic ML models (regression)

In [135]:
# Testing classic ML algorithms 

from sklearn import linear_model
reg_Lasso = linear_model.Lasso(alpha=0.1)
reg_Lasso.fit(X_train,y_train)

from sklearn.linear_model import ARDRegression
reg_ard = ARDRegression()
reg_ard.fit(X_train, y_train)

from sklearn.linear_model import ElasticNet
reg_enet = ElasticNet(alpha=0.08, l1_ratio=0.5)
reg_enet.fit(X_train, y_train)


In [136]:
# Making a forecast using the resulting models 
predicted_y_Lasso = reg_Lasso.predict(X_test)
predicted_y_ard = reg_ard.predict(X_test)
predicted_y_enet = reg_enet.predict(X_test)

In [137]:
# Receiving model metrics

from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

EVS_Lasso = explained_variance_score(y_test, predicted_y_Lasso)
MAE_Lasso = mean_absolute_error(y_test, predicted_y_Lasso)
MSE_Lasso = mean_squared_error(y_test, predicted_y_Lasso)

EVS_ard = explained_variance_score(y_test, predicted_y_ard)
MAE_ard = mean_absolute_error(y_test, predicted_y_ard)
MSE_ard = mean_squared_error(y_test, predicted_y_ard)

EVS_enet = explained_variance_score(y_test, predicted_y_enet)
MAE_enet = mean_absolute_error(y_test, predicted_y_enet)
MSE_enet = mean_squared_error(y_test, predicted_y_enet)

### Regression results

In [138]:
# Returns summary table for gotten model metrics

reg_summary = pd.DataFrame({'Lasso':[EVS_Lasso, MAE_Lasso, MSE_Lasso],
                            'ARDRegression':[EVS_ard, MAE_ard, MSE_ard],
                            'ElasticNet':[EVS_enet, MAE_enet, MSE_enet],
                            'CNN':[None, test_scores[2], test_scores[1]]},
                           index=['EVS', 'MAE', 'MSE']).T
reg_summary

Unnamed: 0,EVS,MAE,MSE
Lasso,0.778256,1.803692,43.906529
ARDRegression,0.835284,1.827691,32.8148
ElasticNet,0.375667,3.724602,123.186156
CNN,,1.241022,16.875847


Вывод: для классификации предлагается использовать модели глубокого обучения,
т.к. они показывают лучшие результаты. В частности, сверточную нейронную сеть.
Каждую полученную модель можно сохранить и использовать в дальнейшем.

# Classification. Predicts 1 when profit > 0, and 0 otherwise.

In [153]:
# Slicing original dataset to train and test samples
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y_class, test_size=0.2, random_state=42)

### Neural networks

Convolutional Neural network

In [140]:
# Defining model
inputs = keras.Input(shape=(6,1))
x = layers.BatchNormalization()(inputs)

x = layers.Conv1D(filters=5, kernel_size=(2), activation='relu')(x)
x = layers.Dense(300, activation='relu')(x)
x = layers.Dense(150, activation='relu')(x)
x = layers.Dense(75, activation='relu')(x)
x = layers.Dropout(0.2)(x)
x = layers.Flatten()(x)
outputs = keras.layers.Dense(1, activation='sigmoid')(x)

model_class = keras.Model(inputs=inputs, outputs=outputs, name="classification-model")
model_class.summary()

Model: "classification-model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_25 (InputLayer)       [(None, 6, 1)]            0         
                                                                 
 batch_normalization_24 (Ba  (None, 6, 1)              4         
 tchNormalization)                                               
                                                                 
 conv1d_22 (Conv1D)          (None, 5, 5)              15        
                                                                 
 dense_93 (Dense)            (None, 5, 300)            1800      
                                                                 
 dense_94 (Dense)            (None, 5, 150)            45150     
                                                                 
 dense_95 (Dense)            (None, 5, 75)             11325     
                                              

In [141]:
model_class.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    optimizer=keras.optimizers.legacy.Adam(learning_rate=0.002),
    metrics=[ 'accuracy', tf.keras.metrics.Precision(),tf.keras.metrics.Recall()],
)

In [142]:
# Model training
ephs = 16

history = model_class.fit(X_train, np.array(y_train), batch_size=8 , epochs=ephs, validation_split=0.2)
test_scores_CNN_class = model_class.evaluate(X_test, np.array(y_test), verbose=1)


Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


fully connected neural network

In [143]:
# Defining model
inputs = keras.Input(shape=(6,1))
x = layers.BatchNormalization()(inputs)
x = layers.Dense(300, activation='relu')(x)
x = layers.Dense(150, activation='relu')(x)
x = layers.Dense(70, activation='relu')(x)
x = layers.Dropout(0.2)(x)
x = layers.Flatten()(x)
outputs = keras.layers.Dense(1, activation='sigmoid')(x)

model = keras.Model(inputs=inputs, outputs=outputs, name="classification-model")
model.summary()

Model: "classification-model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_26 (InputLayer)       [(None, 6, 1)]            0         
                                                                 
 batch_normalization_25 (Ba  (None, 6, 1)              4         
 tchNormalization)                                               
                                                                 
 dense_97 (Dense)            (None, 6, 300)            600       
                                                                 
 dense_98 (Dense)            (None, 6, 150)            45150     
                                                                 
 dense_99 (Dense)            (None, 6, 70)             10570     
                                                                 
 dropout_20 (Dropout)        (None, 6, 70)             0         
                                              

In [144]:
model.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    optimizer=keras.optimizers.legacy.Adam(learning_rate=0.001),
    metrics=[ 'accuracy', tf.keras.metrics.Precision(),tf.keras.metrics.Recall()],
)

In [145]:
# Model training
ephs = 10

history = model.fit(X_train, np.array(y_train), batch_size=1 , epochs=ephs, validation_split=0.2)
test_scores_DNN_class = model.evaluate(X_test, np.array(y_test), verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Classic ML models

In [146]:
# Testing classic ML algorithms 

from sklearn import svm
clf_SVC = svm.SVC()
clf_SVC.fit(X_train,y_train)

from sklearn.linear_model import SGDClassifier
clf_SGD = SGDClassifier(loss="hinge", penalty="l2", max_iter=5)
clf_SGD.fit(X_train, y_train)

from sklearn.neighbors import NearestCentroid
clf_NC = NearestCentroid()
clf_NC.fit(X_train, y_train)



In [147]:
# method for calculating precision and recall for received data
def metrics_class(y_test, y_pred):
    TN, TP, FN, FP = 0, 0, 0, 0

    for i in range(len(y_test)):
        if y_pred[i] == 0 and y_test[i] == 0:
            TN += 1
        if y_pred[i] == 1 and y_test[i] == 1:
            TP += 1
        if y_pred[i] == 0 and y_test[i] == 1:
            FN += 1
        if y_pred[i] == 1 and y_test[i] == 0:
            FP += 1
    
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    
    return precision, recall

In [148]:
# Making predictions using the resulting models 
predicted_y_SVC = clf_SVC.predict(X_test)
predicted_y_SGD = clf_SGD.predict(X_test)
predicted_y_NC = clf_NC.predict(X_test)

precision_SVC, recall_SVC = metrics_class(y_test, predicted_y_SVC)
precision_SGD, recall_SGD = metrics_class(y_test, predicted_y_SGD)
precision_NC, recall_NC = metrics_class(y_test, predicted_y_NC)

In [150]:
# Receiving model F1-metric
F1_SVC = 2 * (precision_SVC * recall_SVC) / (precision_SVC + recall_SVC)
F1_SVD = 2 * (precision_SGD * recall_SGD) / (precision_SGD + recall_SGD)
F1_NC = 2 * (precision_NC * recall_NC) / (precision_NC + recall_NC)
F1_CNN = 2 * (test_scores_CNN_class[2] * test_scores_CNN_class[3]) / (test_scores_CNN_class[2] + test_scores_CNN_class[3])
F1_DNN = 2 * (test_scores_DNN_class[2] * test_scores_DNN_class[3]) / (test_scores_DNN_class[2] + test_scores_DNN_class[3])

### Classification results

In [152]:
# Returns summary table for gotten model metrics

class_summary = pd.DataFrame({'SVC':[precision_SVC, recall_SVC, F1_SVC],
                            'SGD':[precision_SGD, recall_SGD, F1_SVD],
                            'NC':[precision_NC, recall_NC, F1_NC],
                            'DNN':[test_scores_DNN_class[2], test_scores_DNN_class[3], F1_DNN],
                            'CNN':[test_scores_CNN_class[2], test_scores_CNN_class[3], F1_CNN]},
                           index=['precision', 'recall', 'F1']).T
class_summary

Unnamed: 0,precision,recall,F1
SVC,1.0,0.066667,0.125
SGD,0.666667,0.133333,0.222222
NC,0.1875,0.4,0.255319
DNN,1.0,0.2,0.333333
CNN,0.846154,0.733333,0.785714


Вывод: для классификации предлагается использовать модели глубокого обучения, 
т.к. они показывают лучшие результаты. В частности, сверточную нейронную сеть.
Каждую полученную модель можно сохранить и использовать в дальнейшем.