In [2]:
# Data manipulation
# ==============================================================================
import numpy as np
import pandas as pd

# Plots
# ==============================================================================
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
plt.rcParams['lines.linewidth'] = 1.5
import seaborn as sns
sns.set(style="darkgrid")
%matplotlib inline

# Modeling and Forecasting
# ==============================================================================
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import make_pipeline
# from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
import statsmodels.api as sm

from joblib import dump, load

# Warnings configuration
# ======================================================================================
import warnings
# warnings.filterwarnings('ignore')

In [2]:
def reading_data(file_name):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    file_name : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents, using ``pd.read_csv`` defaults.
    """
    return pd.read_csv(file_name)

In [3]:
def pre_processing(data):
    """Rename, type and index the raw frame.

    Steps, in order:
      1. set the global pandas float display format to two decimals with
         thousands separators (affects every later DataFrame display);
      2. rename the long source column names to short codes;
      3. convert the amount columns to numeric dtypes;
      4. parse ``date`` from ``YYYY/MM/DD`` strings;
      5. drop the ``xxxx`` column and use ``date`` as the index.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing the source column names listed below plus a
        column named ``xxxx``.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``date``; the input frame is not modified.
    """
    short_names = {
        'Month': 'date',
        'Core capital': 'CC',
        'Total assets': 'TA',
        'Non-performing loans': 'NPL',
        'Gross Loan Portifolio/Total loans': 'GLP-TL',
        'Non-earning assets': 'NEA',
        'General loan loss reserve': 'GLLR',
        'Gross loans': 'GL',
        'Write-offs': 'WO',
        'Recoveries': 'RCV',
    }
    # NOTE(review): NEA is renamed but not part of the numeric conversion
    # below -- confirm it always arrives as a numeric column.
    numeric_columns = ["CC", "TA", "NPL", "GLP-TL", "GLLR", "GL", "WO", "RCV"]

    pd.set_option('display.float_format', '{:,.2f}'.format)

    frame = data.rename(columns=short_names)
    frame[numeric_columns] = frame[numeric_columns].apply(pd.to_numeric)
    frame['date'] = pd.to_datetime(frame['date'], format='%Y/%m/%d')
    return frame.drop('xxxx', axis=1).set_index('date')

In [4]:
def handle_plain_negatives(row):
    """Return 0 when ``row`` is negative, otherwise return ``row`` unchanged."""
    return 0 if row < 0 else row

In [5]:
def handle_percentage_negatives(row):
    """Return 0 when ``row`` is negative, otherwise ``row`` multiplied by 100."""
    return 0 if row < 0 else row * 100

In [6]:
def further_preprocessing(data):
    """Add ratio columns, regularise to month-end frequency and clip negatives.

    The ratios are computed from the values as they arrive (before any
    clipping):

    * ``y1`` = CC / TA        (capital adequacy)
    * ``y2`` = NPL / GLP-TL   (asset quality 1)
    * ``y3`` = NEA / TA       (asset quality 2)

    The frame is then reindexed to a month-end frequency (months missing from
    the index become all-NaN rows) and sorted by date. Finally:

    * negative values in CC, TA, NPL, GLP-TL, GLLR, GL, WO and RCV become 0;
    * ``y1`` and ``y2`` have negatives replaced by 0 and are multiplied by
      100 (percentages).

    ``NEA`` and ``y3`` are neither clipped nor scaled. Asset quality 3
    (GLLR / GL) and asset quality 4 ((WO - RCV) / TA) are not computed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a DatetimeIndex and the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched.
    """
    # Work on a copy so the ratio columns are not written into the caller's frame.
    data = data.copy()

    # Capital adequacy
    data["y1"] = data["CC"] / data["TA"]
    # Asset quality 1
    data["y2"] = data["NPL"] / data["GLP-TL"]
    # Asset quality 2
    data["y3"] = data["NEA"] / data["TA"]

    # An explicit MonthEnd offset is the same frequency as the 'M' alias but
    # does not rely on an alias that newer pandas releases deprecate.
    data = data.asfreq(pd.offsets.MonthEnd()).sort_index()

    # Negative amounts are replaced by 0; NaN rows introduced by asfreq stay NaN.
    amount_columns = ["CC", "TA", "NPL", "GLP-TL", "GLLR", "GL", "WO", "RCV"]
    data[amount_columns] = data[amount_columns].clip(lower=0)

    # Negative ratios become 0, everything else is expressed as a percentage.
    percentage_columns = ["y1", "y2"]
    data[percentage_columns] = data[percentage_columns].clip(lower=0) * 100
    return data
    

In [7]:
def define_x_y(data, y_column):
    """Build the feature matrix and target for a given target column.

    Only ``"y1"`` is supported: its features are the ``CC`` and ``TA``
    columns (in that order) as a NumPy array, and the target is the ``y1``
    column as a Series.

    Returns
    -------
    tuple or bool
        ``(x, y)`` for ``"y1"``; ``False`` for any other ``y_column``.
    """
    if y_column != "y1":
        return False

    features = data[['CC', 'TA']].values
    target = data[y_column]
    return features, target

In [8]:
def get_score(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and return its score on the test data.

    ``model`` must provide ``fit(X, y)`` and ``score(X, y)``; it is fitted
    in place and the value returned by ``score`` is passed through.
    """
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    return test_score

In [24]:
def save_model(model, name):
    """Write ``model`` to ``<name>_model.joblib`` using joblib's ``dump``."""
    target_file = name + '_model.joblib'
    dump(model, target_file)

In [10]:
# Load the CSV and run both preprocessing stages, then preview the first rows.
data = further_preprocessing(
    pre_processing(
        reading_data("saccoss-performance.csv")
    )
)
data.head()

Unnamed: 0_level_0,CC,TA,NPL,GLP-TL,NEA,GLLR,GL,WO,RCV,y1,y2,y3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-01-31,0.0,1647185345.0,25605561,1471439693,162450652.0,0,0,0,0,0.0,1.74,0.1
2015-02-28,0.0,1392856588.0,25605561,1179087930,200973658.0,0,0,0,0,0.0,2.17,0.14
2015-03-31,132923349.0,2588543756.0,38147858,2348820110,226428646.0,0,0,0,0,5.14,1.62,0.09
2015-04-30,119016272.0,2552087975.0,38147858,2222072340,316720635.0,0,0,0,0,4.66,1.72,0.12
2015-05-31,132034626.06,2584122690.0,38147858,2251849017,318978673.0,0,0,0,0,5.11,1.69,0.12


In [11]:
# Feature matrix (X) and target (y) for the y1 model.
x_y_splits = define_x_y(data, "y1")
X, y = x_y_splits

In [12]:
# Hold out half of the samples (test_size=0.5) for testing; random_state is fixed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, random_state = 0)

In [13]:
# Cross-validate a LinearRegression on the full X, y with cross_val_score's defaults.
cross_val_score(LinearRegression(), X, y)

array([0.99930436, 0.99904362, 0.99903948, 0.99727141, 0.99877632])

In [14]:
# Cross-validate a 40-tree random forest on the full X, y. random_state is
# pinned so the scores are the same on every run.
cross_val_score(RandomForestRegressor(n_estimators=40, random_state=0), X, y)

array([1.        , 1.        , 0.99999863, 1.        , 1.        ])

In [15]:
# Fit a LinearRegression on the training split.
ml_linear = LinearRegression()
ml_linear.fit(X_train, y_train)

LinearRegression()

In [16]:
# Predictions of the fitted linear model for the held-out test split.
y_pred = ml_linear.predict(X_test)

In [17]:
# Spot-check the model on one hand-written sample; define_x_y orders the
# features as [CC, TA], so this is CC=1, TA=0.
ml_linear.predict(np.array([[1,0]]))

array([-0.39495414])

In [25]:
# Persist the fitted linear model; save_model writes it to "y1_model.joblib".
save_model(ml_linear, "y1")

In [None]:
# R^2 of the linear model's predictions against the test targets.
r2_score(y_test, y_pred)

In [None]:
# Scatter the actual test targets (x-axis) against the model's predictions (y-axis).
plt.figure(figsize=(9, 5))
plt.scatter(y_test, y_pred)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs. Predicted')

In [None]:
# Table of actual values, predictions and their difference; show the first 20 rows.
pred_y_df = pd.DataFrame(
    {
        'Actual Value': y_test,
        'Predicted Value': y_pred,
        'Difference': y_test - y_pred,
    }
)
pred_y_df[:20]