# Data Analysis

## Table of Contents
- Load Data
- Sample Description
    - Loan Related
    - User Related
- RFM Analysis
    - Scores Definitions
    - Segment Definitions
    - Segment Analysis
- Explanatory Analysis
    - Loan Profile
    - Repayment Profile
    - User Profile

In [1]:
from copy import deepcopy
import numpy as np
import pandas as pd

# visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# plotly config: render figures through the "browser" renderer
import plotly.io as pio

pio.renderers.default = "browser"

# styling shared by the figures in this notebook
template = "plotly_white"
title_font_size = 22
font_size = 18

# named RGB colors, grouped by hue (dark / medium / light)
colors = dict(
    red="rgb(255,0,0)",
    dark_blue="rgb(30,144,255)",
    medium_blue="rgb(0,191,255)",
    light_blue="rgb(135,206,250)",
    dark_purple="rgb(202, 105, 157)",
    medium_purple="rgb(221, 136, 172)",
    light_purple="rgb(234, 169, 189)",
    dark_gray="rgb(169,169,169)",
    medium_gray="rgb(192,192,192)",
    light_gray="rgb(211,211,211)",
)

# Load Data

In [3]:
# the analysis is based on the training-set loans and their statistics
loans_path = "../data/interim/df_loans_train_raw.pkl"
df_loans = pd.read_pickle(loans_path)

df_loans

Unnamed: 0,id,user_id,amount,total_amount,due_amount,due_date,status,created_at,status_cleaned,loan_term,...,status_paid,rate_default,median_repayment_amount,num_repayments,ratio_repaid_total,max_repayment_delay,due_date_exceeded,num_late_repayment,median_days_lag_repayment,loan_quality
0,86,1,6000.0,6045.28,6459.00,2022-05-03,debt_collection,2022-02-02,debt,90,...,1,0.000000,39.00,1,0.006451,0,0,0,1.0,good_loan
2,1744,3,6000.0,6045.28,6458.80,2022-07-18,repaid,2022-04-18,repaid,91,...,18,0.052632,270.00,19,1.037472,0,0,0,3.0,good_loan
5,6428,4,6000.0,6045.28,7749.16,2022-12-18,repaid,2022-09-19,repaid,90,...,25,0.074074,169.50,27,1.068647,0,0,0,1.0,bad_loan
6,2222,5,6000.0,6045.28,6458.80,2022-07-21,repaid,2022-04-22,repaid,90,...,28,0.000000,129.58,28,1.011629,0,0,0,1.0,good_loan
7,3380,5,6000.0,6045.28,6458.80,2022-08-25,repaid,2022-05-27,repaid,90,...,50,0.019608,115.06,51,1.023666,0,0,0,1.0,good_loan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6583,1186,3153,6000.0,6045.28,6458.80,2022-06-13,repaid,2022-03-15,repaid,90,...,28,0.034483,101.25,29,1.017945,220,1,1,1.0,good_loan
6584,3111,3153,6000.0,6045.28,6458.80,2022-08-02,repaid,2022-05-04,repaid,90,...,13,0.071429,466.50,14,1.008317,0,0,0,1.0,good_loan
6585,3856,3153,6000.0,6045.28,6458.78,2022-09-13,repaid,2022-06-15,repaid,90,...,13,0.000000,207.00,13,1.010607,0,0,0,1.0,good_loan
6586,4358,3153,6000.0,6045.28,6458.80,2022-10-02,repaid,2022-07-04,repaid,90,...,17,0.055556,78.00,18,1.018403,0,0,0,1.0,good_loan


In [4]:
# load all repayments, then keep only those whose loan_id appears in df_loans
df_loan_repayments = pd.read_pickle("../data/processed/df_loan_repayments.pkl")

loan_ids = set(df_loans["id"].unique())
belongs_to_sample = df_loan_repayments["loan_id"].isin(loan_ids)
df_loan_repayments = df_loan_repayments[belongs_to_sample]

df_loan_repayments

Unnamed: 0,id,loan_id,type,amount,status,created_at,status_cleaned
0,1,2,autopilot,269.70,paid,2022-02-01,paid
1,2,4,autopilot,2550.00,paid,2022-02-01,paid
2,3,53,pix,1500.00,defaulted,2022-02-01,defaulted
3,4,22,autopilot,630.00,paid,2022-02-02,paid
4,5,70,autopilot,120.00,paid,2022-02-02,paid
...,...,...,...,...,...,...,...
172440,172441,4524,pix,362.75,defaulted,2023-03-21,defaulted
172441,172442,4524,pix,362.75,refunded,2023-03-22,paid
172442,172443,4524,autopilot,239.12,refunded,2023-03-22,paid
172443,172444,4524,pix,362.75,defaulted,2023-03-22,defaulted


In [5]:
# load the user-level table and make sure the installment rate is a float column
users_path = "../data/processed/df_users.pkl"
df_users = pd.read_pickle(users_path)

installment_rate = df_users["rate_transactions_installment"].astype(float)
df_users["rate_transactions_installment"] = installment_rate

df_users

Unnamed: 0,user_id,recency,frequency,monetary,median_intallments,median_lag_transaction,rate_denied,rate_transactions_installment,rate_credit_debit,ratio_online_person,...,median_repayment_amount,max_repayment_delay,median_days_lag_repayment,num_loans,rate_due_date_exceeded,num_repayments,num_late_repayment,rate_late_repayment,rate_manual_auto_repayments,rate_repaid_loans
0,1,346,10,25154.18,2.0,5.0,0.300000,0.700000,5.000000,0.090909,...,39.0000,0.0,1.0,1.0,0.000000,1.0,0.0,0.000000,0.500000,0.000
1,2,340,35,66491.00,10.0,1.0,0.057143,0.971429,17.500000,0.027778,...,,,,,,,,,,
2,3,191,78,117555.00,5.0,1.0,0.115385,1.000000,79.000000,0.025641,...,270.0000,0.0,3.0,1.0,0.000000,19.0,0.0,0.000000,0.105263,1.000
3,4,15,286,423169.31,1.0,0.0,0.090909,0.419580,1.666667,0.006993,...,169.5000,0.0,1.0,1.0,0.000000,27.0,0.0,0.000000,0.115385,1.000
4,5,54,793,440568.44,1.0,0.0,0.083228,0.480454,131.500000,0.001259,...,122.3200,0.0,1.0,2.0,0.000000,79.0,0.0,0.000000,0.051948,1.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3038,3149,216,231,96516.80,1.0,0.0,0.069264,0.333333,2.065789,0.008658,...,40.8000,8.0,1.0,1.0,1.000000,15.0,1.0,0.066667,0.062500,0.000
3039,3150,8,849,395379.93,1.0,0.0,0.174323,0.398115,2.590717,0.291351,...,217.1850,10.0,1.0,8.0,0.250000,146.0,2.0,0.013699,0.129771,0.875
3040,3151,140,227,73475.80,2.0,0.0,0.092511,0.568282,2.271429,0.022321,...,87.5625,1.0,1.5,2.0,0.500000,69.0,1.0,0.014493,0.028986,1.000
3041,3152,39,1118,359051.73,3.0,0.0,0.135063,0.952594,61.222222,0.029412,...,110.5750,0.0,1.0,4.0,0.000000,177.0,0.0,0.000000,0.022857,1.000


# Sample Description

- Loan Related
- User Related

## Loan Debt

**Note:** 23% of the loans have resulted in debt

In [6]:
# replace underscores with spaces in the status values used as axis labels
status_labels = df_loans["status"].str.replace("_", " ")
df_loans["status"] = status_labels

In [7]:
# share of loans per status; rows whose status equals "error" are left out
loans_without_errors = df_loans.loc[df_loans["status"].ne("error")]
layout_opts = dict(
    title="<b>23% of the loans have resulted in debt</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="loan status",
    yaxis_title="% of loans",
    template=template,
)

fig = px.histogram(
    data_frame=loans_without_errors,
    x="status",
    histnorm="percent",
    color_discrete_sequence=[colors["dark_blue"]],
)
fig.update_layout(**layout_opts)
fig.update_xaxes(categoryorder="total descending")
fig.update_traces(texttemplate="%{y:,.0f}%", textposition="inside")

fig.show()

![img_1.png](attachment:ba50122c-7780-47f5-b309-492cd2da1322.png)

## Loan Principal Amount

**Note:** Loan principal amounts are typically above R$6k

In [8]:
# share of loans per principal-amount bin, bins shown in ascending order
amount_bin_order = ["1k - 2k", "2k - 3k", "3k - 4k", "4k - 5k", "5k - 6k", "6k - 7k"]

fig = (
    px.histogram(
        data_frame=df_loans,
        x="amount_bin",
        histnorm="percent",
        color_discrete_sequence=[colors["dark_blue"]],
    )
    .update_layout(
        title="<b>Loans typically range from more than R$6k</b>",
        title_font_size=title_font_size,
        font_size=font_size,
        xaxis_title="principal loan amount (R$)",
        yaxis_title="% of loans",
        template=template,
    )
    .update_xaxes(categoryorder="array", categoryarray=amount_bin_order)
    .update_traces(texttemplate="%{y:,.0f}%", textposition="outside")
)

fig.show()

![img_2.png](attachment:9597374f-46f1-49e9-a223-79bff8915c61.png)

## Loan Fees

**Note:** Loan fees typically range from R$40-50

In [9]:
# share of loans per fee bin, bins shown in ascending order
fee_bin_order = ["0 - 10", "10 - 20", "20 - 30", "30 - 40", "40 - 50"]
layout_opts = dict(
    title="<b>Loan fees typically range from R$40-50</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="loan fees (R$)",
    yaxis_title="% of loans",
    template=template,
)

fig = px.histogram(
    data_frame=df_loans,
    x="loan_fees_bin",
    histnorm="percent",
    color_discrete_sequence=[colors["dark_blue"]],
)
fig.update_layout(**layout_opts)
fig.update_xaxes(categoryorder="array", categoryarray=fee_bin_order)
fig.update_traces(texttemplate="%{y:,.0f}%", textposition="outside")

fig.show()

![img_3.png](attachment:00f7af94-f844-431b-8922-0e6f60ad1a0f.png)

## Loan Due Amount

**Note:** Loans can range up to R$9k, if there are no repayments during the contract period

In [10]:
# share of loans per due-amount bin, bins shown in ascending order
due_amount_bin_order = [
    "1k - 2k", "2k - 3k", "3k - 4k", "4k - 5k",
    "5k - 6k", "6k - 7k", "7k - 8k", "8k - 9k",
]

fig = (
    px.histogram(
        data_frame=df_loans,
        x="due_amount_bin",
        histnorm="percent",
        color_discrete_sequence=[colors["dark_blue"]],
    )
    .update_layout(
        title="<b>Loans can range up to R$9k, if there are no repayments during the contract period</b>",
        title_font_size=title_font_size,
        font_size=font_size,
        xaxis_title="due loan amount (R$)",
        yaxis_title="% of loans",
        template=template,
    )
    .update_xaxes(categoryorder="array", categoryarray=due_amount_bin_order)
    .update_traces(texttemplate="%{y:,.0f}%", textposition="outside")
)

fig.show()

![img_4.png](attachment:31e1120b-62ba-4cde-82bf-a0e7230bbaf3.png)

## Loan Interest Rate

**Note:** Only 18% of the loans have a high interest rate (exceeding 100%)

In [11]:
# share of loans per interest-rate bucket, ordered small -> medium -> large
rate_bucket_order = ["small", "medium", "large"]
layout_opts = dict(
    title="<b>Only 18% of the loans have a high interest rate (exceeding 100%)</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="loan interest rate (%)",
    yaxis_title="% of loans",
    template=template,
)

fig = px.histogram(
    data_frame=df_loans,
    x="interest_rate_bin",
    histnorm="percent",
    color_discrete_sequence=[colors["dark_blue"]],
)
fig.update_layout(**layout_opts)
fig.update_xaxes(categoryorder="array", categoryarray=rate_bucket_order)
fig.update_traces(texttemplate="%{y:,.0f}%", textposition="outside")

fig.show()

![img_5.png](attachment:8db842db-f88b-4bfd-a74a-19ee49c97afe.png)

## Loan Term

**Note:** Loans typically need to be repaid in the next 3 months

In [12]:
# summary statistics of the loan term
loan_term = df_loans["loan_term"]
loan_term.describe()

count    5270.000000
mean       90.058824
std         0.351707
min        90.000000
25%        90.000000
50%        90.000000
75%        90.000000
max        96.000000
Name: loan_term, dtype: float64

## Loan Repayment Default

**Note:** Only 5% of the loan repayments have gone to default

In [13]:
# share of repayments per cleaned status, most frequent status first
fig = (
    px.histogram(
        data_frame=df_loan_repayments,
        x="status_cleaned",
        histnorm="percent",
        color_discrete_sequence=[colors["dark_blue"]],
    )
    .update_layout(
        title="<b>Only 5% of the loan repayments have gone to default</b>",
        title_font_size=title_font_size,
        font_size=font_size,
        xaxis_title="repayment status",
        yaxis_title="% of repayments",
        template=template,
    )
    .update_xaxes(categoryorder="total descending")
    .update_traces(texttemplate="%{y:,.0f}%", textposition="inside")
)

fig.show()

![img_6.png](attachment:63e161ce-9997-45a3-954f-c149905dc2de.png)

## Number of Loan Repayments

**Note:** Loans are typically repaid in 10-40 repayments

In [14]:
# distribution of the number of repayments per loan, with a marginal box plot
layout_opts = dict(
    title="<b>Loans are typically repaid in 10-40 repayments</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="number of repayments",
    yaxis_title="% of loans",
    template=template,
)

fig = px.histogram(
    data_frame=df_loans,
    x="num_repayments",
    histnorm="percent",
    marginal="box",
    color_discrete_sequence=[colors["dark_blue"]],
)
fig.update_layout(**layout_opts)

fig.show()

![img_9.png](attachment:a36b18ae-a221-4513-8bb7-e7f814ae4dad.png)

In [15]:
# summary statistics of the number of repayments per loan
repayments_per_loan = df_loans["num_repayments"]
repayments_per_loan.describe()

count    5270.000000
mean       26.242125
std        21.082838
min         1.000000
25%        10.000000
50%        21.000000
75%        38.000000
max       271.000000
Name: num_repayments, dtype: float64

## Loan Repaid Ratio

**Note:** Loans resulting in debt, can be partially repaid up to 45%

In [16]:
# repaid-ratio distribution restricted to loans whose cleaned status is "debt"
is_debt = df_loans["status_cleaned"].eq("debt")
df_loans.loc[is_debt, "ratio_repaid_total"].describe()

count    1195.000000
mean        0.355631
std         0.269146
min         0.000000
25%         0.117972
50%         0.315534
75%         0.554898
max         0.945293
Name: ratio_repaid_total, dtype: float64

## Loan Due Date Exceeded

**Note:** 
- 81% of the loans have not exceeded the repayment due date (about 19% have exceeded it)
- Loans that exceed the due date typically have 2-6 late repayments

In [17]:
# turn the 0/1 flag into readable category labels for plotting
due_date_labels = {0: "not exceeded", 1: "exceeded"}
df_loans["due_date_exceeded"] = df_loans["due_date_exceeded"].replace(due_date_labels)

In [18]:
# share of loans that did / did not exceed the repayment due date
# NOTE: 81% is the "not exceeded" share; the describe() output further down shows
# only 1015 of the 5270 loans carry the "exceeded" label (~19%)
fig = px.histogram(
    df_loans,
    x="due_date_exceeded",
    histnorm="percent",
    color_discrete_sequence=[colors["dark_blue"]],
)

fig.update_layout(
    title="<b>81% of the loans have not exceeded the repayment due date</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="due date exceeded",
    yaxis_title="% of loans",
    template=template,
)
# most frequent category first
fig.update_xaxes(
    categoryorder="total descending"
)
# print the rounded percentage inside each bar
fig.update_traces(
    texttemplate="%{y:,.0f}%",
    textposition="inside"
)

fig.show()

![img_10.png](attachment:efa049e2-7f94-4bcd-8e44-28f3da2b5f50.png)

In [19]:
# number of late repayments, only for loans labelled "exceeded"
exceeded_loans = df_loans.loc[df_loans["due_date_exceeded"].eq("exceeded")]

fig = px.histogram(
    data_frame=exceeded_loans,
    x="num_late_repayment",
    histnorm="percent",
    marginal="box",
    color_discrete_sequence=[colors["dark_blue"]],
).update_layout(
    title="<b>On average, loans exceed due date by 2-6 repayments</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="number of late repayments",
    yaxis_title="% of loans",
    template=template,
)

fig.show()

![img_11.png](attachment:885887e6-6eb6-48ea-9aad-5360cb7e733f.png)

In [20]:
# summary statistics of late repayments for loans labelled "exceeded"
is_exceeded = df_loans["due_date_exceeded"].eq("exceeded")
df_loans.loc[is_exceeded, "num_late_repayment"].describe()

count    1015.00000
mean        4.46798
std         3.50777
min         1.00000
25%         2.00000
50%         3.00000
75%         6.00000
max        24.00000
Name: num_late_repayment, dtype: float64

## Loan Repayment Lag

**Note:** Repayments happen every 1-3 days

In [21]:
# summary statistics of the median lag (in days) between repayments
repayment_lag = df_loans["median_days_lag_repayment"]
repayment_lag.describe()

count    5270.000000
mean        2.899620
std         7.023669
min         0.000000
25%         1.000000
50%         1.000000
75%         2.000000
max       241.000000
Name: median_days_lag_repayment, dtype: float64

## Loan Quality

**Note:** 1 in 4 loans has led to repayment conditions that carry a degree of risk

In [22]:
# shorter display labels for the loan quality categories
quality_labels = {"good_loan": "high", "bad_loan": "low"}
df_loans["loan_quality"] = df_loans["loan_quality"].replace(quality_labels)

In [23]:
# share of loans per quality label, most frequent label first
layout_opts = dict(
    title="<b>1/4 loans has led to repayment conditions that carry a degree of risk</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="loan quality",
    yaxis_title="% of loans",
    template=template,
)

fig = px.histogram(
    data_frame=df_loans,
    x="loan_quality",
    histnorm="percent",
    color_discrete_sequence=[colors["dark_blue"]],
)
fig.update_layout(**layout_opts)
fig.update_xaxes(categoryorder="total descending")
fig.update_traces(texttemplate="%{y:,.0f}%", textposition="inside")

fig.show()

![img_12.png](attachment:6eec6674-1447-44cb-acf6-5b71103c3959.png)

## Card Type Preference

**Note:** 82% of the users prefer to make purchases using a credit card

In [7]:
# a credit-to-debit ratio of at most 1 is labelled "debit", anything else "credit"
ratio_at_most_one = df_users["rate_credit_debit"].le(1)
df_users["credit_debit_preference"] = np.where(ratio_at_most_one, "debit", "credit")

In [8]:
# share of users per preferred card type, most frequent first
fig = (
    px.histogram(
        data_frame=df_users,
        x="credit_debit_preference",
        histnorm="percent",
        color_discrete_sequence=[colors["dark_blue"]],
    )
    .update_layout(
        title="<b>82% of the users prefer to make purchases using a credit card</b>",
        title_font_size=title_font_size,
        font_size=font_size,
        xaxis_title="card type preference",
        yaxis_title="% of users",
        template=template,
    )
    .update_xaxes(categoryorder="total descending")
    .update_traces(texttemplate="%{y:,.0f}%", textposition="inside")
)

fig.show()

![img_13.png](attachment:c30acee2-817d-4409-9300-ee3b5fdc82a8.png)

## Installment Preference

**Note:** Users typically finance 25% to 75% of their purchases through installment plans

In [9]:
# distribution of the installment-transaction rate per user, with a marginal box plot
layout_opts = dict(
    title="<b>Users typically finance 25% to 75% of their purchases through installment plans</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="% of transactions with installments",
    yaxis_title="% of users",
    template=template,
)

fig = px.histogram(
    data_frame=df_users,
    x="rate_transactions_installment",
    histnorm="percent",
    marginal="box",
    color_discrete_sequence=[colors["dark_blue"]],
)
fig.update_layout(**layout_opts)

fig.show()

![img_14.png](attachment:2ef09a23-78be-4b57-9037-f4c88d629b40.png)

In [10]:
# summary statistics of the installment-transaction rate (cast to float first)
installment_rate = df_users["rate_transactions_installment"].astype(float)
installment_rate.describe()

count    3043.000000
mean        0.502685
std         0.288692
min         0.000000
25%         0.256330
50%         0.522388
75%         0.749419
max         1.000000
Name: rate_transactions_installment, dtype: float64

## Installments

**Note:** Users typically opt for installment plans ranging from 2-4 payments for their purchases

In [14]:
# round the median number of installments to whole numbers
rounded_installments = df_users["median_intallments"].round()
df_users["median_intallments"] = rounded_installments

In [17]:
# distribution of the median number of installments per user, with a marginal box plot
fig = px.histogram(
    data_frame=df_users,
    x="median_intallments",
    histnorm="percent",
    marginal="box",
    color_discrete_sequence=[colors["dark_blue"]],
).update_layout(
    title="<b>Users typically opt for installment plans ranging from 2-4 payments for their purchases</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="number of installments",
    yaxis_title="% of users",
    template=template,
)

fig.show()

![img_15.png](attachment:48616426-77d1-4133-b28c-be3d0ad23790.png)

In [16]:
# summary statistics of the median number of installments per user
median_installments = df_users["median_intallments"]
median_installments.describe()

count    3043.000000
mean        2.901084
std         2.801640
min         1.000000
25%         1.000000
50%         2.000000
75%         4.000000
max        12.000000
Name: median_intallments, dtype: float64

## Denied Transactions

**Note:** Users encounter denied transactions less than 20% of the time

In [18]:
# distribution of the denied-transaction rate per user, with a marginal box plot
layout_opts = dict(
    title="<b>Users encounter denied transactions, less than 20% of time</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="denied transactions ratio",
    yaxis_title="% of users",
    template=template,
)

fig = px.histogram(
    data_frame=df_users,
    x="rate_denied",
    histnorm="percent",
    marginal="box",
    color_discrete_sequence=[colors["dark_blue"]],
)
fig.update_layout(**layout_opts)

fig.show()

![img_16.png](attachment:87a234ed-a3c4-43a7-b5a4-5a9805ebcf6b.png)

In [19]:
# summary statistics of the denied-transaction rate per user
denied_rate = df_users["rate_denied"]
denied_rate.describe()

count    3043.000000
mean        0.172152
std         0.133161
min         0.000000
25%         0.080000
50%         0.126404
75%         0.229460
max         0.857585
Name: rate_denied, dtype: float64

## Purchases Mode

**Note:** Our base consists of users who value in-person purchases

In [23]:
# an online-to-in-person ratio of at most 1 is labelled "in-person", anything else "online"
ratio_at_most_one = df_users["ratio_online_person"].le(1)
df_users["online_person_preference"] = np.where(ratio_at_most_one, "in-person", "online")

In [24]:
# share of users per preferred purchase mode, most frequent first
fig = (
    px.histogram(
        data_frame=df_users,
        x="online_person_preference",
        histnorm="percent",
        color_discrete_sequence=[colors["dark_blue"]],
    )
    .update_layout(
        title="<b>Our base consists of users who value in-person purchases</b>",
        title_font_size=title_font_size,
        font_size=font_size,
        xaxis_title="purchase type",
        yaxis_title="% of users",
        template=template,
    )
    .update_xaxes(categoryorder="total descending")
    .update_traces(texttemplate="%{y:,.0f}%", textposition="outside")
)

fig.show()

![img_17.png](attachment:788e9bdd-9e03-40b2-938c-3b3ef0f42e52.png)

## User Number of Loans

**Note:** The majority of the users have taken 1 to 2 loans

In [40]:
# summary statistics of the number of loans per user
loans_per_user = df_users["num_loans"]
loans_per_user.describe()

count    2660.000000
mean        1.980451
std         1.648159
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max        21.000000
Name: num_loans, dtype: float64

## User Default Ratio

**Note:** Half of our user base consists of individuals with a default rate from 4%-15%

In [33]:
# distribution of the per-user default rate, scaled by 100 for the "(%)" axis
default_rate_pct = df_users["rate_default"].mul(100)
layout_opts = dict(
    title="<b>Half of our user base consists of individuals with a default rate from 4%-15%</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="default rate (%)",
    yaxis_title="% of users",
    template=template,
)

fig = px.histogram(
    x=default_rate_pct,
    histnorm="percent",
    marginal="box",
    color_discrete_sequence=[colors["dark_blue"]],
)
fig.update_layout(**layout_opts)

fig.show()

![img_18.png](attachment:2f693076-b7e1-482b-807c-d4ead28704b8.png)

## User Due Date Exceeded Rate

**Note:** 
- We observe that more than 50% of the users consistently exceed the due date.
- Due date exceeded rate is skewed since most users have taken 1 to 2 loans.

In [45]:
# summary statistics of the due-date-exceeded rate, only for users with a rate above 0
has_exceeded = df_users["rate_due_date_exceeded"].gt(0)
df_users.loc[has_exceeded, "rate_due_date_exceeded"].describe()

count    944.000000
mean       0.796124
std        0.281451
min        0.090909
25%        0.500000
50%        1.000000
75%        1.000000
max        1.000000
Name: rate_due_date_exceeded, dtype: float64

## User Late Repayment Rate

**Note:** Users who exceed the due date, tend to delay between 4%-15% of the repayments

In [51]:
# late-repayment rate (scaled by 100 for the "(%)" axis), only for users with a rate above 0
has_late_repayments = df_users["rate_late_repayment"].gt(0)
late_rate_pct = df_users.loc[has_late_repayments, "rate_late_repayment"].mul(100)

fig = px.histogram(
    x=late_rate_pct,
    histnorm="percent",
    marginal="box",
    color_discrete_sequence=[colors["dark_blue"]],
).update_layout(
    title="<b>Users who exceed the due date, tend to delay between 4%-15% of the repayments</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="late repayments rate (%)",
    yaxis_title="% of users",
    template=template,
)

fig.show()

![img_19.png](attachment:814c1840-9e04-4560-95bc-0826e2add9b5.png)

In [52]:
# summary statistics of the late-repayment rate, only for users with a rate above 0
has_late_repayments = df_users["rate_late_repayment"].gt(0)
df_users.loc[has_late_repayments, "rate_late_repayment"].describe()

count    944.000000
mean       0.116205
std        0.109339
min        0.005988
25%        0.048583
50%        0.096774
75%        0.146423
max        1.000000
Name: rate_late_repayment, dtype: float64

## Max Repayment Delay

**Note:** Users who exceed the due date, tend to delay the repayments from 4-13 days

In [53]:
# summary statistics of the maximum repayment delay, only for users with a delay above 0
has_delay = df_users["max_repayment_delay"].gt(0)
df_users.loc[has_delay, "max_repayment_delay"].describe()

count    944.000000
mean       9.710805
std       17.185102
min        1.000000
25%        4.000000
50%       10.000000
75%       13.000000
max      391.000000
Name: max_repayment_delay, dtype: float64

## Manual Repayment Preference

**Note:** Users typically prefer an automated plan of loan repayment

In [64]:
# distribution of the manual-to-auto repayment rate per user, scaled by 100
manual_auto_pct = df_users["rate_manual_auto_repayments"].mul(100)
layout_opts = dict(
    title="<b>Users typically prefer an automated plan of loan repayment</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="preference <br>manual to auto repayment (%)",
    yaxis_title="% of users",
    template=template,
)

fig = px.histogram(
    x=manual_auto_pct,
    histnorm="percent",
    marginal="box",
    color_discrete_sequence=[colors["dark_blue"]],
)
fig.update_layout(**layout_opts)

fig.show()

![img_20.png](attachment:0f0736de-4cee-43b1-9565-18491284cdd3.png)

In [54]:
# summary statistics of the manual-to-auto repayment rate per user
manual_auto_rate = df_users["rate_manual_auto_repayments"]
manual_auto_rate.describe()

count    2660.000000
mean        0.331841
std         0.860075
min         0.006494
25%         0.052632
50%         0.111111
75%         0.269231
max        16.000000
Name: rate_manual_auto_repayments, dtype: float64

# RFM Analysis

- Customer (User) Segmentation based on RFM Analysis
- RFM segments the consumer (user) base by their purchasing patterns or habits based on Recency (R), Frequency (F) and Monetary Value (M)
- *Recency:* how long ago they made a purchase
- *Frequency:* how often they make purchases
- *Monetary Value:* how much money they spend

## Scores Definitions

Typically, RFM scores are in a scale of 1-5. 

However, for the purpose of loan repayment analysis, the scores are considered in a scale of 1-3 (the lower, the better the results for a customer) in order to avoid segments that have similar behavior regarding loan repayment.

| Recency | Frequency | Monetary |
| :-----: | :-------: | :------: |
| up to 1 month (1) | more than 500 (1) | more than R\\$ 500k (1) |
| 1 to 3 months (2) | 100 to 500 (2) | R\\$ 100k to 500k (2) |
| more than 3 months (3) | less than 100 (3) | less than R\\$100k (3) |

In [None]:
# summary statistics of the three RFM inputs
rfm_columns = ["recency", "frequency", "monetary"]
df_users[rfm_columns].describe()

In [None]:
# Recency buckets
# Higher values indicate users who haven't made a purchase for a long time (not active)
# "1": recency <= 30, "2": 30 < recency <= 90, "3": recency > 90
recency_scores = pd.cut(
    df_users["recency"],
    bins=[-np.inf, 30, 90, np.inf],  # right-closed intervals
    labels=["1", "2", "3"],
)
# plain string labels (object dtype) so the scores can be concatenated later
df_users["recency_bin"] = recency_scores.astype(object)

In [None]:
# Frequency buckets
# Higher values indicate users who make frequent purchases (more active),
# so the score runs the other way: "3": frequency <= 100,
# "2": 100 < frequency <= 500, "1": frequency > 500
# (the previously computed q1/q3 quantiles were never used and have been dropped)
frequency_scores = pd.cut(
    df_users["frequency"],
    bins=[-np.inf, 100, 500, np.inf],  # right-closed intervals
    labels=["3", "2", "1"],
)
# plain string labels (object dtype) so the scores can be concatenated later
df_users["frequency_bin"] = frequency_scores.astype(object)

In [None]:
# Monetary buckets
# Higher values indicate users with higher spendings,
# so the score runs the other way: "3": monetary <= 100k,
# "2": 100k < monetary <= 500k, "1": monetary > 500k
# (the previously computed q1/q3 quantiles were never used and have been dropped)
monetary_scores = pd.cut(
    df_users["monetary"],
    bins=[-np.inf, 100_000, 500_000, np.inf],  # right-closed intervals
    labels=["3", "2", "1"],
)
# plain string labels (object dtype) so the scores can be concatenated later
df_users["monetary_bin"] = monetary_scores.astype(object)

## Segment Definitions

Customer (user) segments identified:
- **Champions:** Extremely active customers (high frequency and low recency) with moderate to high monetary value
- **Big Spenders:** Active customers (medium recency and frequency) with high monetary value
- **Promising:** Active customers (medium recency and frequency) with low to moderate monetary value
- **Recent:** Customers who entered our base recently (low frequency and moderate to high recency) with low to moderate monetary value
- **Inactive:** Customers with extremely low activity (last purchase more than 3 months ago and low to moderate frequency); monetary value isn't a factor here

In [None]:
# three-character segment code: recency score, then frequency score, then monetary score
df_users["rfm_segment"] = (
    df_users["recency_bin"]
    .add(df_users["frequency_bin"])
    .add(df_users["monetary_bin"])
)

In [None]:
# customer clusters based on RFM Analysis: each list holds the
# recency/frequency/monetary score codes that belong to one segment
cluster_1 = ["111", "113", "112"]  # Champions
cluster_2 = ["121", "231", "221", "211"]  # Big Spenders
cluster_3 = ["213", "122", "123", "212", "222", "223", "311", "313", "312"]  # Promising Customers
cluster_4 = ["131", "132", "133", "232", "233"]  # Recent Customers
cluster_5 = ["331", "332", "333", "321", "322", "323"]  # Inactive Customers

# (codes, label) pairs; the order is reused for the category order of the segment charts
clusters = [
    (cluster_1, "Champions"),
    (cluster_2, "Big Spenders"),
    (cluster_3, "Promising"),
    (cluster_4, "Recent"),
    (cluster_5, "Inactive"),
]

# flatten to a code -> label lookup; codes outside every cluster stay NaN
segment_by_code = {code: name for codes, name in clusters for code in codes}
df_users["rfm_segment_desc"] = df_users["rfm_segment"].map(segment_by_code)

In [None]:
# share of users per RFM segment, shown in the order the clusters were defined
fig = px.histogram(
    df_users,
    x="rfm_segment_desc",
    histnorm="percent",
    color_discrete_sequence=[colors["dark_blue"]],
)

fig.update_layout(
    title="<b>83% of the users are frequent buyers, with moderate to modest spending habits<br><sub>(Promising and Inactive segments)</sub></b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="user segment",
    # axis label aligned with the other user-level charts ("% of users")
    yaxis_title="% of users",
    template=template,
)
fig.update_xaxes(
    categoryorder='array', 
    categoryarray=[label for cluster, label in clusters]
)
# print the rounded percentage inside each bar
fig.update_traces(
    texttemplate="%{y:,.0f}%",
    textposition="inside"
)

fig.show()

![img_1.png](attachment:2b74a31e-4d67-45dc-a91b-edd2673ee339.png)

## Segment Analysis

**Notes:**
- Champions and Promising customers equally prefer purchasing with credit and debit
- Big Spenders have a slight preference for online purchases
- Big Spenders and Recent customers have a higher likelihood of transaction rejection
- Big Spenders, Recent and Inactive customers consistently opt for installment plans for their purchases

In [None]:
# recency distribution per RFM segment, segments in cluster-definition order
segment_order = [name for _, name in clusters]
layout_opts = dict(
    title="<b>Recency distribution per user (customer) segment</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="user segment",
    yaxis_title="Recency (days)",
    showlegend=False,
    template=template,
)

fig = px.box(
    data_frame=df_users,
    x="rfm_segment_desc",
    y="recency",
    color_discrete_sequence=[colors["dark_blue"]],
)
fig.update_layout(**layout_opts)
fig.update_xaxes(categoryorder="array", categoryarray=segment_order)

fig.show()

![img_2.png](attachment:86d0af64-d206-4ff8-9e72-e49c9b8eec93.png)

In [None]:
# recency summary statistics per RFM segment
by_segment = df_users.groupby(["rfm_segment_desc"], as_index=False)
by_segment["recency"].describe()

In [None]:
# frequency distribution per RFM segment, segments in cluster-definition order
segment_order = [name for _, name in clusters]

fig = (
    px.box(
        data_frame=df_users,
        x="rfm_segment_desc",
        y="frequency",
        color_discrete_sequence=[colors["dark_blue"]],
    )
    .update_layout(
        title="<b>Frequency distribution per user (customer) segment</b>",
        title_font_size=title_font_size,
        font_size=font_size,
        xaxis_title="user segment",
        yaxis_title="Frequency<br>(number of purchases)",
        showlegend=False,
        template=template,
    )
    .update_xaxes(categoryorder="array", categoryarray=segment_order)
)

fig.show()

![img_3.png](attachment:4c65844d-5175-4931-b3bf-9d2dc884503b.png)

In [None]:
# frequency summary statistics per RFM segment
by_segment = df_users.groupby(["rfm_segment_desc"], as_index=False)
by_segment["frequency"].describe()

In [None]:
# monetary amount distribution per RFM segment, segments in cluster-definition order
segment_order = [name for _, name in clusters]
layout_opts = dict(
    title="<b>Monetary amount distribution per user (customer) segment</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="user segment",
    yaxis_title="Monetary amount (R$)",
    showlegend=False,
    template=template,
)

fig = px.box(
    data_frame=df_users,
    x="rfm_segment_desc",
    y="monetary",
    color_discrete_sequence=[colors["dark_blue"]],
)
fig.update_layout(**layout_opts)
fig.update_xaxes(categoryorder="array", categoryarray=segment_order)

fig.show()

![img_4.png](attachment:73784bd9-dc3a-4aab-9746-0431ce91f086.png)

In [None]:
# monetary summary statistics per RFM segment
by_segment = df_users.groupby(["rfm_segment_desc"], as_index=False)
by_segment["monetary"].describe()

In [None]:
# credit-to-debit ratio per RFM segment, segments in cluster-definition order
segment_order = [name for _, name in clusters]

fig = (
    px.box(
        data_frame=df_users,
        x="rfm_segment_desc",
        y="rate_credit_debit",
        color_discrete_sequence=[colors["dark_blue"]],
    )
    .update_layout(
        title="<b>Champions and Promising customers equally prefer purchasing with credit and debit</b>",
        title_font_size=title_font_size,
        font_size=font_size,
        xaxis_title="user segment",
        yaxis_title="credit to debit ratio",
        showlegend=False,
        template=template,
    )
    .update_xaxes(categoryorder="array", categoryarray=segment_order)
)

fig.show()

![img_5.png](attachment:85ee3df1-9b5b-4746-95c2-7af7d4bffbda.png)

In [None]:
# credit-to-debit ratio summary statistics per RFM segment
by_segment = df_users.groupby(["rfm_segment_desc"], as_index=False)
by_segment["rate_credit_debit"].describe()

In [None]:
# Stacked horizontal bars: share of in-person vs online shoppers within each RFM segment.
labels = ["Inactive", "Recent", "Promising", "Big Spenders", "Champions"]
# purchase mode -> bar color; the dict order is the stacking / legend order
purchase_modes = {"in-person": colors["dark_blue"], "online": colors["dark_gray"]}

# users per (segment, purchase mode), computed in a single pass;
# combinations without any user count as 0
mode_counts = (
    df_users.groupby(["rfm_segment_desc", "online_person_preference"])["user_id"]
    .count()
    .unstack(fill_value=0)
    .reindex(index=labels, columns=list(purchase_modes), fill_value=0)
)
# users per segment (denominator of the share)
segment_sizes = (
    df_users.groupby("rfm_segment_desc")["user_id"]
    .count()
    .reindex(labels, fill_value=0)
)
# fraction (0-1) of each segment's users per purchase mode
mode_share = mode_counts.div(segment_sizes, axis=0)

fig = go.Figure([
    go.Bar(
        y=labels,
        x=mode_share[mode].tolist(),
        marker_color=bar_color,
        orientation="h",
        name=mode,
    )
    for mode, bar_color in purchase_modes.items()
])

fig.update_layout(
    title="<b>Big Spenders have a slight preference for online purchases</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="% of users",
    yaxis_title="user segment",
    barmode="stack",
    bargroupgap=0.1,
    # horizontal legend placed above the plot, right-aligned
    legend=dict(
        title="purchase",
        traceorder="normal",
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    template=template,
)
fig.update_yaxes(
    categoryorder='array', 
    categoryarray=labels
)
# x holds fractions, so format them as whole percentages
fig.update_traces(
    texttemplate="%{x:.0%}",
    textposition="outside"
)

fig.show()

![img_6.png](attachment:8519787c-748c-4dd4-9a5c-fb5ed83bae97.png)

In [None]:
# denied-transaction rate per RFM segment, segments in cluster-definition order
fig = px.box(
    df_users, 
    y="rate_denied", 
    x="rfm_segment_desc",
    color_discrete_sequence=[colors["dark_blue"]]
)

fig.update_layout(
    # typo fixed in the title ("cstomers" -> "customers")
    title="<b>Big Spenders and Recent customers have a higher likelihood of transaction rejection</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="user segment",
    yaxis_title="transactions rejection rate",
    showlegend=False,
    template=template,
)
fig.update_xaxes(
    categoryorder='array', 
    categoryarray=[label for cluster, label in clusters]
)

fig.show()

![img_7.png](attachment:2f7e4d95-2aa3-42c6-af81-088811fff877.png)

In [None]:
# Summary statistics (`describe`) of `rate_denied` for each RFM segment
(
    df_users
    .groupby(["rfm_segment_desc"], as_index=False)["rate_denied"]
    .describe()
)

In [None]:
# Box plot: distribution of `rate_transactions_installment` within each RFM segment
segment_order = [name for _, name in clusters]

layout_options = dict(
    title="<b>Big Spenders, Recent and Inactive customers<br>consistently opt for installment plans for their purchases</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="user segment",
    yaxis_title="preference of installment plans<br>(0 - 100%)",
    showlegend=False,
    template=template,
)

fig = px.box(
    data_frame=df_users,
    x="rfm_segment_desc",
    y="rate_transactions_installment",
    color_discrete_sequence=[colors["dark_blue"]],
)
fig.update_layout(**layout_options)
# Order the x-axis categories by the labels listed in `clusters`
fig.update_xaxes(categoryorder='array', categoryarray=segment_order)

fig.show()

![img_8.png](attachment:0677b4f0-107e-4408-ad91-22e2098997fd.png)

In [None]:
# Summary statistics (`describe`) of `rate_transactions_installment` for each RFM segment
(
    df_users
    .groupby(["rfm_segment_desc"], as_index=False)["rate_transactions_installment"]
    .describe()
)

## ✅ checkpoint

In [None]:
# Keep only the user identifier and its RFM segment label
segment_columns = ["user_id", "rfm_segment_desc"]
df_users_segments = df_users[segment_columns]

In [None]:
# Persist the user -> segment mapping as a pickle file
segments_path = "../data/processed/df_users_segments.pkl"
df_users_segments.to_pickle(segments_path)

# Explanatory Analysis

In [None]:
# Loan-level columns used in the explanatory analysis.
# The selection has to be assigned and merged (previously its result was discarded):
# the full `df_loans` also carries `status_cleaned` and `due_date_exceeded`, which
# exist on the repayments table too, so pandas would suffix them with `_x` / `_y`
# in the repayment-level merge and `df_data["status_cleaned"]` would no longer exist.
# A new name is used so `df_loans` itself stays untouched.
df_loans_profile = df_loans[["id", "user_id", "amount_bin", "due_amount_bin", "interest_rate_bin"]]

# User-level features and RFM segment label
df_users = df_users[["user_id", "median_spending", "avg_intallments", "avg_lag_transaction", 
                     "rate_denied", "rate_transactions_installment", "rate_credit_debit", 
                     "credit_debit_preference", "ratio_online_person", "online_person_preference", "rfm_segment_desc"]]

# Left join: every loan is kept, loans whose user has no match get NaN user features.
# `id` is renamed to `loan_id` to serve as the join key with the repayments table.
df_loans_users = df_loans_profile.merge(df_users, how="left", on="user_id").rename(columns={"id": "loan_id"})

In [None]:
# Repayment-level columns kept for the analysis
repayment_columns = [
    "id", "loan_id", "type", "amount", "status_cleaned", "repayment_amount_bin",
    "days_since_loan_created", "days_since_due_date", "due_date_exceeded", "days_lag_repayment",
    "num_late_repayments", 'prev_status',
]
# Give the generic `id` / `amount` columns repayment-specific names
df_loan_repayments = df_loan_repayments[repayment_columns].rename(
    columns={"id": "repayment_id", "amount": "repayment_amount"}
)

# Attach the loan + user attributes to every repayment (left join on `loan_id`)
df_data = df_loan_repayments.merge(right=df_loans_users, on="loan_id", how="left")

In [None]:
# Keep only the rows that have an RFM segment assigned
has_segment = df_data["rfm_segment_desc"].notna()
df_data = df_data[has_segment]

## RFM Segment

**Note:** Customers with lower activity levels (purchasing frequency and expenditure) present a higher risk of default

In [None]:
# Paid vs. defaulted share of repayments within each RFM segment (horizontal stacked bars)
labels = ["Inactive", "Recent", "Promising", "Big Spenders", "Champions"]

# Rows of each segment, in the same order as `labels`
rows_by_segment = [df_data[df_data["rfm_segment_desc"] == label] for label in labels]

fig = go.Figure()
for status, bar_color in (("paid", colors["dark_blue"]), ("defaulted", colors["dark_purple"])):
    fig.add_trace(
        go.Bar(
            y=labels,
            # repayments with this status / all repayments of the segment
            x=[
                rows.loc[rows["status_cleaned"] == status, "repayment_id"].count()
                / rows["repayment_id"].count()
                for rows in rows_by_segment
            ],
            marker_color=bar_color,
            orientation="h",
            name=status,
        )
    )

legend_options = dict(
    traceorder="normal",
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1,
)
fig.update_layout(
    title="<b>Customers with lower activity levels (purchasing frequency and expenditure)<br>present a higher risk of default</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="% of repayments",
    yaxis_title="user segment",
    barmode="stack",
    bargroupgap=0.1,
    legend=legend_options,
    template=template,
)
# Keep the segments in the order given by `labels`
fig.update_yaxes(categoryorder='array', categoryarray=labels)
fig.update_traces(texttemplate="%{x:.0%}", textposition="inside")

fig.show()

![img_1.png](attachment:54075042-f458-4e92-8f16-3ff5799a2f91.png)

## Loan Principal Amount

**Note:** Smaller loans exhibit higher chances for default

In [None]:
# Paid vs. defaulted share of repayments for each principal amount bin (grouped bars)
labels = ["1k - 2k", "2k - 3k", "3k - 4k", "4k - 5k", "5k - 6k", "6k - 7k"]

# Rows of each amount bin, keyed in the same order as `labels`
rows_by_bin = {label: df_data[df_data["amount_bin"] == label] for label in labels}

traces = [
    go.Bar(
        x=labels,
        # repayments with this status / all repayments of the bin
        y=[
            rows.loc[rows["status_cleaned"] == status, "repayment_id"].count()
            / rows["repayment_id"].count()
            for rows in rows_by_bin.values()
        ],
        marker_color=colors[color_key],
        name=status,
    )
    for status, color_key in (("paid", "dark_blue"), ("defaulted", "dark_purple"))
]
fig = go.Figure(traces)

fig.update_layout(
    title='<b>Smaller loans exhibit higher chances for default</b>',
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="principal loan amount (R$)",
    yaxis_title="% of repayments",
    barmode="group",
    bargroupgap=0.1,
    legend={
        "traceorder": "normal",
        "orientation": "h",
        "yanchor": "bottom",
        "y": 1.02,
        "xanchor": "right",
        "x": 1,
    },
    template=template,
)
# Keep the bins in the order given by `labels`
fig.update_xaxes(categoryorder='array', categoryarray=labels)
fig.update_traces(texttemplate="%{y:.0%}", textposition="inside")

fig.show()

![img_2.png](attachment:e1a0c18e-d239-4c4c-a13a-a7f7194a6f0e.png)

## Loan Interest Rate

**Note:** Recent and Inactive customers tend to default when loans have a higher interest rate

In [None]:
# Paid vs. defaulted share of repayments per interest-rate bin, one panel per RFM segment
labels = ["small", "medium", "large"]
segments = ["Champions", "Big Spenders", "Promising", "Recent", "Inactive"]

# The grid is sized from `segments` (not from the values present in the data),
# so every `col=i+1` used below is guaranteed to exist.
fig = make_subplots(
    rows=1, 
    cols=len(segments),
    shared_yaxes=True,
    vertical_spacing=0.001,
)
for i, segment in enumerate(segments):
    segment_rows = df_data[df_data["rfm_segment_desc"] == segment]
    # rows of this segment for each interest-rate bin, in `labels` order
    rows_by_bin = [segment_rows[segment_rows["interest_rate_bin"] == label] for label in labels]
    for status, bar_color in (("paid", colors["dark_blue"]), ("defaulted", colors["dark_purple"])):
        fig.add_trace(
            go.Bar(
                # repayments with this status / all repayments of the (segment, bin) pair
                y=[
                    rows.loc[rows["status_cleaned"] == status, "repayment_id"].count()
                    / rows["repayment_id"].count()
                    for rows in rows_by_bin
                ],
                x=labels,
                showlegend=(i == 0),  # one legend entry per status, taken from the first panel
                marker_color=bar_color,
                name=status,
            ),
            row=1,
            col=i + 1,
        )
    fig.update_xaxes(title_text=f"{segment}", row=1, col=i + 1)

fig.update_layout(
    title="<b>Recent and Inactive customers tend to default when loans have a higher interest rate</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    yaxis_title="% of repayments",
    barmode="stack",
    legend=dict(
        traceorder="normal",
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    template=template,
)
fig.update_traces(
    texttemplate="%{y:.0%}",
    textposition="outside"
)
fig.update_annotations(font_size=font_size)

fig.show()

![img_3.png](attachment:e08daaf8-b1e4-4af3-bd34-96750235057c.png)

## Loan Repayments

**Note:** Users who opt for manual loan repayments are more likely to default

In [None]:
# Paid vs. defaulted share of repayments for each repayment `type` (horizontal stacked bars)
labels = ["autopilot", "pix"]

# Rows of each repayment type, keyed in the same order as `labels`
rows_by_type = {label: df_data[df_data["type"] == label] for label in labels}

traces = [
    go.Bar(
        y=labels,
        # repayments with this status / all repayments of the type
        x=[
            rows.loc[rows["status_cleaned"] == status, "repayment_id"].count()
            / rows["repayment_id"].count()
            for rows in rows_by_type.values()
        ],
        orientation="h",
        marker_color=colors[color_key],
        name=status,
    )
    for status, color_key in (("paid", "dark_blue"), ("defaulted", "dark_purple"))
]
fig = go.Figure(traces)

legend_options = dict(
    traceorder="normal",
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1,
)
fig.update_layout(
    title='<b>Users who opt for manual loan repayments are more likely to default</b>',
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="% of repayments",
    yaxis_title="repayment type",
    barmode="stack",
    bargroupgap=0.1,
    legend=legend_options,
    template=template,
)
# Keep the repayment types in the order given by `labels`
fig.update_yaxes(categoryorder='array', categoryarray=labels)
fig.update_traces(texttemplate="%{x:.0%}", textposition="outside")

fig.show()

![img_4.png](attachment:434c0fd8-2511-4da7-8ebe-94cf46b63b60.png)

## Repayment Amount

**Note:** Larger loan repayment amounts have a high potential to lead to defaulted repayments

In [None]:
# Paid vs. defaulted share of repayments for each repayment amount bin (horizontal stacked bars)
labels = ["small", "small-medium", "medium", "medium-large", "large"]

# Rows of each repayment amount bin, in the same order as `labels`
rows_by_bin = [df_data[df_data["repayment_amount_bin"] == label] for label in labels]

fig = go.Figure()
for status, bar_color in (("paid", colors["dark_blue"]), ("defaulted", colors["dark_purple"])):
    fig.add_trace(
        go.Bar(
            y=labels,
            # repayments with this status / all repayments of the bin
            x=[
                rows.loc[rows["status_cleaned"] == status, "repayment_id"].count()
                / rows["repayment_id"].count()
                for rows in rows_by_bin
            ],
            marker_color=bar_color,
            orientation="h",
            name=status,
        )
    )

fig.update_layout(
    title="<b>Larger loan repayment amounts have a high potential to lead to defaulted repayments</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="% of repayments",
    yaxis_title="repayment amount",
    barmode="stack",
    bargroupgap=0.1,
    legend={
        "traceorder": "normal",
        "orientation": "h",
        "yanchor": "bottom",
        "y": 1.02,
        "xanchor": "right",
        "x": 1,
    },
    template=template,
)
# Keep the bins in the order given by `labels`
fig.update_yaxes(categoryorder='array', categoryarray=labels)
fig.update_traces(texttemplate="%{x:.0%}", textposition="inside")

fig.show()

![img_5.png](attachment:c7d641ce-9261-41b1-a4c2-8fb51b6a4019.png)

## Late Repayments

**Note:**
- There is still significant risk for a user to default even when the loan due date has been exceeded
- Big Spenders and Recent customers, once they have exceeded the loan due date, possess the highest risk of default

In [None]:
# Paid vs. defaulted share of repayments, split by the `due_date_exceeded` flag
# (horizontal stacked bars)
labels = ["not exceeded", "exceeded"]

# Rows for each flag value, keyed in the same order as `labels`
rows_by_flag = {label: df_data[df_data["due_date_exceeded"] == label] for label in labels}

traces = [
    go.Bar(
        y=labels,
        # repayments with this status / all repayments with the flag value
        x=[
            rows.loc[rows["status_cleaned"] == status, "repayment_id"].count()
            / rows["repayment_id"].count()
            for rows in rows_by_flag.values()
        ],
        marker_color=colors[color_key],
        orientation="h",
        name=status,
    )
    for status, color_key in (("paid", "dark_blue"), ("defaulted", "dark_purple"))
]
fig = go.Figure(traces)

legend_options = dict(
    traceorder="normal",
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1,
)
fig.update_layout(
    title='<b>There is still significant risk of defaulting even when the loan due date has been exceeded</b>',
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="% of repayments",
    yaxis_title="due date exceeded",
    barmode="stack",
    bargroupgap=0.1,
    legend=legend_options,
    template=template,
)
# Keep the flag values in the order given by `labels`
fig.update_yaxes(categoryorder='array', categoryarray=labels)
fig.update_traces(texttemplate="%{x:.0%}", textposition="inside")

fig.show()

![img_6.png](attachment:b00bc32e-ea7f-4904-be8e-cf2b68619d77.png)

In [None]:
# Paid vs. defaulted share of repayments by `due_date_exceeded`, one panel per RFM segment
labels = ["not exceeded", "exceeded"]
segments = ["Champions", "Big Spenders", "Promising", "Recent", "Inactive"]

# The grid is sized from `segments` (not from the values present in the data),
# so every `col=i+1` used below is guaranteed to exist.
fig = make_subplots(
    rows=1, 
    cols=len(segments),
    shared_yaxes=True,
    vertical_spacing=0.001,
)
for i, segment in enumerate(segments):
    segment_rows = df_data[df_data["rfm_segment_desc"] == segment]
    # rows of this segment for each flag value, in `labels` order
    rows_by_flag = [segment_rows[segment_rows["due_date_exceeded"] == label] for label in labels]
    for status, bar_color in (("paid", colors["dark_blue"]), ("defaulted", colors["dark_purple"])):
        fig.add_trace(
            go.Bar(
                # repayments with this status / all repayments of the (segment, flag) pair
                y=[
                    rows.loc[rows["status_cleaned"] == status, "repayment_id"].count()
                    / rows["repayment_id"].count()
                    for rows in rows_by_flag
                ],
                x=labels,
                showlegend=(i == 0),  # one legend entry per status, taken from the first panel
                marker_color=bar_color,
                name=status,
            ),
            row=1,
            col=i + 1,
        )
    fig.update_xaxes(title_text=f"{segment}", row=1, col=i + 1)

fig.update_layout(
    # title typo fixed: "hishest" -> "highest"
    title="<b>Big Spenders and Recent customers, once they have exceeded the loan due date,<br>possess the highest risk of default</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    yaxis_title="% of repayments",
    barmode="stack",
    legend=dict(
        traceorder="normal",
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    template=template,
)
fig.update_traces(
    texttemplate="%{y:.0%}",
    textposition="outside"
)
fig.update_annotations(font_size=font_size)

fig.show()

![img_7.png](attachment:e9db14ea-b8cb-4639-823f-c045627e7c70.png)

## Repayment Lag

**Note:** Frequent repayments don't necessarily result in successful (paid) repayments

In [None]:
# Box plot of `days_lag_repayment` by repayment status, restricted to repayments
# whose `due_date_exceeded` flag is "not exceeded".
fig = px.box(
    df_data[df_data["due_date_exceeded"] == "not exceeded"], 
    y="days_lag_repayment", 
    x="status_cleaned",
    color="status_cleaned",
    # Pin each status to the colour used in the other charts; without the map the
    # colours are handed out in order of first appearance in the data.
    color_discrete_map={"paid": colors["dark_blue"], "defaulted": colors["dark_purple"]},
    # fallback for any status value not listed in the map
    color_discrete_sequence=[colors["dark_blue"], colors["dark_purple"]],
)

fig.update_layout(
    title="<b>Frequent repayments don't necessarily result in repayments</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title=None,
    yaxis_title="repayment lag (days)",
    showlegend=False,
    template=template,
)

fig.show()

![img_8.png](attachment:46e44eb4-8a19-44df-8c2b-d32227eee119.png)

In [None]:
# Summary statistics (`describe`) of `days_lag_repayment` per repayment status,
# restricted to rows whose `due_date_exceeded` flag is "not exceeded"
not_exceeded_rows = df_data[df_data["due_date_exceeded"] == "not exceeded"]
(
    not_exceeded_rows
    .groupby("status_cleaned", as_index=False)["days_lag_repayment"]
    .describe()
)

## Card Type Preference

**Note:** Big Spenders and Recent customers who prefer credit cards over debit cards tend to have a higher risk of default

In [None]:
# Paid vs. defaulted share of repayments by `credit_debit_preference`, one panel per RFM segment
labels = ["debit", "credit"]
segments = ["Champions", "Big Spenders", "Promising", "Recent", "Inactive"]

# The grid is sized from `segments` (not from the values present in the data),
# so every `col=i+1` used below is guaranteed to exist.
fig = make_subplots(
    rows=1, 
    cols=len(segments),
    shared_yaxes=True,
    vertical_spacing=0.001,
)
for i, segment in enumerate(segments):
    segment_rows = df_data[df_data["rfm_segment_desc"] == segment]
    # rows of this segment for each card preference, in `labels` order
    rows_by_card = [segment_rows[segment_rows["credit_debit_preference"] == label] for label in labels]
    for status, bar_color in (("paid", colors["dark_blue"]), ("defaulted", colors["dark_purple"])):
        fig.add_trace(
            go.Bar(
                # repayments with this status / all repayments of the (segment, preference) pair
                y=[
                    rows.loc[rows["status_cleaned"] == status, "repayment_id"].count()
                    / rows["repayment_id"].count()
                    for rows in rows_by_card
                ],
                x=labels,
                showlegend=(i == 0),  # one legend entry per status, taken from the first panel
                marker_color=bar_color,
                name=status,
            ),
            row=1,
            col=i + 1,
        )
    fig.update_xaxes(title_text=f"{segment}", row=1, col=i + 1)

fig.update_layout(
    title="<b>Big Spenders and Recent customers who prefer credit cards over debit cards<br>tend to have a higher risk of default</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    yaxis_title="% of repayments",
    barmode="stack",
    legend=dict(
        traceorder="normal",
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    template=template,
)
fig.update_traces(
    texttemplate="%{y:.0%}",
    textposition="outside"
)
fig.update_annotations(font_size=font_size)

fig.show()

![img_9.png](attachment:4d1d4086-0315-4625-90ba-7290371d0cce.png)

## Installment Preference

**Note:** Big Spenders and Inactive customers who prefer installment plans for their purchases have a higher risk of default

In [None]:
# Box plots of `rate_transactions_installment` for paid vs. defaulted repayments,
# one panel per RFM segment
segments = ["Champions", "Big Spenders", "Promising", "Recent", "Inactive"]

# The grid is sized from `segments` (not from the values present in the data),
# so every `col=i+1` used below is guaranteed to exist.
fig = make_subplots(
    rows=1, 
    cols=len(segments),
    shared_yaxes=True,
    vertical_spacing=0.001,
)
for i, segment in enumerate(segments):
    segment_rows = df_data[df_data["rfm_segment_desc"] == segment]
    for status, box_color in (("paid", colors["dark_blue"]), ("defaulted", colors["dark_purple"])):
        fig.add_trace(
            go.Box(
                y=segment_rows.loc[segment_rows["status_cleaned"] == status, "rate_transactions_installment"],
                showlegend=False,
                marker_color=box_color,
                name=status,
            ),
            row=1,
            col=i + 1,
        )
    fig.update_xaxes(title_text=f"{segment}", row=1, col=i + 1)

fig.update_layout(
    title="<b>Big Spenders and Inactive customers who prefer installment plans<br>for their purchases have a higher risk of default</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    yaxis_title="preference of installment plans<br>(0 - 100%)",
    template=template,
)
fig.update_annotations(font_size=font_size)

fig.show()

![img_10.png](attachment:c86e3404-f83b-4446-9992-8e7dfd509ba3.png)

In [None]:
# Summary statistics (`describe`) of `rate_transactions_installment`
# for each (RFM segment, repayment status) pair
(
    df_data
    .groupby(["rfm_segment_desc", "status_cleaned"], as_index=False)["rate_transactions_installment"]
    .describe()
)

In [None]:
# Median `rate_transactions_installment` for paid vs. defaulted repayments,
# one panel per RFM segment
segments = ["Champions", "Big Spenders", "Promising", "Recent", "Inactive"]

# The grid is sized from `segments` (not from the values present in the data),
# so every `col=i+1` used below is guaranteed to exist.
fig = make_subplots(
    rows=1, 
    cols=len(segments),
    shared_yaxes=True,
    vertical_spacing=0.001,
)
for i, segment in enumerate(segments):
    segment_rows = df_data[df_data["rfm_segment_desc"] == segment]
    for status, bar_color in (("paid", colors["dark_blue"]), ("defaulted", colors["dark_purple"])):
        fig.add_trace(
            go.Bar(
                # single bar: median over the segment's rows with this status
                y=[segment_rows.loc[segment_rows["status_cleaned"] == status, "rate_transactions_installment"].median()],
                x=[status],
                marker_color=bar_color,
                name=status,
            ),
            row=1,
            col=i + 1,
        )
    fig.update_xaxes(title_text=f"{segment}", row=1, col=i + 1)

fig.update_layout(
    title="<b>Big Spenders and Inactive customers who prefer installment plans<br>for their purchases have a higher risk of default</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    yaxis_title="preference of installment plans<br>(0 - 100%)",
    barmode="group",
    bargroupgap=0.1,
    showlegend=False,
    template=template,
)
fig.update_traces(
    texttemplate="%{y:.0%}",
    textposition="inside"
)
fig.update_annotations(font_size=font_size)

fig.show()

![img_11.png](attachment:ea225f9b-3acb-4bc5-991d-99ef1c3a1feb.png)

## Denied Transactions

**Note:** Big Spenders, Recent and Inactive customers whose transactions are denied frequently have a higher risk of default

In [None]:
# Box plots of `rate_denied` for paid vs. defaulted repayments, one panel per RFM segment
segments = ["Champions", "Big Spenders", "Promising", "Recent", "Inactive"]

# The grid is sized from `segments` (not from the values present in the data),
# so every `col=i+1` used below is guaranteed to exist.
fig = make_subplots(
    rows=1, 
    cols=len(segments),
    shared_yaxes=True,
    vertical_spacing=0.001,
)
for i, segment in enumerate(segments):
    segment_rows = df_data[df_data["rfm_segment_desc"] == segment]
    for status, box_color in (("paid", colors["dark_blue"]), ("defaulted", colors["dark_purple"])):
        fig.add_trace(
            go.Box(
                y=segment_rows.loc[segment_rows["status_cleaned"] == status, "rate_denied"],
                showlegend=False,
                marker_color=box_color,
                name=status,
            ),
            row=1,
            col=i + 1,
        )
    fig.update_xaxes(title_text=f"{segment}", row=1, col=i + 1)

fig.update_layout(
    title="<b>Big Spenders, Recent and Inactive customers whose transactions<br>are denied frequently have a higher risk of default</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    yaxis_title="transactions rejection rate",
    template=template,
)
fig.update_annotations(font_size=font_size)

fig.show()

![img_12.png](attachment:3c7494a2-4832-4915-9916-293a28600669.png)

In [None]:
# Summary statistics (`describe`) of `rate_denied`
# for each (RFM segment, repayment status) pair
(
    df_data
    .groupby(["rfm_segment_desc", "status_cleaned"], as_index=False)["rate_denied"]
    .describe()
)

In [None]:
# Median `rate_denied` for paid vs. defaulted repayments, one panel per RFM segment
segments = ["Champions", "Big Spenders", "Promising", "Recent", "Inactive"]

# The grid is sized from `segments` (not from the values present in the data),
# so every `col=i+1` used below is guaranteed to exist.
fig = make_subplots(
    rows=1, 
    cols=len(segments),
    shared_yaxes=True,
    vertical_spacing=0.001,
)
for i, segment in enumerate(segments):
    segment_rows = df_data[df_data["rfm_segment_desc"] == segment]
    for status, bar_color in (("paid", colors["dark_blue"]), ("defaulted", colors["dark_purple"])):
        fig.add_trace(
            go.Bar(
                # single bar: median over the segment's rows with this status
                y=[segment_rows.loc[segment_rows["status_cleaned"] == status, "rate_denied"].median()],
                x=[status],
                marker_color=bar_color,
                name=status,
            ),
            row=1,
            col=i + 1,
        )
    fig.update_xaxes(title_text=f"{segment}", row=1, col=i + 1)

fig.update_layout(
    title="<b>Big Spenders, Recent and Inactive customers whose transactions<br>are denied frequently have a higher risk of default</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    yaxis_title="transactions rejection rate",
    barmode="group",
    bargroupgap=0.1,
    showlegend=False,
    template=template,
)
fig.update_traces(
    texttemplate="%{y:.0%}",
    textposition="inside"
)
fig.update_annotations(font_size=font_size)

fig.show()

![img_13.png](attachment:5cf2875e-be09-449a-b349-d7c603089e41.png)

## Purchase Mode

**Note:** A preference for in-person over online purchases (and vice versa) doesn't significantly impact the risk of a user defaulting

In [None]:
# Paid vs. defaulted share of repayments by `online_person_preference`, one panel per RFM segment
labels = ["in-person", "online"]
segments = ["Champions", "Big Spenders", "Promising", "Recent", "Inactive"]

# The grid is sized from `segments` (not from the values present in the data),
# so every `col=i+1` used below is guaranteed to exist.
fig = make_subplots(
    rows=1, 
    cols=len(segments),
    shared_yaxes=True,
    vertical_spacing=0.001,
)
for i, segment in enumerate(segments):
    segment_rows = df_data[df_data["rfm_segment_desc"] == segment]
    # rows of this segment for each purchase-mode preference, in `labels` order
    rows_by_mode = [segment_rows[segment_rows["online_person_preference"] == label] for label in labels]
    for status, bar_color in (("paid", colors["dark_blue"]), ("defaulted", colors["dark_purple"])):
        fig.add_trace(
            go.Bar(
                # repayments with this status / all repayments of the (segment, preference) pair
                y=[
                    rows.loc[rows["status_cleaned"] == status, "repayment_id"].count()
                    / rows["repayment_id"].count()
                    for rows in rows_by_mode
                ],
                x=labels,
                showlegend=(i == 0),  # one legend entry per status, taken from the first panel
                marker_color=bar_color,
                name=status,
            ),
            row=1,
            col=i + 1,
        )
    fig.update_xaxes(title_text=f"{segment}", row=1, col=i + 1)

fig.update_layout(
    # title typo fixed: "vise versa" -> "vice versa"
    title="<b>A preference of in-person over online purchases (and vice versa),<br>doesn't impact significantly the risk of a user defaulting</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    yaxis_title="% of repayments",
    barmode="stack",
    legend=dict(
        traceorder="normal",
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    template=template,
)
fig.update_traces(
    texttemplate="%{y:.0%}",
    textposition="outside"
)
fig.update_annotations(font_size=font_size)

fig.show()

![img_14.png](attachment:6fe11584-abd3-41c8-8053-d5eeb6d4d7ab.png)

## Previous Repayment Status

**Note:** 24% of prior repayment defaults lead to subsequent defaults, posing a significant risk

In [None]:
# Paid vs. defaulted share of repayments, split by the status of the previous
# repayment (`prev_status`); horizontal stacked bars
labels = ["paid", "defaulted"]

# Rows for each previous status, in the same order as `labels`
rows_by_prev_status = [df_data[df_data["prev_status"] == label] for label in labels]

fig = go.Figure()
for status, bar_color in (("paid", colors["dark_blue"]), ("defaulted", colors["dark_purple"])):
    fig.add_trace(
        go.Bar(
            y=labels,
            # repayments with this status / all repayments with the given previous status
            x=[
                rows.loc[rows["status_cleaned"] == status, "repayment_id"].count()
                / rows["repayment_id"].count()
                for rows in rows_by_prev_status
            ],
            marker_color=bar_color,
            orientation="h",
            name=status,
        )
    )

fig.update_layout(
    title='<b>24% of prior repayment defaults lead to subsequent defaults, posing a significant risk</b>',
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="% of repayments",
    yaxis_title="prev. repayment status",
    barmode="stack",
    legend=dict(
        traceorder="normal",
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    template=template,
)
# The categories sit on the y-axis of this horizontal chart, so the explicit
# ordering is applied there (it was previously set on the numeric x-axis).
fig.update_yaxes(
    categoryorder='array', 
    categoryarray=labels
)
fig.update_traces(
    texttemplate="%{x:.0%}",
    textposition="inside"
)

fig.show()

![img_15.png](attachment:140f443a-5d27-469f-9a77-ff19ff9fc8dd.png)