# Data Analysis

## Table of Contents
- Load Data
- Univariate Analysis
    - Loan Features
    - Loan Repayment Features
    - User Features
- RFM Analysis
    - Scores Definitions
    - Segment Definitions
    - Segment Analysis
- Explanatory Analysis
    - Loan Profile
    - Repayment Profile
    - User Profile

In [1]:
from copy import deepcopy
import numpy as np
import pandas as pd

# visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Plotly configuration: renderer, theme, colour palette and shared font sizes.
import plotly.io as pio

# Figures are sent to the "browser" renderer instead of being drawn inline.
pio.renderers.default = "browser"

template = "plotly_white"

# Named RGB colours looked up by the figures below (e.g. colors["dark_blue"]).
colors = dict(
    red="rgb(255,0,0)",
    dark_blue="rgb(30,144,255)",
    medium_blue="rgb(0,191,255)",
    light_blue="rgb(135,206,250)",
    dark_purple="rgb(202, 105, 157)",
    medium_purple="rgb(221, 136, 172)",
    light_purple="rgb(234, 169, 189)",
    dark_gray="rgb(169,169,169)",
    medium_gray="rgb(192,192,192)",
    light_gray="rgb(211,211,211)",
)

# Font sizes applied to every figure title / body text.
title_font_size, font_size = 22, 18

# Load Data

In [3]:
# Load the pre-processed loans table (pickle: only load files from a trusted source).
df_loans = pd.read_pickle(filepath_or_buffer="../data/processed/df_loans.pkl")

df_loans

Unnamed: 0,id,user_id,amount,total_amount,due_amount,due_date,status,created_at,loan_term,amount_bin,due_amount_bin,loan_fees,loan_fees_bin,interest_rate,interest_rate_bin
0,0,3070,6000.0,6045.28,6459.00,2022-05-02,repaid,2022-02-01,90,6k - 7k,6k - 7k,45.28,40 - 50,31.0,small
1,1,2546,6000.0,6045.28,6459.00,2022-05-02,repaid,2022-02-01,90,6k - 7k,6k - 7k,45.28,40 - 50,31.0,small
2,2,2413,6000.0,6045.28,6459.00,2022-05-02,repaid,2022-02-01,90,6k - 7k,6k - 7k,45.28,40 - 50,31.0,small
3,3,2585,6000.0,6045.28,6459.00,2022-05-02,debt_collection,2022-02-01,90,6k - 7k,6k - 7k,45.28,40 - 50,31.0,small
4,4,2556,6000.0,6045.28,6459.00,2022-05-02,repaid,2022-02-01,90,6k - 7k,6k - 7k,45.28,40 - 50,31.0,small
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6741,6741,2130,2500.0,2518.87,3228.82,2023-01-01,repaid,2022-10-03,90,2k - 3k,3k - 4k,18.87,10 - 20,118.0,large
6742,6742,549,6000.0,6045.28,7749.16,2023-01-01,repaid,2022-10-03,90,6k - 7k,7k - 8k,45.28,40 - 50,118.0,large
6743,6743,1414,6000.0,6045.28,7749.16,2023-01-01,repaid,2022-10-03,90,6k - 7k,7k - 8k,45.28,40 - 50,118.0,large
6744,6744,2070,6000.0,6045.28,7749.16,2023-01-01,debt_repaid,2022-10-03,90,6k - 7k,7k - 8k,45.28,40 - 50,118.0,large


In [4]:
# Load the pre-processed loan repayments table (pickle: only load files from a trusted source).
df_loan_repayments = pd.read_pickle(
    filepath_or_buffer="../data/processed/df_loan_repayments.pkl"
)

df_loan_repayments

Unnamed: 0,id,loan_id,type,amount,status,created_at,status_cleaned,repayment_amount_bin,days_since_loan_created,days_since_due_date,due_date_exceeded,days_lag_repayment,num_late_repayments,prev_status,prev_amount,cumsum_amount,ratio_repaid_total
50,51,0,autopilot,9.75,paid,2022-02-03,paid,small,2,0,0,2.0,0,init,0.00,9.75,0.001613
112,113,0,autopilot,45.30,paid,2022-02-04,paid,small,3,0,0,1.0,0,paid,9.75,55.05,0.009106
216,217,0,autopilot,25.50,paid,2022-02-05,paid,small,4,0,0,1.0,0,paid,45.30,80.55,0.013324
496,497,0,autopilot,10.50,paid,2022-02-08,paid,small,7,0,0,3.0,0,paid,25.50,91.05,0.015061
645,646,0,autopilot,226.50,paid,2022-02-09,paid,small,8,0,0,1.0,0,paid,10.50,317.55,0.052529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171446,171447,6745,autopilot,6.00,paid,2022-12-14,paid,small,72,0,0,1.0,0,paid,36.00,6387.14,1.056550
171543,171544,6745,autopilot,93.00,paid,2022-12-15,paid,small,73,0,0,1.0,0,paid,6.00,6480.14,1.071934
171881,171882,6745,autopilot,126.74,paid,2022-12-20,paid,small,78,0,0,5.0,0,paid,93.00,6606.88,1.092899
171963,171964,6745,autopilot,203.37,paid,2022-12-21,paid,small,79,0,0,1.0,0,paid,126.74,6810.25,1.126540


In [5]:
# Load the pre-processed users table and make sure the installment rate is a float column
# (pickle: only load files from a trusted source).
df_users = pd.read_pickle(filepath_or_buffer="../data/processed/df_users.pkl").astype(
    {"rate_transactions_installment": float}
)

df_users

Unnamed: 0,user_id,recency,frequency,monetary,median_spending,avg_intallments,avg_lag_transaction,rate_denied,rate_denied_approved,rate_transactions_installment,rate_credit_debit,ratio_online_person,card_preference
0,1,346,10,25154.18,270.0,4.500000,6.111111,0.300000,0.375000,0.700000,4.500000,0.000000,card_internation
1,2,340,35,66491.00,1130.0,8.257143,1.852941,0.057143,0.058824,0.971429,17.000000,0.000000,card_brazil
2,3,191,78,117555.00,1000.0,5.551282,1.740260,0.115385,0.128571,1.000000,78.000000,0.012821,card_internation
3,4,15,286,423169.31,307.5,2.646853,0.884211,0.090909,0.099617,0.419580,1.657407,0.003497,card_internation
4,5,54,793,440568.44,172.0,1.952081,0.334596,0.083228,0.090659,0.480454,131.333333,0.000000,card_internation
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3043,3149,216,231,96516.80,240.0,1.935065,0.682609,0.069264,0.074074,0.333333,2.052632,0.004329,card_internation
3044,3150,8,849,395379.93,105.0,1.618375,0.459906,0.174323,0.210826,0.398115,2.586498,0.289833,card_internation
3045,3151,140,227,73475.80,207.0,2.687225,1.137168,0.092511,0.101449,0.568282,2.257143,0.017857,card_internation
3046,3152,39,1118,359051.73,240.1,3.213775,0.317816,0.135063,0.155992,0.952594,61.166667,0.028493,card_internation


# Univariate Analysis

- Loan Features
- Loan Repayment Features
- User Features

## Loan Status

**Note:** 23% of the loans have exceeded due date

In [6]:
# Make the status labels readable on chart axes (e.g. "debt_collection" -> "debt collection").
# regex=False makes the literal replacement explicit instead of relying on the
# pandas-version-dependent default of `regex`.
df_loans["status"] = df_loans["status"].str.replace("_", " ", regex=False)

In [7]:
# Percentage of loans per status; rows whose status is "error" are left out.
fig = (
    px.histogram(
        data_frame=df_loans.loc[df_loans["status"] != "error"],
        x="status",
        histnorm="percent",
        color_discrete_sequence=[colors["dark_blue"]],
    )
    .update_layout(
        title="<b>23% of the loans have exceeded due date</b>",
        title_font_size=title_font_size,
        font_size=font_size,
        xaxis_title="loan status",
        yaxis_title="% of loans",
        template=template,
    )
    # most frequent status first
    .update_xaxes(categoryorder="total descending")
    # print the rounded percentage inside each bar
    .update_traces(texttemplate="%{y:,.0f}%", textposition="inside")
)

fig.show()

![img_1.png](attachment:5bf9b9c6-200a-45bd-9d66-cd27f4ab4924.png)

## Loan Principal Amount

**Note:** Loans typically range from more than R$6k

In [8]:
# Percentage of loans per principal-amount bucket.
fig = (
    px.histogram(
        data_frame=df_loans,
        x="amount_bin",
        histnorm="percent",
        color_discrete_sequence=[colors["dark_blue"]],
    )
    .update_layout(
        title="<b>Loans typically range from more than R$6k</b>",
        title_font_size=title_font_size,
        font_size=font_size,
        xaxis_title="principal loan amount (R$)",
        yaxis_title="% of loans",
        template=template,
    )
    # explicit bucket order: "1k - 2k" ... "6k - 7k"
    .update_xaxes(
        categoryorder="array",
        categoryarray=[f"{k}k - {k + 1}k" for k in range(1, 7)],
    )
    .update_traces(texttemplate="%{y:,.0f}%", textposition="outside")
)

fig.show()

![img_2.png](attachment:b0934db5-ad4f-4c98-917f-fdb2822702ce.png)

## Loan Fees

**Note:** Loan fees typically range from R$40-50

In [9]:
# Percentage of loans per loan-fee bucket.
fig = (
    px.histogram(
        data_frame=df_loans,
        x="loan_fees_bin",
        histnorm="percent",
        color_discrete_sequence=[colors["dark_blue"]],
    )
    .update_layout(
        title="<b>Loan fees typically range from R$40-50</b>",
        title_font_size=title_font_size,
        font_size=font_size,
        xaxis_title="loan fees (R$)",
        yaxis_title="% of loans",
        template=template,
    )
    # explicit bucket order: "0 - 10" ... "40 - 50"
    .update_xaxes(
        categoryorder="array",
        categoryarray=[f"{low} - {low + 10}" for low in range(0, 50, 10)],
    )
    .update_traces(texttemplate="%{y:,.0f}%", textposition="outside")
)

fig.show()

![img_3.png](attachment:b14caa24-1281-43f3-b7e6-5b0683ba414d.png)

## Loan Due Amount

**Note:** Loans can range up to R$9k, if there are no repayments during the contract period

In [10]:
# Percentage of loans per due-amount bucket.
fig = (
    px.histogram(
        data_frame=df_loans,
        x="due_amount_bin",
        histnorm="percent",
        color_discrete_sequence=[colors["dark_blue"]],
    )
    .update_layout(
        title="<b>Loans can range up to R$9k, if there are no repayments during the contract period</b>",
        title_font_size=title_font_size,
        font_size=font_size,
        xaxis_title="due loan amount (R$)",
        yaxis_title="% of loans",
        template=template,
    )
    # explicit bucket order: "1k - 2k" ... "8k - 9k"
    .update_xaxes(
        categoryorder="array",
        categoryarray=[f"{k}k - {k + 1}k" for k in range(1, 9)],
    )
    .update_traces(texttemplate="%{y:,.0f}%", textposition="outside")
)

fig.show()

![img_4.png](attachment:7eae2d13-e9d7-45e2-95ce-41967b742838.png)

## Loan Interest Rate

**Note:** Only 18% of the loans have a high interest rate (exceeding 100%)

In [11]:
# Percentage of loans per interest-rate bucket (small / medium / large).
fig = (
    px.histogram(
        data_frame=df_loans,
        x="interest_rate_bin",
        histnorm="percent",
        color_discrete_sequence=[colors["dark_blue"]],
    )
    .update_layout(
        title="<b>Only 18% of the loans have a high interest rate (exceeding 100%)</b>",
        title_font_size=title_font_size,
        font_size=font_size,
        xaxis_title="loan interest rate (%)",
        yaxis_title="% of loans",
        template=template,
    )
    .update_xaxes(
        categoryorder="array",
        categoryarray=["small", "medium", "large"],
    )
    .update_traces(texttemplate="%{y:,.0f}%", textposition="outside")
)

fig.show()

![img_5.png](attachment:c3560170-a708-44e0-a787-38deb6140797.png)

## Loan Term

**Note:** Loans typically need to be repaid in the next 3 months

In [12]:
# Summary statistics of the loan term column.
df_loans.loc[:, "loan_term"].describe()

count    6598.000000
mean       90.067899
std         0.474862
min        90.000000
25%        90.000000
50%        90.000000
75%        90.000000
max       112.000000
Name: loan_term, dtype: float64

## Repayment Type

**Note:** 91% of the loan repayments are automated

In [13]:
# Percentage of repayments per repayment type.
fig = (
    px.histogram(
        data_frame=df_loan_repayments,
        x="type",
        histnorm="percent",
        color_discrete_sequence=[colors["dark_blue"]],
    )
    .update_layout(
        title="<b>91% of the loan repayments are automated</b>",
        title_font_size=title_font_size,
        font_size=font_size,
        xaxis_title="repayment type",
        yaxis_title="% of repayments",
        template=template,
    )
    .update_xaxes(categoryorder="total descending")
    .update_traces(texttemplate="%{y:,.0f}%", textposition="inside")
)

fig.show()

![img_6.png](attachment:44b2fad0-e98a-4f20-80e0-0bf7d1dd2915.png)

## Repayment Status

**Note:** Only 5% of the loan repayments have gone to default

In [14]:
# Percentage of repayments per cleaned repayment status.
fig = (
    px.histogram(
        data_frame=df_loan_repayments,
        x="status_cleaned",
        histnorm="percent",
        color_discrete_sequence=[colors["dark_blue"]],
    )
    .update_layout(
        title="<b>Only 5% of the loan repayments have gone to default</b>",
        title_font_size=title_font_size,
        font_size=font_size,
        xaxis_title="repayment status",
        yaxis_title="% of repayments",
        template=template,
    )
    .update_xaxes(categoryorder="total descending")
    .update_traces(texttemplate="%{y:,.0f}%", textposition="inside")
)

fig.show()

![img_7.png](attachment:3468e78a-23c8-4148-87ac-febf17218753.png)

## Repayment Amount

**Note:** A typical loan repayment can range from R$27-195

In [15]:
# Distribution of repayment amounts, with a box plot in the margin.
fig = px.histogram(
    data_frame=df_loan_repayments,
    x="amount",
    histnorm="percent",
    marginal="box",
    color_discrete_sequence=[colors["dark_blue"]],
).update_layout(
    title="<b>A typical loan repayment can range from R$27-195</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="repayment amount (R$)",
    yaxis_title="% of repayments",
    template=template,
)

fig.show()

![img_8.png](attachment:b3e3097d-ce45-440b-9164-5e3a76706d58.png)

In [16]:
# Summary statistics of the repayment amount.
df_loan_repayments.loc[:, "amount"].describe()

count    172445.000000
mean        234.329357
std         562.388191
min           0.010000
25%          27.750000
50%          75.690000
75%         195.000000
max        7726.080000
Name: amount, dtype: float64

## Days Passed From Loan Creation To Repayment

**Note:** Typically, the first repayment happens at maximum in the next 3 days since the loan creation date

In [17]:
# Days between loan creation and the FIRST repayment of each loan:
# order repayments chronologically within a loan, keep the first row per loan,
# then summarise `days_since_loan_created`.
(
    df_loan_repayments
    .sort_values(by=["loan_id", "created_at"])
    .groupby("loan_id", as_index=False)
    .first()
    ["days_since_loan_created"]
    .describe()
)

count    6598.000000
mean        4.019097
std         9.044913
min         0.000000
25%         1.000000
50%         1.000000
75%         3.000000
max       104.000000
Name: days_since_loan_created, dtype: float64

In [18]:
# Distribution of days elapsed since loan creation over all repayments, with a marginal box plot.
fig = px.histogram(
    data_frame=df_loan_repayments,
    x="days_since_loan_created",
    histnorm="percent",
    marginal="box",
    color_discrete_sequence=[colors["dark_blue"]],
).update_layout(
    title="<b>Days passed from loan creation to repayment distribution</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="days since loan creation",
    yaxis_title="% of repayments",
    template=template,
)

fig.show()

![img_9.png](attachment:1557e1c9-ea8b-4b2f-b74b-ce435ec25d51.png)

In [19]:
# Summary statistics of days since loan creation, over all repayments.
df_loan_repayments.loc[:, "days_since_loan_created"].describe()

count    172445.000000
mean         36.041700
std          26.544416
min           0.000000
25%          14.000000
50%          30.000000
75%          55.000000
max         481.000000
Name: days_since_loan_created, dtype: float64

## Late Repayment

**Note:** 
- Only 3% of loan repayments have exceeded the loan due date.
- Late repayments can happen between 1-10 days after the loan due date.
- On average, loans exceed due date by 3-4 repayments

In [20]:
# Swap the 0/1 flag for readable labels used on the chart axis.
df_loan_repayments["due_date_exceeded"] = df_loan_repayments["due_date_exceeded"].replace(
    to_replace={0: "not exceeded", 1: "exceeded"}
)

In [21]:
# Percentage of repayments made before vs. after the loan due date.
fig = (
    px.histogram(
        data_frame=df_loan_repayments,
        x="due_date_exceeded",
        histnorm="percent",
        color_discrete_sequence=[colors["dark_blue"]],
    )
    .update_layout(
        title="<b>Only 3% of loan repayments have exceeded the loan due date</b>",
        title_font_size=title_font_size,
        font_size=font_size,
        xaxis_title="due date exceeded",
        yaxis_title="% of repayments",
        template=template,
    )
    .update_xaxes(categoryorder="total descending")
    .update_traces(texttemplate="%{y:,.0f}%", textposition="outside")
)

fig.show()

![img_10.png](attachment:65839e98-5157-4b07-822a-43471eaf0a88.png)

In [22]:
# Distribution of days past the due date, restricted to repayments with a positive value.
fig = px.histogram(
    data_frame=df_loan_repayments.loc[df_loan_repayments["days_since_due_date"].gt(0)],
    x="days_since_due_date",
    histnorm="percent",
    marginal="box",
    color_discrete_sequence=[colors["dark_blue"]],
).update_layout(
    title="<b>Late repayments can happen between 1-10 days after the loan due date</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="days since due date",
    yaxis_title="% of repayments",
    template=template,
)

fig.show()

![img_11.png](attachment:c9f17b5b-9bde-4efa-b2a8-a6fb1df78a01.png)

In [23]:
# Summary statistics of days past the due date, for repayments with a positive value only.
df_loan_repayments.loc[
    df_loan_repayments["days_since_due_date"].gt(0), "days_since_due_date"
].describe()

count    5627.000000
mean        6.673716
std        10.546824
min         1.000000
25%         2.000000
50%         6.000000
75%        10.000000
max       391.000000
Name: days_since_due_date, dtype: float64

In [24]:
# Per loan, take the maximum of `num_late_repayments` across its repayments.
# NOTE: `df_temp` is read again by the describe cell that follows, so the name is kept.
df_temp = df_loan_repayments.groupby("loan_id", as_index=False)["num_late_repayments"].max()

# Distribution over loans that have at least one late repayment, with a marginal box plot.
fig = px.histogram(
    df_temp[df_temp["num_late_repayments"] > 0],
    x="num_late_repayments",
    histnorm="percent",
    marginal="box",
    color_discrete_sequence=[colors["dark_blue"]],
)

fig.update_layout(
    title="<b>On average, loans exceed due date by 3-4 repayments</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="number of late repayments",
    yaxis_title="% of loans",  # fixed typo ("% of loan"), consistent with the other loan charts
    template=template,
)

fig.show()

![img_12.png](attachment:607a3307-0a64-44e9-be47-9d86fd915930.png)

In [25]:
# Summary statistics of late repayments per loan, for loans with at least one.
df_temp.loc[df_temp["num_late_repayments"].gt(0), "num_late_repayments"].describe()

count    1278.000000
mean        4.402973
std         3.450224
min         1.000000
25%         2.000000
50%         3.000000
75%         6.000000
max        24.000000
Name: num_late_repayments, dtype: float64

## Repayment Lag 

**Note:** Repayments happen every 1-2 days

In [26]:
# Summary statistics of the repayment lag column.
df_loan_repayments.loc[:, "days_lag_repayment"].describe()

count    172445.000000
mean          2.140306
std           4.220587
min           0.000000
25%           1.000000
50%           1.000000
75%           2.000000
max         480.000000
Name: days_lag_repayment, dtype: float64

## Median Spendings

**Note:** Users spend around R$90-660 in purchases

In [27]:
# Distribution of each user's median spending, with a marginal box plot.
fig = px.histogram(
    df_users,
    x="median_spending",
    histnorm="percent",
    marginal="box",
    color_discrete_sequence=[colors["dark_blue"]],
)

fig.update_layout(
    # grammar fix: "Users spends" -> "Users spend" (matches the markdown note above the cell)
    title="<b>Users spend around R$90-660 in purchases</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="median spendings (R$)",
    yaxis_title="% of users",
    template=template,
)

fig.show()

![img_13.png](attachment:ca385f38-8035-4747-8c1d-e3a48a3815e0.png)

In [28]:
# Summary statistics of the per-user median spending.
df_users.loc[:, "median_spending"].describe()

count     3048.000000
mean       634.436460
std       1272.040417
min          1.100000
25%         80.000000
50%        230.500000
75%        660.000000
max      18000.000000
Name: median_spending, dtype: float64

## Card Type Preference

**Note:** 82% of the users prefer to make purchases using a credit card

In [29]:
# A credit-to-debit ratio of at most 1 is labelled "debit"; everything else is "credit".
df_users["credit_debit_preference"] = np.where(
    df_users["rate_credit_debit"].le(1), "debit", "credit"
)

In [30]:
# Percentage of users per card-type preference (credit vs. debit).
fig = (
    px.histogram(
        data_frame=df_users,
        x="credit_debit_preference",
        histnorm="percent",
        color_discrete_sequence=[colors["dark_blue"]],
    )
    .update_layout(
        title="<b>82% of the users prefer to make purchases using a credit card</b>",
        title_font_size=title_font_size,
        font_size=font_size,
        xaxis_title="card type preference",
        yaxis_title="% of users",
        template=template,
    )
    .update_xaxes(categoryorder="total descending")
    .update_traces(texttemplate="%{y:,.0f}%", textposition="inside")
)

fig.show()

![img_14.png](attachment:3697631b-86a3-4071-af3c-59009d3f6f92.png)

## Installment Preference

**Note:** Users typically finance 25% to 75% of their purchases through installment plans

In [31]:
# Distribution of the per-user rate of installment transactions, with a marginal box plot.
fig = px.histogram(
    data_frame=df_users,
    x="rate_transactions_installment",
    histnorm="percent",
    marginal="box",
    color_discrete_sequence=[colors["dark_blue"]],
).update_layout(
    title="<b>Users typically finance 25% to 75% of their purchases through installment plans</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="% of transactions with installments",
    yaxis_title="% of users",
    template=template,
)

fig.show()

![img_15.png](attachment:9265742f-8be8-493b-9a90-f81fab159c28.png)

In [32]:
# Summary statistics of the installment-transaction rate (cast to float first).
df_users.loc[:, "rate_transactions_installment"].astype(float).describe()

count    3048.000000
mean        0.502776
std         0.288628
min         0.000000
25%         0.256370
50%         0.522134
75%         0.749128
max         1.000000
Name: rate_transactions_installment, dtype: float64

## Installments

**Note:** Users typically opt for installment plans ranging from 2-5 payments for their purchases

In [33]:
# Round the average number of installments to whole numbers (overwrites the column).
df_users["avg_intallments"] = df_users["avg_intallments"].round(decimals=0)

In [34]:
# Distribution of the per-user average number of installments, with a marginal box plot.
fig = px.histogram(
    data_frame=df_users,
    x="avg_intallments",
    histnorm="percent",
    marginal="box",
    color_discrete_sequence=[colors["dark_blue"]],
).update_layout(
    title="<b>Users typically opt for installment plans ranging from 2-5 payments for their purchases</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="number of installments",
    yaxis_title="% of users",
    template=template,
)

fig.show()

![img_16.png](attachment:75c7d745-aae3-4c43-9f38-24fe00937636.png)

In [35]:
# Summary statistics of the average number of installments.
df_users.loc[:, "avg_intallments"].describe()

count    3048.000000
mean        3.510827
std         2.216361
min         1.000000
25%         2.000000
50%         3.000000
75%         5.000000
max        12.000000
Name: avg_intallments, dtype: float64

## Denied Transactions

**Note:** Users encounter denied transactions, less than 20% of time

In [36]:
# Distribution of the per-user denied-transaction rate, with a marginal box plot.
fig = px.histogram(
    data_frame=df_users,
    x="rate_denied",
    histnorm="percent",
    marginal="box",
    color_discrete_sequence=[colors["dark_blue"]],
).update_layout(
    title="<b>Users encounter denied transactions, less than 20% of time</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="denied transactions ratio",
    yaxis_title="% of users",
    template=template,
)

fig.show()

![img_17.png](attachment:e18cf8ca-28fa-4767-8763-c99fff006771.png)

In [37]:
# Summary statistics of the denied-transaction rate.
df_users.loc[:, "rate_denied"].describe()

count    3048.000000
mean        0.172197
std         0.133207
min         0.000000
25%         0.080000
50%         0.126421
75%         0.229436
max         0.857585
Name: rate_denied, dtype: float64

## Transaction Lag

**Note:** 
- Our base consists of the users that are active purchasers
- Active is considered someone who makes frequent purchases, with a maximum lag of a week

In [38]:
# Distribution of the per-user average lag between transactions, with a marginal box plot.
fig = px.histogram(
    data_frame=df_users,
    x="avg_lag_transaction",
    histnorm="percent",
    marginal="box",
    color_discrete_sequence=[colors["dark_blue"]],
).update_layout(
    title="<b>96% of the users are active purchasers<br><sup>active is considered someone who makes frequent purchases, with a maximum lag of a week</sup></b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="transaction lag (days)",
    yaxis_title="% of users",
    template=template,
)

fig.show()

![img_18.png](attachment:ee0d1b9e-585a-4906-a02a-7ca7792bb6c7.png)

In [39]:
# Summary statistics of the average transaction lag.
df_users.loc[:, "avg_lag_transaction"].describe()

count    3046.000000
mean        1.836044
std         2.827839
min         0.007171
25%         0.462581
50%         1.045914
75%         2.225168
max        62.000000
Name: avg_lag_transaction, dtype: float64

## Purchases Mode

**Note:** Our base consists of users who value in-person purchases

In [40]:
# An online-to-in-person ratio of at most 1 is labelled "in-person"; everything else is "online".
df_users["online_person_preference"] = np.where(
    df_users["ratio_online_person"].le(1), "in-person", "online"
)

In [41]:
# Percentage of users per purchase-mode preference (in-person vs. online).
fig = (
    px.histogram(
        data_frame=df_users,
        x="online_person_preference",
        histnorm="percent",
        color_discrete_sequence=[colors["dark_blue"]],
    )
    .update_layout(
        title="<b>Our base consists of users who value in-person purchases</b>",
        title_font_size=title_font_size,
        font_size=font_size,
        xaxis_title="purchase type",
        yaxis_title="% of users",
        template=template,
    )
    .update_xaxes(categoryorder="total descending")
    .update_traces(texttemplate="%{y:,.0f}%", textposition="outside")
)

fig.show()

![img_19.png](attachment:1184ff80-1c5a-48a5-80a3-5eb47f9c59d0.png)

# RFM Analysis

- Customer (User) Segmentation based on RFM Analysis
- RFM segments the consumer (user) base by their purchasing patterns or habits based on Recency (R), Frequency (F) and Monetary Value (M)
- *Recency:* how long ago they made a purchase
- *Frequency:* how often they make purchases
- *Monetary Value:* how much money they spend

## Scores Definitions

Typically, RFM scores are in a scale of 1-5. 

However, for the purpose of loan repayment analysis, the scores are considered in a scale of 1-3 (the lower, the better the results for a customer) in order to avoid segments that have similar behavior regarding loan repayment.

| Recency | Frequency | Monetary |
| :-----: | :-------: | :------: |
| up to 1 month (1) | more than 500 (1) | more than R\\$ 500k (1) |
| 1 to 3 months (2) | 100 to 500 (2) | R\\$ 100k to 500k (2) |
| more than 3 months (3) | less than 100 (3) | less than R\\$ 100k (3) |

In [42]:
# Summary statistics of the three RFM input columns.
df_users.loc[:, ["recency", "frequency", "monetary"]].describe()

Unnamed: 0,recency,frequency,monetary
count,3048.0,3048.0,3048.0
mean,125.481955,473.312008,658325.5
std,91.24315,1046.560185,19501450.0
min,0.0,1.0,41.5
25%,42.0,73.75,53503.12
50%,105.0,178.5,103272.9
75%,196.0,461.25,198277.6
max,358.0,25660.0,1000132000.0


In [43]:
# Recency buckets
# Higher values indicate users who haven't made a purchase for a long time (not active)
# Score "1": recency <= 30, "2": 30 < recency <= 90, "3": recency > 90.
# pd.cut uses right-closed intervals by default, which reproduces those thresholds in a
# single pass and rewrites the whole column on every run; missing recency stays NaN.
df_users["recency_bin"] = pd.cut(
    df_users["recency"],
    bins=[-np.inf, 30, 90, np.inf],
    labels=["1", "2", "3"],
).astype(object)  # plain strings, so the scores can be concatenated into the RFM code

In [44]:
# Frequency buckets
# Higher values indicate users who make frequent purchases (more active)
# Score "3": frequency <= 100, "2": 100 < frequency <= 500, "1": frequency > 500.
# (The previously computed q1/q3 quantiles were never used and have been dropped.)
# pd.cut uses right-closed intervals by default, which reproduces those thresholds in a
# single pass and rewrites the whole column on every run; missing frequency stays NaN.
df_users["frequency_bin"] = pd.cut(
    df_users["frequency"],
    bins=[-np.inf, 100, 500, np.inf],
    labels=["3", "2", "1"],
).astype(object)  # plain strings, so the scores can be concatenated into the RFM code

In [45]:
# Monetary buckets
# Higher values indicate users with higher spendings
# NOTE(review): q1/q3 are not used by the bucketing below; kept only in case a later
# cell reads them — confirm and remove.
q1, q3 = df_users["monetary"].quantile(.25), df_users["monetary"].quantile(.75)

# Score "3": monetary <= 100k, "2": 100k < monetary <= 500k, "1": monetary > 500k.
# pd.cut uses right-closed intervals by default, which reproduces those thresholds in a
# single pass and rewrites the whole column on every run; missing monetary stays NaN.
df_users["monetary_bin"] = pd.cut(
    df_users["monetary"],
    bins=[-np.inf, 100_000, 500_000, np.inf],
    labels=["3", "2", "1"],
).astype(object)  # plain strings, so the scores can be concatenated into the RFM code

## Segment Definitions

Customer (user) segments identified:
- **Champions:** Extremely active customers (high frequency and low recency) with moderate to high monetary value
- **Big Spenders:** Active customers (medium recency and frequency) with high monetary value
- **Promising:** Active customers (medium recency and frequency) with low to moderate monetary value
- **Recent:** Customers who entered our base recently (low frequency and low to moderate recency, i.e. last purchase within the past 3 months) with low to moderate monetary value
- **Inactive:** Customers with extremely low activity (high recency, i.e. last purchase more than 3 months ago, and low to moderate frequency); monetary value isn't a factor here

In [46]:
# Three-character RFM code: recency score + frequency score + monetary score (string concatenation).
df_users["rfm_segment"] = (
    df_users["recency_bin"]
    .add(df_users["frequency_bin"])
    .add(df_users["monetary_bin"])
)

In [47]:
# Customer clusters from the RFM analysis: each list holds the RFM codes
# (recency + frequency + monetary score) that belong to one segment.
cluster_1 = ["111", "113", "112"]  # Champions
cluster_2 = ["121", "231", "221", "211"]  # Big Spenders
cluster_3 = ["213", "122", "123", "212", "222", "223", "311", "313", "312"]  # Promising Customers
cluster_4 = ["131", "132", "133", "232", "233"]  # Recent Customers
cluster_5 = ["331", "332", "333", "321", "322", "323"]  # Inactive Customers

# (codes, label) pairs; the order of this list is reused to order chart axes.
clusters = [
    (cluster_1, "Champions"),
    (cluster_2, "Big Spenders"),
    (cluster_3, "Promising"),
    (cluster_4, "Recent"),
    (cluster_5, "Inactive"),
]

# Label every user whose RFM code falls in a cluster with that cluster's name.
for cluster, label in clusters:
    in_cluster = df_users["rfm_segment"].isin(cluster)
    df_users.loc[in_cluster, "rfm_segment_desc"] = label

In [48]:
# Percentage of users per RFM segment.
fig = px.histogram(
    df_users,
    x="rfm_segment_desc",
    histnorm="percent",
    color_discrete_sequence=[colors["dark_blue"]],
)

fig.update_layout(
    title="<b>83% of the users are frequent buyers, with moderate to modest spending habits<br><sub>(Promising and Inactive segments)</sub></b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="user segment",
    yaxis_title="% of users",  # fixed typo ("% of user"), consistent with the other user charts
    template=template,
)
# Show the segments in the order in which `clusters` defines them.
fig.update_xaxes(
    categoryorder='array', 
    categoryarray=[label for cluster, label in clusters]
)
fig.update_traces(
    texttemplate="%{y:,.0f}%",
    textposition="inside"
)

fig.show()

![img_1.png](attachment:2b74a31e-4d67-45dc-a91b-edd2673ee339.png)

## Segment Analysis

**Notes:**
- Champions and Promising customers equally prefer purchasing with credit and debit
- Big Spenders have a slight preference for online purchases
- Big Spenders and Recent customers have a higher likelihood of transaction rejection
- Big Spenders, Recent and Inactive customers consistently opt for installment plans for their purchases

In [87]:
# Box plot of recency for each RFM segment.
fig = (
    px.box(
        data_frame=df_users,
        x="rfm_segment_desc",
        y="recency",
        color_discrete_sequence=[colors["dark_blue"]],
    )
    .update_layout(
        title="<b>Recency distribution per user (customer) segment</b>",
        title_font_size=title_font_size,
        font_size=font_size,
        xaxis_title="user segment",
        yaxis_title="Recency (days)",
        showlegend=False,
        template=template,
    )
    # segments appear in the order in which `clusters` defines them
    .update_xaxes(
        categoryorder="array",
        categoryarray=[segment for _, segment in clusters],
    )
)

fig.show()

![img_2.png](attachment:86d0af64-d206-4ff8-9e72-e49c9b8eec93.png)

In [88]:
# Summary statistics of recency within each RFM segment.
(
    df_users
    .groupby(by=["rfm_segment_desc"], as_index=False)["recency"]
    .describe()
)

Unnamed: 0,rfm_segment_desc,count,mean,std,min,25%,50%,75%,max
0,Big Spenders,84.0,38.654762,25.22071,1.0,17.0,38.0,56.25,89.0
1,Champions,234.0,15.209402,8.483951,0.0,7.0,14.0,22.0,30.0
2,Inactive,1458.0,200.22428,64.003044,91.0,158.0,192.5,246.0,358.0
3,Promising,1079.0,68.266914,52.968671,0.0,34.5,54.0,84.0,293.0
4,Recent,193.0,52.207254,24.212569,2.0,33.0,54.0,74.0,89.0


In [90]:
# Box plot of frequency for each RFM segment.
fig = (
    px.box(
        data_frame=df_users,
        x="rfm_segment_desc",
        y="frequency",
        color_discrete_sequence=[colors["dark_blue"]],
    )
    .update_layout(
        title="<b>Frequency distribution per user (customer) segment</b>",
        title_font_size=title_font_size,
        font_size=font_size,
        xaxis_title="user segment",
        yaxis_title="Frequency<br>(number of purchases)",
        showlegend=False,
        template=template,
    )
    # segments appear in the order in which `clusters` defines them
    .update_xaxes(
        categoryorder="array",
        categoryarray=[segment for _, segment in clusters],
    )
)

fig.show()

![img_3.png](attachment:4c65844d-5175-4931-b3bf-9d2dc884503b.png)

In [91]:
# Summary statistics of frequency within each RFM segment.
(
    df_users
    .groupby(by=["rfm_segment_desc"], as_index=False)["frequency"]
    .describe()
)

Unnamed: 0,rfm_segment_desc,count,mean,std,min,25%,50%,75%,max
0,Big Spenders,84.0,665.071429,1467.146109,35.0,198.75,298.5,498.25,10713.0
1,Champions,234.0,1761.388889,1903.618172,506.0,704.75,1040.0,1922.75,15517.0
2,Inactive,1458.0,124.915638,111.077924,1.0,45.0,86.0,171.0,500.0
3,Promising,1079.0,723.357739,1228.848701,101.0,219.0,416.0,800.0,25660.0
4,Recent,193.0,62.145078,25.583598,9.0,41.0,66.0,84.0,100.0


In [93]:
# Box plot of the monetary amount for each RFM segment.
fig = (
    px.box(
        data_frame=df_users,
        x="rfm_segment_desc",
        y="monetary",
        color_discrete_sequence=[colors["dark_blue"]],
    )
    .update_layout(
        title="<b>Monetary amount distribution per user (customer) segment</b>",
        title_font_size=title_font_size,
        font_size=font_size,
        xaxis_title="user segment",
        yaxis_title="Monetary amount (R$)",
        showlegend=False,
        template=template,
    )
    # segments appear in the order in which `clusters` defines them
    .update_xaxes(
        categoryorder="array",
        categoryarray=[segment for _, segment in clusters],
    )
)

fig.show()

![img_4.png](attachment:73784bd9-dc3a-4aab-9746-0431ce91f086.png)

In [94]:
# Summary statistics of the monetary amount within each RFM segment.
(
    df_users
    .groupby(by=["rfm_segment_desc"], as_index=False)["monetary"]
    .describe()
)

Unnamed: 0,rfm_segment_desc,count,mean,std,min,25%,50%,75%,max
0,Big Spenders,84.0,17886420.0,116795200.0,500928.83,588990.5825,722995.25,1083493.0,1000132000.0
1,Champions,234.0,357511.1,367237.2,45570.97,159925.98,251296.575,430517.1,3313743.0
2,Inactive,1458.0,126565.9,593098.3,41.5,37067.325,66243.92,117956.1,20469100.0
3,Promising,1079.0,197342.8,596292.7,8776.76,84446.88,139180.37,226928.6,14062410.0
4,Recent,193.0,119137.6,131588.6,3508.37,41413.04,79474.43,143526.5,934090.7


In [95]:
# Box plot of the credit-to-debit ratio for each RFM segment.
fig = (
    px.box(
        data_frame=df_users,
        y="rate_credit_debit",
        x="rfm_segment_desc",
        color_discrete_sequence=[colors["dark_blue"]],
    )
    .update_layout(
        title="<b>Champions and Promising customers equally prefer purchasing with credit and debit</b>",
        title_font_size=title_font_size,
        font_size=font_size,
        xaxis_title="user segment",
        yaxis_title="credit to debit ratio",
        showlegend=False,
        template=template,
    )
    # segments appear in the order in which `clusters` defines them
    .update_xaxes(
        categoryorder="array",
        categoryarray=[segment for _, segment in clusters],
    )
)

fig.show()

![img_5.png](attachment:85ee3df1-9b5b-4746-95c2-7af7d4bffbda.png)

In [96]:
# Summary statistics of the credit-to-debit ratio within each RFM segment.
(
    df_users
    .groupby(by=["rfm_segment_desc"], as_index=False)["rate_credit_debit"]
    .describe()
)

Unnamed: 0,rfm_segment_desc,count,mean,std,min,25%,50%,75%,max
0,Big Spenders,84.0,39.567398,73.604259,0.459673,3.70612,7.722222,35.4,338.0
1,Champions,234.0,7.49355,36.52843,0.212712,0.661599,1.173782,2.217703,447.0
2,Inactive,1458.0,13.705506,27.369278,0.089552,1.896904,4.809091,14.0,357.0
3,Promising,1079.0,7.610574,25.374779,0.133124,0.93669,1.867521,4.99569,367.5
4,Recent,193.0,14.723898,17.157063,0.243902,3.9,8.1,18.5,96.0


In [97]:
labels = ["Inactive", "Recent", "Promising", "Big Spenders", "Champions",]

# One stacked horizontal bar per segment: the share of the segment's users whose
# `online_person_preference` is "in-person" vs. "online". Both traces use the
# same formula, so they are generated from a single (preference, colour) table
# instead of two copy-pasted blocks.
fig = go.Figure([
    go.Bar(
        y=labels,
        x=[
            # users in the segment with this preference / all users in the segment
            df_users.loc[
                (df_users["rfm_segment_desc"] == label)
                & (df_users["online_person_preference"] == preference),
                "user_id",
            ].count()
            / df_users.loc[df_users["rfm_segment_desc"] == label, "user_id"].count()
            for label in labels
        ],
        marker_color=colors[color_key],
        orientation="h",
        name=preference,
    )
    for preference, color_key in (("in-person", "dark_blue"), ("online", "dark_gray"))
])

fig.update_layout(
    # Title typo fixed: "sligh" -> "slight".
    title="<b>Big Spenders have a slight preference for online purchases</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="% of users",
    yaxis_title="user segment",
    barmode="stack",
    bargroupgap=0.1,
    legend=dict(
        title="purchase",
        traceorder="normal",
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    template=template,
)
# Keep the bars in the order given by `labels`.
fig.update_yaxes(
    categoryorder='array',
    categoryarray=labels
)
# Print each bar's share as a whole-number percentage.
fig.update_traces(
    texttemplate="%{x:.0%}",
    textposition="outside"
)

fig.show()

![img_6.png](attachment:8519787c-748c-4dd4-9a5c-fb5ed83bae97.png)

In [98]:
# Box plot of `rate_denied` (transaction rejection rate), one box per RFM segment.
fig = px.box(
    df_users,
    y="rate_denied",
    x="rfm_segment_desc",
    color_discrete_sequence=[colors["dark_blue"]]
)

fig.update_layout(
    # Title typo fixed: "cstomers" -> "customers".
    title="<b>Big Spenders and Recent customers have a higher likelihood of transaction rejection</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="user segment",
    yaxis_title="transactions rejection rate",
    showlegend=False,
    template=template,
)
# Order the x categories the way `clusters` lists its labels.
fig.update_xaxes(
    categoryorder='array',
    categoryarray=[label for cluster, label in clusters]
)

fig.show()

![img_7.png](attachment:2f7e4d95-2aa3-42c6-af81-088811fff877.png)

In [99]:
# Summary statistics of `rate_denied`, one row per RFM segment.
(
    df_users
    .groupby(["rfm_segment_desc"], as_index=False)["rate_denied"]
    .describe()
)

Unnamed: 0,rfm_segment_desc,count,mean,std,min,25%,50%,75%,max
0,Big Spenders,84.0,0.29986,0.179658,0.034714,0.162625,0.272992,0.384588,0.818966
1,Champions,234.0,0.103354,0.062709,0.027778,0.064654,0.088713,0.116038,0.525194
2,Inactive,1458.0,0.202178,0.148713,0.0,0.093064,0.16,0.281199,0.857585
3,Promising,1079.0,0.129426,0.088826,0.020134,0.072814,0.104101,0.157407,0.77038
4,Recent,193.0,0.212733,0.138137,0.0,0.103448,0.192982,0.283951,0.7


In [103]:
# Box plot of `rate_transactions_installment`, one box per RFM segment.
fig = px.box(
    data_frame=df_users,
    x="rfm_segment_desc",
    y="rate_transactions_installment",
    color_discrete_sequence=[colors["dark_blue"]],
).update_layout(
    title="<b>Big Spenders, Recent and Inactive customers<br>consistently opt for installment plans for their purchases</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="user segment",
    yaxis_title="preference of installment plans<br>(0 - 100%)",
    showlegend=False,
    template=template,
).update_xaxes(
    # Order the x categories the way `clusters` lists its labels.
    categoryorder="array",
    categoryarray=[segment_label for _cluster, segment_label in clusters],
)

fig.show()

![img_8.png](attachment:0677b4f0-107e-4408-ad91-22e2098997fd.png)

In [104]:
# Summary statistics of `rate_transactions_installment`, one row per RFM segment.
(
    df_users
    .groupby(["rfm_segment_desc"], as_index=False)["rate_transactions_installment"]
    .describe()
)

Unnamed: 0,rfm_segment_desc,count,mean,std,min,25%,50%,75%,max
0,Big Spenders,84.0,0.663006,0.243419,0.001867,0.557653,0.697031,0.849693,0.993421
1,Champions,234.0,0.282384,0.227728,0.0,0.095447,0.241956,0.414874,0.992481
2,Inactive,1458.0,0.585951,0.269382,0.0,0.38169,0.617454,0.8125,1.0
3,Promising,1079.0,0.389939,0.268455,0.0,0.148611,0.363914,0.615963,0.967972
4,Recent,193.0,0.702753,0.208507,0.128205,0.583333,0.75,0.87234,1.0


## ✅ checkpoint

In [55]:
# user_id -> RFM segment lookup table (every row, two columns).
df_users_segments = df_users.loc[:, ["user_id", "rfm_segment_desc"]]

In [56]:
# Checkpoint: persist the user -> segment table to disk.
df_users_segments.to_pickle(path="../data/processed/df_users_segments.pkl")

# Explanatory Analysis

In [57]:
# NOTE(review): this cell used to start with a bare
# `df_loans[["id", "user_id", "amount_bin", "due_amount_bin", "interest_rate_bin"]]`
# expression. Its result was neither assigned nor displayed, so it had no effect
# and has been removed. All `df_loans` columns are therefore still merged below;
# confirm whether that column subset was meant to be applied.

# Keep only the user-level features used by the explanatory analysis.
# This rebinds `df_users`: columns that are not listed (e.g. "monetary") are no
# longer available afterwards, so earlier cells that read them cannot be re-run
# after this point without rebuilding `df_users`.
df_users = df_users[["user_id", "median_spending", "avg_intallments", "avg_lag_transaction",
                     "rate_denied", "rate_transactions_installment", "rate_credit_debit",
                     "credit_debit_preference", "ratio_online_person", "online_person_preference", "rfm_segment_desc"]]

# Left join: every loan is kept and gets its user's features; the loan's "id"
# column is renamed to "loan_id" so it can serve as the join key for repayments.
df_loans_users = df_loans.merge(df_users, how="left", on="user_id").rename(columns={"id": "loan_id"})

In [58]:
# Rename first, then select. `rename` silently ignores labels that are absent,
# so re-running this cell is a no-op, whereas selecting "id" / "amount" after a
# previous run would raise KeyError because those columns were already renamed.
# The resulting columns and their order match the select-then-rename version.
# NOTE(review): assumes the incoming frame has no pre-existing "repayment_id" /
# "repayment_amount" columns — confirm against the preprocessing step.
df_loan_repayments = df_loan_repayments.rename(
    columns={"id": "repayment_id", "amount": "repayment_amount"}
)[[
    "repayment_id", "loan_id", "type", "repayment_amount", "status_cleaned", "repayment_amount_bin",
    "days_since_loan_created", "days_since_due_date", "due_date_exceeded", "days_lag_repayment",
    "num_late_repayments", "prev_status",
]]

# One row per repayment, enriched with its loan's and borrower's features
# (left join: repayments without a matching loan keep NaN loan/user columns).
df_data = df_loan_repayments.merge(df_loans_users, how="left", on="loan_id")

In [68]:
# Keep only the repayments whose row carries an RFM segment.
df_data = df_data[df_data["rfm_segment_desc"].notna()]

## RFM Segment

**Note:** Customers with lower activity levels (purchasing frequency and expenditure) present a higher risk of default

In [59]:
labels = ["Inactive", "Recent", "Promising", "Big Spenders", "Champions",]

# Stacked horizontal bars: for each segment, the fraction of its repayments
# whose `status_cleaned` is "paid" and the fraction that is "defaulted".
fig = go.Figure([
    go.Bar(
        name=outcome,
        orientation="h",
        marker_color=colors[color_key],
        y=labels,
        x=[
            df_data.loc[
                (df_data["rfm_segment_desc"] == label) & (df_data["status_cleaned"] == outcome),
                "repayment_id",
            ].count()
            / df_data.loc[df_data["rfm_segment_desc"] == label, "repayment_id"].count()
            for label in labels
        ],
    )
    for outcome, color_key in (("paid", "dark_blue"), ("defaulted", "dark_purple"))
]).update_layout(
    title="<b>Customers with lower activity levels (purchasing frequency and expenditure)<br>present a higher risk of default</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="% of repayments",
    yaxis_title="user segment",
    barmode="stack",
    bargroupgap=0.1,
    legend={
        "traceorder": "normal",
        "orientation": "h",
        "yanchor": "bottom",
        "y": 1.02,
        "xanchor": "right",
        "x": 1,
    },
    template=template,
).update_yaxes(
    # Keep the bars in the order given by `labels`.
    categoryorder="array",
    categoryarray=labels,
).update_traces(
    # Print each share as a whole-number percentage inside its bar.
    texttemplate="%{x:.0%}",
    textposition="inside",
)

fig.show()

![img_1.png](attachment:54075042-f458-4e92-8f16-3ff5799a2f91.png)

## Loan Principal Amount

**Note:** Smaller loans exhibit higher chances for default

In [60]:
labels = ["1k - 2k", "2k - 3k", "3k - 4k", "4k - 5k", "5k - 6k", "6k - 7k"]

# Grouped vertical bars: for each principal-amount bin, the fraction of its
# repayments that is "paid" and the fraction that is "defaulted".
fig = go.Figure([
    go.Bar(
        name=outcome,
        marker_color=colors[color_key],
        x=labels,
        y=[
            df_data.loc[
                (df_data["amount_bin"] == label) & (df_data["status_cleaned"] == outcome),
                "repayment_id",
            ].count()
            / df_data.loc[df_data["amount_bin"] == label, "repayment_id"].count()
            for label in labels
        ],
    )
    for outcome, color_key in (("paid", "dark_blue"), ("defaulted", "dark_purple"))
]).update_layout(
    title='<b>Smaller loans exhibit higher chances for default</b>',
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="principal loan amount (R$)",
    yaxis_title="% of repayments",
    barmode="group",
    bargroupgap=0.1,
    legend={
        "traceorder": "normal",
        "orientation": "h",
        "yanchor": "bottom",
        "y": 1.02,
        "xanchor": "right",
        "x": 1,
    },
    template=template,
).update_xaxes(
    # Keep the bins in the order given by `labels`.
    categoryorder="array",
    categoryarray=labels,
).update_traces(
    # Print each share as a whole-number percentage inside its bar.
    texttemplate="%{y:.0%}",
    textposition="inside",
)

fig.show()

![img_2.png](attachment:e1a0c18e-d239-4c4c-a13a-a7f7194a6f0e.png)

## Loan Interest Rate

**Note:** Recent and Inactive customers tend to default when loans have a higher interest rate

In [84]:
labels = ["small", "medium", "large"]
segments = ["Champions", "Big Spenders", "Promising", "Recent", "Inactive"]

# One panel per entry of `segments`. The grid is sized from that list (not from
# the number of distinct segment values found in the data), so every `col=i+1`
# addressed in the loop below is guaranteed to exist.
# NOTE(review): with rows=1 there are no row gaps for `vertical_spacing` to
# control; `horizontal_spacing` may have been intended — confirm.
fig = make_subplots(
    rows=1,
    cols=len(segments),
    shared_yaxes=True,
    vertical_spacing=0.001,
)
for i, segment in enumerate(segments):
    in_segment = df_data["rfm_segment_desc"] == segment
    # Within the segment: share of repayments per status for each interest-rate bin.
    for status, color_key in (("paid", "dark_blue"), ("defaulted", "dark_purple")):
        fig.add_trace(
            go.Bar(
                y=[
                    df_data.loc[
                        (df_data["interest_rate_bin"] == label) & in_segment & (df_data["status_cleaned"] == status),
                        "repayment_id",
                    ].count()
                    / df_data.loc[(df_data["interest_rate_bin"] == label) & in_segment, "repayment_id"].count()
                    for label in labels
                ],
                x=labels,
                # Only the first panel contributes legend entries, to avoid duplicates.
                showlegend=(i == 0),
                marker_color=colors[color_key],
                name=status,
            ),
            row=1,
            col=i+1
        )
    fig.update_xaxes(title_text=f"{segment}", row=1, col=i+1)

fig.update_layout(
    title="<b>Recent and Inactive customers tend to default when loans have a higher interest rate</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    yaxis_title="% of repayments",
    barmode="stack",
    legend=dict(
        traceorder="normal",
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    template=template,
)
# Print each share as a whole-number percentage.
fig.update_traces(
    texttemplate="%{y:.0%}",
    textposition="outside"
)
fig.update_annotations(font_size=font_size)

fig.show()

![img_3.png](attachment:e08daaf8-b1e4-4af3-bd34-96750235057c.png)

## Loan Repayments

**Note:** Users who opt for manual loan repayments are more likely to default

In [62]:
labels = ["autopilot", "pix"]

# Stacked horizontal bars: for each repayment `type`, the fraction of its
# repayments that is "paid" and the fraction that is "defaulted".
fig = go.Figure([
    go.Bar(
        name=outcome,
        orientation="h",
        marker_color=colors[color_key],
        y=labels,
        x=[
            df_data.loc[
                (df_data["type"] == label) & (df_data["status_cleaned"] == outcome),
                "repayment_id",
            ].count()
            / df_data.loc[df_data["type"] == label, "repayment_id"].count()
            for label in labels
        ],
    )
    for outcome, color_key in (("paid", "dark_blue"), ("defaulted", "dark_purple"))
]).update_layout(
    title='<b>Users who opt for manual loan repayments are more likely to default</b>',
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="% of repayments",
    yaxis_title="repayment type",
    barmode="stack",
    bargroupgap=0.1,
    legend={
        "traceorder": "normal",
        "orientation": "h",
        "yanchor": "bottom",
        "y": 1.02,
        "xanchor": "right",
        "x": 1,
    },
    template=template,
).update_yaxes(
    # Keep the bars in the order given by `labels`.
    categoryorder="array",
    categoryarray=labels,
).update_traces(
    # Print each share as a whole-number percentage next to its bar.
    texttemplate="%{x:.0%}",
    textposition="outside",
)

fig.show()

![img_4.png](attachment:434c0fd8-2511-4da7-8ebe-94cf46b63b60.png)

## Repayment Amount

**Note:** Larger loan repayment amounts have a high potential to lead to defaulted repayments

In [63]:
labels = ["small", "small-medium", "medium", "medium-large", "large"]

# Stacked horizontal bars: for each repayment-amount bin, the fraction of its
# repayments that is "paid" and the fraction that is "defaulted".
fig = go.Figure([
    go.Bar(
        name=outcome,
        orientation="h",
        marker_color=colors[color_key],
        y=labels,
        x=[
            df_data.loc[
                (df_data["repayment_amount_bin"] == label) & (df_data["status_cleaned"] == outcome),
                "repayment_id",
            ].count()
            / df_data.loc[df_data["repayment_amount_bin"] == label, "repayment_id"].count()
            for label in labels
        ],
    )
    for outcome, color_key in (("paid", "dark_blue"), ("defaulted", "dark_purple"))
]).update_layout(
    title="<b>Larger loan repayment amounts have a high potential to lead to defaulted repayments</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="% of repayments",
    yaxis_title="repayment amount",
    barmode="stack",
    bargroupgap=0.1,
    legend={
        "traceorder": "normal",
        "orientation": "h",
        "yanchor": "bottom",
        "y": 1.02,
        "xanchor": "right",
        "x": 1,
    },
    template=template,
).update_yaxes(
    # Keep the bars in the order given by `labels`.
    categoryorder="array",
    categoryarray=labels,
).update_traces(
    # Print each share as a whole-number percentage inside its bar.
    texttemplate="%{x:.0%}",
    textposition="inside",
)

fig.show()

![img_5.png](attachment:c7d641ce-9261-41b1-a4c2-8fb51b6a4019.png)

## Late Repayments

**Note:**
- There is still significant risk for a user to default even when the loan due date has been exceeded
- Big Spenders and Recent customers, once they have exceeded the loan due date, possess the highest risk of default

In [65]:
labels = ["not exceeded", "exceeded"]

# Stacked horizontal bars: for each `due_date_exceeded` value, the fraction of
# its repayments that is "paid" and the fraction that is "defaulted".
fig = go.Figure([
    go.Bar(
        name=outcome,
        orientation="h",
        marker_color=colors[color_key],
        y=labels,
        x=[
            df_data.loc[
                (df_data["due_date_exceeded"] == label) & (df_data["status_cleaned"] == outcome),
                "repayment_id",
            ].count()
            / df_data.loc[df_data["due_date_exceeded"] == label, "repayment_id"].count()
            for label in labels
        ],
    )
    for outcome, color_key in (("paid", "dark_blue"), ("defaulted", "dark_purple"))
]).update_layout(
    title='<b>There is still significant risk of defaulting even when the loan due date has been exceeded</b>',
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="% of repayments",
    yaxis_title="due date exceeded",
    barmode="stack",
    bargroupgap=0.1,
    legend={
        "traceorder": "normal",
        "orientation": "h",
        "yanchor": "bottom",
        "y": 1.02,
        "xanchor": "right",
        "x": 1,
    },
    template=template,
).update_yaxes(
    # Keep the bars in the order given by `labels`.
    categoryorder="array",
    categoryarray=labels,
).update_traces(
    # Print each share as a whole-number percentage inside its bar.
    texttemplate="%{x:.0%}",
    textposition="inside",
)

fig.show()

![img_6.png](attachment:b00bc32e-ea7f-4904-be8e-cf2b68619d77.png)

In [70]:
labels = ["not exceeded", "exceeded"]
segments = ["Champions", "Big Spenders", "Promising", "Recent", "Inactive"]

# One panel per entry of `segments`. The grid is sized from that list (not from
# the number of distinct segment values found in the data), so every `col=i+1`
# addressed in the loop below is guaranteed to exist.
fig = make_subplots(
    rows=1,
    cols=len(segments),
    shared_yaxes=True,
    vertical_spacing=0.001,
)
for i, segment in enumerate(segments):
    in_segment = df_data["rfm_segment_desc"] == segment
    # Within the segment: share of repayments per status for each due-date flag.
    for status, color_key in (("paid", "dark_blue"), ("defaulted", "dark_purple")):
        fig.add_trace(
            go.Bar(
                y=[
                    df_data.loc[
                        (df_data["due_date_exceeded"] == label) & in_segment & (df_data["status_cleaned"] == status),
                        "repayment_id",
                    ].count()
                    / df_data.loc[(df_data["due_date_exceeded"] == label) & in_segment, "repayment_id"].count()
                    for label in labels
                ],
                x=labels,
                # Only the first panel contributes legend entries, to avoid duplicates.
                showlegend=(i == 0),
                marker_color=colors[color_key],
                name=status,
            ),
            row=1,
            col=i+1
        )
    fig.update_xaxes(title_text=f"{segment}", row=1, col=i+1)

fig.update_layout(
    # Title typo fixed: "hishest" -> "highest".
    title="<b>Big Spenders and Recent customers, once they have exceeded the loan due date,<br>possess the highest risk of default</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    yaxis_title="% of repayments",
    barmode="stack",
    legend=dict(
        traceorder="normal",
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    template=template,
)
# Print each share as a whole-number percentage.
fig.update_traces(
    texttemplate="%{y:.0%}",
    textposition="outside"
)
fig.update_annotations(font_size=font_size)

fig.show()

![img_7.png](attachment:e9db14ea-b8cb-4639-823f-c045627e7c70.png)

## Repayment Lag

**Note:** Frequent repayments don't necessarily result in repayments

In [71]:
# Box plot of `days_lag_repayment` per repayment status, restricted to the
# repayments whose `due_date_exceeded` is "not exceeded".
fig = px.box(
    data_frame=df_data.loc[df_data["due_date_exceeded"] == "not exceeded"],
    x="status_cleaned",
    y="days_lag_repayment",
    color="status_cleaned",
    color_discrete_sequence=[colors["dark_blue"], colors["dark_purple"]],
).update_layout(
    title="<b>Frequent repayments don't necessarily result in repayments</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title=None,
    yaxis_title="repayment lag (days)",
    showlegend=False,
    template=template,
)

fig.show()

![img_8.png](attachment:46e44eb4-8a19-44df-8c2b-d32227eee119.png)

In [72]:
# Summary statistics of `days_lag_repayment` per status, for repayments whose
# `due_date_exceeded` is "not exceeded".
(
    df_data.loc[df_data["due_date_exceeded"] == "not exceeded"]
    .groupby("status_cleaned", as_index=False)["days_lag_repayment"]
    .describe()
)

Unnamed: 0,status_cleaned,count,mean,std,min,25%,50%,75%,max
0,defaulted,8128.0,2.232776,6.303368,0.0,0.0,1.0,2.0,89.0
1,paid,158685.0,2.08082,3.402038,0.0,1.0,1.0,2.0,90.0


## Card Type Preference

**Note:** Big Spenders and Recent customers who prefer credit cards over debit cards tend to have a higher risk of default

In [73]:
labels = ["debit", "credit"]
segments = ["Champions", "Big Spenders", "Promising", "Recent", "Inactive"]

# One panel per entry of `segments`. The grid is sized from that list (not from
# the number of distinct segment values found in the data), so every `col=i+1`
# addressed in the loop below is guaranteed to exist.
fig = make_subplots(
    rows=1,
    cols=len(segments),
    shared_yaxes=True,
    vertical_spacing=0.001,
)
for i, segment in enumerate(segments):
    in_segment = df_data["rfm_segment_desc"] == segment
    # Within the segment: share of repayments per status for each card preference.
    for status, color_key in (("paid", "dark_blue"), ("defaulted", "dark_purple")):
        fig.add_trace(
            go.Bar(
                y=[
                    df_data.loc[
                        (df_data["credit_debit_preference"] == label) & in_segment & (df_data["status_cleaned"] == status),
                        "repayment_id",
                    ].count()
                    / df_data.loc[(df_data["credit_debit_preference"] == label) & in_segment, "repayment_id"].count()
                    for label in labels
                ],
                x=labels,
                # Only the first panel contributes legend entries, to avoid duplicates.
                showlegend=(i == 0),
                marker_color=colors[color_key],
                name=status,
            ),
            row=1,
            col=i+1
        )
    fig.update_xaxes(title_text=f"{segment}", row=1, col=i+1)

fig.update_layout(
    title="<b>Big Spenders and Recent customers who prefer credit cards over debit cards<br>tend to have a higher risk of default</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    yaxis_title="% of repayments",
    barmode="stack",
    legend=dict(
        traceorder="normal",
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    template=template,
)
# Print each share as a whole-number percentage.
fig.update_traces(
    texttemplate="%{y:.0%}",
    textposition="outside"
)
fig.update_annotations(font_size=font_size)

fig.show()

![img_9.png](attachment:4d1d4086-0315-4625-90ba-7290371d0cce.png)

## Installment Preference

**Note:** Big Spenders and Inactive customers who prefer installment plans for their purchases have a higher risk of default

In [74]:
segments = ["Champions", "Big Spenders", "Promising", "Recent", "Inactive"]

# One panel per entry of `segments`. The grid is sized from that list (not from
# the number of distinct segment values found in the data), so every `col=i+1`
# addressed in the loop below is guaranteed to exist.
fig = make_subplots(
    rows=1,
    cols=len(segments),
    shared_yaxes=True,
    vertical_spacing=0.001,
)
for i, segment in enumerate(segments):
    # Two boxes per panel: `rate_transactions_installment` of the segment's
    # paid repayments and of its defaulted repayments.
    for status, color_key in (("paid", "dark_blue"), ("defaulted", "dark_purple")):
        fig.add_trace(
            go.Box(
                y=df_data.loc[
                    (df_data["rfm_segment_desc"] == segment) & (df_data["status_cleaned"] == status),
                    "rate_transactions_installment",
                ],
                showlegend=False,
                marker_color=colors[color_key],
                name=status,
            ),
            row=1,
            col=i+1
        )
    fig.update_xaxes(title_text=f"{segment}", row=1, col=i+1)

fig.update_layout(
    title="<b>Big Spenders and Inactive customers who prefer installment plans<br>for their purchases have a higher risk of default</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    yaxis_title="preference of installment plans<br>(0 - 100%)",
    template=template,
)
fig.update_annotations(font_size=font_size)

fig.show()

![img_10.png](attachment:c86e3404-f83b-4446-9992-8e7dfd509ba3.png)

In [75]:
# Summary statistics of `rate_transactions_installment` per (segment, status) pair.
(
    df_data
    .groupby(["rfm_segment_desc", "status_cleaned"], as_index=False)["rate_transactions_installment"]
    .describe()
)

Unnamed: 0,rfm_segment_desc,status_cleaned,count,mean,std,min,25%,50%,75%,max
0,Big Spenders,defaulted,302.0,0.698725,0.207448,0.001867,0.605941,0.720408,0.854772,0.993421
1,Big Spenders,paid,5421.0,0.593629,0.267921,0.001867,0.433198,0.639344,0.826087,0.993421
2,Champions,defaulted,1351.0,0.281584,0.221912,0.0,0.097304,0.254065,0.417973,0.992481
3,Champions,paid,31137.0,0.273035,0.219958,0.0,0.091988,0.229433,0.405882,0.992481
4,Inactive,defaulted,2305.0,0.535592,0.26584,0.0,0.335025,0.54902,0.761733,1.0
5,Inactive,paid,35392.0,0.485091,0.259919,0.0,0.275362,0.471014,0.690377,1.0
6,Promising,defaulted,4338.0,0.362445,0.262926,0.0,0.125933,0.324014,0.584,0.966667
7,Promising,paid,87567.0,0.358537,0.25994,0.0,0.124711,0.329545,0.565217,0.967972
8,Recent,defaulted,441.0,0.691025,0.21957,0.128205,0.538462,0.746988,0.876923,0.97561
9,Recent,paid,4186.0,0.706699,0.196853,0.128205,0.590164,0.753846,0.863014,1.0


In [76]:
segments = ["Champions", "Big Spenders", "Promising", "Recent", "Inactive"]

# One panel per entry of `segments`. The grid is sized from that list (not from
# the number of distinct segment values found in the data), so every `col=i+1`
# addressed in the loop below is guaranteed to exist.
fig = make_subplots(
    rows=1,
    cols=len(segments),
    shared_yaxes=True,
    vertical_spacing=0.001,
)
for i, segment in enumerate(segments):
    # Two bars per panel: the median `rate_transactions_installment` of the
    # segment's paid repayments and of its defaulted repayments.
    for status, color_key in (("paid", "dark_blue"), ("defaulted", "dark_purple")):
        fig.add_trace(
            go.Bar(
                y=[
                    df_data.loc[
                        (df_data["rfm_segment_desc"] == segment) & (df_data["status_cleaned"] == status),
                        "rate_transactions_installment",
                    ].median()
                ],
                x=[status],
                marker_color=colors[color_key],
                name=status,
            ),
            row=1,
            col=i+1
        )
    fig.update_xaxes(title_text=f"{segment}", row=1, col=i+1)

fig.update_layout(
    title="<b>Big Spenders and Inactive customers who prefer installment plans<br>for their purchases have a higher risk of default</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    yaxis_title="preference of installment plans<br>(0 - 100%)",
    barmode="group",
    bargroupgap=0.1,
    showlegend=False,
    template=template,
)
# Print each median as a whole-number percentage inside its bar.
fig.update_traces(
    texttemplate="%{y:.0%}",
    textposition="inside"
)
fig.update_annotations(font_size=font_size)

fig.show()

![img_11.png](attachment:ea225f9b-3acb-4bc5-991d-99ef1c3a1feb.png)

## Denied Transactions

**Note:** Big Spenders, Recent and Inactive customers whose transactions are denied frequently have a higher risk of default

In [77]:
segments = ["Champions", "Big Spenders", "Promising", "Recent", "Inactive"]

# One panel per entry of `segments`. The grid is sized from that list (not from
# the number of distinct segment values found in the data), so every `col=i+1`
# addressed in the loop below is guaranteed to exist.
fig = make_subplots(
    rows=1,
    cols=len(segments),
    shared_yaxes=True,
    vertical_spacing=0.001,
)
for i, segment in enumerate(segments):
    # Two boxes per panel: `rate_denied` of the segment's paid repayments and
    # of its defaulted repayments.
    for status, color_key in (("paid", "dark_blue"), ("defaulted", "dark_purple")):
        fig.add_trace(
            go.Box(
                y=df_data.loc[
                    (df_data["rfm_segment_desc"] == segment) & (df_data["status_cleaned"] == status),
                    "rate_denied",
                ],
                showlegend=False,
                marker_color=colors[color_key],
                name=status,
            ),
            row=1,
            col=i+1
        )
    fig.update_xaxes(title_text=f"{segment}", row=1, col=i+1)

fig.update_layout(
    title="<b>Big Spenders, Recent and Inactive customers whose transactions<br>are denied frequently have a higher risk of default</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    yaxis_title="transactions rejection rate",
    template=template,
)
fig.update_annotations(font_size=font_size)

fig.show()

![img_12.png](attachment:3c7494a2-4832-4915-9916-293a28600669.png)

In [78]:
# Summary statistics of `rate_denied` per (segment, status) pair.
(
    df_data
    .groupby(["rfm_segment_desc", "status_cleaned"], as_index=False)["rate_denied"]
    .describe()
)

Unnamed: 0,rfm_segment_desc,status_cleaned,count,mean,std,min,25%,50%,75%,max
0,Big Spenders,defaulted,302.0,0.294168,0.163468,0.053307,0.154971,0.270332,0.382038,0.818966
1,Big Spenders,paid,5421.0,0.247859,0.163975,0.034714,0.11831,0.201849,0.361191,0.818966
2,Champions,defaulted,1351.0,0.115355,0.076019,0.027778,0.067568,0.095652,0.125826,0.525194
3,Champions,paid,31137.0,0.101839,0.058179,0.027778,0.065076,0.089122,0.115007,0.525194
4,Inactive,defaulted,2305.0,0.189879,0.137051,0.0,0.092308,0.146341,0.258065,0.857585
5,Inactive,paid,35392.0,0.158901,0.112234,0.0,0.081232,0.123123,0.207547,0.857585
6,Promising,defaulted,4338.0,0.127687,0.084905,0.020134,0.070833,0.103076,0.156509,0.719424
7,Promising,paid,87567.0,0.116853,0.071925,0.020134,0.070715,0.098655,0.141762,0.77038
8,Recent,defaulted,441.0,0.221964,0.129204,0.0,0.111111,0.210526,0.295082,0.594203
9,Recent,paid,4186.0,0.193506,0.121244,0.0,0.1,0.183908,0.255814,0.7


In [79]:
segments = ["Champions", "Big Spenders", "Promising", "Recent", "Inactive"]

# One panel per entry of `segments`. The grid is sized from that list (not from
# the number of distinct segment values found in the data), so every `col=i+1`
# addressed in the loop below is guaranteed to exist.
fig = make_subplots(
    rows=1,
    cols=len(segments),
    shared_yaxes=True,
    vertical_spacing=0.001,
)
for i, segment in enumerate(segments):
    # Two bars per panel: the median `rate_denied` of the segment's paid
    # repayments and of its defaulted repayments.
    for status, color_key in (("paid", "dark_blue"), ("defaulted", "dark_purple")):
        fig.add_trace(
            go.Bar(
                y=[
                    df_data.loc[
                        (df_data["rfm_segment_desc"] == segment) & (df_data["status_cleaned"] == status),
                        "rate_denied",
                    ].median()
                ],
                x=[status],
                marker_color=colors[color_key],
                name=status,
            ),
            row=1,
            col=i+1
        )
    fig.update_xaxes(title_text=f"{segment}", row=1, col=i+1)

fig.update_layout(
    title="<b>Big Spenders, Recent and Inactive customers whose transactions<br>are denied frequently have a higher risk of default</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    yaxis_title="transactions rejection rate",
    barmode="group",
    bargroupgap=0.1,
    showlegend=False,
    template=template,
)
# Print each median as a whole-number percentage inside its bar.
fig.update_traces(
    texttemplate="%{y:.0%}",
    textposition="inside"
)
fig.update_annotations(font_size=font_size)

fig.show()

![img_13.png](attachment:5cf2875e-be09-449a-b349-d7c603089e41.png)

## Purchase Mode

**Note:** A preference for in-person over online purchases (and vice versa) doesn't significantly impact the risk of a user defaulting

In [81]:
labels = ["in-person", "online"]
segments = ["Champions", "Big Spenders", "Promising", "Recent", "Inactive"]

# One panel per entry of `segments`. The grid is sized from that list (not from
# the number of distinct segment values found in the data), so every `col=i+1`
# addressed in the loop below is guaranteed to exist.
fig = make_subplots(
    rows=1,
    cols=len(segments),
    shared_yaxes=True,
    vertical_spacing=0.001,
)
for i, segment in enumerate(segments):
    in_segment = df_data["rfm_segment_desc"] == segment
    # Within the segment: share of repayments per status for each purchase mode.
    for status, color_key in (("paid", "dark_blue"), ("defaulted", "dark_purple")):
        fig.add_trace(
            go.Bar(
                y=[
                    df_data.loc[
                        (df_data["online_person_preference"] == label) & in_segment & (df_data["status_cleaned"] == status),
                        "repayment_id",
                    ].count()
                    / df_data.loc[(df_data["online_person_preference"] == label) & in_segment, "repayment_id"].count()
                    for label in labels
                ],
                x=labels,
                # Only the first panel contributes legend entries, to avoid duplicates.
                showlegend=(i == 0),
                marker_color=colors[color_key],
                name=status,
            ),
            row=1,
            col=i+1
        )
    fig.update_xaxes(title_text=f"{segment}", row=1, col=i+1)

fig.update_layout(
    # Title typo fixed: "vise versa" -> "vice versa".
    title="<b>A preference of in-person over online purchases (and vice versa),<br>doesn't impact significantly the risk of a user defaulting</b>",
    title_font_size=title_font_size,
    font_size=font_size,
    yaxis_title="% of repayments",
    barmode="stack",
    legend=dict(
        traceorder="normal",
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    template=template,
)
# Print each share as a whole-number percentage.
fig.update_traces(
    texttemplate="%{y:.0%}",
    textposition="outside"
)
fig.update_annotations(font_size=font_size)

fig.show()

![img_14.png](attachment:6fe11584-abd3-41c8-8053-d5eeb6d4d7ab.png)

## Previous Repayment Status

**Note:** 24% of prior repayment defaults lead to subsequent defaults, posing a significant risk

In [82]:
# Share of paid vs. defaulted repayments, split by the status of the
# previous repayment, drawn as horizontal stacked bars.
labels = ["paid", "defaulted"]
# Repayment outcomes in the order their traces are added, with their bar colours.
status_colors = [("paid", colors["dark_blue"]), ("defaulted", colors["dark_purple"])]

# Repayments (non-null repayment_id) per previous status: the denominator
# shared by the paid and the defaulted bar.
prev_totals = [
    df_data.loc[df_data["prev_status"] == label, "repayment_id"].count()
    for label in labels
]

fig = go.Figure([
    go.Bar(
        y=labels,
        x=[
            # A previous-status group without repayments has no defined share.
            df_data.loc[
                (df_data["prev_status"] == label) & (df_data["status_cleaned"] == status),
                "repayment_id",
            ].count() / total
            if total else np.nan
            for label, total in zip(labels, prev_totals)
        ],
        marker_color=color,
        orientation="h",
        name=status,
    )
    for status, color in status_colors
])

fig.update_layout(
    title='<b>24% of prior repayment defaults lead to subsequent defaults, posing a significant risk</b>',
    title_font_size=title_font_size,
    font_size=font_size,
    xaxis_title="% of repayments",
    yaxis_title="prev. repayment status",
    barmode="stack",
    legend=dict(
        traceorder="normal",
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    template=template,
)
# The bars are horizontal, so the categories sit on the y axis; the explicit
# category order has to be set there (the x axis carries the numeric shares).
fig.update_yaxes(
    categoryorder='array',
    categoryarray=labels
)
fig.update_traces(
    texttemplate="%{x:.0%}",
    textposition="inside"
)

fig.show()

![img_15.png](attachment:140f443a-5d27-469f-9a77-ff19ff9fc8dd.png)