# Data Preprocessing and Feature Engineering

## Table of Contents

- Load Data
- Data Exploration
    - Data Types
    - Missing Data
    - Variable Content
- Data Preprocessing
    - checkpoint
- Feature Engineering
    - Loan Related
    - Loan Repayment Related
    - User Related 
    - checkpoint

In [207]:
from contextlib import closing
import sqlite3
from copy import deepcopy
import numpy as np
import pandas as pd

# Load Data

In [208]:
# Read the three source tables from the SQLite database into DataFrames.
# `closing` makes sure the connection is closed once the block is left.
with closing(sqlite3.connect("../data/raw/database.db")) as conn:
    df_loans, df_loan_repayments, df_transactions = (
        pd.read_sql_query(query, conn)
        for query in (
            "SELECT * FROM loans",
            "SELECT * FROM loan_repayments",
            "SELECT * FROM transactions",
        )
    )

In [209]:
# Size of the loans table, followed by a preview of its first five rows.
print(f"number of instances: {len(df_loans):,}")

df_loans.head()

number of instances: 6,746


Unnamed: 0,id,user_id,amount,total_amount,due_amount,due_date,status,created_at
0,0,3070,6000.0,6045.28,6459000000,2022-05-02,repaid,2022-02-01 00:47:29.575000+00:00
1,1,2546,6000.0,6045.28,6459000000,2022-05-02,repaid,2022-02-01 00:49:51.763000+00:00
2,2,2413,6000.0,6045.28,6459000000,2022-05-02,repaid,2022-02-01 01:24:40.537000+00:00
3,3,2585,6000.0,6045.28,6459000000,2022-05-02,debt_collection,2022-02-01 02:52:59.803000+00:00
4,4,2556,6000.0,6045.28,6459000000,2022-05-02,repaid,2022-02-01 02:53:07.123000+00:00


In [210]:
# Size of the loan repayments table, followed by a preview of its first five rows.
print(f"number of instances: {len(df_loan_repayments):,}")

df_loan_repayments.head()

number of instances: 172,445


Unnamed: 0,id,loan_id,type,amount,status,created_at
0,1,2,autopilot,269.7,paid,2022-02-01 00:10:08.057000+00:00
1,2,4,autopilot,2550.0,paid,2022-02-01 00:10:08.102000+00:00
2,3,53,pix,1500.0,defaulted,2022-02-01 13:09:53.757000+00:00
3,4,22,autopilot,630.0,paid,2022-02-02 01:21:34.278000+00:00
4,5,70,autopilot,120.0,paid,2022-02-02 01:21:34.449000+00:00


In [211]:
# Size of the transactions table, followed by a preview of its first five rows.
print(f"number of instances: {len(df_transactions):,}")

df_transactions.head()

number of instances: 1,448,684


Unnamed: 0,id,user_id,amount,status,capture_method,payment_method,installments,card_brand,created_at
0,1,2546,449.6,approved,emv,credit,1,mastercard,2021-12-02 13:41:53.548000+00:00
1,2,2546,269.8,approved,emv,credit,2,mastercard,2021-12-02 15:57:58.742000+00:00
2,3,2546,149.9,approved,contactless,debit,1,visa,2021-12-02 19:36:31.859000+00:00
3,4,2546,142.0,approved,contactless,credit,1,visa,2021-12-02 19:37:11.442000+00:00
4,5,2546,156.0,approved,emv,debit,1,mastercard,2021-12-02 20:20:26.648000+00:00


# Data Exploration

- Data Types
- Missing Data
- Variable Content

## Data Types

In [212]:
# Print the dtypes before and after converting the date columns to datetimes.
print(f"{df_loans.dtypes}\n")

df_loans["due_date"] = pd.to_datetime(df_loans["due_date"])
# Keep only the calendar date of `created_at`: parse the ISO 8601 timestamp, drop the UTC offset
# (the wall-clock time is kept) and truncate to midnight. This yields the same naive
# datetime64 values as formatting to '%Y-%m-%d' and parsing again, without a string round trip.
df_loans["created_at"] = (
    pd.to_datetime(df_loans["created_at"], format="ISO8601")
    .dt.tz_localize(None)
    .dt.normalize()
)

print(df_loans.dtypes)

id                int64
user_id           int64
amount          float64
total_amount    float64
due_amount        int64
due_date         object
status           object
created_at       object
dtype: object

id                       int64
user_id                  int64
amount                 float64
total_amount           float64
due_amount               int64
due_date        datetime64[ns]
status                  object
created_at      datetime64[ns]
dtype: object


In [213]:
# Print the dtypes before and after converting `created_at` to a datetime.
print(f"{df_loan_repayments.dtypes}\n")

# Keep only the calendar date of `created_at`: parse the ISO 8601 timestamp, drop the UTC offset
# (the wall-clock time is kept) and truncate to midnight. This yields the same naive
# datetime64 values as formatting to '%Y-%m-%d' and parsing again, without a string round trip.
df_loan_repayments["created_at"] = (
    pd.to_datetime(df_loan_repayments["created_at"], format="ISO8601")
    .dt.tz_localize(None)
    .dt.normalize()
)

print(df_loan_repayments.dtypes)

id              int64
loan_id         int64
type           object
amount        float64
status         object
created_at     object
dtype: object

id                     int64
loan_id                int64
type                  object
amount               float64
status                object
created_at    datetime64[ns]
dtype: object


In [214]:
# Print the dtypes before and after converting `created_at` to a datetime.
print(f"{df_transactions.dtypes}\n")

# Keep only the calendar date of `created_at`: parse the ISO 8601 timestamp, drop the UTC offset
# (the wall-clock time is kept) and truncate to midnight. This yields the same naive
# datetime64 values as formatting to '%Y-%m-%d' and parsing again, without a string round trip
# over every row of the largest table.
df_transactions["created_at"] = (
    pd.to_datetime(df_transactions["created_at"], format="ISO8601")
    .dt.tz_localize(None)
    .dt.normalize()
)

print(df_transactions.dtypes)

id                  int64
user_id             int64
amount            float64
status             object
capture_method     object
payment_method     object
installments        int64
card_brand         object
created_at         object
dtype: object

id                         int64
user_id                    int64
amount                   float64
status                    object
capture_method            object
payment_method            object
installments               int64
card_brand                object
created_at        datetime64[ns]
dtype: object


## Missing Data

In [215]:
# Number of missing values in each column of the loans table.
df_loans.isna().sum()

id              0
user_id         0
amount          0
total_amount    0
due_amount      0
due_date        0
status          0
created_at      0
dtype: int64

In [216]:
# Number of missing values in each column of the loan repayments table.
df_loan_repayments.isna().sum()

id            0
loan_id       0
type          0
amount        0
status        0
created_at    0
dtype: int64

In [217]:
# Number of missing values in each column of the transactions table.
df_transactions.isna().sum()

id                0
user_id           0
amount            0
status            0
capture_method    0
payment_method    0
installments      0
card_brand        0
created_at        0
dtype: int64

## Variable Content

In [218]:
# Distinct loans and borrowers, and the calendar years in which loans were created.
print(
    f"number of loans: {df_loans['id'].nunique():,}",
    f"number of users: {df_loans['user_id'].nunique():,}",
    f"loans originating (years): {df_loans['created_at'].dt.year.unique()}",
    sep="\n",
)

number of loans: 6,746
number of users: 3,154
loans originating (years): [2022]


In [219]:
# Distinct loans that have at least one repayment row, and the years in which repayments were created.
print(
    f"number of loans being repayed: {df_loan_repayments['loan_id'].nunique():,}",
    f"repayments originating (years): {df_loan_repayments['created_at'].dt.year.unique()}",
    sep="\n",
)

number of loans being repayed: 6,598
repayments originating (years): [2022 2023]


In [220]:
# Distinct users that have transactions, and the years in which transactions were created.
print(
    f"number of user: {df_transactions['user_id'].nunique():,}",
    f"transactions originating (years): {df_transactions['created_at'].dt.year.unique()}",
    sep="\n",
)

number of user: 3,150
transactions originating (years): [2021 2022 2023]


**Notes:**

- `table_transactions` is missing information about 4 users who have taken a loan
- `table_loan_repayments` is missing information about 148 loans

In [221]:
# Distinct capture methods that occur in the transactions, in order of first appearance.
df_transactions.loc[:, "capture_method"].unique()

array(['emv', 'contactless', 'payment_link', 'ecommerce', 'mpos',
       'payment_link_web'], dtype=object)

**Notes:**

- emv: Europay, Mastercard and Visa
- mpos: Mobile Point of Sale

In [222]:
# Distinct card brands that occur in the transactions, in order of first appearance.
df_transactions.loc[:, "card_brand"].unique()

array(['mastercard', 'visa', 'elo', 'hipercard', 'amex'], dtype=object)

**Notes:**

- Elo and Hipercard are specific to Brazil

# Data Preprocessing 


- **Handle invalid data**
    - exclude loans that were cancelled or created by error both from `table_loans` and `table_loan_repayments`
    - due_amount in table_loans needs to be converted to a float amount
- **Misc**
    - auxiliary repayment status feature, where `refunded` is merged with `paid` (refunded: a loan repayment that happened but was fully refunded to the user)
    - auxiliary loan status features, where `debt_collection` and `debt_repaid` are merged together as `debt` 
- **Notes**
    - `table_loan_repayments` is missing information about 148 loans
        - keep and impute the missing values during modeling
    - `table_transactions` is missing information about 4 users
        - keep and consider the cold start problem

In [223]:
# Loan statuses to discard: loans that were cancelled or created by error.
exclude = ["error", "cancelled"]
excluded_ids = set(df_loans.loc[df_loans["status"].isin(exclude), "id"])

# Remove those loans and every repayment that points at them.
# `.copy()` turns each filtered result into an independent frame, so the column assignments made
# in the following cells do not raise SettingWithCopyWarning.
df_loans = df_loans[~df_loans["id"].isin(excluded_ids)].copy()
df_loan_repayments = df_loan_repayments[~df_loan_repayments["loan_id"].isin(excluded_ids)].copy()

In [224]:
# `due_amount` is loaded as an integer; dividing by 1,000,000 turns it into a float amount.
# Not idempotent: running this cell a second time divides the column again.
df_loans["due_amount"] = df_loans["due_amount"].div(1_000_000)

In [225]:
# Auxiliary loan status: `debt_collection` and `debt_repaid` are collapsed into a single `debt`
# value; every other status is carried over unchanged.
df_loans["status_cleaned"] = df_loans["status"].replace(["debt_collection", "debt_repaid"], "debt")

In [226]:
# Auxiliary repayment status: `refunded` (a repayment that happened but was fully refunded to the
# user) is counted as `paid`; every other status is carried over unchanged.
df_loan_repayments["status_cleaned"] = df_loan_repayments["status"].replace("refunded", "paid")

## ✅ checkpoint

In [227]:
# Persist the preprocessed frames so later stages can start from this checkpoint.
for frame, target in (
    (df_loans, "../data/processed/df_loans.pkl"),
    (df_loan_repayments, "../data/processed/df_loan_repayments.pkl"),
    (df_transactions, "../data/processed/df_transactions.pkl"),
):
    frame.to_pickle(target)

# Feature Engineering

- **Loan Related**
    - *loan term:* length of time over which a loan is to be repaid (days)
    - *loan fees:* charges that have incurred when the loan was taken
    - *loan amount bins* & *loan due amount bins* & *loan fees bins*
    - *interest rate* & *interest rate bins*
    - *default ratio:* number of defaulted repayments to total number repayments
    - *median repayment amount*
    - *repayment amount variability*: standard deviation of repayment amounts (how consistent are the repayments) 
    - *number of repayments*
    - *ratio of repaid amount to total amount*
    - *max repayment delay:* maximum number of days taken for a loan to be repaid after the due date has been exceeded
    - *due date exceeded*
    - *number of late repayments*
    - *median repayment lag*
- **Loan Repayments Related**
    - *due date exceeded*
    - *time since due date:* length of time between due date and repayment (days)
    - *repayment lag:* lag between current repayment date and previous repayment date (days)
- **User Related**
    - *recency:* time since the user's last transaction
    - *frequency:* how often users make a purchase
    - *monetary:* total amount spent on purchases
    - *median spending:* median amount spent
    - *median number of installments*
    - *median transaction lag:* median time between transactions per user
    - *denied rate:* number of denied transactions to total number of transactions
    - *transactions with installments ratio:* proportion of transactions with installments
    - *credit to debit ratio:* number of credit transactions to number of debit transactions
    - *online to in-person purchases ratio:* number of online purchases to number of in-person purchases
    - *card brand preference:* determine whether an International or Brazil-based card provider is preferred
    - *default ratio:* number of defaulted repayments to total number repayments across all user loans
    - *maximum repayment delay*
    - *median repayment lag*
    - *number of loans*
    - *due date exceeded ratio:* number of loans that were repaid after the deadline to the total number of loans
    - *frequency of late repayments*
    - *due date compliance (ratio):* ratio of repayments made on time (before the due date) to total repayments

## Loan Related

In [228]:
# Loan Term: whole days between the loan's creation date and its due date.
# Hypothesis: longer loan terms might correlate with a higher risk of default.
df_loans["loan_term"] = df_loans["due_date"].sub(df_loans["created_at"]).dt.days

In [229]:
# Bucket `amount` (0 to 10k) and `due_amount` (0 to 15k) into 1k-wide bins labelled
# "0k - 1k", "1k - 2k", ...
# `between` is inclusive on both ends, so a value sitting exactly on a shared edge is matched by two
# consecutive bins and keeps the label written last, i.e. the higher bin.
# Values outside the covered range keep a missing label.
for column, ceiling in (("amount", 10_000), ("due_amount", 15_000)):
    for lower in range(0, ceiling, 1_000):
        upper = lower + 1_000
        in_bin = df_loans[column].between(lower, upper)
        df_loans.loc[in_bin, f"{column}_bin"] = f"{lower // 1_000}k - {upper // 1_000}k"

In [230]:
# Loan Fees: what `total_amount` adds on top of the (net) `amount`.
# Hypothesis: higher fees might correlate with a higher risk of default.
df_loans["loan_fees"] = df_loans["total_amount"].sub(df_loans["amount"])

# Loan fees bins of width 10 covering 0 to 60. `between` is inclusive on both ends, so a fee exactly
# on a shared edge ends up in the higher bin (the later assignment overwrites the earlier one).
for lower in range(0, 60, 10):
    upper = lower + 10
    in_bin = df_loans["loan_fees"].between(lower, upper)
    df_loans.loc[in_bin, "loan_fees_bin"] = f"{lower} - {upper}"

In [231]:
# Interest Rate (percent per year, rounded to a whole number), from the simple-interest formula
# A = P * (1 + r * t)  =>  r = (A - P) / (P * t), where
#   P = principal (`amount`)
#   A = total accrued amount, principal plus interest (`due_amount`)
#   t = loan term in years (`loan_term` in days / 365)
accrued_interest = df_loans["due_amount"] - df_loans["amount"]
principal_years = df_loans["amount"] * df_loans["loan_term"] / 365
df_loans["interest_rate"] = (accrued_interest / principal_years * 100).round()

# Interest rate bins: below 40 is "small", 40 to 100 (both inclusive) is "medium", above 100 is "large".
for label, in_band in (
    ("small", df_loans["interest_rate"] < 40),
    ("medium", df_loans["interest_rate"].between(40, 100)),
    ("large", df_loans["interest_rate"] > 100),
):
    df_loans.loc[in_band, "interest_rate_bin"] = label

In [232]:
# One-hot encode the cleaned repayment status and attach the indicator columns to the repayments
# for the duration of this cell.
status_flags = pd.get_dummies(df_loan_repayments["status_cleaned"]).rename(
    columns={"defaulted": "status_defaulted", "paid": "status_paid"}
)
df_loan_repayments = pd.concat([df_loan_repayments, status_flags], axis=1)

# Number of defaulted and of paid repayments per loan, merged onto the loans.
# Loans without any repayment row get missing counts from the left join.
df_temp = (
    df_loan_repayments.groupby(["loan_id"], as_index=False)
    .agg(
        {
            "status_defaulted": lambda flags: flags.sum(),
            "status_paid": lambda flags: flags.sum(),
        }
    )
    .rename(columns={"loan_id": "id"})
)
df_loans = df_loans.merge(df_temp, how="left", on="id")

# Default Ratio: defaulted repayments as a share of defaulted plus paid repayments.
total_repayments = df_loans["status_defaulted"] + df_loans["status_paid"]
df_loans["rate_default"] = df_loans["status_defaulted"] / total_repayments

# The indicator columns were only needed for the aggregation.
df_loan_repayments = df_loan_repayments.drop(columns=["status_defaulted", "status_paid"])

In [233]:
# Per loan: the median repayment amount and the number of repayment rows.
per_loan = df_loan_repayments.groupby(["loan_id"], as_index=False)
df_temp = per_loan.agg(
    {
        "amount": lambda amounts: amounts.median(),
        "id": lambda ids: len(ids),
    }
)
# The mapping is applied in one pass: the grouping key becomes `id`, the aggregated repayment `id`
# column becomes the repayment count.
df_temp = df_temp.rename(
    columns={"loan_id": "id", "amount": "median_repayment_amount", "id": "num_repayments"}
)
df_loans = df_loans.merge(df_temp, how="left", on="id")

In [234]:
# Repayment Amount Variability: standard deviation of the repayment amounts of each loan,
# i.e. how consistent or inconsistent the repayment amounts are.
df_temp = (
    df_loan_repayments.groupby("loan_id")["amount"]
    .std()
    .rename("std_repayment_amount")
    .rename_axis("id")
    .reset_index()
)
df_loans = df_loans.merge(df_temp, how="left", on="id")

In [235]:
# Auxiliary signed amount per repayment, used only inside this cell:
#   paid      -> +amount
#   refunded  -> -amount
#   otherwise -> 0 (e.g. defaulted repayments do not count towards the repaid sum)
# The column starts as float 0.0 so that writing float amounts into it does not change its dtype
# (an integer 0 triggers pandas' incompatible-dtype warning on the assignments below).
# NOTE(review): a refunded repayment is subtracted rather than counted as zero; confirm that the
# original payment is also present as a separate `paid` row, otherwise the refund is counted twice.
is_paid = df_loan_repayments["status"] == "paid"
is_refunded = df_loan_repayments["status"] == "refunded"

df_loan_repayments["temp_amount"] = 0.0
df_loan_repayments.loc[is_paid, "temp_amount"] = df_loan_repayments.loc[is_paid, "amount"]
df_loan_repayments.loc[is_refunded, "temp_amount"] = -df_loan_repayments.loc[is_refunded, "amount"]

# Repaid Amount: sum of the signed amounts per loan.
df_temp = (
    df_loan_repayments.groupby("loan_id", as_index=False)["temp_amount"]
    .sum()
    .rename(columns={"loan_id": "id", "temp_amount": "repaid_amount"})
)
df_loans = df_loans.merge(df_temp, how="left", on="id")

# Ratio of Repaid Amount to the loan's `total_amount`: can be a good indicator of the borrower's
# repayment behavior.
df_loans["ratio_repaid_total"] = df_loans["repaid_amount"] / df_loans["total_amount"]

# Remove the helper columns again.
df_loans = df_loans.drop(columns=["repaid_amount"])
df_loan_repayments = df_loan_repayments.drop(columns=["temp_amount"])

## Loan Repayment Related

In [236]:
# Attach each loan's creation date (as `loan_created_at`) and due date to its repayments, then
# order the repayments by loan and repayment date.
loan_dates = df_loans[["id", "created_at", "due_date"]].rename(
    columns={"id": "loan_id", "created_at": "loan_created_at"}
)
df_loan_repayments = (
    df_loan_repayments
    .merge(loan_dates, how="left", on="loan_id")
    .sort_values(["loan_id", "created_at"])
)

In [237]:
# Time Since Due Date: whole days between the due date and the repayment.
# A repayment made before the due date would give a negative value; those are set to 0 to
# represent that the due date has not been exceeded.
overdue_days = (df_loan_repayments["created_at"] - df_loan_repayments["due_date"]).dt.days
df_loan_repayments["days_since_due_date"] = overdue_days.mask(overdue_days < 0, 0)

# Max Repayment Delay (per loan): the largest number of days any repayment of the loan was made
# after the due date.
df_temp = (
    df_loan_repayments.groupby("loan_id")["days_since_due_date"]
    .max()
    .rename("max_repayment_delay")
    .rename_axis("id")
    .reset_index()
)
df_loans = df_loans.merge(df_temp, how="left", on="id")

In [238]:
# Due Date Exceeded, for repayments and for loans.
# Repayment level: 1 when the repayment was made at least one day after the due date, else 0.
df_loan_repayments["due_date_exceeded"] = (
    df_loan_repayments["days_since_due_date"] > 0
).astype("int64")

# Loan level: 1 when any repayment of the loan exceeded the due date.
df_temp = (
    df_loan_repayments.groupby("loan_id")["due_date_exceeded"]
    .max()
    .rename_axis("id")
    .reset_index()
)
df_loans = df_loans.merge(df_temp, how="left", on="id")

In [239]:
# Number of Late Repayments (per loan): how many repayments were made after the due date.
# This can indicate the borrower's tendency to miss deadlines.
df_temp = (
    df_loan_repayments.groupby("loan_id")["due_date_exceeded"]
    .sum()
    .rename("num_late_repayment")
    .rename_axis("id")
    .reset_index()
)
df_loans = df_loans.merge(df_temp, how="left", on="id")

In [240]:
# Repayment Lag: whole days between a repayment and the previous repayment of the same loan.
# This can indicate whether the borrower is keeping up with the repayment schedule.
# The difference is taken row to row within each loan, so it relies on the repayments already
# being ordered by loan and date.
df_loan_repayments["days_lag_repayment"] = (
    df_loan_repayments.groupby(["loan_id"], as_index=False)["created_at"].diff().dt.days
)

# The first repayment of a loan has no predecessor; it is compared to the loan creation date.
first_repayment = df_loan_repayments["days_lag_repayment"].isna()
df_loan_repayments.loc[first_repayment, "days_lag_repayment"] = (
    df_loan_repayments.loc[first_repayment, "created_at"]
    - df_loan_repayments.loc[first_repayment, "loan_created_at"]
).dt.days

# Median Repayment Lag (per loan), rounded up to whole days.
# Only the lag column is rounded: applying `np.ceil` to the whole aggregated frame would also
# convert the `loan_id` key to float before it is used as the merge key.
df_temp = (
    df_loan_repayments
    .groupby("loan_id", as_index=False)
    ["days_lag_repayment"].median()
    .rename(columns={"loan_id": "id", "days_lag_repayment": "median_days_lag_repayment"})
)
df_temp["median_days_lag_repayment"] = np.ceil(df_temp["median_days_lag_repayment"])
df_loans = df_loans.merge(df_temp, how="left", on="id")

# The loan dates were only needed for the repayment-level features.
df_loan_repayments = df_loan_repayments.drop(columns=["loan_created_at", "due_date"])

## User Related

In [241]:
# RFM features per user, built from the transactions.
#
# Recency: days since the user's last transaction.
#   Active users might be more financially active and potentially better candidates for loan repayment.
# Frequency: number of transactions of the user.
#   Users with a higher frequency of transactions might be more financially active
#   and potentially better candidates for loan repayment.
# Monetary: total amount spent on purchases.
#   Users who typically make larger transactions or have more consistent transaction amounts
#   might be more likely to repay loans.

# Recency is measured against the latest transaction date in the sample.
date_current = df_transactions["created_at"].max()

rfm_aggregations = {
    "created_at": lambda dates: (date_current - dates.max()).days,
    "id": lambda ids: len(ids),
    "amount": lambda amounts: amounts.sum(),
}
rfm_names = {"created_at": "recency", "id": "frequency", "amount": "monetary"}

df_users = (
    df_transactions.groupby("user_id", as_index=False)
    .agg(rfm_aggregations)
    .rename(columns=rfm_names)
)

In [242]:
# Per user: the median transaction amount (Median Spending) and the median number of installments.
per_user = df_transactions.groupby("user_id", as_index=False)
df_temp = per_user.agg(
    {
        "amount": lambda amounts: amounts.median(),
        "installments": lambda counts: counts.median(),
    }
)
df_temp = df_temp.rename(columns={"amount": "median_spending", "installments": "median_intallments"})

df_users = df_users.merge(df_temp, how="left", on="user_id")

In [243]:
# Median Transaction Lag: median number of days between consecutive transactions of a user.
# The lag is a row-to-row difference within each user, so the transactions are put in
# chronological order per user before differencing. The result is assigned back by index, so the
# row order of `df_transactions` itself is left untouched.
df_transactions["days_lag_transaction"] = (
    df_transactions
    .sort_values(["user_id", "created_at"])
    .groupby("user_id")["created_at"]
    .diff()
    .dt.days
)

# A user's first transaction has no predecessor and is ignored by the median; users with a single
# transaction therefore get a missing lag.
df_temp = (
    df_transactions.groupby(["user_id"], as_index=False)
    ["days_lag_transaction"].median()
    .rename(columns={"days_lag_transaction": "median_lag_transaction"})
)

df_users = df_users.merge(df_temp, how="left", on="user_id")

In [244]:
# One-hot encode the transaction status and attach the indicators for the duration of this cell.
flag_columns = ["transaction_approved", "transaction_denied"]
status_flags = pd.get_dummies(df_transactions["status"]).rename(
    columns={"approved": "transaction_approved", "denied": "transaction_denied"}
)
df_transactions = pd.concat([df_transactions, status_flags], axis=1)

# Number of approved and of denied transactions per user.
df_temp = df_transactions.groupby(["user_id"], as_index=False).agg(
    {column: (lambda flags: flags.sum()) for column in flag_columns}
)
df_users = df_users.merge(df_temp, how="left", on="user_id")

# Denied Ratio: denied transactions as a share of all transactions of the user.
# Users with a higher denied rate might demonstrate worse financial behavior.
df_users["rate_denied"] = df_users["transaction_denied"].div(df_users["frequency"])

# The indicator columns were only needed for the aggregation.
df_users = df_users.drop(columns=flag_columns)
df_transactions = df_transactions.drop(columns=flag_columns)

In [245]:
# Flag transactions paid in more than one installment (1 = with installments, 0 = otherwise).
# The flag is built with one vectorised comparison and stored as an integer column, so the
# per-user sum and the resulting rate are numeric; assigning True/False through `.loc` masks into
# a new column leaves an object-dtype column instead.
df_transactions["with_installment"] = (df_transactions["installments"] > 1).astype("int64")

df_temp = df_transactions.groupby(["user_id"], as_index=False)["with_installment"].sum()
df_users = df_users.merge(df_temp, how="left", on="user_id")

# Transactions with Installments Ratio: proportion of transactions with installments.
# Users who frequently opt for installment payments might have better financial planning and
# repayment capabilities.
df_users["rate_transactions_installment"] = df_users["with_installment"] / df_users["frequency"]

# The flag and its per-user count were only needed for the ratio.
df_users = df_users.drop(columns=["with_installment"])
df_transactions = df_transactions.drop(columns=["with_installment"])

In [246]:
# One-hot encode the payment method and attach the indicators for the duration of this cell.
method_columns = ["payment_method_credit", "payment_method_debit"]
method_flags = pd.get_dummies(df_transactions["payment_method"]).rename(
    columns={"credit": "payment_method_credit", "debit": "payment_method_debit"}
)
df_transactions = pd.concat([df_transactions, method_flags], axis=1)

# Number of credit and of debit transactions per user.
df_temp = df_transactions.groupby(["user_id"], as_index=False).agg(
    {column: (lambda flags: flags.sum()) for column in method_columns}
)
df_users = df_users.merge(df_temp, how="left", on="user_id")

# Credit to Debit Ratio: number of credit transactions to number of debit transactions.
# Users who rely on credit cards might be more likely to repay loans.
# Add-one smoothing on both counts keeps the ratio finite for users without debit transactions.
credit_count = 1 + df_users["payment_method_credit"]
debit_count = 1 + df_users["payment_method_debit"]
df_users["rate_credit_debit"] = credit_count / debit_count

# The indicator columns were only needed for the aggregation.
df_users = df_users.drop(columns=method_columns)
df_transactions = df_transactions.drop(columns=method_columns)

In [247]:
# Based on the capture method, derive whether a purchase was made in-person or online.
# Capture methods that appear in neither list keep a missing purchase type.
purchase_in_person = ["emv", "contactless", "mpos"]
purchase_online = ["payment_link", "ecommerce", "payment_link_web"]
for channel, methods in (("in_person", purchase_in_person), ("online", purchase_online)):
    uses_channel = df_transactions["capture_method"].isin(methods)
    df_transactions.loc[uses_channel, "purchase_type"] = channel

# One-hot encode the purchase type and count both kinds of purchase per user.
channel_columns = ["purchase_in_person", "purchase_online"]
channel_flags = pd.get_dummies(df_transactions["purchase_type"]).rename(
    columns={"in_person": "purchase_in_person", "online": "purchase_online"}
)
df_transactions = pd.concat([df_transactions, channel_flags], axis=1)
df_temp = df_transactions.groupby(["user_id"], as_index=False).agg(
    {column: (lambda flags: flags.sum()) for column in channel_columns}
)
df_users = df_users.merge(df_temp, how="left", on="user_id")

# Online to In-Person Purchases Ratio: number of online purchases to number of in-person purchases.
# This ratio indicates users' financial habits.
# Add-one smoothing on both counts keeps the ratio finite for users without in-person purchases.
online_count = 1 + df_users["purchase_online"]
in_person_count = 1 + df_users["purchase_in_person"]
df_users["ratio_online_person"] = online_count / in_person_count

# Remove the helper columns again.
df_users = df_users.drop(columns=channel_columns)
df_transactions = df_transactions.drop(columns=["purchase_type", "purchase_in_person", "purchase_online"])

In [248]:
# Based on the card brand, derive whether an International or a Brazil-based card provider is used.
# Brands that appear in neither list keep a missing card type.
card_internation = ["mastercard", "visa", "amex"]
card_brazil = ["elo", "hipercard"]
for card_label, brands in (("card_internation", card_internation), ("card_brazil", card_brazil)):
    df_transactions.loc[df_transactions["card_brand"].isin(brands), "card_type"] = card_label

# Card Brand Preference: the card type a user uses most often.
# Count the transactions per user and card type, order so that each user's largest count comes
# first, and keep that first row per user. With equal counts the kept type is whichever row the
# sort places first.
type_counts = df_transactions.groupby(["user_id"], as_index=False)["card_type"].value_counts()
df_temp = (
    type_counts
    .sort_values(['user_id', 'count'], ascending=False)
    .drop_duplicates(['user_id'])
)
preferred = df_temp[["user_id", "card_type"]]
df_users = df_users.merge(preferred, how="left", on="user_id").rename(
    columns={"card_type": "card_preference"}
)

# The card type was only needed to derive the preference.
df_transactions = df_transactions.drop(columns=["card_type"])

In [249]:
# Number of defaulted and of paid repayments per user, summed over all loans of the user.
# Both count columns stay in `df_users`; this cell does not drop them.
per_user_loans = df_loans.groupby("user_id", as_index=False)
df_temp = per_user_loans.agg(
    {
        "status_defaulted": lambda counts: counts.sum(),
        "status_paid": lambda counts: counts.sum(),
    }
)
df_users = df_users.merge(df_temp, how="left", on="user_id")

# Default Ratio (user level): (1 + defaulted repayments) / (1 + paid repayments), i.e. defaulted
# relative to paid repayments with add-one smoothing. Users with a higher value demonstrate worse
# financial behavior.
# NOTE(review): the loan-level `rate_default` is defaulted / (defaulted + paid); confirm whether
# the user-level feature is meant to follow the same definition.
defaulted_count = 1 + df_users["status_defaulted"]
paid_count = 1 + df_users["status_paid"]
df_users["rate_default"] = defaulted_count / paid_count

In [250]:
# Median Repayment Amount per user: the median of the per-loan medians across all loans of the user.
df_temp = df_loans.groupby("user_id")["median_repayment_amount"].median().reset_index()
df_users = df_users.merge(df_temp, how="left", on="user_id")

In [251]:
# Maximum Repayment Delay per user: the largest per-loan maximum delay across all loans of the user.
df_temp = df_loans.groupby("user_id")["max_repayment_delay"].max().reset_index()
df_users = df_users.merge(df_temp, how="left", on="user_id")

In [252]:
# Median Repayment Lag per user: the median of the per-loan median lags across all loans of the user.
df_temp = df_loans.groupby("user_id")["median_days_lag_repayment"].median().reset_index()
df_users = df_users.merge(df_temp, how="left", on="user_id")

In [253]:
# Per user: the number of loans, and the number of loans with at least one repayment made after
# the due date (Times Due Date Exceeded).
loan_aggregations = {
    "due_date_exceeded": lambda flags: flags.sum(),
    "id": lambda ids: len(ids),
}
df_temp = (
    df_loans.groupby("user_id", as_index=False)
    .agg(loan_aggregations)
    .rename(columns={"id": "num_loans", "due_date_exceeded": "times_due_date_exceeded"})
)
df_users = df_users.merge(df_temp, how="left", on="user_id")

# Due Date Exceeded Ratio: loans repaid after the deadline as a share of all loans of the user.
df_users["rate_due_date_exceeded"] = df_users["times_due_date_exceeded"].div(df_users["num_loans"])

# Only the ratio and `num_loans` are kept.
df_users = df_users.drop(columns=["times_due_date_exceeded"])

In [254]:
# Frequency of Late Repayments: number of late repayments of a user across all loans.
late_per_user = (
    df_loans.groupby("user_id", as_index=False)["num_late_repayment"]
    .sum()
    .rename(columns={"num_late_repayment": "freq_late_repayments"})
)
df_users = df_users.merge(late_per_user, how="left", on="user_id")

# Due Date Compliance: ratio of repayments made on time (not after the due date) to total repayments.
# The per-user totals `num_repayments` and `num_late_repayment` stay in `df_users`; this cell does
# not drop them.
df_temp = df_loans.groupby("user_id", as_index=False).agg(
    {
        "num_repayments": lambda counts: counts.sum(),
        "num_late_repayment": lambda counts: counts.sum(),
    }
)
df_users = df_users.merge(df_temp, how="left", on="user_id")
# 1 - (ratio of late repayments to total repayments)
late_share = df_users["num_late_repayment"] / df_users["num_repayments"]
df_users["rate_due_date_compliance"] = 1 - late_share

In [None]:
# Type of Repayment preference (auto / pix) / behavior indicating whether the user prefers a platform for planning their expenses

# Ratio Repaid Total: Ratio of the total amount repaid to the total loan amount.

## ✅ checkpoint

In [None]:
# Persist the feature-engineered frames. The loans and repayments files use the same paths as the
# preprocessing checkpoint, so those two files are overwritten.
for frame, target in (
    (df_loans, "../data/processed/df_loans.pkl"),
    (df_loan_repayments, "../data/processed/df_loan_repayments.pkl"),
    (df_users, "../data/processed/df_users.pkl"),
):
    frame.to_pickle(target)