> Data Preprocessing and Feature Engineering

In [1]:
from contextlib import closing
import sqlite3
import numpy as np
import pandas as pd

# Load Data

In [2]:
# Read the three raw tables from the SQLite file. closing() releases the
# connection when the block exits, including when a query raises.
with closing(sqlite3.connect("../data/raw/database.db")) as conn:
    df_loans, df_loan_repayments, df_transactions = [
        pd.read_sql_query(query, conn)
        for query in (
            "SELECT * FROM loans",
            "SELECT * FROM loan_repayments",
            "SELECT * FROM transactions",
        )
    ]

In [3]:
# Row count of the loans table, then a preview of its first five rows.
print(f"number of instances: {len(df_loans):,}")

df_loans.head()

number of instances: 6,746


Unnamed: 0,id,user_id,amount,total_amount,due_amount,due_date,status,created_at
0,0,3070,6000.0,6045.28,6459000000,2022-05-02,repaid,2022-02-01 00:47:29.575000+00:00
1,1,2546,6000.0,6045.28,6459000000,2022-05-02,repaid,2022-02-01 00:49:51.763000+00:00
2,2,2413,6000.0,6045.28,6459000000,2022-05-02,repaid,2022-02-01 01:24:40.537000+00:00
3,3,2585,6000.0,6045.28,6459000000,2022-05-02,debt_collection,2022-02-01 02:52:59.803000+00:00
4,4,2556,6000.0,6045.28,6459000000,2022-05-02,repaid,2022-02-01 02:53:07.123000+00:00


In [4]:
# Row count of the repayments table, then a preview of its first five rows.
print(f"number of instances: {len(df_loan_repayments.index):,}")

df_loan_repayments.head()

number of instances: 172,445


Unnamed: 0,id,loan_id,type,amount,status,created_at
0,1,2,autopilot,269.7,paid,2022-02-01 00:10:08.057000+00:00
1,2,4,autopilot,2550.0,paid,2022-02-01 00:10:08.102000+00:00
2,3,53,pix,1500.0,defaulted,2022-02-01 13:09:53.757000+00:00
3,4,22,autopilot,630.0,paid,2022-02-02 01:21:34.278000+00:00
4,5,70,autopilot,120.0,paid,2022-02-02 01:21:34.449000+00:00


In [5]:
# Row count of the transactions table, then a preview of its first five rows.
print(f"number of instances: {len(df_transactions):,}")

df_transactions.iloc[:5]

number of instances: 1,448,684


Unnamed: 0,id,user_id,amount,status,capture_method,payment_method,installments,card_brand,created_at
0,1,2546,449.6,approved,emv,credit,1,mastercard,2021-12-02 13:41:53.548000+00:00
1,2,2546,269.8,approved,emv,credit,2,mastercard,2021-12-02 15:57:58.742000+00:00
2,3,2546,149.9,approved,contactless,debit,1,visa,2021-12-02 19:36:31.859000+00:00
3,4,2546,142.0,approved,contactless,credit,1,visa,2021-12-02 19:37:11.442000+00:00
4,5,2546,156.0,approved,emv,debit,1,mastercard,2021-12-02 20:20:26.648000+00:00


# Data Exploration

- Data Types
- Missing Data
- Variable Content

## Data Types

In [6]:
# Column dtypes as loaded from SQLite (the date columns arrive as strings).
print(f"{df_loans.dtypes}\n")

# due_date is already a plain calendar date, so a direct parse is enough.
df_loans["due_date"] = pd.to_datetime(df_loans["due_date"])

# created_at is an ISO 8601 timestamp carrying a UTC offset; only its calendar
# day is kept. Parse it, drop the timezone (wall-clock time is preserved) and
# floor to midnight. This produces the same naive datetime64 values as
# formatting to '%Y-%m-%d' and re-parsing, without a string round-trip over
# every row.
df_loans["created_at"] = (
    pd.to_datetime(df_loans["created_at"], format="ISO8601")
    .dt.tz_localize(None)
    .dt.normalize()
)

# Column dtypes after the conversion.
print(df_loans.dtypes)

id                int64
user_id           int64
amount          float64
total_amount    float64
due_amount        int64
due_date         object
status           object
created_at       object
dtype: object

id                       int64
user_id                  int64
amount                 float64
total_amount           float64
due_amount               int64
due_date        datetime64[ns]
status                  object
created_at      datetime64[ns]
dtype: object


In [7]:
# Column dtypes as loaded from SQLite (created_at arrives as a string).
print(f"{df_loan_repayments.dtypes}\n")

# Keep only the calendar day of created_at: parse the ISO 8601 timestamp, drop
# the timezone (wall-clock time is preserved) and floor to midnight. Same
# result as formatting to '%Y-%m-%d' and re-parsing, but vectorised and
# without building an intermediate string per row.
df_loan_repayments["created_at"] = (
    pd.to_datetime(df_loan_repayments["created_at"], format="ISO8601")
    .dt.tz_localize(None)
    .dt.normalize()
)

# Column dtypes after the conversion.
print(df_loan_repayments.dtypes)

id              int64
loan_id         int64
type           object
amount        float64
status         object
created_at     object
dtype: object

id                     int64
loan_id                int64
type                  object
amount               float64
status                object
created_at    datetime64[ns]
dtype: object


In [8]:
# Column dtypes as loaded from SQLite (created_at arrives as a string).
print(f"{df_transactions.dtypes}\n")

# Keep only the calendar day of created_at: parse the ISO 8601 timestamp, drop
# the timezone (wall-clock time is preserved) and floor to midnight. This is
# the largest table in the notebook, so avoiding the strftime/re-parse
# round-trip matters most here; the resulting values are the same.
df_transactions["created_at"] = (
    pd.to_datetime(df_transactions["created_at"], format="ISO8601")
    .dt.tz_localize(None)
    .dt.normalize()
)

# Column dtypes after the conversion.
print(df_transactions.dtypes)

id                  int64
user_id             int64
amount            float64
status             object
capture_method     object
payment_method     object
installments        int64
card_brand         object
created_at         object
dtype: object

id                         int64
user_id                    int64
amount                   float64
status                    object
capture_method            object
payment_method            object
installments               int64
card_brand                object
created_at        datetime64[ns]
dtype: object


## Missing Data

In [9]:
# Number of missing values in each column of the loans table.
df_loans.isna().sum()

id              0
user_id         0
amount          0
total_amount    0
due_amount      0
due_date        0
status          0
created_at      0
dtype: int64

In [10]:
# Number of missing values in each column of the repayments table.
df_loan_repayments.isna().sum()

id            0
loan_id       0
type          0
amount        0
status        0
created_at    0
dtype: int64

In [11]:
# Number of missing values in each column of the transactions table.
df_transactions.isna().sum()

id                0
user_id           0
amount            0
status            0
capture_method    0
payment_method    0
installments      0
card_brand        0
created_at        0
dtype: int64

## Variable Content

In [12]:
# Distinct loans, distinct borrowers, and the calendar years in which the
# loans were created.
print(
    f"number of loans: {df_loans['id'].nunique():,}",
    f"number of users: {df_loans['user_id'].nunique():,}",
    f"loans originating (years): {df_loans['created_at'].dt.year.unique()}",
    sep="\n",
)

number of loans: 6,746
number of users: 3,154
loans originating (years): [2022]


In [13]:
# Distinct loans that have at least one repayment row, and the calendar years
# in which repayments were recorded.
print(
    f"number of loans being repayed: {df_loan_repayments['loan_id'].nunique():,}",
    f"repayments originating (years): {df_loan_repayments['created_at'].dt.year.unique()}",
    sep="\n",
)

number of loans being repayed: 6,598
repayments originating (years): [2022 2023]


In [14]:
# Distinct users with transaction history, and the calendar years covered by
# the transactions.
print(
    f"number of user: {df_transactions['user_id'].nunique():,}",
    f"transactions originating (years): {df_transactions['created_at'].dt.year.unique()}",
    sep="\n",
)

number of user: 3,150
transactions originating (years): [2021 2022 2023]


**Notes:**

- table_transactions is missing information about some users who have taken a loan
    - remove these users from table_loans and their corresponding loans in table_loan_repayments
- table_loan_repayments is missing information about some loans
    - remove these loans from table_loans
    - check whether this introduces inconsistencies with the users and, if so, repeat the previous step

In [15]:
# Distinct capture methods, in order of first appearance.
pd.unique(df_transactions["capture_method"])

array(['emv', 'contactless', 'payment_link', 'ecommerce', 'mpos',
       'payment_link_web'], dtype=object)

**Notes:**

- emv: Europay, Mastercard and Visa
- mpos: Mobile Point of Sale

In [16]:
# Distinct card brands, in order of first appearance.
df_transactions.loc[:, "card_brand"].unique()

array(['mastercard', 'visa', 'elo', 'hipercard', 'amex'], dtype=object)

**Notes:**

- Elo and Hipercard are specific to Brazil

# Data Preprocessing 

**TODO:** decide how to handle records where `status == error`.

- Handle inconsistencies
    - table_transactions is missing information about some users who have taken a loan
        - remove these users from table_loans and their corresponding loans in table_loan_repayments
    - table_loan_repayments is missing information about some loans
        - remove these loans from table_loans
        - check whether this introduces inconsistencies with the users and, if so, repeat the previous step

In [17]:
# Reconcile the three tables so that every remaining loan has transaction
# history for its borrower and at least one repayment record.

# 1) table_transactions is missing information about some users who have taken
#    a loan: drop those users' loans, and the repayments of those loans.
missing_users = set(df_loans['user_id']) - set(df_transactions['user_id'])
missing_users_loans = set(df_loans.loc[df_loans['user_id'].isin(missing_users), 'id'])

# .copy() detaches each filtered frame from the frame it was sliced from, so
# that columns added to it later are ordinary assignments rather than writes
# to a slice (which pandas without copy-on-write may flag with a
# SettingWithCopyWarning).
df_loans = df_loans[~df_loans['user_id'].isin(missing_users)].copy()
df_loan_repayments = df_loan_repayments[~df_loan_repayments['loan_id'].isin(missing_users_loans)]

# 2) table_loan_repayments is missing information about some loans: drop those
#    loans from table_loans.
missing_loans = set(df_loans['id']) - set(df_loan_repayments['loan_id'])

df_loans = df_loans[~df_loans['id'].isin(missing_loans)].copy()

# 3) Keep only repayments whose loan is still present, so the relation holds in
#    both directions (repayments for a loan id that is absent from table_loans
#    would otherwise survive the two steps above).
df_loan_repayments = df_loan_repayments[df_loan_repayments['loan_id'].isin(df_loans['id'])].copy()

# Invariants guaranteed by the steps above; they fail loudly if a later edit
# to this cell breaks the reconciliation.
assert set(df_loans['user_id']) <= set(df_transactions['user_id'])
assert set(df_loans['id']) == set(df_loan_repayments['loan_id'])

In [18]:
# Same cardinality / year summary as in the exploration section, printed again
# to show the effect of the clean-up on each table.
for line in (
    f"number of loans: {df_loans['id'].nunique():,}",
    f"number of users: {df_loans['user_id'].nunique():,}",
    f"loans originating (years): {df_loans['created_at'].dt.year.unique()}\n",
    f"number of loans being repayed: {df_loan_repayments['loan_id'].nunique():,}",
    f"repayments originating (years): {df_loan_repayments['created_at'].dt.year.unique()}\n",
    f"number of user: {df_transactions['user_id'].nunique():,}",
    f"transactions originating (years): {df_transactions['created_at'].dt.year.unique()}",
):
    print(line)

number of loans: 6,595
number of users: 3,048
loans originating (years): [2022]

number of loans being repayed: 6,595
repayments originating (years): [2022 2023]

number of user: 3,150
transactions originating (years): [2021 2022 2023]


In [19]:
# table_transactions contains information about users who aren't present in table_loans
# remove these users from table_transactions
missing_users = set(df_transactions['user_id']) - set(df_loans['user_id'])

df_transactions = df_transactions[~df_transactions['user_id'].isin(missing_users)]

In [20]:
# Final cardinality / year summary once all three tables have been reconciled.
# The trailing "\n" on two of the lines leaves a blank line between tables.
print(
    f"number of loans: {df_loans['id'].nunique():,}",
    f"number of users: {df_loans['user_id'].nunique():,}",
    f"loans originating (years): {df_loans['created_at'].dt.year.unique()}\n",
    f"number of loans being repayed: {df_loan_repayments['loan_id'].nunique():,}",
    f"repayments originating (years): {df_loan_repayments['created_at'].dt.year.unique()}\n",
    f"number of user: {df_transactions['user_id'].nunique():,}",
    f"transactions originating (years): {df_transactions['created_at'].dt.year.unique()}",
    sep="\n",
)

number of loans: 6,595
number of users: 3,048
loans originating (years): [2022]

number of loans being repayed: 6,595
repayments originating (years): [2022 2023]

number of user: 3,048
transactions originating (years): [2021 2022 2023]


## ✅ checkpoint

In [21]:
# Persist the reconciled tables so later steps can resume from this point.
for frame, target in (
    (df_loans, "../data/processed/df_loans.pkl"),
    (df_loan_repayments, "../data/processed/df_loan_repayments.pkl"),
    (df_transactions, "../data/processed/df_transactions.pkl"),
):
    frame.to_pickle(target)

# Feature Engineering

TODO: interest rates (df_loans)

- loan term: length of time over which a loan is to be repaid (in days)
- loan fees: charges incurred when the loan was taken out

In [22]:
# Loan term: whole days between the loan's creation date and its due date.
# Hypothesis: longer terms might correlate with a higher risk of default.
df_loans["loan_term"] = df_loans["due_date"].sub(df_loans["created_at"]).dt.days

In [23]:
# Loan fees: what is owed in total minus the (net) amount that was lent.
# Hypothesis: higher fees might correlate with a higher risk of default.
df_loans["loan_fees"] = df_loans["total_amount"].sub(df_loans["amount"])

## ✅ checkpoint