# Data Preprocessing

## Table of Contents

- Load Data
- Data Quality
    - Data Types
    - Missing Data
    - Quality Check
- Data Preprocessing
    - checkpoint

In [3]:
from copy import deepcopy

from contextlib import closing
import sqlite3

import numpy as np
import pandas as pd

# Load Data

In [5]:
with closing(sqlite3.connect("../data/raw/database.db")) as conn:
    df_loans = pd.read_sql_query("SELECT * FROM loans", conn)
    df_repayments = pd.read_sql_query("SELECT * FROM loan_repayments", conn)
    df_transactions = pd.read_sql_query("SELECT * FROM transactions", conn)

In [6]:
df_loans

Unnamed: 0,id,user_id,amount,total_amount,due_amount,due_date,status,created_at
0,0,3070,6000.0,6045.28,6459000000,2022-05-02,repaid,2022-02-01 00:47:29.575000+00:00
1,1,2546,6000.0,6045.28,6459000000,2022-05-02,repaid,2022-02-01 00:49:51.763000+00:00
2,2,2413,6000.0,6045.28,6459000000,2022-05-02,repaid,2022-02-01 01:24:40.537000+00:00
3,3,2585,6000.0,6045.28,6459000000,2022-05-02,debt_collection,2022-02-01 02:52:59.803000+00:00
4,4,2556,6000.0,6045.28,6459000000,2022-05-02,repaid,2022-02-01 02:53:07.123000+00:00
...,...,...,...,...,...,...,...,...
6741,6741,2130,2500.0,2518.87,3228820000,2023-01-01,repaid,2022-10-03 20:44:11.967000+00:00
6742,6742,549,6000.0,6045.28,7749160000,2023-01-01,repaid,2022-10-03 21:03:02.995000+00:00
6743,6743,1414,6000.0,6045.28,7749160000,2023-01-01,repaid,2022-10-03 21:33:47.964000+00:00
6744,6744,2070,6000.0,6045.28,7749160000,2023-01-01,debt_repaid,2022-10-03 22:29:03.256000+00:00


In [7]:
df_repayments

Unnamed: 0,id,loan_id,type,amount,status,created_at
0,1,2,autopilot,269.70,paid,2022-02-01 00:10:08.057000+00:00
1,2,4,autopilot,2550.00,paid,2022-02-01 00:10:08.102000+00:00
2,3,53,pix,1500.00,defaulted,2022-02-01 13:09:53.757000+00:00
3,4,22,autopilot,630.00,paid,2022-02-02 01:21:34.278000+00:00
4,5,70,autopilot,120.00,paid,2022-02-02 01:21:34.449000+00:00
...,...,...,...,...,...,...
172440,172441,4524,pix,362.75,defaulted,2023-03-21 23:36:06.093000+00:00
172441,172442,4524,pix,362.75,refunded,2023-03-22 13:15:47.100000+00:00
172442,172443,4524,autopilot,239.12,refunded,2023-03-22 16:06:19.680000+00:00
172443,172444,4524,pix,362.75,defaulted,2023-03-22 18:05:03.312000+00:00


In [8]:
df_transactions

Unnamed: 0,id,user_id,amount,status,capture_method,payment_method,installments,card_brand,created_at
0,1,2546,449.6,approved,emv,credit,1,mastercard,2021-12-02 13:41:53.548000+00:00
1,2,2546,269.8,approved,emv,credit,2,mastercard,2021-12-02 15:57:58.742000+00:00
2,3,2546,149.9,approved,contactless,debit,1,visa,2021-12-02 19:36:31.859000+00:00
3,4,2546,142.0,approved,contactless,credit,1,visa,2021-12-02 19:37:11.442000+00:00
4,5,2546,156.0,approved,emv,debit,1,mastercard,2021-12-02 20:20:26.648000+00:00
...,...,...,...,...,...,...,...,...,...
1448679,1448680,2154,3.5,approved,contactless,debit,1,mastercard,2023-01-14 20:49:15.543000+00:00
1448680,1448681,3042,100.0,approved,emv,debit,1,visa,2023-01-14 20:50:03.858000+00:00
1448681,1448682,2154,16.0,approved,emv,debit,1,visa,2023-01-14 21:13:30.542000+00:00
1448682,1448683,2418,70.0,approved,emv,credit,1,mastercard,2023-01-14 22:58:49.180000+00:00


# Data Quality

- Data Types
- Missing Data
- Quality Check

## Data Types

In [11]:
print(f"{df_loans.dtypes}\n")

df_loans["due_date"] = pd.to_datetime(df_loans["due_date"])
df_loans["created_at"] = pd.to_datetime(pd.to_datetime(df_loans["created_at"], format='ISO8601').dt.strftime('%Y-%m-%d'))
df_loans["due_amount"] = df_loans["due_amount"] / 1_000_000

print(df_loans.dtypes)

id                int64
user_id           int64
amount          float64
total_amount    float64
due_amount        int64
due_date         object
status           object
created_at       object
dtype: object

id                       int64
user_id                  int64
amount                 float64
total_amount           float64
due_amount             float64
due_date        datetime64[ns]
status                  object
created_at      datetime64[ns]
dtype: object


In [12]:
print(f"{df_repayments.dtypes}\n")

df_repayments["created_at"] = pd.to_datetime(pd.to_datetime(df_repayments["created_at"], format='ISO8601').dt.strftime('%Y-%m-%d'))

print(df_repayments.dtypes)

id              int64
loan_id         int64
type           object
amount        float64
status         object
created_at     object
dtype: object

id                     int64
loan_id                int64
type                  object
amount               float64
status                object
created_at    datetime64[ns]
dtype: object


In [13]:
print(f"{df_transactions.dtypes}\n")

df_transactions["created_at"] = pd.to_datetime(pd.to_datetime(df_transactions["created_at"], format='ISO8601').dt.strftime('%Y-%m-%d'))

print(df_transactions.dtypes)

id                  int64
user_id             int64
amount            float64
status             object
capture_method     object
payment_method     object
installments        int64
card_brand         object
created_at         object
dtype: object

id                         int64
user_id                    int64
amount                   float64
status                    object
capture_method            object
payment_method            object
installments               int64
card_brand                object
created_at        datetime64[ns]
dtype: object


## Missing Data

In [15]:
df_loans.isnull().sum()

id              0
user_id         0
amount          0
total_amount    0
due_amount      0
due_date        0
status          0
created_at      0
dtype: int64

In [16]:
df_repayments.isnull().sum()

id            0
loan_id       0
type          0
amount        0
status        0
created_at    0
dtype: int64

In [17]:
df_transactions.isnull().sum()

id                0
user_id           0
amount            0
status            0
capture_method    0
payment_method    0
installments      0
card_brand        0
created_at        0
dtype: int64

## Quality Check

**Notes:**
- table_transactions is missings information about some users who have taken a loan
    - keep and consider it as the cold start problem (unavailable purchasing behavior)
- table_repayments is missing information about some loans
    - remove these loans from table_loans

In [19]:
print(f"number of loans: {df_loans['id'].nunique():,}")
print(f"number of users: {df_loans['user_id'].nunique():,}")

number of loans: 6,746
number of users: 3,154


In [20]:
print(f"number of loans associated with repayments: {df_repayments['loan_id'].nunique():,}")

number of loans associated with repayments: 6,598


In [21]:
print(f"number of user: {df_transactions['user_id'].nunique():,}")

number of user: 3,150


# Data Preprocessing 

- **Handle Invalid Data**
    - exclude loans that are marked as `error` or `cancelled` from table_loans and table_repayments
- **Handle Inconsistencies**
    - remove loans from table_loans that don't have repayment history in table_repayments 
    - remove miscellaneous users from table_transactions (users who aren't present in table_loans)
- **Misc**
    - clean repayment status feature: `refunded` is merged with `paid` (refunded: a loan repayment that happened but was fully refunded to the user)
    - clean loan status feature: `debt_collection` and `debt_repaid` are merged together as `debt`

In [23]:
exclude = ["error", "cancelled"]
excluded_ids = set(df_loans[df_loans["status"].isin(exclude)]["id"])

# exclude loans that are marked as error or cancelled
df_loans = df_loans[~df_loans["id"].isin(excluded_ids)]
df_repayments = df_repayments[~df_repayments["loan_id"].isin(excluded_ids)]

In [24]:
# remove loans that don't have repayment history
missing_loans = set(df_loans['id']) - set(df_repayments['loan_id'])

df_loans = df_loans[~df_loans['id'].isin(missing_loans)]

In [25]:
# remove miscellaneous users
missing_users = set(df_transactions['user_id']) - set(df_loans['user_id'])

df_transactions = df_transactions[~df_transactions['user_id'].isin(missing_users)]

In [26]:
print(f"number of loans in table_loans: {df_loans['id'].nunique():,}")
print(f"number of users in table_loans: {df_loans['user_id'].nunique():,}\n")

print(f"number of loans in table_repayments: {df_repayments['loan_id'].nunique():,}\n")

print(f"number of user in table_transactions: {df_transactions['user_id'].nunique():,}")

number of loans in table_loans: 6,588
number of users in table_loans: 3,046

number of loans in table_repayments: 6,588

number of user in table_transactions: 3,043


In [27]:
# merge debt_collection and debt_repaid together as debt
df_loans["status_cleaned"] = df_loans["status"].replace({"debt_collection": "debt", "debt_repaid": "debt"})

In [28]:
# merge refunded with paid, since refunded is a repayment that happened but was fully refunded to the user
df_repayments["status_cleaned"] = df_repayments["status"].replace({"refunded": "paid"})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_repayments["status_cleaned"] = df_repayments["status"].replace({"refunded": "paid"})


## ✅ checkpoint

In [30]:
df_loans.to_pickle("../data/processed/df_loans.pkl")
df_repayments.to_pickle("../data/processed/df_repayments.pkl")
df_transactions.to_pickle("../data/processed/df_transactions.pkl")