In [42]:
import numpy as np
import polars as pl
import pandas as pd

# Data Preparation Notebook

This notebook loads the data, performs feature selection and engineering, and joins the tables. The end result is a Train/Val/Test split, to be used for any model training.

# Data Exploration

In [38]:
# From the competition's starter notebook; preserved for reference
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input/home-credit-credit-risk-model-stability'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [10]:
# Root directory of the Kaggle input dataset. File paths in this notebook are
# built as dataPath + "<relative path>", so the trailing slash is required.
dataPath = "/kaggle/input/home-credit-credit-risk-model-stability/"

A couple of notes on data interpretation:

Where predictors were transformed, the kind of transformation is indicated by a capital letter appended to the predictor name:
* P - Transform DPD (Days past due)
* M - Masking categories
* A - Transform amount
* D - Transform date
* T - Unspecified Transform
* L - Unspecified Transform

On depths: the depth of a table is the number of num_group# columns used to index it. Each case_id appears at most once for each unique combination of indices, although it may not have an entry for every combination. The indexing is not necessarily chronological either; dates where num_group1 == 2 may be earlier than dates where num_group1 == 0. It may be useful to pull summary information for each case_id, e.g. min, max, median, fraction_empty.

In [40]:
# Exploration only: feature_definitions.csv maps each feature name to a short description.
feature_definitions = pl.read_csv(dataPath + "feature_definitions.csv")
# Preview the first five rows.
print(feature_definitions.head(n=5))

shape: (5, 2)
┌─────────────────────────┬───────────────────────────────────┐
│ Variable                ┆ Description                       │
│ ---                     ┆ ---                               │
│ str                     ┆ str                               │
╞═════════════════════════╪═══════════════════════════════════╡
│ actualdpd_943P          ┆ Days Past Due (DPD) of previous … │
│ actualdpdtolerance_344P ┆ DPD of client with tolerance.     │
│ addres_district_368M    ┆ District of the person's address… │
│ addres_role_871L        ┆ Role of person's address.         │
│ addres_zip_823M         ┆ Zip code of the address.          │
└─────────────────────────┴───────────────────────────────────┘


# Load Data

In [9]:
# helper function from their starter notebook
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    """Cast columns to their desired dtypes based on the column-name suffix.

    The last letter of a column name determines its type. Currently only the
    ``P`` and ``A`` suffixes are handled (cast to ``pl.Float64``); this is just
    an example, so extend the suffix handling here for any further dtypes.

    Args:
        df: Table whose columns should be cast.

    Returns:
        A frame in which every column whose name ends in ``P`` or ``A`` is
        ``Float64``; ``df`` itself is returned when no column name matches.
    """
    # str.endswith is safe for an empty column name, whereas col[-1] would raise.
    float_cols = [col for col in df.columns if col.endswith(("P", "A"))]
    if not float_cols:
        return df
    # Cast all matching columns in one with_columns call instead of one call
    # per column.
    return df.with_columns([pl.col(col).cast(pl.Float64) for col in float_cols])

## Dataset descriptions
Note that each train file has a corresponding test file (a table may be partitioned across several files if it is too large).
### DEPTH=0
* train_base: links case_id to WEEK_NUM and target
* train_static: contains transaction history for each case_id (late payments, total debt, etc)
* train_static_cb: data from an external source: demographic data, risk assessment, number of credit checks


### DEPTH=1
* train_person_1: contains internal demographic information: zip code, marital status, gender, etc. (all hashed)

### DEPTH=2
* train_credit_bureau_b_2: historical data from an external source, num and value of overdue payments

In [10]:
# Train tables. The static table is read from two part files that are stacked
# vertically; every table except the base table goes through set_table_dtypes.
train_base = pl.read_csv(dataPath + "csv_files/train/train_base.csv")
train_static = pl.concat(
    [
        set_table_dtypes(pl.read_csv(dataPath + part_file))
        for part_file in (
            "csv_files/train/train_static_0_0.csv",
            "csv_files/train/train_static_0_1.csv",
        )
    ],
    how="vertical_relaxed",
)
train_static_cb = set_table_dtypes(
    pl.read_csv(dataPath + "csv_files/train/train_static_cb_0.csv")
)
train_person_1 = set_table_dtypes(
    pl.read_csv(dataPath + "csv_files/train/train_person_1.csv")
)
train_credit_bureau_b_2 = set_table_dtypes(
    pl.read_csv(dataPath + "csv_files/train/train_credit_bureau_b_2.csv")
)

In [41]:
# Test tables. The static table is read from three part files that are stacked
# vertically; every table except the base table goes through set_table_dtypes.
test_basetable = pl.read_csv(dataPath + "csv_files/test/test_base.csv")
test_static = pl.concat(
    [
        set_table_dtypes(pl.read_csv(dataPath + part_file))
        for part_file in (
            "csv_files/test/test_static_0_0.csv",
            "csv_files/test/test_static_0_1.csv",
            "csv_files/test/test_static_0_2.csv",
        )
    ],
    how="vertical_relaxed",
)
test_static_cb = set_table_dtypes(
    pl.read_csv(dataPath + "csv_files/test/test_static_cb_0.csv")
)
test_person_1 = set_table_dtypes(
    pl.read_csv(dataPath + "csv_files/test/test_person_1.csv")
)
test_credit_bureau_b_2 = set_table_dtypes(
    pl.read_csv(dataPath + "csv_files/test/test_credit_bureau_b_2.csv")
)

case_id,date_decision,MONTH,WEEK_NUM
i64,str,i64,i64
57543,"""2021-05-14""",202201,100
57549,"""2022-01-17""",202201,100
57551,"""2020-11-27""",202201,100
57552,"""2020-11-27""",202201,100
57569,"""2021-12-20""",202201,100


## Feature Engineering

In [None]:
# helper function from their starter notebook

def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Convert string-like columns to ordered categoricals with an "Unknown" level.

    Every column whose dtype name is ``object``, ``string`` or ``str`` is cast
    to an ordered ``category`` dtype. Its categories are the values observed
    in the column followed by a single trailing ``"Unknown"`` category. Other
    columns are left untouched.

    Args:
        df: Frame to convert. It is modified in place.

    Returns:
        The same ``df`` object, for convenient chaining.
    """
    for col in df.columns:
        # "str" is the name of the default string dtype in pandas 3.
        if df[col].dtype.name not in ("object", "string", "str"):
            continue
        as_category = df[col].astype("string").astype("category")
        # Drop a pre-existing "Unknown" level before appending it: duplicate
        # categories make CategoricalDtype raise ValueError, and this keeps
        # "Unknown" as the last level of the ordering.
        observed = [cat for cat in as_category.cat.categories if cat != "Unknown"]
        new_dtype = pd.CategoricalDtype(categories=observed + ["Unknown"], ordered=True)
        df[col] = as_category.astype(new_dtype)
    return df

In [None]:
# transform train data

In [None]:
# transform test data