# Home Credit Default Risk

Outline:
* Load the data
* Join tables with Polars, a DataFrame library implemented in Rust that is very fast and memory-efficient.
* Create features
* Train models
* Create a submission table

## Load the data
Data loading and polars code credit: https://www.kaggle.com/code/jetakow/home-credit-2024-starter-notebook

In [1]:
import polars as pl
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score 

# Relative directory for data files.
# NOTE(review): the read_csv call further down uses an absolute path instead
# of this variable — confirm which location is intended.
dataPath = "./data/"

In [2]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    """Cast every column whose name ends in ``P`` or ``A`` to ``Float64``.

    The last letter of a column name is used as a type tag. Only the ``P``
    and ``A`` tags are handled here; all other columns are left untouched.

    Args:
        df: Table whose columns should be normalised.

    Returns:
        A frame in which the tagged columns are ``pl.Float64``. When no
        column carries one of the tags, ``df`` itself is returned.
    """
    # implement here all desired dtypes for tables
    # the following is just an example
    # ``endswith`` with a tuple checks the suffix without indexing, so an
    # empty column name is skipped instead of raising IndexError.
    float_cols = [col for col in df.columns if col.endswith(("P", "A"))]
    if not float_cols:
        return df

    # Submit all casts in one with_columns call rather than rebuilding the
    # frame once per matching column.
    return df.with_columns([pl.col(col).cast(pl.Float64) for col in float_cols])

In [3]:
def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    """Convert text columns to ordered categoricals with an ``"Unknown"`` level.

    Every column whose dtype name is ``object`` or ``string`` is cast to the
    pandas ``string`` dtype and then to an ordered categorical. Its categories
    are the observed values, with ``"Unknown"`` appended when it is not
    already one of them. Columns of any other dtype are left as they are.

    Args:
        df: Frame to convert. It is modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    for col in df.columns:
        if df[col].dtype.name not in ("object", "string"):
            continue

        as_category = df[col].astype("string").astype("category")
        new_categories = as_category.cat.categories.to_list()
        # A column that already holds the literal "Unknown" must not get it a
        # second time: CategoricalDtype rejects duplicate categories.
        if "Unknown" not in new_categories:
            new_categories.append("Unknown")

        new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
        df[col] = as_category.astype(new_dtype)
    return df

## Data information
* basetable:
* static:
* static_cb:
 

In [4]:
# from  vectorization import *
#from data_reduction import *
# Load master_data_file.csv into a pandas DataFrame named `compiled`.
# NOTE(review): this is an absolute, machine-specific path and `dataPath`
# defined earlier is not used — confirm before running elsewhere.
compiled = pd.read_csv("W:/Erdos/Project/home_credit/data/csv_files/master_data_file.csv")

In [5]:
# vectorize_dataframe(train_static_cb.to_pandas())
# vectorize(train_static_cb.to_pandas())
# vectorize_dataframe_for_nn(train_static_cb.to_pandas())


In [6]:
# type(train_static_cb), type(train_static_cb.columns)

In [7]:
# print(train_static.dtypes.unique())


In [8]:
# Peek at the first 20 column names and the distinct dtypes present in the frame.
compiled.columns[:20], compiled.dtypes.unique()
# compiled.head

(Index(['case_id', 'date_decision', 'MONTH', 'WEEK_NUM', 'target',
        'actualdpd_943P', 'actualdpdtolerance_344P', 'addres_district_368M',
        'addres_role_871L', 'addres_zip_823M', 'amount_1115A', 'amount_416A',
        'amount_4527230A', 'amount_4917619A', 'amtdebitincoming_4809443A',
        'amtdebitoutgoing_4809440A', 'amtdepositbalance_4809441A',
        'amtdepositincoming_4809444A', 'amtdepositoutgoing_4809442A',
        'amtinstpaidbefduel24m_4187115A'],
       dtype='object'),
 array([dtype('int64'), dtype('O'), dtype('bool')], dtype=object))

In [9]:
# Display the loaded frame (the notebook renders a truncated preview).
compiled

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,actualdpd_943P,actualdpdtolerance_344P,addres_district_368M,addres_role_871L,addres_zip_823M,...,totaldebtoverduevalue_178A,totaldebtoverduevalue_718A,totaloutstanddebtvalue_39A,totaloutstanddebtvalue_668A,totalsettled_863A,totinstallast1m_4525188A,twobodfilling_608L,type_25L,typesuite_864L,validfrom_1069D
0,0,2019-01-03,201901,0,0,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,1,-1,0,1,-1,-1
1,1,2019-01-03,201901,0,0,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,1,-1,0,1,-1,-1
2,2,2019-01-04,201901,0,0,3,-1,-1,-1,-1,...,-1,-1,-1,-1,1,-1,0,0,0,-1
3,3,2019-01-03,201901,0,0,3,-1,-1,-1,-1,...,-1,-1,-1,-1,1,-1,0,1,0,-1
4,4,2019-01-04,201901,0,1,3,-1,-1,-1,-1,...,-1,-1,-1,-1,1,-1,0,1,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526654,2703450,2020-10-05,202010,91,0,3,3,0,-1,0,...,-1,-1,-1,-1,3,3,0,0,-1,-1
1526655,2703451,2020-10-05,202010,91,0,3,3,0,-1,0,...,-1,-1,-1,-1,3,3,0,0,-1,-1
1526656,2703452,2020-10-05,202010,91,0,3,3,0,-1,0,...,-1,-1,-1,-1,1,-1,1,0,-1,-1
1526657,2703453,2020-10-05,202010,91,0,3,3,0,-1,0,...,-1,-1,-1,-1,3,1,1,0,-1,-1


In [10]:
# vectorize_dataframe(compiled)

In [11]:
from src.classification import *

In [12]:
# labels = np.random.randint(2, size=(num_rows, 1))  # Generates 0 or 1
# print(labels.shape)
# # Create DataFrame
# df = pd.DataFrame(data, columns=['a', 'b', 'c', 'd'])
# df['e'] = labels
# print(df)

# Split the frame column-wise with one boolean mask: every column except
# "target" goes to X, and "target" is kept as a one-column DataFrame in y.
is_target = compiled.columns == "target"
X = compiled.loc[:, ~is_target]
y = compiled.loc[:, is_target]
# print(X.columns, y.columns)

# data_split / train_SimpleNN are not defined in this notebook; presumably
# they come from the `src.classification` star import — verify.
train_loader, test_loader = data_split(X, y)

print("training..")
train_SimpleNN(train_loader, test_loader)
print("DONE")

training..
GPU is available. Using the GPU...
Train_features: 466
Epoch 1/100, Training Loss: 0.1250, Validation Loss: 0.1223 finished in 83.88 seconds
Epoch 2/100, Training Loss: 0.1219, Validation Loss: 0.1219 finished in 168.77 seconds
Epoch 3/100, Training Loss: 0.1210, Validation Loss: 0.1206 finished in 254.04 seconds
Epoch 4/100, Training Loss: 0.1203, Validation Loss: 0.1208 finished in 337.79 seconds
Epoch 5/100, Training Loss: 0.1197, Validation Loss: 0.1209 finished in 422.81 seconds
Epoch 6/100, Training Loss: 0.1193, Validation Loss: 0.1203 finished in 507.64 seconds
Epoch 7/100, Training Loss: 0.1189, Validation Loss: 0.1203 finished in 591.83 seconds
Epoch 8/100, Training Loss: 0.1186, Validation Loss: 0.1200 finished in 676.67 seconds
Epoch 9/100, Training Loss: 0.1181, Validation Loss: 0.1201 finished in 761.97 seconds
Epoch 10/100, Training Loss: 0.1179, Validation Loss: 0.1200 finished in 839.54 seconds
Epoch 11/100, Training Loss: 0.1174, Validation Loss: 0.1205 fin

In [13]:
# Materialise the whole frame as a single NumPy array. The frame mixes
# int64, object and bool columns, so the result is an object-dtype array.
test = compiled.values


In [14]:
# Boolean-mask selection of every cell equal to the string 'case_id';
# the recorded output is empty, i.e. no cell holds that value.
test[test=='case_id']

array([], dtype=object)