In [1]:
import polars as pl
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.metrics import roc_auc_score 
import warnings
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
dataPath = "/kaggle/input/home-credit-credit-risk-model-stability/"

In [2]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    # implement here all desired dtypes for tables
    # the following is just an example
    for col in df.columns:
        # last letter of column name will help you determine the type
        if col[-1] in ("P", "A"):
                df = df.with_columns(pl.col(col).cast(pl.Float64))
    return df

def convert_strings(df: pd.DataFrame) -> pd.DataFrame: 
    for col in df.columns:  
        if df[col].dtype.name in ['object', 'string']:
            df[col] = df[col].astype("string").astype('category')
            current_categories = df[col].cat.categories
            new_categories = current_categories.to_list() + ["Unknown"]
            new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
            df[col] = df[col].astype(new_dtype)
    return df

#https://www.kaggle.com/code/darynarr/home-credit-drop-date-features/notebook
def reduce_memory_usage_pl(df):
        """ Reduce memory usage by polars dataframe {df} with name {name} by changing its data types.
            Original pandas version of this function: https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65 """
        print(f"Memory usage of dataframe is {round(df.estimated_size('mb'), 2)} MB")
        Numeric_Int_types = [pl.Int8,pl.Int16,pl.Int32,pl.Int64]
        Numeric_Float_types = [pl.Float32,pl.Float64]    
        for col in df.columns:
            try:
                col_type = df[col].dtype
                if col_type == pl.Categorical:
                    continue
                c_min = df[col].min()
                c_max = df[col].max()
                if col_type in Numeric_Int_types:
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df = df.with_columns(df[col].cast(pl.Int32))
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df = df.with_columns(df[col].cast(pl.Int16))
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df = df.with_columns(df[col].cast(pl.Int32))
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df = df.with_columns(df[col].cast(pl.Int64))
                elif col_type in Numeric_Float_types:
                    if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df = df.with_columns(df[col].cast(pl.Float32))
                    else:
                        pass
                # elif col_type == pl.Utf8:
                #     df = df.with_columns(df[col].cast(pl.Categorical))
                else:
                    pass
            except:
                pass
        print(f"Memory usage of dataframe became {round(df.estimated_size('mb'), 2)} MB")
        return df

In [3]:
train_basetable = pl.read_csv(dataPath+'csv_files/train/train_base.csv').pipe(set_table_dtypes)

train_static = pl.concat(
[pl.read_csv(dataPath + "csv_files/train/train_static_0_0.csv").pipe(set_table_dtypes),
pl.read_csv(dataPath + "csv_files/train/train_static_0_1.csv").pipe(set_table_dtypes)
],how = 'vertical_relaxed',
)

train_applprev = pl.concat(
[pl.read_csv(dataPath + "csv_files/train/train_applprev_1_0.csv").pipe(set_table_dtypes),
pl.read_csv(dataPath + "csv_files/train/train_applprev_1_1.csv").pipe(set_table_dtypes)   
],how = 'vertical_relaxed',
)

train_static_cb = pl.read_csv(dataPath + "csv_files/train/train_static_cb_0.csv").pipe(set_table_dtypes)
train_person_1 = pl.read_csv(dataPath + "csv_files/train/train_person_1.csv").pipe(set_table_dtypes)
train_credit_bureau_a_2_5 = pl.read_csv(dataPath + "csv_files/train/train_credit_bureau_a_2_5.csv").pipe(set_table_dtypes)
train_deposit_1 = pl.read_csv(dataPath + "csv_files/train/train_deposit_1.csv").pipe(set_table_dtypes)


In [4]:
test_basetable = pl.read_csv(dataPath+'csv_files/test/test_base.csv').pipe(set_table_dtypes)

test_static = pl.concat(
[pl.read_csv(dataPath + "csv_files/test/test_static_0_0.csv").pipe(set_table_dtypes),
pl.read_csv(dataPath + "csv_files/test/test_static_0_1.csv").pipe(set_table_dtypes)
],how = 'vertical_relaxed',
)
test_applprev = pl.concat(
[       pl.read_csv(dataPath + "csv_files/test/test_applprev_1_0.csv").pipe(set_table_dtypes),
        pl.read_csv(dataPath + "csv_files/test/test_applprev_1_1.csv").pipe(set_table_dtypes),    
],how = 'vertical_relaxed',
)

test_static_cb = pl.read_csv(dataPath + "csv_files/test/test_static_cb_0.csv").pipe(set_table_dtypes)
test_person_1 = pl.read_csv(dataPath + "csv_files/test/test_person_1.csv").pipe(set_table_dtypes)
test_credit_bureau_a_2_5 = pl.read_csv(dataPath + "csv_files/test/test_credit_bureau_a_2_5.csv").pipe(set_table_dtypes)
test_deposit_1 = pl.read_csv(dataPath + "csv_files/test/test_deposit_1.csv").pipe(set_table_dtypes)

In [5]:
class FeatureEngineer:
    def __init__(self, train_basetable, train_static, train_static_cb,
                 train_person_1, train_credit_bureau_a_2_5,train_deposit_1,train_applprev):
        
        self.train_basetable = train_basetable
        self.train_static = train_static
        self.train_static_cb = train_static_cb
        self.train_person_1 = train_person_1
        self.train_credit_bureau_a_2_5 = train_credit_bureau_a_2_5
        self.train_deposit_1 = train_deposit_1
        self.train_applprev = train_applprev
        
        
    def generate_person_features(self):
        # Aggregating features from train_person_1
        train_person_1_feats_1 = self.train_person_1.group_by("case_id").agg(
            pl.col("mainoccupationinc_384A").max().alias("mainoccupationinc_384A_maxA"),
            (pl.col("incometype_1044T") == "SELFEMPLOYED").max().alias("mainoccupationinc_384A_any_selfemployedA"),
            pl.col("childnum_185L").sum().alias("total_childrenL").cast(pl.Int32)
        )
        
        # Filtering and selecting features from train_person_1
        train_person_1_feats_2 = self.train_person_1.select(["case_id", "num_group1", "housetype_905L"]).filter(
            pl.col("num_group1") == 0).drop("num_group1").rename({"housetype_905L": "person_housetypeL"})
        
        
        return train_person_1_feats_1, train_person_1_feats_2
    
    
    def deposit_frequency(self):
        deposit_frequency = self.train_deposit_1.group_by("case_id").agg(
        pl.count("amount_416A").alias("deposit_countL")
    )   
        return deposit_frequency
    
    
    def generate_applprev(self):
         # Total Payment Amount
            total_payment_amount = self.train_applprev.group_by("case_id").agg(
                pl.col("pmtnum_8L").sum().alias("total_payment_amountL"))
            
            # average Annuity
            avg_annuity = self.train_applprev.group_by("case_id").agg(
                pl.col("annuity_853A").mean().alias("avg_annuityA"))

            
            #payment_rate = self.train_applprev.with_columns(
                #(pl.col("annuity_853A")/pl.col("credamount_590A")).alias("payment_rateL"))

            return total_payment_amount,avg_annuity #payment_rate

           
    
    def generate_static_columns(self, df):
        selected_static_cols = [col for col in df.columns if col[-1] in ("A", "M")]
        return df.select(["case_id"] + selected_static_cols) 
     
 
        # Generating features
    def join_tables(self):
        person_feats_1, person_feats_2 = self.generate_person_features()
        deposit_frequency = self.deposit_frequency()
        total_payment_amount,avg_annuity = self.generate_applprev()
         
        # Selecting static columns
        selected_static = self.generate_static_columns(self.train_static)
        selected_static_cb = self.generate_static_columns(self.train_static_cb)
        
        #joining data
        data = self.train_basetable.join(selected_static, how="left", on="case_id") \
                                   .join(selected_static_cb, how="left", on="case_id") \
                                   .join(person_feats_1, how="left", on="case_id") \
                                   .join(person_feats_2, how="left", on="case_id") \
                                   .join(deposit_frequency, how="left", on="case_id") \
                                   .join(total_payment_amount, how="left", on="case_id") \
                                   .join(avg_annuity, how="left", on="case_id") 

                                   #.join(payment_rate, how="left", on="case_id") 

        data = data.with_columns(
            pl.col(pl.Float64).cast(pl.Float32)
        )
        
        
        return data
    
            
       # Generate and join features for the test set
    def process_test_set(self, test_basetable, test_static, test_static_cb, test_person_1, test_credit_bureau_a_2_5, test_deposit_1,test_applprev):
        fe_test = FeatureEngineer(test_basetable, test_static, test_static_cb, test_person_1, test_credit_bureau_a_2_5, test_deposit_1,test_applprev)
        
        test_data = fe_test.join_tables()


        return test_data
    

fe = FeatureEngineer(train_basetable, train_static, train_static_cb, train_person_1,
                     train_credit_bureau_a_2_5,train_deposit_1,train_applprev)

data = fe.join_tables()
data = reduce_memory_usage_pl(data)


test_data = fe.process_test_set(test_basetable, test_static, test_static_cb,
                                test_person_1, test_credit_bureau_a_2_5, test_deposit_1,test_applprev)

test_data = reduce_memory_usage_pl(test_data)

Memory usage of dataframe is 635.99 MB
Memory usage of dataframe became 612.69 MB
Memory usage of dataframe is 0.0 MB
Memory usage of dataframe became 0.0 MB


In [6]:
case_ids = data["case_id"].unique().shuffle(seed=1)
case_ids_train, case_ids_test = train_test_split(case_ids, train_size=0.6, random_state=42)
case_ids_valid, case_ids_test = train_test_split(case_ids_test, train_size=0.5, random_state=42)

cols_pred = []
for col in data.columns:
    if col[-1].isupper() and col[:-1].islower():
        cols_pred.append(col)

        
def from_polars_to_pandas(case_ids: pl.DataFrame) -> pl.DataFrame:
    return (
        data.filter(pl.col("case_id").is_in(case_ids))[["case_id", "WEEK_NUM", "target"]].to_pandas(),
        data.filter(pl.col("case_id").is_in(case_ids))[cols_pred].to_pandas(),
        data.filter(pl.col("case_id").is_in(case_ids))["target"].to_pandas()
    )

base_train, X_train, y_train = from_polars_to_pandas(case_ids_train)
base_valid, X_valid, y_valid = from_polars_to_pandas(case_ids_valid)
base_test, X_test, y_test = from_polars_to_pandas(case_ids_test)

for df in [X_train, X_valid, X_test]:
    df = convert_strings(df)

In [7]:
print(f"X_train:{X_train.shape}")
print(f"Valid: {X_valid.shape}")
print(f"Test: {X_test.shape}")

X_train:(915995, 53)
Valid: (305332, 53)
Test: (305332, 53)


In [8]:
import lightgbm as lgb
from sklearn.model_selection import KFold


params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 10,
    "learning_rate": 0.05,
    "max_bin": 255,
    "n_estimators": 1200,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "verbose": -1,
    "random_state": 42,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees":True,
    'num_leaves':64,
    #"device": "gpu"
}

# KFold object with 5 folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

#store evaluation results
eval_results = []

# Perform 5 fold with cv
for train_index, valid_index in kf.split(X_train):
    X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[valid_index]
    
    lgb_train = lgb.Dataset(X_train_fold, label=y_train_fold)
    lgb_valid = lgb.Dataset(X_valid_fold, label=y_valid_fold, reference=lgb_train)

    gbm = lgb.train(params, lgb_train, valid_sets=[lgb_valid],callbacks=[lgb.log_evaluation(50), lgb.early_stopping(50)])
    
    eval_results.append(gbm.best_score)



Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.737776
[100]	valid_0's auc: 0.748611
[150]	valid_0's auc: 0.753637
[200]	valid_0's auc: 0.756102
[250]	valid_0's auc: 0.758019
[300]	valid_0's auc: 0.759338
[350]	valid_0's auc: 0.759993
[400]	valid_0's auc: 0.760346
[450]	valid_0's auc: 0.760841
[500]	valid_0's auc: 0.76118
[550]	valid_0's auc: 0.761434
[600]	valid_0's auc: 0.761474
Early stopping, best iteration is:
[573]	valid_0's auc: 0.761503




Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.736981
[100]	valid_0's auc: 0.748429
[150]	valid_0's auc: 0.75393
[200]	valid_0's auc: 0.757027
[250]	valid_0's auc: 0.759337
[300]	valid_0's auc: 0.760675
[350]	valid_0's auc: 0.761622
[400]	valid_0's auc: 0.762312
[450]	valid_0's auc: 0.762982
[500]	valid_0's auc: 0.76314
[550]	valid_0's auc: 0.763433
[600]	valid_0's auc: 0.763698
[650]	valid_0's auc: 0.763814
[700]	valid_0's auc: 0.763827
Early stopping, best iteration is:
[663]	valid_0's auc: 0.763884




Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.732508
[100]	valid_0's auc: 0.743228
[150]	valid_0's auc: 0.747691
[200]	valid_0's auc: 0.750613
[250]	valid_0's auc: 0.752253
[300]	valid_0's auc: 0.753435
[350]	valid_0's auc: 0.754477
[400]	valid_0's auc: 0.75514
[450]	valid_0's auc: 0.755585
[500]	valid_0's auc: 0.755731
[550]	valid_0's auc: 0.755882
[600]	valid_0's auc: 0.756073
[650]	valid_0's auc: 0.756326
[700]	valid_0's auc: 0.756305
Early stopping, best iteration is:
[660]	valid_0's auc: 0.756381




Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.737498
[100]	valid_0's auc: 0.748241
[150]	valid_0's auc: 0.753545
[200]	valid_0's auc: 0.757047
[250]	valid_0's auc: 0.759161
[300]	valid_0's auc: 0.760438
[350]	valid_0's auc: 0.761347
[400]	valid_0's auc: 0.761771
[450]	valid_0's auc: 0.762047
[500]	valid_0's auc: 0.762361
[550]	valid_0's auc: 0.762641
[600]	valid_0's auc: 0.762861
[650]	valid_0's auc: 0.763077
[700]	valid_0's auc: 0.763215
[750]	valid_0's auc: 0.763333
[800]	valid_0's auc: 0.763183
Early stopping, best iteration is:
[754]	valid_0's auc: 0.763365




Training until validation scores don't improve for 50 rounds
[50]	valid_0's auc: 0.729163
[100]	valid_0's auc: 0.740269
[150]	valid_0's auc: 0.746156
[200]	valid_0's auc: 0.74964
[250]	valid_0's auc: 0.751896
[300]	valid_0's auc: 0.75338
[350]	valid_0's auc: 0.754515
[400]	valid_0's auc: 0.755227
[450]	valid_0's auc: 0.755577
[500]	valid_0's auc: 0.755918
[550]	valid_0's auc: 0.756238
[600]	valid_0's auc: 0.756498
[650]	valid_0's auc: 0.756683
[700]	valid_0's auc: 0.756766
[750]	valid_0's auc: 0.756932
[800]	valid_0's auc: 0.756918
[850]	valid_0's auc: 0.757057
[900]	valid_0's auc: 0.757151
[950]	valid_0's auc: 0.757276
[1000]	valid_0's auc: 0.757214
Early stopping, best iteration is:
[954]	valid_0's auc: 0.757294


In [9]:
X_submission = test_data[cols_pred].to_pandas()
X_submission = convert_strings(X_submission)
categorical_cols = X_train.select_dtypes(include=['category']).columns

for col in categorical_cols:
    train_categories = set(X_train[col].cat.categories)
    submission_categories = set(X_submission[col].cat.categories)
    new_categories = submission_categories - train_categories
    X_submission.loc[X_submission[col].isin(new_categories), col] = "Unknown"
    new_dtype = pd.CategoricalDtype(categories=train_categories, ordered=True)
    X_train[col] = X_train[col].astype(new_dtype)
    X_submission[col] = X_submission[col].astype(new_dtype)
    
y_submission_pred = gbm.predict(X_submission, num_iteration=gbm.best_iteration)

In [10]:
submission = pd.DataFrame({
    "case_id":test_data["case_id"].to_numpy(),
    "score": y_submission_pred
}).set_index('case_id')
submission.to_csv("./submission.csv")