In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

## Load data

In [2]:
# Load the raw dataset; the path is relative, so it resolves against the
# notebook's working directory.
csv_path = "../data/UCI_Credit_Card.csv"
df = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows to sanity-check the columns and their values.
df.head(n=5)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [4]:
# Column roles are positional: the first column is skipped (it is `ID` in the
# preview above), the last column is the label, and everything in between is
# used as a model input.
column_names = df.columns.tolist()
target = column_names[-1]
features = column_names[1:-1]

## Train/test split

In [5]:
# Hold out 30% of the rows for evaluation. The fixed seed makes the split
# reproducible across kernel restarts.
df_train, df_test = train_test_split(
    df,
    test_size=0.3,
    random_state=23,
)

## Train

In [6]:
# Fit a LightGBM classifier with library-default hyperparameters; only the
# seed is pinned so that repeated runs give the same model.
X_train, y_train = df_train[features], df_train[target]
clf = LGBMClassifier(random_state=23)
clf.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 4648, number of negative: 16352
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 21000, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221333 -> initscore=-1.257913
[LightGBM] [Info] Start training from score -1.257913


## Evaluate

In [7]:
def _gini(model, frame):
    """Return the Gini coefficient (2 * ROC AUC - 1) of `model` on `frame`.

    Scores are taken from the second column of `predict_proba`, and the true
    labels from the `target` column of `frame`.
    """
    scores = model.predict_proba(frame[features])[:, 1]
    auc = roc_auc_score(frame[target], scores)
    return 2 * auc - 1


gini_train = _gini(clf, df_train)
gini_test = _gini(clf, df_test)

In [8]:
# Report both scores together; a train score well above the test score is a
# sign the model may be overfitting.
print(
    f"Gini train: {gini_train:.3f}",
    f"Gini test:  {gini_test:.3f}",
    sep="\n",
)

Gini train: 0.773
Gini test:  0.556
