## Preprocessing

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

import optuna.integration.lightgbm as lgb

In [18]:
train = pd.read_csv("C:/Users/daisu/OneDrive/Desktop/GCI/Input/train.csv")
test = pd.read_csv("C:/Users/daisu/OneDrive/Desktop/GCI/Input/test.csv")

In [19]:
print(f"Index numbers: {train.shape[0]}, Column numbers: {train.shape[1]}")
print(f"Index numbers: {test.shape[0]}, Column numbers: {train.shape[1]}")

Index numbers: 171202, Column numbers: 51
Index numbers: 61500, Column numbers: 51


In [20]:
# Code binary value 

for df in [train, test]:
    df.replace({'CODE_GENDER': {'M': 0, 'F': 1}}, inplace=True)
    df.replace({'FLAG_OWN_CAR': {'N': 0, 'Y': 1}}, inplace=True)
    df.replace({'FLAG_OWN_REALTY': {'N': 0, 'Y': 1}}, inplace=True)
    df.replace({'NAME_CONTRACT_TYPE': {'Cash loans': 0, 'Revolving loans': 1}}, inplace=True)
    
    # Replace XNA by NaN
    df.replace('XNA', np.nan, inplace=True)
    
    # Replace outliers by NaN
    df.replace({'DAYS_EMPLOYED': {365243: np.nan}}, inplace=True)
    df.replace({'DAYS_LAST_PHONE_CHANGE': {0: np.nan}}, inplace=True)

train.replace({'OWN_CAR_AGE': {64: np.nan, 65: np.nan, 91: np.nan}}, inplace=True)
test.replace({'OWN_CAR_AGE': {64: np.nan, 69: np.nan, 91: np.nan}}, inplace=True)

In [21]:
# Create features

for df in [train, test]:
    # YEARS_AMT
    df['YEARS_AMT'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
    # AMT_INCOME_RATE
    df['AMT_INCOME_RATE'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    # GOODS_CREDIT_RATE
    df['GOODS_CREDIT_RATE'] = df['AMT_GOODS_PRICE'] / df['AMT_CREDIT']

In [22]:
# Drop unused features
for df in [train, test]:
    df.drop(["FLAG_MOBIL", "FLAG_EMP_PHONE", "FLAG_CONT_MOBILE"], axis=1, inplace=True)

In [23]:
#One-hot-Encoding
df_all = pd.concat([train, test])
df_ohe = pd.get_dummies(df_all)

train = df_ohe[:171202]
test = df_ohe[171202:]

In [24]:
X = train.iloc[:, 2:].values
y = train.iloc[:, 1].values
test_X = test.iloc[:, 2:].values

## Train Models

In [29]:
from xgboost import XGBClassifier

xgb1 = XGBClassifier(max_depth=5,
                    min_child_weight=6,
                    subsample=0.9, 
                    colsample_bytree=0.5, 
                    learning_rate=0.02943370997880096,
                    n_estimators=941,
                    tree_method='gpu_hist',
                    random_state=0)

from lightgbm import LGBMClassifier

lgb1 = LGBMClassifier(random_state=0,
                     objective='binary', 
                     metric='auc',
                     boosting_type='gbdt',
                     feature_pre_filter=False, 
                     lambda_l1=0.7050822301496653, 
                     lambda_l2=0.0058386827559236834, 
                     num_leaves=4, 
                     feature_fraction=0.41600000000000004, 
                     bagging_fraction=0.8300542479068319, 
                     bagging_freq=1, 
                     min_child_samples=20,
                     n_estimators=1440)

lgb2 = LGBMClassifier(random_state=0, 
                      metric='auc', 
                      n_estimators=857, 
                      max_depth=5, 
                      min_child_weight=14, 
                      colsample_bytree=0.5, 
                      learning_rate=0.045281028617681414, 
                      lambda_l1=2.5624353984127566e-07, 
                      lambda_l2=4.758516522619317)

xgb2 = XGBClassifier(random_state=0, 
                     n_estimators=1838, 
                     objective='binary:logistic', 
                     tree_method='gpu_hist', 
                     max_depth=5, 
                     min_child_weight=2, 
                     subsample=0.7, 
                     colsample_bytree=0.5, 
                     learning_rate=0.02013171044857669)

In [30]:
lgb1.fit(X, y)
lgb2.fit(X, y)
xgb1.fit(X, y)
xgb2.fit(X, y)

lgb1_pred = lgb1.predict_proba(test_X)
lgb2_pred = lgb2.predict_proba(test_X)
xgb1_pred = xgb1.predict_proba(test_X)
xgb2_pred = xgb2.predict_proba(test_X)

pred = ((xgb1_pred[:, 1] + lgb1_pred[:, 1] + xgb2_pred[:, 1] + lgb2_pred[:, 1])/4)







## Make a submission file

In [32]:
path = "C:/Users/daisu/OneDrive/Desktop/"

submission = pd.read_csv(path + 'sample_submission.csv')

In [203]:
submission["TARGET"] = pred

In [206]:
submission.to_csv('C:/Users/daisu/OneDrive/Desktop/submission.csv', index=False)