In [1]:
import pandas as pd
import numpy as np
import gc
import warnings
# Install a global "ignore" filter: warnings of every category are suppressed
# for the rest of the session.
warnings.filterwarnings('ignore')

In [2]:
def reduce_memory_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Integer columns are tried against int8, int16, int32 and int64 (in that
    order); floating columns against float16 and float32, falling back to
    float64. A candidate dtype is accepted when the column's minimum and
    maximum both lie inside the dtype's representable range, boundaries
    included. Columns whose dtype is not one of the listed signed-int / float
    dtypes are left untouched.

    Caution: range is the only criterion. Narrow float types (float16 in
    particular) store fewer significant digits, so values may be rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            # Inclusive bounds: a column spanning exactly [-128, 127] fits int8.
            # If min/max are NaN (e.g. empty column) no candidate matches and
            # the column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN: use full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the percentage against a zero-sized starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [3]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [4]:
# Downcast both tables; each call prints its own memory summary (train first).
train, test = map(reduce_memory_usage, (train, test))

Mem. usage decreased to 505.45 Mb (76.9% reduction)
Mem. usage decreased to 252.25 Mb (76.9% reduction)


In [5]:
# Write both frames back to CSV without the index column.
# NOTE(review): these are the same paths that pd.read_csv loaded earlier in the
# notebook, so the input files are overwritten with the downcast values. CSV
# stores no dtype information, so a later read_csv will infer dtypes afresh —
# confirm that overwriting the inputs is intended.
train.to_csv("train.csv", index = False)
test.to_csv("test.csv", index = False)

In [55]:
# Pearson correlation of every column with the `target` column.
# corrwith() correlates each column against the label directly, instead of
# building the full column-by-column matrix (as train.corr() does) just to
# keep one column of it. The result is renamed so it carries the same Series
# name ('target') that selecting the matrix column would give.
cor = train.corrwith(train['target']).rename('target')

In [61]:
# Distinct-value count for every column except the last one, as a plain list.
col_nunique = train.iloc[:, :-1].nunique().tolist()
# Two-column lookup table: column name alongside its distinct-value count.
col_n_df = pd.DataFrame({'column': train.columns[:-1], 'nunique': col_nunique})

In [64]:
# Display the columns that take fewer than three distinct values.
col_n_df.loc[col_n_df['nunique'].lt(3)]

Unnamed: 0,column,nunique
23,f22,2
44,f43,2
243,f242,2
244,f243,2
245,f244,2
246,f245,2
247,f246,2
248,f247,2
249,f248,2
250,f249,2


In [5]:
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [6]:
# Feature matrix: every column except the first and the last (positional slice).
# .copy() gives X its own data, so the columns assigned onto X later in the
# notebook are written to an independent frame rather than to a slice of
# `train`. This briefly duplicates the feature data until `train` is deleted.
X = train.iloc[:, 1:-1].copy()
# Label vector: the `target` column of the training table.
y = train['target']

In [7]:
# Test-set feature matrix: every column of `test` except the first.
# Despite its name, this variable holds test features, not labels.
# .copy() gives it its own data, so the columns assigned onto it later in the
# notebook are written to an independent frame rather than to a slice of `test`.
target = test.iloc[:, 1:].copy()

In [8]:
# Remove the names of the raw tables from the namespace.
del train, test

In [9]:
def _add_row_stats(frame):
    """Append per-row ``std``, ``min`` and ``max`` columns to ``frame`` in place.

    All three statistics are computed from the same set of base columns, i.e.
    every column except the statistic columns themselves. Computing them one
    after another on the growing frame would let ``min`` see the freshly added
    ``std`` value and ``max`` see both. Excluding the statistic columns also
    makes the cell safe to re-run.
    """
    stat_names = ('std', 'min', 'max')
    base = frame[[name for name in frame.columns if name not in stat_names]]
    # Evaluate everything before assigning so no statistic depends on another.
    row_std = base.std(axis=1)
    row_min = base.min(axis=1)
    row_max = base.max(axis=1)
    frame['std'] = row_std
    frame['min'] = row_min
    frame['max'] = row_max


# Same feature engineering for the training and the test feature matrices.
_add_row_stats(X)
_add_row_stats(target)

In [11]:
# Per-column skewness of X as-is ('before') and after applying log1p ('after'),
# placed side by side in one frame indexed by column name.
skew_df = pd.concat(
    [X.skew().rename('before'), np.log1p(X).skew().rename('after')],
    axis=1,
)

In [12]:
# Display the before/after skewness table.
skew_df

Unnamed: 0,before,after
f0,2.747791,2.310211
f1,0.293400,0.103153
f2,1.345657,1.016755
f3,1.612213,1.332803
f4,0.381527,0.171789
...,...,...
f280,2.084497,2.084497
f281,1.505239,1.505239
f282,1.895395,1.895395
f283,1.633264,1.633264


In [13]:
# Reduction in absolute skewness achieved by log1p (positive = less skewed after).
skew_df['gap'] = skew_df['before'].abs().sub(skew_df['after'].abs())

In [19]:
# Names of the columns whose absolute skewness drops by more than 0.5 under log1p.
l_t_cols = skew_df.loc[skew_df['gap'].gt(0.5)].index

In [21]:
# Replace the selected training columns with their log1p transform.
# This overwrites X, so running the cell twice applies the transform twice.
# NOTE(review): np.log1p gives -inf at -1 and NaN below -1; presumably the
# selected columns only hold values greater than -1 — confirm.
X[l_t_cols] = np.log1p(X[l_t_cols])

In [22]:
# Apply log1p to the same columns of the test features. The column list
# `l_t_cols` comes from skewness measured on X only.
# NOTE(review): assumes these test columns also stay above -1 — confirm.
target[l_t_cols] = np.log1p(target[l_t_cols])

In [9]:
# Scaler instance with default settings; it is fitted on X and then reused
# for the test features in the following cells.
scaler = StandardScaler()

In [23]:
# Fit the scaler on every row of X and rebuild a labelled frame from its output.
# The rebuilt frame gets a fresh default index; column names are carried over.
X = pd.DataFrame(data=scaler.fit_transform(X), columns=X.columns)

In [24]:
# Scale the test features with the statistics learned from X; the result is
# labelled with X's column names.
target = pd.DataFrame(data=scaler.transform(target), columns=X.columns)

In [17]:
# 10-fold stratified splitter, shuffled with a fixed seed.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2021)

In [11]:
# CatBoost classifier: up to 10,000 iterations at learning rate 0.03, seeded.
cb = CatBoostClassifier(
    iterations=10_000,
    learning_rate=0.03,
    random_state=42,
)

In [12]:
# Cross-validated CatBoost training.
# For every fold: fit on the training part, report ROC AUC on the held-out
# part, and add this fold's test-set probabilities to a running average.
n_folds = skf.get_n_splits()  # keeps the averaging in sync with the splitter
cb_pred = np.zeros(target.shape[0])
for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)):
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    # Both the training fold and the validation fold are passed as eval sets.
    cb.fit(
        tr_x,
        tr_y,
        eval_set=[(tr_x, tr_y), (val_x, val_y)],
        verbose=1000,
        early_stopping_rounds=500,
    )

    # Probability of the positive class on the held-out fold.
    val_pred = cb.predict_proba(val_x)[:, 1]
    score = roc_auc_score(val_y, val_pred)
    print(f'{i + 1} Fold roc_auc_score = {score}')

    # Each fold contributes an equal share to the averaged test prediction.
    fold_pred = cb.predict_proba(target)[:, 1]
    cb_pred += fold_pred / n_folds

0:	learn: 0.6808484	test: 0.6808337	test1: 0.6805923	best: 0.6805923 (0)	total: 211ms	remaining: 35m 7s
1000:	learn: 0.4677988	test: 0.4678248	test1: 0.4657574	best: 0.4657574 (1000)	total: 2m	remaining: 17m 59s
2000:	learn: 0.4591064	test: 0.4591326	test1: 0.4623964	best: 0.4623964 (2000)	total: 3m 54s	remaining: 15m 36s
3000:	learn: 0.4526649	test: 0.4526911	test1: 0.4615831	best: 0.4615831 (3000)	total: 5m 45s	remaining: 13m 26s
4000:	learn: 0.4468426	test: 0.4468691	test1: 0.4612089	best: 0.4611974 (3955)	total: 7m 36s	remaining: 11m 25s
5000:	learn: 0.4413650	test: 0.4413915	test1: 0.4610782	best: 0.4610678 (4908)	total: 9m 25s	remaining: 9m 24s
6000:	learn: 0.4360724	test: 0.4360988	test1: 0.4610146	best: 0.4609898 (5639)	total: 11m 14s	remaining: 7m 29s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.4609898391
bestIteration = 5639

Shrink model to first 5640 iterations.
1 Fold roc_auc_score = 0.8605092756331083
0:	learn: 0.6808018	test: 0.6808098	test1: 0.6

4000:	learn: 0.4462975	test: 0.4463240	test1: 0.4658964	best: 0.4658958 (3996)	total: 7m 38s	remaining: 11m 26s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.4658450465
bestIteration = 4413

Shrink model to first 4414 iterations.
10 Fold roc_auc_score = 0.8568038944304691


In [18]:
# Extra XGBoost settings, unpacked into the XGBClassifier constructor below.
params = dict(
    max_depth=6,
    subsample=0.75,
    colsample_bytree=0.25,
    colsample_bylevel=0.6000000000000001,
    min_child_weight=56.41980735551558,
    reg_lambda=75.56651890088857,
    reg_alpha=0.11766857055687065,
    gamma=0.6407823221122686,
    booster='gbtree',
    eval_metric='auc',
    use_label_encoder=False,
)

In [20]:
# XGBoost classifier: up to 5,000 trees at learning rate 0.03, seeded, plus
# the settings collected in `params`.
xgb = XGBClassifier(
    n_estimators=5000,
    learning_rate=0.03,
    random_state=42,
    **params,
)

In [21]:
# Cross-validated XGBoost training.
# For every fold: fit on the training part, report ROC AUC on the held-out
# part, and add this fold's test-set probabilities to a running average.
n_folds = skf.get_n_splits()  # keeps the averaging in sync with the splitter
# The float32 copy of the test features is the same for every fold, so build
# it once instead of once per iteration.
target_f32 = np.float32(target)
xgb_pred = np.zeros(target.shape[0])
for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)):
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    # Both the training fold and the validation fold are passed as eval sets.
    xgb.fit(
        tr_x,
        tr_y,
        eval_set=[(tr_x, tr_y), (val_x, val_y)],
        verbose=1000,
        early_stopping_rounds=250,
    )

    # Prediction inputs are handed over as float32 arrays.
    val_pred = xgb.predict_proba(np.float32(val_x))[:, 1]
    score = roc_auc_score(val_y, val_pred)
    print(f'{i + 1} Fold roc_auc_score = {score}')

    # Each fold contributes an equal share to the averaged test prediction.
    fold_pred = xgb.predict_proba(target_f32)[:, 1]
    xgb_pred += fold_pred / n_folds

[0]	validation_0-auc:0.78977	validation_1-auc:0.78745
[1000]	validation_0-auc:0.86430	validation_1-auc:0.85508
[2000]	validation_0-auc:0.87398	validation_1-auc:0.85634
[2231]	validation_0-auc:0.87560	validation_1-auc:0.85632
1 Fold roc_auc_score = 0.8563289464183201
[0]	validation_0-auc:0.78939	validation_1-auc:0.79071
[1000]	validation_0-auc:0.86416	validation_1-auc:0.85592
[2000]	validation_0-auc:0.87385	validation_1-auc:0.85720
[2647]	validation_0-auc:0.87818	validation_1-auc:0.85720
2 Fold roc_auc_score = 0.8572132578676128
[0]	validation_0-auc:0.78985	validation_1-auc:0.78685
[1000]	validation_0-auc:0.86436	validation_1-auc:0.85437
[2000]	validation_0-auc:0.87402	validation_1-auc:0.85566
[2748]	validation_0-auc:0.87900	validation_1-auc:0.85578
3 Fold roc_auc_score = 0.8557866762975239
[0]	validation_0-auc:0.78922	validation_1-auc:0.79030
[1000]	validation_0-auc:0.86408	validation_1-auc:0.85701
[2000]	validation_0-auc:0.87379	validation_1-auc:0.85800
[2610]	validation_0-auc:0.87794

In [24]:
# LightGBM classifier: up to 10,000 trees of depth <= 5 at learning rate 0.05, seeded.
lgbm = LGBMClassifier(
    n_estimators=10_000,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
)

In [None]:
# Cross-validated LightGBM training.
# For every fold: fit on the training part, report ROC AUC on the held-out
# part, and add this fold's test-set probabilities to a running average.
n_folds = skf.get_n_splits()  # keeps the averaging in sync with the splitter
lgbm_pred = np.zeros(target.shape[0])
for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)):
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    # Both the training fold and the validation fold are passed as eval sets.
    lgbm.fit(
        tr_x,
        tr_y,
        eval_set=[(tr_x, tr_y), (val_x, val_y)],
        verbose=1000,
        early_stopping_rounds=500,
    )

    # Probability of the positive class on the held-out fold.
    val_pred = lgbm.predict_proba(val_x)[:, 1]
    score = roc_auc_score(val_y, val_pred)
    print(f'{i + 1} Fold roc_auc_score = {score}\n')

    # Each fold contributes an equal share to the averaged test prediction.
    fold_pred = lgbm.predict_proba(target)[:, 1]
    lgbm_pred += fold_pred / n_folds

Training until validation scores don't improve for 500 rounds
[1000]	training's binary_logloss: 0.445024	valid_1's binary_logloss: 0.468865
[2000]	training's binary_logloss: 0.424815	valid_1's binary_logloss: 0.468325
Early stopping, best iteration is:
[1501]	training's binary_logloss: 0.434223	valid_1's binary_logloss: 0.468234
1 Fold roc_auc_score = 0.8554979256202078

Training until validation scores don't improve for 500 rounds
[1000]	training's binary_logloss: 0.445164	valid_1's binary_logloss: 0.467458
[2000]	training's binary_logloss: 0.424906	valid_1's binary_logloss: 0.466831
Early stopping, best iteration is:
[1589]	training's binary_logloss: 0.432606	valid_1's binary_logloss: 0.466679
2 Fold roc_auc_score = 0.8563150200049453

Training until validation scores don't improve for 500 rounds
[1000]	training's binary_logloss: 0.444907	valid_1's binary_logloss: 0.469605
[2000]	training's binary_logloss: 0.424748	valid_1's binary_logloss: 0.468828
Early stopping, best iteration is:

In [27]:
# Load the submission template; its `target` column is filled in below.
submission = pd.read_csv('sample_submission.csv')

In [28]:
# Equal-weight blend of the CatBoost and XGBoost test predictions
# (lgbm_pred is not part of this blend).
submission['target'] = 0.5 * cb_pred + 0.5 * xgb_pred

In [29]:
# Write the blended predictions to disk without the index column.
submission.to_csv("1014_1st.csv", index = False)

In [22]:
# Inspect the fold-averaged CatBoost test probabilities.
cb_pred

array([0.74667794, 0.24405604, 0.91423275, ..., 0.28630147, 0.51930003,
       0.43597287])

In [23]:
# Inspect the fold-averaged XGBoost test probabilities.
xgb_pred

array([0.75331522, 0.23050546, 0.90829048, ..., 0.30633049, 0.51902716,
       0.42023943])