In [1]:
import pandas as pd
import numpy as np
import gc
import warnings
warnings.filterwarnings('ignore')

In [2]:
def reduce_memory_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Every column whose dtype is one of the signed-integer or float dtypes
    listed in ``numerics`` is inspected: its minimum and maximum decide the
    target dtype. Integer columns are tried against int8, int16, int32 and
    int64 in that order; float columns against float16 and float32, falling
    back to float64. All other columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    * Range checks are inclusive, so a column spanning exactly -128..127 is
      stored as int8 rather than being pushed to int16.
    * Float targets are chosen by range only. float16 keeps roughly three
      significant decimal digits, so values may be rounded by the cast.
    * Columns whose min/max are NaN (empty or all-NaN) fail every range check:
      integer columns keep their dtype, float columns become float64.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    # Candidate dtypes, narrowest first; the first one that fits wins.
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            for candidate in int_targets:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_targets:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not yield nan.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [3]:
# Load the raw training and test tables from the current working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [4]:
# Downcast both frames; each call prints its own memory report.
train, test = (reduce_memory_usage(frame) for frame in (train, test))

Mem. usage decreased to 505.45 Mb (76.9% reduction)
Mem. usage decreased to 252.25 Mb (76.9% reduction)


In [5]:
# Write the downcast frames back to disk without the index column.
# NOTE(review): these are the same paths the data was read from, so the
# original files are overwritten with the downcast values — confirm this is
# intended. CSV also stores no dtype information.
train.to_csv("train.csv", index = False)
test.to_csv("test.csv", index = False)

In [55]:
# Correlation of each column with `target`, taken as one column of the full
# pairwise correlation matrix. `cor` is not referenced again in the visible cells.
cor = train.corr()['target']

In [61]:
# Count distinct values for every column except the last one and tabulate
# the result next to the column name.
leading_cols = train.columns[:-1]
col_nunique = [train[name].nunique() for name in leading_cols]
col_n_df = pd.DataFrame({'column': leading_cols, 'nunique': col_nunique})

In [64]:
# Show the columns that take fewer than three distinct values.
col_n_df.loc[col_n_df['nunique'].lt(3)]

Unnamed: 0,column,nunique
23,f22,2
44,f43,2
243,f242,2
244,f243,2
245,f244,2
246,f245,2
247,f246,2
248,f247,2
249,f248,2
250,f249,2


In [5]:
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [6]:
# Feature matrix: every column except the first and the last.
# NOTE(review): presumably column 0 is a row id and the last column is
# `target` — confirm against train.csv.
X = train.iloc[:, 1:-1]
# Labels used for stratification and model fitting.
y = train.target

In [7]:
# Test-set features: all columns of `test` except the first (presumably the
# row id — confirm). Despite its name, `target` holds the frame the models
# predict on, not the label column.
target = test.iloc[:, 1:]

In [8]:
# Drop the names of the raw frames; only X, y and target are used from here on.
del train, test

In [9]:
def _add_row_stats(df):
    """Attach row-wise 'std', 'min' and 'max' columns to ``df`` in place.

    All three statistics are computed from the columns present on entry,
    before any of them is attached. Assigning them one at a time would let
    'min' see the new 'std' column and 'max' see both 'std' and 'min'.
    """
    stats = {
        'std': df.std(axis=1),
        'min': df.min(axis=1),
        'max': df.max(axis=1),
    }
    for name, values in stats.items():
        df[name] = values


# Same feature engineering for the training and the test features.
_add_row_stats(X)
_add_row_stats(target)

In [11]:
# Per-column skewness of the raw features and of their log1p transform.
skew_before = X.skew()
skew_after = np.log1p(X).skew()
skew_df = pd.DataFrame({'before': skew_before, 'after': skew_after})

In [12]:
# Display the before/after skewness table.
skew_df

Unnamed: 0,before,after
f0,2.747791,2.310211
f1,0.293400,0.103153
f2,1.345657,1.016755
f3,1.612213,1.332803
f4,0.381527,0.171789
...,...,...
f280,2.084497,2.084497
f281,1.505239,1.505239
f282,1.895395,1.895395
f283,1.633264,1.633264


In [13]:
# How much the absolute skewness shrinks after log1p (positive = improvement).
skew_df['gap'] = skew_df['before'].abs() - skew_df['after'].abs()

In [19]:
# Columns whose absolute skewness drops by more than 0.5 under log1p.
l_t_cols = skew_df.index[skew_df['gap'] > 0.5]

In [21]:
# Replace the selected training columns with their log1p transform.
# Re-running this cell applies the transform a second time.
X[l_t_cols] = np.log1p(X[l_t_cols])

In [22]:
# Apply the same log1p transform to the same columns of the test features.
target[l_t_cols] = np.log1p(target[l_t_cols])

In [9]:
# Standardizer that is fitted on X and then reused for the test features.
scaler = StandardScaler()

In [23]:
# Fit the scaler on the training features and rebind X to the scaled values.
# Column names are kept; the new frame gets a default integer index.
X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)

In [24]:
# Scale the test features with the already-fitted scaler (transform only, no
# refit) and label the result with X's column names.
target = pd.DataFrame(scaler.transform(target), columns = X.columns)

In [16]:
# 10-fold stratified splitter, shuffled with a fixed seed.
skf = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True)

In [49]:
# CatBoost model with a large iteration cap; the CV loop passes
# early_stopping_rounds so training can stop before reaching it.
cb = CatBoostClassifier(iterations = 10000, learning_rate = .03, random_state = 42)

In [50]:
# Cross-validated CatBoost: fit on each training split, report the ROC AUC on
# the held-out split, and average the test-set probabilities over all folds.
# The divisor comes from the splitter so it cannot drift from n_splits.
n_folds = skf.get_n_splits()
cb_pred = np.zeros((target.shape[0]))
for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)):
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    # Both the training split and the held-out split are monitored during fitting.
    cb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], verbose = 1000, early_stopping_rounds = 500)

    # Probability of the positive class on the held-out split.
    val_pred = cb.predict_proba(val_x)[:, 1]

    score = roc_auc_score(val_y, val_pred)

    print(f'{i + 1} Fold roc_auc_score = {score}')

    # Each fold contributes an equal share of the final test prediction.
    fold_pred = cb.predict_proba(target)[:, 1]
    cb_pred += (fold_pred / n_folds)

0:	learn: 0.6729574	test: 0.6729834	test1: 0.6725985	best: 0.6725985 (0)	total: 241ms	remaining: 40m 9s
1000:	learn: 0.4617345	test: 0.4617604	test1: 0.4630342	best: 0.4630342 (1000)	total: 1m 57s	remaining: 17m 40s
2000:	learn: 0.4508551	test: 0.4508816	test1: 0.4614514	best: 0.4614514 (2000)	total: 3m 49s	remaining: 15m 15s
3000:	learn: 0.4415388	test: 0.4415654	test1: 0.4612090	best: 0.4612069 (2973)	total: 5m 38s	remaining: 13m 8s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.4611877686
bestIteration = 3383

Shrink model to first 3384 iterations.
1 Fold roc_auc_score = 0.8603574762873203
0:	learn: 0.6728715	test: 0.6728744	test1: 0.6728713	best: 0.6728713 (0)	total: 160ms	remaining: 26m 39s
1000:	learn: 0.4611462	test: 0.4611724	test1: 0.4683039	best: 0.4683037 (999)	total: 1m 58s	remaining: 17m 47s
2000:	learn: 0.4502705	test: 0.4502974	test1: 0.4667932	best: 0.4667760 (1974)	total: 3m 50s	remaining: 15m 22s
3000:	learn: 0.4409600	test: 0.4409869	test1: 0.46

In [27]:
# Rebind `skf` to a 5-fold splitter; every later skf.split(...) call in a
# top-to-bottom run yields 5 folds.
skf = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)

In [28]:
# Keyword arguments that are unpacked into XGBClassifier.
# NOTE(review): the long decimal values look like the output of an automated
# hyper-parameter search — confirm their provenance.
params = {
    'max_depth': 6,
    'subsample': 0.7,
    'colsample_bytree': 0.2,
    'colsample_bylevel': 0.6000000000000001,
    'min_child_weight': 56.41980735551558,
    'reg_lambda': 75.56651890088857,
    'reg_alpha': 0.11766857055687065,
    'gamma': 0.6407823221122686,
    'booster': 'gbtree',
    'eval_metric': 'auc',
    'use_label_encoder': False
    }

In [29]:
# XGBoost model built from `params`, with a large tree cap; the CV loop
# passes early_stopping_rounds so training can stop before reaching it.
xgb = XGBClassifier(random_state = 42, n_estimators = 10000, learning_rate = .03, **params)

In [35]:
# Cross-validated XGBoost: fit on each training split, report the ROC AUC on
# the held-out split, and average the test-set probabilities over all folds.
# The divisor comes from the splitter so it cannot drift from n_splits.
n_folds = skf.get_n_splits()
xgb_pred = np.zeros((target.shape[0]))
for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)):
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    # Both the training split and the held-out split are monitored during fitting.
    xgb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], verbose = 1000, early_stopping_rounds = 500)

    # Inputs are converted with np.float32(...) before prediction, as in the
    # original cell; column 1 is the positive-class probability.
    val_pred = xgb.predict_proba(np.float32(val_x))[:, 1]

    score = roc_auc_score(val_y, val_pred)

    print(f'{i + 1} Fold roc_auc_score = {score}')

    # Each fold contributes an equal share of the final test prediction.
    fold_pred = xgb.predict_proba(np.float32(target))[:, 1]
    xgb_pred += (fold_pred / n_folds)

[0]	validation_0-auc:0.78912	validation_1-auc:0.79223
[1000]	validation_0-auc:0.86738	validation_1-auc:0.85749
[2000]	validation_0-auc:0.87736	validation_1-auc:0.85848
[3000]	validation_0-auc:0.88439	validation_1-auc:0.85845
[3029]	validation_0-auc:0.88457	validation_1-auc:0.85844
1 Fold roc_auc_score = 0.858575467033657
[0]	validation_0-auc:0.78975	validation_1-auc:0.78805
[1000]	validation_0-auc:0.86801	validation_1-auc:0.85421
[2000]	validation_0-auc:0.87787	validation_1-auc:0.85563
[2794]	validation_0-auc:0.88339	validation_1-auc:0.85567
2 Fold roc_auc_score = 0.8557973935697676
[0]	validation_0-auc:0.78931	validation_1-auc:0.78849
[1000]	validation_0-auc:0.86760	validation_1-auc:0.85613
[2000]	validation_0-auc:0.87753	validation_1-auc:0.85745
[3000]	validation_0-auc:0.88439	validation_1-auc:0.85740
[3238]	validation_0-auc:0.88588	validation_1-auc:0.85734
3 Fold roc_auc_score = 0.8575134835844367
[0]	validation_0-auc:0.79001	validation_1-auc:0.78815
[1000]	validation_0-auc:0.86801	

In [32]:
# Inspect the dtypes of the training split left over from the most recent CV loop.
tr_x.dtypes

f0      float16
f1      float16
f2      float16
f3      float16
f4      float16
         ...   
f283       int8
f284       int8
std     float16
min     float16
max     float16
Length: 288, dtype: object

In [21]:
# LightGBM model with shallow trees and a large tree cap; the CV loop passes
# early_stopping_rounds so training can stop before reaching it.
lgbm = LGBMClassifier(random_state = 42, n_estimators = 10000, learning_rate = .05, max_depth = 5)

In [22]:
# Cross-validated LightGBM: fit on each training split, report the ROC AUC on
# the held-out split, and average the test-set probabilities over all folds.
# The divisor is read from the splitter: a hard-coded 10 under-scales the
# average whenever `skf` holds a different number of folds.
n_folds = skf.get_n_splits()
lgbm_pred = np.zeros((target.shape[0]))
for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)):
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

    # Both the training split and the held-out split are monitored during fitting.
    lgbm.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], verbose = 1000, early_stopping_rounds = 500)

    # Probability of the positive class on the held-out split.
    val_pred = lgbm.predict_proba(val_x)[:, 1]

    score = roc_auc_score(val_y, val_pred)

    print(f'{i + 1} Fold roc_auc_score = {score}\n')

    # Each fold contributes an equal share of the final test prediction.
    fold_pred = lgbm.predict_proba(target)[:, 1]
    lgbm_pred += (fold_pred / n_folds)

Training until validation scores don't improve for 500 rounds
[1000]	training's binary_logloss: 0.445615	valid_1's binary_logloss: 0.462774
Early stopping, best iteration is:
[1494]	training's binary_logloss: 0.434989	valid_1's binary_logloss: 0.462077
1 Fold roc_auc_score = 0.8598921930404622

Training until validation scores don't improve for 500 rounds
[1000]	training's binary_logloss: 0.445082	valid_1's binary_logloss: 0.467997
[2000]	training's binary_logloss: 0.424939	valid_1's binary_logloss: 0.467494
Early stopping, best iteration is:
[1614]	training's binary_logloss: 0.432123	valid_1's binary_logloss: 0.467369
2 Fold roc_auc_score = 0.8557803574914553

Training until validation scores don't improve for 500 rounds
[1000]	training's binary_logloss: 0.444877	valid_1's binary_logloss: 0.470752
[2000]	training's binary_logloss: 0.424683	valid_1's binary_logloss: 0.469893
Early stopping, best iteration is:
[1612]	training's binary_logloss: 0.431895	valid_1's binary_logloss: 0.469733

In [24]:
# Load the submission template whose `target` column is filled in below.
submission = pd.read_csv('sample_submission.csv')

In [36]:
# Equal-weight blend of the LightGBM and XGBoost fold-averaged predictions.
# cb_pred from the CatBoost loop is not part of this blend.
submission['target'] = lgbm_pred * .5 + xgb_pred * .5

In [37]:
# Write the blended predictions to disk without the index column.
submission.to_csv("1012_1st.csv", index = False)