In [1]:
import pandas as pd
import numpy as np
import gc
import warnings
warnings.filterwarnings('ignore')

In [2]:
def reduce_memory_usage(df, verbose=True):
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [4]:
train = reduce_memory_usage(train)
test = reduce_memory_usage(test)

Mem. usage decreased to 505.45 Mb (76.9% reduction)
Mem. usage decreased to 252.25 Mb (76.9% reduction)


In [5]:
train

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f276,f277,f278,f279,f280,f281,f282,f283,f284,target
0,0,0.205933,0.410889,0.176758,0.223633,0.423584,0.476074,0.413574,0.611816,0.534668,...,0,1,0,0,0,0,0,0,0,1
1,1,0.181030,0.473145,0.011734,0.213623,0.619629,0.441650,0.230347,0.686035,0.281982,...,0,1,0,0,0,0,0,0,0,1
2,2,0.182617,0.307373,0.325928,0.207153,0.605469,0.309814,0.493408,0.750977,0.536133,...,0,0,0,1,1,0,0,0,0,1
3,3,0.180298,0.494629,0.008369,0.223633,0.760742,0.439209,0.432129,0.776367,0.483887,...,0,0,0,0,1,0,0,0,0,1
4,4,0.177124,0.495605,0.014259,0.548828,0.625488,0.562500,0.117188,0.561035,0.077087,...,0,1,1,0,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,999995,0.204346,0.344727,0.262207,0.228394,0.610840,0.357422,0.490479,0.613770,0.509277,...,0,0,0,1,0,0,1,0,0,1
999996,999996,0.182007,0.563965,0.242554,0.241211,0.453613,0.469482,0.477539,0.659180,0.519043,...,0,0,0,0,0,0,0,0,1,0
999997,999997,0.250244,0.491455,0.098572,0.235596,0.771484,0.367920,0.531738,0.598145,0.618652,...,0,0,0,0,0,0,0,0,0,0
999998,999998,0.203613,0.535156,0.180176,0.213135,0.654785,0.535156,0.316162,0.652344,0.397949,...,0,0,0,0,0,0,0,0,0,1


In [9]:
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [11]:
X = train.iloc[:, 1:-1]
y = train.target

In [31]:
target = test.iloc[:, 1:]

In [10]:
skf = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True)

In [49]:
cb = CatBoostClassifier(iterations = 10000, learning_rate = .05, random_state = 42)

In [50]:
cb_pred = np.zeros((target.shape[0]))
for i, idx in enumerate(zip(skf.split(X, y))) :
    tr_x, tr_y = X.iloc[idx[0][0]], y.iloc[idx[0][0]]
    val_x, val_y = X.iloc[idx[0][1]], y.iloc[idx[0][1]]
    
    cb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], verbose = 1000, early_stopping_rounds = 500)
    
    val_pred = cb.predict_proba(val_x)[:, 1]
    
    score = roc_auc_score(val_y, val_pred)
    
    print(f'{i + 1} Fold roc_auc_score = {score}')
    
    fold_pred = cb.predict_proba(target)[:, 1]
    cb_pred += (fold_pred / 10)

0:	learn: 0.6729574	test: 0.6729834	test1: 0.6725985	best: 0.6725985 (0)	total: 241ms	remaining: 40m 9s
1000:	learn: 0.4617345	test: 0.4617604	test1: 0.4630342	best: 0.4630342 (1000)	total: 1m 57s	remaining: 17m 40s
2000:	learn: 0.4508551	test: 0.4508816	test1: 0.4614514	best: 0.4614514 (2000)	total: 3m 49s	remaining: 15m 15s
3000:	learn: 0.4415388	test: 0.4415654	test1: 0.4612090	best: 0.4612069 (2973)	total: 5m 38s	remaining: 13m 8s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.4611877686
bestIteration = 3383

Shrink model to first 3384 iterations.
1 Fold roc_auc_score = 0.8603574762873203
0:	learn: 0.6728715	test: 0.6728744	test1: 0.6728713	best: 0.6728713 (0)	total: 160ms	remaining: 26m 39s
1000:	learn: 0.4611462	test: 0.4611724	test1: 0.4683039	best: 0.4683037 (999)	total: 1m 58s	remaining: 17m 47s
2000:	learn: 0.4502705	test: 0.4502974	test1: 0.4667932	best: 0.4667760 (1974)	total: 3m 50s	remaining: 15m 22s
3000:	learn: 0.4409600	test: 0.4409869	test1: 0.46

In [46]:
xgb = XGBClassifier(random_state = 42, n_estimators = 5000, learning_rate = .03)

In [47]:
xgb_pred = np.zeros((target.shape[0]))
for i, idx in enumerate(zip(skf.split(X, y))) :
    tr_x, tr_y = X.iloc[idx[0][0]], y.iloc[idx[0][0]]
    val_x, val_y = X.iloc[idx[0][1]], y.iloc[idx[0][1]]
    
    xgb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], verbose = 1000, early_stopping_rounds = 500)
    
    val_pred = xgb.predict_proba(val_x)[:, 1]
    
    score = roc_auc_score(val_y, val_pred)
    
    print(f'{i + 1} Fold roc_auc_score = {score}')
    
    fold_pred = xgb.predict_proba(target)[:, 1]
    xgb_pred += (fold_pred / 10)

[0]	validation_0-logloss:0.68366	validation_1-logloss:0.68345


KeyboardInterrupt: 

In [7]:
lgbm = LGBMClassifier(random_state = 42, n_estimators = 10000, learning_rate = .05)

In [36]:
lgbm_pred = np.zeros((target.shape[0]))
for i, idx in enumerate(zip(skf.split(X, y))) :
    tr_x, tr_y = X.iloc[idx[0][0]], y.iloc[idx[0][0]]
    val_x, val_y = X.iloc[idx[0][1]], y.iloc[idx[0][1]]
    
    lgbm.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], verbose = 1000, early_stopping_rounds = 500)
    
    val_pred = lgbm.predict_proba(val_x)[:, 1]
    
    score = roc_auc_score(val_y, val_pred)
    
    print(f'{i + 1} Fold roc_auc_score = {score}')
    
    fold_pred = lgbm.predict_proba(target)[:, 1]
    lgbm_pred += (fold_pred / 10)

Training until validation scores don't improve for 500 rounds
[1000]	training's binary_logloss: 0.443247	valid_1's binary_logloss: 0.461729
Early stopping, best iteration is:
[1404]	training's binary_logloss: 0.433741	valid_1's binary_logloss: 0.461536
1 Fold roc_auc_score = 0.8599937211379698
Training until validation scores don't improve for 500 rounds
[1000]	training's binary_logloss: 0.442629	valid_1's binary_logloss: 0.466915
Early stopping, best iteration is:
[1304]	training's binary_logloss: 0.435396	valid_1's binary_logloss: 0.466801
2 Fold roc_auc_score = 0.8558972224036924
Training until validation scores don't improve for 500 rounds
[1000]	training's binary_logloss: 0.442368	valid_1's binary_logloss: 0.469589
Early stopping, best iteration is:
[1164]	training's binary_logloss: 0.438428	valid_1's binary_logloss: 0.469429
3 Fold roc_auc_score = 0.8540578346371444
Training until validation scores don't improve for 500 rounds
[1000]	training's binary_logloss: 0.442592	valid_1's 

In [40]:
submission = pd.read_csv('sample_submission.csv')

In [51]:
submission['target'] = lgbm_pred * .5 + cb_pred * .5

In [52]:
submission.to_csv("1007_2nd.csv", index = False)