In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from tqdm import tqdm_notebook
import lightgbm as lgb
from catboost import Pool, CatBoostClassifier

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
import seaborn as sns



# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and newer releases no longer accept the old names. Prefer the new name when
# the installed Matplotlib provides it and fall back to the legacy one otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set(font_scale=1)

import gc



In [2]:
# Read the training set, the test set and the submission template, in that order.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/train.csv",
        "../input/test.csv",
        '../input/sample_submission.csv',
    )
)

In [3]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are touched; every other column is left as it is.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        When true, print the memory footprint after the conversion and the
        percentage saved. When false, nothing is printed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Float columns are narrowed on value *range* only. float16 and float32 carry
    fewer significant digits than float64, so stored values may be rounded.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate targets, narrowest first; the first one whose range fits wins.
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column spanning exactly -128..127 fits int8.
            # If min/max are NaN (empty column) no comparison holds and the
            # column keeps its current dtype.
            target = next(
                (t for t in int_candidates
                 if np.iinfo(t).min <= c_min and c_max <= np.iinfo(t).max),
                None,
            )
            if target is not None:
                df[col] = df[col].astype(target)
        else:
            # Columns with infinite or NaN extremes fit no candidate and end up float64.
            target = next(
                (t for t in float_candidates
                 if np.finfo(t).min <= c_min and c_max <= np.finfo(t).max),
                np.float64,
            )
            df[col] = df[col].astype(target)

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        # Avoid a meaningless nan/inf percentage when the frame had no measurable size.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [4]:
# Shrink all three frames, in the same order as before, and rebind the names
# to whatever reduce_mem_usage returns.
train, test, sample = (
    reduce_mem_usage(frame) for frame in (train, test, sample)
)

Memory usage after optimization is: 345.07 MB
Decreased by 74.2%
Memory usage after optimization is: 159.31 MB
Decreased by 73.3%
Memory usage after optimization is: 44.81 MB
Decreased by 43.7%


In [5]:
# Force a garbage-collection pass now; the number echoed as the cell output
# is the value returned by gc.collect().
gc.collect()

15

In [6]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,product_number,department,category,creditCard,customer,var_32,var_33,var_34,var_35,var_36,var_37,var_38,var_39,total,target
0,id_11149922,5072,1852,25,6447,928147,0.726074,3,2,115,0,44,62,12,1.399414,0
1,id_15609305,4751,2212,6,1813,928147,-1.023438,3,2,215,0,2,110,39,0.286377,0
2,id_5222335,5817,528,2,6447,928147,-0.517578,3,2,55,0,2,0,20,-0.335693,0
3,id_1884252,3088,3682,24,6447,928147,-0.395264,3,2,6,2,4,110,6,-1.516602,0
4,id_12069677,2158,2204,24,6447,928147,0.61084,3,2,62,3,11,59,5,0.908203,0


In [7]:
# Feature columns: everything except the row identifier and the label.
# Excluding them by name (rather than by position) keeps the selection
# correct even if the column order of the CSV changes.
cols = train.columns.drop(['id', 'target'])

In [8]:
# Use the first of five stratified, shuffled folds as a single hold-out split:
# four fifths of the rows for fitting, one fifth for validation.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5168)
fold, (trn_idx, val_idx) = next(enumerate(skf.split(train, train['target'])))


def _features_and_target(rows):
    """Return (feature frame, target series) for the given positional row indices."""
    subset = train.iloc[rows]
    return subset[cols], subset['target']


X_train, y_train = _features_and_target(trn_idx)
X_valid, y_valid = _features_and_target(val_idx)

# Hyperparameter values tried earlier and currently disabled:
#   l2_leaf_reg=16.5056753964314982, depth=3.0,
#   fold_len_multiplier=2.9772639036842174, scale_pos_weight=3.542962442406767,
#   fold_permutation_block_size=16.0, subsample=0.46893530376570957
#   fold_len_multiplier=3.2685541035861747, scale_pos_weight=2.6496926337120916,
#   fold_permutation_block_size=6.0
clf = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=123,
    use_best_model=True,
    learning_rate=0.1,
    iterations=15000,
    verbose=100,
    bootstrap_type="Poisson",
    task_type="GPU",
)
print("Model training")
# The validation fold drives both best-iteration selection and early stopping.
clf.fit(
    X_train,
    y_train,
    eval_set=(X_valid, y_valid),
    early_stopping_rounds=2000,
    verbose=100,
)


Model training
0:	learn: 0.6621247	test: 0.6624856	best: 0.6624856 (0)	total: 71.8ms	remaining: 17m 56s
100:	learn: 0.7169731	test: 0.7170973	best: 0.7170973 (100)	total: 7.09s	remaining: 17m 25s
200:	learn: 0.7255473	test: 0.7253997	best: 0.7253997 (200)	total: 14.2s	remaining: 17m 25s
300:	learn: 0.7298585	test: 0.7294782	best: 0.7294782 (300)	total: 21.3s	remaining: 17m 18s
400:	learn: 0.7327523	test: 0.7321082	best: 0.7321082 (400)	total: 28.4s	remaining: 17m 13s
500:	learn: 0.7347164	test: 0.7338246	best: 0.7338246 (500)	total: 35.6s	remaining: 17m 10s
600:	learn: 0.7362702	test: 0.7351194	best: 0.7351194 (600)	total: 42.8s	remaining: 17m 5s
700:	learn: 0.7375904	test: 0.7362442	best: 0.7362442 (700)	total: 49.9s	remaining: 16m 58s
800:	learn: 0.7385508	test: 0.7370008	best: 0.7370008 (800)	total: 57.1s	remaining: 16m 51s
900:	learn: 0.7394971	test: 0.7377121	best: 0.7377121 (900)	total: 1m 4s	remaining: 16m 44s
1000:	learn: 0.7402244	test: 0.7382401	best: 0.7382401 (999)	total: 1

<catboost.core.CatBoostClassifier at 0x7f8875012cc0>

In [9]:
# Predict on the test frame using the same feature columns the model was fitted on.
predict = clf.predict_proba(test[cols])

In [10]:
# Store the second probability column as the submission score. Bracket
# assignment always writes a DataFrame column, whereas attribute assignment
# would only set a plain Python attribute if the column were missing.
sample['target'] = predict[:, 1]

In [11]:
# Show a bounded preview of the submission instead of rendering the whole frame.
sample.head(10)

Unnamed: 0,id,target
0,id_5007385,0.081969
1,id_12558699,0.009303
2,id_5454443,0.033059
3,id_101507,0.023968
4,id_11803238,0.070622
5,id_154019,0.019710
6,id_4746707,0.102651
7,id_14179019,0.082856
8,id_6277431,0.159367
9,id_2711986,0.132894


In [12]:
from IPython.display import FileLink
def create_submission(submission_file, submission_name):
    """Write a submission frame to ``<submission_name>.csv`` and return a link to it.

    Parameters
    ----------
    submission_file : object with a ``to_csv`` method (a DataFrame in this notebook)
        The data to write; it is saved without its index.
    submission_name : str
        Output file name without the ``.csv`` extension.

    Returns
    -------
    IPython.display.FileLink
        Link object pointing at the file that was just written.
    """
    csv_path = submission_name + ".csv"
    submission_file.to_csv(csv_path, index=False)
    return FileLink(csv_path)

In [13]:
# Save the filled-in submission frame as "sub_c_15k_simple.csv" and display a link to the file.
create_submission(sample, "sub_c_15k_simple")