In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory, then print
# each full path on its own line (same order as a nested os.walk loop).
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jane-street-market-prediction/example_sample_submission.csv
/kaggle/input/jane-street-market-prediction/features.csv
/kaggle/input/jane-street-market-prediction/example_test.csv
/kaggle/input/jane-street-market-prediction/train.csv
/kaggle/input/jane-street-market-prediction/janestreet/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/jane-street-market-prediction/janestreet/__init__.py


In [2]:
import datatable as dt
import matplotlib.pyplot as plt
import gc
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


from sklearn import set_config
# Switch scikit-learn's global estimator display mode to 'diagram'.
set_config(display='diagram') 

from tqdm.notebook import tqdm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=pd.core.common.SettingWithCopyWarning)

In [3]:
# Read the full training CSV with datatable and convert it to a pandas frame.
train_full = (
    dt.fread('../input/jane-street-market-prediction/train.csv')
      .to_pandas()
)
pd.set_option('display.max_columns', None)

# Downcast every float64 column to float32 to reduce memory use.
float64_cols = train_full.select_dtypes(include='float64').columns
train_full = train_full.astype({c: np.float32 for c in float64_cols})

# Both date slices must be taken from the unfiltered frame: slicing
# `train_85` out of an already date-filtered `train` always yields an
# empty frame.
train_85 = train_full[train_full['date'] <= 85]
train = train_full[train_full['date'] > 85]
del train_full, float64_cols

In [4]:
# Ignore rows with weight == 0 (they are only included for completeness).
train = train.query('weight > 0').reset_index(drop = True)

# Binary target: 1 when resp is positive, 0 otherwise.
train['action'] = (train['resp'] > 0).astype('int')

# Feature columns excluded from modelling. The list stays at module level
# because the submission loop drops the same columns from each live row.
to_be_dropped = ['feature_21','feature_24','feature_25','feature_55',
                  'feature_58','feature_121','feature_127','feature_61',
                  'feature_63','feature_5','feature_3','feature_38',
                  'feature_66','feature_69', 'feature_12', 'feature_26', 'feature_68',
                  'feature_7','feature_8','feature_17','feature_18',
                  'feature_27','feature_28','feature_72','feature_78',
                  'feature_84','feature_90','feature_96','feature_102',
                  'feature_108','feature_114',
                  'feature_35','feature_36','feature_32','feature_40',
                  'feature_48','feature_122','feature_128','feature_76',
                  'feature_110','feature_101','feature_113','feature_116',
                  'feature_107','feature_119','feature_129','feature_126'] 
# Drop first so the imputation below only touches columns that are kept;
# forward-fill works per column, so the retained values are unaffected.
train = train.drop(columns=to_be_dropped)

# Last-value imputation: forward-fill each retained feature, then use 0 for
# leading gaps that have no earlier value. `.ffill()` replaces the deprecated
# `fillna(method='ffill')` spelling.
features = [c for c in train.columns if 'feature' in c]
train[features] = train[features].ffill().fillna(0)

In [5]:
# X, y: every remaining feature column as input, the binary action as target.
features = [c for c in train.columns if 'feature' in c]
X = train.loc[:, features]
y = train.loc[:, 'action']

# Split BEFORE scaling so the scaler's mean/variance come from the training
# rows only; fitting on all rows leaks hold-out statistics into training.
# The row partition depends only on the row count and random_state.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Keep the fitted scaler at module level: any data passed to the model later
# has to go through this exact transform.
scaler = StandardScaler().fit(X_train_raw)

# Wrap the scaled arrays back into DataFrames that keep the original row index
# and use positional integer column labels.
X_train = pd.DataFrame(scaler.transform(X_train_raw), index=X_train_raw.index)
X_test = pd.DataFrame(scaler.transform(X_test_raw), index=X_test_raw.index)

# Free the large intermediates; only the scaled splits and targets are needed.
del train, X, y, X_train_raw, X_test_raw

# Base model definitions

In [6]:
# Estimator and metric imports for the stacked ensemble defined below.
#from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
#from lightgbm import LGBMClassifier
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier as ada
#from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression # meta-model
from sklearn.ensemble import StackingClassifier
import xgboost as xgb
import lightgbm as lgb 
from sklearn.ensemble import RandomForestClassifier as rf


#SVM = SVC(kernel = 'rbf', gamma='scale')

# LightGBM classifier with fixed hyper-parameters and a fixed seed.
lgbclf = lgb.LGBMClassifier(n_estimators = 369,
       learning_rate=0.29686498918493825,
       num_leaves=3,
       subsample=0.9369154235677937,
       subsample_freq=1,
       colsample_bytree=0.33362860498986807,
       random_state = 37,
       verbose=0,
       force_col_wise=True,
       max_depth=3,
       min_child_weight=3
    ) 

# XGBoost classifier with a fixed seed.
# NOTE(review): missing values are filled with 0 before training, so the -999
# sentinel below presumably never occurs in the data; confirm.
# NOTE(review): tree_method='gpu_hist' presumably needs a GPU runtime; confirm.
xgbclf = xgb.XGBClassifier(
        n_estimators=24,
        max_depth=3,
        learning_rate=0.01,
        subsample=0.85,
        colsample_bytree=0.85,
        missing=-999,
        tree_method='gpu_hist',
        nthread=-1,
        random_state=2020
    )

# Random forest classifier with a fixed seed.
rfclf = rf(
            n_estimators=64,
            max_depth=8, 
            max_features='sqrt',
            n_jobs=-1,
            random_state=2020
    )

# AdaBoost with default settings. It is seeded like the other estimators so
# repeated runs of the notebook give the same fitted model.
adaclf = ada(random_state=2020)

In [7]:
# Level-1 (base) estimators of the stack: the XGBoost and LightGBM classifiers.
models = [('xgb', xgbclf), ('lgb', lgbclf)]

# Level-2 meta-estimator: the AdaBoost classifier is stacked over the base
# models, using 2-fold cross-validation inside the stacking procedure.
stack_clf = StackingClassifier(estimators=models, final_estimator=adaclf, cv=2)

In [8]:
# Fit the stacked ensemble on the training split; the %time magic reports how
# long the fit takes.
%time stack_clf.fit(X_train, y_train)
# The training split is not needed after fitting, so release it.
del X_train, y_train







CPU times: user 4min 8s, sys: 3.76 s, total: 4min 12s
Wall time: 2min 33s


In [9]:
# Predict on the held-out split and report plain classification accuracy.
stack_final_pred = stack_clf.predict(X_test)
holdout_accuracy = accuracy_score(y_test, stack_final_pred)
print('Final test accuracy: ', holdout_accuracy)

Final test accuracy:  0.5275691017331513


In [10]:
# Rebind `tqdm` to the plain progress bar; this replaces the tqdm.notebook
# variant imported earlier in the notebook.
from tqdm import tqdm
import janestreet
# Create the competition environment and obtain its test iterator.
env = janestreet.make_env() 
iter_test = env.iter_test() 

In [11]:
# Submission loop: iter_test yields (test_df, sample_prediction_df) pairs and
# every pair must be answered with env.predict().
#
# Each test_df holds a single row (`.item()` below requires that), so a
# row-wise forward-fill inside one frame can never fill anything. To match the
# last-value imputation applied to the training data, the most recent imputed
# row is carried across iterations and used to fill gaps in the next one.
#
# NOTE(review): stack_clf was fitted on StandardScaler-transformed features,
# while the rows below are passed unscaled. The identical fitted transform
# should be applied here before predicting.
last_filled = None
for (test_df, sample_prediction_df) in tqdm(iter_test):
    if test_df['weight'].item() > 0:
        # A dedicated name keeps the hold-out X_test from being overwritten.
        live_features = (
            test_df.loc[:, test_df.columns.str.contains('feature')]
                   .drop(columns=to_be_dropped)
        )
        if last_filled is not None:
            # Series index == column names, so each column is filled with its
            # own last seen value.
            live_features = live_features.fillna(last_filled)
        # Anything never observed so far falls back to 0, as in training.
        live_features = live_features.fillna(0)
        last_filled = live_features.iloc[0]
        sample_prediction_df['action'] = stack_clf.predict(live_features)
    else:
        # Zero-weight rows are not sent to the model; action is set to 0.
        sample_prediction_df['action'] = 0
    env.predict(sample_prediction_df)

15219it [06:37, 38.29it/s]
