In [1]:
import datetime as dt
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import IsolationForest
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier


from sklearn.metrics import f1_score, roc_auc_score
from sklearn.pipeline import make_pipeline
import sklearn.preprocessing as pre
from transforming import WithSelected, DFPowerTransform, Apply, Calc, Select

In [2]:
# Read the training and test tables from the local data/ folder.
source = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Split the training table into the target column and the feature frame
# (everything except the 'id' and 'target' columns).
y = source.target
X_source = source.drop(['id', 'target'], axis='columns')

__preprocessing__

In [3]:
# Feature-engineering pipeline.
# Calc / WithSelected / DFPowerTransform / Apply / Select come from the local
# `transforming` module; their exact semantics are not visible in this notebook,
# so the stage descriptions below are based on the arguments only.

# Six input column names. Not referenced again inside this cell.
original = ['gravity', 'ph', 'osmo', 'cond', 'urea', 'calc']

# Stage 1 - arithmetic combinations of the input columns.
# (A fourth one, Calc('osmo / gravity', to='osmo_gravity_rate'), is disabled.)
ratio_steps = [
    Calc('calc * urea', to='ion_prod'),
    Calc('calc / urea', to='ion_rate'),
    Calc('calc / ph', to='calc_ph_rate'),
]

# Stage 2 - power transform and k-means binning.
# Presumably these emit '<col>_pow' and '<col>_bins_<k>' columns, judging by the
# names requested in `selected_columns` below - TODO confirm in `transforming`.
power_step = WithSelected(None, 'pow')(DFPowerTransform())
binning_step = WithSelected(['calc', 'ph', 'gravity'], suffix='bins')(
    pre.KBinsDiscretizer(7, encode='onehot', strategy='kmeans')
)

# Stage 3 - model outputs written to the 'lda', 'neighbours' and 'isolation' columns.
# Disabled alternatives for the LDA step, kept for reference:
#   estimator=LinearDiscriminantAnalysis(), locpipe=pre.Normalizer()
#   on=lambda columns: [col for col in columns if 'pow' in col]
# NOTE(review): whether Apply produces these features out-of-fold is not visible
# here; if it does not, they are fitted on the same rows they describe - confirm.
lda_step = Apply(
    estimator=LinearDiscriminantAnalysis(solver='lsqr'),
    locpipe=pre.StandardScaler(),
    to='lda',
    as_proba=True,
)
knn_step = Apply(
    estimator=KNeighborsClassifier(7, leaf_size=30, n_jobs=-1),
    to='neighbours',
    as_proba=True,
)
isolation_step = Apply(
    estimator=IsolationForest(
        n_estimators=10,
        max_samples='auto',
        warm_start=True,
        bootstrap=True,
        n_jobs=-1,
        random_state=17,
    ),
    locpipe=pre.Normalizer(),
    to='isolation',
)

# Stage 4 - ratios of the model outputs to raw columns, in the order
# ngb_calc, lda_calc, ngb_ph, lda_ph, ngb_gravity, lda_gravity, then ngb_lda.
model_ratio_steps = [
    Calc(f'{score} / {base}', to=f'{tag}_{base}_rate')
    for base in ('calc', 'ph', 'gravity')
    for score, tag in (('neighbours', 'ngb'), ('lda', 'lda'))
]
model_ratio_steps.append(Calc('neighbours / lda', to='ngb_lda_rate'))

# Stage 5 - the columns passed to Select as the final step.
selected_columns = [
    'lda_gravity_rate', 'lda', 'lda_ph_rate', 'ngb_calc_rate', 'neighbours',
    'ngb_gravity_rate', 'ngb_ph_rate', 'ngb_lda_rate', 'lda_calc_rate', 'ion_rate_pow',
    'calc_pow', 'ph_pow', 'ion_rate', 'ion_prod_pow', 'gravity',
    'cond_pow', 'cond', 'calc', 'calc_ph_rate', 'ph_bins_2',
]

pipe = make_pipeline(
    *ratio_steps,
    power_step,
    binning_step,
    lda_step,
    knn_step,
    isolation_step,
    *model_ratio_steps,
    Select(selected_columns),
)

# Fit on the training features, then reuse the fitted pipeline on the test set
# (with its 'id' column removed).
X = pipe.fit_transform(X_source, y)
X_test = pipe.transform(test.drop(columns=['id']))

In [7]:
# LightGBM: fit on the full training matrix.
estimator = LGBMClassifier(
    n_estimators=250,
    learning_rate=0.01,
    max_depth=3,
    n_jobs=-1,
    random_state=11,
)
estimator.fit(X, y)

# Second predict_proba column (class index 1) for the training rows.
pred_lgbm = estimator.predict_proba(X)[:, 1]

# In-sample ROC AUC: the model is scored on the same rows it was trained on.
roc_auc_score(y, pred_lgbm)

0.9210066162570888

In [4]:
# CatBoost: fit on the full training matrix (silent, no files written to disk).
estimator = CatBoostClassifier(
    iterations=250,
    learning_rate=0.01,
    depth=3,
    random_state=11,
    verbose=False,
    allow_writing_files=False,
)
estimator.fit(X, y)

# Second predict_proba column (class index 1) for the training rows.
pred_cb = estimator.predict_proba(X)[:, 1]

# In-sample ROC AUC: the model is scored on the same rows it was trained on.
roc_auc_score(y, pred_cb)

0.8995746691871455

In [9]:
# XGBoost: fit on the full training matrix.
estimator = XGBClassifier(
    n_estimators=250,
    learning_rate=0.01,
    max_depth=3,
    random_state=23,
)
estimator.fit(X, y)

# Second predict_proba column (class index 1) for the training rows.
pred_xgb = estimator.predict_proba(X)[:, 1]

# In-sample ROC AUC: the model is scored on the same rows it was trained on.
roc_auc_score(y, pred_xgb)

0.9317225897920606

In [10]:
# Blend: unweighted element-wise average of the three models' training-set
# probabilities, scored with ROC AUC against the training target.
pred = np.stack([pred_lgbm, pred_cb, pred_xgb]).mean(axis=0)
roc_auc_score(y, pred)

0.9198487712665406

__submit__

In [5]:
# PREDICT (single model)
# NOTE(review): `estimator` is whichever model was fitted most recently in the
# kernel. On a top-to-bottom run that is the XGBoost model; the recorded
# execution counts suggest this cell was originally run right after the
# CatBoost cell. Confirm which model is intended before submitting.
submission = pd.read_csv('data/sample_submission.csv')

# Bracket assignment always writes the column. Attribute assignment
# (`submission.target = ...`) only updates a column that already exists and
# would otherwise just set a Python attribute, leaving the CSV unchanged.
submission['target'] = estimator.predict_proba(X_test)[:, 1]

# Explicit timestamp format: str(datetime) contains a space and ':' characters,
# which are not valid in Windows filenames and awkward in shells.
stamp = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
submission.to_csv(f'submission_{stamp}.csv', index=False)

In [78]:
# PREDICT MIX
# Refit the three boosted models on the full training matrix and submit the
# unweighted average of their test-set probabilities.
submission = pd.read_csv('data/sample_submission.csv')
estimators = (
    LGBMClassifier(n_estimators=250, learning_rate=0.01, max_depth=3, n_jobs=-1, random_state=11),
    CatBoostClassifier(iterations=250, learning_rate=0.01, depth=3, random_state=11, verbose=False, allow_writing_files=False),
    XGBClassifier(n_estimators=250, learning_rate=0.01, max_depth=3, random_state=23),
)

# One probability vector per model: the second predict_proba column (class index 1).
proba = []
for est in estimators:
    est.fit(X, y)
    proba.append(est.predict_proba(X_test)[:, 1])

# Bracket assignment always writes the column. Attribute assignment
# (`submission.target = ...`) only updates a column that already exists and
# would otherwise just set a Python attribute, leaving the CSV unchanged.
submission['target'] = np.mean(proba, axis=0)

# Explicit timestamp format: str(datetime) contains a space and ':' characters,
# which are not valid in Windows filenames and awkward in shells.
stamp = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
submission.to_csv(f'submission_{stamp}.csv', index=False)

In [None]:
#