In [63]:
import datetime as dt
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import IsolationForest
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier


from sklearn.metrics import f1_score, roc_auc_score
from sklearn.pipeline import make_pipeline
import sklearn.preprocessing as pre
from transforming import WithSelected, DFPowerTransform, Apply, Drop

In [64]:
# Load the competition files: training rows (carrying `id` and `target`) and test rows.
source = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Model inputs are every training column except the row identifier and the label.
X_source = source.drop(['id', 'target'], axis=1)
y = source['target']

__preprocessing__

In [65]:
# Raw feature columns fed into the feature-engineering pipeline.
original = ['gravity', 'ph', 'osmo', 'cond', 'urea', 'calc']

# Number of quantile bins per raw feature. Each feature is discretized together
# with its `<name>_pow` counterpart, which the first pipeline step is expected
# to have produced.
N_BINS = {'gravity': 7, 'ph': 7, 'osmo': 10, 'cond': 5, 'urea': 10, 'calc': 5}

# One binning step per raw feature, in the same order as `original`, so the
# auto-generated make_pipeline step names stay stable.
binning_steps = [
    WithSelected([col, f'{col}_pow'], suffix='bins')(
        pre.KBinsDiscretizer(N_BINS[col], encode='ordinal', strategy='quantile')
    )
    for col in original
]

# NOTE(review): WithSelected / Apply come from the local `transforming` module.
# Presumably WithSelected adds suffixed copies of the selected columns and Apply
# stores an estimator's output under the column named by `to` - confirm there.
# Disabled experiments (previously kept as commented-out code): restricting the
# LDA step to the `pow` columns via `on=`, Drop(original), and dropping every
# `calc`-derived column.
pipe = make_pipeline(
    WithSelected(original, 'pow')(
        DFPowerTransform()
    ),
    *binning_steps,
    Apply(
        estimator=LinearDiscriminantAnalysis(solver='lsqr'),
        locpipe=pre.Normalizer(),
        to='lda'
    ),
    Apply(
        estimator=KNeighborsClassifier(7, leaf_size=30, n_jobs=-1),
        to='neighbours'
    ),
    Apply(
        estimator=IsolationForest(
            n_estimators=20,
            max_samples='auto',
            warm_start=True,
            bootstrap=True,
            n_jobs=-1,
            random_state=17,
        ),
        locpipe=pre.Normalizer(),
        to='isolation'
    ),
)

# Fit on the training features only, then apply the fitted pipeline to the
# test rows (minus their identifier column).
X = pipe.fit_transform(X_source, y)
X_test = pipe.transform(test.drop(columns=['id']))

In [70]:
# FIT - LightGBM
estimator = LGBMClassifier(
    n_estimators=250,
    learning_rate=0.01,
    max_depth=3,
    n_jobs=-1,
    random_state=11,
)
estimator.fit(X, y)

# Column 1 of predict_proba on the same rows the model was fitted on, so the
# AUC shown below is an in-sample score, not a validation estimate.
pred_lgbm = estimator.predict_proba(X)[:, 1]
roc_auc_score(y, pred_lgbm)

0.9029182419659735

In [79]:
# FIT - CatBoost
estimator = CatBoostClassifier(
    iterations=250,
    learning_rate=0.01,
    depth=3,
    random_state=11,
    verbose=False,
    allow_writing_files=False,
)
estimator.fit(X, y)

# Column 1 of predict_proba on the same rows the model was fitted on, so the
# AUC shown below is an in-sample score, not a validation estimate.
pred_cb = estimator.predict_proba(X)[:, 1]
roc_auc_score(y, pred_cb)

0.8727788279773155

In [74]:
# FIT - XGBoost
estimator = XGBClassifier(
    n_estimators=250,
    learning_rate=0.01,
    max_depth=3,
    random_state=23,
)
estimator.fit(X, y)

# Column 1 of predict_proba on the same rows the model was fitted on, so the
# AUC shown below is an in-sample score, not a validation estimate.
pred_xgb = estimator.predict_proba(X)[:, 1]
roc_auc_score(y, pred_xgb)

0.9040170132325143

In [76]:
# mix
# Equal-weight average of the three models' training-row probabilities,
# scored against the same training labels as the individual models.
pred = np.vstack([pred_lgbm, pred_cb, pred_xgb]).mean(axis=0)
roc_auc_score(y, pred)

0.8969754253308129

__submit__

In [75]:
# PREDICT (single model)
# Uses whatever model is currently bound to `estimator`, i.e. the FIT cell that
# was executed last (the XGBClassifier cell when the notebook runs top to bottom).
submission = pd.read_csv('data/sample_submission.csv')

# Item assignment on purpose: attribute assignment would only set a Python
# attribute (and leave the CSV unchanged) if `target` were not already a column.
submission['target'] = estimator.predict_proba(X_test)[:, 1]

# Filesystem-safe timestamp: the default datetime string contains a space and
# ':' characters, which are not valid in Windows file names.
stamp = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
submission.to_csv(f'submission_{stamp}.csv', index=False)

In [78]:
# PREDICT MIX
# Refit fresh copies of the three boosted models on the full training set and
# average their test-set probabilities with equal weights.
submission = pd.read_csv('data/sample_submission.csv')
estimators = (
    LGBMClassifier(n_estimators=250, learning_rate=0.01, max_depth=3, n_jobs=-1, random_state=11),
    CatBoostClassifier(iterations=250, learning_rate=0.01, depth=3, random_state=11, verbose=False, allow_writing_files=False),
    XGBClassifier(n_estimators=250, learning_rate=0.01, max_depth=3, random_state=23),
)

proba = []
for est in estimators:
    est.fit(X, y)
    # Column 1 of predict_proba for every test row.
    proba.append(est.predict_proba(X_test)[:, 1])

# Item assignment on purpose: attribute assignment would only set a Python
# attribute (and leave the CSV unchanged) if `target` were not already a column.
submission['target'] = np.mean(proba, axis=0)

# Filesystem-safe timestamp (no space or ':'), plus a `mix` tag so this file
# cannot overwrite a single-model submission written within the same second.
stamp = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
submission.to_csv(f'submission_mix_{stamp}.csv', index=False)

In [None]:
#