In [6]:
import datetime as dt
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import IsolationForest
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier


from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline
import sklearn.preprocessing as pre
from transforming import WithSelected, Apply, Calc, Select, TypeRecast

In [7]:
# Read the labelled training table and the unlabelled test table.
source = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Separate the label from the features; `id` is dropped from the feature frame.
y = source['target']
X_source = source.drop(columns=['id', 'target'])

__preprocessing__

In [8]:
# Feature-engineering pipeline.
#
# Stages, in order:
#   1. `Calc` steps derive new columns from expressions over existing columns.
#   2. A power transform (written with the 'pow' suffix).
#   3. K-means ordinal binning of the original columns ('bins' suffix).
#   4. Two fitted estimators (LDA, k-NN) whose output is stored as the
#      'lda' and 'neighbours' columns.
#   5. More `Calc` steps that combine those model outputs with raw columns.
#   6. Columns whose name contains 'bins' are recast to int.
#
# The pipeline is fitted on the training features together with `y`
# (see `fit_transform` below) and then applied to the test features.

# Raw input columns of the dataset.
original = ['gravity', 'ph', 'osmo', 'cond', 'urea', 'calc']
pipe = make_pipeline(
    # is_norma features
    # Calc('1*((1.008 < gravity) & (gravity < 1.030))', to='gravity_is_norma'),
    # Calc('1*(urea <= 35)', to='urea_is_norma'),
    # Calc('1*((2.5 < calc) & (calc < 7.5))', to='calc_is_norma'),

    # pH-like (negative decimal logarithm of the value)
    # NOTE(review): 10e-6 evaluates to 1e-5, not 1e-6 -- confirm the intended scale.
    Calc('-np.log10(calc * 10e-6)', to='pCalc'),
    Calc('-np.log10(urea * 10e-6)', to='pUr'),
    Calc('-np.log10(osmo)', to='pOs'),
    Calc('-np.log10(cond)', to='pCond'),
    # Calc('pCalc - ph', to='pCalc_ph_diff'),
    # Calc('pUr - ph', to='pUr_ph_diff'),

    # prod & ratio
    Calc('gravity / ph', to='gravity_ph_rate'),
    Calc('gravity * ph', to='gravity_ph_prod'),
    Calc('gravity * osmo', to='gravity_osmo_prod'),
    Calc('gravity / calc', to='gravity_calc_rate'),
    Calc('osmo / cond', to='osmo_cond_rate'),
    Calc('osmo * urea', to='osmo_urea_prod'),
    Calc('osmo * ph', to='osmo_ph_prod'),
    Calc('(cond * urea) / ph', to='cond_urea_prod_ph_rate'),
    Calc('cond * calc', to='cond_calc_prod'),
    Calc('(gravity * osmo) / urea', to='gravity_osmo_prod_urea_rate'),

    # Calc('calc / urea', to='calc_urea_rate'),
    # Calc('calc / ph', to='calc_ph_rate'),

    # ratios built on the pH-like columns
    # ('gravity_ph_rate' is already produced in the "prod & ratio" group above,
    # so it is not recomputed here.)
    Calc('pOs / gravity', to='pOs_gravity_rate'),
    Calc('pOs / ph', to='pOs_ph_rate'),
    Calc('gravity / pCalc', to='gravity_pCalc_rate'),
    Calc('pCond / ph', to='pCond_ph_rate'),
    Calc('pCond / pCalc', to='pCond_pCalc_rate'),
    Calc('pCond / pUr', to='pCond_pUr_rate'),

    # Calc('(osmo * gravity) / (cond * ph)', to='osmo_gravity_prod_cond_ph_prod_rate'),
    # Calc('cond / (urea * calc)', to='cond_(urea_calc_prod)_rate'),

    # power
    # presumably a selection of None means "every column" -- verify in `transforming`
    WithSelected(None, 'pow')(
    # WithSelected(original, 'pow')(
        pre.PowerTransformer()
    ),
    # categories/binaries
    WithSelected(original, suffix='bins')(
    # WithSelected(lambda columns: [col for col in columns if 'norma' not in col and 'pow' not in col], suffix='bins')(
        pre.KBinsDiscretizer(7, encode='ordinal', strategy='kmeans')
    ),
    # analysis: outputs of fitted models stored as new columns
    Apply(
        # estimator=LinearDiscriminantAnalysis(),
        # locpipe=pre.Normalizer(),
        estimator=LinearDiscriminantAnalysis(solver='lsqr'),
        locpipe=pre.StandardScaler(),
        # on=lambda columns: [col for col in columns if 'pow' in col],
        to='lda',
        as_proba=True
    ),
    Apply(
        estimator=KNeighborsClassifier(7, leaf_size=30, n_jobs=-1),
        to='neighbours',
        as_proba=True
    ),
    # Apply(
    #     estimator=IsolationForest(n_estimators=10, max_samples='auto', warm_start=True, bootstrap=True, n_jobs=-1, random_state=17),
    #     locpipe=pre.Normalizer(),
    #     to='isolation'
    # ),
    # calculations with generated features
    Calc('neighbours / calc', to='ngb_calc_rate'),
    Calc('lda / calc', to='lda_calc_rate'),
    Calc('neighbours / ph', to='ngb_ph_rate'),
    Calc('lda / ph', to='lda_ph_rate'),
    Calc('neighbours / gravity', to='ngb_gravity_rate'),
    Calc('lda / gravity', to='lda_gravity_rate'),
    # Calc('neighbours / lda', to='ngb_lda_rate'),
    Calc('(osmo / gravity) * lda', to='osmo_gravity_rate_lda_weighted'),

    Calc('(calc * urea / ph) * lda', to='ion_lda_prod'),
    Calc('(calc * urea / ph) * neighbours', to='ion_ngb_prod'),

    # binned columns are cast to int
    TypeRecast(
        int=lambda columns: [col for col in columns if 'bins' in col],
    )

    # Select(original, mode='drop'),
)

# Fit every stage on the training features (with labels), then apply the
# already-fitted pipeline to the test features.
X = pipe.fit_transform(X_source, y)
X_test = pipe.transform(test.drop(columns=['id']))
X.shape

(414, 69)

In [9]:
# FIT -- LightGBM
estimator = LGBMClassifier(
    n_estimators=250,
    learning_rate=0.01,
    max_depth=3,
    n_jobs=-1,
    random_state=11,
)
estimator.fit(X, y)

# Second probability column, scored on the same rows the model was fitted on
# (an in-sample ROC AUC, not a hold-out estimate).
pred_lgbm = estimator.predict_proba(X)[:, 1]
roc_auc_score(y, pred_lgbm)

0.9332230623818525

In [10]:
# FIT -- CatBoost
estimator = CatBoostClassifier(
    iterations=250,
    learning_rate=0.01,
    depth=3,
    random_state=11,
    verbose=False,
    allow_writing_files=False,
)
estimator.fit(X, y)

# Second probability column, scored on the same rows the model was fitted on
# (an in-sample ROC AUC, not a hold-out estimate).
pred_cb = estimator.predict_proba(X)[:, 1]
roc_auc_score(y, pred_cb)

0.9099480151228734

In [6]:
# FIT -- XGBoost
estimator = XGBClassifier(
    n_estimators=250,
    learning_rate=0.01,
    max_depth=3,
    random_state=23,
)
estimator.fit(X, y)

# Second probability column, scored on the same rows the model was fitted on
# (an in-sample ROC AUC, not a hold-out estimate).
pred_xgb = estimator.predict_proba(X)[:, 1]
roc_auc_score(y, pred_xgb)

0.9364839319470699

In [7]:
# mix -- equal-weight average of the three models' per-row probabilities
stacked = np.stack([pred_lgbm, pred_cb, pred_xgb])
pred = stacked.mean(axis=0)
roc_auc_score(y, pred)

0.9237476370510397

__submit__

In [11]:
# PREDICT
# Writes a submission from a single model. `estimator` is whichever model was
# fitted most recently in the kernel, so the result depends on which FIT cell
# ran last.
submission = pd.read_csv('data/sample_submission.csv')

# Item assignment always writes the column; attribute assignment would only
# set a Python attribute if 'target' were missing from the sample file.
submission['target'] = estimator.predict_proba(X_test)[:, 1]

# Explicit timestamp pattern: the default str(datetime) contains a space and
# colons, and colons are not valid in Windows file names.
stamp = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
submission.to_csv(f'submission_{stamp}.csv', index=False)

In [78]:
# PREDICT MIX
# Refits three gradient-boosting models on the full training matrix and
# submits the equal-weight average of their test-set probabilities.
submission = pd.read_csv('data/sample_submission.csv')
estimators = (
    LGBMClassifier(n_estimators=250, learning_rate=0.01, max_depth=3, n_jobs=-1, random_state=11),
    CatBoostClassifier(iterations=250, learning_rate=0.01, depth=3, random_state=11, verbose=False, allow_writing_files=False),
    XGBClassifier(n_estimators=250, learning_rate=0.01, max_depth=3, random_state=23),
)

# One vector of second-column probabilities per model.
proba = []
for est in estimators:
    est.fit(X, y)
    proba.append(est.predict_proba(X_test)[:, 1])

# Item assignment always writes the column; attribute assignment would only
# set a Python attribute if 'target' were missing from the sample file.
submission['target'] = np.mean(proba, axis=0)

# Explicit timestamp pattern: the default str(datetime) contains a space and
# colons, and colons are not valid in Windows file names.
stamp = dt.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
submission.to_csv(f'submission_{stamp}.csv', index=False)

In [None]:
#