In [1]:
import datetime as dt
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.pipeline import make_pipeline
import sklearn.preprocessing as pre
from transforming import WithSelected

In [2]:
source = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [13]:
# PREPROCESSING
X_source = source.drop(columns=['id', 'target'])
y = source.target

pipe = make_pipeline(
    WithSelected(['gravity'], suffix='bins')(
        pre.KBinsDiscretizer(5, encode='ordinal', strategy='quantile')
    ),
    WithSelected(['ph'], suffix='bins')(
        pre.KBinsDiscretizer(10, encode='ordinal', strategy='quantile')
    ),
    WithSelected(['osmo'], suffix='bins')(
        pre.KBinsDiscretizer(5, encode='ordinal', strategy='quantile')
    ),
    WithSelected(['cond'], suffix='bins')(
        pre.KBinsDiscretizer(7, encode='ordinal', strategy='quantile')
    ),
    WithSelected(['urea'], suffix='bins')(
        pre.KBinsDiscretizer(7, encode='ordinal', strategy='quantile')
    ),
    WithSelected(['calc'], suffix='bins')(
        pre.KBinsDiscretizer(5, encode='ordinal', strategy='quantile')
    )
)

X = pipe.fit_transform(X_source, y)
X_test = pipe.transform(test.drop(columns=['id']))

In [9]:
# FIT
estimator = LGBMClassifier(n_estimators=250, max_depth=1, n_jobs=-1, random_state=11)
estimator.fit(X, y)

pred = estimator.predict(X)
roc_auc_score(pred, y)

0.7862290985969633

In [14]:
# PREDICT
submission = pd.read_csv('data/sample_submission.csv')
submission.target = estimator.predict_proba(X_test).T[1]
submission.to_csv(f'submission_{dt.datetime.now().replace(microsecond=0)}.csv', index=False)

In [None]:
#