In [14]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import sklearn.preprocessing as pre
from transforming import WithSelected

from sklearn.metrics import f1_score, roc_auc_score, ConfusionMatrixDisplay
# from sklearn.linear_model import SGDClassifier

__loading__

In [4]:
source = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [None]:
# class MeanBy(BaseEstimator, TransformerMixin):
#     def __init__(self, by=None):
#         self.__mean = None
#         self.__basic_mean = None
#         self.__by = by
    
#     def fit(self, X, y=None):
#         df = X.copy()
#         self.__mean = df.groupby(self.__by).mean()
#         self.__basic_mean = df.mean()
#         return self

#     def transform(self, X, y=None):
#         X = X.copy()
#         if self.__by in X.columns:
#             return X.merge(self.__mean, on=self.__by, suffixes=('', '_mean'))
#         else:
#             exists = self.__basic_mean[self.__basic_mean.index.isin(X.columns)]
#             X[[f'{f}_mean' for f in exists.index]] = exists
#             return X

__preprocessing__

In [48]:
X_source = source.drop(columns=['id', 'target'])
y = source.target

pipe = make_pipeline(
    WithSelected(['gravity'], suffix='bins')(
        pre.KBinsDiscretizer(5, encode='ordinal', strategy='quantile')
    ),
    WithSelected(['ph'], suffix='bins')(
        pre.KBinsDiscretizer(10, encode='ordinal', strategy='quantile')
    ),
    WithSelected(['osmo'], suffix='bins')(
        pre.KBinsDiscretizer(5, encode='ordinal', strategy='quantile')
    ),
    WithSelected(['cond'], suffix='bins')(
        pre.KBinsDiscretizer(7, encode='ordinal', strategy='quantile')
    ),
    WithSelected(['urea'], suffix='bins')(
        pre.KBinsDiscretizer(7, encode='ordinal', strategy='quantile')
    ),
    WithSelected(['calc'], suffix='bins')(
        pre.KBinsDiscretizer(5, encode='ordinal', strategy='quantile')
    )
)

X = pipe.fit_transform(X_source, y)
X_test = pipe.transform(test)

X.head()

Unnamed: 0,gravity,ph,osmo,cond,urea,calc,gravity_bins,ph_bins,osmo_bins,cond_bins,urea_bins,calc_bins
0,1.013,6.19,443,14.8,124,1.45,1.0,7.0,1.0,1.0,0.0,1.0
1,1.025,5.4,703,23.6,394,4.18,4.0,1.0,2.0,4.0,5.0,3.0
2,1.009,6.13,371,24.5,159,9.04,0.0,6.0,0.0,4.0,1.0,4.0
3,1.021,4.91,442,20.8,398,6.63,3.0,0.0,1.0,2.0,5.0,3.0
4,1.021,5.53,874,17.8,385,2.21,3.0,3.0,4.0,2.0,5.0,1.0


__fit & validate__

In [91]:
# FIT
folds = 7
# X = source.drop(columns=['id', 'target'])     # basic dataframe
X = pipe.fit_transform(X_source, y)


metrics = []
estimator = LGBMClassifier(n_estimators=250, max_depth=1, n_jobs=-1, random_state=11)
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=19)
for f, (train, valid) in enumerate(kf.split(X, y)):
    estimator.fit(X.loc[train], y[train])
    
    pt = estimator.predict(X.loc[train])
    pv = estimator.predict(X.loc[valid])

    auc_train = roc_auc_score(pt, y[train])
    auc_valid = roc_auc_score(pv, y[valid])
    f1_train = f1_score(pt, y[train])
    f1_valid = f1_score(pv, y[valid])
    metrics.append((auc_train, f1_train, auc_valid, f1_valid))
    print(f'Fold: {f}; TRAIN ROC AUC={auc_train:.5f}; TRAIN f1={f1_train:.5f} | VALID ROC AUC={auc_valid:.5f}; VALID f1={f1_valid:.5f}')
means = np.mean(metrics, axis=0)
print(f'MEAN: TRAIN ROC AUC={means[0]:.5f}; TRAIN f1={means[1]:.5f} | VALID ROC AUC={means[2]:.5f}; VALID f1={means[3]:.5f}')

Fold: 0; TRAIN ROC AUC=0.78439; TRAIN f1=0.74834 | VALID ROC AUC=0.69918; VALID f1=0.64000
Fold: 1; TRAIN ROC AUC=0.78423; TRAIN f1=0.75325 | VALID ROC AUC=0.80098; VALID f1=0.75000
Fold: 2; TRAIN ROC AUC=0.78065; TRAIN f1=0.75399 | VALID ROC AUC=0.74937; VALID f1=0.68085
Fold: 3; TRAIN ROC AUC=0.79000; TRAIN f1=0.75974 | VALID ROC AUC=0.74248; VALID f1=0.71698
Fold: 4; TRAIN ROC AUC=0.79486; TRAIN f1=0.77070 | VALID ROC AUC=0.77706; VALID f1=0.74510
Fold: 5; TRAIN ROC AUC=0.78087; TRAIN f1=0.75241 | VALID ROC AUC=0.81740; VALID f1=0.80702
Fold: 6; TRAIN ROC AUC=0.77818; TRAIN f1=0.74510 | VALID ROC AUC=0.69505; VALID f1=0.64000
MEAN: TRAIN ROC AUC=0.78474; TRAIN f1=0.75479 | VALID ROC AUC=0.75450; VALID f1=0.71142


__features selection__

In [None]:
# TODO

In [41]:
# # make shadow features by randomly permuting each column of X
# np.random.seed(42)
# X_shadow = X.apply(np.random.permutation)
# X_shadow.columns = ['shadow_' + feat for feat in X.columns]
# X_boruta = pd.concat([X, X_shadow], axis = 1)

In [46]:
# metrics = []
# estimator = LGBMClassifier(n_jobs=-1, random_state=11)
# kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=19)
# for f, (train, valid) in enumerate(kf.split(X_boruta, y)):
#     estimator.fit(X_boruta.loc[train], y[train])
#     p = estimator.predict(X_boruta.loc[valid])
#     auc = roc_auc_score(p, y[valid])
#     f1 = f1_score(p, y[valid])
#     metrics.append((auc, f1))
#     print(f'Fold: {f}; ROC AUC={auc:.5f}; f1={f1:.5f}')
# means = np.mean(metrics, axis=1)
# print(f'MEAN: ROC AUC={means[0]:.5f}; f1={means[1]:.5f}')

In [47]:
# feat_imp_X = estimator.feature_importances_[:len(X.columns)]
# feat_imp_shadow = estimator.feature_importances_[len(X.columns):]
# hits = feat_imp_X > feat_imp_shadow.max()
# hits

In [None]:
#