In [1]:
# Install the LightGBM Python package from the sibling source checkout; the log below shows
# it being copied into the system site-packages.
# NOTE(review): a `!` command presumably runs in its own subshell, so the trailing `cd` back
# to the project directory should not affect the notebook's working directory — confirm.
! cd ../../../LightGBM/python-package && sudo python setup.py install --precompile && \
    cd ../../wec/ticket-upgrade-prediction

INFO:root:running install
INFO:root:running build
INFO:root:running build_py
INFO:root:running egg_info
INFO:root:writing lightgbm.egg-info/PKG-INFO
INFO:root:writing dependency_links to lightgbm.egg-info/dependency_links.txt
INFO:root:writing requirements to lightgbm.egg-info/requires.txt
INFO:root:writing top-level names to lightgbm.egg-info/top_level.txt
INFO:root:reading manifest file 'lightgbm.egg-info/SOURCES.txt'
INFO:root:reading manifest template 'MANIFEST.in'
INFO:root:writing manifest file 'lightgbm.egg-info/SOURCES.txt'
INFO:root:copying lightgbm/VERSION.txt -> build/lib/lightgbm
INFO:root:running install_lib
INFO:root:copying build/lib/lightgbm/VERSION.txt -> /usr/lib/python3.8/site-packages/lightgbm
INFO:LightGBM:Installing lib_lightgbm from: ['/home/ubuntu/project/LightGBM/lib_lightgbm.so']
INFO:root:running install_egg_info
INFO:root:removing '/usr/lib/python3.8/site-packages/lightgbm-3.3.3.99-py3.8.egg-info' (and everything under it)
INFO:root:Copying lightgbm.egg-info

In [22]:
from pathlib import Path

import lightgbm as lgbm
import numpy as np
import pandas as pd
from sklearn.feature_selection import RFE, SequentialFeatureSelector
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [2]:
# Dataset location: <parent of the current working directory>/data/dataset.csv.
df_path = Path.cwd().parents[0].joinpath("data", "dataset.csv")

In [3]:
# Load the full dataset from the CSV at `df_path` into a DataFrame.
df = pd.read_csv(df_path)

In [4]:
# Separate the target column (UPGRADED_FLAG) from the feature matrix.
X = df.drop(columns=["UPGRADED_FLAG"])
y = df["UPGRADED_FLAG"]

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified although the LightGBM training log shows very
# few positives — consider whether `stratify=y` is wanted here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [39]:
# Baseline model: LightGBM classifier with 1000 estimators and otherwise default settings,
# fitted on the training split.
model = lgbm.LGBMClassifier(n_estimators=1000)
model.fit(X=X_train, y=y_train)

[LightGBM] [Info] Number of positive: 7061, number of negative: 7363690
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1613
[LightGBM] [Info] Number of data points in the train set: 7370751, number of used features: 38
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000958 -> initscore=-6.949730
[LightGBM] [Info] Start training from score -6.949730


In [34]:
# Score the hold-out set with the LightGBM model: take column 1 of predict_proba
# (the positive-class probability) and evaluate it with ROC AUC.
# NOTE(review): the execution counts show this cell ran (In [34]) before the most recent
# model fit (In [39]), so the AUC displayed below may not belong to the current `model`.
proba = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, proba)

In [35]:
# Display the hold-out ROC AUC computed for the LightGBM model.
auc

0.9491658051079647

In [18]:
# Second candidate: XGBoost classifier with 100 estimators, trained with the GPU histogram
# tree method on device 0; verbosity=2 turns on XGBoost's informational logging.
xgb_model = XGBClassifier(
    n_estimators=100,
    tree_method="gpu_hist",
    gpu_id=0,
    verbosity=2,
)
xgb_model.fit(X_train, y_train)


In [46]:
# Score the hold-out set with the XGBoost model: positive-class probability
# (column 1 of predict_proba) evaluated with ROC AUC.
proba_xgb = xgb_model.predict_proba(X_test)[:, 1]
auc_xgb = roc_auc_score(y_test, proba_xgb)

In [47]:
# Display the hold-out ROC AUC computed for the XGBoost model.
auc_xgb

0.9744132249804485

In [51]:
# Number of feature columns in the training matrix.
features_len = len(X_train.columns)

In [15]:
# Small slice (first 1000 rows) of features and target for quick experiments.
# NOTE(review): given the class balance reported in the LightGBM training log, the first
# 1000 rows may contain very few positives — confirm the sample is usable for
# cross-validated ROC AUC.
X_sample = X.iloc[:1000]
y_sample = y.iloc[:1000]

In [19]:
# Backward sequential feature selection around a GPU XGBoost classifier, scored by
# 5-fold cross-validated ROC AUC.
# NOTE(review): ROC AUC is bounded by 1, so a score change can never reach `tol=2`; with
# n_features_to_select="auto" the search presumably stops almost immediately — confirm
# the intended tolerance before fitting.
sfs = SequentialFeatureSelector(
    estimator=XGBClassifier(tree_method="gpu_hist", verbosity=2),
    n_features_to_select="auto",
    tol=2,
    direction="backward",
    scoring="roc_auc",
    cv=5,
)


In [21]:
# The selector fit is left commented out, so `sfs` is never fitted in the cells shown here.
# sfs.fit(X=X_sample, y=y_sample)

In [28]:
def _get_auc(model, X_test: pd.DataFrame, y_test: pd.Series) -> float:
    """Return the ROC AUC of ``model`` on the given evaluation data.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_test: Feature rows to score.
        y_test: True labels for ``X_test``.

    Returns:
        ROC AUC computed from column 1 of ``predict_proba`` (the score used
        for the positive class).
    """
    positive_scores = model.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, positive_scores)


def permutation_feature_importance(
    model, X_test: pd.DataFrame, y_test: pd.Series, random_state=None
) -> dict:
    """Rank features by the ROC AUC lost when each one is shuffled.

    For every column of ``X_test`` the column's values are randomly permuted
    while all other columns stay intact, the model is re-scored, and the drop
    ``baseline_auc - permuted_auc`` is recorded.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_test: Evaluation features. The frame is not modified.
        y_test: True labels for ``X_test``.
        random_state: Optional seed (anything accepted by
            ``numpy.random.default_rng``) that makes the shuffles
            reproducible. ``None`` keeps the shuffles non-deterministic.

    Returns:
        Dict mapping each column name to its AUC drop, ordered from the
        smallest drop (least important) to the largest.
    """
    rng = np.random.default_rng(random_state)
    baseline = _get_auc(model=model, X_test=X_test, y_test=y_test)

    # One working copy is reused for all features: a column is shuffled, scored and
    # then restored, instead of deep-copying the whole frame once per feature.
    X_changed = X_test.copy()
    features_score_diff = {}

    # Iterate over the columns of the frame that was passed in, not a notebook-global
    # frame, so the function works for any evaluation set.
    for col in X_test.columns:
        X_changed[col] = rng.permutation(X_test[col].values)
        permuted_auc = _get_auc(model=model, X_test=X_changed, y_test=y_test)
        features_score_diff[col] = baseline - permuted_auc
        # Put the original values back before the next feature is shuffled.
        X_changed[col] = X_test[col].values

    # Ascending by AUC drop; dicts preserve this insertion order.
    return dict(sorted(features_score_diff.items(), key=lambda item: item[1]))


In [29]:
# Permutation importance of every feature for the XGBoost model on the hold-out set.
permutations = permutation_feature_importance(xgb_model, X_test, y_test)


In [30]:
# Display the per-feature AUC drops, ordered from smallest to largest.
permutations

{'BOOKING_DOMESTIC_FLAG': 0.0,
 'is_sus_currency': 0.0,
 'FLIGHT_RANGE_LONG-HAUL': 0.0,
 'FLIGHT_RANGE_SHORT-HAUL': 0.0,
 'FLIGHT_RANGE_UNKNOWN': 0.0,
 'PAX_TYPE_INFANT': 0.0,
 'SALES_CHANNEL_DCS': 0.0,
 'TRIP_TYPE_ONE WAY': 5.7375782791524443e-08,
 'PAX_TYPE_CHILD': 1.2348266276163145e-07,
 'VAB_SEMI-FLEX': 2.4097049150739736e-05,
 'SALES_CHANNEL_ATO_CTO': 2.7208593972316564e-05,
 'SALES_CHANNEL_LOT TRAVEL': 6.141017327987441e-05,
 'CORPORATE_CONTRACT_FLG': 7.573229119905989e-05,
 'VAB_FULL-FLEX': 0.00019063400019081467,
 'BOOKING_LONG_HOUL_FLAG': 0.0002329371023734872,
 'SALES_CHANNEL_CALL CENTER': 0.0002334873174195673,
 'is_sus_payment': 0.00036422973162109074,
 'VAB_STANDARD': 0.0004016265807375019,
 'VAB_FLEX': 0.0004236705747088232,
 'PAX_GENDER': 0.0005560789134692046,
 'LOYAL_CUSTOMER': 0.0005574002715069692,
 'VAB_OTHER': 0.0005635744046400548,
 'SALES_CHANNEL_LOT.COM': 0.0006189811689227831,
 'same_carrier': 0.0006592563178310362,
 'TRIP_TYPE_ROUND TRIP': 0.00071419768344216