Imports

In [157]:
from pathlib import Path

import pandas as pd
from category_encoders.cat_boost import CatBoostEncoder
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import cross_val_predict, train_test_split

from metrics import uplift_score
from train import uplift_fit_predict
from utils import timer

Model config

In [134]:
# Keyword arguments forwarded to every LGBMClassifier built in this notebook.
lgb_params = dict(
    learning_rate=0.01,
    max_depth=6,
    num_leaves=20,
    min_data_in_leaf=3,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.01,
    max_bin=416,
    bagging_freq=3,
    reg_lambda=0.01,
    n_estimators=600,
    application="binary",
    n_jobs=-1,
)

Load data

In [3]:
# read data
# Load the train table, the test table and the precomputed feature table,
# each indexed by client_id; the three reads are timed together.
print("start loading data...")
with timer("loading data"):
    df_train, df_test, df_features = (
        pd.read_csv(csv_path, index_col="client_id")
        for csv_path in (
            "data/uplift_train.csv",
            "data/uplift_test.csv",
            "data/df_features.csv",
        )
    )

start loading data...
[loading data] done in 176 s


In [13]:
# Feature columns that are removed from the feature table before modelling.
drop_features = [
    "first_issue_date",
    "first_redeem_date",
    "issue_redeem_delay",
    "first_issue_date_hour",
    "age_antinorm",
]

# errors="ignore" keeps this cell re-runnable: df_features is reassigned in
# place, so on a second execution the columns are already gone and a plain
# drop() would raise KeyError.
df_features = df_features.drop(columns=drop_features, errors="ignore")

TF-IDF on product columns

In [69]:
%%time
# Every feature whose name contains "product_" is re-weighted with TF-IDF.
product_cols = [name for name in df_features.columns if "product_" in name]

# Missing values are replaced by 0 before the transform.
df_products = df_features[product_cols].fillna(0)

tfidf = TfidfTransformer()
tfidf_array = tfidf.fit_transform(df_products.values).toarray()

# Write each transformed column back over the original one
# (column order of tfidf_array follows product_cols).
for name, weights in zip(product_cols, tfidf_array.T):
    df_features[name] = weights

In [92]:
%%time
# Remove features with at most one distinct value; they carry no signal.
# nunique() skips NaN by default, so "one value plus NaN" also counts as constant.
distinct_counts = df_features.nunique()
constant_cols = df_features.columns[distinct_counts <= 1]
df_features = df_features.drop(columns=constant_cols)

CPU times: user 2min 1s, sys: 3min 14s, total: 5min 16s
Wall time: 7min 28s


Categorical Features Encoding

In [105]:
# Treat every non-product feature with at most 10 distinct values as categorical.
# errors="ignore": product columns that an earlier cell already removed (for
# example as constant) are no longer in df_features, and a strict drop() would
# raise KeyError on them.
# The frame is built once instead of twice, since drop() copies the data.
non_product_features = df_features.drop(columns=product_cols, errors="ignore")
cat_cols = non_product_features.columns[non_product_features.nunique() <= 10]

In [99]:
# cross-validation indices
indices_train = df_train.index
indices_test = df_test.index

indices_learn, indices_valid = train_test_split(
    indices_train, test_size=0.3, random_state=123
)

In [121]:
%%time
# Target-encode the low-cardinality columns with a CatBoost encoder fitted on
# the labelled (train) rows, then overwrite those columns in df_features.
#
# NOTE(review): the encoder is fitted on indices_train, which contains
# indices_valid, so validation targets influence the encoded features and the
# hold-out score may be optimistic -- confirm this is acceptable.
# NOTE(review): no `cols=` is passed; category_encoders then picks columns by
# dtype -- confirm the columns in cat_cols are actually being encoded.
encoder = CatBoostEncoder(random_state=42)

encoder.fit(
    df_features.loc[indices_train, cat_cols].fillna(0),
    pd.DataFrame(df_train.loc[indices_train, "target"]),
)

# Apply the same fillna(0) as at fit time; otherwise missing values are seen as
# the category 0 during fit but as NaN during transform and get a different
# encoding.
df_cats = encoder.transform(df_features.loc[:, cat_cols].fillna(0))

for c in cat_cols:
    df_features[c] = df_cats[c]

CPU times: user 247 ms, sys: 44.2 ms, total: 291 ms
Wall time: 291 ms


Cross-validation

In [161]:
%%time
# Fit on the learn split and predict uplift for the validation split.
print("start training the models...")
learn_rows = df_train.loc[indices_learn]
valid_uplift = uplift_fit_predict(
    model=LGBMClassifier(**lgb_params),
    X_train=df_features.loc[indices_learn].fillna(0).values,
    treatment_train=learn_rows["treatment_flg"].values,
    target_train=learn_rows["target"].values,
    X_test=df_features.loc[indices_valid].fillna(0).values,
)

start training the models...
fitting treatment model...
fitting control model...
predicting treatment and control...
predicting uplift...
CPU times: user 14min 24s, sys: 1min 15s, total: 15min 40s
Wall time: 3min 51s


In [162]:
# Score the validation predictions against the held-out treatment flags and targets.
valid_rows = df_train.loc[indices_valid]
valid_score = uplift_score(
    valid_uplift,
    treatment=valid_rows["treatment_flg"].values,
    target=valid_rows["target"].values,
)
print(f"Validation score: {valid_score:.4f}")

Validation score: 0.0807


In [163]:
%%time
# predict test
# Refit on every labelled row and predict uplift for the test rows.
train_rows = df_train.loc[indices_train]
test_uplift = uplift_fit_predict(
    model=LGBMClassifier(**lgb_params),
    X_train=df_features.loc[indices_train].fillna(0).values,
    treatment_train=train_rows["treatment_flg"].values,
    target_train=train_rows["target"].values,
    X_test=df_features.loc[indices_test].fillna(0).values,
)

fitting treatment model...
fitting control model...
predicting treatment and control...


KeyboardInterrupt: 

Save results

In [164]:
# Write the test-set uplift predictions to submissions/, with the validation
# score embedded in the file name.
# NOTE(review): the recorded output of the test-prediction cell ends in
# KeyboardInterrupt, so test_uplift may be left over from an earlier run --
# re-run that cell to completion before saving.
submission_dir = Path("submissions")
# Create the output directory up front so a fresh checkout does not fail here,
# after the long training cells have already run.
submission_dir.mkdir(parents=True, exist_ok=True)

df_submission = pd.DataFrame({"uplift": test_uplift}, index=df_test.index)
df_submission.to_csv(submission_dir / f"submission_{valid_score:.4f}.csv")