In [1]:
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

# Single seed reused below for the LightGBM params, the CatBoost model and the KFold shuffle.
SEED = 47

In [2]:
# Load the training data, the test data and the sample submission from the input directory.
train = pd.read_csv('../input/techuklon-int20h/train.csv')
test = pd.read_csv('../input/techuklon-int20h/test.csv')
# NOTE(review): 'Samle_Submission.csv' is presumably the file's real name in the dataset —
# verify before "correcting" the spelling.
ss = pd.read_csv('../input/techuklon-int20h/Samle_Submission.csv')

# Preprocessing

Since we need to predict churn independently of the period, we decided to try 2 preprocessing options:
* Use all training records and predict for each Id and Week in the test set. Then, for the final forecast, aggregate the weekly predictions for each Id.
* (This seemed better to us) First, aggregate the features to the Id level (removing the dependence on the period) to get a set of statistics for each feature, for example the mean and standard deviation. Using the resulting set of features, we solve an ordinary binary classification problem.

In [3]:
# Per-Id summary statistics computed for every feature column.
agg = ['mean', 'std', 'min', 'max', 'sum', 'median']

# Collapse the weekly rows to one row per Id. The result has MultiIndex columns
# of the form (feature, statistic), ordered feature by feature.
train_grouped = train.drop(columns=['target', 'Week']).groupby(['Id']).agg(agg)

# Flatten the names from the aggregated frame itself instead of slicing
# `train.columns` by position, so a different column order in the CSV cannot
# silently mislabel the features.
train_grouped.columns = [f'{col}_{stat}' for col, stat in train_grouped.columns]
train_X = train_grouped.reset_index()

# An Id's label is the maximum `target` over its rows; attach it by Id rather
# than relying on two separate groupby results sharing the same row order.
train_X['target'] = train_X['Id'].map(train.groupby(['Id'])['target'].max())

In [4]:
# Feature matrix: every aggregated column except the label and the Id key.
X = train_X.drop(columns=['target', 'Id'])
# Label vector, one value per Id.
y = train_X['target']

# Modeling

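For training, validation and forecasting, we use the following function. On every fold it makes out-of-fold predictions on the held-out part to score the model and, when `oof=True`, also predicts on the test set; the per-fold test predictions are then averaged.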

**Out-of-fold predictions** were used as the prediction method. An out-of-fold prediction is a prediction made by the model during the k-fold cross-validation procedure. That is, out-of-fold predictions are the predictions made on the holdout folds during the resampling procedure, so there is one prediction for each example in the training dataset.

In [5]:
def calc(X, y, X_test, model, cv, oof):
    """Cross-validate ``model`` and optionally average its predictions on ``X_test``.

    For every split yielded by ``cv.split(X, y)`` the model is refit on the
    training rows and scored with ROC AUC on the held-out rows. Each fold
    score and the final average are printed.

    Parameters
    ----------
    X : feature table, sliced positionally with ``.iloc``.
    y : labels, sliced positionally with ``.iloc``.
    X_test : features predicted after each fold's fit (only read when ``oof`` is truthy).
    model : estimator exposing ``fit`` and ``predict_proba``; it is refit on every fold.
    cv : splitter exposing ``split(X, y)``.
    oof : when truthy, store column 1 of ``predict_proba(X_test)`` for every fold.
        Despite the name, these are test-set predictions, not held-out ones.

    Returns
    -------
    tuple
        ``(mean of the fold ROC AUC scores, row-wise mean of the per-fold
        X_test predictions)``. When ``oof`` is falsy the second item is the
        row-wise mean of an empty frame.
    """
    fold_scores = []
    test_probs = pd.DataFrame()

    for fold_no, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        # Refit the same estimator object on this fold's training rows.
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        valid_probs = model.predict_proba(X.iloc[valid_idx])[:, 1]

        if oof:
            # One column per fold; the columns are averaged on return.
            test_probs['fold_%i' % fold_no] = model.predict_proba(X_test)[:, 1]

        fold_auc = roc_auc_score(y.iloc[valid_idx], valid_probs)
        print(f'{fold_no} Fold: {fold_auc:.4f}')
        fold_scores.append(fold_auc)

    avg_auc = np.mean(fold_scores)
    print(f'AVG score: {avg_auc:.4f}')
    return avg_auc, test_probs.mean(axis=1)

In [6]:
# Collapse the weekly test rows to one row per Id with the same statistics used
# for the training frame (`agg`).
test_grouped = test.drop(columns=['Week']).groupby(['Id']).agg(agg)

# Flatten the (feature, statistic) MultiIndex using the test frame's own columns
# rather than `train.columns`, so the labels always describe the data they name.
test_grouped.columns = [f'{col}_{stat}' for col, stat in test_grouped.columns]
test_X = test_grouped.reset_index()

The set of hyperparameters for LGBM was selected using **Optuna**. You can see how to use it in our other [notebook](https://www.kaggle.com/imgremlin/4th-place-in-fraud-detection-from-zindi).

In [7]:
# LightGBM hyperparameters, unpacked into LGBMClassifier below.
# NOTE(review): LightGBM's docs state that bagging (`subsample`) is only applied
# when `subsample_freq` is non-zero, which is not set here — confirm the intent.
best_lgbm = dict(
    n_estimators=991,
    max_depth=3,
    learning_rate=0.036386125120857434,
    subsample=0.014257208320520425,
    colsample_bytree=0.4113657182157471,
    min_child_weight=9,
    reg_alpha=0.4433813076766884,
    reg_lambda=2.871918517181243,
    random_state=SEED,
)

In [8]:
# 5-fold shuffled splitter shared by both models so their fold scores are comparable.
kfold = KFold(n_splits=5, shuffle=True, random_state=SEED)

# The two classifiers that are blended in the submission cell.
lgbm = LGBMClassifier(**best_lgbm)
cb = CatBoostClassifier(iterations=250, verbose=0, random_state=SEED)

In [9]:
# Cross-validate LightGBM and keep its fold-averaged test-set probabilities.
score, predict_lgbm = calc(
    X,
    y,
    test_X.drop(columns=['Id']),
    model=lgbm,
    cv=kfold,
    oof=True,
)

0 Fold: 0.9808
1 Fold: 0.9820
2 Fold: 0.9804
3 Fold: 0.9833
4 Fold: 0.9706
AVG score: 0.9794


In [10]:
# Cross-validate CatBoost and keep its fold-averaged test-set probabilities.
score, predict_cb = calc(
    X,
    y,
    test_X.drop(columns=['Id']),
    model=cb,
    cv=kfold,
    oof=True,
)

0 Fold: 0.9787
1 Fold: 0.9826
2 Fold: 0.9786
3 Fold: 0.9812
4 Fold: 0.9688
AVG score: 0.9780


# (Bonus) Magic

**Automated machine learning (AutoML)** is the process of automating the end-to-end process of applying machine learning to real-world problems. In a typical machine learning application, the typical stages (and sub-stages) of work are the following:

* Data preparation
    * data pre-processing
    * feature engineering
    * feature extraction
    * feature selection
* Model selection
* Hyperparameter optimization (to maximize the performance of the final model)

AutoML was proposed as an artificial intelligence-based solution to the ever-growing challenge of applying machine learning.

In [11]:
# import h2o
# from h2o.automl import H2OAutoML

# h2o.init()

In [12]:
# y = "target"
# x = list(train_X.columns) 
# x.remove(y)

In [13]:
# train_X = h2o.H2OFrame(train_X)
# train_X[y] = train_X[y].asfactor()

In [14]:
# # Increase max_runtime_secs the value for a better prediction.

# aml = H2OAutoML(max_runtime_secs = 60*2, sort_metric='AUC', stopping_metric='AUC' , stopping_rounds=100)
# aml.train(x = x, y = y, training_frame = train_X)

In [15]:
# aml.leaderboard

In [16]:
# test_X = test.drop(columns=['Week']).groupby(['Id']).agg(agg).reset_index()
# test_X.columns = ['Id'] + [f'{col}_{stat}' for col in train.columns[2:-1] for stat in agg]

In [17]:
# predict_automl = aml.leader.predict(h2o.H2OFrame(test_X.drop(columns=['Id']))).as_data_frame()['p1']

# Create a submission

In [18]:
# Blend the two models' test probabilities (weights 0.6 / 0.4).
# The AutoML term stays disabled: # + 0.3 * predict_automl
# Build a separate frame instead of adding 'Predicted' to `test_X`: mutating
# `test_X` would make a re-run of the earlier `calc(...)` cells pass the
# prediction column in as a feature.
submission = pd.DataFrame({
    'Id': test_X['Id'],
    'Predicted': 0.6 * predict_lgbm + 0.4 * predict_cb,
})
submission.to_csv('submit.csv', index=False)