# Out-Of-Fold

References

* *https://www.kaggle.com/adarshchavakula/out-of-fold-oof-model-cross-validation/data*
* *https://machinelearningmastery.com/k-fold-cross-validation/*
* *https://www.kaggle.com/adarshchavakula/out-of-fold-oof-model-cross-validation*

## Examples

### Scikit-Learn OOF

```python
"""
Use Scikit Learn's cross_val_predict to do a Out-of-Fold Cross validation as opposed 
to averaging out the scores on each fold.
This **usually** tends to be more stable/reliable compared to within fold average.

This script works for all Scikit Learn models as well as the Scikit Learn APIs of
XGBoost, LightGBM and Keras.
"""

import numpy as np 
import pandas as pd 
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict

# Read Data
print("Reading Dataset...")
train = pd.read_csv("../input/train.csv")
target = np.array(train["target"])
target_log = np.log1p(target) # Log transform target as the evaluation metric uses it
xtrain = np.array(train.iloc[:,2:])
print("Shape of training data: {}".format(np.shape(xtrain)))

# Define Model 
xgb_model = XGBRegressor(max_depth=6, learning_rate=0.1, n_estimators=70,
                         min_child_weight=100, subsample=1.0, 
                         colsample_bytree=0.8, colsample_bylevel=0.8,
                         random_state=42, n_jobs=4)

# Make OOF predictions using 5 folds
print("Cross Validating...")
oof_preds_log = cross_val_predict(xgb_model, xtrain, target_log, cv=5, 
                                  n_jobs=1, method="predict")
                                  
# Calculate RMSLE (RMSE of Log(1+y))
cv_rmsle = np.sqrt(mean_squared_error(target_log, oof_preds_log))
print("\nOOF RMSLE Score: {:.4f}".format(cv_rmsle))

```

### 自己实现OOF

```py
# Hand-rolled out-of-fold loop: for every fold, fit on the training part,
# keep the predictions for the held-out part (OOF), and accumulate the
# fold-averaged prediction for the test set.

# Name of the label column.
ycol = 'target'

# Every column of df_train is used as a feature except the label and the
# columns listed here.
excluded_cols = {
    ycol, 'timestamp', 'ts', 'id', 'day', 'hour', 'minute', 'ts_datetime',
    'minute10', 'personidentification', 'level', 'followscore',
    'personalscore', 'gender', 'hourl', 'group',
}
feature_names = [col for col in df_train.columns if col not in excluded_cols]

# n_estimators is effectively unbounded; the early stopping configured in
# fit() below is what actually ends training.
model = lgb.LGBMClassifier(num_leaves=64,
                           max_depth=10,
                           learning_rate=0.4,
                           n_estimators=10000000,
                           subsample=0.8,
                           feature_fraction=0.8,
                           reg_alpha=0.5,
                           reg_lambda=0.5,
                           random_state=seed,
                           metric='auc'
                           )

# One frame (id, label, pred) per fold; concatenated later for evaluation.
oof = []

# .copy() gives an independent frame, so adding the 'target' column does not
# write into a slice of df_test (avoids pandas' SettingWithCopyWarning).
prediction = df_test[['id']].copy()
# Float accumulator for the fold-averaged test probabilities.
prediction['target'] = 0.0

df_importance_list = []

# random_state only has an effect when shuffle=True; current scikit-learn
# raises ValueError when it is passed together with shuffle=False, so it is
# omitted here. The resulting (unshuffled) splits are deterministic.
kfold = StratifiedKFold(n_splits=n_fold, shuffle=False)

# NOTE: the folds are stratified on the 'day' column, not on the label.
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train[feature_names], df_train['day'])):
    df_trn = df_train.iloc[trn_idx]
    df_val = df_train.iloc[val_idx]

    X_train = df_trn[feature_names]
    Y_train = df_trn[ycol]

    X_val = df_val[feature_names]
    Y_val = df_val[ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    # fit() returns the estimator itself, so the same `model` object is
    # re-fitted from scratch on every fold.
    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() arguments are
    # tied to older LightGBM releases - confirm against the installed version.
    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=100,
                          eval_metric='auc',
                          early_stopping_rounds=50)

    # Positive-class probability on the held-out fold, at the best iteration.
    pred_val = lgb_model.predict_proba(
        X_val, num_iteration=lgb_model.best_iteration_)[:, 1]
    df_oof = df_val[['id', ycol]].copy()
    df_oof['pred'] = pred_val
    oof.append(df_oof)

    # Each fold contributes 1/n_fold of the final test prediction.
    pred_test = lgb_model.predict_proba(
        df_test[feature_names], num_iteration=lgb_model.best_iteration_)[:, 1]
    prediction['target'] += pred_test / n_fold

    # Per-fold feature importances, collected for later aggregation.
    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

```

**OOF evaluation**

```py
# Stack the per-fold validation frames into a single out-of-fold frame.
df_oof = pd.concat(oof)

# Rank-based cutoff: a row is labelled positive when the rank of its
# prediction is at or above this fraction of the total row count.
# NOTE(review): the origin of this constant is not shown here - confirm how
# it was chosen before reusing it on other data.
positive_rank_fraction = 0.8934642948637943
rank_cutoff = df_oof.shape[0] * positive_rank_fraction
df_oof['pred_bin'] = (df_oof['pred'].rank() >= rank_cutoff).astype(int)

# ROC AUC measures ranking quality, so it is scored on the predicted
# probabilities; scoring it on the 0/1 labels would throw the ranking away.
auc = roc_auc_score(df_oof['target'], df_oof['pred'])
# F1 needs hard labels, so it uses the thresholded column.
f1 = f1_score(df_oof['target'], df_oof['pred_bin'])

print('f1:', f1)
print('auc:', auc)
```