In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so the CSVs
# under "My Drive" can be read/written; this prompts for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


From previous experience, I realized that to get better performance we need to use the source dataset.

In [0]:
import pandas as pd

# Locations of the source train/test CSVs on the mounted Drive.
TRAIN_CSV = '/content/drive/My Drive/credit/trainDS.csv'
TEST_CSV = '/content/drive/My Drive/credit/testDS.csv'

train_ds = pd.read_csv(TRAIN_CSV)
test_ds = pd.read_csv(TEST_CSV)

In [0]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, recall_score
import lightgbm as lgb
import gc
import numpy as np

def _fill_blank_codes(frame):
    """Return a copy of `frame` with blank (' ') feature4/feature9 entries
    replaced by sentinel values (-999 / -1) and both columns cast to int."""
    frame = frame.copy()
    frame['feature4'] = frame['feature4'].replace(' ', -999).astype(int)
    frame['feature9'] = frame['feature9'].replace(' ', -1).astype(int)
    return frame


def model(features, test_features, n_folds = 5, random_state = 42):
    """Train LightGBM with stratified K-fold CV and return its probabilities.

    Parameters
    ----------
    features : pandas.DataFrame
        Training frame, including the binary ``target`` column. It is not
        modified; a copy is returned with the out-of-fold predictions added.
    test_features : pandas.DataFrame
        Test frame with the same feature columns as ``features`` minus
        ``target``. It is not modified.
    n_folds : int, default 5
        Number of stratified folds.
    random_state : int or None, default 42
        Seed for the fold shuffling and the classifier. Pass ``None`` for the
        previous unseeded (non-reproducible) behaviour.

    Returns
    -------
    test_prediction : pandas.DataFrame
        Single column ``preds_1``: positive-class probability for each test
        row, averaged over the fold models.
    prediction : pandas.DataFrame
        Copy of ``features`` with an added ``preds_1`` column holding the
        out-of-fold positive-class probability of each training row.
    metrics : dict
        ``{'F1 Score': [f1], 'Recall': [recall]}`` computed on the out-of-fold
        predictions thresholded at 0.5.
    """
    # Use the frame that was passed in (previously the global `train_ds` was
    # read and the `features` argument was silently ignored).
    # A plain ndarray makes fold indexing positional, independent of the
    # DataFrame's index labels.
    labels = np.asarray(features['target'])
    prediction = features.copy()

    # Clean copies so neither caller frame is mutated.
    train_frame = _fill_blank_codes(features.drop(columns = 'target'))
    test_frame = _fill_blank_codes(test_features)

    print('Training Data Shape: ', train_frame.shape)
    print('Testing Data Shape: ', test_frame.shape)

    train_matrix = np.array(train_frame)
    test_matrix = np.array(test_frame)

    k_fold = StratifiedKFold(n_splits = n_folds, shuffle = True,
                             random_state = random_state)

    test_predictions = np.zeros(test_matrix.shape[0])
    out_of_fold = np.zeros(train_matrix.shape[0])

    for train_indices, valid_indices in k_fold.split(train_matrix, labels):

        train_features, train_labels = train_matrix[train_indices], labels[train_indices]
        valid_features, valid_labels = train_matrix[valid_indices], labels[valid_indices]

        # Named `clf` so the estimator does not shadow this function's name.
        clf = lgb.LGBMClassifier(n_estimators=500, objective = 'binary',
                                 learning_rate = 0.05,
                                 reg_alpha = 0.3,  reg_lambda = 0.1,
                                 subsample = 1.0, n_jobs = -1,
                                 random_state = random_state)
        # NOTE(review): the `early_stopping_rounds` / `verbose` fit kwargs were
        # removed in LightGBM 4.0; switch to `callbacks=[lgb.early_stopping(...),
        # lgb.log_evaluation(...)]` when upgrading.
        # The first eval_set entry ('valid') is the one early stopping monitors.
        clf.fit(train_features, train_labels,
                eval_set = [(valid_features, valid_labels), (train_features, train_labels)],
                eval_names = ['valid', 'train'],
                early_stopping_rounds = 100,
                verbose = 100)

        best_iteration = clf.best_iteration_

        # Average the test probabilities over folds; each training row gets its
        # prediction from the one model that did not see it.
        test_predictions += clf.predict_proba(test_matrix, num_iteration = best_iteration)[:, 1] / k_fold.n_splits
        out_of_fold[valid_indices] = clf.predict_proba(valid_features, num_iteration = best_iteration)[:, 1]

        # Release the per-fold objects before the next fold is trained.
        del clf, train_features, valid_features
        gc.collect()

    test_prediction = pd.DataFrame({'preds_1': test_predictions})
    prediction['preds_1'] = out_of_fold

    hard_labels = out_of_fold > 0.5
    f1 = f1_score(labels, hard_labels)
    recall = recall_score(labels, hard_labels)

    metrics = {'F1 Score': [f1], 'Recall': [recall]}

    return test_prediction, prediction, metrics

In [4]:
# Run the cross-validated LightGBM: returns the test-set probabilities, the
# training frame with out-of-fold probabilities added, and the F1/recall metrics.
test_prediction, prediction, metrics = model(train_ds, test_ds)

Training Data Shape:  (73276, 10)
Testing Data Shape:  (31405, 10)
Training until validation scores don't improve for 100 rounds.
[100]	train's binary_logloss: 0.164074	valid's binary_logloss: 0.176633
Early stopping, best iteration is:
[95]	train's binary_logloss: 0.164664	valid's binary_logloss: 0.176592
Training until validation scores don't improve for 100 rounds.
[100]	train's binary_logloss: 0.162716	valid's binary_logloss: 0.181773
Early stopping, best iteration is:
[91]	train's binary_logloss: 0.163929	valid's binary_logloss: 0.181746
Training until validation scores don't improve for 100 rounds.
[100]	train's binary_logloss: 0.163989	valid's binary_logloss: 0.176874
[200]	train's binary_logloss: 0.154198	valid's binary_logloss: 0.177232
Early stopping, best iteration is:
[114]	train's binary_logloss: 0.162383	valid's binary_logloss: 0.176825
Training until validation scores don't improve for 100 rounds.
[100]	train's binary_logloss: 0.164414	valid's binary_logloss: 0.176011
[2

In [5]:
# F1 and recall of the out-of-fold predictions at a 0.5 probability threshold.
metrics

{'F1 Score': [0.29344465058750774], 'Recall': [0.19391091131998364]}

A little bit better than Random Forest, so we will use the predicted probabilities as a **meta-feature** for the final model.

In [0]:
# Persist the LightGBM probabilities (train frame with `preds_1`, and the
# test `preds_1` column) to Drive, with the row index written as `idx`.
train_out_path = '/content/drive/My Drive/credit/train_predictions_lgbm.csv'
test_out_path = '/content/drive/My Drive/credit/test_predictions_lgbm.csv'

prediction.to_csv(train_out_path, index_label='idx')
test_prediction.to_csv(test_out_path, index_label='idx')