In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn import metrics
from sklearn import model_selection

In [3]:
data_dir = Path.home() / 'Desktop/kaggle/quora'

[str(p) for p in data_dir.iterdir()]

['/usr/local/google/home/maekawa/Desktop/kaggle/quora/.ipynb_checkpoints',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/embeddings.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/train.csv.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/train_with_length.pickle',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/test_with_keywords.pickle',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/train_with_keywords.pickle',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/sample_submission.csv.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/test.csv.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/sample_resubmissions.csv',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/test_with_length.pickle']

In [4]:
examples_with_length = pd.read_pickle(data_dir / 'train_with_length.pickle')
examples_with_keywords = pd.read_pickle(data_dir / 'train_with_keywords.pickle')

In [5]:
examples_with_length.head()

Unnamed: 0,qid,target,question_chars,question_terms
0,00002165364db923c7e6,0,72,13
1,000032939017120e6e44,0,81,16
2,0000412ca6e4628ce2cf,0,67,10
3,000042bf85aa498cd78e,0,57,9
4,0000455dfa3e01eae3af,0,77,15


In [6]:
examples_with_keywords.head()

Unnamed: 0,qid,target,kw_blacks,kw_liberals,kw_whites,kw_feminists,kw_fuck,kw_democrats,kw_muslims,kw_hindus,...,kw_religious,kw_rude,kw_they,kw_kill,kw_justify,kw_rahul,kw_respect,kw_peaceful,kw_sexual,kw_america
0,00002165364db923c7e6,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,000032939017120e6e44,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0000412ca6e4628ce2cf,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,000042bf85aa498cd78e,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0000455dfa3e01eae3af,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
examples_concat = pd.concat(
    [examples_with_length.drop(columns=['qid', 'target']),
     examples_with_keywords.drop(columns=['qid', 'target'])],
    sort=False,
    axis=1)

In [8]:
examples_concat.head()

Unnamed: 0,question_chars,question_terms,kw_blacks,kw_liberals,kw_whites,kw_feminists,kw_fuck,kw_democrats,kw_muslims,kw_hindus,...,kw_religious,kw_rude,kw_they,kw_kill,kw_justify,kw_rahul,kw_respect,kw_peaceful,kw_sexual,kw_america
0,72,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,81,16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,67,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,57,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,77,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
X_all = examples_concat
y_all = examples_with_length.target

In [18]:
train_params = {}

train_meta = np.zeros(y_all.shape)

SEED = 42

splits = model_selection.StratifiedKFold(
    n_splits=5, shuffle=True, random_state=SEED
).split(X_all, y_all)

cv_scores = []

for idx, (train_idx, valid_idx) in enumerate(splits):
    X_train = X_all.iloc[train_idx]
    y_train = y_all.iloc[train_idx]
    X_val = X_all.iloc[valid_idx]
    y_val = y_all.iloc[valid_idx]

    model = lgb.train(train_params, lgb.Dataset(X_train, y_train))
    pred_val_y = model.predict(X_val)
    
    best_thresh = 0.5
    best_f1_score = 0.0
    precision_score = 0.0
    recall_score = 0.0
    for thresh in np.arange(0.1, 0.501, 0.01):
        thresh = np.round(thresh, 2)
        f1_score = metrics.f1_score(y_val, (pred_val_y > thresh).astype(int))
        if f1_score > best_f1_score:
            best_thresh = thresh
            best_f1_score = f1_score
            precision_score = metrics.precision_score(y_val, (pred_val_y > thresh).astype(int))
            recall_score = metrics.recall_score(y_val, (pred_val_y > thresh).astype(int))

    print("Val F1 Score: {:.4f} at threshold {} (Precision={:.4f}, Recall={:.4f})".format(
        best_f1_score, best_thresh, precision_score, recall_score))
    train_meta[valid_idx] = pred_val_y.reshape(-1)
    cv_scores.append(best_f1_score)

cv_scores

Val F1 Score: 0.5094 at threshold 0.23 (Precision=0.4814, Recall=0.5409)
Val F1 Score: 0.5116 at threshold 0.24 (Precision=0.4854, Recall=0.5407)
Val F1 Score: 0.5057 at threshold 0.22 (Precision=0.4699, Recall=0.5473)
Val F1 Score: 0.5131 at threshold 0.21 (Precision=0.4691, Recall=0.5661)
Val F1 Score: 0.5122 at threshold 0.23 (Precision=0.4811, Recall=0.5476)


[0.5094108734922208,
 0.511576174447534,
 0.5056735358847573,
 0.513079317015729,
 0.5122402916835466]

## Hyper parameter tuning with Optuna

In [11]:
import sys

sys.path.append('/usr/local/google/home/maekawa/.local/lib/python3.5/site-packages')

import optuna

In [None]:
import optuna

SEED = 42
kfold = model_selection.StratifiedKFold(
    n_splits=5, shuffle=True, random_state=SEED
)

def objective(trial):
    drop_rate = trial.suggest_uniform('drop_rate', 0, 1.0)
    feature_fraction = trial.suggest_uniform('feature_fraction', 0, 1.0)
    learning_rate = trial.suggest_uniform('learning_rate', 0, 1.0)
    subsample = trial.suggest_uniform('subsample', 0.8, 1.0)
    num_leaves = trial.suggest_int('num_leaves', 5, 1000)
    verbosity = trial.suggest_int('verbosity', -1, 1)
    num_boost_round = trial.suggest_int('num_boost_round', 10, 100000)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 10, 100000)
    min_child_samples = trial.suggest_int('min_child_samples', 5, 500)
    min_child_weight = trial.suggest_int('min_child_weight', 5, 500)

    params = {"objective": "binary",
              "boosting_type": "gbdt",
              "learning_rate": learning_rate,
              "num_leaves": num_leaves,
              "max_bin": 256,
              "feature_fraction": feature_fraction,
              "verbosity": verbosity,
              "drop_rate": drop_rate,
              "is_unbalance": False,
              "max_drop": 50,
              "min_child_samples": min_child_samples,
              "min_child_weight": min_child_weight,
              "min_split_gain": 0,
              "min_data_in_leaf": min_data_in_leaf,
              "subsample": subsample
              }

    x_score = []
    final_cv_train = np.zeros(len(y_all))
    #final_cv_pred = np.zeros(len(test_id))

    cv_train = np.zeros(len(y_all))
    #cv_pred = np.zeros(len(test_id))

    params['seed'] = SEED

    best_trees = []
    fold_scores = []

    for idx, (train_idx, valid_idx) in enumerate(kfold.split(X_all, y_all)):
        print('kfold_index:', idx)
        X_train = X_all.iloc[train_idx]
        y_train = y_all.iloc[train_idx]
        X_val = X_all.iloc[valid_idx]
        y_val = y_all.iloc[valid_idx]

        dtrain = lgb.Dataset(X_train, y_train)
        dvalid = lgb.Dataset(X_val, y_val, reference=dtrain)
        bst = lgb.train(params, dtrain, num_boost_round, valid_sets=dvalid, verbose_eval=100,
                        early_stopping_rounds=100)
        best_trees.append(bst.best_iteration)
        #cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
        pred_val_y = bst.predict(X_val)
        cv_train[valid_idx] += pred_val_y

        best_thresh = 0.5
        best_f1_score = 0.0
        precision_score = 0.0
        recall_score = 0.0
        for thresh in np.arange(0.1, 0.501, 0.01):
            thresh = np.round(thresh, 2)
            f1_score = metrics.f1_score(y_val, (pred_val_y > thresh).astype(int))
            if f1_score > best_f1_score:
                best_thresh = thresh
                best_f1_score = f1_score
                precision_score = metrics.precision_score(y_val, (pred_val_y > thresh).astype(int))
                recall_score = metrics.recall_score(y_val, (pred_val_y > thresh).astype(int))
        
        print("Val F1 Score: {:.4f} at threshold {} (Precision={:.4f}, Recall={:.4f})".format(
              best_f1_score, best_thresh, precision_score, recall_score))
        fold_scores.append(best_f1_score)


    cv_score = sum(fold_scores) / len(fold_scores)

    print("cv score:", cv_score)
    return (1 - cv_score)

study = optuna.create_study()
study.optimize(objective, n_trials=150)

kfold_index: 0
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.21924
[200]	valid_0's binary_logloss: 0.219195
[300]	valid_0's binary_logloss: 0.219196
Early stopping, best iteration is:
[211]	valid_0's binary_logloss: 0.219194


  'precision', 'predicted', average, warn_for)


Val F1 Score: 0.2247 at threshold 0.11 (Precision=0.1698, Recall=0.3321)
kfold_index: 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.218903
[200]	valid_0's binary_logloss: 0.218863
[300]	valid_0's binary_logloss: 0.218865
Early stopping, best iteration is:
[206]	valid_0's binary_logloss: 0.218863
Val F1 Score: 0.2236 at threshold 0.11 (Precision=0.1702, Recall=0.3259)
kfold_index: 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.219138
[200]	valid_0's binary_logloss: 0.219137
Early stopping, best iteration is:
[127]	valid_0's binary_logloss: 0.219129
Val F1 Score: 0.2214 at threshold 0.11 (Precision=0.1671, Recall=0.3279)
kfold_index: 3
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.219129
[200]	valid_0's binary_logloss: 0.219105
Early stopping, best iteration is:
[175]	valid_0's binary_logloss: 0.219103
Val F1 Score: 0.2226 at threshol

[I 2018-12-28 16:07:57,197] Finished a trial resulted in value: 0.7763424037639967. Current best value is 0.7763424037639967 with parameters: {'drop_rate': 0.08344007698631228, 'feature_fraction': 0.5051428224954837, 'learning_rate': 0.7622411445514977, 'subsample': 0.9431635337035124, 'num_leaves': 203, 'verbosity': 1, 'num_boost_round': 69012, 'min_data_in_leaf': 72062, 'min_child_samples': 487, 'min_child_weight': 455}.


kfold_index: 0
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.216736
[200]	valid_0's binary_logloss: 0.216409
[300]	valid_0's binary_logloss: 0.216225
[400]	valid_0's binary_logloss: 0.21608
[500]	valid_0's binary_logloss: 0.215966
[600]	valid_0's binary_logloss: 0.215892
[700]	valid_0's binary_logloss: 0.215845
[800]	valid_0's binary_logloss: 0.215813
[900]	valid_0's binary_logloss: 0.215788
Did not meet early stopping. Best iteration is:
[986]	valid_0's binary_logloss: 0.215773
Val F1 Score: 0.2434 at threshold 0.11 (Precision=0.1936, Recall=0.3275)
kfold_index: 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.216102
[200]	valid_0's binary_logloss: 0.215774
[300]	valid_0's binary_logloss: 0.215593
[400]	valid_0's binary_logloss: 0.215446
[500]	valid_0's binary_logloss: 0.215329
[600]	valid_0's binary_logloss: 0.215254
[700]	valid_0's binary_logloss: 0.215204
[800]	valid_0's binary_logl

[I 2018-12-28 16:09:46,340] Finished a trial resulted in value: 0.7542883792301753. Current best value is 0.7542883792301753 with parameters: {'drop_rate': 0.03990300844648875, 'feature_fraction': 0.26475298717530904, 'learning_rate': 0.20902542063509422, 'subsample': 0.8686272942397322, 'num_leaves': 484, 'verbosity': 0, 'num_boost_round': 986, 'min_data_in_leaf': 15813, 'min_child_samples': 323, 'min_child_weight': 310}.


kfold_index: 0
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.219539
[200]	valid_0's binary_logloss: 0.219346
[300]	valid_0's binary_logloss: 0.219233
[400]	valid_0's binary_logloss: 0.219159
[500]	valid_0's binary_logloss: 0.219107
[600]	valid_0's binary_logloss: 0.21907
[700]	valid_0's binary_logloss: 0.219044
[800]	valid_0's binary_logloss: 0.219027
[900]	valid_0's binary_logloss: 0.219014
[1000]	valid_0's binary_logloss: 0.219006
[1100]	valid_0's binary_logloss: 0.219
[1200]	valid_0's binary_logloss: 0.218996
[1300]	valid_0's binary_logloss: 0.218993
[1400]	valid_0's binary_logloss: 0.218991
[1500]	valid_0's binary_logloss: 0.21899
[1600]	valid_0's binary_logloss: 0.218989
[1700]	valid_0's binary_logloss: 0.218989
[1800]	valid_0's binary_logloss: 0.218988
[1900]	valid_0's binary_logloss: 0.218989
Early stopping, best iteration is:
[1809]	valid_0's binary_logloss: 0.218988


  'precision', 'predicted', average, warn_for)


Val F1 Score: 0.2249 at threshold 0.11 (Precision=0.1700, Recall=0.3320)
kfold_index: 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.219176
[200]	valid_0's binary_logloss: 0.219
[300]	valid_0's binary_logloss: 0.21889
[400]	valid_0's binary_logloss: 0.218818
[500]	valid_0's binary_logloss: 0.218769
[600]	valid_0's binary_logloss: 0.218734
[700]	valid_0's binary_logloss: 0.21871
[800]	valid_0's binary_logloss: 0.218693
[900]	valid_0's binary_logloss: 0.218682
[1000]	valid_0's binary_logloss: 0.218673
[1100]	valid_0's binary_logloss: 0.218668
[1200]	valid_0's binary_logloss: 0.218664
[1300]	valid_0's binary_logloss: 0.218662
[1400]	valid_0's binary_logloss: 0.218661
[1500]	valid_0's binary_logloss: 0.218659
[1600]	valid_0's binary_logloss: 0.218659
[1700]	valid_0's binary_logloss: 0.218658
[1800]	valid_0's binary_logloss: 0.218658
[1900]	valid_0's binary_logloss: 0.218658
[2000]	valid_0's binary_logloss: 0.218658
Early stopping, best it

[I 2018-12-28 16:12:17,728] Finished a trial resulted in value: 0.775816281655908. Current best value is 0.7542883792301753 with parameters: {'drop_rate': 0.03990300844648875, 'feature_fraction': 0.26475298717530904, 'learning_rate': 0.20902542063509422, 'subsample': 0.8686272942397322, 'num_leaves': 484, 'verbosity': 0, 'num_boost_round': 986, 'min_data_in_leaf': 15813, 'min_child_samples': 323, 'min_child_weight': 310}.


kfold_index: 0
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.219505
[200]	valid_0's binary_logloss: 0.219336
[300]	valid_0's binary_logloss: 0.219283
[400]	valid_0's binary_logloss: 0.219267
[500]	valid_0's binary_logloss: 0.219263
[600]	valid_0's binary_logloss: 0.219262
Early stopping, best iteration is:
[573]	valid_0's binary_logloss: 0.219262


  'precision', 'predicted', average, warn_for)


Val F1 Score: 0.2251 at threshold 0.11 (Precision=0.1703, Recall=0.3317)
kfold_index: 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.219129
[200]	valid_0's binary_logloss: 0.218958
[300]	valid_0's binary_logloss: 0.218905
[400]	valid_0's binary_logloss: 0.21889
[500]	valid_0's binary_logloss: 0.218885
[600]	valid_0's binary_logloss: 0.218885
[700]	valid_0's binary_logloss: 0.218884
Early stopping, best iteration is:
[664]	valid_0's binary_logloss: 0.218884
Val F1 Score: 0.2236 at threshold 0.11 (Precision=0.1700, Recall=0.3266)
kfold_index: 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.219313
[200]	valid_0's binary_logloss: 0.21918
[300]	valid_0's binary_logloss: 0.219153
[400]	valid_0's binary_logloss: 0.219153
Early stopping, best iteration is:
[323]	valid_0's binary_logloss: 0.219151
Val F1 Score: 0.2213 at threshold 0.11 (Precision=0.1667, Recall=0.3294)
kfold_index: 3
Training 

[I 2018-12-28 16:13:18,846] Finished a trial resulted in value: 0.7763940143994813. Current best value is 0.7542883792301753 with parameters: {'drop_rate': 0.03990300844648875, 'feature_fraction': 0.26475298717530904, 'learning_rate': 0.20902542063509422, 'subsample': 0.8686272942397322, 'num_leaves': 484, 'verbosity': 0, 'num_boost_round': 986, 'min_data_in_leaf': 15813, 'min_child_samples': 323, 'min_child_weight': 310}.


kfold_index: 0
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.218469
[200]	valid_0's binary_logloss: 0.218266
[300]	valid_0's binary_logloss: 0.218238
[400]	valid_0's binary_logloss: 0.218235
Early stopping, best iteration is:
[395]	valid_0's binary_logloss: 0.218235
Val F1 Score: 0.2291 at threshold 0.12 (Precision=0.1892, Recall=0.2902)
kfold_index: 1
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.218071
[200]	valid_0's binary_logloss: 0.217867
[300]	valid_0's binary_logloss: 0.21784
[400]	valid_0's binary_logloss: 0.217838
Early stopping, best iteration is:
[384]	valid_0's binary_logloss: 0.217837
Val F1 Score: 0.2280 at threshold 0.11 (Precision=0.1806, Recall=0.3093)
kfold_index: 2
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.218354
[200]	valid_0's binary_logloss: 0.218212
[300]	valid_0's binary_logloss: 0.21821
Early stopping, bes