In [10]:
from pathlib import Path

import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn import metrics
from sklearn import model_selection

In [11]:
# Folder holding the Quora competition files, resolved under the current
# user's home directory.
data_dir = Path.home().joinpath('Desktop/kaggle/quora')

# Display every entry found directly inside the data folder as a plain string.
list(map(str, data_dir.iterdir()))

['/usr/local/google/home/maekawa/Desktop/kaggle/quora/.ipynb_checkpoints',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/embeddings.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/train.csv.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/train_with_length.pickle',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/test_with_keywords.pickle',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/train_with_keywords.pickle',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/sample_submission.csv.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/test.csv.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/sample_resubmissions.csv',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/test_with_length.pickle']

In [12]:
# Two pre-computed feature tables for the training set, stored as pickles
# inside data_dir.
# NOTE(review): unpickling executes arbitrary code — only load pickles that
# were produced by a trusted pipeline.
length_pickle_path = data_dir / 'train_with_length.pickle'
keywords_pickle_path = data_dir / 'train_with_keywords.pickle'

examples_with_length = pd.read_pickle(length_pickle_path)
examples_with_keywords = pd.read_pickle(keywords_pickle_path)

In [13]:
# Preview the first five rows of the length-feature table.
examples_with_length.head(n=5)

Unnamed: 0,qid,target,question_chars,question_terms
0,00002165364db923c7e6,0,72,13
1,000032939017120e6e44,0,81,16
2,0000412ca6e4628ce2cf,0,67,10
3,000042bf85aa498cd78e,0,57,9
4,0000455dfa3e01eae3af,0,77,15


In [14]:
# Preview the first five rows of the keyword-feature table.
examples_with_keywords.head(n=5)

Unnamed: 0,qid,target,kw_blacks,kw_liberals,kw_whites,kw_feminists,kw_fuck,kw_democrats,kw_muslims,kw_hindus,...,kw_have_sex,kw_why_do_so,kw_why_are_the,kw_the_fact_that,kw_is_it_that,kw_it_true_that,kw_why_are_there,kw_is_it_true,kw_why_does_the,kw_why_is_it
0,00002165364db923c7e6,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,000032939017120e6e44,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0000412ca6e4628ce2cf,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,000042bf85aa498cd78e,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0000455dfa3e01eae3af,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Build one wide feature table by placing the length features and the keyword
# features side by side.
#
# pd.concat(axis=1) aligns rows on the index only, and the labels are later
# taken from examples_with_length alone. Verify up front that both tables
# describe the same questions in the same order; otherwise features and labels
# would be silently mismatched (or NaN rows introduced by the outer alignment).
assert examples_with_length.index.equals(examples_with_keywords.index), (
    'length and keyword feature tables have different indexes')
assert np.array_equal(
    examples_with_length['qid'].to_numpy(),
    examples_with_keywords['qid'].to_numpy(),
), 'length and keyword feature tables list different qids / row order'

# 'qid' is an identifier and 'target' is the label, so neither is a feature.
examples_concat = pd.concat(
    [examples_with_length.drop(columns=['qid', 'target']),
     examples_with_keywords.drop(columns=['qid', 'target'])],
    sort=False,
    axis=1)

In [16]:
# Preview the first five rows of the combined feature table.
examples_concat.head(n=5)

Unnamed: 0,question_chars,question_terms,kw_blacks,kw_liberals,kw_whites,kw_feminists,kw_fuck,kw_democrats,kw_muslims,kw_hindus,...,kw_have_sex,kw_why_do_so,kw_why_are_the,kw_the_fact_that,kw_is_it_that,kw_it_true_that,kw_why_are_there,kw_is_it_true,kw_why_does_the,kw_why_is_it
0,72,13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,81,16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,67,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,57,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,77,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Model inputs: the combined feature table. Labels: the 'target' column of the
# length-feature table.
X_all, y_all = examples_concat, examples_with_length['target']

In [18]:
# 5-fold stratified cross-validation of a LightGBM model on the combined
# features. For every fold the decision threshold that maximises F1 on the
# validation part is searched over a fixed grid, and the best F1 is recorded.

# NOTE(review): no 'objective' is set, so LightGBM uses its default objective
# (documented as regression). The target shown above is 0/1 — confirm whether
# {'objective': 'binary'} was intended before comparing scores across runs.
train_params = {}

# Out-of-fold predictions: each fold fills in the rows it validated on.
train_meta = np.zeros(y_all.shape)

DATA_SPLIT_SEED = 42

# Note: .split() returns a one-shot generator; it is consumed by the loop below.
splits = model_selection.StratifiedKFold(
    n_splits=5, shuffle=True, random_state=DATA_SPLIT_SEED
).split(X_all, y_all)

# Candidate thresholds 0.10, 0.11, ..., 0.50. Rounded once here (instead of on
# every iteration of every fold) so the printed threshold is free of the float
# drift that np.arange accumulates.
candidate_thresholds = np.round(np.arange(0.1, 0.501, 0.01), 2)

cv_scores = []

for idx, (train_idx, valid_idx) in enumerate(splits):
    X_train = X_all.iloc[train_idx]
    y_train = y_all.iloc[train_idx]
    X_val = X_all.iloc[valid_idx]
    y_val = y_all.iloc[valid_idx]

    model = lgb.train(train_params, lgb.Dataset(X_train, y_train))
    pred_val_y = model.predict(X_val)

    # Defaults reported when no threshold on the grid yields a positive F1.
    best_thresh = 0.5
    best_f1_score = 0.0
    precision_score = 0.0
    recall_score = 0.0
    for thresh in candidate_thresholds:
        # Binarize once per threshold and reuse the result for all metrics.
        pred_val_label = (pred_val_y > thresh).astype(int)
        f1_score = metrics.f1_score(y_val, pred_val_label)
        # Strict '>' keeps the lowest threshold among ties.
        if f1_score > best_f1_score:
            best_thresh = thresh
            best_f1_score = f1_score
            precision_score = metrics.precision_score(y_val, pred_val_label)
            recall_score = metrics.recall_score(y_val, pred_val_label)

    print("Val F1 Score: {:.4f} at threshold {} (Precision={:.4f}, Recall={:.4f})".format(
        best_f1_score, best_thresh, precision_score, recall_score))
    train_meta[valid_idx] = pred_val_y.reshape(-1)
    cv_scores.append(best_f1_score)

# Display the per-fold best F1 scores as the cell output.
cv_scores

Val F1 Score: 0.5094 at threshold 0.23 (Precision=0.4814, Recall=0.5409)
Val F1 Score: 0.5116 at threshold 0.24 (Precision=0.4854, Recall=0.5407)
Val F1 Score: 0.5057 at threshold 0.22 (Precision=0.4699, Recall=0.5473)
Val F1 Score: 0.5131 at threshold 0.21 (Precision=0.4691, Recall=0.5661)
Val F1 Score: 0.5122 at threshold 0.23 (Precision=0.4811, Recall=0.5476)


[0.5094108734922208,
 0.511576174447534,
 0.5056735358847573,
 0.513079317015729,
 0.5122402916835466]