In [13]:
from pathlib import Path

import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn import metrics
from sklearn import model_selection

In [3]:
# Competition files live under the current user's home directory.
home_dir = Path.home()
data_dir = home_dir / 'Desktop/kaggle/quora'

# Display every entry of the data directory as a plain string path.
list(map(str, data_dir.iterdir()))

['/usr/local/google/home/maekawa/Desktop/kaggle/quora/.ipynb_checkpoints',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/embeddings.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/train.csv.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/train_with_length.pickle',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/test_with_keywords.pickle',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/train_with_keywords.pickle',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/sample_submission.csv.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/test.csv.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/sample_resubmissions.csv',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/test_with_length.pickle']

In [4]:
# Load the two pre-computed training feature tables (length features first,
# keyword features second). These are pickles, so they should only ever be
# trusted, locally produced artifacts.
examples_with_length, examples_with_keywords = (
    pd.read_pickle(data_dir / pickle_name)
    for pickle_name in ('train_with_length.pickle', 'train_with_keywords.pickle')
)

In [5]:
# Peek at the first rows of the length-feature table.
examples_with_length.head(n=5)

Unnamed: 0,qid,target,question_chars,question_terms
0,00002165364db923c7e6,0,72,13
1,000032939017120e6e44,0,81,16
2,0000412ca6e4628ce2cf,0,67,10
3,000042bf85aa498cd78e,0,57,9
4,0000455dfa3e01eae3af,0,77,15


In [6]:
# Peek at the first rows of the keyword-flag table.
examples_with_keywords.head(n=5)

Unnamed: 0,qid,target,keyword_kill,keyword_killed,keyword_killing,keyword_fuck,keyword_fucking,keyword_penis,keyword_skin,keyword_races,keyword_racism,keyword_racist
0,00002165364db923c7e6,0,False,False,False,False,False,False,False,False,False,False
1,000032939017120e6e44,0,False,False,False,False,False,False,False,False,False,False
2,0000412ca6e4628ce2cf,0,False,False,False,False,False,False,False,False,False,False
3,000042bf85aa498cd78e,0,False,False,False,False,False,False,False,False,False,False
4,0000455dfa3e01eae3af,0,False,False,False,False,False,False,False,False,False,False


In [7]:
# Strip the identifier and label columns from each feature table, then place the
# remaining feature columns side by side (rows are matched on the index).
feature_frames = [
    frame.drop(columns=['qid', 'target'])
    for frame in (examples_with_length, examples_with_keywords)
]
examples_concat = pd.concat(feature_frames, axis=1, sort=False)

In [8]:
# Peek at the first rows of the combined feature table.
examples_concat.head(n=5)

Unnamed: 0,question_chars,question_terms,keyword_kill,keyword_killed,keyword_killing,keyword_fuck,keyword_fucking,keyword_penis,keyword_skin,keyword_races,keyword_racism,keyword_racist
0,72,13,False,False,False,False,False,False,False,False,False,False
1,81,16,False,False,False,False,False,False,False,False,False,False
2,67,10,False,False,False,False,False,False,False,False,False,False
3,57,9,False,False,False,False,False,False,False,False,False,False
4,77,15,False,False,False,False,False,False,False,False,False,False


In [10]:
# Feature matrix is the combined table; labels come from the `target` column of
# the length-feature table.
X_all, y_all = examples_concat, examples_with_length['target']

In [27]:
# LightGBM training parameters.
# NOTE(review): an empty dict leaves every LightGBM setting at its library
# default; per the LightGBM parameter docs the default objective is `regression`,
# while `target` looks binary. Consider {'objective': 'binary'} -- not changed
# here because it would alter every score and threshold downstream.
train_params = {}

# Out-of-fold predictions: one slot per training example, filled fold by fold.
train_meta = np.zeros(y_all.shape)

DATA_SPLIT_SEED = 42


def find_best_threshold(y_true, y_score, thresholds=None):
    """Pick the decision threshold that maximises F1 for the given scores.

    Args:
        y_true: ground-truth labels.
        y_score: model scores aligned with `y_true`.
        thresholds: candidate thresholds; defaults to 0.10 .. 0.50 in steps
            of 0.01. Each candidate is rounded to two decimals before use.

    Returns:
        (best_threshold, best_f1). If no candidate yields an F1 above 0.0 the
        result is (0.5, 0.0).
    """
    if thresholds is None:
        thresholds = np.arange(0.1, 0.501, 0.01)

    best_thresh = 0.5
    best_score = 0.0
    for thresh in thresholds:
        # np.arange accumulates float error; round so the reported value is clean.
        thresh = np.round(thresh, 2)
        score = metrics.f1_score(y_true, (y_score > thresh).astype(int))
        # Strict '>' keeps the first (lowest) threshold among ties.
        if score > best_score:
            best_thresh = thresh
            best_score = score
    return best_thresh, best_score


# Stratified 5-fold split with a fixed seed. `split` returns a one-shot
# generator, so it is created in the same cell as the loop that consumes it.
splits = model_selection.StratifiedKFold(
    n_splits=5, shuffle=True, random_state=DATA_SPLIT_SEED
).split(X_all, y_all)

cv_scores = []

for train_idx, valid_idx in splits:
    X_train = X_all.iloc[train_idx]
    y_train = y_all.iloc[train_idx]
    X_val = X_all.iloc[valid_idx]
    y_val = y_all.iloc[valid_idx]

    # After the loop, `model` holds the booster from the final fold only.
    model = lgb.train(train_params, lgb.Dataset(X_train, y_train))
    pred_val_y = model.predict(X_val)

    # The threshold is tuned on the same fold it is scored on, so the reported
    # F1 is the best achievable on that fold rather than a held-out estimate.
    best_thresh, best_score = find_best_threshold(y_val, pred_val_y)

    print("Val F1 Score: {:.4f}  at threshold {}".format(best_score, best_thresh))
    train_meta[valid_idx] = pred_val_y.reshape(-1)
    cv_scores.append(best_score)

cv_scores

Val F1 Score: 0.2465  at threshold 0.12
Val F1 Score: 0.2456  at threshold 0.13
Val F1 Score: 0.2443  at threshold 0.12
Val F1 Score: 0.2474  at threshold 0.13
Val F1 Score: 0.2473  at threshold 0.12


[0.24647659764857943,
 0.24562648399334078,
 0.2442601096133906,
 0.24742875508892223,
 0.24729101194217476]

In [40]:
# `test_concat` was not defined by any earlier cell, so this cell failed with a
# NameError on a fresh kernel. Rebuild it here from the test-side pickles that
# sit next to the training ones in `data_dir`.
# NOTE(review): assumes the test pickles mirror the training ones minus
# `target` (i.e. `qid` + feature columns, same row order) -- TODO confirm.
test_with_length = pd.read_pickle(data_dir / 'test_with_length.pickle')
test_with_keywords = pd.read_pickle(data_dir / 'test_with_keywords.pickle')

# Keep a single `qid` column (from the length table) so it can be reused when
# assembling the submission frame.
test_concat = pd.concat(
    [test_with_length, test_with_keywords.drop(columns=['qid'])],
    sort=False,
    axis=1)

X_test = test_concat.drop(columns=['qid'])

# The model must see exactly the columns it was trained on, in the same order.
assert list(X_test.columns) == list(X_all.columns), (
    'test features do not match training features')

In [42]:
# Score the test features. `model` here is the booster left over from the final
# cross-validation fold (fitted on that fold's training split only).
test_scores = model.predict(X_test)
y_test_pred = pd.Series(test_scores)

In [43]:
# Peek at the first raw test scores.
y_test_pred.head(n=5)

0    0.074785
1    0.032760
2    0.032949
3    0.127251
4    0.035798
dtype: float64

In [48]:
# Decision threshold applied to the raw test scores.
# NOTE(review): the per-fold threshold search printed 0.12-0.13 as the best
# values; 0.15 is kept unchanged here -- confirm the difference is intentional.
SUBMISSION_THRESHOLD = 0.15

# Vectorised bool -> 0/1 conversion (replaces a per-element Python lambda).
prediction = (y_test_pred > SUBMISSION_THRESHOLD).astype(int)

# Share of test rows predicted positive.
prediction.mean()

0.05749512151853823

In [51]:
# Submission frame: question ids alongside the 0/1 predictions (`assign` aligns
# `prediction` to the rows by index).
answer = test_concat[['qid']].assign(prediction=prediction)
answer.head(n=5)

Unnamed: 0,qid,prediction
0,00014894849d00ba98a9,0
1,000156468431f09b3cae,0
2,000227734433360e1aae,0
3,0005e06fbe3045bd2a92,0
4,00068a0f7f41f50fc399,0
