![train](http://3.bp.blogspot.com/-Nt8l3SY2UiY/VHHLVsm8cFI/AAAAAAAAJGg/sPXvFN41aaU/s1600/contr%C3%A1rio.jpg)

In [1]:
import numpy as np
np.random.seed(1019)

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import xgboost

import sklearn
from sklearn.model_selection import train_test_split 

import sys, os, gc, types
import time
from subprocess import check_output

In [2]:
sys.path.append('./utils')

from training import cv, train
from plotting import plot_importance
from data import Data

In [3]:
# Candidate data directories, checked in priority order; the first one that
# exists on this machine becomes `root` (None when none of them exists).
root_paths = [
    "/data/kaggle-instacart",
    "/Users/jiayou/Dropbox/珺珺的程序/Kaggle/Instacart",
    "/Users/jiayou/Dropbox/Documents/珺珺的程序/Kaggle/Instacart"
]
root = next((candidate for candidate in root_paths if os.path.exists(candidate)), None)

In [37]:
class F1Optimizer:
    """Choose how many top-ranked items to predict so that expected F1 is maximal.

    All methods are static; the class only serves as a namespace. The
    recurrences multiply per-item probabilities, i.e. items are treated as
    independent.
    """

    def __init__(self):
        """Nothing to initialise; the optimizer keeps no state."""

    @staticmethod
    def get_expectations(P, pNone=None):
        """Tabulate expected F1 for every prefix size of the ranked items.

        Args:
            P: per-item probabilities (any order; they are sorted descending
                internally).
            pNone: probability used for the explicit "None" label. When
                omitted it is taken as the product of ``1 - p`` over all items.

        Returns:
            Array of shape ``(2, n + 1)``. Column ``k`` corresponds to
            predicting the ``k`` most probable items. Row 0 includes the
            ``2 * pNone / (2 + k)`` term (the variant that also predicts
            "None"); row 1 is the variant without it.
        """
        probs = np.sort(P)[::-1]
        size = np.array(probs).shape[0]
        if pNone is None:
            pNone = (1.0 - probs).prod()

        # hits[i, j]: probability that exactly i of the first j items are
        # positive. Row 0 is only filled up to column size - 1; the remaining
        # cell hits[0, size] stays 0 and is only ever multiplied by a hit
        # count of 0 below.
        hits = np.zeros((size + 2, size + 1))
        hits[0, 0] = 1.0
        for j in range(1, size):
            hits[0, j] = (1.0 - probs[j - 1]) * hits[0, j - 1]

        for i in range(1, size + 1):
            hits[i, i] = hits[i - 1, i - 1] * probs[i - 1]
            for j in range(i + 1, size + 1):
                hits[i, j] = probs[j - 1] * hits[i - 1, j - 1] + (1.0 - probs[j - 1]) * hits[i, j - 1]

        # Denominator tables: they start as 1/i (and 1/(i + 1) for the "None"
        # variant) and are folded with probs[k - 1] after each k is scored.
        tail = np.zeros((2 * size + 1,))
        tail_none = np.zeros((2 * size + 1,))
        for i in range(1, 2 * size + 1):
            tail[i] = 1.0 / i
            tail_none[i] = 1.0 / (i + 1)

        rows = []
        for k in reversed(range(size + 1)):
            plain = 0
            with_none = 0
            for hit_count in range(size + 1):
                weight = 2 * hit_count * hits[hit_count, k]
                plain += weight * tail[k + hit_count]
                with_none += weight * tail_none[k + hit_count]

            # The fold only touches indices 1 .. 2k - 2, so there is nothing
            # to do for k < 2. It runs in ascending order on purpose: each
            # step reads the not-yet-updated neighbour at i + 1.
            if k >= 2:
                p_k = probs[k - 1]
                for i in range(1, 2 * k - 1):
                    tail[i] = (1 - p_k) * tail[i] + p_k * tail[i + 1]
                    tail_none[i] = (1 - p_k) * tail_none[i] + p_k * tail_none[i + 1]

            rows.append([with_none + 2 * pNone / (2 + k), plain])

        # Rows were produced for k = size .. 0; flip them so column k maps to k.
        rows.reverse()
        return np.array(rows).T

    @staticmethod
    def maximize_expectation(P, pNone=None):
        """Locate the best cell of the expectation table.

        Returns:
            ``(best_k, predNone, max_f1)``: how many top items to predict,
            whether the best cell is in the "None" row, and its expected F1.
        """
        table = F1Optimizer.get_expectations(P, pNone)
        row, best_k = np.unravel_index(table.argmax(), table.shape)
        return best_k, bool(row == 0), table[row, best_k]

    @staticmethod
    def _F1(tp, fp, fn):
        """F1 score from true-positive, false-positive and false-negative counts."""
        doubled_tp = 2 * tp
        return doubled_tp / (doubled_tp + fp + fn)

    @staticmethod
    def _Fbeta(tp, fp, fn, beta=1.0):
        """F-beta score from confusion counts; ``beta=1.0`` reduces to F1."""
        beta_squared = beta ** 2
        weighted_tp = (1.0 + beta_squared) * tp
        return weighted_tp / (weighted_tp + fp + beta_squared * fn)


In [4]:
def ensemble(preds):
    """Average a collection of predictions element-wise.

    Args:
        preds: non-empty iterable of predictions that support ``+`` and
            division by an int (e.g. numpy arrays of equal shape).

    Returns:
        The arithmetic mean of the predictions as a new object; none of the
        inputs is modified.

    Raises:
        ValueError: if ``preds`` is empty.
    """
    preds = list(preds)
    if not preds:
        raise ValueError('ensemble() needs at least one prediction to average')

    # Accumulate with `+`, not `+=`: an in-place add on a numpy array would
    # write the running sum into the caller's first prediction.
    total = preds[0]
    for p in preds[1:]:
        total = total + p
    return total / len(preds)

def ensemble_predict(bsts, test):
    """Predict with every booster and return the averaged prediction.

    The columns 'eval_set', 'order_id', 'reordered' and 'product_id' are
    dropped from `test` before it is wrapped in a DMatrix; all of them must be
    present. The remaining columns are fed to each booster as features.
    """
    feature_frame = test.drop(['eval_set', 'order_id', 'reordered', 'product_id'], axis=1)
    dtest = xgboost.DMatrix(feature_frame)
    return ensemble([bst.predict(dtest) for bst in bsts])

def thresholding(pred, threshold=0.21):
    """Binarize predictions against a cut-off.

    Args:
        pred: object supporting ``>`` and ``.astype`` (e.g. a numpy array or a
            pandas Series).
        threshold: values strictly greater than this become 1, everything else
            0. Defaults to 0.21, the cut-off that used to be hard-coded.

    Returns:
        Integer 0/1 flags with the same shape as `pred`.
    """
    return (pred > threshold).astype(int)

In [7]:
def order_agg(test_result, test_none):
    """Build one submission row per order from per-product predictions.

    For each order, F1Optimizer.maximize_expectation picks how many of the
    highest-scoring products to keep and whether to append the literal
    'None' token.

    Args:
        test_result: DataFrame with columns 'order_id', 'product_id' and
            'reordered' (the predicted score per product). It is not modified.
        test_none: DataFrame indexed by order_id with an 'is_none' column that
            is passed to the optimizer as pNone, or None to let the optimizer
            derive pNone from the product scores. Every order id must be
            present in its index, otherwise the lookup raises KeyError.

    Returns:
        DataFrame with columns 'order_id' and 'products', one row per order in
        ascending order_id. 'products' is a space-separated list of product
        ids, or 'None' when no product is selected. The two columns are
        present even when `test_result` is empty, so the result can always be
        round-tripped through CSV.
    """
    # One sort on a new frame: orders ascending, best-scoring products first
    # within each order. The caller's frame keeps its row order and index.
    ranked = test_result.sort_values(by=['order_id', 'reordered'], ascending=[True, False])
    total_orders = ranked['order_id'].nunique()

    start_time = time.time()
    submission = []

    # sort=False keeps the groups in the (already ascending) order of `ranked`.
    for i, (current_id, group) in enumerate(ranked.groupby('order_id', sort=False)):
        scores = group['reordered']
        if test_none is not None:
            p_none = test_none.loc[current_id, 'is_none']
            best_k, predNone, _ = F1Optimizer.maximize_expectation(scores, p_none)
        else:
            best_k, predNone, _ = F1Optimizer.maximize_expectation(scores)

        if best_k == 0:
            # Nothing selected: the order is submitted as 'None' regardless
            # of predNone.
            products = 'None'
        else:
            products = ' '.join(group['product_id'].iloc[:best_k].astype(str))
            if predNone:
                products += ' None'
        submission.append({'order_id': current_id, 'products': products})

        if i % 10 == 0:
            print('{} predictions have been saved'.format(i))
            remaining_time = (time.time() - start_time) / (i + 1) * (total_orders - i)
            print('{:.2f}s remaining'.format(remaining_time))

    return pd.DataFrame(submission, columns=['order_id', 'products'])
    
def process_shard(name, shard=0, nshards=1, down_sample=None):
    """Predict and write the submission file for one shard of the test set.

    Reads the module-level `alltest`, `bsts` and `root`. Rows of `alltest`
    whose ``order_id % nshards`` equals `shard` are scored with the booster
    ensemble, aggregated per order, and written to
    ``<root>/submission-<name>-s<shard>.csv``.

    Args:
        name: run label embedded in the output file name.
        shard: index of the shard to process.
        nshards: total number of shards the test set is split into.
        down_sample: accepted for signature compatibility; not used here.
    """
    # Take an explicit copy: adding a column to a boolean-mask selection of
    # `alltest` is a chained assignment and triggers pandas warnings.
    test = alltest[alltest.order_id % nshards == shard].copy()
    test['reordered'] = ensemble_predict(bsts, test)

    test_result = test[['product_id', 'order_id', 'reordered']]
    test_none = pd.read_csv(os.path.join(root, 'none_prediction.csv'), index_col='order_id')

    submission_df = order_agg(test_result, test_none)

    submission_df.to_csv(os.path.join(root, "submission-{}-s{}.csv".format(name, shard)), index=False)

In [None]:
from multiprocessing import Process

name = 'v5-r2'

# model
train_name = 'v5-r2'
num_searches = 1
bsts = [
    xgboost.Booster(model_file=os.path.join(root, 'train-{}-n{}.bst'.format(train_name, i)))
    for i in range(num_searches)
]

# test data
down_sample = None
alltest = Data.test(down_sample=down_sample)

nshards = 32
# Per-shard output files; process_shard writes exactly these paths.
shard_paths = [
    os.path.join(root, "submission-{}-s{}.csv".format(name, s))
    for s in range(nshards)
]

# One worker process per shard. The workers read `alltest` and `bsts` as
# module globals (NOTE(review): this presumably relies on the fork start
# method - confirm before running on a spawn-based platform).
jobs = []
for s in range(nshards):
    p = Process(target=process_shard, args=(name, s, nshards, down_sample))
    p.start()
    jobs.append(p)

for p in jobs:
    p.join()

# Stop here if any worker died: otherwise the merge below would either fail on
# a missing shard file or silently pick up a stale file from an earlier run.
failed_shards = [s for s, p in enumerate(jobs) if p.exitcode != 0]
if failed_shards:
    raise RuntimeError('shard workers {} exited abnormally; not merging'.format(failed_shards))

print("\n\nShards done.")

# Merge the per-shard files into the final submission, ordered by order_id.
subs = [pd.read_csv(path) for path in shard_paths]

submission_df = pd.concat(subs)
submission_df.sort_values(by = 'order_id', axis = 0, ascending = True, inplace = True)
submission_df.to_csv(os.path.join(root, "submission-{}.csv".format(name)), index=False)

# The shard files are only intermediates; remove them once merged.
for path in shard_paths:
    os.remove(path)

print("\n\nAll done.")

In [8]:
# Prediction
# test['reordered'] = thresholding(ensemble_predict([bst for bst in bsts]))

In [9]:
# # Submission file
# test['product_id'] = test.product_id.astype(str)
# submit = test[test.reordered == 1].groupby('order_id')['product_id'].agg([lambda x: ' '.join(set(x))]).reset_index()
# sample_submission = pd.read_csv(os.path.join(root, 'sample_submission.csv'))
# submit.columns = sample_submission.columns.tolist()
# submit_final = sample_submission[['order_id']].merge(submit, how='left').fillna('None')
# submit_final.to_csv("submission-{}.csv".format(name), index=False)

In [10]:
# # Stats
# print('{} pred orders; {} of them non-empty'.format(submit_final.shape[0], submit.shape[0]))
# empty_order_ratio = (submit_final.shape[0] - submit.shape[0]) * 100. / submit_final.shape[0]
# print('Empty order ratio is {:.2f}%'.format(empty_order_ratio))

75000 pred orders; 743 of them non-empty
Empty order ratio is 99.01%
