![train](http://3.bp.blogspot.com/-Nt8l3SY2UiY/VHHLVsm8cFI/AAAAAAAAJGg/sPXvFN41aaU/s1600/contr%C3%A1rio.jpg)

In [1]:
import numpy as np
np.random.seed(1019)

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import xgboost

import sklearn
from sklearn.model_selection import train_test_split 

import sys, os, gc, types
import time
from subprocess import check_output

In [2]:
sys.path.append('./utils')

from training import cv, train
from plotting import plot_importance
from data import Data

In [3]:
root_paths = [
    "/data/kaggle-instacart",
    "/Users/jiayou/Dropbox/珺珺的程序/Kaggle/Instacart",
    "/Users/jiayou/Dropbox/Documents/珺珺的程序/Kaggle/Instacart"
]
root = None
for p in root_paths:
    if os.path.exists(p):
        root = p
        break

In [37]:
class F1Optimizer():
    def __init__(self):
        pass

    @staticmethod
    def get_expectations(P, pNone=None):
        expectations = []
        P = np.sort(P)[::-1]

        n = np.array(P).shape[0]
        DP_C = np.zeros((n + 2, n + 1))
        if pNone is None:
            pNone = (1.0 - P).prod()

        DP_C[0][0] = 1.0
        for j in range(1, n):
            DP_C[0][j] = (1.0 - P[j - 1]) * DP_C[0, j - 1]

        for i in range(1, n + 1):
            DP_C[i, i] = DP_C[i - 1, i - 1] * P[i - 1]
            for j in range(i + 1, n + 1):
                DP_C[i, j] = P[j - 1] * DP_C[i - 1, j - 1] + (1.0 - P[j - 1]) * DP_C[i, j - 1]

        DP_S = np.zeros((2 * n + 1,))
        DP_SNone = np.zeros((2 * n + 1,))
        for i in range(1, 2 * n + 1):
            DP_S[i] = 1. / (1. * i)
            DP_SNone[i] = 1. / (1. * i + 1)
        for k in range(n + 1)[::-1]:
            f1 = 0
            f1None = 0
            for k1 in range(n + 1):
                f1 += 2 * k1 * DP_C[k1][k] * DP_S[k + k1]
                f1None += 2 * k1 * DP_C[k1][k] * DP_SNone[k + k1]
            for i in range(1, 2 * k - 1):
                DP_S[i] = (1 - P[k - 1]) * DP_S[i] + P[k - 1] * DP_S[i + 1]
                DP_SNone[i] = (1 - P[k - 1]) * DP_SNone[i] + P[k - 1] * DP_SNone[i + 1]
            expectations.append([f1None + 2 * pNone / (2 + k), f1])

        return np.array(expectations[::-1]).T

    @staticmethod
    def maximize_expectation(P, pNone=None):
        expectations = F1Optimizer.get_expectations(P, pNone)

        ix_max = np.unravel_index(expectations.argmax(), expectations.shape)
        max_f1 = expectations[ix_max]

        predNone = True if ix_max[0] == 0 else False
        best_k = ix_max[1]

        return best_k, predNone, max_f1

    @staticmethod
    def _F1(tp, fp, fn):
        return 2 * tp / (2 * tp + fp + fn)

    @staticmethod
    def _Fbeta(tp, fp, fn, beta=1.0):
        beta_squared = beta ** 2
        return (1.0 + beta_squared) * tp / ((1.0 + beta_squared) * tp + fp + beta_squared * fn)


In [4]:
def predict(bst):
    d_test = xgboost.DMatrix(
        test.drop(['eval_set', 'order_id', 'reordered', 'product_id'], axis=1))
    return bst.predict(d_test)

def ensemble(preds):
    # Average ensemble
    r = None
    for p in preds:
        if r is None:
            r = p
        else:
            r += p
    return r / len(preds)

def ensemble_predict(bsts):
    preds = []
    for bst in bsts:
        preds.append(predict(bst))
    return ensemble(preds)

def thresholding(pred):
    return (pred > 0.21).astype(int)

In [5]:
name = 'v3-test'
down_sample = 10
num_searches = 1

In [6]:
bsts = []
for i in range(num_searches):
    bsts.append(xgboost.Booster(model_file=os.path.join(root, 'train-{}-n{}.bst'.format(name, i))))

In [7]:
test = Data.test(down_sample=down_sample)

# Prediction with F1 optimization

In [8]:
test['reordered'] = ensemble_predict([bst for bst in bsts])

In [16]:
test_result = test[['product_id', 'order_id', 'reordered']]
test_result.head()

Unnamed: 0,product_id,order_id,reordered
20,260,1017560,0.035462
21,21137,1017560,0.031505
22,47766,1017560,0.044791
117,260,2939110,0.020977
118,11782,2939110,0.025118


In [77]:
test_none = pd.read_csv(os.path.join(root, 'none_prediction.csv'))
test_none.loc[test_none.order_id == 3302990, 'is_none'].values[0]

0.19991157948970795

In [91]:
test_order_id = test_result.order_id.unique()
submission = []

def predict_wf1(w_none = False):
    start_time = time.time()
    for i in range(len(test_order_id)):
#     for i in range(30):
        df = test_result[test_result.order_id == test_order_id[i]]
        df.sort_values(by = 'reordered', axis = 0, ascending = False, inplace = True)
        if w_none:
            p_none = test_none.loc[test_none.order_id == test_order_id[i], 'is_none'].values[0]
            best_k = F1Optimizer.maximize_expectation(df.reordered, p_none)[0]
        else:
            best_k = F1Optimizer.maximize_expectation(df.reordered)[0]
            
        if best_k == 0:
            submission.append({'order_id': test_order_id[i], 'products': 'None'})
        else:
            df.product_id = df.product_id.astype(str)
            reordered_product = ' '.join(df.product_id[:best_k])
            submission.append({'order_id': test_order_id[i], 'products': reordered_product})
        if i % 10 == 0:
            print('{} predictions have been saved'.format(i))
            remaining_time = (time.time()-start_time) / (i+1) * (len(test_order_id) - i)
            print('{:.2f}s remaining'.format(remaining_time))
    

In [92]:
predict_wf1()
submission_df = pd.DataFrame(data = submission)
submission_df.sort_values(by = 'order_id', axis = 0, ascending = True, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


0 predictions have been saved
172.55s remaining
10 predictions have been saved
769.35s remaining
20 predictions have been saved
623.98s remaining


In [87]:
submission_df.head()

[{'order_id': 1017560,
  'product_id': '30489 43654 44596 29370 18531 42710 39180 46886 48726 37754'},
 {'order_id': 2939110,
  'product_id': '11520 42265 13176 19816 27845 5120 41213 26604 21405'},
 {'order_id': 1461650,
  'product_id': '24852 4210 44042 9076 26348 43263 49683 41844 7058 48205 43967 33651 40706 49379 3732 43530 15650 26152 35511 877 48596'},
 {'order_id': 235350,
  'product_id': '21137 24852 19171 39275 43988 39928 47144 49236 3896 11896 10132'},
 {'order_id': 3287420,
  'product_id': '13176 19057 30776 33731 12935 27241 27966 41220 16759 31717 38647 19862 12395 21607 44359 14084 3952 12456 3896 24964 22935 21137 260'},
 {'order_id': 1021030,
  'product_id': '21903 27845 47029 32655 3405 13249 3952'},
 {'order_id': 592160,
  'product_id': '22993 25718 5876 46802 32537 28985 24964 21903 7781 46667 29217 1158 30776 27521 22963 17794 260 8230 44698 19125 11428 27104'},
 {'order_id': 2251990,
  'product_id': '47209 42768 12935 5876 30489 26209 27966 18594 43504 17553 2113

In [52]:
submission_df.to_csv("submission-{}.csv".format(name), index=False)

In [8]:
# Prediction
test['reordered'] = thresholding(ensemble_predict([bst for bst in bsts]))

In [9]:
# Submission file
test['product_id'] = test.product_id.astype(str)
submit = test[test.reordered == 1].groupby('order_id')['product_id'].agg([lambda x: ' '.join(set(x))]).reset_index()
sample_submission = pd.read_csv(os.path.join(root, 'sample_submission.csv'))
submit.columns = sample_submission.columns.tolist()
submit_final = sample_submission[['order_id']].merge(submit, how='left').fillna('None')
submit_final.to_csv("submission-{}.csv".format(name), index=False)

In [10]:
# Stats
print('{} pred orders; {} of them non-empty'.format(submit_final.shape[0], submit.shape[0]))
empty_order_ratio = (submit_final.shape[0] - submit.shape[0]) * 100. / submit_final.shape[0]
print('Empty order ratio is {:.2f}%'.format(empty_order_ratio))

75000 pred orders; 743 of them non-empty
Empty order ratio is 99.01%
