In [1]:
import pickle as pkl
import numpy as np
import pandas as pd
import operator
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import cross_val_predict, cross_val_score, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.metrics import log_loss, roc_auc_score, f1_score
from xgboost import XGBClassifier



In [2]:
def f1_score_single(y_true, y_pred):
    try:
        y_true = set(list(y_true))
        y_pred = set(list(y_pred))
        cross_size = len(y_true & y_pred)
        if cross_size == 0: return 0.
        p = 1. * cross_size / len(y_pred)
        r = 1. * cross_size / len(y_true)
        return 2 * p * r / (p + r)
    except:
        return 0
    
def f1_score(y_true, y_pred):
    return np.mean([f1_score_single(x, y) for x, y in zip(y_true, y_pred)])

In [3]:
orders = pd.read_csv('../data/orders.csv', dtype={"order_id": np.int32, "user_id": np.int32,
                                                  "eval_set": "category", "order_number": np.int32,
                                                  "order_dow": np.int32, "order_hour_of_day": np.int32,
                                                  "days_since_prior_order": np.float32})
test = orders[orders["eval_set"] == "test"]
test_users = test["user_id"].unique()
train = orders[(orders["eval_set"] == "prior") & (orders["user_id"].isin(test_users))]

In [4]:
user_orders = train.groupby("user_id")["order_id"].apply(list)

In [5]:
train_products = pd.read_csv('../data/order_products__prior.csv', dtype=np.int32, index_col=0)
train_products = train_products.loc[train["order_id"], "product_id"].groupby("order_id").apply(list)

  mask |= (ar1 == a)


In [6]:
from collections import defaultdict
user_products = defaultdict(list)
for user, order_ids in user_orders.iteritems():
    if user % 10000 == 0: print(user)
    products = train_products.loc[order_ids]
    user_products[user] = products

30000
40000
70000
110000
120000
200000


In [7]:
user_products[3]

order_id
1374495    [9387, 17668, 15143, 16797, 39190, 47766, 2190...
444309     [38596, 21903, 248, 40604, 8021, 17668, 21137,...
3002854            [39190, 47766, 21903, 49683, 28373, 7503]
2037211                    [1819, 12845, 9387, 16965, 24010]
2710558    [39190, 9387, 17668, 47766, 16965, 28373, 2190...
1972919    [22035, 39190, 9387, 47766, 1819, 24810, 14992...
1839752    [39190, 1819, 21903, 43961, 16797, 24010, 1499...
3225766    [39190, 22035, 43961, 18599, 21903, 47766, 425...
3160850                  [39190, 47766, 16797, 43961, 48523]
676467             [18599, 17668, 47766, 39190, 1005, 32402]
521107                   [39190, 47766, 21903, 43961, 17668]
1402502           [39190, 18599, 23650, 21903, 47766, 24810]
Name: product_id, dtype: object

In [15]:
rows = []
for i, (user, products) in enumerate(user_products.items()):
    if i % 5000 == 0: print(i)
    product_counts = pd.Series(products.sum()).value_counts()
    total_orders = products.shape[0]
    for product, count in product_counts.iteritems():
        in_last_order1 = product in products.iloc[-1]
        in_last_order2 = product in products.iloc[-2]
        in_last_order3 = product in products.iloc[-3]
        rows.append([product, user, count, in_last_order1, in_last_order2, in_last_order3, total_orders]) #... etc

0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000


In [22]:
df = pd.DataFrame(rows, columns=["product_id", "user_id", "product_count",
                                 "in_last_order1", "in_last_order2", "in_last_order3",
                                 "total_orders"])
df["order_frac"] = df["product_count"] / df["total_orders"]
df = pd.merge(df, test.drop(["eval_set"], axis=1), on="user_id")
df = pd.merge(df, pd.read_csv("../data/products.csv", index_col=0,
                              usecols=["product_id", "aisle_id", "department_id"]),
              how="left", left_on="product_id", right_index=True)
print(df.shape)
df.head()

(4833292, 15)


Unnamed: 0,product_id,user_id,product_count,in_last_order1,in_last_order2,in_last_order3,total_orders,order_frac,order_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,aisle_id,department_id
0,24852,131072,16,True,True,True,16,1.0,806547,17,2,14,21.0,24,4
1,21903,131072,13,True,True,True,16,0.8125,806547,17,2,14,21.0,123,4
2,49683,131072,9,True,True,True,16,0.5625,806547,17,2,14,21.0,83,4
3,21137,131072,9,True,True,False,16,0.5625,806547,17,2,14,21.0,24,4
4,47144,131072,9,True,True,True,16,0.5625,806547,17,2,14,21.0,91,16


In [24]:
import pickle
with open("xgb.pkl", "rb") as f:
    model = pickle.load(f)
df["pred"] = 0
df["pred"].iloc[:100000] = model.predict_proba(df.iloc[:100000].drop("pred", axis=1))[:, 1]
df["pred"].iloc[100000:] = model.predict_proba(df.iloc[100000:].drop("pred", axis=1))[:, 1]
sub = df[df["pred"] > 0.13].groupby("order_id")["product_id"].apply(lambda x: " ".join(str(a) for a in x))
samp = pd.read_csv("../data/sample_submission.csv", index_col=0)
sub.loc[samp.index].fillna("None").to_csv("joe_xgb2.csv")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
