In [1]:
import pickle as pkl
import numpy as np
import pandas as pd
import operator
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import cross_val_predict, cross_val_score, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.metrics import log_loss, roc_auc_score, f1_score
from xgboost import XGBClassifier



In [2]:
def f1_score_single(y_true, y_pred):
    try:
        y_true = set(list(y_true))
        y_pred = set(list(y_pred))
        cross_size = len(y_true & y_pred)
        if cross_size == 0: return 0.
        p = 1. * cross_size / len(y_pred)
        r = 1. * cross_size / len(y_true)
        return 2 * p * r / (p + r)
    except:
        return 0
    
def f1_score(y_true, y_pred):
    return np.mean([f1_score_single(x, y) for x, y in zip(y_true, y_pred)])

In [3]:
orders = pd.read_csv('../data/orders.csv', dtype={"order_id": np.int32, "user_id": np.int32,
                                                  "eval_set": "category", "order_number": np.int32,
                                                  "order_dow": np.int32, "order_hour_of_day": np.int32,
                                                  "days_since_prior_order": np.float32})
test = orders[orders["eval_set"] == "train"]
test_users = test["user_id"].unique()
train = orders[(orders["eval_set"] == "prior") & (orders["user_id"].isin(test_users))]

truth = pd.read_csv('../data/order_products__train.csv', dtype=np.int32, index_col=0).groupby("order_id")["product_id"].apply(list)
truth.head()

  mask |= (ar1 == a)


order_id
1     [49302, 11109, 10246, 49683, 43633, 13176, 472...
36    [39612, 19660, 49235, 43086, 46620, 34497, 486...
38    [11913, 18159, 4461, 21616, 23622, 32433, 2884...
96    [20574, 30391, 40706, 25610, 27966, 24489, 39275]
98    [8859, 19731, 43654, 13176, 4357, 37664, 34065...
Name: product_id, dtype: object

In [4]:
user_orders = train.groupby("user_id")["order_id"].apply(list)

In [5]:
train_products = pd.read_csv('../data/order_products__prior.csv', dtype=np.int32, index_col=0)
train_products = train_products.loc[train["order_id"], "product_id"].groupby("order_id").apply(list)

  mask |= (ar1 == a)


In [6]:
from collections import defaultdict
user_products = defaultdict(list)
for user, order_ids in user_orders.iteritems():
    if user % 10000 == 0: print(user)
    products = train_products.loc[order_ids]
    user_products[user] = products

10000
20000
50000
60000
80000
90000
100000
130000
140000
150000
160000
170000
180000
190000


In [7]:
with open("user_products.pkl", "wb") as f:
    pkl.dump(user_products, f)

MemoryError: 

In [8]:
rows = []
for user, products in user_products.items():
    if user % 10000 == 0: print(user)
    if user > 60000: break
    product_counts = pd.Series(products.sum()).value_counts()
    total_orders = products.shape[0]
    for product, count in product_counts.iteritems():
        in_last_order1 = product in products.iloc[-1]
        in_last_order2 = product in products.iloc[-2]
        in_last_order3 = product in products.iloc[-3]
        rows.append([product, user, count, in_last_order1, in_last_order2, in_last_order3, total_orders]) #... etc

10000
20000
50000
60000


In [9]:
df = pd.DataFrame(rows, columns=["product_id", "user_id", "product_count",
                                 "in_last_order1", "in_last_order2", "in_last_order3",
                                 "total_orders"])
df["order_frac"] = df["product_count"] / df["total_orders"]
df = pd.merge(df, test.drop(["eval_set"], axis=1), on="user_id")
df = pd.merge(df, pd.read_csv("../data/products.csv", index_col=0,
                              usecols=["product_id", "aisle_id", "department_id"]),
              how="left", left_on="product_id", right_index=True)
df.tail()

Unnamed: 0,product_id,user_id,product_count,in_last_order1,in_last_order2,in_last_order3,total_orders,order_frac,order_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,aisle_id,department_id
2459543,29112,60000,1,False,False,True,3,0.333333,2854165,4,6,17,2.0,59,15
2459544,28473,60000,1,False,False,True,3,0.333333,2854165,4,6,17,2.0,107,19
2459545,19578,60000,1,False,False,True,3,0.333333,2854165,4,6,17,2.0,54,17
2459546,4091,60000,1,False,False,True,3,0.333333,2854165,4,6,17,2.0,110,13
2459547,1823,60000,1,False,True,False,3,0.333333,2854165,4,6,17,2.0,45,19


In [10]:
for i in np.linspace(0.1, 0.5, 5):
    a = pd.DataFrame(df[df["order_frac"] >= i].groupby("user_id")["product_id"].apply(list))
    results = pd.merge(test, a, left_on="user_id", right_index=True).join(truth, on="order_id", rsuffix="_pred")
    print(i, f1_score(results["product_id"], results["product_id_pred"]))

0.1 0.248010053787
0.2 0.274756099963
0.3 0.273334739853
0.4 0.262264248147
0.5 0.250410029707


In [11]:
y = []
for i, row in df.iterrows():
    y.append(row["product_id"] in truth[row["order_id"]])

In [12]:
df["y"] = y
df.to_csv("product_train_features.csv")

In [None]:
p1 = cross_val_predict(make_pipeline(MinMaxScaler(), LogisticRegression()), df, y, method="predict_proba")
print(log_loss(y, p1), roc_auc_score(y, p1[:, 1]))

In [None]:
p2 = cross_val_predict(XGBClassifier(n_estimators=8, learning_rate=0.4, max_depth=2),
                      df.drop(["pred", "y"], axis=1), y, method="predict_proba")
print(log_loss(y, p2), roc_auc_score(y, p2[:, 1]))
print(log_loss(y, (p1+p2)/2), roc_auc_score(y, ((p1+p2)/2)[:, 1]))

In [None]:
p = df["order_frac"]
print(log_loss(y, p), roc_auc_score(y, p))

In [None]:
model = make_pipeline(MinMaxScaler(), LogisticRegression()).fit(df, y)
with open("linear.pkl", "wb")as f:
    pkl.dump(model, f)
model = make_pipeline(XGBClassifier(n_estimators=8, learning_rate=0.4)).fit(df, y)
with open("xgb.pkl", "wb")as f:
    pkl.dump(model, f)

In [None]:
df["pred"] = p2[:,1]

In [None]:
points = []
for i in np.linspace(0.05, 0.2, 16):
    a = pd.DataFrame(df[df["pred"] >= i].groupby("user_id")["product_id"].apply(list))
    results = pd.merge(test, a, left_on="user_id", right_index=True).join(truth, on="order_id", rsuffix="_pred")
    score = f1_score(results["product_id"], results["product_id_pred"])
    print(i, score)
    points.append(score)

In [None]:
df["y"] = y

In [None]:
df

In [None]:
%matplotlib inline
pd.Series(points, index=np.linspace(0.05, 0.2, 16)).plot()