In [1]:
import pandas as pd
import numpy as np

In [9]:
import pickle as pkl
import numpy as np
import pandas as pd
import operator
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import cross_val_predict, cross_val_score, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.metrics import log_loss, roc_auc_score, f1_score
from xgboost import XGBClassifier

print("Reading data")
orders = pd.read_csv('../data/orders.csv', dtype={"order_id": np.int32, "user_id": np.int32,
                                                  "eval_set": "category", "order_number": np.int32,
                                                  "order_dow": np.int32, "order_hour_of_day": np.int32,
                                                  "days_since_prior_order": np.float32})

product_info = pd.read_csv("../data/products.csv")
print("Finished reading data. Generating features.")

orders["last_order"] = orders.groupby("user_id")["order_id"].shift()

test = orders[(orders['eval_set'] == 'test')]
del orders

all_orders = pd.concat([
    pd.read_csv('../data/order_products__prior.csv', dtype=np.int32, index_col=0),
    pd.read_csv('../data/order_products__train.csv', dtype=np.int32, index_col=0)
]).loc[pd.concat([test["order_id"], test["last_order"]])]

# for each user, generate a list of the items in the last order                                                                                                  
test_products = pd.merge(all_orders, test, left_index=True, right_on="last_order")

test_products = pd.merge(test_products, product_info[["aisle_id", "department_id"]], how="left", left_on="product_id", right_index=True)

test_products["dummy"] = 1
test_products["hour_q"] = test_products["order_hour_of_day"]//6
test_products["1w"] = (test_products["days_since_prior_order"] == 7)
test_products["1m"] = (test_products["days_since_prior_order"] == 30)

Reading data
Finished reading data. Generating features.


  mask |= (ar1 == a)


In [10]:
test.sort_values("order_id")

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,last_order
611292,17,36855,test,5,6,15,1.0,1058761.0
584613,34,35220,test,20,3,11,8.0,2370415.0
3102779,137,187107,test,9,2,19,30.0,3116016.0
1929637,182,115892,test,28,0,11,8.0,353097.0
590591,257,35581,test,9,6,23,5.0,3401153.0
1888538,313,113359,test,31,6,22,7.0,567531.0
2880687,353,173814,test,4,4,13,30.0,659593.0
924638,386,55492,test,8,0,15,30.0,594743.0
2009806,414,120775,test,18,5,14,8.0,2759387.0
556925,418,33565,test,12,0,12,14.0,1953190.0


In [11]:
print("Reading trained model")
with open("../scripts/xgb_model.pkl", "rb") as f:
    model, features = pkl.load(f)

print("Generating predictions")
Xt = test_products[features]
probs = model.predict_proba(Xt)
test_products["pred"] = probs[:, 1]

Reading trained model
Generating predictions
Generating baskets


In [15]:
print("Generating baskets")
sub1 = test_products[test_products["pred"] > 0.3].groupby("order_id")["product_id"].apply(lambda x: " ".join(str(int(prod)) for prod in x))
sub1.to_csv("sub1.csv.gz")
sub1.head()

Generating baskets


order_id
17                                                 13107
34              47766 16083 2596 39475 13176 47792 44663
137        44422 29594 25890 24852 41787 23794 5134 2326
182                    5479 47672 9337 39275 32109 33000
257    49235 24852 30233 29837 37646 27104 47547 3947...
Name: product_id, dtype: object

In [16]:
print("Generating baskets")
sub2 = test_products[test_products["pred"] > 0.5].groupby("order_id")["product_id"].apply(lambda x: " ".join(str(int(prod)) for prod in x))
sub2.to_csv("sub2.csv.gz")
sub2.head()

Generating baskets


order_id
257                24852
313    45007 21903 28535
414                20564
418                 5262
437                13176
Name: product_id, dtype: object

In [17]:
print("Generating baskets")
sub3 = test_products[test_products["pred"] > 0.25].groupby("order_id")["product_id"].apply(lambda x: " ".join(str(int(prod)) for prod in x))
sub3.to_csv("sub3.csv.gz")
sub3.head()

Generating baskets


order_id
17                                                 13107
34              47766 16083 2596 39475 13176 47792 44663
137        44422 29594 25890 24852 41787 23794 5134 2326
182                    5479 47672 9337 39275 32109 33000
257    49235 24852 30233 29837 37646 27104 47547 3947...
Name: product_id, dtype: object

In [5]:
samp = pd.read_csv("../data/sample_submission.csv")

In [6]:
samp

Unnamed: 0,order_id,products
0,17,39276 29259
1,34,39276 29259
2,137,39276 29259
3,182,39276 29259
4,257,39276 29259
5,313,39276 29259
6,353,39276 29259
7,386,39276 29259
8,414,39276 29259
9,418,39276 29259
