In [1]:
import numpy as np
import pandas as pd
import torch

In [2]:
# Products, departments and aisles
product_df = pd.read_csv("./data/products.csv")
aisles_df = pd.read_csv("./data/aisles.csv")
deparments_df = pd.read_csv("./data/departments.csv")
products_df = product_df.merge(deparments_df, on="department_id", how="inner")
products_df = products_df.merge(aisles_df, on="aisle_id", how="inner")

# Order informations
dist_store_df = pd.read_csv("./data/orders_distance_stores_softmax.csv")
dist_store_df = dist_store_df.drop("Unnamed: 0", axis=1)
prod_prior_df = pd.read_csv("./data/order_products__prior_specials.csv")
prod_prior_df = prod_prior_df.drop("Unnamed: 0", axis=1)
merged_df = prod_prior_df.merge(dist_store_df, on="order_id")

basic_info_df = pd.DataFrame([
    ["orders_distance_stores_softmax", dist_store_df.shape[0], dist_store_df.shape[1]],
    ["order_products_prior_specials", prod_prior_df.shape[0], prod_prior_df.shape[1]],
    ["merged", merged_df.shape[0], merged_df.shape[1]]
    ],
    columns=["File", "# Instances", "# Attributes"]
)
basic_info_df.head(10)

Unnamed: 0,File,# Instances,# Attributes
0,orders_distance_stores_softmax,136026,9
1,order_products_prior_specials,1172312,5
2,merged,1172312,13


In [3]:
# Columns with unique values
uniq_cols = list(merged_df.columns[merged_df.nunique() == 1])

# Columns with invalid values
nan_cols = list(merged_df.columns[merged_df.isna().any()])

print("Unique columns: {}".format(uniq_cols))
print("NaN columns: {}".format(nan_cols))

# NaN values days_since_prior_order come from items bought twice on the same day.
# Replace the values with with zeros
merged_df["days_since_prior_order"] = merged_df["days_since_prior_order"].fillna(0)
# Remove columns with unique values
merged_df = merged_df.drop(uniq_cols, axis=1)

assert len(list(merged_df.columns[merged_df.nunique() == 1])) == 0, "There are still columns with unique values"
assert len(list(merged_df.columns[merged_df.isna().any()])) == 0, "There are still columns with NaN values"

Unique columns: ['eval_set']
NaN columns: ['days_since_prior_order']


In [28]:
# Merge product information with orders
full_df = merged_df.merge(products_df, on="product_id", how="left")
full_df.head(5)

n_products = full_df["product_id"].nunique()
n_stores = full_df["store_id"].nunique()
n_users = full_df["user_id"].nunique()
n_departments = full_df["department_id"].nunique()
n_aisles = full_df["aisle_id"].nunique()
n_orders = full_df["order_id"].nunique()
n_target_rows = int(len(merged_df) * 0.05)

print("Target num rows: %d\nCur num rows: %d\nNum rows to remove: %d" % (n_target_rows, len(full_df), len(full_df) - n_target_rows))
basic_info_df = pd.DataFrame(
    [[n_products, n_departments, n_aisles, n_stores, n_users, n_orders]],
    columns=["# Products", "# Departments", "# Aisles", "# Stores", "# Users", "# Orders"]
)
basic_info_df

Target num rows: 58615
Cur num rows: 1172312
Num rows to remove: 1113697


Unnamed: 0,# Products,# Departments,# Aisles,# Stores,# Users,# Orders
0,24860,21,134,10,1374,136026


In [5]:
import duckdb 
t1 = duckdb.query("select user_id, count(order_id) AS n_orders from merged_df group by user_id, order_id").df()
t1['n_orders'].describe()

count    136026.000000
mean          8.618294
std           6.567139
min           1.000000
25%           4.000000
50%           7.000000
75%          11.000000
max          98.000000
Name: n_orders, dtype: float64

In [6]:
# Retirer les consommateurs avec peu de commandes
users_to_remove = t1[(t1["n_orders"] < 4)]["user_id"].unique()
t1 = t1.drop(t1[t1["n_orders"] < 4].index)
subset_df = merged_df.drop(merged_df[merged_df["user_id"].isin(users_to_remove)].index)
n_users = subset_df["user_id"].nunique()
print("Removing {} users helped reduce the number of orders by {}".format(len(users_to_remove), len(merged_df) - len(subset_df)))
subset_df

Removing 1314 users helped reduce the number of orders by 1080355


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,special,user_id,store_id,distance,order_number,order_dow,order_hour_of_day,days_since_prior_order
44,185,23165,1,1,0,172793,8,0.677334,72,3,1,2.0
45,185,24964,2,1,0,172793,8,0.677334,72,3,1,2.0
46,185,39812,3,0,30,172793,8,0.677334,72,3,1,2.0
47,185,11520,4,0,50,172793,8,0.677334,72,3,1,2.0
48,185,41556,5,0,15,172793,8,0.677334,72,3,1,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1172128,3420383,46820,11,1,15,173431,6,1.321906,14,1,15,5.0
1172129,3420383,24830,12,0,15,173431,6,1.321906,14,1,15,5.0
1172130,3420383,11507,13,0,30,173431,6,1.321906,14,1,15,5.0
1172131,3420383,42768,14,0,15,173431,6,1.321906,14,1,15,5.0


In [7]:
# select products that were bought by different customers
p = {uid: {"cnt": 0, "users": set()} for uid in subset_df["product_id"].values}
 
for _, line in subset_df.iterrows():
    p_uid = int(line["product_id"])
    p[p_uid]["cnt"] += 1
    p[p_uid]["users"].add(int(line["user_id"]))

for uid, prod in p.items():
    p[uid]["users"] = len(p[uid]["users"])

data_matrix = [[uid, *p[uid].values()] for uid in p.keys()]

t3 = pd.DataFrame(data_matrix, columns=["product_id", "count", "n_users"])
t3

Unnamed: 0,product_id,count,n_users
0,23165,37,10
1,24964,611,32
2,39812,79,17
3,11520,84,6
4,41556,12,6
...,...,...,...
6113,31564,1,1
6114,15975,1,1
6115,32478,1,1
6116,48415,1,1


In [27]:
#products_to_remove = t3[(t3["n_users"] / n_users < 0.001)]["product_id"]
products_to_remove = t3[(t3["n_users"] / t3["n_users"].sum()) < 0.0001]["product_id"]
n_deleted_products = len(products_to_remove)
subset_df_prime = subset_df.drop(subset_df[subset_df["product_id"].isin(products_to_remove)].index)
print("Removing {} products helped to remove {} orders".format(n_deleted_products, len(subset_df) - len(subset_df_prime)))
subset_df_prime

Removing 3629 products helped to remove 15534 orders


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,special,user_id,store_id,distance,order_number,order_dow,order_hour_of_day,days_since_prior_order
44,185,23165,1,1,0,172793,8,0.677334,72,3,1,2.0
45,185,24964,2,1,0,172793,8,0.677334,72,3,1,2.0
46,185,39812,3,0,30,172793,8,0.677334,72,3,1,2.0
47,185,11520,4,0,50,172793,8,0.677334,72,3,1,2.0
48,185,41556,5,0,15,172793,8,0.677334,72,3,1,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1172126,3420383,36076,9,1,0,173431,6,1.321906,14,1,15,5.0
1172128,3420383,46820,11,1,15,173431,6,1.321906,14,1,15,5.0
1172129,3420383,24830,12,0,15,173431,6,1.321906,14,1,15,5.0
1172131,3420383,42768,14,0,15,173431,6,1.321906,14,1,15,5.0


In [21]:
(t3["n_users"]).sort_values().to_csv("n_products_by_user")

In [None]:
# Rebalancer le nombre de commandes par client
removed_rows = len(merged_df) - len(subset_df)
n_target_rows = int(len(merged_df) * 0.05)
to_remove = len(subset_df) - n_target_rows
print("removed: {}\ntarget: {}\nto remove: {}".format(removed_rows, n_target_rows, to_remove))
thres = int(n_target_rows / n_users)
# People under the threshold must be kept
# Therefore the sum of their orders removed from the number of rows
mask = t1["n_orders"] <= thres
to_keep = mask.sum()
users_prime = t1[mask]["user_id"].unique()
pool_size = to_remove - to_keep
# This means customers with more than thres orders must share pool_size rows 
pool_size
(t1["n_orders"] <= thres).sum() == 136026
to_keep
#n_target_rows / 1370
#n_target_rows
#t1["n_orders"].describe()

In [None]:
import scipy.optimize as opt

obj = np.poly1d([1., -2., 0.])

In [None]:
merged_df[["product_id", "user_id"]].groupby("user_id").size().max() #apply(lambda x: list(x["product_id"]))
# n_products_per_user = merged_df[["product_id", "user_id"]].groupby("product_id").size()

# n_products_per_orders /= n_products_per_orders.sum()
#n_products_per_user.describe()

In [None]:
# Order products by their priors
products_priors = test_df["product_id"].value_counts() / test_df["product_id"].value_counts().sum()
np.testing.assert_almost_equal(products_priors.sum(), 1.)
products_priors

In [None]:
# Products with strong priors actually come from the same department
test_df[test_df["product_id"].isin(products_priors[0:5].index)][["product_name", "department", "aisle"]]

In [None]:
department_priors = test_df[["department", "department_id"]].value_counts() / test_df["department"].value_counts().sum()
np.testing.assert_almost_equal(department_priors.sum(), 1.)
department_priors

In [None]:
ds = {uid: {"priors": pct, "products": {}, "name": ""} for uid, pct in department_id_priors.to_dict().items()}
for department_id in ds.keys():
    mask = test_df["department_id"] == department_id
    ds[department_id]["name"] = test_df[mask]["department"].unique()[0]
    #product_ids = list(test_df["department_id" == department_id]["product_id"])
ds

In [None]:
test_df[mask]["department"].unique()[0]

In [None]:
# https://stackoverflow.com/questions/18138693/replicating-group-concat-for-pandas-dataframe
P = np.zeros((n_products, n_products))
order_prod_grouped_df = merged_df.groupby("order_id").apply(lambda x: list(x["product_id"]))

In [None]:
for line in order_prod_grouped_df:
    for product in line:
        for other_product in line:
            P[product-1, other_product-1] += 1

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca.fit(P)
pca.explained_variance_ratio_


In [None]:
order_prod_grouped_df