In [None]:
import numpy as np
import pandas as pd
import torch
from collections import defaultdict

# Products, departments and aisles
product_df = pd.read_csv("./data/products.csv")
aisles_df = pd.read_csv("./data/aisles.csv")
deparments_df = pd.read_csv("./data/departments.csv")
products_df = product_df.merge(deparments_df, on="department_id", how="left")
products_df = products_df.merge(aisles_df, on="aisle_id", how="left")

# Order informations
dist_store_df = pd.read_csv("./data/orders_distance_stores_softmax.csv")
dist_store_df = dist_store_df.drop("Unnamed: 0", axis=1)
prod_prior_df = pd.read_csv("./data/order_products__prior_specials.csv")
prod_prior_df = prod_prior_df.drop("Unnamed: 0", axis=1)
merged_df = prod_prior_df.merge(dist_store_df, on="order_id")
merged_df = merged_df.merge(products_df, on="product_id", how="left")

# Columns with unique values
uniq_cols = list(merged_df.columns[merged_df.nunique() == 1])

# Columns with invalid values
nan_cols = list(merged_df.columns[merged_df.isna().any()])

print("Unique columns: {}".format(uniq_cols))
print("NaN columns: {}".format(nan_cols))

# NaN values days_since_prior_order come from items bought twice on the same day.
# Replace the values with with zeros
merged_df["days_since_prior_order"] = merged_df["days_since_prior_order"].fillna(0)
# Remove columns with unique values
merged_df = merged_df.drop(uniq_cols, axis=1)

assert len(list(merged_df.columns[merged_df.nunique() == 1])) == 0, "There are still columns with unique values"
assert len(list(merged_df.columns[merged_df.isna().any()])) == 0, "There are still columns with NaN values"

In [None]:
# Fonction utilitaire permettant de simuler une requête d'aggrégation SQL du genre
# SELECT col, COUNT(DISTINCT agg_col) AS n_agg_col FROM table_name GROUP BY col, agg_col
# Les commandes équivalentes avec l'API de Pandas génèrent beaucoup de doublons, ce qui nous force
# à implémenter cette solution.  
def count_distinct(dataframe, groupby_col, count_col):
    d = defaultdict()
    for row in dataframe.itertuples():
        uid = getattr(row, groupby_col)
        d[uid] = set()

    for row in dataframe.itertuples():
        uid = getattr(row, groupby_col)
        count_attr = getattr(row, count_col)
        d[uid].add(count_attr)

    df_count_col_name = "%s_count" % count_col
    d = {groupby_col: d.keys(), df_count_col_name: [len(v) for v in d.values()]}
    return pd.DataFrame.from_dict(d).sort_values(by=df_count_col_name)



In [15]:
users_n_order_df = count_distinct(merged_df, "user_id", "order_id")
products_n_order_df = count_distinct(merged_df, "product_id", "order_id")

# Products by order
order_id_cnt_thres = products_n_order_df["order_id_count"].quantile(0.25)
n_products = products_n_order_df["product_id"].nunique()
mask = products_n_order_df["order_id_count"] < order_id_cnt_thres
to_remove = products_n_order_df[mask]["product_id"].unique()

print("Number of products: %d" % n_products)
print("Number products to remove: %d" % len(to_remove))
print("Number of remaining products: %d" % (n_products - len(to_remove)))

n_rows_before = len(merged_df)
mask = merged_df["product_id"].isin(to_remove)
clean_df = merged_df.drop(merged_df[mask].index)
n_rows_after = len(clean_df)

print("Number of rows to remove: %d" % mask.sum())
print("Number of remaining rows: %d" % n_rows_after)

# Products by user
products_n_users_df = count_distinct(clean_df, "product_id", "user_id")
user_id_cnt_thres = products_n_users_df["user_id_count"].quantile(0.25)
mask = products_n_users_df["user_id_count"] < user_id_cnt_thres
to_remove = products_n_users_df[mask]["product_id"].unique()

print("Number of products: %d" % n_products)
print("Number products to remove: %d" % len(to_remove))
print("Number of remaining products: %d" % (n_products - len(to_remove)))

n_rows_before = len(clean_df)
mask = clean_df["product_id"].isin(to_remove)
clean_df = clean_df.drop(clean_df[mask].index)
n_rows_after = len(clean_df)
print("Number of rows to remove: %d" % mask.sum())
print("Number of remaining rows: %d" % n_rows_after)

Number of products: 24860
Number products to remove: 5271
Number of remaining products: 19589
Number of rows to remove: 5271
Number of remaining rows: 1167041
Number of products: 24860
Number products to remove: 2939
Number of remaining products: 21921
Number of rows to remove: 15320
Number of remaining rows: 1151721


In [16]:
# Validation
users_n_order_df = count_distinct(clean_df, "user_id", "order_id")
assert users_n_order_df["order_id_count"].min() >= order_id_cnt_thres, "Il reste des utilisateurs avec moins de %d commandes différentes." % order_id_cnt_thres

products_n_order_df = count_distinct(clean_df, "product_id", "order_id")
assert products_n_order_df["order_id_count"].min() >= user_id_cnt_thres, "Il reste des produits présents dans moins de %d commandes différentes." % user_id_cnt_thres

products_n_user_df = count_distinct(clean_df, "product_id", "user_id")
assert products_n_user_df["user_id_count"].min() > 1, "Il reste des produits achetés seulement par un consommateur"

In [17]:
clean_df

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,special,user_id,store_id,distance,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,department,aisle
0,15,19660,1,1,15,54901,1,0.304608,51,3,11,2.0,Spring Water,115,7,beverages,water seltzer sparkling water
1,15,21195,2,1,0,54901,1,0.304608,51,3,11,2.0,Organic Extra Virgin Olive Oil,19,13,pantry,oils vinegars
2,15,7461,3,1,50,54901,1,0.304608,51,3,11,2.0,Pinto Beans No Salt Added,59,15,canned goods,canned meals beans
3,15,2996,4,1,0,54901,1,0.304608,51,3,11,2.0,Honeysuckle Hand Soap,25,11,personal care,soap
4,15,32463,5,1,0,54901,1,0.304608,51,3,11,2.0,Olive Oil & Aloe Vera Hand Soap,25,11,personal care,soap
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1172307,3421069,21137,8,1,0,119835,6,1.806216,75,3,13,2.0,Organic Strawberries,24,4,produce,fresh fruits
1172308,3421069,30391,9,1,15,119835,6,1.806216,75,3,13,2.0,Organic Cucumber,83,4,produce,fresh vegetables
1172309,3421069,18656,10,1,0,119835,6,1.806216,75,3,13,2.0,Organic Red Potato,83,4,produce,fresh vegetables
1172310,3421069,26940,11,1,15,119835,6,1.806216,75,3,13,2.0,Organic Large Green Asparagus,83,4,produce,fresh vegetables


In [27]:
n_clean_users = clean_df["user_id"].nunique()
n_clean_orders = clean_df["order_id"].nunique()
n_clean_products = clean_df["product_id"].nunique()

# Ayant supprimé plusieurs produits, on peut réindexer les identifiants des produits pour sauver de la mémoire lors des prochaines étapes
product_n_orders_df = count_distinct(clean_df, "product_id", "order_id")
clean_df["product_reid"] = clean_df.index

print("Numbers of clean users: %d" % n_clean_users)
print("Numbers of clean orders: %d" % n_clean_orders)
print("Numbers of clean products: %d" % n_clean_products)

Numbers of clean users: 1374
Numbers of clean orders: 135816
Numbers of clean products: 16650


In [35]:
product_reid_mapping = product_n_orders_df["product_id"].to_dict()
for k, v in product_reid_mapping.items():
    mask = clean_df["product_id"] == v
    clean_df.loc[mask, "product_reid"] = k

In [36]:
# https://stackoverflow.com/questions/18138693/replicating-group-concat-for-pandas-dataframe
P = np.zeros((n_clean_products, n_clean_products))

complete_order_df = clean_df.groupby("order_id").apply(lambda x: list(x["product_reid"]))
for line in complete_order_df:
    for product in line:
        for other_product in line:
            P[product-1, other_product-1] += 1

In [48]:
n_products = complete_order_df.apply(lambda x: len(x)).sum()

1151721

In [53]:
priors = product_n_orders_df["order_id_count"] / product_n_orders_df["order_id_count"].sum()
priors

In [40]:
(clean_df["product_reid"] == 0).sum()

2711

In [33]:
complete_order_df

order_id
15                                           [0, 1, 2, 3, 4]
50                                   [5, 6, 7, 8, 9, 10, 11]
79                          [12, 13, 14, 15, 16, 17, 18, 19]
95         [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 3...
128                 [34, 35, 36, 37, 38, 39, 40, 41, 42, 43]
                                 ...                        
3420983    [1172245, 1172246, 1172247, 1172248, 1172249, ...
3420984    [1172265, 1172266, 1172267, 1172268, 1172269, ...
3421004    [1172273, 1172274, 1172275, 1172276, 1172277, ...
3421027    [1172288, 1172289, 1172290, 1172291, 1172292, ...
3421069    [1172300, 1172301, 1172302, 1172303, 1172304, ...
Length: 135816, dtype: object

In [34]:
clean_df.groupby("order_id").apply(lambda x: list(x["product_id"]))

order_id
15                         [19660, 21195, 7461, 2996, 32463]
50               [397, 34993, 17183, 37788, 1025, 4799, 276]
79         [4210, 4377, 39461, 13627, 13110, 12397, 4840,...
95         [9337, 10121, 37630, 22035, 20379, 39928, 1239...
128        [4421, 18770, 24184, 13176, 42265, 16249, 2827...
                                 ...                        
3420983    [40311, 33245, 21903, 23288, 9092, 25659, 2941...
3420984    [47209, 11744, 22760, 45448, 14240, 33750, 183...
3421004    [10603, 43858, 35958, 35425, 2732, 3793, 21137...
3421027    [9689, 37538, 24852, 46906, 28204, 20580, 9020...
3421069    [28199, 9426, 46667, 10831, 27966, 34243, 3838...
Length: 135816, dtype: object

In [19]:
clean_df

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,special,user_id,store_id,distance,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,department,aisle
0,15,19660,1,1,15,54901,1,0.304608,51,3,11,2.0,Spring Water,115,7,beverages,water seltzer sparkling water
1,15,21195,2,1,0,54901,1,0.304608,51,3,11,2.0,Organic Extra Virgin Olive Oil,19,13,pantry,oils vinegars
2,15,7461,3,1,50,54901,1,0.304608,51,3,11,2.0,Pinto Beans No Salt Added,59,15,canned goods,canned meals beans
3,15,2996,4,1,0,54901,1,0.304608,51,3,11,2.0,Honeysuckle Hand Soap,25,11,personal care,soap
4,15,32463,5,1,0,54901,1,0.304608,51,3,11,2.0,Olive Oil & Aloe Vera Hand Soap,25,11,personal care,soap
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1172307,3421069,21137,8,1,0,119835,6,1.806216,75,3,13,2.0,Organic Strawberries,24,4,produce,fresh fruits
1172308,3421069,30391,9,1,15,119835,6,1.806216,75,3,13,2.0,Organic Cucumber,83,4,produce,fresh vegetables
1172309,3421069,18656,10,1,0,119835,6,1.806216,75,3,13,2.0,Organic Red Potato,83,4,produce,fresh vegetables
1172310,3421069,26940,11,1,15,119835,6,1.806216,75,3,13,2.0,Organic Large Green Asparagus,83,4,produce,fresh vegetables


In [None]:
for line in product_n_orders_df:
     for product in line: