In [1]:
import numpy as np
import pandas as pd
import torch

In [99]:
product_df = pd.read_csv("./data/products.csv")
aisles_df = pd.read_csv("./data/aisles.csv")
deparments_df = pd.read_csv("./data/departments.csv")

products_df = product_df.merge(deparments_df, on="department_id", how="inner")
products_df = products_df.merge(aisles_df, on="aisle_id", how="inner")
products_df

Unnamed: 0,product_id,product_name,aisle_id,department_id,department,aisle
0,1,Chocolate Sandwich Cookies,61,19,snacks,cookies cakes
1,78,Nutter Butter Cookie Bites Go-Pak,61,19,snacks,cookies cakes
2,102,Danish Butter Cookies,61,19,snacks,cookies cakes
3,172,Gluten Free All Natural Chocolate Chip Cookies,61,19,snacks,cookies cakes
4,285,Mini Nilla Wafers Munch Pack,61,19,snacks,cookies cakes
...,...,...,...,...,...,...
49683,22827,Organic Black Mission Figs,18,10,bulk,bulk dried fruits vegetables
49684,28655,Crystallized Ginger Chunks,18,10,bulk,bulk dried fruits vegetables
49685,30365,Vegetable Chips,18,10,bulk,bulk dried fruits vegetables
49686,38007,Naturally Sweet Plantain Chips,18,10,bulk,bulk dried fruits vegetables


In [16]:
dist_store_df = pd.read_csv("./data/orders_distance_stores_softmax.csv")
dist_store_df = dist_store_df.drop("Unnamed: 0", axis=1)
prod_prior_df = pd.read_csv("./data/order_products__prior_specials.csv")
prod_prior_df = prod_prior_df.drop("Unnamed: 0", axis=1)
prod_prior_df.head(5)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,special
0,15,19660,1,1,15
1,15,21195,2,1,0
2,15,7461,3,1,50
3,15,2996,4,1,0
4,15,32463,5,1,0


In [8]:
dist_store_df.head(5)

Unnamed: 0,user_id,store_id,distance,order_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,210,0,2.304404,1438665,prior,9,1,18,3.0
1,210,0,2.304404,2850206,prior,36,0,7,2.0
2,210,0,2.304404,2406913,prior,42,0,8,3.0
3,210,0,2.304404,1155933,prior,43,2,23,2.0
4,210,0,2.304404,271697,prior,56,5,17,3.0


In [11]:
basic_info_df = pd.DataFrame([
    ["orders_distance_stores_softmax", dist_store_df.shape[0], dist_store_df.shape[1]],
    ["order_products__prior_specials", prod_prior_df.shape[0], prod_prior_df.shape[1]]
    ],
    columns=["File", "# Instances", "# Attributes"]
)
basic_info_df.head(10)


Unnamed: 0,File,# Instances,# Attributes
0,orders_distance_stores_softmax,136026,9
1,order_products__prior_specials,1172312,5


In [25]:
dist_store_df["eval_set"].nunique()
dist_store_df["days_since_prior_order"].isna().sum()

1374

In [30]:
dist_store_df.isna().sum()

user_id                      0
store_id                     0
distance                     0
order_id                     0
eval_set                     0
order_number                 0
order_dow                    0
order_hour_of_day            0
days_since_prior_order    1374
dtype: int64

In [29]:
prod_prior_df.isna().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
special              0
dtype: int64

In [35]:
merged_df = prod_prior_df.merge(dist_store_df, on="order_id")
merged_df.head(13)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,special,user_id,store_id,distance,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,15,19660,1,1,15,54901,1,0.304608,prior,51,3,11,2.0
1,15,21195,2,1,0,54901,1,0.304608,prior,51,3,11,2.0
2,15,7461,3,1,50,54901,1,0.304608,prior,51,3,11,2.0
3,15,2996,4,1,0,54901,1,0.304608,prior,51,3,11,2.0
4,15,32463,5,1,0,54901,1,0.304608,prior,51,3,11,2.0
5,50,397,1,0,0,94330,7,1.00003,prior,25,1,13,4.0
6,50,34993,2,0,50,94330,7,1.00003,prior,25,1,13,4.0
7,50,17183,3,1,0,94330,7,1.00003,prior,25,1,13,4.0
8,50,37788,4,0,30,94330,7,1.00003,prior,25,1,13,4.0
9,50,1025,5,1,15,94330,7,1.00003,prior,25,1,13,4.0


In [47]:
n_users = merged_df["user_id"].nunique()
n_orders = merged_df.groupby("user_id")["order_id"].nunique().sum() / n_users
assert n_orders == 99, "Certains consommateurs ont effectué plus ou moins 99 commandes"

merged_df.groupby("user_id")["order_id"].nunique()

user_id
210       99
310       99
313       99
690       99
786       99
          ..
205483    99
205543    99
205878    99
205972    99
206105    99
Name: order_id, Length: 1374, dtype: int64

In [49]:
merged_df["days_since_prior_order"] = merged_df["days_since_prior_order"].fillna(0)
assert merged_df.isna().sum().sum() == 0, "Il reste des colonnes avec des valeurs NaN"

In [101]:
n_products = merged_df["product_id"].max()
n_stores = merged_df["store_id"].max()
n_users = merged_df["user_id"].max()

print("n_products={}\nn_stores={}\nn_users={}".format(n_products, n_stores, n_users))

n_products=49683
n_stores=9
n_users=206105


In [82]:
# https://stackoverflow.com/questions/18138693/replicating-group-concat-for-pandas-dataframe
P = np.zeros((n_products, n_products))
order_prod_grouped_df = merged_df.groupby("order_id").apply(lambda x: list(x["product_id"]))

In [84]:
for line in order_prod_grouped_df:
    for product in line:
        for other_product in line:
            P[product-1, other_product-1] += 1

In [92]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca.fit(P)
pca.explained_variance_ratio_


PCA(n_components=2)

In [80]:
order_prod_grouped_df

order_id
15                         [19660, 21195, 7461, 2996, 32463]
50               [397, 34993, 17183, 37788, 1025, 4799, 276]
79         [4210, 4377, 39461, 13627, 13110, 12397, 4840,...
95         [9337, 10121, 37630, 22035, 20379, 39928, 1239...
128        [4421, 18770, 24184, 13176, 42265, 16249, 2827...
                                 ...                        
3420983    [40311, 33245, 21903, 23288, 9092, 25659, 2941...
3420984    [47209, 11744, 22760, 45448, 14240, 33750, 183...
3421004    [10603, 43858, 35958, 35425, 2732, 3793, 21137...
3421027    [9689, 37538, 24852, 46906, 28204, 20580, 9020...
3421069    [28199, 9426, 46667, 10831, 27966, 34243, 3838...
Length: 136026, dtype: object