In [1]:
import random
import duckdb
import pandas as pd
random.seed(42)

## Création données

In [2]:
# Table des employés
sales = {
    'order_id': list(range(1110, 1198)),
    'customer_id': random.choices([11, 12, 13, 14, 15, 11, 12, 13, 14], k=88),
}

df_sales = pd.DataFrame(sales)
df_sales["date"] = [d // 3 + 1 for d in range(1, 89)]
df_sales

Unnamed: 0,order_id,customer_id,date
0,1110,11,1
1,1111,11,1
2,1112,13,2
3,1113,13,2
4,1114,12,2
...,...,...,...
83,1193,14,29
84,1194,14,29
85,1195,12,29
86,1196,14,30


## Exercice: retrouver tous les clients qui sont venus deux jours de suite

- Faire un self join pour avoir toutes les combinaisons de commandes possibles pour un même client
- Retirer les lignes qui correspondent à la combinaison de la même commande
- Récupérer les lignes où la difference entre la première date et la seconde est = 1

### Self join

In [9]:
combinations_of_sales_per_customer= (
    df_sales.merge(
    df_sales,
    left_on="customer_id",
    right_on="customer_id",
    suffixes=["_first_order", "_second_order"]
    )
)
combinations_of_sales_per_customer

Unnamed: 0,order_id_first_order,customer_id,date_first_order,order_id_second_order,date_second_order
0,1110,11,1,1110,1
1,1110,11,1,1111,1
2,1110,11,1,1117,3
3,1110,11,1,1119,4
4,1110,11,1,1122,5
...,...,...,...,...,...
1753,1197,15,30,1125,6
1754,1197,15,30,1142,12
1755,1197,15,30,1145,13
1756,1197,15,30,1187,27


<img src="images/first_order.gif" />

### interlude: s'assurer de la qualité de notre calcul

On peut s'assurer qu'on a le bon nombre de lignes de la manière suivante:

In [10]:
(df_sales["customer_id"].value_counts() ** 2).sum()

1758

En effet, à chaque fois qu'un client apparaît dans une ligne, il faut joindre cette ligne avec toutes les lignes où il apparaît (y compris la ligne qu'on est en train de traiter)

In [11]:
df_sales["customer_id"].value_counts()

11    25
13    22
14    18
12    17
15     6
Name: customer_id, dtype: int64

(Le client 11 apparaît 25 fois, le client 13 apprarait 22 fois, etc...)

Pour chaque client, on prends le nombre de fois où il apparaît et on mets ça au carré.
Si on fait ça pour l'ensemble des clients, ça donne ça:

In [12]:
df_sales["customer_id"].value_counts() ** 2

11    625
13    484
14    324
12    289
15     36
Name: customer_id, dtype: int64

On peut le vérifier en faisant un self-join de la table des ventes du client 15 sur elle-même:

In [13]:
df_customer_15 = df_sales.query("customer_id == 15")
df_customer_15.merge(
    df_customer_15,
    on="customer_id").shape

(36, 5)

La somme des carrés de toutes les transactions de chaque client =

In [14]:
(df_sales["customer_id"].value_counts() ** 2).sum()

1758

### Virer les lignes qui où une commande a été jointe avec elle même :

In [15]:
mask = (combinations_of_sales_per_customer["order_id_first_order"] != 
        combinations_of_sales_per_customer["order_id_second_order"])
# display(mask)
same_order_removed = combinations_of_sales_per_customer[mask]
same_order_removed

Unnamed: 0,order_id_first_order,customer_id,date_first_order,order_id_second_order,date_second_order
1,1110,11,1,1111,1
2,1110,11,1,1117,3
3,1110,11,1,1119,4
4,1110,11,1,1122,5
5,1110,11,1,1124,6
...,...,...,...,...,...
1752,1197,15,30,1121,5
1753,1197,15,30,1125,6
1754,1197,15,30,1142,12
1755,1197,15,30,1145,13


In [16]:
same_order_removed["delay_between_orders"] = same_order_removed["date_second_order"] - same_order_removed["date_first_order"]
same_order_removed

# beurk, on va voir comment cleaner ça ;) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  same_order_removed["delay_between_orders"] = same_order_removed["date_second_order"] - same_order_removed["date_first_order"]


Unnamed: 0,order_id_first_order,customer_id,date_first_order,order_id_second_order,date_second_order,delay_between_orders
1,1110,11,1,1111,1,0
2,1110,11,1,1117,3,2
3,1110,11,1,1119,4,3
4,1110,11,1,1122,5,4
5,1110,11,1,1124,6,5
...,...,...,...,...,...,...
1752,1197,15,30,1121,5,-25
1753,1197,15,30,1125,6,-24
1754,1197,15,30,1142,12,-18
1755,1197,15,30,1145,13,-17


In [18]:
same_order_removed.query("delay_between_orders == 1").shape

(51, 6)

## Clean pandas


In [47]:
combinations_of_sales_per_customer= (
    df_sales.merge(
    df_sales,
    left_on="customer_id",
    right_on="customer_id",
    suffixes=["_first_order", "_second_order"]
    )
)
mask = (combinations_of_sales_per_customer["order_id_first_order"] != 
        combinations_of_sales_per_customer["order_id_second_order"])
same_order_removed = combinations_of_sales_per_customer[mask]
same_order_removed["delay_between_orders"] = same_order_removed["date_second_order"] - same_order_removed["date_first_order"]
same_order_removed.query("delay_between_orders == 1").head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  same_order_removed["delay_between_orders"] = same_order_removed["date_second_order"] - same_order_removed["date_first_order"]


Unnamed: 0,order_id_first_order,customer_id,date_first_order,order_id_second_order,date_second_order,delay_between_orders
53,1117,11,3,1119,4,1
79,1119,11,4,1122,5,1
105,1122,11,5,1124,6,1
131,1124,11,6,1127,7,1
132,1124,11,6,1129,7,1
210,1136,11,10,1139,11,1
235,1137,11,10,1139,11,1
287,1147,11,13,1149,14,1
313,1149,11,14,1151,15,1
339,1151,11,15,1154,16,1


In [38]:
(
    df_sales.merge(
        df_sales,
        left_on="customer_id",
        right_on="customer_id",
        suffixes=["_first_order", "_second_order"],
    )
    .query("order_id_first_order != order_id_second_order")
    .query("date_second_order - date_first_order == 1")
)#.head(10)

Unnamed: 0,order_id_first_order,customer_id,date_first_order,order_id_second_order,date_second_order
53,1117,11,3,1119,4
79,1119,11,4,1122,5
105,1122,11,5,1124,6
131,1124,11,6,1127,7
132,1124,11,6,1129,7
210,1136,11,10,1139,11
235,1137,11,10,1139,11
287,1147,11,13,1149,14
313,1149,11,14,1151,15
339,1151,11,15,1154,16


## Exercice: faire la même chose en SQL

In [40]:
%load solutions/2self_join_orders_following_day.py

┌─────────────┬─────────────┬──────────────────┬──────────────┬─────────────────┐
│ customer_id │ first_order │ date_first_order │ second_order │ date_next_order │
│    int64    │    int64    │      int64       │    int64     │      int64      │
├─────────────┼─────────────┼──────────────────┼──────────────┼─────────────────┤
│          13 │        1188 │               27 │         1191 │              28 │
│          13 │        1189 │               27 │         1191 │              28 │
│          14 │        1193 │               29 │         1196 │              30 │
│          14 │        1194 │               29 │         1196 │              30 │
│          12 │        1175 │               23 │         1180 │              24 │
│          15 │        1142 │               12 │         1145 │              13 │
│          12 │        1173 │               22 │         1175 │              23 │
│          11 │        1172 │               22 │         1177 │              23 │
│          13 │ 