In [14]:
import pandas as pd
import duckdb
import random

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

## La clause OVER

In [15]:
# Define the furniture data
furniture_data = [
    ("Chairs", "Chair 1", 5.2),
    ("Chairs", "Chair 2", 4.5),
    ("Chairs", "Chair 3", 6.8),
    ("Sofas", "Sofa 1", 25.5),
    ("Sofas", "Sofa 2", 20.3),
    ("Sofas", "Sofa 3", 30.0),
    ("Tables", "Table 1", 15.0),
    ("Tables", "Table 2", 12.5),
    ("Tables", "Table 3", 18.2),
]

# Create a pandas DataFrame from the predefined data
df = pd.DataFrame(furniture_data, columns=["category", "item", "weight"])

# Print the DataFrame
df

Unnamed: 0,category,item,weight
0,Chairs,Chair 1,5.2
1,Chairs,Chair 2,4.5
2,Chairs,Chair 3,6.8
3,Sofas,Sofa 1,25.5
4,Sofas,Sofa 2,20.3
5,Sofas,Sofa 3,30.0
6,Tables,Table 1,15.0
7,Tables,Table 2,12.5
8,Tables,Table 3,18.2


### Over tout seul

On veut le poids total de tous les articles

In [16]:
query = """
SELECT  SUM(weight)  AS poids_total,
FROM df
"""
duckdb.sql(query).df()

Unnamed: 0,poids_total
0,138.0


On veut le poids total de tous les articles dans une colonne à part

In [17]:
query = """
SELECT *,
SUM(weight) OVER() AS poids_total,
FROM df
"""
duckdb.sql(query).df()

Unnamed: 0,category,item,weight,poids_total
0,Chairs,Chair 1,5.2,138.0
1,Chairs,Chair 2,4.5,138.0
2,Chairs,Chair 3,6.8,138.0
3,Sofas,Sofa 1,25.5,138.0
4,Sofas,Sofa 2,20.3,138.0
5,Sofas,Sofa 3,30.0,138.0
6,Tables,Table 1,15.0,138.0
7,Tables,Table 2,12.5,138.0
8,Tables,Table 3,18.2,138.0


<blockquote> Attends, tu nous as fait galérer à faire des GROUPING SETS + SELF JOIN pour avoir le total sur la droite de la table alors qu'on pouvait faire ça ?! </blockquote> 

C'était l'occasion de vous faire réviser le self join ;) 

Le Grouping Sets est surtout utile dans la situation où on veut la somme totale (sur l'ensemble) dans la même colonne que les sommes pour les sous-ensembles

Mais ce n'est pas tout ce qu'OVER peut faire: 

### Over(ORDER BY): running total

Il suffit ensuite de changer la fonction d'agrégation pour obtenir un running total:

In [18]:
query = """
SELECT *,
SUM(weight) OVER(ORDER BY item) AS poids_total,
FROM df
"""
duckdb.sql(query).df()

Unnamed: 0,category,item,weight,poids_total
0,Chairs,Chair 1,5.2,5.2
1,Chairs,Chair 2,4.5,9.7
2,Chairs,Chair 3,6.8,16.5
3,Sofas,Sofa 1,25.5,42.0
4,Sofas,Sofa 2,20.3,62.3
5,Sofas,Sofa 3,30.0,92.3
6,Tables,Table 1,15.0,107.3
7,Tables,Table 2,12.5,119.8
8,Tables,Table 3,18.2,138.0


Ou un moving average:

### Over (ORDER BY): Moving average

In [19]:
query = """
SELECT *,
AVG(weight) OVER(ORDER BY item DESC) AS poids_total,
FROM df
"""
duckdb.sql(query).df()

Unnamed: 0,category,item,weight,poids_total
0,Tables,Table 3,18.2,18.2
1,Tables,Table 2,12.5,15.35
2,Tables,Table 1,15.0,15.233333
3,Sofas,Sofa 3,30.0,18.925
4,Sofas,Sofa 2,20.3,19.2
5,Sofas,Sofa 1,25.5,20.25
6,Chairs,Chair 3,6.8,18.328571
7,Chairs,Chair 2,4.5,16.6
8,Chairs,Chair 1,5.2,15.333333


## Exercices

In [22]:
df = pd.read_csv("data/capteur_a_retrail.csv")
df

Unnamed: 0,date,capteur_id,visiteurs_count,weekday,moyenne_du_mois,threshold_twenty_pct
0,2023-08-01,porte_a,4200.0,3,4920.0,3936.0
1,2023-08-02,porte_a,5300.0,4,4920.0,3936.0
2,2023-08-03,porte_a,4400.0,5,4920.0,3936.0
3,2023-08-04,porte_a,5500.0,6,4920.0,3936.0
4,2023-08-05,porte_a,6000.0,7,4920.0,3936.0
5,2023-08-07,porte_a,4200.0,2,4920.0,3936.0
6,2023-08-08,porte_a,4700.0,3,4920.0,3936.0
7,2023-08-09,porte_a,5300.0,4,4920.0,3936.0
8,2023-08-10,porte_a,4400.0,5,4920.0,3936.0
9,2023-08-11,porte_a,5500.0,6,4920.0,3936.0


Récupérez la somme totale des visiteurs sur l'ensemble de la donnée :

In [23]:
query = """
SELECT 
    SUM(visiteurs_count) AS total_visiteur
FROM df
"""
duckdb.sql(query)

┌────────────────┐
│ total_visiteur │
│     double     │
├────────────────┤
│       146950.0 │
└────────────────┘

In [24]:
# %load solutions/1sum_total.py
query = """
SELECT  SUM(visiteurs_count)  AS total_visiteurs,
FROM df
"""
duckdb.sql(query).df()


Unnamed: 0,total_visiteurs
0,146950.0


Utilisez OVER() pour avoir ce même total sur l'ensemble de la table, <br />
Dans une nouvelle colonne "total_visiteurs"

In [26]:
query = """
SELECT 
    *,
    SUM(visiteurs_count) OVER() AS total_visiteur
FROM df
ORDER BY
    weekday
"""
duckdb.sql(query).df()

Unnamed: 0,date,capteur_id,visiteurs_count,weekday,moyenne_du_mois,threshold_twenty_pct,total_visiteur
0,2023-08-07,porte_a,4200.0,2,4920.0,3936.0,146950.0
1,2023-08-14,porte_a,3900.0,2,4920.0,3936.0,146950.0
2,2023-08-21,porte_a,4300.0,2,4920.0,3936.0,146950.0
3,2023-08-28,porte_a,4300.0,2,4920.0,3936.0,146950.0
4,2023-09-04,porte_a,4500.0,2,4700.0,3760.0,146950.0
5,2023-08-01,porte_a,4200.0,3,4920.0,3936.0,146950.0
6,2023-08-08,porte_a,4700.0,3,4920.0,3936.0,146950.0
7,2023-08-15,porte_a,4200.0,3,4920.0,3936.0,146950.0
8,2023-08-22,porte_a,4900.0,3,4920.0,3936.0,146950.0
9,2023-08-29,porte_a,4750.0,3,4920.0,3936.0,146950.0


In [27]:
# %load solutions/2sum_over.py
query = """
SELECT *, 
SUM(visiteurs_count) OVER() AS total_visiteurs,
FROM df
ORDER BY weekday
"""
duckdb.sql(query).df().head(10)


Unnamed: 0,date,capteur_id,visiteurs_count,weekday,moyenne_du_mois,threshold_twenty_pct,total_visiteurs
0,2023-08-07,porte_a,4200.0,2,4920.0,3936.0,146950.0
1,2023-08-14,porte_a,3900.0,2,4920.0,3936.0,146950.0
2,2023-08-21,porte_a,4300.0,2,4920.0,3936.0,146950.0
3,2023-08-28,porte_a,4300.0,2,4920.0,3936.0,146950.0
4,2023-09-04,porte_a,4500.0,2,4700.0,3760.0,146950.0
5,2023-08-01,porte_a,4200.0,3,4920.0,3936.0,146950.0
6,2023-08-08,porte_a,4700.0,3,4920.0,3936.0,146950.0
7,2023-08-15,porte_a,4200.0,3,4920.0,3936.0,146950.0
8,2023-08-22,porte_a,4900.0,3,4920.0,3936.0,146950.0
9,2023-08-29,porte_a,4750.0,3,4920.0,3936.0,146950.0


Votre manager veut que la colonne total contienne le total progressif <br />
des visiteurs sur le mois.


In [28]:
query = """
SELECT 
    *,
    SUM(visiteurs_count) OVER(ORDER BY date) AS running_total_visiteurs
FROM df

"""
duckdb.sql(query).df()

Unnamed: 0,date,capteur_id,visiteurs_count,weekday,moyenne_du_mois,threshold_twenty_pct,running_total_visiteurs
0,2023-08-01,porte_a,4200.0,3,4920.0,3936.0,4200.0
1,2023-08-02,porte_a,5300.0,4,4920.0,3936.0,9500.0
2,2023-08-03,porte_a,4400.0,5,4920.0,3936.0,13900.0
3,2023-08-04,porte_a,5500.0,6,4920.0,3936.0,19400.0
4,2023-08-05,porte_a,6000.0,7,4920.0,3936.0,25400.0
5,2023-08-07,porte_a,4200.0,2,4920.0,3936.0,29600.0
6,2023-08-08,porte_a,4700.0,3,4920.0,3936.0,34300.0
7,2023-08-09,porte_a,5300.0,4,4920.0,3936.0,39600.0
8,2023-08-10,porte_a,4400.0,5,4920.0,3936.0,44000.0
9,2023-08-11,porte_a,5500.0,6,4920.0,3936.0,49500.0


In [29]:
# %load solutions/3progressive_total.py
query = """
SELECT *, 
SUM(visiteurs_count) OVER(ORDER BY date) AS total_visiteurs,
FROM df
ORDER BY date
"""
duckdb.sql(query).df().head()


Unnamed: 0,date,capteur_id,visiteurs_count,weekday,moyenne_du_mois,threshold_twenty_pct,total_visiteurs
0,2023-08-01,porte_a,4200.0,3,4920.0,3936.0,4200.0
1,2023-08-02,porte_a,5300.0,4,4920.0,3936.0,9500.0
2,2023-08-03,porte_a,4400.0,5,4920.0,3936.0,13900.0
3,2023-08-04,porte_a,5500.0,6,4920.0,3936.0,19400.0
4,2023-08-05,porte_a,6000.0,7,4920.0,3936.0,25400.0


Il veut maintenant avoir une moyenne de visiteurs sur le mois <br />
qui se met à jour progressivement en fonction des résultats de la journée

In [31]:
query = """
SELECT 
    *,
    AVG(visiteurs_count) OVER(ORDER BY date) AS running_AVG_visiteurs
FROM df

"""
duckdb.sql(query).df()

Unnamed: 0,date,capteur_id,visiteurs_count,weekday,moyenne_du_mois,threshold_twenty_pct,running_AVG_visiteurs
0,2023-08-01,porte_a,4200.0,3,4920.0,3936.0,4200.0
1,2023-08-02,porte_a,5300.0,4,4920.0,3936.0,4750.0
2,2023-08-03,porte_a,4400.0,5,4920.0,3936.0,4633.333333
3,2023-08-04,porte_a,5500.0,6,4920.0,3936.0,4850.0
4,2023-08-05,porte_a,6000.0,7,4920.0,3936.0,5080.0
5,2023-08-07,porte_a,4200.0,2,4920.0,3936.0,4933.333333
6,2023-08-08,porte_a,4700.0,3,4920.0,3936.0,4900.0
7,2023-08-09,porte_a,5300.0,4,4920.0,3936.0,4950.0
8,2023-08-10,porte_a,4400.0,5,4920.0,3936.0,4888.888889
9,2023-08-11,porte_a,5500.0,6,4920.0,3936.0,4950.0


In [33]:
# %load solutions/4progressive_avg.py
query = """
SELECT *, 
AVG(visiteurs_count) OVER(ORDER BY date) AS total_visiteurs,
FROM df
ORDER BY date
"""
duckdb.sql(query).df()


Unnamed: 0,date,capteur_id,visiteurs_count,weekday,moyenne_du_mois,threshold_twenty_pct,total_visiteurs
0,2023-08-01,porte_a,4200.0,3,4920.0,3936.0,4200.0
1,2023-08-02,porte_a,5300.0,4,4920.0,3936.0,4750.0
2,2023-08-03,porte_a,4400.0,5,4920.0,3936.0,4633.333333
3,2023-08-04,porte_a,5500.0,6,4920.0,3936.0,4850.0
4,2023-08-05,porte_a,6000.0,7,4920.0,3936.0,5080.0
5,2023-08-07,porte_a,4200.0,2,4920.0,3936.0,4933.333333
6,2023-08-08,porte_a,4700.0,3,4920.0,3936.0,4900.0
7,2023-08-09,porte_a,5300.0,4,4920.0,3936.0,4950.0
8,2023-08-10,porte_a,4400.0,5,4920.0,3936.0,4888.888889
9,2023-08-11,porte_a,5500.0,6,4920.0,3936.0,4950.0


C'est bien, mais le P.O. estime qu'il n'est pas nécessaire <br />
de garder les anciennes valeurs au delà d'une semaine.

Il veut un "moving average" uniquement sur les 7 derniers jours.