In [1]:
import pandas as pd
import random
from datetime import datetime, timedelta
import duckdb

random.seed(42)

## Rappel

Avec la clause OVER(), on a vu qu'on pouvait calculer <br />
une agrégation sur l'ensemble des lignes de la table

Puis on a vu que OVER(ORDER BY col) permet d'avoir <br />
une agrégation de toutes les lignes précédentes jusqu'à la ligne en cours

Comment faire si on veut limiter le nombre de lignes précédentes <br /> prises en compte dans le calcul ?

In [13]:
start_date = datetime(2023, 9, 1)
end_date = datetime(2023, 9, 30)
date_range = [start_date + timedelta(days=x) for x in range((end_date - start_date).days + 1)]

sales_data = [random.randint(1, 7) * 1000 for _ in range(len(date_range))]

df = pd.DataFrame({"date": date_range, "daily_sales": sales_data})

df.head()

Unnamed: 0,date,daily_sales
0,2023-09-01,4000
1,2023-09-02,2000
2,2023-09-03,4000
3,2023-09-04,5000
4,2023-09-05,3000


Rappel: somme et moyenne progressives sur toute la donnée:

In [14]:
query = """
SELECT date, daily_sales,
SUM(daily_sales) OVER(ORDER BY date) as running_total,
COUNT(daily_sales) OVER(ORDER BY date) as running_count,
AVG(daily_sales) OVER(ORDER BY date) as running_mean,
from df
"""
duckdb.sql(query).df().head(10)

Unnamed: 0,date,daily_sales,running_total,running_count,running_mean
0,2023-09-01,4000,4000.0,1,4000.0
1,2023-09-02,2000,6000.0,2,3000.0
2,2023-09-03,4000,10000.0,3,3333.333333
3,2023-09-04,5000,15000.0,4,3750.0
4,2023-09-05,3000,18000.0,5,3600.0
5,2023-09-06,7000,25000.0,6,4166.666667
6,2023-09-07,7000,32000.0,7,4571.428571
7,2023-09-08,1000,33000.0,8,4125.0
8,2023-09-09,7000,40000.0,9,4444.444444
9,2023-09-10,7000,47000.0,10,4700.0


Rappel: le P.O. estime qu'il n'est pas nécessaire <br />
de garder les anciennes valeurs au delà d'une semaine.

Il veut un "moving average" uniquement sur les 7 derniers jours.

### La syntaxe ROWS BETWEEN 

In [4]:
query = """
SELECT date, daily_sales,
SUM(daily_sales) OVER(
    ORDER BY date 
    ROWS BETWEEN 2 PRECEDING and CURRENT ROW
    ) AS moving_total_three_last_rows,
SUM(daily_sales) OVER(
    ORDER BY date
    ) AS moving_total,
from df
"""
duckdb.sql(query).df().head(7)

Unnamed: 0,date,daily_sales,moving_total_three_last_rows,moving_total
0,2023-09-01,6000,6000.0,6000.0
1,2023-09-02,1000,7000.0,7000.0
2,2023-09-03,1000,8000.0,8000.0
3,2023-09-04,6000,8000.0,14000.0
4,2023-09-05,3000,10000.0,17000.0
5,2023-09-06,2000,11000.0,19000.0
6,2023-09-07,2000,7000.0,21000.0


Exercice: faites la même chose avec la moyenne  <br />
(pas de C/C => tapez l'instruction pour mieux mémoriser la syntaxe)

## Fenêtre avant après

In [5]:
query = """
SELECT date, daily_sales,
SUM(daily_sales) OVER(
    ORDER BY date 
    ROWS BETWEEN 1 PRECEDING and 1 FOLLOWING
    ) AS moving_total,
AVG(daily_sales) OVER(
    ORDER BY date 
    ROWS BETWEEN 1 PRECEDING and 1 FOLLOWING)
    AS moving_average,
from df
"""
duckdb.sql(query).df().head(7)

Unnamed: 0,date,daily_sales,moving_total,moving_average
0,2023-09-01,6000,7000.0,3500.0
1,2023-09-02,1000,8000.0,2666.666667
2,2023-09-03,1000,8000.0,2666.666667
3,2023-09-04,6000,10000.0,3333.333333
4,2023-09-05,3000,11000.0,3666.666667
5,2023-09-06,2000,7000.0,2333.333333
6,2023-09-07,2000,6000.0,2000.0


Exercice: <br />
Faites un moving average avec une fenêtre de cinq lignes <br />
deux avant, la ligne en cours, et deux lignes après

In [16]:
%load solutions/5moving_average_2before_2after.py

Unnamed: 0,date,daily_sales,moving_average
0,2023-09-01,4000,3333.333333
1,2023-09-02,2000,3750.0
2,2023-09-03,4000,3600.0
3,2023-09-04,5000,4200.0
4,2023-09-05,3000,5200.0
5,2023-09-06,7000,4600.0
6,2023-09-07,7000,5000.0


## Exercices

In [17]:
capteurs = pd.read_csv("data/capteur_a_retrail.csv")
capteurs.head(8)

Unnamed: 0,date,capteur_id,visiteurs_count,weekday,moyenne_du_mois,threshold_twenty_pct
0,2023-08-01,porte_a,4200.0,3,4920.0,3936.0
1,2023-08-02,porte_a,5300.0,4,4920.0,3936.0
2,2023-08-03,porte_a,4400.0,5,4920.0,3936.0
3,2023-08-04,porte_a,5500.0,6,4920.0,3936.0
4,2023-08-05,porte_a,6000.0,7,4920.0,3936.0
5,2023-08-07,porte_a,4200.0,2,4920.0,3936.0
6,2023-08-08,porte_a,4700.0,3,4920.0,3936.0
7,2023-08-09,porte_a,5300.0,4,4920.0,3936.0


### A) Faire la moyenne mobile sur les 7 derniers jours

In [19]:
%load solutions/6moving_average_capteurs_last_seven_days.py

Unnamed: 0,date,capteur_id,visiteurs_count,weekday,moyenne_du_mois,threshold_twenty_pct,seven_daysmoving_average
0,2023-08-01,porte_a,4200.0,3,4920.0,3936.0,4200.0
1,2023-08-02,porte_a,5300.0,4,4920.0,3936.0,4750.0
2,2023-08-03,porte_a,4400.0,5,4920.0,3936.0,4633.333333
3,2023-08-04,porte_a,5500.0,6,4920.0,3936.0,4850.0
4,2023-08-05,porte_a,6000.0,7,4920.0,3936.0,5080.0
5,2023-08-07,porte_a,4200.0,2,4920.0,3936.0,4933.333333
6,2023-08-08,porte_a,4700.0,3,4920.0,3936.0,4900.0
7,2023-08-09,porte_a,5300.0,4,4920.0,3936.0,5057.142857
8,2023-08-10,porte_a,4400.0,5,4920.0,3936.0,4928.571429
9,2023-08-11,porte_a,5500.0,6,4920.0,3936.0,5085.714286


### B) Vérification de cette moyenne mobile

Ajouter le running total sur les sept derniers jours <br />
le count(*) sur les sept derniers jours <br />
la division de ce running_total par le count <br />
pour comparer avec la moyenne précédemment obtenue

In [22]:
%load solutions/7verification_calculs.py

Unnamed: 0,date,capteur_id,visiteurs_count,weekday,moyenne_du_mois,threshold_twenty_pct,seven_days_running_total,seven_days_moving_count,verif,seven_days_moving_average
0,2023-08-01,porte_a,4200.0,3,4920.0,3936.0,4200.0,1,4200.0,4200.0
1,2023-08-02,porte_a,5300.0,4,4920.0,3936.0,9500.0,2,4750.0,4750.0
2,2023-08-03,porte_a,4400.0,5,4920.0,3936.0,13900.0,3,4633.333333,4633.333333
3,2023-08-04,porte_a,5500.0,6,4920.0,3936.0,19400.0,4,4850.0,4850.0
4,2023-08-05,porte_a,6000.0,7,4920.0,3936.0,25400.0,5,5080.0,5080.0
5,2023-08-07,porte_a,4200.0,2,4920.0,3936.0,29600.0,6,4933.333333,4933.333333
6,2023-08-08,porte_a,4700.0,3,4920.0,3936.0,34300.0,7,4900.0,4900.0
7,2023-08-09,porte_a,5300.0,4,4920.0,3936.0,35400.0,7,5057.142857,5057.142857
8,2023-08-10,porte_a,4400.0,5,4920.0,3936.0,34500.0,7,4928.571429,4928.571429
9,2023-08-11,porte_a,5500.0,6,4920.0,3936.0,35600.0,7,5085.714286,5085.714286


Si on regarde les dernières lignes, on s'aperçoit qu'on a toujours notre problème

In [23]:
query = """
SELECT *,
AVG(visiteurs_count) OVER(
    ORDER BY DATE 
    ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
    ) AS seven_daysmoving_average,
seven_daysmoving_average * 0.8 AS new_threshold
FROM capteurs
"""
duckdb.sql(query).df().tail(10)

Unnamed: 0,date,capteur_id,visiteurs_count,weekday,moyenne_du_mois,threshold_twenty_pct,seven_daysmoving_average,new_threshold
20,2023-08-24,porte_a,4700.0,5,4920.0,3936.0,4785.714286,3828.571429
21,2023-08-25,porte_a,5800.0,6,4920.0,3936.0,5100.0,4080.0
22,2023-08-26,porte_a,6000.0,7,4920.0,3936.0,5242.857143,4194.285714
23,2023-08-28,porte_a,4300.0,2,4920.0,3936.0,5071.428571,4057.142857
24,2023-08-29,porte_a,4750.0,3,4920.0,3936.0,5135.714286,4108.571429
25,2023-08-30,porte_a,5500.0,4,4920.0,3936.0,5221.428571,4177.142857
26,2023-08-31,porte_a,4700.0,5,4920.0,3936.0,5107.142857,4085.714286
27,2023-09-01,porte_a,5600.0,6,4700.0,3760.0,5235.714286,4188.571429
28,2023-09-02,porte_a,4000.0,7,4700.0,3760.0,4978.571429,3982.857143
29,2023-09-04,porte_a,4500.0,2,4700.0,3760.0,4764.285714,3811.428571


Conclusion: ça permet de faire les 7 derniers jours, mais pas les 7 derniers Samedi !