In [1]:
import pandas as pd
import duckdb

In [2]:
data = {
    'name': ['Toufik', 'Jean-Nicolas', 'Daniel', 'Kaouter', 'Sylvie',
             'Sebastien', 'Diane', 'Romain', 'François', 'Anna',
             'Zeinaba', 'Gregory', 'Karima', 'Arthur', 'Benjamin'],
    'wage': [60000, 75000, 55000, 100000, 70000,
             90000, 65000, 100000, 68000, 85000,
             100000, 120000, 95000, 83000, 110000],
    'department': ['IT', 'HR', 'SALES', 'IT', 'IT',
                   'HR', 'SALES', 'IT', 'HR', 'SALES',
                   'IT', 'IT', 'HR', 'SALES', 'CEO'],
    'sex': ['H', 'H', 'H', 'F', 'F',
           'H', 'F', 'H', 'H', 'F',
           'F', 'H', 'F', 'H', 'H',]
}
wages = pd.DataFrame(data)
wages

Unnamed: 0,name,wage,department,sex
0,Toufik,60000,IT,H
1,Jean-Nicolas,75000,HR,H
2,Daniel,55000,SALES,H
3,Kaouter,100000,IT,F
4,Sylvie,70000,IT,F
5,Sebastien,90000,HR,H
6,Diane,65000,SALES,F
7,Romain,100000,IT,H
8,François,68000,HR,H
9,Anna,85000,SALES,F


## La clause QUALIFY

Pour ne filtrer sur une agrégation après un GROUPBY, <br />
Il existe le keyword "HAVING":

In [6]:
query = """
SELECT department,
MAX(wage) AS max_wage
FROM wages
-- WHERE max_wage > 100000  -- won't work
GROUP BY department
-- HAVING max_wage > 100000
"""
duckdb.sql(query).df()

Unnamed: 0,department,max_wage
0,IT,120000
1,HR,95000
2,SALES,85000
3,CEO,110000


In [11]:
query = """
SELECT *,
DENSE_RANK() OVER(PARTITION BY department ORDER BY wage DESC) AS index
FROM wages
-- WHERE index = 2  -- won't work either
-- HAVING index = 2  -- nice try, but nope
"""
duckdb.sql(query).df()

Unnamed: 0,name,wage,department,sex,index
0,Karima,95000,HR,F,1
1,Sebastien,90000,HR,H,2
2,Jean-Nicolas,75000,HR,H,3
3,François,68000,HR,H,4
4,Gregory,120000,IT,H,1
5,Kaouter,100000,IT,F,2
6,Romain,100000,IT,H,2
7,Zeinaba,100000,IT,F,2
8,Sylvie,70000,IT,F,3
9,Toufik,60000,IT,H,4


Si on souhaite faire un filtrage similaire <br />
sur une colonne crée avec une window function, <br />
il faut utiliser QUALIFY:

In [13]:
query = """
SELECT *,
DENSE_RANK() OVER(
    PARTITION BY department
    ORDER BY wage DESC) AS index
FROM wages
QUALIFY index = 2
"""
duckdb.sql(query).df()

Unnamed: 0,name,wage,department,sex,index
0,Sebastien,90000,HR,H,2
1,Kaouter,100000,IT,F,2
2,Romain,100000,IT,H,2
3,Zeinaba,100000,IT,F,2
4,Arthur,83000,SALES,H,2


### Exercice: 

In [14]:
import random

df = pd.read_csv("data/capteur_a_retrail.csv")
df_porte_b = df.copy()
# On ajoute une porte_b
df_porte_b["capteur_id"] = "porte_b"
df_porte_b["visiteurs_count"] = df_porte_b["visiteurs_count"].apply(lambda x: round(x * (random.random() + 0.5), 0))
df = pd.concat([df, df_porte_b])

On souhaite avoir la liste des date pour lesquelles <br />
le capteur A ou B ont dépassé les 6000 visiteurs journaliers <br />
en moyenne sur les 4 derniers jours similaires

("si depuis un mois, tous les samedi, on est à 6000+ visiteurs en moyenne, faut agrandir la porte !")

In [16]:
query = """
SELECT *,
AVG(visiteurs_count) OVER(
    PARTITION BY capteur_id, weekday
    ORDER BY Date
    ) AS avg_visitors_weekday
FROM df
QUALIFY avg_visitors_weekday > 6000
"""
duckdb.sql(query).df()


Unnamed: 0,date,capteur_id,visiteurs_count,weekday,moyenne_du_mois,threshold_twenty_pct,avg_visitors_weekday
0,2023-08-01,porte_b,6264.0,3,4920.0,3936.0,6264.0
1,2023-08-04,porte_b,7202.0,6,4920.0,3936.0,7202.0
2,2023-08-05,porte_b,8794.0,7,4920.0,3936.0,8794.0
3,2023-08-12,porte_b,7858.0,7,4920.0,3936.0,8326.0
4,2023-08-19,porte_b,5197.0,7,4920.0,3936.0,7283.0
5,2023-08-26,porte_b,5965.0,7,4920.0,3936.0,6953.5
6,2023-09-02,porte_b,5982.0,7,4700.0,3760.0,6759.2


<img src="images/zach_wilson_qualify_wow.png" />