In [1]:
import pandas as pd
import duckdb

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [2]:
data = {
    'name': ['Toufik', 'Jean-Nicolas', 'Daniel', 'Kaouter', 'Sylvie',
             'Sebastien', 'Diane', 'Romain', 'François', 'Anna',
             'Zeinaba', 'Gregory', 'Karima', 'Arthur', 'Benjamin'],
    'wage': [60000, 75000, 55000, 100000, 70000,
             90000, 65000, 100000, 68000, 85000,
             100000, 120000, 95000, 83000, 110000],
    'department': ['IT', 'HR', 'SALES', 'IT', 'IT',
                   'HR', 'SALES', 'IT', 'HR', 'SALES',
                   'IT', 'IT', 'HR', 'SALES', 'CEO'],
    'sex': ['H', 'H', 'H', 'F', 'F',
           'H', 'F', 'H', 'H', 'F',
           'F', 'H', 'F', 'H', 'H',]
}
wages = pd.DataFrame(data)
wages

Unnamed: 0,name,wage,department,sex
0,Toufik,60000,IT,H
1,Jean-Nicolas,75000,HR,H
2,Daniel,55000,SALES,H
3,Kaouter,100000,IT,F
4,Sylvie,70000,IT,F
5,Sebastien,90000,HR,H
6,Diane,65000,SALES,F
7,Romain,100000,IT,H
8,François,68000,HR,H
9,Anna,85000,SALES,F


## La clause QUALIFY

Pour ne filtrer sur une agrégation après un GROUPBY, <br />
Il existe le keyword "HAVING":

In [3]:
query = """
SELECT department,
MAX(wage) AS max_wage
FROM wages
-- WHERE max_wage > 100000  -- won't work
GROUP BY department
-- HAVING max_wage > 100000
"""
duckdb.sql(query).df()

Unnamed: 0,department,max_wage
0,HR,95000
1,CEO,110000
2,IT,120000
3,SALES,85000


In [4]:
query = """
SELECT *,
DENSE_RANK() OVER(PARTITION BY department ORDER BY wage DESC) AS index
FROM wages
-- WHERE index = 2  -- won't work either
-- HAVING index = 2  -- nice try, but nope
"""
duckdb.sql(query).df()

Unnamed: 0,name,wage,department,sex,index
0,Gregory,120000,IT,H,1
1,Kaouter,100000,IT,F,2
2,Romain,100000,IT,H,2
3,Zeinaba,100000,IT,F,2
4,Sylvie,70000,IT,F,3
5,Toufik,60000,IT,H,4
6,Karima,95000,HR,F,1
7,Sebastien,90000,HR,H,2
8,Jean-Nicolas,75000,HR,H,3
9,François,68000,HR,H,4


Si on souhaite faire un filtrage similaire <br />
sur une colonne crée avec une window function, <br />
il faut utiliser QUALIFY:

In [5]:
query = """
SELECT *,
DENSE_RANK() OVER(
    PARTITION BY department
    ORDER BY wage DESC) AS index
FROM wages
QUALIFY index = 2
"""
duckdb.sql(query).df()

Unnamed: 0,name,wage,department,sex,index
0,Sebastien,90000,HR,H,2
1,Kaouter,100000,IT,F,2
2,Romain,100000,IT,H,2
3,Zeinaba,100000,IT,F,2
4,Arthur,83000,SALES,H,2


### Exercice: 

In [6]:
import random

df = pd.read_csv("data/capteur_a_retrail.csv")
df_porte_b = df.copy()
# On ajoute une porte_b
df_porte_b["capteur_id"] = "porte_b"
df_porte_b["visiteurs_count"] = df_porte_b["visiteurs_count"].apply(lambda x: round(x * (random.random() + 0.5), 0))
df = pd.concat([df, df_porte_b])

On souhaite avoir la liste des date pour lesquelles <br />
le capteur A ou B ont dépassé les 6000 visiteurs journaliers <br />
en moyenne sur les 4 derniers jours similaires

("si depuis un mois, tous les samedi, on est à 6000+ visiteurs en moyenne, faut agrandir la porte !")

In [7]:
df.head()

Unnamed: 0,date,capteur_id,visiteurs_count,weekday,moyenne_du_mois,threshold_twenty_pct
0,2023-08-01,porte_a,4200.0,3,4920.0,3936.0
1,2023-08-02,porte_a,5300.0,4,4920.0,3936.0
2,2023-08-03,porte_a,4400.0,5,4920.0,3936.0
3,2023-08-04,porte_a,5500.0,6,4920.0,3936.0
4,2023-08-05,porte_a,6000.0,7,4920.0,3936.0


In [15]:
query = """
SELECT
    *,
    AVG(visiteurs_count) OVER(
        PARTITION BY capteur_id, weekday ORDER BY  date ROWS BETWEEN 3 PRECEDING and CURRENT ROW) AS mean_4_same_days
FROM df
QUALIFY mean_4_same_days > 6000

"""
duckdb.sql(query).df()

Unnamed: 0,date,capteur_id,visiteurs_count,weekday,moyenne_du_mois,threshold_twenty_pct,mean_4_same_days
0,2023-08-12,porte_b,8182.0,7,4920.0,3936.0,6086.5
1,2023-08-01,porte_b,6027.0,3,4920.0,3936.0,6027.0
2,2023-09-01,porte_b,8201.0,6,4700.0,3760.0,6464.5


In [16]:
query = """
SELECT *,
AVG(visiteurs_count) OVER(
    PARTITION BY capteur_id, weekday
    ORDER BY Date
    ) AS avg_visitors_weekday
FROM df
QUALIFY avg_visitors_weekday > 6000
"""
duckdb.sql(query).df()


Unnamed: 0,date,capteur_id,visiteurs_count,weekday,moyenne_du_mois,threshold_twenty_pct,avg_visitors_weekday
0,2023-08-01,porte_b,6027.0,3,4920.0,3936.0,6027.0
1,2023-09-01,porte_b,8201.0,6,4700.0,3760.0,6072.4
2,2023-08-12,porte_b,8182.0,7,4920.0,3936.0,6086.5


<img src="images/zach_wilson_qualify_wow.png" />