# Case When à l'intérieur des fonctions d'agrégation

In [1]:
import pandas as pd
import duckdb

# Calculer la somme des ventes après réduction

In [2]:
# Sample orders: some rows carry no discount code (None) and one row carries
# a code ('UNKNOWN') that is neither 'DISCOUNT10' nor 'DISCOUNT20'.
data = dict(
    order_id=[1, 2, 3, 4, 5, 6],
    product_id=[101, 102, 101, 103, 102, 103],
    quantity=[5, 3, 2, 4, 6, 2],
    price_per_unit=[10.0, 25.0, 10.0, 8.0, 25.0, 8.0],
    discount_code=[None, 'DISCOUNT10', 'DISCOUNT20', None, None, 'UNKNOWN'],
)

df = pd.DataFrame(data=data)
df

Unnamed: 0,order_id,product_id,quantity,price_per_unit,discount_code
0,1,101,5,10.0,
1,2,102,3,25.0,DISCOUNT10
2,3,101,2,10.0,DISCOUNT20
3,4,103,4,8.0,
4,5,102,6,25.0,
5,6,103,2,8.0,UNKNOWN


On aimerait savoir combien on a vendu au total.
Pour ça, il faut créer la colonne `revenue_after_discount` (quantité × prix unitaire × coefficient de réduction).

In [3]:
def set_discount(discount_code):
    """Return the price multiplier that corresponds to a discount code.

    Parameters
    ----------
    discount_code : str or None
        The code attached to an order.

    Returns
    -------
    float or int
        0.9 for "DISCOUNT10", 0.8 for "DISCOUNT20", and 1 (no reduction)
        for any other value, including None and unrecognised codes.
    """
    known_multipliers = (("DISCOUNT10", 0.9), ("DISCOUNT20", 0.8))
    for code, multiplier in known_multipliers:
        if discount_code == code:
            return multiplier
    # No matching code: the order is billed at full price.
    return 1

In [10]:
# %%timeit: faster than the assign()-based variant
# Adds both columns to df in place: the multiplier first, then the discounted revenue.
df["discount"] = df["discount_code"].apply(set_discount)
df["revenue_after_discount"] = df["quantity"].mul(df["price_per_unit"]).mul(df["discount"])
df

Unnamed: 0,order_id,product_id,quantity,price_per_unit,discount_code,discount,revenue_after_discount
0,1,101,5,10.0,,1.0,50.0
1,2,102,3,25.0,DISCOUNT10,0.9,67.5
2,3,101,2,10.0,DISCOUNT20,0.8,16.0
3,4,103,4,8.0,,1.0,32.0
4,5,102,6,25.0,,1.0,150.0
5,6,103,2,8.0,UNKNOWN,1.0,16.0


In [11]:
# Overall revenue once the discounts are applied.
df.loc[:, "revenue_after_discount"].sum()

331.5

In [5]:
# %%timeit: slower, but closer to the chained "effective pandas" style
# assign() returns a new frame, so the result is displayed here without being stored.
df.assign(discount=df["discount_code"].apply(set_discount)).assign(
    revenue_after_discount=lambda frame: frame["quantity"].mul(frame["price_per_unit"]).mul(frame["discount"])
)

Unnamed: 0,order_id,product_id,quantity,price_per_unit,discount_code,discount,revenue_after_discount
0,1,101,5,10.0,,1.0,50.0
1,2,102,3,25.0,DISCOUNT10,0.9,67.5
2,3,101,2,10.0,DISCOUNT20,0.8,16.0
3,4,103,4,8.0,,1.0,32.0
4,5,102,6,25.0,,1.0,150.0
5,6,103,2,8.0,UNKNOWN,1.0,16.0


In [6]:
# Same chained computation as the previous cell, reduced to the overall discounted revenue.
(
    df.assign(discount=df["discount_code"].apply(set_discount))
    .assign(
        revenue_after_discount=lambda frame: frame["quantity"].mul(frame["price_per_unit"]).mul(frame["discount"])
    )["revenue_after_discount"]
    .sum()
)

331.5

# WITH SQL

In [7]:
# Rebuild df from the raw dict so the SQL section starts from the original
# five columns, without the columns added in the pandas section.
df = pd.DataFrame(data=data)
df

Unnamed: 0,order_id,product_id,quantity,price_per_unit,discount_code
0,1,101,5,10.0,
1,2,102,3,25.0,DISCOUNT10
2,3,101,2,10.0,DISCOUNT20
3,4,103,4,8.0,
4,5,102,6,25.0,
5,6,103,2,8.0,UNKNOWN


Exercice:
- Faites une CTE avec le CASE WHEN dedans (la colonne créée par le CASE WHEN s'appelle "total_revenue")
- Utilisez cette table intermédiaire pour calculer le revenu total une fois les réductions déduites

On aimerait savoir combien on a vendu au total.
Pour ça, il faut créer la colonne "total_revenue" : le revenu de chaque commande après application de la réduction.

In [18]:
# Two-step version: the CTE adds a per-order revenue_after_discount column
# with a CASE WHEN, then the outer query sums that column over all orders.
# DuckDB reads the pandas DataFrame `df` directly by its Python variable name.
query = """
WITH new_col_df AS(
SELECT
    *,
    CASE
        WHEN discount_code = 'DISCOUNT10' THEN quantity * price_per_unit * 0.9
        WHEN discount_code = 'DISCOUNT20' THEN quantity * price_per_unit * 0.8
        ELSE quantity * price_per_unit
    END AS revenue_after_discount
FROM df
)

SELECT
    sum(revenue_after_discount)
FROM new_col_df
"""

duckdb.sql(query)

┌─────────────────────────────┐
│ sum(revenue_after_discount) │
│           double            │
├─────────────────────────────┤
│                       331.5 │
└─────────────────────────────┘

In [23]:
# Single-query version: the CASE WHEN sits directly inside SUM(), so no
# intermediate CTE is needed to obtain the overall discounted revenue.
query = """
SELECT
    SUM(
        CASE
            WHEN discount_code = 'DISCOUNT10' THEN quantity * price_per_unit * 0.9
            WHEN discount_code = 'DISCOUNT20' THEN quantity * price_per_unit * 0.8
            ELSE quantity * price_per_unit
        END 
        ) AS revenue_after_discount
FROM df
"""

duckdb.sql(query)

┌────────────────────────┐
│ revenue_after_discount │
│         double         │
├────────────────────────┤
│                  331.5 │
└────────────────────────┘

In [24]:
# %load solutions/2case_when_inside_cte_sql.py
# %%timeit (514 µs ± 5.71 µs per loop)
# The CTE computes the per-order revenue after discount; the outer query sums it.
# The GROUP BY can be removed to get the overall total.
# NOTE(review): as written, the result has one row per discount_code, but the
# code itself is not selected and there is no ORDER BY, so the rows are
# unlabeled and their order is not guaranteed.
query = """
WITH total_revenue AS(
SELECT discount_code, CASE
            WHEN discount_code = 'DISCOUNT10' THEN quantity * price_per_unit * 0.9
            WHEN discount_code = 'DISCOUNT20' THEN quantity * price_per_unit * 0.8
            ELSE quantity * price_per_unit
        END as total_revenue
FROM df)

SELECT SUM(total_revenue) 
FROM
    total_revenue
GROUP BY 
    discount_code
""" 
duckdb.query(query)


┌────────────────────┐
│ sum(total_revenue) │
│       double       │
├────────────────────┤
│               16.0 │
│               67.5 │
│              232.0 │
│               16.0 │
└────────────────────┘

Maintenant, essayez de tout faire en une seule requête, en "englobant" votre case when par un "SUM()"

In [25]:
# %load solutions/3case_when_inside_sum.py
# %%timeit (507 µs ± 6.31 µs per loop)
# The CASE WHEN is wrapped directly in SUM(), so no CTE is needed.
# The GROUP BY can be removed to get the overall total.
# NOTE(review): as written, the result has one row per discount_code, but the
# code itself is not selected and there is no ORDER BY, so the rows are
# unlabeled and their order is not guaranteed.
query = """
SELECT
    SUM(
        CASE
            WHEN discount_code = 'DISCOUNT10' THEN quantity * price_per_unit * 0.9
            WHEN discount_code = 'DISCOUNT20' THEN quantity * price_per_unit * 0.8
            ELSE quantity * price_per_unit
        END
    ) AS total_revenue
FROM
    df
GROUP BY
    discount_code
""" 
duckdb.query(query)


┌───────────────┐
│ total_revenue │
│    double     │
├───────────────┤
│          16.0 │
│          16.0 │
│         232.0 │
│          67.5 │
└───────────────┘

## Case when inside group by

Il est également possible de faire des CASE WHEN à l'intérieur de la clause GROUP BY:

In [26]:
# Reuse the salary example from the previous notebook.
# Note: this rebinds `data`, which previously held the orders dict.
data = dict(
    name=[
        'Toufik', 'Jean-Nicolas', 'Daniel', 'Kaouter', 'Sylvie',
        'Sebastien', 'Diane', 'Romain', 'François', 'Anna',
        'Zeinaba', 'Gregory', 'Karima', 'Arthur', 'Benjamin',
    ],
    wage=[
        60000, 35000, 55000, 80000, 70000,
        90000, 65000, 72000, 68000, 85000,
        100000, 120000, 95000, 83000, 110000,
    ],
    department=[
        'IT', 'HR', 'SALES', 'IT', 'IT',
        'HR', 'SALES', 'IT', 'HR', 'SALES',
        'IT', 'IT', 'HR', 'SALES', 'CEO',
    ],
)

wages = pd.DataFrame(data=data)

wages.tail()

Unnamed: 0,name,wage,department
10,Zeinaba,100000,IT
11,Gregory,120000,IT
12,Karima,95000,HR
13,Arthur,83000,SALES
14,Benjamin,110000,CEO


On cherche à classer les revenus par catégories:
- Low si <= 50 000
- Medium si < 90 000
- sinon: High

In [27]:
# %%timeit (453 µs ± 11.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each))
# Average wage per (department, salary_range). The CASE expression that
# derives salary_range is written twice: once in SELECT and once in GROUP BY.
query = """
SELECT 
  department, 
  CASE 
      WHEN wage <= 50000 THEN 'Low' 
      WHEN wage < 90000 THEN 'Medium' 
      ELSE 'High' 
  END AS salary_range, 
  AVG(wage) AS average_salary
FROM 
  wages 
GROUP BY 
  department, 
  CASE 
      WHEN wage <= 50000 THEN 'Low' 
      WHEN wage < 90000 THEN 'Medium' 
      ELSE 'High' 
  END;
"""
duckdb.sql(query)

┌────────────┬──────────────┬────────────────┐
│ department │ salary_range │ average_salary │
│  varchar   │   varchar    │     double     │
├────────────┼──────────────┼────────────────┤
│ HR         │ Medium       │        68000.0 │
│ CEO        │ High         │       110000.0 │
│ IT         │ High         │       110000.0 │
│ HR         │ Low          │        35000.0 │
│ HR         │ High         │        92500.0 │
│ IT         │ Medium       │        70500.0 │
│ SALES      │ Medium       │        72000.0 │
└────────────┴──────────────┴────────────────┘

Selon moi, les CASE WHEN dans les groupby ne sont pas la solution la plus facile à maintenir. 

Je préfère largement les CTE pour ce type de besoin:

Exercice: refaites la requête ci-dessus <br />
en utilisant une CTE qui crée la colonne "salary_range" avec un CASE WHEN

Puis en faisant la moyenne des salaires par department et par salary_range

In [29]:
# %load solutions/4case_when_inside_groupby_as_cte.py
# %%timeit (455 µs ± 8.74 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each))
# The CTE derives salary_range once with a CASE WHEN; the outer query then
# groups by department and salary_range without repeating the CASE expression.
# NOTE: the trailing commas before FROM rely on DuckDB accepting them; other
# SQL engines may reject this query.
query = """
WITH salary_range AS (
  SELECT 
    department, 
    wage, 
    CASE 
    WHEN wage <= 50000 THEN 'Low' 
    WHEN wage < 90000 THEN 'Medium' 
    ELSE 'High' 
    END AS salary_range, 
  FROM 
    wages
) 

SELECT 
  department, 
  salary_range, 
  AVG(wage) AS average_salary,
FROM 
  salary_range 
group by 
  department, 
  salary_range

"""
duckdb.sql(query)


┌────────────┬──────────────┬────────────────┐
│ department │ salary_range │ average_salary │
│  varchar   │   varchar    │     double     │
├────────────┼──────────────┼────────────────┤
│ IT         │ Medium       │        70500.0 │
│ SALES      │ Medium       │        72000.0 │
│ HR         │ Medium       │        68000.0 │
│ CEO        │ High         │       110000.0 │
│ HR         │ Low          │        35000.0 │
│ HR         │ High         │        92500.0 │
│ IT         │ High         │       110000.0 │
└────────────┴──────────────┴────────────────┘

Exercice: ajoutez une ligne de code qui permet de savoir combien de personnes sont incluses dans chaque "regroupement"

In [34]:
# Same CTE-based query as the previous exercise, with COUNT(*) added to report
# how many rows (employees) fall into each (department, salary_range) group.
# NOTE: the trailing comma before the CTE's FROM relies on DuckDB accepting it.
query = """
WITH salary_range AS (
  SELECT 
    department, 
    wage, 
    CASE 
    WHEN wage <= 50000 THEN 'Low' 
    WHEN wage < 90000 THEN 'Medium' 
    ELSE 'High' 
    END AS salary_range, 
  FROM 
    wages
) 

SELECT 
  department, 
  salary_range, 
  AVG(wage) AS average_salary,
  COUNT(*) AS number_of_person
FROM 
  salary_range 
group by 
  department, 
  salary_range

"""
duckdb.sql(query)


┌────────────┬──────────────┬────────────────┬──────────────────┐
│ department │ salary_range │ average_salary │ number_of_person │
│  varchar   │   varchar    │     double     │      int64       │
├────────────┼──────────────┼────────────────┼──────────────────┤
│ IT         │ Medium       │        70500.0 │                4 │
│ SALES      │ Medium       │        72000.0 │                4 │
│ HR         │ Medium       │        68000.0 │                1 │
│ CEO        │ High         │       110000.0 │                1 │
│ IT         │ High         │       110000.0 │                2 │
│ HR         │ Low          │        35000.0 │                1 │
│ HR         │ High         │        92500.0 │                2 │
└────────────┴──────────────┴────────────────┴──────────────────┘

In [31]:
# resultat attendu: 