In [1]:
import pandas as pd
import duckdb

## data

In [2]:
dpt_to_region_mapping = {
    "59":"HDF",
    "62":"HDF",
    "75":"IDF",
    "95":"IDF",
    "83":"PACA",
    "84":"PACA",
}

dpts_dfs = []
for year in range(2016,2022):
    # https://www.insee.fr/fr/statistiques/1893198
    df = pd.read_excel("data/estim-pop-dep-sexe-gca-1975-2023.xls", str(year), skiprows=3)
    subset = df[df["Départements"].isin(["59", "62", "75", "95", "83", "84"])][["Départements", "Unnamed: 7"]]
    subset["year"] = year
    dpts_dfs.append(subset)

dpts_dfs = pd.concat(dpts_dfs)
dpts_dfs.columns = ["departement", "population", "year"]
dpts_dfs["region"] = dpts_dfs["departement"].apply(lambda x: dpt_to_region_mapping.get(x))
dpts_dfs = dpts_dfs[["region", "departement", "year", "population"]]
dpts_dfs.head()

Unnamed: 0,region,departement,year,population
60,HDF,59,2016,2603723
63,HDF,62,2016,1470725
76,IDF,75,2016,2190327
84,PACA,83,2016,1055821
85,PACA,84,2016,559014


## Exercice

Exercice: Utilisez une agrégation couplée avec un FILTER <br />
pour avoir, <b>pour chaque année</b>, la population de la region IDF <br />
en face du total pour calculer la part de la pop IDF sur le total

In [9]:
query = """
SELECT
    year,
    sum(population) FILTER (where region = 'IDF') AS population_IDF,
    sum(population),
    sum(population) FILTER (where region = 'IDF') / sum(population) AS IDF_perct
FROM dpts_dfs
GROUP BY year

"""
duckdb.sql(query).df()

Unnamed: 0,year,population_IDF,sum(population),IDF_perct
0,2021,3389621.0,9113121.0,0.37195
1,2018,3414182.0,9114649.0,0.374582
2,2020,3397710.0,9114753.0,0.37277
3,2017,3416144.0,9106742.0,0.375123
4,2019,3415097.0,9126901.0,0.374179
5,2016,3412250.0,9101533.0,0.374909


In [26]:
# %load solutions/5pop_filter.py
query = """
SELECT 
year,
SUM(population) FILTER(WHERE region = 'IDF') as 'IDF', 
SUM(population) AS total_pop,
IDF / total_pop,
FROM dpts_dfs
GROUP BY "year"
"""
duckdb.sql(query).df()


Unnamed: 0,year,IDF,total_pop,(IDF / total_pop)
0,2020,3397710.0,9114753.0,0.37277
1,2016,3412250.0,9101533.0,0.374909
2,2018,3414182.0,9114649.0,0.374582
3,2021,3389621.0,9113121.0,0.37195
4,2017,3416144.0,9106742.0,0.375123
5,2019,3415097.0,9126901.0,0.374179


Exercice: faites la même chose avec un CASE WHEN

In [27]:
query = """
SELECT 
    year,
    SUM(CASE WHEN region = 'IDF' THEN population END) AS IDF_population,
    SUM(population) AS total_population,
    SUM(CASE WHEN region = 'IDF' THEN population END) / SUM(population) AS IDF_ratio
FROM 
    dpts_dfs
GROUP BY 
    year;
"""
duckdb.sql(query).df()

Unnamed: 0,year,IDF_population,total_population,IDF_ratio
0,2016,3412250.0,9101533.0,0.374909
1,2020,3397710.0,9114753.0,0.37277
2,2021,3389621.0,9113121.0,0.37195
3,2018,3414182.0,9114649.0,0.374582
4,2017,3416144.0,9106742.0,0.375123
5,2019,3415097.0,9126901.0,0.374179


In [50]:
query = """
SELECT 
    year,
    region,
    SUM(population) AS region_pop,
    SUM(SUM(population)) OVER (PARTITION BY year) AS total_population,
    region_pop / total_population AS region_ratio
FROM 
    dpts_dfs
GROUP BY 
    year, region
ORDER BY
    year, region
"""
duckdb.sql(query).df()

Unnamed: 0,year,region,region_pop,total_population,region_ratio
0,2016,HDF,4074448.0,9101533.0,0.447666
1,2016,IDF,3412250.0,9101533.0,0.374909
2,2016,PACA,1614835.0,9101533.0,0.177425
3,2017,HDF,4072379.0,9106742.0,0.447183
4,2017,IDF,3416144.0,9106742.0,0.375123
5,2017,PACA,1618219.0,9106742.0,0.177695
6,2018,HDF,4072977.0,9114649.0,0.446861
7,2018,IDF,3414182.0,9114649.0,0.374582
8,2018,PACA,1627490.0,9114649.0,0.178558
9,2019,HDF,4073624.0,9126901.0,0.446332


In [13]:
# %load solutions/6pop_casewhen.py
query = """
SELECT 
year,
SUM(
    CASE 
    WHEN region = 'IDF' THEN population 
    END
) AS 'IDF', 
SUM(population) AS total_pop,
IDF / total_pop,
FROM dpts_dfs
GROUP BY "year"
"""
duckdb.sql(query).df()


Unnamed: 0,year,IDF,total_pop,(IDF / total_pop)
0,2016,3412250.0,9101533.0,0.374909
1,2018,3414182.0,9114649.0,0.374582
2,2020,3397710.0,9114753.0,0.37277
3,2017,3416144.0,9106742.0,0.375123
4,2019,3415097.0,9126901.0,0.374179
5,2021,3389621.0,9113121.0,0.37195


In [37]:
query = """
SELECT 
    year,
    region,
    SUM(population) AS population_region,
    sum(sum(population)) OVER (PARTITION BY year) AS total_populationyear,
    SUM(population) / SUM(SUM(population)) OVER (PARTITION BY year) AS ratio
FROM 
    dpts_dfs
GROUP BY year, region

"""
duckdb.sql(query).df()

Unnamed: 0,year,region,population_region,total_populationyear,ratio
0,2020,HDF,4069913.0,9114753.0,0.446519
1,2020,IDF,3397710.0,9114753.0,0.37277
2,2020,PACA,1647130.0,9114753.0,0.18071
3,2016,PACA,1614835.0,9101533.0,0.177425
4,2016,HDF,4074448.0,9101533.0,0.447666
5,2016,IDF,3412250.0,9101533.0,0.374909
6,2017,PACA,1618219.0,9106742.0,0.177695
7,2017,HDF,4072379.0,9106742.0,0.447183
8,2017,IDF,3416144.0,9106742.0,0.375123
9,2018,HDF,4072977.0,9114649.0,0.446861
