# Summary statistics

In [1]:
# Edit these two values to point the notebook at a different dataset; they are
# combined into the base name of the parquet files read further down.
hs, release = 'HS17', '202301'

filename = 'BACI_{}_V{}'.format(hs, release)

In [2]:
import pandas as pd
import duckdb

## Tabulate the number of observations at each aggregation level

In [6]:
# Row count of each output file, one result row per aggregation level.
# Each entry pairs the label shown in the `agg` column with the suffix that
# distinguishes that level's parquet file.
agg_levels = [
    ('6-digit', ''),
    ('4-digit', '-4digit'),
    ('2-digit', '-2digit'),
    ('country', '-country'),
]

count_selects = [
    f"SELECT '{label}' AS agg, COUNT(*) AS n "
    f"FROM read_parquet('final/{filename}{suffix}.parquet')"
    for label, suffix in agg_levels
]

# Wrap every SELECT in parentheses and stack them with UNION ALL.
duckdb.sql(" UNION ALL ".join(f"({select})" for select in count_selects)).df()

Unnamed: 0,agg,n
0,6-digit,53712314
1,4-digit,21723459
2,2-digit,4207039
3,country,144078


## Tabulate the number of unique values of each identifier, by year

### 6-digit level

In [3]:
# Per-year profile of the 6-digit file: row count, number of distinct i, j and
# k values, number of distinct (i, j) pairs, and the count and share of rows
# whose q matches '%NA%'.
duckdb.sql(
    f"""
    SELECT 
        t1.t, 
        t1.obs, 
        t1.unique_i, 
        t1.unique_j, 
        t2.unique_ij, 
        t1.unique_k, 
        COALESCE(t3.q_is_na, 0) AS q_is_na,
        COALESCE(t3.q_is_na, 0) / t1.obs AS sh_q_is_na

    FROM (
        SELECT t, 
            COUNT(*) as obs, 
            COUNT(DISTINCT i) as unique_i,
            COUNT(DISTINCT j) as unique_j,
            COUNT(DISTINCT k) as unique_k
        FROM read_parquet('final/{filename}.parquet')
        GROUP BY t
    ) AS t1

    JOIN (
        SELECT t, COUNT(*) as unique_ij
        FROM (
            SELECT DISTINCT t, i, j
            FROM read_parquet('final/{filename}.parquet')
        ) as subquery
        GROUP BY t
    ) AS t2
    ON t1.t = t2.t

    -- LEFT JOIN (not JOIN): a year with no matching rows has no group in t3,
    -- and an inner join would drop that year from the summary altogether.
    -- Such years are reported with q_is_na = 0 instead.
    LEFT JOIN (
        SELECT t, COUNT(*) as q_is_na
        FROM read_parquet('final/{filename}.parquet')
        WHERE q LIKE '%NA%'
        GROUP BY t
    ) AS t3
    ON t1.t = t3.t

    ORDER BY t1.t
    """
).df()

Unnamed: 0,t,obs,unique_i,unique_j,unique_ij,unique_k,q_is_na,sh_q_is_na
0,2017,9918490,226,226,26194,5381,209115,0.021083
1,2018,10755928,226,226,28901,5381,239195,0.022238
2,2019,11085524,226,226,29787,5381,257058,0.023189
3,2020,10818483,226,226,29575,5381,251051,0.023206
4,2021,11133889,226,226,29621,5381,288397,0.025903


### 4-digit level

In [5]:
# Per-year profile of the 4-digit file: row count, number of distinct i, j and
# k4 values, and number of distinct (i, j) pairs.
query_4digit = f"""
    WITH yearly AS (
        SELECT
            t,
            COUNT(*) AS obs,
            COUNT(DISTINCT i) AS unique_i,
            COUNT(DISTINCT j) AS unique_j,
            COUNT(DISTINCT k4) AS unique_k
        FROM read_parquet('final/{filename}-4digit.parquet')
        GROUP BY t
    ),
    pairs AS (
        SELECT DISTINCT t, i, j
        FROM read_parquet('final/{filename}-4digit.parquet')
    ),
    yearly_pairs AS (
        SELECT t, COUNT(*) AS unique_ij
        FROM pairs
        GROUP BY t
    )
    SELECT
        yearly.t,
        yearly.obs,
        yearly.unique_i,
        yearly.unique_j,
        yearly_pairs.unique_ij,
        yearly.unique_k
    FROM yearly
    JOIN yearly_pairs ON yearly.t = yearly_pairs.t
    ORDER BY yearly.t
"""

duckdb.sql(query_4digit).df()

Unnamed: 0,t,obs,unique_i,unique_j,unique_ij,unique_k
0,2017,4001975,226,226,26194,1222
1,2018,4346872,226,226,28901,1222
2,2019,4482545,226,226,29787,1222
3,2020,4390643,226,226,29575,1222
4,2021,4501424,226,226,29621,1222


### 2-digit level

In [8]:
# Per-year profile of the 2-digit file: row count, number of distinct i, j and
# k2 values, and number of distinct (i, j) pairs.
query_2digit = f"""
    WITH yearly AS (
        SELECT
            t,
            COUNT(*) AS obs,
            COUNT(DISTINCT i) AS unique_i,
            COUNT(DISTINCT j) AS unique_j,
            COUNT(DISTINCT k2) AS unique_k
        FROM read_parquet('final/{filename}-2digit.parquet')
        GROUP BY t
    ),
    pairs AS (
        SELECT DISTINCT t, i, j
        FROM read_parquet('final/{filename}-2digit.parquet')
    ),
    yearly_pairs AS (
        SELECT t, COUNT(*) AS unique_ij
        FROM pairs
        GROUP BY t
    )
    SELECT
        yearly.t,
        yearly.obs,
        yearly.unique_i,
        yearly.unique_j,
        yearly_pairs.unique_ij,
        yearly.unique_k
    FROM yearly
    JOIN yearly_pairs ON yearly.t = yearly_pairs.t
    ORDER BY yearly.t
"""

duckdb.sql(query_2digit).df()

Unnamed: 0,t,obs,unique_i,unique_j,unique_ij,unique_k
0,2017,768992,226,226,26194,96
1,2018,840586,226,226,28901,96
2,2019,869260,226,226,29787,96
3,2020,855331,226,226,29575,96
4,2021,872870,226,226,29621,96


### Country level

In [None]:
# Per-year profile of the country-level file: row count, number of distinct i
# and j values, and number of distinct (i, j) pairs.
# Unlike the product-level summaries there is no unique_k column: the t1
# subquery computes no product aggregate, so selecting t1.unique_k would fail
# when the query is bound.
duckdb.sql(
    f"""
    SELECT 
        t1.t, 
        t1.obs, 
        t1.unique_i, 
        t1.unique_j, 
        t2.unique_ij

    FROM (
        SELECT t, 
            COUNT(*) as obs, 
            COUNT(DISTINCT i) as unique_i,
            COUNT(DISTINCT j) as unique_j
        FROM read_parquet('final/{filename}-country.parquet')
        GROUP BY t
    ) AS t1

    JOIN (
        SELECT t, COUNT(*) as unique_ij
        FROM (
            SELECT DISTINCT t, i, j
            FROM read_parquet('final/{filename}-country.parquet')
        ) as subquery
        GROUP BY t
    ) AS t2
    ON t1.t = t2.t

    ORDER BY t1.t
    """
).df()