# Coding for Economists - Advanced Session 4

## 1. Setup Environment

In [1]:
%pip install line-profiler memory-profiler dask 'dask[distributed]'

In [2]:
# Install in Google Colab
# !pip install line-profiler memory-profiler

In [3]:
import numpy as np
import pandas as pd
# Turn on copy on write
pd.options.mode.copy_on_write = True

In [4]:
# Load profilers
%load_ext line_profiler
%load_ext memory_profiler

## 2. A Slow Application

### 2.1 Load Data

In [86]:
import csv

gdp = {}
path = 'gdp_panel.csv'
with open(path) as f:
    reader = csv.DictReader(f)
    for row in reader:
        country = row['code']
        value = float(row['value'])
        if country not in gdp:
            gdp[country] = []
        gdp[country].append(value)

regions = {}
path = 'country_region.csv'
with open(path) as f:
    reader = csv.DictReader(f)
    for row in reader:
        country = row['code']
        region = row['region']
        if country not in regions:
            regions[country] = region

print(gdp.keys())

dict_keys(['AFG', 'AGO', 'ALB', 'ARE', 'ARG', 'ARM', 'ATG', 'AUS', 'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BFA', 'BGD', 'BGR', 'BHR', 'BHS', 'BIH', 'BLR', 'BLZ', 'BOL', 'BRA', 'BRB', 'BRN', 'BTN', 'BWA', 'CAF', 'CAN', 'CHE', 'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG', 'COL', 'COM', 'CPV', 'CRI', 'CYP', 'CZE', 'DEU', 'DJI', 'DMA', 'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'ERI', 'ESP', 'EST', 'ETH', 'FIN', 'FJI', 'FRA', 'FSM', 'GAB', 'GBR', 'GEO', 'GHA', 'GIN', 'GMB', 'GNB', 'GNQ', 'GRC', 'GRD', 'GTM', 'GUY', 'HKG', 'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IND', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KIR', 'KNA', 'KOR', 'KWT', 'LAO', 'LBN', 'LBR', 'LBY', 'LCA', 'LKA', 'LSO', 'LTU', 'LUX', 'LVA', 'MAR', 'MDA', 'MDG', 'MDV', 'MEX', 'MHL', 'MKD', 'MLI', 'MLT', 'MMR', 'MNE', 'MNG', 'MOZ', 'MRT', 'MUS', 'MWI', 'MYS', 'NAM', 'NER', 'NGA', 'NIC', 'NLD', 'NOR', 'NPL', 'NZL', 'OMN', 'PAK', 'PAN', 'PER', 'PHL', 'PLW', 'PNG', 'POL', 'PRT', 'PRY', 'QAT', 'ROU', 'R

In [87]:
print(regions['IRL'])

Europe


### 2.2 Compute GDP Growth and Volatility

In [89]:
from statistics import pstdev

metrics = {}
window = 52
for country, series in gdp.items():
    growth = []
    vol = []
    for i in range(window, len(series)):
        prev = series[i-window]
        curr = series[i]
        growth.append((curr - prev) / prev)
        vol.append(pstdev(series[i-window:i]))
    metrics[country] = {'growth': growth, 'vol': vol}

print(metrics.keys())

dict_keys(['AFG', 'AGO', 'ALB', 'ARE', 'ARG', 'ARM', 'ATG', 'AUS', 'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BFA', 'BGD', 'BGR', 'BHR', 'BHS', 'BIH', 'BLR', 'BLZ', 'BOL', 'BRA', 'BRB', 'BRN', 'BTN', 'BWA', 'CAF', 'CAN', 'CHE', 'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG', 'COL', 'COM', 'CPV', 'CRI', 'CYP', 'CZE', 'DEU', 'DJI', 'DMA', 'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'ERI', 'ESP', 'EST', 'ETH', 'FIN', 'FJI', 'FRA', 'FSM', 'GAB', 'GBR', 'GEO', 'GHA', 'GIN', 'GMB', 'GNB', 'GNQ', 'GRC', 'GRD', 'GTM', 'GUY', 'HKG', 'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IND', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KIR', 'KNA', 'KOR', 'KWT', 'LAO', 'LBN', 'LBR', 'LBY', 'LCA', 'LKA', 'LSO', 'LTU', 'LUX', 'LVA', 'MAR', 'MDA', 'MDG', 'MDV', 'MEX', 'MHL', 'MKD', 'MLI', 'MLT', 'MMR', 'MNE', 'MNG', 'MOZ', 'MRT', 'MUS', 'MWI', 'MYS', 'NAM', 'NER', 'NGA', 'NIC', 'NLD', 'NOR', 'NPL', 'NZL', 'OMN', 'PAK', 'PAN', 'PER', 'PHL', 'PLW', 'PNG', 'POL', 'PRT', 'PRY', 'QAT', 'ROU', 'R

### 2.3 Aggregate Regions

In [90]:
# country → region
agg = {}  # plain dict

# Collect all growth & vol values by region
for country, m in metrics.items():
    region = regions[country]
    if region not in agg:
        agg[region] = {'growth': [], 'vol': []}
    agg[region]['growth'].extend(m['growth'])
    agg[region]['vol'].extend(m['vol'])

# Compute regional averages
result = {}
for region, d in agg.items():
    total_growth = sum(d['growth'])
    total_vol    = sum(d['vol'])
    count_growth = len(d['growth'])
    count_vol    = len(d['vol'])
    result[region] = {
        'avg_growth': total_growth / count_growth if count_growth else 0,
        'avg_vol':    total_vol    / count_vol    if count_vol    else 0
    }

print(result)

{'Asia': {'avg_growth': 0.04330520206247363, 'avg_vol': 2574.585938176005}, 'Africa': {'avg_growth': 0.03458519294974054, 'avg_vol': 39.16879991869174}, 'Europe': {'avg_growth': 0.01903380340329299, 'avg_vol': 33.2089496557581}, 'South America': {'avg_growth': 0.02698163863184202, 'avg_vol': 328.9790282587653}, 'North America': {'avg_growth': 0.025703795612439923, 'avg_vol': 8.29425195157262}, 'Oceania': {'avg_growth': 0.019067501900675687, 'avg_vol': 0.8709104509906413}}


### 2.4 Time and Memory Profiling

In [91]:
def run_analysis_v0():
    # Load Data
    gdp = {}
    path = 'gdp_panel.csv'
    with open(path) as f:
        reader = csv.DictReader(f)
        for row in reader:
            country = row['code']
            value = float(row['value'])
            if country not in gdp:
                gdp[country] = []
            gdp[country].append(value)
    
    regions = {}
    path = 'country_region.csv'
    with open(path) as f:
        reader = csv.DictReader(f)
        for row in reader:
            country = row['code']
            region = row['region']
            if country not in regions:
                regions[country] = region

    # Compute dgp growth rate and volitility
    metrics = {}
    window = 52
    for country, series in gdp.items():
        growth = []
        vol = []
        for i in range(window, len(series)):
            prev = series[i-window]
            curr = series[i]
            growth.append((curr - prev) / prev)
            vol.append(pstdev(series[i-window:i]))
        metrics[country] = {'growth': growth, 'vol': vol}

    # Aggregate Regions
    agg = {}  # plain dict

    for country, m in metrics.items():
        region = regions[country]
        if region not in agg:
            agg[region] = {'growth': [], 'vol': []}
        agg[region]['growth'].extend(m['growth'])
        agg[region]['vol'].extend(m['vol'])
    
    result = {}
    for region, d in agg.items():
        total_growth = sum(d['growth'])
        total_vol    = sum(d['vol'])
        count_growth = len(d['growth'])
        count_vol    = len(d['vol'])
        result[region] = {
            'avg_growth': total_growth / count_growth if count_growth else 0,
            'avg_vol':    total_vol    / count_vol    if count_vol    else 0
        }
    return result
    
%lprun -f run_analysis_v0 run_analysis_v0()

Timer unit: 1e-09 s

Total time: 21.4275 s
File: /var/folders/j4/72zz472s4sdd1r1ssyml8k5w0000gn/T/ipykernel_30612/3544963303.py
Function: run_analysis_v0 at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def run_analysis_v0():
     2                                               # Load Data
     3         1       1000.0   1000.0      0.0      gdp = {}
     4         1          0.0      0.0      0.0      path = 'gdp_panel.csv'
     5         2     509000.0 254500.0      0.0      with open(path) as f:
     6         1      16000.0  16000.0      0.0          reader = csv.DictReader(f)
     7    359997  429302000.0   1192.5      2.0          for row in reader:
     8    359996   21779000.0     60.5      0.1              country = row['code']
     9    359996   60667000.0    168.5      0.3              value = float(row['value'])
    10    359996   24662000.0     68.5      0.1              if country not in gdp:
    11

In [119]:
import sys
!"{sys.executable}" -m memory_profiler app_v0.py

Filename: app_v0.py

Line #    Mem usage    Increment  Occurrences   Line Contents
    10   54.812 MiB   54.812 MiB           1   @profile
    11                                         def run_analysis_v0():
    12                                             # Load Data
    13   54.812 MiB    0.000 MiB           1       gdp = {}
    14   54.812 MiB    0.000 MiB           1       path = 'gdp_panel.csv'
    15   68.703 MiB    0.000 MiB           2       with open(path) as f:
    16   54.812 MiB    0.000 MiB           1           reader = csv.DictReader(f)
    17   68.703 MiB    0.156 MiB      359997           for row in reader:
    18   68.703 MiB    0.000 MiB      359996               country = row['code']
    19   68.703 MiB    0.016 MiB      359996               value = float(row['value'])
    20   68.703 MiB   11.000 MiB      359996               if country not in gdp:
    21   68.672 MiB    0.000 MiB         189                   gdp[country] = []
    22   68.703 MiB    2.719 MiB  

## 3. Panda Dataframe

### 3.1 Load Data

In [93]:
import pandas as pd
gdp = pd.read_csv(
    'gdp_panel.csv',
    usecols=['code','year','value'], 
    parse_dates=['year'],
    dtype={'code':'category'}
).sort_values(['code','year'])

gdp.head()

Unnamed: 0,code,year,value
0,AFG,2002-01-06,235.731
1,AFG,2002-01-13,235.731
2,AFG,2002-01-20,235.731
3,AFG,2002-01-27,235.731
4,AFG,2002-02-03,235.731


In [94]:
regions = pd.read_csv(
    'country_region.csv',
    usecols=['code','region'], 
    dtype={'code':'category', 'region':'category'}
).sort_values(['code'])

regions.head()

Unnamed: 0,code,region
11,ABW,North America
0,AFG,Asia
6,AGO,Africa
7,AIA,North America
1,ALA,Europe


### 3.2 Compute GDP Growth and Volatility

In [97]:
window = 52
# pivot to wide form: rows=date, cols=country
panel = gdp.pivot(index='year', columns='code', values='value')

# rolling growth: pct_change over window periods
growth = panel.pct_change(periods=window, fill_method=None)

# rolling volatility: std over window periods
vol = panel.rolling(window=window).std()

# melt back to long form & merge region info
growth = growth.stack().rename('growth').reset_index()
vol    = vol.stack().rename('vol').reset_index()
metrics = pd.merge(growth, vol, on=['year','code'])
metrics = metrics.merge(regions, on='code')

metrics.shape

(350168, 5)

### 3.3 Aggregate Regions

In [98]:
result = (
    metrics
    .groupby(['region'], observed=False)
    .agg(avg_growth=('growth','mean'),
         avg_vol=('vol','mean'))
)

print(result)

               avg_growth      avg_vol
region                                
Africa           0.034585    39.550944
Asia             0.043305  2599.704445
Europe           0.019034    33.532537
North America    0.025704     8.375173
Oceania          0.019068     0.879407
South America    0.026982   332.188656


### 3.4 Time and Memory Profiling

In [99]:
def run_analysis_v1():
    # Load Data
    gdp = pd.read_csv(
        'gdp_panel.csv',
        usecols=['code','year','value'], # Only read necessary columns
        parse_dates=['year'],
        dtype={'code':'category'}
    ).sort_values(['code','year'])

    regions = pd.read_csv(
        'country_region.csv',
        usecols=['code','region'], # Only read necessary columns
        dtype={'code':'category', 'region':'category'}
    ).sort_values(['code'])

    window = 52
    # Compute GDP Growth and Volatility
    # pivot to wide form: rows=date, cols=country
    panel = gdp.pivot(index='year', columns='code', values='value')
    
    # rolling growth: pct_change over window periods
    growth = panel.pct_change(periods=window, fill_method=None)
    
    # rolling volatility: std over window periods
    vol = panel.rolling(window=window).std()
    
    # melt back to long form & merge region info
    growth = growth.stack().rename('growth').reset_index()
    vol    = vol.stack().rename('vol').reset_index()
    metrics = pd.merge(growth, vol, on=['year','code'])
    metrics = metrics.merge(regions, on='code')

    # Aggregate Regions
    result = (
        metrics
        .groupby(['region'], observed=False)
        .agg(avg_growth=('growth','mean'),
             avg_vol=('vol','mean'))
    )
    return result

%lprun -f run_analysis_v1 run_analysis_v1()

Timer unit: 1e-09 s

Total time: 0.526235 s
File: /var/folders/j4/72zz472s4sdd1r1ssyml8k5w0000gn/T/ipykernel_30612/3883623108.py
Function: run_analysis_v1 at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def run_analysis_v1():
     2                                               # Load Data
     3         3  448159000.0    1e+08     85.2      gdp = pd.read_csv(
     4         1          0.0      0.0      0.0          'gdp_panel.csv',
     5         1          0.0      0.0      0.0          usecols=['code','year','value'], # Only read necessary columns
     6         1          0.0      0.0      0.0          parse_dates=['year'],
     7         1       1000.0   1000.0      0.0          dtype={'code':'category'}
     8         1    3727000.0    4e+06      0.7      ).sort_values(['code','year'])
     9                                           
    10         3    2109000.0 703000.0      0.4      regions = pd.read_c

In [100]:
import sys
!"{sys.executable}" -m memory_profiler app_v1.py

Filename: app_v1.py

Line #    Mem usage    Increment  Occurrences   Line Contents
     9  121.172 MiB  121.172 MiB           1   @profile
    10                                         def run_analysis_v1():
    11                                             # Load Data
    12  192.031 MiB   59.484 MiB           3       gdp = pd.read_csv(
    13  121.172 MiB    0.000 MiB           1           'gdp_panel.csv',
    14  121.172 MiB    0.000 MiB           1           usecols=['code','year','value'], # Only read necessary columns
    15  121.172 MiB    0.000 MiB           1           parse_dates=['year'],
    16  121.172 MiB    0.000 MiB           1           dtype={'code':'category'}
    17  192.031 MiB   11.375 MiB           1       ).sort_values(['code','year'])
    18                                         
    19  192.109 MiB    0.062 MiB           3       regions = pd.read_csv(
    20  192.031 MiB    0.000 MiB           1           'country_region.csv',
    21  192.031 MiB    0.000 

## 4. Memory Bound -> Process by Chunk

### 4.1 Prepare Parquet Data

In [101]:
import dask.dataframe as dd

# Read the raw CSV in parallel
ddf = dd.read_csv(
    'gdp_panel.csv',
    usecols=['code','year','value'],
    parse_dates=['year'],
    dtype={'code':'str'}
)

# Write out as partitioned Parquet
#    Here we partition by 'country' so that each country’s data lives in its own sub-folder.
ddf.to_parquet(
    'gdp_panel_parquet/',
    engine='pyarrow',
    write_index=False,         # drop the old CSV index
    partition_on=['code'],  # creates subfolders country=AAA, country=BBB, …
    compression='snappy'       # fast compression
)

# Read the raw CSV in parallel
ddf_region = dd.read_csv(
    'country_region.csv',
    usecols=['code','region'],
    dtype={'code':'str'}
)

# Write out as partitioned Parquet
ddf_region.to_parquet(
    'country_region_parquet/',
    engine='pyarrow',
    write_index=False,         # drop the old CSV index
    partition_on=['code'],  # creates subfolders country=AAA, country=BBB, …
    compression='snappy'       # fast compression
)

### 4.2 Load Data

In [102]:
import dask.dataframe as dd
import pandas as pd

ddf = dd.read_parquet(
    'gdp_panel_parquet/',
    columns=['code','year','value'],
    engine='pyarrow'
)

regions = dd.read_parquet('country_region_parquet/')

### 4.3 Compute GDP Growth and Volatility

In [103]:
def summarize_country(df, window=52):
    # If this slice is empty, return an empty DataFrame
    if df.empty:
        return pd.DataFrame(columns=['code','mean_growth','mean_vol'])

    # Otherwise do the rolling stats
    df = df.sort_values('year')
    growth = df['value'].pct_change(periods=window, fill_method=None).mean()
    vol    = df['value'].rolling(window=window).std().mean()
    return pd.DataFrame({
        'code':        [df['code'].iat[0]],
        'mean_growth': [growth],
        'mean_vol':    [vol]
    })

# Build a proper meta so Dask knows what comes back
meta = pd.DataFrame({
    'code':        pd.Series(dtype='object'),
    'mean_growth': pd.Series(dtype='float64'),
    'mean_vol':    pd.Series(dtype='float64'),
})

country_summaries = ddf.groupby('code', observed=True).apply(
    summarize_country,
    meta=meta
)

country_summaries = country_summaries.reset_index(drop=True)

### 4.4 Aggregate Regions

In [104]:
country_summaries = country_summaries.astype({'code': 'string'})
regions = regions.astype({'code': 'string'})

country_summaries = country_summaries.merge(regions, on='code')

regional_stats = (
    country_summaries
    .groupby('region', observed=True)[['mean_growth', 'mean_vol']]
    .mean()
    .compute()
)

regional_stats

Unnamed: 0_level_0,mean_growth,mean_vol
region,Unnamed: 1_level_1,Unnamed: 2_level_1
Africa,0.034808,37.397032
Asia,0.043784,2325.302294
North America,0.025704,8.371062
Europe,0.019083,38.417443
South America,0.026982,332.025578
Oceania,0.017529,0.754321


### 4.5 Time and Memory Profiling

In [105]:
def run_analysis_v2():
    # Load data
    ddf = dd.read_parquet(
        'gdp_panel_parquet/',
        columns=['code','year','value'],
        engine='pyarrow'
    )
    
    regions = dd.read_parquet('country_region_parquet/')

    # Compute GDP Growth and Volatility
    meta = pd.DataFrame({
        'code':        pd.Series(dtype='object'),
        'mean_growth': pd.Series(dtype='float64'),
        'mean_vol':    pd.Series(dtype='float64'),
    })
    
    country_summaries = ddf.groupby('code', observed=True).apply(
        summarize_country,
        meta=meta
    )
    
    country_summaries = country_summaries.reset_index(drop=True)

    # Aggregate regions
    country_summaries = country_summaries.astype({'code': 'string'})
    regions = regions.astype({'code': 'string'})
    
    country_summaries = country_summaries.merge(regions, on='code')
    
    result = (
        country_summaries
        .groupby('region', observed=True)[['mean_growth', 'mean_vol']]
        .mean()
        .compute()
    )
    return result

%lprun -f run_analysis_v2 run_analysis_v2()

Timer unit: 1e-09 s

Total time: 2.49499 s
File: /var/folders/j4/72zz472s4sdd1r1ssyml8k5w0000gn/T/ipykernel_30612/1602383827.py
Function: run_analysis_v2 at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def run_analysis_v2():
     2                                               # Load data
     3         2   54152000.0    3e+07      2.2      ddf = dd.read_parquet(
     4         1          0.0      0.0      0.0          'gdp_panel_parquet/',
     5         1       1000.0   1000.0      0.0          columns=['code','year','value'],
     6         1          0.0      0.0      0.0          engine='pyarrow'
     7                                               )
     8                                           
     9         1   40865000.0    4e+07      1.6      regions = dd.read_parquet('country_region_parquet/')
    10                                           
    11                                               # 

In [106]:
import sys
!"{sys.executable}" -m memory_profiler app_v2.py

Filename: app_v2.py

Line #    Mem usage    Increment  Occurrences   Line Contents
    25  158.516 MiB  158.516 MiB           1   @profile
    26                                         def run_analysis_v2():
    27                                             # Load data
    28  164.500 MiB    5.984 MiB           2       ddf = dd.read_parquet(
    29  158.516 MiB    0.000 MiB           1           'gdp_panel_parquet/',
    30  158.516 MiB    0.000 MiB           1           columns=['code','year','value'],
    31  158.516 MiB    0.000 MiB           1           engine='pyarrow'
    32                                             )
    33                                             
    34  165.453 MiB    0.953 MiB           1       regions = dd.read_parquet('country_region_parquet/')
    35                                         
    36                                             # Compute GDP Growth and Volatility
    37  165.453 MiB    0.000 MiB           2       meta = pd.DataFrame({


## 5. CPU Bound -> Parallel Computing

In [107]:
import os
import glob
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from dask.distributed import Client

### 5.1 Prepare Parquet Data

In [108]:
# Read the raw CSV in parallel
ddf = dd.read_csv(
    'gdp_panel.csv',
    usecols=['code','year','value'],
    parse_dates=['year'],
    dtype={'code':'str'}
)

# Write out as partitioned Parquet
#    Here we partition by 'country' so that each country’s data lives in its own sub-folder.
ddf.to_parquet(
    'gdp_panel_parquet/',
    engine='pyarrow',
    write_index=False,         # drop the old CSV index
    partition_on=['code'],  # creates subfolders country=AAA, country=BBB, …
    compression='snappy'       # fast compression
)

### 5.2 Functions to Detect and Process Single Files

In [109]:
def discover_country_tasks(base_dir):
    tasks = []
    for entry in os.listdir(base_dir):
        full = os.path.join(base_dir, entry)
        if not (entry.startswith("code=") and os.path.isdir(full)):
            continue

        code = entry.split("=", 1)[1]
        pattern = os.path.join(full, "*.parquet")
        files = sorted(glob.glob(pattern))
        if files:
            tasks.append((code, files))
    return tasks

In [110]:
def process_country(code, parquet_paths, window = 52):
    # parquet_paths: list of all .parquet files for a single country-partition
    # read them all (value only), concatenate into one 1-D numpy array
    arrays = []
    for path in parquet_paths:
        tbl = pq.read_table(path, columns=['value'])
        arrays.append(tbl.column('value').to_numpy())
    series = np.concatenate(arrays)
    # now do the window‐period rolling growth & vol
    growth = (series[window:] - series[:-window]) / series[:-window]
    windows = np.lib.stride_tricks.sliding_window_view(series, window)
    vol    = np.std(windows, axis=1)
    return code, (growth.mean(), vol.mean())

### 5.3 Process Files in Parallel

In [111]:
# Discover tasks
tasks = discover_country_tasks('gdp_panel_parquet/')
tasks[:5]

[('BRN', ['gdp_panel_parquet/code=BRN/part.0.parquet']),
 ('AGO', ['gdp_panel_parquet/code=AGO/part.0.parquet']),
 ('GNB', ['gdp_panel_parquet/code=GNB/part.0.parquet']),
 ('OMN', ['gdp_panel_parquet/code=OMN/part.0.parquet']),
 ('MNE', ['gdp_panel_parquet/code=MNE/part.0.parquet'])]

In [112]:
# Launch Dask
client = Client() 
print(client)

<Client: 'tcp://127.0.0.1:52985' processes=5 threads=10, memory=16.00 GiB>


In [113]:
# Submit tasks
futures = client.map(lambda args: process_country(*args), tasks)
results = client.gather(futures)

In [114]:
# Close Dask
client.close() 

In [115]:
# Build DataFrame & merge
df = pd.DataFrame(
    [(code, g, v) for code, (g, v) in results],
    columns=['code','avg_growth','avg_vol']
)
regions = pd.read_csv('country_region.csv', dtype={"code": str})
df = df.merge(regions, on="code", how="left")

### 5.4 Aggregate Region

In [116]:
# Aggregate by region
result = df.groupby("region")[["avg_growth", "avg_vol"]].mean()
result

Unnamed: 0_level_0,avg_growth,avg_vol
region,Unnamed: 1_level_1,Unnamed: 2_level_1
Africa,0.034808,37.0357
Asia,0.043784,2302.835078
Europe,0.019083,38.046251
North America,0.025704,8.29018
Oceania,0.017529,0.747032
South America,0.026982,328.817527


### 5.5 Time and Memory Profiling

In [117]:
def run_analysis_v3():
    # Discover tasks
    tasks = discover_country_tasks('gdp_panel_parquet/')

    # Launch Dask
    client = Client() 

    # Submit Tasks
    futures = client.map(lambda args: process_country(*args), tasks)
    results = client.gather(futures)

    # Close Dask
    client.close() 

    # Build DataFrame & merge
    df = pd.DataFrame(
        [(code, g, v) for code, (g, v) in results],
        columns=['code','avg_growth','avg_vol']
    )
    regions = pd.read_csv('country_region.csv', dtype={"code": str})
    df = df.merge(regions, on="code", how="left")

    # Aggregate by region
    result = df.groupby("region")[["avg_growth", "avg_vol"]].mean()

    return result

%lprun -f run_analysis_v3 run_analysis_v3()

Timer unit: 1e-09 s

Total time: 1.02506 s
File: /var/folders/j4/72zz472s4sdd1r1ssyml8k5w0000gn/T/ipykernel_30612/188904422.py
Function: run_analysis_v3 at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def run_analysis_v3():
     2                                               # Discover tasks
     3         1   38320000.0    4e+07      3.7      tasks = discover_country_tasks('gdp_panel_parquet/')
     4                                           
     5                                               # Launch Dask
     6         1  454413000.0    5e+08     44.3      client = Client() 
     7                                           
     8                                               # Submit Tasks
     9         1    7538000.0    8e+06      0.7      futures = client.map(lambda args: process_country(*args), tasks)
    10         1  398899000.0    4e+08     38.9      results = client.gather(futures)
    11        

In [118]:
import sys
!"{sys.executable}" -m memory_profiler app_v3.py

Filename: app_v3.py

Line #    Mem usage    Increment  Occurrences   Line Contents
    42  139.344 MiB  139.344 MiB           1   @profile
    43                                         def run_analysis_v3():
    44                                             # Discover tasks
    45  139.406 MiB    0.062 MiB           1       tasks = discover_country_tasks('gdp_panel_parquet/')
    46                                         
    47                                             # Launch Dask
    48  144.125 MiB    4.719 MiB           1       client = Client() 
    49                                         
    50                                             # Submit Tasks
    51  144.312 MiB    0.188 MiB           1       futures = client.map(lambda args: process_country(*args), tasks)
    52  146.766 MiB    2.453 MiB           1       results = client.gather(futures)
    53                                         
    54                                             # Close Dask
    55  14