In [1]:
import pandas as pd
import numpy as np




In [2]:
# Large dataset — synthetic transactions
# The row count is named once: every column must have the same length, so
# repeating the literal per column invites a length-mismatch error on edit.
N_TRANSACTIONS = 10_000_000

np.random.seed(0)  # legacy global seed so the generated values are reproducible
transactions = pd.DataFrame({
    # Sequential ids 1..N_TRANSACTIONS (inclusive).
    'transaction_id': range(1, N_TRANSACTIONS + 1),
    # Draw order matters for reproducibility: country codes first, then amounts.
    'country_code': np.random.choice(['US', 'IN', 'CA', 'DE', 'FR', 'UK'], N_TRANSACTIONS),
    # Integer amounts in [10, 500) — randint's upper bound is exclusive.
    'amount': np.random.randint(10, 500, N_TRANSACTIONS),
})



In [4]:
# Small dataset — country metadata, one (code, name, currency) row per country
country_rows = [
    ('US', 'United States', 'USD'),
    ('IN', 'India', 'INR'),
    ('CA', 'Canada', 'CAD'),
    ('DE', 'Germany', 'EUR'),
    ('FR', 'France', 'EUR'),
    ('UK', 'United Kingdom', 'GBP'),
]
countries = pd.DataFrame(
    country_rows,
    columns=['country_code', 'country_name', 'currency'],
)


In [7]:

# Preview the first five generated transactions (id, country code, amount).
transactions.head()

Unnamed: 0,transaction_id,country_code,amount
0,1,FR,214
1,2,UK,368
2,3,US,281
3,4,DE,259
4,5,DE,168


In [6]:
# Show the full country lookup table (six rows, so no need for .head()).
countries

Unnamed: 0,country_code,country_name,currency
0,US,United States,USD
1,IN,India,INR
2,CA,Canada,CAD
3,DE,Germany,EUR
4,FR,France,EUR
5,UK,United Kingdom,GBP


In [8]:
# Row count per country code, sorted from most to least frequent.
transactions['country_code'].value_counts()

country_code
CA    1669277
DE    1668287
IN    1667009
US    1666907
FR    1665017
UK    1663503
Name: count, dtype: int64

In [10]:
# Show both inputs in one output. The expression is a tuple, so the frames
# render as plain-text reprs rather than as formatted tables.
transactions.head() , countries


(   transaction_id country_code  amount
 0               1           FR     214
 1               2           UK     368
 2               3           US     281
 3               4           DE     259
 4               5           DE     168,
   country_code    country_name currency
 0           US   United States      USD
 1           IN           India      INR
 2           CA          Canada      CAD
 3           DE         Germany      EUR
 4           FR          France      EUR
 5           UK  United Kingdom      GBP)

In [11]:
# Normal join (simulate shuffle)
# The broadcast step below adds these columns to `transactions` in place.
# Dropping them first (if present) keeps this cell idempotent: re-running it
# would otherwise produce country_name_x / country_name_y style duplicates.
enriched_cols = ['country_name', 'currency']
merged_normal = (
    transactions
    .drop(columns=enriched_cols, errors='ignore')
    .merge(countries, on='country_code', how='left')
)

# Simulated broadcast join (no shuffle)
# In Spark, this would look like: broadcast(countries)
# Small lookup table: country_code -> (country_name, currency).
broadcast_countries = dict(zip(countries['country_code'], zip(countries['country_name'], countries['currency'])))

# One flat dict per output column, derived from the broadcast table.
name_by_code = {code: name for code, (name, _) in broadcast_countries.items()}
currency_by_code = {code: currency for code, (_, currency) in broadcast_countries.items()}

# Passing the dict straight to Series.map avoids a Python-level lambda call
# per row, and an unknown code becomes NaN instead of raising KeyError —
# the same result the left join above gives for an unmatched key.
transactions['country_name'] = transactions['country_code'].map(name_by_code)
transactions['currency'] = transactions['country_code'].map(currency_by_code)


In [12]:
# Preview the result of the regular left merge on country_code.
merged_normal.head()

Unnamed: 0,transaction_id,country_code,amount,country_name,currency
0,1,FR,214,France,EUR
1,2,UK,368,United Kingdom,GBP
2,3,US,281,United States,USD
3,4,DE,259,Germany,EUR
4,5,DE,168,Germany,EUR


In [14]:
# Inspect the lookup dict: country_code -> (country_name, currency).
broadcast_countries

{'US': ('United States', 'USD'),
 'IN': ('India', 'INR'),
 'CA': ('Canada', 'CAD'),
 'DE': ('Germany', 'EUR'),
 'FR': ('France', 'EUR'),
 'UK': ('United Kingdom', 'GBP')}

In [15]:
# `transactions` now carries the country_name and currency columns that the
# broadcast-style mapping added in place; compare with merged_normal.head().
transactions.head()

Unnamed: 0,transaction_id,country_code,amount,country_name,currency
0,1,FR,214,France,EUR
1,2,UK,368,United Kingdom,GBP
2,3,US,281,United States,USD
3,4,DE,259,Germany,EUR
4,5,DE,168,Germany,EUR
