In [1]:
import pandas as pd
import json
import re
from pathlib import Path
import unidecode

In [63]:
# Load data
events_df = pd.read_csv('../eda/data/events.csv')
venues_df = pd.read_csv('../eda/data/venues.csv')
medals_df = pd.read_csv('../eda/data/medals.csv')
medallists_df = pd.read_csv('../eda/data/medallists.csv')
athletes_df = pd.read_csv('../eda/data/athletes.csv')

# Display some sample data
print(f"Events data shape: {events_df.shape}")
print(f"Venues data shape: {venues_df.shape}")
print(f"Medals data shape: {medals_df.shape}")
print(f"Medallists data shape: {medallists_df.shape}")
print(f"Athletes data shape: {athletes_df.shape}")

events_df.head(2)

Events data shape: (329, 5)
Venues data shape: (35, 6)
Medals data shape: (1044, 13)
Medallists data shape: (2315, 21)
Athletes data shape: (11113, 36)


Unnamed: 0,event,tag,sport,sport_code,sport_url
0,Men's Individual,archery,Archery,ARC,https://olympics.com/en/paris-2024/sports/archery
1,Women's Individual,archery,Archery,ARC,https://olympics.com/en/paris-2024/sports/archery


In [64]:
# Display columns
print("Events columns:")
print(events_df.columns.tolist())
print("\nVenues columns:")
print(venues_df.columns.tolist())
print("\nMedals columns:")
print(medals_df.columns.tolist())
print("\nMedallists columns:")
print(medallists_df.columns.tolist())

Events columns:
['event', 'tag', 'sport', 'sport_code', 'sport_url']

Venues columns:
['venue', 'sports', 'date_start', 'date_end', 'tag', 'url']

Medals columns:
['medal_type', 'medal_code', 'medal_date', 'name', 'gender', 'discipline', 'event', 'event_type', 'url_event', 'code', 'country_code', 'country', 'country_long']

Medallists columns:
['medal_date', 'medal_type', 'medal_code', 'name', 'gender', 'country_code', 'country', 'country_long', 'nationality_code', 'nationality', 'nationality_long', 'team', 'team_gender', 'discipline', 'event', 'event_type', 'url_event', 'birth_date', 'code_athlete', 'code_team', 'is_medallist']


In [65]:
# function to generate a slug from a string
def generate_slug(text):
    if pd.isna(text) or text == '':
        return ''
    
    slug = text.lower().replace(' ', '-')
    slug = unidecode.unidecode(slug)
    slug = re.sub(r'[^a-z0-9-]', '', slug)
    return slug

# function to convert 3-letter country code to 2-letter
def convert_country_code(code_3):
    # Map of Alpha-3 country codes to Alpha-2 codes
    country_codes = {
        'AFG': 'AF', 'ALA': 'AX', 'ALB': 'AL', 'DZA': 'DZ', 'ASM': 'AS', 'AND': 'AD', 'AGO': 'AO', 'AIA': 'AI',
        'ATA': 'AQ', 'ATG': 'AG', 'ARG': 'AR', 'ARM': 'AM', 'ABW': 'AW', 'AUS': 'AU', 'AUT': 'AT', 'AZE': 'AZ',
        'BHS': 'BS', 'BHR': 'BH', 'BGD': 'BD', 'BRB': 'BB', 'BLR': 'BY', 'BEL': 'BE', 'BLZ': 'BZ', 'BEN': 'BJ',
        'BMU': 'BM', 'BTN': 'BT', 'BOL': 'BO', 'BES': 'BQ', 'BIH': 'BA', 'BWA': 'BW', 'BVT': 'BV', 'BRA': 'BR',
        'IOT': 'IO', 'BRN': 'BN', 'BGR': 'BG', 'BFA': 'BF', 'MMR': 'MM', 'BDI': 'BI', 'KHM': 'KH', 'CMR': 'CM',
        'CAN': 'CA', 'CPV': 'CV', 'CYM': 'KY', 'CAF': 'CF', 'TCD': 'TD', 'CHL': 'CL', 'CHN': 'CN', 'CXR': 'CX',
        'CCK': 'CC', 'COL': 'CO', 'COM': 'KM', 'COD': 'CD', 'COG': 'CG', 'COK': 'CK', 'CRI': 'CR', 'CIV': 'CI',
        'HRV': 'HR', 'CUB': 'CU', 'CUW': 'CW', 'CYP': 'CY', 'CZE': 'CZ', 'DNK': 'DK', 'DJI': 'DJ', 'DMA': 'DM',
        'DOM': 'DO', 'ECU': 'EC', 'EGY': 'EG', 'SLV': 'SV', 'GNQ': 'GQ', 'ERI': 'ER', 'EST': 'EE', 'ETH': 'ET',
        'FLK': 'FK', 'FRO': 'FO', 'FJI': 'FJ', 'FIN': 'FI', 'FRA': 'FR', 'GUF': 'GF', 'PYF': 'PF', 'ATF': 'TF',
        'GAB': 'GA', 'GMB': 'GM', 'GEO': 'GE', 'DEU': 'DE', 'GHA': 'GH', 'GIB': 'GI', 'GRC': 'GR', 'GRL': 'GL',
        'GRD': 'GD', 'GLP': 'GP', 'GUM': 'GU', 'GTM': 'GT', 'GGY': 'GG', 'GNB': 'GW', 'GIN': 'GN', 'GUY': 'GY',
        'HTI': 'HT', 'HMD': 'HM', 'VAT': 'VA', 'HND': 'HN', 'HKG': 'HK', 'HUN': 'HU', 'ISL': 'IS', 'IND': 'IN',
        'IDN': 'ID', 'IRN': 'IR', 'IRQ': 'IQ', 'IRL': 'IE', 'IMN': 'IM', 'ISR': 'IL', 'ITA': 'IT', 'JAM': 'JM',
        'JPN': 'JP', 'JEY': 'JE', 'JOR': 'JO', 'KAZ': 'KZ', 'KEN': 'KE', 'KIR': 'KI', 'PRK': 'KP', 'KOR': 'KR',
        'XKX': 'XK', 'KWT': 'KW', 'KGZ': 'KG', 'LAO': 'LA', 'LVA': 'LV', 'LBN': 'LB', 'LSO': 'LS', 'LBR': 'LR',
        'LBY': 'LY', 'LIE': 'LI', 'LTU': 'LT', 'LUX': 'LU', 'MAC': 'MO', 'MKD': 'MK', 'MDG': 'MG', 'MWI': 'MW',
        'MYS': 'MY', 'MDV': 'MV', 'MLI': 'ML', 'MLT': 'MT', 'MHL': 'MH', 'MTQ': 'MQ', 'MRT': 'MR', 'MUS': 'MU',
        'MYT': 'YT', 'MEX': 'MX', 'FSM': 'FM', 'MDA': 'MD', 'MCO': 'MC', 'MNG': 'MN', 'MNE': 'ME', 'MSR': 'MS',
        'MAR': 'MA', 'MOZ': 'MZ', 'NAM': 'NA', 'NRU': 'NR', 'NPL': 'NP', 'ANT': 'AN', 'NLD': 'NL', 'NCL': 'NC',
        'NZL': 'NZ', 'NIC': 'NI', 'NER': 'NE', 'NGA': 'NG', 'NIU': 'NU', 'NFK': 'NF', 'MNP': 'MP', 'NOR': 'NO',
        'OMN': 'OM', 'PAK': 'PK', 'PLW': 'PW', 'PSE': 'PS', 'PAN': 'PA', 'PNG': 'PG', 'PRY': 'PY', 'PER': 'PE',
        'PHL': 'PH', 'PCN': 'PN', 'POL': 'PL', 'PRT': 'PT', 'PRI': 'PR', 'QAT': 'QA', 'REU': 'RE', 'ROU': 'RO',
        'RUS': 'RU', 'RWA': 'RW', 'BLM': 'BL', 'SHN': 'SH', 'KNA': 'KN', 'LCA': 'LC', 'MAF': 'MF', 'SPM': 'PM',
        'VCT': 'VC', 'WSM': 'WS', 'SMR': 'SM', 'STP': 'ST', 'SAU': 'SA', 'SEN': 'SN', 'SRB': 'RS', 'SYC': 'SC',
        'SLE': 'SL', 'SGP': 'SG', 'SXM': 'SX', 'SVK': 'SK', 'SVN': 'SI', 'SLB': 'SB', 'SOM': 'SO', 'ZAF': 'ZA',
        'SGS': 'GS', 'SSD': 'SS', 'ESP': 'ES', 'LKA': 'LK', 'SDN': 'SD', 'SUR': 'SR', 'SJM': 'SJ', 'SWZ': 'SZ',
        'SWE': 'SE', 'CHE': 'CH', 'SYR': 'SY', 'TWN': 'TW', 'TJK': 'TJ', 'TZA': 'TZ', 'THA': 'TH', 'TLS': 'TL',
        'TGO': 'TG', 'TKL': 'TK', 'TON': 'TO', 'TTO': 'TT', 'TUN': 'TN', 'TUR': 'TR', 'TKM': 'TM', 'TCA': 'TC',
        'TUV': 'TV', 'UGA': 'UG', 'UKR': 'UA', 'ARE': 'AE', 'GBR': 'GB', 'UMI': 'UM', 'USA': 'US', 'URY': 'UY',
        'UZB': 'UZ', 'VUT': 'VU', 'VEN': 'VE', 'VNM': 'VN', 'VGB': 'VG', 'VIR': 'VI', 'WLF': 'WF', 'ESH': 'EH',
        'YEM': 'YE', 'ZMB': 'ZM', 'ZWE': 'ZW',
        # Special cases 
        'GER': 'DE', 'NED': 'NL', 'SUI': 'CH', 'ROM': 'RO', 'RSA': 'ZA'
    }
    
    if pd.isna(code_3) or code_3 == '':
        return ''
    
    return country_codes.get(code_3, code_3[:2])  # Default to first 2 letters if not in mapping

# function to generate a first-name-last-name slug
def generate_name_slug(name):
    if pd.isna(name) or name == '':
        return ''
    
    # Split the name into parts
    parts = name.split()
    
    # For names with at least two parts
    if len(parts) >= 2:
        # If second part is all caps, assume it's "Firstname LASTNAME"
        if parts[1].isupper():
            name = f"{parts[0]} {parts[1]}"
        # Otherwise, assume "LASTNAME Firstname" format and reorder
        else:
            name = f"{parts[1]} {parts[0]}"
    
    # Convert to lowercase and replace spaces with hyphens
    slug = name.lower().replace(' ', '-')
    # Remove accents
    slug = unidecode.unidecode(slug)
    # Remove any special characters
    slug = re.sub(r'[^a-z0-9-]', '', slug)
    return slug

In [66]:
# Examine the events data to understand the structure
events_df.head(10)

Unnamed: 0,event,tag,sport,sport_code,sport_url
0,Men's Individual,archery,Archery,ARC,https://olympics.com/en/paris-2024/sports/archery
1,Women's Individual,archery,Archery,ARC,https://olympics.com/en/paris-2024/sports/archery
2,Men's Team,archery,Archery,ARC,https://olympics.com/en/paris-2024/sports/archery
3,Women's Team,archery,Archery,ARC,https://olympics.com/en/paris-2024/sports/archery
4,Mixed Team,archery,Archery,ARC,https://olympics.com/en/paris-2024/sports/archery
5,Men's Team,artistic-gymnastics,Artistic Gymnastics,GAR,https://olympics.com/en/paris-2024/sports/arti...
6,Men's All-Around,artistic-gymnastics,Artistic Gymnastics,GAR,https://olympics.com/en/paris-2024/sports/arti...
7,Men's Floor Exercise,artistic-gymnastics,Artistic Gymnastics,GAR,https://olympics.com/en/paris-2024/sports/arti...
8,Men's Pommel Horse,artistic-gymnastics,Artistic Gymnastics,GAR,https://olympics.com/en/paris-2024/sports/arti...
9,Men's Rings,artistic-gymnastics,Artistic Gymnastics,GAR,https://olympics.com/en/paris-2024/sports/arti...


In [67]:
# Get unique sports
sports_df = events_df[['sport', 'sport_code', 'tag']].drop_duplicates()
print(f"Number of unique sports: {len(sports_df)}")
sports_df.head(10)

Number of unique sports: 45


Unnamed: 0,sport,sport_code,tag
0,Archery,ARC,archery
5,Artistic Gymnastics,GAR,artistic-gymnastics
19,Artistic Swimming,SWA,artistic-swimming
21,Athletics,ATH,athletics
69,Badminton,BDM,badminton
74,Basketball,BKB,basketball
76,3x3 Basketball,BK3,3x3-basketball
78,Beach Volleyball,VBV,beach-volleyball
80,Boxing,BOX,boxing
93,Breaking,BKG,breaking


In [68]:
# Check venues data
venues_df.head(10)

Unnamed: 0,venue,sports,date_start,date_end,tag,url
0,Aquatics Centre,"['Artistic Swimming', 'Diving', 'Water Polo']",2024-07-27T09:00:00Z,2024-08-10T20:00:00Z,aquatics-centre,https://olympics.com/en/paris-2024/venues/aqua...
1,Bercy Arena,"['Artistic Gymnastics', 'Basketball', 'Trampol...",2024-07-27T09:00:00Z,2024-08-11T16:00:00Z,bercy-arena,https://olympics.com/en/paris-2024/venues/berc...
2,Bordeaux Stadium,['Football'],2024-07-25T17:00:00Z,2024-08-02T21:59:00Z,bordeaux-stadium,https://olympics.com/en/paris-2024/venues/bord...
3,Champ de Mars Arena,"['Judo', 'Wrestling']",2024-07-27T08:00:00Z,2024-08-11T12:00:00Z,champ-de-mars-arena,https://olympics.com/en/paris-2024/venues/cham...
4,Château de Versailles,"['Equestrian', 'Modern Pentathlon']",2024-07-27T07:30:00Z,2024-08-11T11:30:00Z,chateau-de-versailles,https://olympics.com/en/paris-2024/venues/chat...
5,Chateauroux Shooting Centre,['Shooting'],2024-07-27T07:00:00Z,2024-08-05T14:35:00Z,chateauroux-shooting-centre,https://olympics.com/en/paris-2024/venues/chat...
6,Eiffel Tower Stadium,['Beach Volleyball'],2024-07-27T12:00:00Z,2024-08-10T21:59:00Z,eiffel-tower-stadium,https://olympics.com/en/paris-2024/venues/eiff...
7,Elancourt Hill,['Cycling Mountain Bike'],2024-07-28T12:00:00Z,2024-07-29T14:30:00Z,elancourt-hill,https://olympics.com/en/paris-2024/venues/elan...
8,Geoffroy-Guichard Stadium,['Football'],2024-07-25T13:00:00Z,2024-07-31T19:00:00Z,geoffroy-guichard-stadium,https://olympics.com/en/paris-2024/venues/geof...
9,Grand Palais,"['Fencing', 'Taekwondo']",2024-07-27T08:00:00Z,2024-08-10T21:00:00Z,grand-palais,https://olympics.com/en/paris-2024/venues/gran...


In [69]:
# Get sports venues mapping - fixing the issue with brackets and quotes in keys
venues_sports = {}

# Process venues data and clean up sport names
for _, venue in venues_df.iterrows():
    venue_slug = generate_slug(venue['venue'])
    venue_name = venue['venue']
    
    if pd.notna(venue['sports']):
        # The sports column might contain weird formatting with brackets and quotes
        # Let's clean it up
        sports_text = venue['sports']
        
        # Remove square brackets and quotes
        sports_text = sports_text.replace("[", "").replace("]", "").replace("'", "")
        
        # Split by commas
        sports_list = [sport.strip() for sport in sports_text.split(',')]
        
        for sport in sports_list:
            if sport not in venues_sports:
                venues_sports[sport] = []
            
            # Check if this venue is already in the list for this sport
            venue_exists = False
            for existing_venue in venues_sports[sport]:
                if existing_venue['slug'] == venue_slug:
                    venue_exists = True
                    break
            
            if not venue_exists:
                venues_sports[sport].append({
                    'slug': venue_slug,
                    'name': venue_name,
                    'description': f"Venue for {sport}"
                })

# Check some venues by sport
sports_with_venues = sum(1 for sport, venues in venues_sports.items() if venues)
print(f"Found venues for {sports_with_venues} sports")
for sport in list(venues_sports.keys())[:5]:
    print(f"{sport}: {venues_sports[sport]}")

Found venues for 45 sports
Artistic Swimming: [{'slug': 'aquatics-centre', 'name': 'Aquatics Centre', 'description': 'Venue for Artistic Swimming'}]
Diving: [{'slug': 'aquatics-centre', 'name': 'Aquatics Centre', 'description': 'Venue for Diving'}]
Water Polo: [{'slug': 'aquatics-centre', 'name': 'Aquatics Centre', 'description': 'Venue for Water Polo'}, {'slug': 'paris-la-defense-arena', 'name': 'Paris La Defense Arena', 'description': 'Venue for Water Polo'}]
Artistic Gymnastics: [{'slug': 'bercy-arena', 'name': 'Bercy Arena', 'description': 'Venue for Artistic Gymnastics'}]
Basketball: [{'slug': 'bercy-arena', 'name': 'Bercy Arena', 'description': 'Venue for Basketball'}, {'slug': 'pierre-mauroy-stadium', 'name': 'Pierre Mauroy Stadium', 'description': 'Venue for Basketball'}]


In [70]:
# Get list of events by sport
sport_events = {}
for _, event in events_df.iterrows():
    sport = event['sport']
    if sport not in sport_events:
        sport_events[sport] = []
    
    if 'event' in event and pd.notna(event['event']):
        sport_events[sport].append(event['event'])

# Check some events by sport
for sport in list(sport_events.keys())[:3]:
    print(f"{sport}: {sport_events[sport][:5]}...")

Archery: ["Men's Individual", "Women's Individual", "Men's Team", "Women's Team", 'Mixed Team']...
Artistic Gymnastics: ["Men's Team", "Men's All-Around", "Men's Floor Exercise", "Men's Pommel Horse", "Men's Rings"]...
Artistic Swimming: ['Duet', 'Team']...


In [71]:
# Check medallists data to find top athletes and countries by sport
medallists_df.head(5)

Unnamed: 0,medal_date,medal_type,medal_code,name,gender,country_code,country,country_long,nationality_code,nationality,...,team,team_gender,discipline,event,event_type,url_event,birth_date,code_athlete,code_team,is_medallist
0,2024-07-27,Gold Medal,1.0,EVENEPOEL Remco,Male,BEL,Belgium,Belgium,BEL,Belgium,...,,,Cycling Road,Men's Individual Time Trial,ATH,/en/paris-2024/results/cycling-road/men-s-indi...,2000-01-25,1903136,,True
1,2024-07-27,Silver Medal,2.0,GANNA Filippo,Male,ITA,Italy,Italy,ITA,Italy,...,,,Cycling Road,Men's Individual Time Trial,ATH,/en/paris-2024/results/cycling-road/men-s-indi...,1996-07-25,1923520,,True
2,2024-07-27,Bronze Medal,3.0,van AERT Wout,Male,BEL,Belgium,Belgium,BEL,Belgium,...,,,Cycling Road,Men's Individual Time Trial,ATH,/en/paris-2024/results/cycling-road/men-s-indi...,1994-09-15,1903147,,True
3,2024-07-27,Gold Medal,1.0,BROWN Grace,Female,AUS,Australia,Australia,AUS,Australia,...,,,Cycling Road,Women's Individual Time Trial,ATH,/en/paris-2024/results/cycling-road/women-s-in...,1992-07-07,1940173,,True
4,2024-07-27,Silver Medal,2.0,HENDERSON Anna,Female,GBR,Great Britain,Great Britain,GBR,Great Britain,...,,,Cycling Road,Women's Individual Time Trial,ATH,/en/paris-2024/results/cycling-road/women-s-in...,1998-11-14,1912525,,True


In [72]:
# Create a function to get top athletes and countries by sport with medal counts
def get_top_athletes_by_sport(sport_name):
    # Filter medallists by the sport (discipline)
    sport_medallists = medallists_df[medallists_df['discipline'] == sport_name]
    
    # Count medals by type for each athlete
    athletes_medal_data = {}
    for _, medal in sport_medallists.iterrows():
        athlete_name = medal['name']
        athlete_code = medal['code_athlete']
        medal_type = medal['medal_type']
        
        if athlete_code not in athletes_medal_data:
            athletes_medal_data[athlete_code] = {
                'name': athlete_name,
                'code': athlete_code,
                'Gold Medal': 0,
                'Silver Medal': 0,
                'Bronze Medal': 0,
                'total': 0
            }
        
        if pd.notna(medal_type):
            athletes_medal_data[athlete_code][medal_type] += 1
            athletes_medal_data[athlete_code]['total'] += 1
    
    # Convert to list and sort by Olympic ranking (gold first, then silver, then bronze)
    athletes_list = list(athletes_medal_data.values())
    athletes_list.sort(key=lambda x: (x['Gold Medal'], x['Silver Medal'], x['Bronze Medal']), reverse=True)
    
    # Get information for top athletes
    top_athletes = []
    for athlete_data in athletes_list:
        athlete_name = athlete_data['name']
        athlete_code = athlete_data['code']
        
        # Find this athlete in athlete data
        athlete_info = {}
        try:
            # Match by code in athletes dataframe
            if not pd.isna(athlete_code):
                athlete_df_data = athletes_df[athletes_df['code'] == athlete_code]
                if not athlete_df_data.empty:
                    athlete_row = athlete_df_data.iloc[0]
                    athlete_slug = generate_name_slug(athlete_name)
                    athlete_info = {
                        'slug': athlete_slug,
                        'name': athlete_name,
                        'countryCode': convert_country_code(athlete_row['country_code']) if pd.notna(athlete_row['country_code']) else '',
                        'country': athlete_row['country'] if pd.notna(athlete_row['country']) else '',
                        'medals': {
                            'gold': athlete_data['Gold Medal'],
                            'silver': athlete_data['Silver Medal'],
                            'bronze': athlete_data['Bronze Medal'],
                            'total': athlete_data['total']
                        }
                    }
                else:
                    # If not found by code, create info from medallists data
                    athlete_medal_row = sport_medallists[sport_medallists['code_athlete'] == athlete_code].iloc[0]
                    athlete_slug = generate_name_slug(athlete_name)
                    athlete_info = {
                        'slug': athlete_slug,
                        'name': athlete_name,
                        'countryCode': convert_country_code(athlete_medal_row['country_code']),
                        'country': athlete_medal_row['country'],
                        'medals': {
                            'gold': athlete_data['Gold Medal'],
                            'silver': athlete_data['Silver Medal'],
                            'bronze': athlete_data['Bronze Medal'],
                            'total': athlete_data['total']
                        }
                    }
        except:
            # Fallback to just using the name if there are issues
            athlete_slug = generate_name_slug(athlete_name)
            athlete_info = {
                'slug': athlete_slug,
                'name': athlete_name,
                'medals': {
                    'gold': athlete_data['Gold Medal'],
                    'silver': athlete_data['Silver Medal'],
                    'bronze': athlete_data['Bronze Medal'],
                    'total': athlete_data['total']
                }
            }
        
        if athlete_info:
            top_athletes.append(athlete_info)
    
    return top_athletes

def get_top_countries_by_sport(sport_name):
    # Filter medallists by the sport (discipline)
    sport_medallists = medallists_df[medallists_df['discipline'] == sport_name]
    
    # Count medals by type for each country
    countries_medal_data = {}
    for _, medal in sport_medallists.iterrows():
        country_name = medal['country']
        country_code = medal['country_code']
        medal_type = medal['medal_type']
        
        if country_code not in countries_medal_data:
            countries_medal_data[country_code] = {
                'name': country_name,
                'code': country_code,
                'Gold Medal': 0,
                'Silver Medal': 0,
                'Bronze Medal': 0,
                'total': 0
            }
        
        if pd.notna(medal_type):
            countries_medal_data[country_code][medal_type] += 1
            countries_medal_data[country_code]['total'] += 1
    
    # Convert to list and sort by Olympic ranking (gold first, then silver, then bronze)
    countries_list = list(countries_medal_data.values())
    countries_list.sort(key=lambda x: (x['Gold Medal'], x['Silver Medal'], x['Bronze Medal']), reverse=True)
    
    # Get information for top countries
    top_countries = []
    for country_data in countries_list:
        country_slug = generate_slug(country_data['name'])
        country_info = {
            'slug': country_slug,
            'name': country_data['name'],
            'code': convert_country_code(country_data['code']),
            'medals': {
                'gold': country_data['Gold Medal'],
                'silver': country_data['Silver Medal'],
                'bronze': country_data['Bronze Medal'],
                'total': country_data['total']
            }
        }
        top_countries.append(country_info)
    
    return top_countries

In [73]:
# Let's test our functions with a few sports
test_sports = ['Swimming', 'Athletics', 'Gymnastics Artistic']

for sport in test_sports:
    print(f"\n{sport}:")
    top_athletes = get_top_athletes_by_sport(sport)
    print(f"  Top athletes: {len(top_athletes)}")
    if top_athletes:
        print(f"  Sample: {top_athletes[:3]}")
    
    top_countries = get_top_countries_by_sport(sport)
    print(f"  Top countries: {len(top_countries)}")
    if top_countries:
        print(f"  Sample: {top_countries[:3]}")


Swimming:
  Top athletes: 125
  Sample: [{'slug': 'leon-marchand', 'name': 'MARCHAND Leon', 'countryCode': 'FR', 'country': 'France', 'medals': {'gold': 4, 'silver': 0, 'bronze': 1, 'total': 5}}, {'slug': 'torri-huske', 'name': 'HUSKE Torri', 'countryCode': 'US', 'country': 'United States', 'medals': {'gold': 3, 'silver': 2, 'bronze': 0, 'total': 5}}, {'slug': 'mollie-ocallaghan', 'name': "O'CALLAGHAN Mollie", 'countryCode': 'AU', 'country': 'Australia', 'medals': {'gold': 3, 'silver': 1, 'bronze': 1, 'total': 5}}]
  Top countries: 19
  Sample: [{'slug': 'united-states', 'name': 'United States', 'code': 'US', 'medals': {'gold': 27, 'silver': 36, 'bronze': 7, 'total': 70}}, {'slug': 'australia', 'name': 'Australia', 'code': 'AU', 'medals': {'gold': 17, 'silver': 19, 'bronze': 15, 'total': 51}}, {'slug': 'china', 'name': 'China', 'code': 'CN', 'medals': {'gold': 6, 'silver': 8, 'bronze': 22, 'total': 36}}]

Athletics:
  Top athletes: 193
  Sample: [{'slug': 'gabrielle-thomas', 'name': '

In [74]:
# Process all sports and create the final JSON data
sports_json = {}

for _, sport_row in sports_df.iterrows():
    sport_name = sport_row['sport']
    sport_code = sport_row['sport_code']
    sport_slug = sport_row['tag']  # Using the tag as slug as requested
    
    # Get venues for this sport
    venues = venues_sports.get(sport_name, [])
    
    # Get events for this sport
    events = sport_events.get(sport_name, [])
    
    # Get top athletes and countries
    athletes = get_top_athletes_by_sport(sport_name)
    countries = get_top_countries_by_sport(sport_name)
    
    # Create and store the sport object
    sport_data = {
        'name': sport_name,
        'code': sport_code,
        'slug': sport_slug,
        'description': f"{sport_name} competitions at the Paris 2024 Olympic Games",
        'events': events,
        'athletes': athletes,
        'countries': countries,
        'venues': venues
    }
    
    sports_json[sport_slug] = sport_data

# Count sports with data
sports_with_athletes = sum(1 for slug, data in sports_json.items() if data['athletes'] and len(data['athletes']) > 0)
sports_with_countries = sum(1 for slug, data in sports_json.items() if data['countries'] and len(data['countries']) > 0)
sports_with_venues = sum(1 for slug, data in sports_json.items() if data['venues'] and len(data['venues']) > 0)

print(f"Processed {len(sports_json)} sports")
print(f"  Sports with athletes: {sports_with_athletes}")
print(f"  Sports with countries: {sports_with_countries}")
print(f"  Sports with venues: {sports_with_venues}")

sports_without_venues = [slug for slug, data in sports_json.items() if not data['venues'] or len(data['venues']) == 0]
print(f"Sports without venues: {sports_without_venues}")

Processed 45 sports
  Sports with athletes: 45
  Sports with countries: 45
  Sports with venues: 45
Sports without venues: []


In [75]:
# Look at a few examples of the sports data
for slug in list(sports_json.keys())[:3]:
    sport = sports_json[slug]
    print(f"\n{sport['name']} ({slug}):")
    print(f"  Events: {len(sport['events'])}")
    print(f"  Athletes: {len(sport['athletes'])}")
    print(f"  Countries: {len(sport['countries'])}")
    print(f"  Venues: {len(sport['venues'])}")


Archery (archery):
  Events: 5
  Athletes: 23
  Countries: 7
  Venues: 1

Artistic Gymnastics (artistic-gymnastics):
  Events: 14
  Athletes: 45
  Countries: 17
  Venues: 1

Artistic Swimming (artistic-swimming):
  Events: 2
  Athletes: 31
  Countries: 5
  Venues: 1


In [76]:
# Create directory if it doesn't exist
Path('../public/data').mkdir(parents=True, exist_ok=True)

# Save the data as JSON
with open('../public/data/sports.json', 'w', encoding='utf-8') as f:
    json.dump(sports_json, f, ensure_ascii=False, indent=2)

print("Data saved to ../public/data/sports.json")

Data saved to ../public/data/sports.json
