# Process Athletes and Medallists Data

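This notebook combines the athletes and medallists CSV data (`../eda/data/athletes.csv`, `../eda/data/medallists.csv`) into a single JSON file, `../public/data/athletes.json`, keyed by athlete slug, for use in the application.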

In [2]:
import pandas as pd
import json
import os
from unidecode import unidecode
from ast import literal_eval
import re
from pathlib import Path

In [3]:
# Read the athletes and medallists source tables
athletes_df = pd.read_csv('../eda/data/athletes.csv')
medallists_df = pd.read_csv('../eda/data/medallists.csv')

# Report the (rows, columns) of each table as a quick sanity check
print(f"Athletes data shape: {athletes_df.shape}")
print(f"Medallists data shape: {medallists_df.shape}")

# Preview the first two athlete rows as the cell's rich output
athletes_df.head(n=2)

Athletes data shape: (11113, 36)
Medallists data shape: (2315, 21)


Unnamed: 0,code,current,name,name_short,name_tv,gender,function,country_code,country,country_long,...,family,lang,coach,reason,hero,influence,philosophy,sporting_relatives,ritual,other_sports
0,1532872,True,ALEKSANYAN Artur,ALEKSANYAN A,Artur ALEKSANYAN,Male,Athlete,ARM,Armenia,Armenia,...,"Father, Gevorg Aleksanyan","Armenian, English, Russian","Gevorg Aleksanyan (ARM), father",He followed his father and his uncle into the ...,"Footballer Zinedine Zidane (FRA), World Cup wi...","His father, Gevorg Aleksanyan","""Wrestling is my life."" (mediamax.am. 18 May 2...",,,
1,1532873,True,AMOYAN Malkhas,AMOYAN M,Malkhas AMOYAN,Male,Athlete,ARM,Armenia,Armenia,...,,Armenian,,,,,"""To become a good athlete, you first have to b...","Uncle, Roman Amoyan (wrestling), 2008 Olympic ...",,


In [4]:
# Preview the first two medallist rows as the cell's rich output
medallists_df.head(n=2)

Unnamed: 0,medal_date,medal_type,medal_code,name,gender,country_code,country,country_long,nationality_code,nationality,...,team,team_gender,discipline,event,event_type,url_event,birth_date,code_athlete,code_team,is_medallist
0,2024-07-27,Gold Medal,1.0,EVENEPOEL Remco,Male,BEL,Belgium,Belgium,BEL,Belgium,...,,,Cycling Road,Men's Individual Time Trial,ATH,/en/paris-2024/results/cycling-road/men-s-indi...,2000-01-25,1903136,,True
1,2024-07-27,Silver Medal,2.0,GANNA Filippo,Male,ITA,Italy,Italy,ITA,Italy,...,,,Cycling Road,Men's Individual Time Trial,ATH,/en/paris-2024/results/cycling-road/men-s-indi...,1996-07-25,1923520,,True


In [5]:
# Column names of the athletes table
print("Athletes columns:")
print(list(athletes_df.columns))

# Column names of the medallists table (leading newline separates the two listings)
print("\nMedallists columns:")
print(list(medallists_df.columns))

Athletes columns:
['code', 'current', 'name', 'name_short', 'name_tv', 'gender', 'function', 'country_code', 'country', 'country_long', 'nationality', 'nationality_long', 'nationality_code', 'height', 'weight', 'disciplines', 'events', 'birth_date', 'birth_place', 'birth_country', 'residence_place', 'residence_country', 'nickname', 'hobbies', 'occupation', 'education', 'family', 'lang', 'coach', 'reason', 'hero', 'influence', 'philosophy', 'sporting_relatives', 'ritual', 'other_sports']

Medallists columns:
['medal_date', 'medal_type', 'medal_code', 'name', 'gender', 'country_code', 'country', 'country_long', 'nationality_code', 'nationality', 'nationality_long', 'team', 'team_gender', 'discipline', 'event', 'event_type', 'url_event', 'birth_date', 'code_athlete', 'code_team', 'is_medallist']


In [6]:
# function to safely convert string representation of lists to actual lists
def safe_eval(value):
    """Parse the string representation of a list into an actual list.

    Args:
        value: A cell value: a real list, a string such as "['a', 'b']",
            any other string, or a missing value (None / NaN).

    Returns:
        - ``value`` itself when it is already a list,
        - ``[]`` for missing values and empty / whitespace-only strings,
        - the parsed literal when the stripped string is bracketed and is a
          valid Python literal,
        - ``value`` unchanged in every other case (including bracketed
          strings that fail to parse).
    """
    # Lists are handled first: pd.isna() on a list returns an array, and the
    # truth value of an array with more than one element raises ValueError.
    if isinstance(value, list):
        return value

    if isinstance(value, str):
        text = value.strip()
        if not text:
            return []
        if text.startswith('[') and text.endswith(']'):
            try:
                return literal_eval(text)
            except (ValueError, SyntaxError):
                # Bracketed but not a valid literal: hand back the raw text
                return value
        return value

    if pd.isna(value):
        return []
    return value

In [7]:
# ISO 3166-1 alpha-3 -> alpha-2 codes, generated from CSV file
_ISO3_TO_ISO2 = {
    'AFG': 'AF', 'ALA': 'AX', 'ALB': 'AL', 'DZA': 'DZ', 'ASM': 'AS', 'AND': 'AD', 'AGO': 'AO', 'AIA': 'AI',
    'ATA': 'AQ', 'ATG': 'AG', 'ARG': 'AR', 'ARM': 'AM', 'ABW': 'AW', 'AUS': 'AU', 'AUT': 'AT', 'AZE': 'AZ',
    'BHS': 'BS', 'BHR': 'BH', 'BGD': 'BD', 'BRB': 'BB', 'BLR': 'BY', 'BEL': 'BE', 'BLZ': 'BZ', 'BEN': 'BJ',
    'BMU': 'BM', 'BTN': 'BT', 'BOL': 'BO', 'BES': 'BQ', 'BIH': 'BA', 'BWA': 'BW', 'BVT': 'BV', 'BRA': 'BR',
    'IOT': 'IO', 'BRN': 'BN', 'BGR': 'BG', 'BFA': 'BF', 'MMR': 'MM', 'BDI': 'BI', 'KHM': 'KH', 'CMR': 'CM',
    'CAN': 'CA', 'CPV': 'CV', 'CYM': 'KY', 'CAF': 'CF', 'TCD': 'TD', 'CHL': 'CL', 'CHN': 'CN', 'CXR': 'CX',
    'CCK': 'CC', 'COL': 'CO', 'COM': 'KM', 'COD': 'CD', 'COG': 'CG', 'COK': 'CK', 'CRI': 'CR', 'CIV': 'CI',
    'HRV': 'HR', 'CUB': 'CU', 'CUW': 'CW', 'CYP': 'CY', 'CZE': 'CZ', 'DNK': 'DK', 'DJI': 'DJ', 'DMA': 'DM',
    'DOM': 'DO', 'ECU': 'EC', 'EGY': 'EG', 'SLV': 'SV', 'GNQ': 'GQ', 'ERI': 'ER', 'EST': 'EE', 'ETH': 'ET',
    'FLK': 'FK', 'FRO': 'FO', 'FJI': 'FJ', 'FIN': 'FI', 'FRA': 'FR', 'GUF': 'GF', 'PYF': 'PF', 'ATF': 'TF',
    'GAB': 'GA', 'GMB': 'GM', 'GEO': 'GE', 'DEU': 'DE', 'GHA': 'GH', 'GIB': 'GI', 'GRC': 'GR', 'GRL': 'GL',
    'GRD': 'GD', 'GLP': 'GP', 'GUM': 'GU', 'GTM': 'GT', 'GGY': 'GG', 'GNB': 'GW', 'GIN': 'GN', 'GUY': 'GY',
    'HTI': 'HT', 'HMD': 'HM', 'VAT': 'VA', 'HND': 'HN', 'HKG': 'HK', 'HUN': 'HU', 'ISL': 'IS', 'IND': 'IN',
    'IDN': 'ID', 'IRN': 'IR', 'IRQ': 'IQ', 'IRL': 'IE', 'IMN': 'IM', 'ISR': 'IL', 'ITA': 'IT', 'JAM': 'JM',
    'JPN': 'JP', 'JEY': 'JE', 'JOR': 'JO', 'KAZ': 'KZ', 'KEN': 'KE', 'KIR': 'KI', 'PRK': 'KP', 'KOR': 'KR',
    'XKX': 'XK', 'KWT': 'KW', 'KGZ': 'KG', 'LAO': 'LA', 'LVA': 'LV', 'LBN': 'LB', 'LSO': 'LS', 'LBR': 'LR',
    'LBY': 'LY', 'LIE': 'LI', 'LTU': 'LT', 'LUX': 'LU', 'MAC': 'MO', 'MKD': 'MK', 'MDG': 'MG', 'MWI': 'MW',
    'MYS': 'MY', 'MDV': 'MV', 'MLI': 'ML', 'MLT': 'MT', 'MHL': 'MH', 'MTQ': 'MQ', 'MRT': 'MR', 'MUS': 'MU',
    'MYT': 'YT', 'MEX': 'MX', 'FSM': 'FM', 'MDA': 'MD', 'MCO': 'MC', 'MNG': 'MN', 'MNE': 'ME', 'MSR': 'MS',
    'MAR': 'MA', 'MOZ': 'MZ', 'NAM': 'NA', 'NRU': 'NR', 'NPL': 'NP', 'ANT': 'AN', 'NLD': 'NL', 'NCL': 'NC',
    'NZL': 'NZ', 'NIC': 'NI', 'NER': 'NE', 'NGA': 'NG', 'NIU': 'NU', 'NFK': 'NF', 'MNP': 'MP', 'NOR': 'NO',
    'OMN': 'OM', 'PAK': 'PK', 'PLW': 'PW', 'PSE': 'PS', 'PAN': 'PA', 'PNG': 'PG', 'PRY': 'PY', 'PER': 'PE',
    'PHL': 'PH', 'PCN': 'PN', 'POL': 'PL', 'PRT': 'PT', 'PRI': 'PR', 'QAT': 'QA', 'REU': 'RE', 'ROU': 'RO',
    'RUS': 'RU', 'RWA': 'RW', 'BLM': 'BL', 'SHN': 'SH', 'KNA': 'KN', 'LCA': 'LC', 'MAF': 'MF', 'SPM': 'PM',
    'VCT': 'VC', 'WSM': 'WS', 'SMR': 'SM', 'STP': 'ST', 'SAU': 'SA', 'SEN': 'SN', 'SRB': 'RS', 'SYC': 'SC',
    'SLE': 'SL', 'SGP': 'SG', 'SXM': 'SX', 'SVK': 'SK', 'SVN': 'SI', 'SLB': 'SB', 'SOM': 'SO', 'ZAF': 'ZA',
    'SGS': 'GS', 'SSD': 'SS', 'ESP': 'ES', 'LKA': 'LK', 'SDN': 'SD', 'SUR': 'SR', 'SJM': 'SJ', 'SWZ': 'SZ',
    'SWE': 'SE', 'CHE': 'CH', 'SYR': 'SY', 'TWN': 'TW', 'TJK': 'TJ', 'TZA': 'TZ', 'THA': 'TH', 'TLS': 'TL',
    'TGO': 'TG', 'TKL': 'TK', 'TON': 'TO', 'TTO': 'TT', 'TUN': 'TN', 'TUR': 'TR', 'TKM': 'TM', 'TCA': 'TC',
    'TUV': 'TV', 'UGA': 'UG', 'UKR': 'UA', 'ARE': 'AE', 'GBR': 'GB', 'UMI': 'UM', 'USA': 'US', 'URY': 'UY',
    'UZB': 'UZ', 'VUT': 'VU', 'VEN': 'VE', 'VNM': 'VN', 'VGB': 'VG', 'VIR': 'VI', 'WLF': 'WF', 'ESH': 'EH',
    'YEM': 'YE', 'ZMB': 'ZM', 'ZWE': 'ZW',
}

# IOC (Olympic) codes that differ from ISO alpha-3. Without these, the
# two-letter truncation fallback silently yields another country's code
# (e.g. 'DEN' -> 'DE', 'CRO' -> 'CR', 'CHI' -> 'CH', 'UAE' -> 'UA').
# These entries take precedence over the ISO table: 'BRN' (IOC: Bahrain) and
# 'ANT' (IOC: Antigua and Barbuda) collide with ISO codes for other countries.
_IOC_TO_ISO2 = {
    'ALG': 'DZ', 'ANG': 'AO', 'ANT': 'AG', 'ARU': 'AW', 'ASA': 'AS', 'BAH': 'BS', 'BAN': 'BD', 'BAR': 'BB',
    'BER': 'BM', 'BHU': 'BT', 'BIZ': 'BZ', 'BOT': 'BW', 'BRN': 'BH', 'BRU': 'BN', 'BUL': 'BG', 'BUR': 'BF',
    'CAM': 'KH', 'CAY': 'KY', 'CGO': 'CG', 'CHA': 'TD', 'CHI': 'CL', 'CRC': 'CR', 'CRO': 'HR', 'DEN': 'DK',
    'ESA': 'SV', 'FIJ': 'FJ', 'GAM': 'GM', 'GBS': 'GW', 'GEQ': 'GQ', 'GER': 'DE', 'GRE': 'GR', 'GRN': 'GD',
    'GUA': 'GT', 'GUI': 'GN', 'HAI': 'HT', 'HON': 'HN', 'INA': 'ID', 'IRI': 'IR', 'ISV': 'VI', 'IVB': 'VG',
    'KOS': 'XK', 'KSA': 'SA', 'KUW': 'KW', 'LAT': 'LV', 'LBA': 'LY', 'LES': 'LS', 'LIB': 'LB', 'MAD': 'MG',
    'MAS': 'MY', 'MAW': 'MW', 'MGL': 'MN', 'MON': 'MC', 'MRI': 'MU', 'MTN': 'MR', 'MYA': 'MM', 'NCA': 'NI',
    'NED': 'NL', 'NEP': 'NP', 'NGR': 'NG', 'NIG': 'NE', 'OMA': 'OM', 'PAR': 'PY', 'PHI': 'PH', 'PLE': 'PS',
    'POR': 'PT', 'PUR': 'PR', 'ROM': 'RO', 'RSA': 'ZA', 'SAM': 'WS', 'SEY': 'SC', 'SIN': 'SG', 'SKN': 'KN',
    'SLO': 'SI', 'SOL': 'SB', 'SRI': 'LK', 'SUD': 'SD', 'SUI': 'CH', 'TAN': 'TZ', 'TGA': 'TO', 'TOG': 'TG',
    'TPE': 'TW', 'UAE': 'AE', 'URU': 'UY', 'VAN': 'VU', 'VIE': 'VN', 'VIN': 'VC', 'ZAM': 'ZM', 'ZIM': 'ZW',
}

# Combined lookup, built once instead of on every call; IOC entries win.
_COUNTRY_CODES = {**_ISO3_TO_ISO2, **_IOC_TO_ISO2}


# function to convert 3-letter country code to 2-letter
def convert_country_code(code_3):
    """Convert a 3-letter country code (IOC or ISO alpha-3) to ISO alpha-2.

    Args:
        code_3: Three-letter code such as 'ARM', 'DEN' or 'USA'; may be
            missing (None / NaN) or an empty string.

    Returns:
        The 2-letter code, '' for missing/empty input, or the first two
        letters of ``code_3`` when the code is not in the lookup table.
    """
    if pd.isna(code_3) or code_3 == '':
        return ''

    # NOTE(review): codes with no ISO counterpart still fall through to the
    # two-letter truncation below; confirm whether the application would
    # rather receive '' for those.
    return _COUNTRY_CODES.get(code_3, code_3[:2])


# function to generate a first-name-last-name slug
def generate_name_slug(name):
    """Build a URL-friendly "given-names-surname" slug from a person's name.

    Two input layouts are recognised:

    * "Firstname LASTNAME" (first token not all caps, last token all caps):
      the token order is kept.
    * "LASTNAME Firstname" (anything else): the surname is the first token
      plus every all-caps token directly after it, so multi-word surnames
      such as "VAN DER POEL Mathieu" stay intact; the remaining tokens are
      moved in front of the surname.

    Every token of the name is kept. For two-token names the result matches
    the previous implementation.

    Args:
        name: The name to slugify; may be missing (None / NaN) or ''.

    Returns:
        Lower-case ASCII slug containing only ``a-z``, ``0-9`` and ``-``,
        or '' for missing/empty input.
    """
    if pd.isna(name) or name == '':
        return ''

    parts = name.split()

    if len(parts) >= 2:
        given_names_first = not parts[0].isupper() and parts[-1].isupper()
        if not given_names_first:
            # The first token is always part of the surname (it may be mixed
            # case, e.g. "McINTOSH"); extend it over following all-caps tokens.
            surname_end = 1
            while surname_end < len(parts) and parts[surname_end].isupper():
                surname_end += 1
            # An all-caps name has no given-name tokens, so order is unchanged
            parts = parts[surname_end:] + parts[:surname_end]

    # Convert to lowercase and join the tokens with hyphens
    slug = ' '.join(parts).lower().replace(' ', '-')
    # Remove accents
    slug = unidecode(slug)
    # Remove any special characters
    slug = re.sub(r'[^a-z0-9-]', '', slug)
    return slug

# Smoke-check the slug helper on both name orders, a mixed-case surname and an apostrophe
for sample_name in ('MARCHAND Léon', 'Eugenia BOSCO', 'McINTOSH Summer', "O'CROININ Emma"):
    print(generate_name_slug(sample_name))

leon-marchand
eugenia-bosco
summer-mcintosh
emma-ocroinin


In [12]:
# Read the events.csv file to map each sport name to its tag
events_df = pd.read_csv('../eda/data/events.csv')
sport_to_tag = dict(zip(events_df['sport'], events_df['tag']))

# Year the 'age' field is computed against (a year difference, not an exact age)
GAMES_YEAR = 2024


def _clean(value, default=''):
    """Return ``value``, or ``default`` when it is missing (None / NaN)."""
    return value if pd.notna(value) else default


def _as_list(value):
    """Coerce a parsed 'disciplines' / 'events' cell to a list.

    safe_eval() hands back the raw string when a cell is not a list literal;
    wrapping it keeps a bare sport name from being iterated per character.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, (tuple, set)):
        return list(value)
    if isinstance(value, str):
        return [value] if value else []
    return []


def _country_slug(country):
    """Slugify a country name, keeping its words in their original order.

    generate_name_slug() is meant for person names and reorders tokens, which
    turns "United States" into "states-united".
    """
    slug = unidecode(str(country).lower().replace(' ', '-'))
    return re.sub(r'[^a-z0-9-]', '', slug)


# Index medals by athlete code once, instead of filtering the whole
# medallists frame for every athlete.
medals_by_athlete = {}
for medal in medallists_df.to_dict('records'):
    medal_owner = medal.get('code_athlete')
    if pd.isna(medal_owner):
        continue
    medals_by_athlete.setdefault(medal_owner, []).append({
        # .get() covers a missing column, _clean() a missing value; NaN would
        # otherwise be written to the file as the invalid JSON token `NaN`
        'date': _clean(medal.get('medal_date')),
        'type': _clean(medal.get('medal_type')),
        'event': _clean(medal.get('event')),
        'discipline': _clean(medal.get('discipline')),
    })

# Build one JSON entry per athlete, keyed by a unique slug
athletes_json = {}

for _, athlete in athletes_df.iterrows():
    # handle duplicate slugs by adding a number if needed
    base_slug = generate_name_slug(athlete['name'])
    athlete_slug = base_slug
    counter = 1
    while athlete_slug in athletes_json:
        athlete_slug = f"{base_slug}-{counter}"
        counter += 1

    # Convert disciplines and events to lists
    disciplines = _as_list(safe_eval(athlete['disciplines']))
    events = _as_list(safe_eval(athlete['events']))

    birth_date = athlete['birth_date']
    country = athlete['country']

    # Create the athlete object with proper null handling
    athletes_json[athlete_slug] = {
        'name': _clean(athlete['name']),
        'name_short': _clean(athlete['name_short']),
        'gender': _clean(athlete['gender']),
        'photoUrl': f"/athletes/{athlete_slug}.jpg",  # Default path
        'country': {
            'slug': _country_slug(country) if pd.notna(country) else '',
            'name': _clean(country),
            # Convert 3-letter country code to 2-letter
            'code': convert_country_code(athlete['country_code']),
        },
        'nationality': _clean(athlete['nationality']),
        'birth_date': _clean(birth_date),
        'age': GAMES_YEAR - pd.to_datetime(birth_date).year if pd.notna(birth_date) else None,
        'birth_place': _clean(athlete['birth_place']),
        'birth_country': _clean(athlete['birth_country']),
        'height': float(athlete['height']) if pd.notna(athlete['height']) else 0,
        'weight': float(athlete['weight']) if pd.notna(athlete['weight']) else 0,
        'sports': [
            {'slug': sport_to_tag.get(sport, ''), 'name': sport, 'description': ''}
            for sport in disciplines
        ],
        'events': events,
        'bio': _clean(athlete['philosophy']),
        'nickname': _clean(athlete['nickname']),
        'hobbies': _clean(athlete['hobbies']),
        'occupation': _clean(athlete['occupation']),
        'education': _clean(athlete['education']),
        'family': _clean(athlete['family']),
        'languages': _clean(athlete['lang']),
        'coach': _clean(athlete['coach']),
        'hero': _clean(athlete['hero']),
        # Fresh list per athlete so entries never share a mutable object
        'achievements': list(medals_by_athlete.get(athlete['code'], [])),
    }

# Every row must have produced exactly one entry (slugs are de-duplicated above)
assert len(athletes_json) == len(athletes_df)

# Count athletes with medals
athletes_with_medals = sum(1 for data in athletes_json.values() if data['achievements'])
print(f"Processed {len(athletes_json)} athletes, {athletes_with_medals} with medals")

Processed 11113 athletes, 2054 with medals


In [13]:
# Make sure the output folder exists (no error if it is already there)
output_dir = Path('../public/data')
output_dir.mkdir(parents=True, exist_ok=True)

# Write the athletes mapping as pretty-printed UTF-8 JSON, keeping non-ASCII characters as-is
output_file = Path('../public/data/athletes.json')
with output_file.open('w', encoding='utf-8') as f:
    json.dump(athletes_json, f, ensure_ascii=False, indent=2)

print("Data saved to ../public/data/athletes.json")

Data saved to ../public/data/athletes.json
