In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import hashlib
import matplotlib.pyplot as plt

In [2]:
class AthleticsDataScraper:
    """Scraper for the men's all-time lists on alltime-athletics.com.

    For one event code it downloads the wind-legal list (and, when that
    table has a wind column, the wind-assisted list as well), splits the
    whitespace-aligned ``<pre>`` table into a DataFrame and adds derived
    columns: an all-conditions rank, the athlete's age on the day of the
    performance and a competition id.
    """

    # Seconds to wait for the HTTP response before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.base_url = 'https://www.alltime-athletics.com/'

    def generate_url(self, event, is_legal):
        """Return the URL of the list page for ``event``.

        Args:
            event: Event code such as ``'200'``, ``'100m'`` or ``'long'``.
            is_legal: True for the "ok" (legal) list, False for the "no" list.

        Returns:
            ``base_url`` followed by the page file name.
        """
        # Handle special cases with different URL patterns
        # NOTE(review): the two '100m' file names follow different patterns
        # ('m_100ok' vs 'm100mno') — confirm both against the live site.
        special_cases = {
            '100m': ('m_100ok.htm', 'm100mno.htm'),
            'trip': ('mtripok.htm', 'mtripno.htm'),
            'long': ('mlongok.htm', 'mlongno.htm'),
            '110h': ('m_110hok.htm', 'm_110hno.htm'),
            'pole': ('mpoleok.htm','mpoleno.htm'),
            'shot': ('mshotok.htm','mshotno.htm'),
            'disc': ('mdiscok.htm','mdiscno.htm'),
            'jave': ('mjaveok.htm','mjaveno.htm'),
            'hamm': ('mhammok.htm','mhammno.htm'),
            'deca': ('mdecaok.htm','mdecano.htm'),
            '60m':   ('m60mok.htm','m60mno.htm')
            # Add more special cases here if needed
        }
        if event in special_cases:
            legal_suffix, illegal_suffix = special_cases[event]
            suffix = legal_suffix if is_legal else illegal_suffix
        else:
            # Default pattern: m_<event>ok.htm / m_<event>no.htm
            suffix = f"m_{event}{'ok' if is_legal else 'no'}.htm"

        return f"{self.base_url}{suffix}"

    @staticmethod
    def _parse_table(table_text, is_legal):
        """Convert the text of the results ``<pre>`` block into a DataFrame.

        Returns:
            ``(df, has_wind)`` where ``has_wind`` tells whether the table
            was recognised as having a 'Wind' column.
        """
        # Fields are separated by runs of two or more whitespace characters,
        # so single spaces stay inside a field. Blank lines are skipped.
        data = [
            [part.strip() for part in re.split(r'\s{2,}', row)]
            for row in table_text.split('\n')
            if row.strip()
        ]
        max_length = max((len(fields) for fields in data), default=0)

        # Define column names based on the maximum row length: ten fields
        # means the table carries a wind reading. 'Test' is a throwaway name
        # for the first split field (presumably the empty string left of the
        # row's leading indentation) and is dropped right away.
        if max_length == 10:
            column_names = ["Test", "Rank", "Time", "Wind", "Name", "Country", "DOB", "Position_in_race", "City", "Date"]
        else:
            column_names = ["Test", "Rank", "Time", "Name", "Country", "DOB", "Position_in_race", "City", "Date"]

        # Rows with fewer fields than the widest row are padded with missing
        # values by the DataFrame constructor.
        df = pd.DataFrame(data, columns=column_names[:max_length])
        df = df.drop(columns='Test', errors='ignore')
        df['Legal'] = 'Y' if is_legal else 'N'
        return df, 'Wind' in df.columns

    def fetch_data(self, event, is_legal):
        """Download and parse one list page.

        Args:
            event: Event code understood by ``generate_url``.
            is_legal: Which of the two lists to fetch.

        Returns:
            ``(df, has_wind)``; ``df`` holds the raw string fields plus a
            'Legal' column set to 'Y' or 'N'.

        Raises:
            requests.HTTPError: The server answered with an error status.
            ValueError: The page contains no ``<pre>`` element.
        """
        url = self.generate_url(event, is_legal)
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        pre_tag = soup.find('pre')
        if pre_tag is None:
            # Fail with the URL in the message instead of an AttributeError.
            raise ValueError(f"No <pre> results table found at {url}")
        return self._parse_table(pre_tag.get_text(), is_legal)

    def add_all_conditions_rank(self, df, event):
        """Add 'All Conditions Rank', ranking every row of ``df`` by 'Time'.

        Event codes containing a digit are treated as races (smallest value
        ranks first); all other codes are ranked with the largest value
        first. Ties share the lowest rank. ``df`` is modified and returned.
        """
        if re.search(r'\d', event):
            # This is a race event
            df['All Conditions Rank'] = df['Time'].rank(method='min')
        else:
            # This is a field event
            df['All Conditions Rank'] = df['Time'].rank(ascending=False, method='min')
        return df

    def add_age_at_time_of_race(self, df):
        """Add 'Age at Time of Race' (completed years on the 'Date' day).

        'DOB' and 'Date' are passed through ``pd.to_datetime`` (already
        parsed columns are left as they are). DOBs reach this method parsed
        with a two-digit-year format in ``get_combined_data``, so a birth
        year can land a century late; any DOB that is not earlier than the
        competition date is moved back 100 years. Rows without a usable
        DOB or Date get ``pd.NA``. ``df`` is modified and returned.
        """
        df['DOB'] = pd.to_datetime(df['DOB'], format='%d.%m.%Y', errors='coerce')
        df['Date'] = pd.to_datetime(df['Date'], format='%d.%m.%Y', errors='coerce')

        # Correct misinterpreted dates. Comparing against the competition
        # date (today when it is missing) instead of a fixed calendar year
        # also repairs athletes born early in the 1900s.
        dob = df['DOB']
        reference = df['Date'].fillna(pd.Timestamp.now().normalize())
        wrong_century = dob.notna() & (dob >= reference)
        df['DOB'] = dob.where(~wrong_century, dob - pd.DateOffset(years=100))

        def age_on_day(born, day):
            if pd.isnull(born) or pd.isnull(day):
                return pd.NA
            # Subtract one year when the birthday has not come round yet.
            return day.year - born.year - ((day.month, day.day) < (born.month, born.day))

        df['Age at Time of Race'] = [
            age_on_day(born, day) for born, day in zip(df['DOB'], df['Date'])
        ]

        return df

    def add_competition_id(self, df):
        """Add 'competition_id', the SHA-1 hex digest of ``"<Date>_<City>"``.

        Rows sharing date and city therefore share an id. As a side effect
        a datetime 'Date' column is replaced by 'YYYY-MM-DD' strings; a
        column that is not datetime-typed is used as it is. ``df`` is
        modified and returned.
        """
        if pd.api.types.is_datetime64_any_dtype(df['Date']):
            df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')

        df['competition_id'] = [
            hashlib.sha1(f"{day}_{city}".encode()).hexdigest()
            for day, city in zip(df['Date'], df['City'])
        ]

        return df

    def get_combined_data(self, event):
        """Fetch, clean and enrich the list(s) for one event.

        Args:
            event: Event code understood by ``generate_url``.

        Returns:
            DataFrame with the scraped columns plus 'Legal', 'Note', 'Sex',
            'Event', 'All Conditions Rank', 'Age at Time of Race' and
            'competition_id'. 'Time' is float, 'DOB' is datetime and 'Date'
            is a 'YYYY-MM-DD' string.
        """
        df_legal, has_wind = self.fetch_data(event, True)

        # Only tables with a wind column get their second list fetched.
        if has_wind:
            df_illegal, _ = self.fetch_data(event, False)
            df_combined = pd.concat([df_legal, df_illegal], ignore_index=True)
        else:
            df_combined = df_legal

        # Discard rows that split into fewer fields than the widest row.
        df_combined = df_combined.dropna().copy()
        # Unparseable dates become NaT, as in AthleticsDataScraper_w.
        df_combined['Date'] = pd.to_datetime(df_combined['Date'], format='%d.%m.%Y', errors='coerce')
        # Two-digit birth years; the century is repaired in
        # add_age_at_time_of_race.
        df_combined['DOB'] = pd.to_datetime(df_combined['DOB'], format='%d.%m.%y', errors='coerce')

        # Extract any letters and marker symbols from the 'Time' column and put them into a new 'Note' column
        df_combined['Note'] = df_combined['Time'].str.extract(r'([a-zA-Z#*@+´]+)', expand=False)
        # Remove those characters from the 'Time' column so it converts to float
        df_combined['Time'] = df_combined['Time'].str.replace(r'[a-zA-Z#*@+´]', '', regex=True)
        df_combined['Time'] = df_combined['Time'].astype('float')
        df_combined['Sex'] = 'Male'
        df_combined['Event'] = event
        df_combined = self.add_all_conditions_rank(df_combined, event)

        # Rows from the second ("no") list keep no list rank of their own.
        df_combined.loc[df_combined['Legal'] == 'N', 'Rank'] = pd.NA
        df_combined = self.add_age_at_time_of_race(df_combined)

        df_combined = self.add_competition_id(df_combined)

        return df_combined


# Scrape the men's lists: one DataFrame per event code. Every call goes
# through fetch_data, i.e. it performs one or two HTTP requests.
# Codes without a special-case entry in generate_url ('200', '400', '400h',
# '300') use the default 'm_<event>ok/no.htm' file names.
scraper = AthleticsDataScraper()
df_200m = scraper.get_combined_data('200')
df_100m = scraper.get_combined_data('100m')
df_400m = scraper.get_combined_data('400')
df_long = scraper.get_combined_data('long')
df_trip = scraper.get_combined_data('trip')
df_110h = scraper.get_combined_data('110h')
df_400h = scraper.get_combined_data('400h')
df_pole = scraper.get_combined_data('pole')
df_shot = scraper.get_combined_data('shot')
df_disc = scraper.get_combined_data('disc')
df_jave = scraper.get_combined_data('jave')
df_hamm = scraper.get_combined_data('hamm')
df_deca = scraper.get_combined_data('deca')
df_60m = scraper.get_combined_data('60m')
df_300m = scraper.get_combined_data('300')


In [3]:
class AthleticsDataScraper_w:
    """Scraper for the women's all-time lists on alltime-athletics.com.

    For one event code it downloads the wind-legal list (and, when that
    table has a wind column, the wind-assisted list as well), splits the
    whitespace-aligned ``<pre>`` table into a DataFrame and adds derived
    columns: an all-conditions rank, the athlete's age on the day of the
    performance and a competition id.
    """

    # Seconds to wait for the HTTP response before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.base_url = 'https://www.alltime-athletics.com/'

    def generate_url(self, event, is_legal):
        """Return the URL of the list page for ``event``.

        Args:
            event: Event code such as ``'200'``, ``'100m'`` or ``'long'``.
            is_legal: True for the "ok" (legal) list, False for the "no" list.

        Returns:
            ``base_url`` followed by the page file name.
        """
        # Handle special cases with different URL patterns
        # NOTE(review): 'wtripleok'/'wtripleno' and 'whepaok' vs 'wheptno' do
        # not follow the naming of the other entries — confirm these file
        # names against the live site.
        special_cases = {
            '100m': ('w_100ok.htm', 'w_100no.htm'),
            'trip': ('wtripleok.htm', 'wtripleno.htm'),
            'long': ('wlongok.htm', 'wlongno.htm'),
            '100h': ('w_100hok.htm', 'w_100hno.htm'),
            'pole': ('wpoleok.htm','wpoleno.htm'),
            'shot': ('wshotok.htm','wshotno.htm'),
            'disc': ('wdiscok.htm','wdiscno.htm'),
            'jave': ('wjaveok.htm','wjaveno.htm'),
            'hamm': ('whammok.htm','whammno.htm'),
            'hept': ('whepaok.htm','wheptno.htm'),
            '60m':   ('w60mok.htm','w60mno.htm')
            # Add more special cases here if needed
        }
        if event in special_cases:
            legal_suffix, illegal_suffix = special_cases[event]
            suffix = legal_suffix if is_legal else illegal_suffix
        else:
            # Default pattern: w_<event>ok.htm / w_<event>no.htm
            suffix = f"w_{event}{'ok' if is_legal else 'no'}.htm"

        return f"{self.base_url}{suffix}"

    @staticmethod
    def _parse_table(table_text, is_legal):
        """Convert the text of the results ``<pre>`` block into a DataFrame.

        Returns:
            ``(df, has_wind)`` where ``has_wind`` tells whether the table
            was recognised as having a 'Wind' column.
        """
        # Fields are separated by runs of two or more whitespace characters,
        # so single spaces stay inside a field. Blank lines are skipped.
        data = [
            [part.strip() for part in re.split(r'\s{2,}', row)]
            for row in table_text.split('\n')
            if row.strip()
        ]
        max_length = max((len(fields) for fields in data), default=0)

        # Define column names based on the maximum row length: ten fields
        # means the table carries a wind reading. 'Test' is a throwaway name
        # for the first split field (presumably the empty string left of the
        # row's leading indentation) and is dropped right away.
        if max_length == 10:
            column_names = ["Test", "Rank", "Time", "Wind", "Name", "Country", "DOB", "Position_in_race", "City", "Date"]
        else:
            column_names = ["Test", "Rank", "Time", "Name", "Country", "DOB", "Position_in_race", "City", "Date"]

        # Rows with fewer fields than the widest row are padded with missing
        # values by the DataFrame constructor.
        df = pd.DataFrame(data, columns=column_names[:max_length])
        df = df.drop(columns='Test', errors='ignore')
        df['Legal'] = 'Y' if is_legal else 'N'
        return df, 'Wind' in df.columns

    def fetch_data(self, event, is_legal):
        """Download and parse one list page.

        Args:
            event: Event code understood by ``generate_url``.
            is_legal: Which of the two lists to fetch.

        Returns:
            ``(df, has_wind)``; ``df`` holds the raw string fields plus a
            'Legal' column set to 'Y' or 'N'.

        Raises:
            requests.HTTPError: The server answered with an error status.
            ValueError: The page contains no ``<pre>`` element.
        """
        url = self.generate_url(event, is_legal)
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        pre_tag = soup.find('pre')
        if pre_tag is None:
            # Fail with the URL in the message instead of an AttributeError.
            raise ValueError(f"No <pre> results table found at {url}")
        return self._parse_table(pre_tag.get_text(), is_legal)

    def add_all_conditions_rank(self, df, event):
        """Add 'All Conditions Rank', ranking every row of ``df`` by 'Time'.

        Event codes containing a digit are treated as races (smallest value
        ranks first); all other codes are ranked with the largest value
        first. Ties share the lowest rank. ``df`` is modified and returned.
        """
        if re.search(r'\d', event):
            # This is a race event
            df['All Conditions Rank'] = df['Time'].rank(method='min')
        else:
            # This is a field event
            df['All Conditions Rank'] = df['Time'].rank(ascending=False, method='min')
        return df

    def add_age_at_time_of_race(self, df):
        """Add 'Age at Time of Race' (completed years on the 'Date' day).

        'DOB' and 'Date' are passed through ``pd.to_datetime`` (already
        parsed columns are left as they are). DOBs reach this method parsed
        with a two-digit-year format in ``get_combined_data``, so a birth
        year can land a century late; any DOB that is not earlier than the
        competition date is moved back 100 years. Rows without a usable
        DOB or Date get ``pd.NA``. ``df`` is modified and returned.
        """
        df['DOB'] = pd.to_datetime(df['DOB'], format='%d.%m.%Y', errors='coerce')
        df['Date'] = pd.to_datetime(df['Date'], format='%d.%m.%Y', errors='coerce')

        # Correct misinterpreted dates. Comparing against the competition
        # date (today when it is missing) instead of a fixed calendar year
        # also repairs athletes born early in the 1900s.
        dob = df['DOB']
        reference = df['Date'].fillna(pd.Timestamp.now().normalize())
        wrong_century = dob.notna() & (dob >= reference)
        df['DOB'] = dob.where(~wrong_century, dob - pd.DateOffset(years=100))

        def age_on_day(born, day):
            if pd.isnull(born) or pd.isnull(day):
                return pd.NA
            # Subtract one year when the birthday has not come round yet.
            return day.year - born.year - ((day.month, day.day) < (born.month, born.day))

        df['Age at Time of Race'] = [
            age_on_day(born, day) for born, day in zip(df['DOB'], df['Date'])
        ]

        return df

    def add_competition_id(self, df):
        """Add 'competition_id', the SHA-1 hex digest of ``"<Date>_<City>"``.

        Rows sharing date and city therefore share an id. As a side effect
        a datetime 'Date' column is replaced by 'YYYY-MM-DD' strings; a
        column that is not datetime-typed is used as it is. ``df`` is
        modified and returned.
        """
        if pd.api.types.is_datetime64_any_dtype(df['Date']):
            df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')

        df['competition_id'] = [
            hashlib.sha1(f"{day}_{city}".encode()).hexdigest()
            for day, city in zip(df['Date'], df['City'])
        ]

        return df

    def get_combined_data(self, event):
        """Fetch, clean and enrich the list(s) for one event.

        Args:
            event: Event code understood by ``generate_url``.

        Returns:
            DataFrame with the scraped columns plus 'Legal', 'Note',
            'All Conditions Rank', 'Age at Time of Race', 'Sex', 'Event'
            and 'competition_id'. 'Time' is float, 'DOB' is datetime and
            'Date' is a 'YYYY-MM-DD' string.
        """
        df_legal, has_wind = self.fetch_data(event, True)

        # Only tables with a wind column get their second list fetched.
        if has_wind:
            df_illegal, _ = self.fetch_data(event, False)
            df_combined = pd.concat([df_legal, df_illegal], ignore_index=True)
        else:
            df_combined = df_legal

        # Discard rows that split into fewer fields than the widest row.
        df_combined = df_combined.dropna().copy()
        df_combined['Date'] = pd.to_datetime(df_combined['Date'], format='%d.%m.%Y',errors='coerce')
        # Two-digit birth years; the century is repaired in
        # add_age_at_time_of_race.
        df_combined['DOB'] = pd.to_datetime(df_combined['DOB'], format='%d.%m.%y', errors='coerce')

        # Extract any letters and marker symbols from the 'Time' column and put them into a new 'Note' column
        df_combined['Note'] = df_combined['Time'].str.extract(r'([a-zA-Z#*@+´]+)', expand=False)
        # Remove those characters from the 'Time' column so it converts to float
        df_combined['Time'] = df_combined['Time'].str.replace(r'[a-zA-Z#*@+´]', '', regex=True)

        df_combined['Time'] = df_combined['Time'].astype('float')
        df_combined = self.add_all_conditions_rank(df_combined, event)

        # Rows from the second ("no") list keep no list rank of their own.
        df_combined.loc[df_combined['Legal'] == 'N', 'Rank'] = pd.NA
        df_combined = self.add_age_at_time_of_race(df_combined)
        df_combined['Sex'] = 'Female'
        df_combined['Event'] = event
        df_combined = self.add_competition_id(df_combined)

        return df_combined


# Scrape the women's lists: one DataFrame per event code, suffixed '_w'.
# This rebinds the name `scraper` to the women's scraper. Every call goes
# through fetch_data, i.e. it performs one or two HTTP requests.
# Codes without a special-case entry in generate_url ('200', '400', '400h',
# '300') use the default 'w_<event>ok/no.htm' file names.
scraper = AthleticsDataScraper_w()
df_200m_w = scraper.get_combined_data('200')
df_100m_w = scraper.get_combined_data('100m')
df_400m_w = scraper.get_combined_data('400')
df_long_w = scraper.get_combined_data('long')
df_trip_w = scraper.get_combined_data('trip')
df_100h_w = scraper.get_combined_data('100h')
df_400h_w = scraper.get_combined_data('400h')
df_pole_w = scraper.get_combined_data('pole')
df_shot_w = scraper.get_combined_data('shot')
df_disc_w = scraper.get_combined_data('disc')
df_jave_w = scraper.get_combined_data('jave')
df_hamm_w = scraper.get_combined_data('hamm')
df_hept_w = scraper.get_combined_data('hept')
df_60m_w = scraper.get_combined_data('60m')
df_300m_w = scraper.get_combined_data('300')


In [4]:
# Stack every women's event table into one frame
dfs_women = [
    df_200m_w, df_100m_w, df_400m_w,
    df_long_w, df_trip_w, df_100h_w, df_400h_w,
    df_pole_w, df_shot_w, df_disc_w, df_jave_w, df_hamm_w,
    df_hept_w, df_60m_w, df_300m_w,
]
combined_women = pd.concat(objs=dfs_women, ignore_index=True)

# Stack every men's event table into one frame
dfs_men = [
    df_200m, df_100m, df_400m,
    df_long, df_trip, df_110h, df_400h,
    df_pole, df_shot, df_disc, df_jave, df_hamm,
    df_deca, df_60m, df_300m,
]
combined_men = pd.concat(objs=dfs_men, ignore_index=True)

# Guarantee a 'Wind' column on both frames before they are merged
# (`in` on a DataFrame tests its column labels)
if 'Wind' not in combined_women:
    combined_women['Wind'] = pd.NA
if 'Wind' not in combined_men:
    combined_men['Wind'] = pd.NA

# Women first, then men, renumbered with a fresh 0..n-1 index
combined_all = pd.concat(objs=[combined_women, combined_men], ignore_index=True)

In [11]:
# Show the combined women's + men's frame; the rendered output below is
# truncated to the first and last rows.
combined_all

Unnamed: 0,Rank,Time,Wind,Name,Country,DOB,Position_in_race,City,Date,Legal,Note,All Conditions Rank,Age at Time of Race,Sex,Event,competition_id
0,1,21.34,+1.3,Florence Griffith-Joyner,USA,1959-12-21,1,Seoul,1988-09-29,Y,,1.0,28,Female,200,457294ca525d32c99efe07c6c4bc06c44b631fbb
1,2,21.41,+0.1,Shericka Jackson,JAM,1994-07-16,1,Budapest,2023-08-25,Y,,2.0,29,Female,200,bfac2418455160c83a85740206a7e756f7908d07
2,3,21.45,+0.6,Shericka Jackson,JAM,1994-07-16,1,Eugene,2022-07-21,Y,,3.0,28,Female,200,dc98c5290c6154aab880c72e64f6d6e076d2da3f
3,4,21.48,+0.2,Shericka Jackson,JAM,1994-07-16,1,Bruxelles,2023-09-08,Y,,4.0,29,Female,200,c8214a1fc1bcd9d9ac9b1c9f3ea49b4a8ff31ba3
4,5,21.53,+0.8,Elaine Thompson-Herah,JAM,1992-06-28,1,Tokyo,2021-08-03,Y,,5.0,29,Female,200,35d526797ba44571a435009202a8f8788b4f99c9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85491,557,32.99,,Manteo Mitchell,USA,1987-07-06,6,Naimette-Xhovémont,2015-07-15,Y,,555.0,28,Male,300,f6df05cb8c015e96586e09a92a6b49e80827bd7d
85492,557,32.99,,Jan Jirka,CZE,1993-10-05,1r1,Praha,2017-05-06,Y,,555.0,23,Male,300,694030d64dd74f0edab2ac42be5296a7afaf1603
85493,557,32.99,,Keenan Blake,NED,2003-01-01,1rB,Lisse,2023-05-13,Y,,555.0,20,Male,300,68879804699b4c55eee3d73c1f1b0ccd986e4941
85494,557,32.99,,Samuel García,ESP,1991-12-04,2r2,Pliezhausen,2023-05-14,Y,,555.0,31,Male,300,bf17ba857d6256de123e75552f1a098ee552b5e9
