In [1]:
import urllib2
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import datetime as dt

In [2]:
#pages to scrape, keyed by (gender, event)
#every page name is <gender prefix><event stem>ok.htm under the same site root
BASE_URL = 'http://www.alltime-athletics.com/'
GENDER_PREFIXES = [('Men', 'm'), ('Women', 'w')]
EVENT_STEMS = [('100 m', '_100'),
               ('200 m', '_200'),
               ('400 m', '_400'),
               ('800 m', '_800'),
               ('1500 m', '_1500'),
               ('5000 m', '_5000'),
               ('10,000 m', '_10k'),
               ('Half marathon', 'hmara'),
               ('Marathon', 'mara')]

#all men's events first, then all women's, each in the order listed above
urls = {(gender, event): BASE_URL + prefix + stem + 'ok.htm'
        for gender, prefix in GENDER_PREFIXES
        for event, stem in EVENT_STEMS}

In [3]:
#function to format all the times the same way
def full_time(time):
    """Pad a performance time string out to the form ``H:M:S.f``.

    Parameters
    ----------
    time : str
        A time containing zero, one or two ``':'`` separators, e.g.
        ``'9.58'``, ``'3:26.00'``, ``'61:00'`` or ``'2:01:39'``.

    Returns
    -------
    str
        The same time with an hours field and a minutes field present and
        with a fractional-seconds part (``'.0'`` is appended when the input
        has no ``'.'``).  A ``minutes:seconds`` input whose minute count
        exceeds 59 is carried over into hours, e.g. ``'61:00'`` becomes
        ``'1:1:00.0'`` and ``'125:30'`` becomes ``'2:5:30.0'``.  Inputs that
        already have two colons are only given the fractional part.

    Raises
    ------
    ValueError
        If a ``minutes:seconds`` input has a non-numeric minutes field.
    """
    result = time
    colons = time.count(':')
    #add zeros for hours and minutes
    if colons == 0:
        result = '0:0:' + time
    elif colons == 1:
        minutes_text, seconds_text = time.split(':')
        total_minutes = int(minutes_text)
        if total_minutes > 59:
            #carry whole hours out of the minute count (61:00 -> 1:1:00);
            #divmod keeps this correct for 120 minutes and beyond
            hours, minutes = divmod(total_minutes, 60)
            result = '%d:%d:%s' % (hours, minutes, seconds_text)
        else:
            result = '0:' + time
    #add zeros for tenths of second
    if result.count('.') == 0:
        result = result + '.0'
    return result

In [4]:
def _expand_birth_date(text, pivot=40):
    """Expand a birth date with a two-digit year to ``dd.mm.yyyy``.

    A value without any ``'.'`` is treated as a bare year and is given the
    day and month ``01.01.``.  The last two characters are read as the year:
    values below ``pivot`` are placed in the 2000s, all others in the 1900s.
    """
    if text.count('.') == 0:
        text = '01.01.' + text
    head, year = text[:-2], text[-2:]
    century = '20' if int(year) < pivot else '19'
    return head + century + year


def _leading_digits(text):
    """Return the run of digits at the start of ``text``, or NaN if none."""
    match = re.match(r'\d+', text)
    return match.group(0) if match else np.nan


def make_df(key):
    """Scrape one results page and return it as a cleaned DataFrame.

    Parameters
    ----------
    key : tuple
        A ``(gender, event)`` key of the module-level ``urls`` dict.

    Returns
    -------
    pandas.DataFrame
        One row per kept performance with the columns ``Rank``, ``Time``,
        ``Name``, ``Country``, ``Date of Birth``, ``Place``, ``City``,
        ``Date``, ``Gender`` and ``Event``.  ``Time`` holds
        ``datetime.time`` values, ``Date`` and ``Date of Birth`` are parsed
        day-first into datetimes, ``Place`` is the leading digit string of
        the scraped value (NaN when there is none), and ``Gender`` /
        ``Event`` are copied from ``key``.

    Notes
    -----
    Only rows that have the maximum number of columns found on the page and
    whose first column is an integer no greater than 1000 are kept.
    """
    url = urls[key]

    #scrape the webpage; always release the connection, even if parsing fails
    page = urllib2.urlopen(url)
    try:
        soup = BeautifulSoup(page,'lxml')
    finally:
        page.close()
    data = str(soup.pre)

    #columns are separated by runs of 2+ whitespace characters; the piece
    #before the first run is dropped. splitlines() copes with both \r\n and
    #\n line endings.
    lists = [re.split(r'\s\s+',line)[1:] for line in data.splitlines()]
    #compute number of columns
    max_len = len(max(lists, key=len))

    #ignore irrelevant rows
    lists_clean = [line for line in lists if len(line)==max_len]

    #remove the wind information (third column) if it exists
    if max_len == 9:
        lists_clean = [line[:2]+line[3:] for line in lists_clean]
    #select the top 1000 performances
    lists_top = [line for line in lists_clean if int(line[0])<= 1000]

    df = pd.DataFrame(data = lists_top,
                  columns=['Rank','Time','Name','Country','Date of Birth','Place','City','Date'])

    #clean time: keep the leading digits/dots/colons, pad to H:M:S.f, parse
    df['Time'] = df['Time'].apply(lambda x: full_time(re.match(r'[\d.:]+',x).group(0)))
    df['Time'] = pd.to_datetime(df['Time'], format='%H:%M:%S.%f').dt.time

    #clean dates: values are day.month.year, so they must be parsed day-first
    df['Date'] = df['Date'].apply(lambda x: re.match(r'[\d.:]+',x).group(0))
    df['Date'] = pd.to_datetime(df['Date'], dayfirst=True, infer_datetime_format=True)

    df['Place'] = df['Place'].apply(_leading_digits)

    #two-digit birth years: below 40 means 20xx, otherwise 19xx
    df['Date of Birth'] = df['Date of Birth'].apply(_expand_birth_date)
    df['Date of Birth'] = pd.to_datetime(df['Date of Birth'], dayfirst=True, infer_datetime_format=True)

    df['Gender'] = key[0]
    df['Event'] = key[1]
    return df

In [5]:
#scrape every configured page into its own frame
event_frames = [make_df(event_key) for event_key in urls]

#stack the per-event frames and write them out as a single csv file
full_df = pd.concat(event_frames)
full_df.to_csv(r'Output/data.csv',index=False)