In [1]:
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as soup
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager


In [2]:
# Set up Splinter
# ChromeDriverManager().install() supplies the chromedriver location that is
# handed to splinter as its `executable_path` keyword.
executable_path = dict(executable_path=ChromeDriverManager().install())
# headless=True: run Chrome without opening a visible window.
browser = Browser('chrome', headless=True, **executable_path)



Current google-chrome version is 91.0.4472
Get LATEST driver version for 91.0.4472
Driver [C:\Users\huang\.wdm\drivers\chromedriver\win32\91.0.4472.101\chromedriver.exe] found in cache


In [3]:
# Go to the website to scrape: the olympedia index of Olympic editions.
url = "https://www.olympedia.org/editions"
browser.visit(url)

# Parse the HTML currently loaded in the browser so the raw tags (e.g. the
# flag <img> elements) can be inspected with BeautifulSoup.
html = browser.html
html_soup = soup(html, 'html.parser')

In [4]:
# Read the HTML with pandas (bs4 and a parser do the work on the backend).
# Note: despite its name, `test_df` is a list of DataFrames, not a DataFrame.
test_df = pd.read_html(url) # returns a list of all tables on the page


In [5]:
# Display every table that pandas found on the editions page.
test_df

[         #  Year               City  Country        Opened        Closed  \
 0        I  1896             Athina      NaN       6 April      15 April   
 1       II  1900              Paris      NaN           NaN           NaN   
 2      III  1904          St. Louis      NaN        14 May           NaN   
 3       IV  1908             London      NaN       13 July           NaN   
 4        V  1912          Stockholm      NaN        6 July       27 July   
 5       VI  1916             Berlin      NaN           NaN           NaN   
 6      VII  1920          Antwerpen      NaN     14 August     30 August   
 7     VIII  1924              Paris      NaN        5 July       27 July   
 8       IX  1928          Amsterdam      NaN       28 July     12 August   
 9        X  1932        Los Angeles      NaN       30 July     14 August   
 10      XI  1936             Berlin      NaN      1 August     16 August   
 11     XII  1940           Helsinki      NaN           NaN           NaN   

In [6]:
# Report how many tables pd.read_html returned for the page.
print(f'Total tables: {len(test_df)}')

Total tables: 8


In [7]:
# NOTE: the same call could also be used to pull the Winter Games table.
# Keep only the tables whose text matches "Tokyo"; the City column is passed
# through the `str` converter.
table_tokyo = pd.read_html(url, match="Tokyo", converters={"City":str})
# len(table_tokyo) # returns 1 so that is the table that we want

In [8]:
# Take the single matching table as the working DataFrame and preview it.
sum_olympic_org = table_tokyo[0]
sum_olympic_org.head()

# NOTE: Country comes in as NaN because the cell holds a flag image, not text.
# The country codes are scraped from the img tags further down.

Unnamed: 0,#,Year,City,Country,Opened,Closed,Competition,Unnamed: 7
0,I,1896,Athina,,6 April,15 April,6 – 13 April,
1,II,1900,Paris,,,,14 May – 28 October,
2,III,1904,St. Louis,,14 May,,1 July – 23 November,
3,IV,1908,London,,13 July,,27 April – 31 October,
4,V,1912,Stockholm,,6 July,27 July,5 May – 27 July,


In [9]:
# Goal: recover the country codes that pandas dropped from the Country column.
# They live in the flag <img> tags of the first table on the page.
# Set the table reference: first <table> in the parsed document.
table_to_scrape = html_soup.find_all('table')[0]

# Earlier exploration, kept for reference: print every flag src on the page.
# for td in html_soup.find_all('td'):
#     if td.img:
#         print(td.img['src'])

# Collect every <tr> in that table (the header row of <th> cells is included).
table_body = table_to_scrape.find_all('tr')


In [10]:
# Copy of the full row list; the [0:] slice keeps every row, header included.
# (This does not compute a count; use len(body_rows) for the number of rows.)
body_rows = table_body[0:]
# len(body_rows)

In [11]:
# Inspect the raw <tr> tags to see where the flag images sit.
body_rows

[<tr>
 <th>#</th>
 <th>Year</th>
 <th>City</th>
 <th>Country</th>
 <th>Opened</th>
 <th>Closed</th>
 <th>Competition</th>
 <th></th>
 </tr>,
 <tr>
 <td><a href="/editions/1">I</a></td>
 <td><a href="/editions/1">1896</a></td>
 <td><a href="/editions/1">Athina</a></td>
 <td><img src="/images/flags/GRE.png" style="padding-right: 2px; vertical-align: middle"/></td>
 <td> 6 April</td>
 <td>15 April</td>
 <td>6 – 13 April</td>
 <td>
 </td>
 </tr>,
 <tr>
 <td><a href="/editions/2">II</a></td>
 <td><a href="/editions/2">1900</a></td>
 <td><a href="/editions/2">Paris</a></td>
 <td><img src="/images/flags/FRA.png" style="padding-right: 2px; vertical-align: middle"/></td>
 <td></td>
 <td></td>
 <td>14 May – 28 October</td>
 <td>
 </td>
 </tr>,
 <tr>
 <td><a href="/editions/3">III</a></td>
 <td><a href="/editions/3">1904</a></td>
 <td><a href="/editions/3">St. Louis</a></td>
 <td><img src="/images/flags/USA.png" style="padding-right: 2px; vertical-align: middle"/></td>
 <td>14 May</td>
 <td></td>

In [12]:
# Collect the flag image paths, one per table cell that contains an <img>.
# body_rows holds the same <tr> tags as table_body, so it is walked directly.
# The header row has only <th> cells and therefore contributes nothing.
flag_images = [
    cell.img['src']
    for table_row in body_rows
    for cell in table_row.find_all('td')
    if cell.img
]

In [13]:
# Use a regex to get the country code from each img src.
# (`re` is imported with the other modules at the top of the notebook.)
# Pattern: three word characters immediately before a '.', e.g. the "GRE" in
# "/images/flags/GRE.png". Compiled once instead of on every iteration.
code_pattern = re.compile(r"(\w{3})\.")

# findall returns a list of matches per src, so this is a list of lists;
# the following cell extracts the first match of each into a flat list.
country_codes = [code_pattern.findall(src) for src in flag_images]

In [14]:
# The regex step returned a list of lists, so extract the first match of each
# into a single flat list.
# Guarded so re-running this cell is harmless: entries that are already plain
# strings are kept as-is instead of being cut down to their first character.
country_codes = [x if isinstance(x, str) else x[0] for x in country_codes]

In [15]:
# Keep just the three columns needed downstream; .copy() detaches the result
# from the scraped frame so later edits do not touch sum_olympic_org.
wanted_columns = ['Year', 'City', 'Country']
sum_olympic_cln = sum_olympic_org.loc[:, wanted_columns].copy()
sum_olympic_cln.head()

Unnamed: 0,Year,City,Country
0,1896,Athina,
1,1900,Paris,
2,1904,St. Louis,
3,1908,London,
4,1912,Stockholm,


In [16]:
# Replace the NaNs in the Country column with the scraped country codes.
# The list is assigned positionally, so it must have one code per table row,
# in the same order as the rows.
sum_olympic_cln['Country'] = country_codes
sum_olympic_cln.head()

Unnamed: 0,Year,City,Country
0,1896,Athina,GRE
1,1900,Paris,FRA
2,1904,St. Louis,USA
3,1908,London,GBR
4,1912,Stockholm,SWE


In [17]:
# Check the DataFrame: row count, dtypes and non-null counts per column.
sum_olympic_cln.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34 entries, 0 to 33
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Year     34 non-null     int64 
 1   City     34 non-null     object
 2   Country  34 non-null     object
dtypes: int64(1), object(2)
memory usage: 944.0+ bytes


In [18]:
# Show the scraped table in full for reference.
sum_olympic_org

Unnamed: 0,#,Year,City,Country,Opened,Closed,Competition,Unnamed: 7
0,I,1896,Athina,,6 April,15 April,6 – 13 April,
1,II,1900,Paris,,,,14 May – 28 October,
2,III,1904,St. Louis,,14 May,,1 July – 23 November,
3,IV,1908,London,,13 July,,27 April – 31 October,
4,V,1912,Stockholm,,6 July,27 July,5 May – 27 July,
5,VI,1916,Berlin,,,,–,Not held due to war
6,VII,1920,Antwerpen,,14 August,30 August,23 April – 12 September,
7,VIII,1924,Paris,,5 July,27 July,4 May – 27 July,
8,IX,1928,Amsterdam,,28 July,12 August,17 May – 12 August,
9,X,1932,Los Angeles,,30 July,14 August,30 July – 14 August,


In [19]:
# Restrict the analysis to the Games held from 1964 through 2016.
games_year = sum_olympic_cln['Year']
outside_window = (games_year < 1964) | (games_year > 2016)
# Select the rows that are NOT outside the window; copy so the source frame
# stays untouched.
analysis_years = sum_olympic_cln.loc[~outside_window].copy()

In [20]:
# Label the location columns as host information.
analysis_years = analysis_years.rename(
    columns={"City": "Host_City", "Country": "Host_Country"}
)
# analysis_years.head()

In [21]:
# send years with host cities and countries to csv
# analysis_years.to_csv("host_cities.csv")

In [22]:
# merge the dataframes - doing this afterwards because I didn't want to re-run the scrape.

# load host cities as DF
# host_cities_df = pd.read_csv("host_cities.csv", index_col=0)

In [23]:
# load medals scrape csv as DF
# all_country_medals_df = pd.read_csv("complete_year_datascrape_07182021.csv", index_col=0)

In [24]:
# merge the dataframes on year
# combined_data = all_country_medals_df.merge(right=host_cities_df, how="left", on="Year")
# combined_data.head()

In [25]:
# send merged dataframe to .csv
# combined_data.to_csv("medal_data_by_year.csv")

# Moving to individual year medals pages
1 - navigate to correct page

2 - locate medal table

3 - load medal table as DF with index as the year

4 - merge with the summer olympic table - 

In [26]:
# Years to scrape, as strings: the scraping loop clicks links by their text,
# and the Year column holds integers.
years_list = list(map(str, analysis_years['Year'].tolist()))

In [27]:
# Confirm the list of year strings that will drive the scraping loop.
years_list

['1964',
 '1968',
 '1972',
 '1976',
 '1980',
 '1984',
 '1988',
 '1992',
 '1996',
 '2000',
 '2004',
 '2008',
 '2012',
 '2016']

In [35]:
# Scrape the per-discipline results tables for each selected Games.
#
# Flow: editions index -> click the year -> read the disciplines table ->
# click each discipline -> read its results table -> go back.
# Results are collected in a list and concatenated once at the end;
# DataFrame.append was removed in pandas 2.0 and re-copies the frame each time.

# Start from the editions index so each year can be clicked by its link text.
url = "https://www.olympedia.org/editions"
browser.visit(url)

# Year subsets used while developing the loop.
test_years = ['1964', '1968']  # two-year run that completed successfully
fix_years = ['2016']           # year currently being re-scraped

# Disciplines that are not clicked (logged as "CTR Bug" in the original notes).
# The first set was noted for the 2000 page but is skipped for every year;
# the others are skipped only for the listed Games.
skip_every_year = {'Cycling Track', 'Sailing'}
skip_by_year = {
    '2008': {'Diving', 'Shooting'},
    '2012': {'Rowing'},
    '2016': {'Diving', 'Rugby Sevens'},
}

# df dictionary idea, kept for reference:
# https://stackoverflow.com/questions/30233982/merge-dataframes-in-a-dictionary

results_frames = []                        # one results DataFrame per discipline page
df_MasterCompetitionData = pd.DataFrame()  # stays empty if nothing is scraped

try:
    ##### IN YEAR LOOP #####
    for year in fix_years:  # change back to years_list when testing complete

        # Step 1: open the page of this edition.
        browser.find_by_text(year).click()
        print("new olympics, going to...   " + year)

        # Step 2: read all tables of the edition page (fetched once).
        page_url = browser.url
        print(page_url)
        table_disciplines = pd.read_html(page_url)

        # The disciplines table is the 4th table on the page, except for 1984
        # where it is the 3rd. Pick the index first so the 1984 page is never
        # indexed at a position it may not have.
        if year == '1984':
            print("----------its 1984-------------")
            df_table_disciplines = table_disciplines[2]
        else:
            df_table_disciplines = table_disciplines[3]

        # The disciplines are laid out in three columns (labels 0, 1, 2);
        # stack them into one flat list. Short columns are padded with NaN,
        # which dropna() removes below.
        main_disciplines = pd.concat(
            [df_table_disciplines[0], df_table_disciplines[1], df_table_disciplines[2]]
        ).tolist()
        df_disciplines_all = pd.DataFrame(main_disciplines).dropna()

        ##### IN COMPETITION LOOP #####
        for competition in df_disciplines_all[0]:
            if competition in skip_every_year or competition in skip_by_year.get(year, ()):
                print("CTR Bug, avoiding")
                continue

            # Open the discipline page.
            print("going to...   " + str(competition))
            browser.find_by_text(competition).click()

            # Read the page tables once; the results table is the 3rd one.
            page_url = browser.url
            page_dfs = pd.read_html(page_url)
            df_results = page_dfs[2]

            # TODO: clean / resolve ties.

            # Tag each row with where it came from, then collect the frame.
            df_results['Year'] = year
            df_results['Discipline'] = competition
            results_frames.append(df_results)
            print("     completed " + year + " " + str(competition))

            # Return to the edition page for the next discipline.
            browser.back()
        ##### OUT COMPETITION LOOP #####

        # Return to the editions index for the next year.
        print("-----completed " + year + " -----")
        browser.back()
    ##### OUT YEAR LOOP #####
finally:
    # Runs on success and on failure: keep whatever was scraped so far and
    # always release the Chrome session.
    if results_frames:
        df_MasterCompetitionData = pd.concat(results_frames)
    browser.quit()

new olympics, going to...   2016
https://www.olympedia.org/editions/59
going to...   Archery
     completed 2016 Archery
going to...   Artistic Gymnastics
     completed 2016 Artistic Gymnastics
going to...   Artistic Swimming
     completed 2016 Artistic Swimming
going to...   Athletics
     completed 2016 Athletics
going to...   Badminton
     completed 2016 Badminton
going to...   Basketball
     completed 2016 Basketball
going to...   Beach Volleyball
     completed 2016 Beach Volleyball
going to...   Boxing
     completed 2016 Boxing
going to...   Canoe Slalom
     completed 2016 Canoe Slalom
going to...   Canoe Sprint
     completed 2016 Canoe Sprint
going to...   Cycling BMX Racing
     completed 2016 Cycling BMX Racing
going to...   Cycling Mountain Bike
     completed 2016 Cycling Mountain Bike
going to...   Cycling Road
     completed 2016 Cycling Road
CTR Bug, avoiding
CTR Bug, avoiding
going to...   Equestrian Dressage
     completed 2016 Equestrian Dressage
going to...   E

In [None]:
# debug
# Looks up the label held in `competition` within column 0 of the disciplines
# frame. NOTE(review): `competition` is a discipline name while the frame is
# built with default integer labels, so this lookup presumably fails; confirm.
df_disciplines_all[0][competition]

In [32]:
# Inspect the combined results collected by the scraping loop.
df_MasterCompetitionData

Unnamed: 0,Event,Gold,Gold.1,Silver,Silver.1,Bronze,Bronze.1,Year,Discipline,Status,Date,Participants,NOCs
0,"Individual, Men",Simon Fairweather,AUS,Vic Wunderle,USA,Wietse van Alten,NED,2000,Archery,,,,
1,"Team, Men",Republic of Korea,KOR,Italy,ITA,United States,USA,2000,Archery,,,,
2,"Individual, Women",Yun Mi-Jin,KOR,Kim Nam-Sun,KOR,Kim Su-Nyeong,KOR,2000,Archery,,,,
3,"Team, Women",Republic of Korea,KOR,Ukraine,UKR,Germany,GER,2000,Archery,,,,
0,"Individual All-Around, Men",Aleksey Nemov,RUS,Yang Wei,CHN,Oleksandr Beresh,UKR,2000,Artistic Gymnastics,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,"Cross-Country, Women",Jenny Rissveds,SWE,Maja Włoszczowska,POL,Catharine Pendrel,CAN,2016,Cycling Mountain Bike,,,,
0,"Road Race, Individual, Men",Greg Van Avermaet,BEL,Jakob Fuglsang,DEN,Rafał Majka,POL,2016,Cycling Road,,,,
1,"Individual Time Trial, Men",Fabian Cancellara,SUI,Tom Dumoulin,NED,Chris Froome,GBR,2016,Cycling Road,,,,
2,"Road Race, Individual, Women",Anna van der Breggen,NED,Emma Johansson,SWE,Elisa Longo Borghini,ITA,2016,Cycling Road,,,,


In [None]:
# List the discipline names from the last edition page that was processed.
# Note: this reuses (and overwrites) the loop variable `competition`.
for competition in df_disciplines_all[0]:
    print(competition)

In [None]:
# Inspect the combined results again.
df_MasterCompetitionData

In [36]:
# Save the scraped results to CSV; index=False leaves the row labels out.
df_MasterCompetitionData.to_csv(r'2016_fragmented.csv', index = False)

In [None]:
# Find Location, Long, Lat
# Exploratory: open the detail page of a single event from the last results
# table. Needs a browser session that is still open and showing that page.

#for event in df_results['Event']:
# Row labelled 1 of the Event column of the most recently scraped table.
event = df_results['Event'][1]
# Navigate to Event Details
print("going to...   " + event)
browser.find_by_text(event).click() # this works - loop step 1
# get page url
page_url = browser.url
# reset soup
# parse HTML
html = browser.html
html_soup = soup(html, 'html.parser')
    

In [None]:
# First table on the event page, read from the URL captured in the cell above.
df_event_details = pd.read_html(page_url)[0]
df_event_details

In [None]:
# Scrape Location Details
# Exploratory: follow the venue link from the event details table, parse the
# venue page, then step back to the event page.

# Column labelled 1, row labelled 2 of the details table.
# NOTE(review): presumably the location row of a label/value table; confirm.
location = df_event_details[1][2]
# Navigate to Location Details
print("going to...   " + location)
browser.find_by_text(location).click() # this works - loop step 1
# get page url
page_url = browser.url
# reset soup
# parse HTML
html = browser.html
html_soup = soup(html, 'html.parser')
browser.back()

In [None]:
# Merge Data into MasterCompetitionData
# Warning: this replaces the master frame with an empty one, discarding any
# results already collected in it.
df_MasterCompetitionData = pd.DataFrame() #first run

In [None]:
# Preview the master frame with the latest results table added.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# The result is only displayed, not stored: df_MasterCompetitionData itself
# is left unchanged by this cell.
pd.concat([df_MasterCompetitionData, df_results])

In [None]:
# Inspect the master frame.
df_MasterCompetitionData

# Below is working on potentially parsing each table into a list and then making the dataframe from the list of lists. 

### NOTE: 

It looks like the loop above is going to work and I doubt we are going to get graded on the time the code takes to process but left the link and maybe this challenge in here if we want to do it the 'most effective' way.

Maybe this would be an issue if we had more data but it didn't seem like an issue for the medals by year. 

link with good info on why not to append dataframes but not related to scraping 
- https://stackoverflow.com/questions/13784192/creating-an-empty-pandas-dataframe-then-filling-it 

## Ideas: Maybe pull tables with pd.read_html still and then send each column to a list with tolist(). Append those lists to a master list or dictionary, and then make a combined df with the data?

In [None]:
# Started some testing on scraping tables into plain lists.
# Needs a live browser session (the scraping loop above quits the browser).

# XPath of the first header cell of the target table, for reference:
# /html/body/div[2]/table[5]/thead/tr/th[1]
url = "https://www.olympedia.org/editions/16"
# Load the page first; otherwise the HTML parsed below would be whatever page
# the browser last had open.
browser.visit(url)

html = browser.html
html_soup = soup(html, 'html.parser')

# Set the table reference.
# NOTE(review): index 5 is the 6th <table> in the document, while table[5] in
# the XPath counts from 1 within its parent; confirm this picks the right table.
year_results_table = html_soup.find_all('table')[5]

# Get the table headers: the text of every <th> cell in that table.
table_headers = [
    header_cell.get_text(strip=True)
    for header_cell in year_results_table.find_all('th')
]

# TODO (not implemented yet):
# set the headers as dictionary keys
# append column values to the key:values
# go to the next page and repeat
# turn the dictionary into a dataframe
