In [140]:
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import numpy as np


In [141]:
# Set up Splinter
# ChromeDriverManager().install() downloads a matching ChromeDriver (or reuses
# the cached one) and returns its path, which Splinter needs to start Chrome.
executable_path = {'executable_path': ChromeDriverManager().install()}
# headless=True: run Chrome without opening a visible window
browser = Browser('chrome', **executable_path, headless=True)



Current google-chrome version is 91.0.4472
Get LATEST driver version for 91.0.4472
Driver [C:\Users\huang\.wdm\drivers\chromedriver\win32\91.0.4472.101\chromedriver.exe] found in cache


In [142]:
# go to website to scrape
url = "https://www.olympedia.org/editions"
browser.visit(url)

# parse HTML
# html is a snapshot of the page the browser is on right now; html_soup is
# what later cells search for <table>/<img> tags
html = browser.html
html_soup = soup(html, 'html.parser')

In [143]:
# read html with pandas (bs4 and parser working on backend)
# NOTE: despite its name, test_df is a list of DataFrames, one per table found
test_df = pd.read_html(url) # returns list of all tables on the page

In [144]:
# show every parsed table (long output)
test_df

[         #  Year               City  Country        Opened        Closed  \
 0        I  1896             Athina      NaN       6 April      15 April   
 1       II  1900              Paris      NaN           NaN           NaN   
 2      III  1904          St. Louis      NaN        14 May           NaN   
 3       IV  1908             London      NaN       13 July           NaN   
 4        V  1912          Stockholm      NaN        6 July       27 July   
 5       VI  1916             Berlin      NaN           NaN           NaN   
 6      VII  1920          Antwerpen      NaN     14 August     30 August   
 7     VIII  1924              Paris      NaN        5 July       27 July   
 8       IX  1928          Amsterdam      NaN       28 July     12 August   
 9        X  1932        Los Angeles      NaN       30 July     14 August   
 10      XI  1936             Berlin      NaN      1 August     16 August   
 11     XII  1940           Helsinki      NaN           NaN           NaN   

In [145]:
# how many tables pandas found on the editions page
print(f'Total tables: {len(test_df)}')

Total tables: 8


In [146]:
# NOTE: code below could also be used to pull Winter from the winter table
# look for tables with Tokyo: match= keeps only the tables whose text contains that string
# converters={"City": str} forces the City column to be parsed as text
table_tokyo = pd.read_html(url, match="Tokyo", converters={"City":str})
# len(table_tokyo) # returns 1 so that is the table that we want

In [147]:
# read table as DF
# table_tokyo is a list; its first entry is used as the games table
sum_olympic_org = table_tokyo[0]
sum_olympic_org.head()

# NOTE: country coming in as NaN because it is a picture of a flag. Scrape country codes from the img tags.

Unnamed: 0,#,Year,City,Country,Opened,Closed,Competition,Unnamed: 7
0,I,1896,Athina,,6 April,15 April,6 â 13 April,
1,II,1900,Paris,,,,14 May â 28 October,
2,III,1904,St. Louis,,14 May,,1 July â 23 November,
3,IV,1908,London,,13 July,,27 April â 31 October,
4,V,1912,Stockholm,,6 July,27 July,5 May â 27 July,


In [148]:
# trying to get the country codes
# get the image tag from the correct column in the first table
# set table reference
# NOTE(review): assumes the first <table> in the parsed page is the same table
# that pd.read_html(match="Tokyo") returned - confirm if the page layout changes
table_to_scrape = html_soup.find_all('table')[0]

# for td in html_soup.find_all('td'):
#     if td.img:
#         print(td.img['src'])

# get table rows
# every <tr> of that table, header row included
table_body = table_to_scrape.find_all('tr')


In [149]:
# get number of rows in the table
# table_body[0:] is a full copy of the row list, so the header <tr> is still in it
body_rows = table_body[0:]
# len(body_rows)

In [150]:
# inspect the raw <tr> elements (long output)
body_rows

[<tr>
 <th>#</th>
 <th>Year</th>
 <th>City</th>
 <th>Country</th>
 <th>Opened</th>
 <th>Closed</th>
 <th>Competition</th>
 <th></th>
 </tr>,
 <tr>
 <td><a href="/editions/1">I</a></td>
 <td><a href="/editions/1">1896</a></td>
 <td><a href="/editions/1">Athina</a></td>
 <td><img src="/images/flags/GRE.png" style="padding-right: 2px; vertical-align: middle"/></td>
 <td> 6 April</td>
 <td>15 April</td>
 <td>6 – 13 April</td>
 <td>
 </td>
 </tr>,
 <tr>
 <td><a href="/editions/2">II</a></td>
 <td><a href="/editions/2">1900</a></td>
 <td><a href="/editions/2">Paris</a></td>
 <td><img src="/images/flags/FRA.png" style="padding-right: 2px; vertical-align: middle"/></td>
 <td></td>
 <td></td>
 <td>14 May – 28 October</td>
 <td>
 </td>
 </tr>,
 <tr>
 <td><a href="/editions/3">III</a></td>
 <td><a href="/editions/3">1904</a></td>
 <td><a href="/editions/3">St. Louis</a></td>
 <td><img src="/images/flags/USA.png" style="padding-right: 2px; vertical-align: middle"/></td>
 <td>14 May</td>
 <td></td>

In [151]:
# Collect the src attribute of every flag image in the table.
# Walk the same rows the original index loop visited (one per entry in
# body_rows), look at each <td> cell, and keep the src of any cell that
# contains an <img> tag.
flag_images = [
    cell.img['src']
    for table_row in table_body[:len(body_rows)]
    for cell in table_row.find_all('td')
    if cell.img
]

In [152]:
# Pull the country code out of each flag image path with a regex:
# three word characters that sit directly in front of a '.'.
import re

# re.findall returns a list of matches per src, so this is a list of lists
# (flattening is left to a later step).
country_codes = [re.findall(r"(\w{3})\.", src) for src in flag_images]

In [153]:
# regex loop returned a list of lists so extract into single list
# The isinstance guard keeps this cell idempotent: it overwrites its own
# input, so without the guard a second run would index into the already
# flattened strings and cut every code down to its first character.
country_codes = [x[0] if isinstance(x, list) else x for x in country_codes]

In [154]:
# Keep only the columns needed downstream; .copy() gives an independent
# frame so later edits do not touch sum_olympic_org.
sum_olympic_cln = sum_olympic_org.loc[:, ['Year', 'City', 'Country']].copy()
sum_olympic_cln.head()

Unnamed: 0,Year,City,Country
0,1896,Athina,
1,1900,Paris,
2,1904,St. Louis,
3,1908,London,
4,1912,Stockholm,


In [155]:
# replace the NaNs in the Country column with country_codes list
# the list is assigned by position, so it must hold exactly one code per row, in row order
sum_olympic_cln['Country'] = country_codes
sum_olympic_cln.head()

Unnamed: 0,Year,City,Country
0,1896,Athina,GRE
1,1900,Paris,FRA
2,1904,St. Louis,USA
3,1908,London,GBR
4,1912,Stockholm,SWE


In [156]:
# check DF
# confirms row count, non-null counts and the dtype of each column
sum_olympic_cln.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34 entries, 0 to 33
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Year     34 non-null     int64 
 1   City     34 non-null     object
 2   Country  34 non-null     object
dtypes: int64(1), object(2)
memory usage: 944.0+ bytes


In [157]:
# show the unmodified scraped table for comparison
sum_olympic_org

Unnamed: 0,#,Year,City,Country,Opened,Closed,Competition,Unnamed: 7
0,I,1896,Athina,,6 April,15 April,6 â 13 April,
1,II,1900,Paris,,,,14 May â 28 October,
2,III,1904,St. Louis,,14 May,,1 July â 23 November,
3,IV,1908,London,,13 July,,27 April â 31 October,
4,V,1912,Stockholm,,6 July,27 July,5 May â 27 July,
5,VI,1916,Berlin,,,,â,Not held due to war
6,VII,1920,Antwerpen,,14 August,30 August,23 April â 12 September,
7,VIII,1924,Paris,,5 July,27 July,4 May â 27 July,
8,IX,1928,Amsterdam,,28 July,12 August,17 May â 12 August,
9,X,1932,Los Angeles,,30 July,14 August,30 July â 14 August,


In [158]:
# Restrict the games to the analysis window: drop everything before 1964
# and everything after 2016, working on a copy of the cleaned table.
year_col = sum_olympic_cln['Year']
in_window = ~((year_col < 1964) | (year_col > 2016))
analysis_years = sum_olympic_cln.loc[in_window].copy()

In [159]:
# Label the city/country columns as host information.
analysis_years = analysis_years.rename(
    columns={"City": "Host_City", "Country": "Host_Country"}
)
# analysis_years.head()

In [160]:
# send years with host cities and countries to csv
# analysis_years.to_csv("host_cities.csv")

In [161]:
# merge the dataframes - doing this after because I didn't want to re-run the scrape.

# load host cities as DF
# host_cities_df = pd.read_csv("host_cities.csv", index_col=0)

In [162]:
# load medals scrape csv as DF
# all_country_medals_df = pd.read_csv("complete_year_datascrape_07182021.csv", index_col=0)

In [163]:
# merge the dataframes on year
# combined_data = all_country_medals_df.merge(right=host_cities_df, how="left", on="Year")
# combined_data.head()

In [None]:
# send merged dataframe to .csv
# combined_data.to_csv("medal_data_by_year.csv")

# Moving to individual year medals pages
1 - navigate to correct page

2 - locate medal table

3 - load medal table as DF with index as the year

4 - merge with the summer Olympics table

In [None]:
# Years of the games in the analysis window, as strings: the scraping loop
# passes them to the browser as link text, so the ints are converted here.
years_list = [str(year_value) for year_value in analysis_years['Year'].tolist()]

In [None]:
# check the year strings that will drive the scraping loop
years_list

In [None]:
# Scrape the medal table for each year into a dict of DataFrames.
# Precondition: the browser is currently on the editions list page, because
# each iteration clicks the year's link there and then navigates back.

# from xpath we know the table row to start is 18
# browser.find_by_xpath('/html/body/div[2]/table[1]/tbody/tr[18]/td[2]/a'). click() # this also works but is less understandable

# NOTE: according to this link
# https://stackoverflow.com/questions/13784192/creating-an-empty-pandas-dataframe-then-filling-it
# it is not a good idea to append dataframes in a loop. Each year's table is
# therefore kept in a dictionary and can be combined once at the end
# (see https://stackoverflow.com/questions/30233982/merge-dataframes-in-a-dictionary).

# test list = successful for 2 years
test_years = ['1964', '1968']

# year string -> medal table DataFrame for that year
all_years = {}

# start for loop to get all of the year medal tables
for year in test_years: # change back to years_list when testing complete

    # loop step 1 - follow the link whose text is the year
    browser.find_by_text(year).click()

    # loop step 2 - get page url
    page_url = browser.url

    # reset soup for the page we are on now
    html = browser.html
    html_soup = soup(html, 'html.parser')

    # loop step 3 - get the medal table (the one containing "NOC") and store it.
    # This assignment has to happen before the two steps below; without it
    # all_years[year] does not exist and the next line raises KeyError.
    all_years[year] = pd.read_html(page_url, match="NOC")[0]

    # loop step 4 - add column 'Year' with the year of the games
    all_years[year]['Year'] = int(year)

    # loop step 5 - rename the columns
    all_years[year].rename(columns={'NOC': 'Country', 'NOC.1': 'Country Code'}, inplace=True)

    # go back to the main page so the next year's link can be clicked
    browser.back()

# NOTE: browser.quit() is deliberately not called here - the cells below keep
# using `browser`. Call browser.quit() once, after the last scraping cell.

In [178]:
# ################# HH Test

##Extract Disciplines List
url = "https://www.olympedia.org/editions/16"
browser.visit(url)

# parse HTML
html = browser.html
html_soup = soup(html, 'html.parser')

# Download and parse the page's tables once; the second name gets its own
# list holding the same parsed tables instead of triggering a second,
# identical request.
table_disciplines = pd.read_html(url)
other_disciplines = list(table_disciplines)

In [179]:
# preview the table at position 3 of the page (the main disciplines grid, per the output)
pd.read_html(url)[3]

Unnamed: 0,0,1,2
0,Artistic Gymnastics,Equestrian Dressage,Rowing
1,Athletics,Equestrian Eventing,Sailing
2,Basketball,Equestrian Jumping,Shooting
3,Boxing,Fencing,Swimming
4,Canoe Sprint,Football,Volleyball
5,Cycling Road,Hockey,Water Polo
6,Cycling Track,Judo,Weightlifting
7,Diving,Modern Pentathlon,Wrestling


In [180]:
# Build the list of main disciplines.
# The table lays the names out in three columns (0, 1, 2); stacking the
# columns one after another turns the grid into a single flat list.
df_table_disciplines = pd.read_html(url)[3]

df_table0, df_table1, df_table2 = (df_table_disciplines[col] for col in (0, 1, 2))

pieces = {"1": df_table0, "2": df_table1, "3": df_table2}
df_table_disciplines = pd.concat(pieces)

main_disciplines = df_table_disciplines.tolist()

In [181]:
# check the flattened list of main disciplines
main_disciplines

['Artistic Gymnastics',
 'Athletics',
 'Basketball',
 'Boxing',
 'Canoe Sprint',
 'Cycling Road',
 'Cycling Track',
 'Diving',
 'Equestrian Dressage',
 'Equestrian Eventing',
 'Equestrian Jumping',
 'Fencing',
 'Football',
 'Hockey',
 'Judo',
 'Modern Pentathlon',
 'Rowing',
 'Sailing',
 'Shooting',
 'Swimming',
 'Volleyball',
 'Water Polo',
 'Weightlifting',
 'Wrestling']

In [8]:
# Build the list of other disciplines from the table at position 4,
# stacking its three columns (0, 1, 2) into one flat list. Empty grid cells
# come through as NaN and are still present in the result.
df_other_disc = pd.read_html(url)[4]

pieces = {str(pos): df_other_disc[col] for pos, col in enumerate((0, 1, 2), start=1)}
df_table0, df_table1, df_table2 = pieces["1"], pieces["2"], pieces["3"]
df_other_disc = pd.concat(pieces)

other_disciplines = df_other_disc.tolist()
other_disciplines

['Baseball', 'Kendo', 'Kyudo', 'Sumo', nan, nan]

In [10]:
# Combine main and other disciplines into one single-column DataFrame.
discipline_combo = main_disciplines + other_disciplines
# dropna() returns a new frame, so its result has to be kept; calling it
# without assignment only cleaned the displayed copy and left the NaN rows
# in df_disciplines_all. reset_index keeps the labels contiguous (0..n-1).
df_disciplines_all = pd.DataFrame(discipline_combo).dropna().reset_index(drop=True)
df_disciplines_all

Unnamed: 0,0
0,Artistic Gymnastics
1,Athletics
2,Basketball
3,Boxing
4,Canoe Sprint
5,Cycling Road
6,Cycling Track
7,Diving
8,Equestrian Dressage
9,Equestrian Eventing


In [None]:
######### HH Single Competition Extract

In [131]:
#Find the right competition
url = "https://www.olympedia.org/editions/16"
browser.visit(url)

# column 0, row label 1 of the disciplines frame
competition = df_disciplines_all[0][1]
print("going to...   " + competition)

# follow the link whose text is the discipline name
browser.find_by_text(competition).click() # this works - loop step 1

# get page url
page_url = browser.url

# reset soup
# parse HTML
html = browser.html
html_soup = soup(html, 'html.parser')



going to...   Athletics


In [132]:
# dump the parsed page to check which page was captured (very long output)
html_soup

<html><head>
<title>Olympedia – 1964 Summer Olympics Overview</title>
<meta content="authenticity_token" name="csrf-param"/>
<meta content="sXmMJUtCyI++ngoOPwK07h+2RLsIOYhyAMe4ObX9QxIcgXEahhk48OBxXF+QQPr7ItKeEMtmS9VzmkIaezMbYg==" name="csrf-token"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="EN" http-equiv="content-language"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="/assets/bootstrap.min-460a43de22fd9534d595e5aea2715cb154560291c9c6401b526e31c86a5ce32d.css" media="all" rel="stylesheet"/>
<link href="/assets/bootstrap-sortable-363d232309d54b549fa85446295ef2b5d290e3f8a49f1a646247340be3705ef9.css" media="all" rel="stylesheet"/>
<link href="/assets/jquery-ui-1.11.4.min-359ba1b9eb679ad05fb4c8fda710ee4c0239354f1ba635200b6065638295d646.css" media="all" rel="stylesheet"/>
<link href="/assets/lightbox-e29689e123fc27505d2b9d919f43ffcb6fade539cb4670f21c35aa07848105e7.css" media="screen" rel="stylesheet"/>
<link data

In [133]:
# all tables on the competition page, fetched again from page_url
page_dfs = pd.read_html(page_url)
page_dfs

[              0                       1
 0         Dates  14 â 21 October 1964
 1  Medal Events                      36,
                              Event   Status                    Date  \
 0                  100 metres, Men  Olympic  14 â 15 October 1964   
 1                  200 metres, Men  Olympic  16 â 17 October 1964   
 2                  400 metres, Men  Olympic  17 â 19 October 1964   
 3                  800 metres, Men  Olympic  14 â 16 October 1964   
 4                1,500 metres, Men  Olympic  17 â 21 October 1964   
 5                5,000 metres, Men  Olympic  16 â 18 October 1964   
 6               10,000 metres, Men  Olympic         14 October 1964   
 7                    Marathon, Men  Olympic         21 October 1964   
 8          110 metres Hurdles, Men  Olympic  17 â 18 October 1964   
 9          400 metres Hurdles, Men  Olympic  14 â 16 October 1964   
 10  3,000 metres Steeplechase, Men  Olympic  15 â 17 October 1964   
 11       4 

In [134]:
# Gather Results Table
# position 2 on the competition page is used as the medallists-per-event table;
# read_html names the repeated headers Gold/Gold.1 etc. (see the output columns)
df_results = pd.read_html(page_url)[2]
df_results

Unnamed: 0,Event,Gold,Gold.1,Silver,Silver.1,Bronze,Bronze.1
0,"100 metres, Men",Bob Hayes,USA,Enrique Figuerola,CUB,Harry Jerome,CAN
1,"200 metres, Men",Henry Carr,USA,Paul Drayton,USA,Edwin Roberts,TTO
2,"400 metres, Men",Mike Larrabee,USA,Wendell Mottley,TTO,Andrzej BadeÅski,POL
3,"800 metres, Men",Peter Snell,NZL,Bill Crothers,CAN,Wilson Kiprugut,KEN
4,"1,500 metres, Men",Peter Snell,NZL,Josef OdloÅ¾il,TCH,John Davies,NZL
5,"5,000 metres, Men",Bob Schul,USA,Harald Norpoth,GER,Bill Dellinger,USA
6,"10,000 metres, Men",Billy Mills,USA,Mohamad Gammoudi,TUN,Ron Clarke,AUS
7,"Marathon, Men",Abebe Bikila,ETH,Basil Heatley,GBR,Kokichi Tsuburaya,JPN
8,"110 metres Hurdles, Men",Hayes Jones,USA,Blaine Lindgren,USA,Anatoly Mikhaylov,URS
9,"400 metres Hurdles, Men",Rex Cawley,USA,John Cooper,GBR,Salvatore Morale,ITA


In [40]:
# Clean/Resolve Ties

#df_results_tiebreaker

In [111]:
# Find Location, Long, Lat

#for event in df_results['Event']:
# single-event test: take the event at row label 1 of the results table
# NOTE(review): the recorded output names an event that is not in the
# df_results shown in this notebook, so it appears to come from an earlier run
event = df_results['Event'][1]
# Navigate to Event Details
print("going to...   " + event)
browser.find_by_text(event).click() # this works - loop step 1
# get page url
page_url = browser.url
# reset soup
# parse HTML
html = browser.html
html_soup = soup(html, 'html.parser')
    

going to...   Team All-Around, Men


In [120]:
# first table on the current page_url, read as the key/value details table
df_event_details = pd.read_html(page_url)[0]
df_event_details

Unnamed: 0,0,1
0,Dates,14 â 21 October 1964
1,Medal Events,36


In [135]:
# Add Year to Results Table (TODO: Discipline is not added yet)
# Stored as an int so it matches the int Year columns used elsewhere in this
# notebook (host table, per-year medal tables); a string here would not line
# up with them when merging on Year.
df_results['Year'] = 1964

In [136]:
# confirm the Year column was added
df_results

Unnamed: 0,Event,Gold,Gold.1,Silver,Silver.1,Bronze,Bronze.1,Year
0,"100 metres, Men",Bob Hayes,USA,Enrique Figuerola,CUB,Harry Jerome,CAN,1964
1,"200 metres, Men",Henry Carr,USA,Paul Drayton,USA,Edwin Roberts,TTO,1964
2,"400 metres, Men",Mike Larrabee,USA,Wendell Mottley,TTO,Andrzej BadeÅski,POL,1964
3,"800 metres, Men",Peter Snell,NZL,Bill Crothers,CAN,Wilson Kiprugut,KEN,1964
4,"1,500 metres, Men",Peter Snell,NZL,Josef OdloÅ¾il,TCH,John Davies,NZL,1964
5,"5,000 metres, Men",Bob Schul,USA,Harald Norpoth,GER,Bill Dellinger,USA,1964
6,"10,000 metres, Men",Billy Mills,USA,Mohamad Gammoudi,TUN,Ron Clarke,AUS,1964
7,"Marathon, Men",Abebe Bikila,ETH,Basil Heatley,GBR,Kokichi Tsuburaya,JPN,1964
8,"110 metres Hurdles, Men",Hayes Jones,USA,Blaine Lindgren,USA,Anatoly Mikhaylov,URS,1964
9,"400 metres Hurdles, Men",Rex Cawley,USA,John Cooper,GBR,Salvatore Morale,ITA,1964


In [None]:
# Scrape Location Details

# column 1, row label 2 of the event details table
# NOTE(review): the df_event_details output recorded in this notebook only has
# row labels 0 and 1, so this lookup would raise KeyError for that table -
# confirm it is run against an event page whose details table has a third row
location = df_event_details[1][2]
# Navigate to Location Details
print("going to...   " + location)
browser.find_by_text(location).click() # this works - loop step 1
# get page url
page_url = browser.url
# reset soup
# parse HTML
html = browser.html
html_soup = soup(html, 'html.parser')
# return to the page we came from
browser.back()

In [129]:
# Merge Data into MasterCompetitionData
# start from an empty frame; re-running this cell discards anything collected so far
df_MasterCompetitionData = pd.DataFrame() #first run

In [138]:
# Add this competition's results to the master table.
# DataFrame.append returned a new frame rather than modifying the master
# table (and no longer exists in pandas 2.x), so without an assignment the
# master table stayed empty. pd.concat + assignment actually stores the rows.
# NOTE: re-running this cell adds the same rows again.
df_MasterCompetitionData = pd.concat([df_MasterCompetitionData, df_results], ignore_index=True)
df_MasterCompetitionData

Unnamed: 0,Event,Gold,Gold.1,Silver,Silver.1,Bronze,Bronze.1,Year
0,"100 metres, Men",Bob Hayes,USA,Enrique Figuerola,CUB,Harry Jerome,CAN,1964
1,"200 metres, Men",Henry Carr,USA,Paul Drayton,USA,Edwin Roberts,TTO,1964
2,"400 metres, Men",Mike Larrabee,USA,Wendell Mottley,TTO,Andrzej BadeÅski,POL,1964
3,"800 metres, Men",Peter Snell,NZL,Bill Crothers,CAN,Wilson Kiprugut,KEN,1964
4,"1,500 metres, Men",Peter Snell,NZL,Josef OdloÅ¾il,TCH,John Davies,NZL,1964
5,"5,000 metres, Men",Bob Schul,USA,Harald Norpoth,GER,Bill Dellinger,USA,1964
6,"10,000 metres, Men",Billy Mills,USA,Mohamad Gammoudi,TUN,Ron Clarke,AUS,1964
7,"Marathon, Men",Abebe Bikila,ETH,Basil Heatley,GBR,Kokichi Tsuburaya,JPN,1964
8,"110 metres Hurdles, Men",Hayes Jones,USA,Blaine Lindgren,USA,Anatoly Mikhaylov,URS,1964
9,"400 metres Hurdles, Men",Rex Cawley,USA,John Cooper,GBR,Salvatore Morale,ITA,1964


In [139]:
# check what the master table currently holds
df_MasterCompetitionData

# Below: work in progress on parsing each table into a list and then building the DataFrame from the list of lists.

### NOTE: 

It looks like the loop above is going to work, and I doubt we will be graded on how long the code takes to run. I left the link (and this challenge) in here in case we want to do it the 'most effective' way.

Maybe this would be an issue if we had more data but it didn't seem like an issue for the medals by year. 

link with good info on why not to append dataframes but not related to scraping 
- https://stackoverflow.com/questions/13784192/creating-an-empty-pandas-dataframe-then-filling-it 

## Ideas: Maybe pull tables with pd.read_html still and then send each column to a list with tolist(). Append those lists to a master list or dictionary, and then make a combined df with the data?

In [None]:
# started some testing on scraping to lists

# /html/body/div[2]/table[5]/thead/tr/th[1]
url = "https://www.olympedia.org/editions/16"
# load the page first, otherwise browser.html is whatever page was open last
browser.visit(url)

html = browser.html
html_soup = soup(html, 'html.parser')

# set table reference
# NOTE(review): XPath positions are 1-based and relative to the parent element,
# while find_all() is 0-based over the whole document - confirm that index 5
# is the intended table
year_results_table = html_soup.find_all('table')[5]

# get the table headers (text of every <th> in that table)
table_headers = [th.get_text(strip=True) for th in year_results_table.find_all('th')]

# TODO - remaining plan:
# set the headers as dictionary keys
# append column values to the key:values
# go to the next page and repeat
# turn the dictionary into a dataframe
table_headers
