In [13]:
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as soup
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [14]:
# Set up Splinter: download (or reuse the cached) chromedriver and start a headless Chrome session
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser(
    'chrome',
    headless=True,
    **executable_path,
)



Current google-chrome version is 91.0.4472
Get LATEST driver version for 91.0.4472
Driver [C:\Users\tgrah\.wdm\drivers\chromedriver\win32\91.0.4472.101\chromedriver.exe] found in cache


In [38]:
# open the Olympedia editions page in the browser
url = "https://www.olympedia.org/editions"
browser.visit(url)

# grab the page source from the browser and parse it with BeautifulSoup
html = browser.html
html_soup = soup(html, features='html.parser')

In [16]:
# read html with pandas (bs4 and parser working on backend)
# read_html returns a list with one DataFrame per table on the page
test_df = pd.read_html(io=url)

In [17]:
# report how many tables pandas found (test_df holds one DataFrame per table)
print(f'Total tables: {len(test_df)}')

Total tables: 8


In [18]:
# NOTE: the same approach could be used to pull the winter table
# keep only the tables whose text contains "Tokyo"; City is forced to str
table_tokyo = pd.read_html(
    io=url,
    match="Tokyo",
    converters={"City": str},
)
# len(table_tokyo) # returns 1 so that is the table that we want

In [19]:
# the single matching table is the one we want as a DataFrame
# NOTE: Country comes in as NaN because the cell holds a picture of a flag,
# not text. The country codes are scraped from the img tags instead.
sum_olympic_org = table_tokyo[0]
sum_olympic_org.head(5)

Unnamed: 0,#,Year,City,Country,Opened,Closed,Competition,Unnamed: 7
0,I,1896,Athina,,6 April,15 April,6 – 13 April,
1,II,1900,Paris,,,,14 May – 28 October,
2,III,1904,St. Louis,,14 May,,1 July – 23 November,
3,IV,1908,London,,13 July,,27 April – 31 October,
4,V,1912,Stockholm,,6 July,27 July,5 May – 27 July,


In [20]:
# trying to get the country codes:
# the flags are <img> tags inside the first table on the page, so work
# from the parsed soup rather than from the pandas DataFrame

# first <table> in the document
all_tables = html_soup.find_all('table')
table_to_scrape = all_tables[0]

# every <tr> inside that table
table_body = table_to_scrape.find_all('tr')


In [21]:
# shallow copy of every row in the table (nothing is skipped)
body_rows = list(table_body)
# len(body_rows)

In [22]:
# collect the src attribute of the flag image in every table cell that has one

# walk each row, then each <td> in the row; keep the img src when the cell
# contains an <img> tag
flag_images = [
    cell.img['src']
    for table_row in body_rows
    for cell in table_row.find_all('td')
    if cell.img
]

In [23]:
# use regex to get the country code from the img src
# pull the 3 word characters that sit directly before a '.' in each src
# (maybe not the most efficient way but all the strings were the same)
# NOTE: re.findall returns a list per src, so this is a list of lists that
# still has to be flattened before use
country_codes = [re.findall(r"(\w{3})\.", src) for src in flag_images]

In [24]:
# regex loop returned a list of lists so extract into single list
# the isinstance guard keeps this cell safe to re-run: entries that are already
# plain strings are left alone instead of being cut down to their first letter
country_codes = [x[0] if isinstance(x, list) else x for x in country_codes]

In [25]:
# get rid of columns we don't need
sum_olympic_cln = sum_olympic_org[['Year', 'City', 'Country']].copy()
sum_olympic_cln.head()

Unnamed: 0,Year,City,Country
0,1896,Athina,
1,1900,Paris,
2,1904,St. Louis,
3,1908,London,
4,1912,Stockholm,


In [26]:
# replace the NaNs in the Country column with country_codes list
sum_olympic_cln['Country'] = country_codes
sum_olympic_cln.head()

Unnamed: 0,Year,City,Country
0,1896,Athina,GRE
1,1900,Paris,FRA
2,1904,St. Louis,USA
3,1908,London,GBR
4,1912,Stockholm,SWE


In [27]:
# check DF: column dtypes and non-null counts
sum_olympic_cln.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34 entries, 0 to 33
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Year     34 non-null     int64 
 1   City     34 non-null     object
 2   Country  34 non-null     object
dtypes: int64(1), object(2)
memory usage: 944.0+ bytes


In [28]:
# keep only the Games from 1964 through 2016 (inclusive)
outside_window = (sum_olympic_cln['Year'] < 1964) | (sum_olympic_cln['Year'] > 2016)
analysis_years = sum_olympic_cln.loc[~outside_window].copy()

In [29]:
# label the city/country columns as host columns
analysis_years = analysis_years.rename(
    columns={"City": "Host_City", "Country": "Host_Country"}
)
# analysis_years.head()

In [30]:
# send years with host cities and countries to csv
# analysis_years.to_csv("host_cities.csv")

In [31]:
# merge the dataframes - doing this afterwards because I didn't want to re-run the scrape.

# load host cities as DF
# host_cities_df = pd.read_csv("host_cities.csv", index_col=0)

In [32]:
# load medals scrape csv as DF
# all_country_medals_df = pd.read_csv("complete_year_datascrape_07182021.csv", index_col=0)

In [33]:
# merge the dataframes on year
# combined_data = all_country_medals_df.merge(right=host_cities_df, how="left", on="Year")
# combined_data.head()

In [34]:
# send merged dataframe to .csv
# combined_data.to_csv("medal_data_by_year.csv")

# Moving to individual year medals pages
1 - navigate to correct page

2 - locate medal table

3 - load medal table as DF with index as the year

4 - merge with the summer olympic table

In [35]:
# build the list of Games years used to find and click each year's link;
# the years are stored as int, so convert each one to str for the scraping loop
years_list = [str(games_year) for games_year in analysis_years['Year'].tolist()]

In [39]:
# Scrape the medal table for every Games year in `years_list`.
#
# For each year the loop:
#   1 - clicks the link whose text is the year
#   2 - records the URL of the page the browser landed on
#   3 - reads the table matching "NOC" from that URL into a DataFrame
#   4 - adds a 'Year' column holding the year of the games
#   5 - renames the two NOC columns to 'Country' / 'Country Code'
# and then navigates back to the main page.
#
# From xpath we know the table row to start is 18; clicking by xpath also works
# but is less understandable:
# browser.find_by_xpath('/html/body/div[2]/table[1]/tbody/tr[18]/td[2]/a').click()
#
# NOTE: appending DataFrames inside a loop is not a good idea, see
# https://stackoverflow.com/questions/13784192/creating-an-empty-pandas-dataframe-then-filling-it
# so each year's table is stored in a dictionary and combined afterwards, see
# https://stackoverflow.com/questions/30233982/merge-dataframes-in-a-dictionary

# test list = successful for 2 years
test_years = ['1964', '1968']

# year (str) -> medal table DataFrame for that year
all_years = {}

try:
    for year in years_list:  # swap in test_years for a quick test run
        # loop step 1
        browser.find_by_text(year).click()

        # loop step 2
        page_url = browser.url

        # loop steps 3-5: the page is fetched once here (the earlier extra
        # read of every table on the page was never used)
        all_years[year] = (
            pd.read_html(page_url, match="NOC")[0]
            .assign(Year=int(year))
            .rename(columns={'NOC': 'Country', 'NOC.1': 'Country Code'})
        )

        # go back to the main page
        browser.back()
finally:
    # always close the browser, even when a page fails to scrape
    browser.quit()

In [40]:
# check that we pulled all years: all_years has one entry per scraped year,
# so this should = 14 - looks good so far
len(all_years)

14

In [41]:
# stack the per-year medal tables into one DataFrame with a fresh 0..n-1 index
complete_summer_data = pd.concat(list(all_years.values()), ignore_index=True)

# sanity checks: overall shape and the years present in the data
# (both printed - a bare expression in the middle of a cell is not displayed)
print(complete_summer_data.shape)
print(complete_summer_data.Year.unique().tolist())

complete_summer_data.head()

(865, 7)


Unnamed: 0,Country,Country Code,Gold,Silver,Bronze,Total,Year
0,United States,USA,36,26,28,90,1964
1,Soviet Union,URS,30,31,35,96,1964
2,Japan,JPN,16,5,8,29,1964
3,Germany,GER,10,22,18,50,1964
4,Italy,ITA,10,10,7,27,1964


In [43]:
# write the combined medal table (index included) to csv to double check the data
complete_summer_data.to_csv(
    path_or_buf='complete_country_medals_by_year.csv',
    index=True,
)

In [45]:
# load the host city / host country table from csv; it is merged on year in a later cell
analysis_years_df = pd.read_csv(filepath_or_buffer="host_cities.csv")
analysis_years_df.head(5)

Unnamed: 0,Year,Host_City,Host_Country
0,1964,Tokyo,JPN
1,1968,Mexico City,MEX
2,1972,Munich,GER
3,1976,Montreal,CAN
4,1980,Moskow,RUS


In [47]:
# left-join the host city/country onto every medal row by year
combined_data = pd.merge(
    complete_summer_data,
    analysis_years_df,
    how="left",
    on=["Year"],
    # each year must appear at most once in the host table, otherwise the
    # join would silently duplicate medal rows; raise instead
    validate="many_to_one",
)

# print the shape first so that head() is the last expression and is displayed
print(combined_data.shape)
combined_data.head()

(865, 9)


# Below is working on potentially parsing each table into a list and then making the dataframe from the list of lists. 

### NOTE: 

It looks like the loop above is going to work, and I doubt we will be graded on how long the code takes to run. I left the link (and this challenge) in here in case we want to do it the 'most effective' way.

Maybe this would be an issue if we had more data but it didn't seem like an issue for the medals by year. 

link with good info on why not to append dataframes but not related to scraping 
- https://stackoverflow.com/questions/13784192/creating-an-empty-pandas-dataframe-then-filling-it 

## Ideas: Maybe pull tables with pd.read_html still and then send each column to a list with tolist(). Append those lists to a master list or dictionary, and then make a combined df with the data?

In [182]:
# started some testing on scraping to lists

# xpath of the first header cell of the table being tested:
# /html/body/div[2]/table[5]/thead/tr/th[1]
url = "https://www.olympedia.org/editions/16"

# NOTE: this needs a live browser session - the scraping loop earlier in the
# notebook calls browser.quit(), so re-create `browser` before running this cell
browser.visit(url)

html = browser.html
html_soup = soup(html, 'html.parser')

# set table reference
# NOTE(review): xpath table[5] is 1-based while find_all(...)[5] is 0-based -
# confirm this index picks the intended table
year_results_table = html_soup.find_all('table')[5]

# get the table headers
table_headers = [
    header_cell.get_text(strip=True)
    for header_cell in year_results_table.find_all('th')
]

# TODO - remaining steps for the list based approach:
# set the headers as dictionary keys
# append column values to the key:values
# go to the next page and repeat
# turn the dictionary into a dataframe
