In [None]:
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import numpy as np


In [None]:
# Start a headless Chrome session driven by Splinter.
# The driver location is whatever ChromeDriverManager().install() returns.
driver_binary = ChromeDriverManager().install()
executable_path = {'executable_path': driver_binary}
browser = Browser('chrome', headless=True, **executable_path)

In [None]:
# Open the Olympedia editions index in the Splinter-driven browser.
url = "https://www.olympedia.org/editions"
browser.visit(url)

# Snapshot the rendered page source and parse it with BeautifulSoup.
html = browser.html
html_soup = soup(markup=html, features='html.parser')

In [None]:
# read html with pandas (bs4 and parser working on backend)
# NOTE: despite its name, test_df is a list of DataFrames, one per table found.
# NOTE(review): the URL is passed here, so pandas fetches the page itself
# rather than reusing the `html` string captured from the browser.
test_df = pd.read_html(url) # returns list of all tables on the page


In [None]:
test_df

In [None]:
print(f'Total tables: {len(test_df)}')

In [None]:
# Ask pandas only for the tables whose text contains "Tokyo", reading the
# City column as strings.
# NOTE: the same call with a different `match` could pull the Winter table.
table_tokyo = pd.read_html(
    url,
    match="Tokyo",
    converters={"City": str},
)
# len(table_tokyo) was checked by hand and returned 1, so index 0 is the table we want

In [None]:
# Take the first (and, per the earlier manual check, only) matched table as
# the working DataFrame and preview its first rows.
sum_olympic_org = table_tokyo[0]
sum_olympic_org.head()

# NOTE: Country comes in as NaN because the cell holds a picture of a flag, not text.
# The country codes are scraped from the img tags in the following cells.

In [None]:
# The country codes only exist inside the flag <img> tags, so work from the
# parsed HTML instead of the pandas table.

# First <table> element on the page.
# NOTE(review): assumed to be the same table pandas matched on "Tokyo" - confirm.
page_tables = html_soup.find_all('table')
table_to_scrape = page_tables[0]

# Every <tr> inside that table.
table_body = table_to_scrape.find_all('tr')


In [None]:
# Work on a shallow copy of the full row list (every <tr>, starting at index 0).
body_rows = list(table_body)
# len(body_rows) gives the number of rows in the table

In [None]:
body_rows

In [None]:
# Collect the `src` attribute of the first <img> found in each <td>, walking
# the table row by row. Cells without an image are skipped.
# NOTE(review): the later Country assignment needs exactly one image per data
# row - confirm the table always satisfies that.
flag_images = [
    cell.img['src']
    for table_row in table_body[:len(body_rows)]
    for cell in table_row.find_all('td')
    if cell.img
]

In [None]:
# Pull the country code out of every flag image path with a regex.
import re

# Three word characters immediately followed by a '.', e.g. "GRE" in "GRE.png".
# findall returns a list per image, so the result here is a list of lists.
# NOTE(review): the pattern matches before ANY dot in the string, not only the
# file extension - fine while every src has the same simple shape.
country_codes = [re.findall(r"(\w{3})\.", image_src) for image_src in flag_images]

In [None]:
# regex loop returned a list of lists so extract into single list
# (keeps the first match found for each image).
# NOTE(review): x[0] raises IndexError for any src where the pattern matched
# nothing. Re-running this cell on the already-flattened list would take the
# first character of each code instead.
country_codes = [x[0] for x in country_codes]

In [None]:
# Keep an independent copy holding only the three columns the analysis needs.
wanted_columns = ['Year', 'City', 'Country']
sum_olympic_cln = sum_olympic_org.loc[:, wanted_columns].copy()
sum_olympic_cln.head()

In [None]:
# Overwrite the all-NaN Country column with the scraped country codes.
# The list is applied positionally, so it must have one entry per row.
sum_olympic_cln = sum_olympic_cln.assign(Country=country_codes)
sum_olympic_cln.head()

In [None]:
# check DF
sum_olympic_cln.info()

In [None]:
sum_olympic_org

In [None]:
# Restrict the table to the analysis window: discard every edition held
# before 1964 or after 2016 and keep the rest as an independent copy.
outside_window = (sum_olympic_cln['Year'] < 1964) | (sum_olympic_cln['Year'] > 2016)
analysis_years = sum_olympic_cln.loc[~outside_window].copy()

In [None]:
# Label the city and country columns as host information.
analysis_years = analysis_years.rename(
    columns={"City": "Host_City", "Country": "Host_Country"}
)
# analysis_years.head()

In [None]:
# send years with host cities and countries to csv
# analysis_years.to_csv("host_cities.csv")

In [None]:
# merge the dataframes - doing this afterwards because I didn't want to re-run the scrape.

# load host cities as DF
# host_cities_df = pd.read_csv("host_cities.csv", index_col=0)

In [None]:
# load medals scrape csv as DF
# all_country_medals_df = pd.read_csv("complete_year_datascrape_07182021.csv", index_col=0)

In [None]:
# merge the dataframes on year
# combined_data = all_country_medals_df.merge(right=host_cities_df, how="left", on="Year")
# combined_data.head()

In [None]:
# send merged dataframe to .csv
# combined_data.to_csv("medal_data_by_year.csv")

# Moving to individual year medals pages
1 - navigate to correct page

2 - locate medal table

3 - load medal table as DF with index as the year

4 - merge with the summer olympic table

In [None]:
# Build the list of edition years as strings: the scraping loop locates each
# year's link by its visible text, so the values from the Year column must
# be converted with str().
years_list = [str(year_value) for year_value in analysis_years['Year'].tolist()]

In [None]:
years_list

In [None]:
# Scrape the medal table of each edition: click the year's link, read the
# table containing "NOC" from the page it leads to, tag it with the year, and
# go back. Assumes the browser currently shows the editions index page.
#
# Each year's table is stored in the `all_years` dict (keyed by the year
# string) rather than appended to one growing DataFrame, which is discouraged:
# https://stackoverflow.com/questions/13784192/creating-an-empty-pandas-dataframe-then-filling-it
# Merging a dict of DataFrames afterwards:
# https://stackoverflow.com/questions/30233982/merge-dataframes-in-a-dictionary

# from xpath we know the table row to start is 18
# browser.find_by_xpath('/html/body/div[2]/table[1]/tbody/tr[18]/td[2]/a'). click() # this also works but is less understandable

# test list = successful for 2 years
test_years = ['1964', '1968']

all_years = {}

# start for loop to get all of the year medal tables
for year in test_years: # change back to years_list when testing complete

    # loop step 1 - follow the link whose visible text is the year
    browser.find_by_text(year).click()

    # loop step 2 - get page url
    page_url = browser.url

    # reset soup so html/html_soup describe the page currently shown
    html = browser.html
    html_soup = soup(html, 'html.parser')

    # loop step 3 - load the medal table (the table containing "NOC").
    # This read used to be commented out, so the lines below indexed
    # all_years[year] before it existed and raised KeyError.
    medals_table = pd.read_html(page_url, match="NOC")[0]

    # loop step 4 - add column 'Year' with the year of the games
    medals_table['Year'] = int(year)

    # loop step 5 - rename the columns
    medals_table = medals_table.rename(columns={'NOC': 'Country', 'NOC.1': 'Country Code'})

    # store the finished table under its year
    all_years[year] = medals_table

    # go back to the main page
    browser.back()

# NOTE: the browser is deliberately left open here - the cells below still
# call browser.visit(). Call browser.quit() once all scraping is finished.

In [18]:
# ################# HH Test
# Build the list of disciplines for one edition, then open the first
# discipline's page and start the master results table from it.

## Extract Disciplines List
url = "https://www.olympedia.org/editions/16"
browser.visit(url)

# parse HTML
html = browser.html
html_soup = soup(html, 'html.parser')

# Read every table on the edition page once and reuse the list below
# (previously the same URL was downloaded four times).
table_disciplines = pd.read_html(url)

# Gather List of Main Disciplines: table 3 has three columns (0, 1, 2);
# stack them into one Series and turn it into a plain list.
df_table_disciplines = table_disciplines[3]

df_table0 = df_table_disciplines[0]
df_table1 = df_table_disciplines[1]
df_table2 = df_table_disciplines[2]

pieces = {"1": df_table0, "2": df_table1, "3": df_table2}
df_table_disciplines = pd.concat(pieces)

main_disciplines = df_table_disciplines.tolist()

# Gather list of Other Disciplines: same treatment for table 4.
df_other_disc = table_disciplines[4]

df_table0 = df_other_disc[0]
df_table1 = df_other_disc[1]
df_table2 = df_other_disc[2]

pieces = {"1": df_table0, "2": df_table1, "3": df_table2}
df_other_disc = pd.concat(pieces)

other_disciplines = df_other_disc.tolist()

# Combine List of Disciplines and Drop NaN.
# dropna() returns a new frame, so the result has to be assigned (it used to be
# discarded). The index is reset so positions 0, 1, 2, ... stay valid labels.
discipline_combo = main_disciplines + other_disciplines
df_disciplines_all = pd.DataFrame(discipline_combo).dropna().reset_index(drop=True)

#####################################

# Find the right competition (the browser is still on the edition page)
competition = df_disciplines_all[0][0]
print("going to...   " + competition)

browser.find_by_text(competition).click() # this works - loop step 1

# get page url
page_url = browser.url

# reset soup
# parse HTML
html = browser.html
html_soup = soup(html, 'html.parser')

# Read the competition page's tables once
page_dfs = pd.read_html(page_url)

# Gather Results Table
# NOTE(review): assumes the results are always table index 2 - confirm per discipline.
df_results = page_dfs[2].copy()

# Clean/Resolve Ties

#df_results_tiebreaker


# Add Year, Discipline to Results Table
df_results['Year'] = '1964'

# Merge Data into MasterCompetitionData
# First run: the master table starts as a copy of this discipline's results.
df_MasterCompetitionData = df_results.copy()

df_MasterCompetitionData






going to...   Artistic Gymnastics


Unnamed: 0,Event,Gold,Gold.1,Silver,Silver.1,Bronze,Bronze.1,Year
0,"Individual All-Around, Men",Yukio Endo,JPN,Shuji TsurumiViktor LisitskyBoris Shakhlin,JPNURSURS,—,—,1964
1,"Team All-Around, Men",Japan,JPN,Soviet Union,URS,Unified Team of Germany,GER,1964
2,"Floor Exercise, Men",Franco Menichelli,ITA,Viktor LisitskyYukio Endo,URSJPN,—,—,1964
3,"Horse Vault, Men",Haruhiro Yamashita,JPN,Viktor Lisitsky,URS,Hannu Rantakari,FIN,1964
4,"Parallel Bars, Men",Yukio Endo,JPN,Shuji Tsurumi,JPN,Franco Menichelli,ITA,1964
5,"Horizontal Bar, Men",Boris Shakhlin,URS,Yury Titov,URS,Miroslav Cerar,YUG,1964
6,"Rings, Men",Takuji Hayata,JPN,Franco Menichelli,ITA,Boris Shakhlin,URS,1964
7,"Pommelled Horse, Men",Miroslav Cerar,YUG,Shuji Tsurumi,JPN,Yury Tsapenko,URS,1964
8,"Individual All-Around, Women",Věra Čáslavská,TCH,Larisa Latynina,URS,Polina Astakhova,URS,1964
9,"Team All-Around, Women",Soviet Union,URS,Czechoslovakia,TCH,Japan,JPN,1964


In [10]:
df_MasterCompetitionData

In [17]:
################## Second event
# Open the second discipline of the edition and add its results table to
# df_MasterCompetitionData.

#####################################

# Find the right competition: go back to the edition page first, because the
# browser may still be showing a different page.
url = "https://www.olympedia.org/editions/16"
browser.visit(url)

competition = df_disciplines_all[0][1]
print("going to...   " + competition)

browser.find_by_text(competition).click() # this works - loop step 1

# get page url
page_url = browser.url

# reset soup
# parse HTML
html = browser.html
html_soup = soup(html, 'html.parser')

# Read the competition page's tables once (previously downloaded twice)
page_dfs = pd.read_html(page_url)

# Gather Results Table
# NOTE(review): assumes the results are always table index 2 - confirm per discipline.
df_results = page_dfs[2].copy()

# Clean/Resolve Ties

#df_results_tiebreaker


# Add Year, Discipline to Results Table
df_results['Year'] = '1964'

# Merge Data into MasterCompetitionData
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# (rows stacked, original index labels kept).
df_MasterCompetitionData = pd.concat([df_MasterCompetitionData, df_results])

df_MasterCompetitionData

going to...   Athletics


Unnamed: 0,Event,Gold,Gold.1,Silver,Silver.1,Bronze,Bronze.1,Year
0,"Individual All-Around, Men",Yukio Endo,JPN,Shuji TsurumiViktor LisitskyBoris Shakhlin,JPNURSURS,—,—,1964
1,"Team All-Around, Men",Japan,JPN,Soviet Union,URS,Unified Team of Germany,GER,1964
2,"Floor Exercise, Men",Franco Menichelli,ITA,Viktor LisitskyYukio Endo,URSJPN,—,—,1964
3,"Horse Vault, Men",Haruhiro Yamashita,JPN,Viktor Lisitsky,URS,Hannu Rantakari,FIN,1964
4,"Parallel Bars, Men",Yukio Endo,JPN,Shuji Tsurumi,JPN,Franco Menichelli,ITA,1964
5,"Horizontal Bar, Men",Boris Shakhlin,URS,Yury Titov,URS,Miroslav Cerar,YUG,1964
6,"Rings, Men",Takuji Hayata,JPN,Franco Menichelli,ITA,Boris Shakhlin,URS,1964
7,"Pommelled Horse, Men",Miroslav Cerar,YUG,Shuji Tsurumi,JPN,Yury Tsapenko,URS,1964
8,"Individual All-Around, Women",Věra Čáslavská,TCH,Larisa Latynina,URS,Polina Astakhova,URS,1964
9,"Team All-Around, Women",Soviet Union,URS,Czechoslovakia,TCH,Japan,JPN,1964


Unnamed: 0,Event,Gold,Gold.1,Silver,Silver.1,Bronze,Bronze.1,Year
0,"Basketball, Men",United States,USA,Soviet Union,URS,Brazil,BRA,1964


In [None]:
# Find Location, Long, Lat
# Open the detail page of one event from the current results table.

# Eventually: for event in df_results['Event']:
# For now, only the row labelled 1 is used.
# NOTE(review): this raises KeyError when df_results has no row labelled 1.
event = df_results.loc[1, 'Event']

# Navigate to Event Details by clicking the link with the event's text
print("going to...   " + event)
browser.find_by_text(event).click() # this works - loop step 1

# Remember where we landed and re-parse the page now shown
page_url = browser.url
html = browser.html
html_soup = soup(markup=html, features='html.parser')
    

In [None]:
# Read the tables on the page stored in page_url and keep the first one.
# NOTE(review): assumes table 0 is the event-details table - confirm.
df_event_details = pd.read_html(page_url)[0]
df_event_details

In [None]:
# Scrape Location Details
# The location name is read from row 2, column 1 of the event-details table.
# NOTE(review): assumes that cell always holds the venue link text - confirm.
location = df_event_details.loc[2, 1]

# Navigate to Location Details by clicking the link with that text
print("going to...   " + location)
browser.find_by_text(location).click() # this works - loop step 1

# Capture the location page (URL and parsed HTML), then step back
page_url = browser.url
html = browser.html
html_soup = soup(markup=html, features='html.parser')
browser.back()

In [None]:
# Merge Data into MasterCompetitionData
# NOTE(review): this rebinds the name to an empty DataFrame, so any rows an
# earlier cell stored under df_MasterCompetitionData are no longer reachable
# through it. Run only when starting the master table from scratch.
df_MasterCompetitionData = pd.DataFrame() #first run

In [None]:
# Add the current results table to the master table.
# DataFrame.append was removed in pandas 2.0, and it returned a new frame
# instead of changing the master table, so the old call merged nothing.
# pd.concat stacks the rows and the result is assigned back.
# NOTE: re-running this cell adds the same rows again.
df_MasterCompetitionData = pd.concat([df_MasterCompetitionData, df_results])
df_MasterCompetitionData

In [None]:
df_MasterCompetitionData

# Below is working on potentially parsing each table into a list and then making the dataframe from the list of lists. 

### NOTE: 

It looks like the loop above is going to work, and I doubt we will be graded on how long the code takes to run. I left the link (and this challenge) in here in case we want to do it the 'most effective' way.

Maybe this would be an issue if we had more data but it didn't seem like an issue for the medals by year. 

link with good info on why not to append dataframes but not related to scraping 
- https://stackoverflow.com/questions/13784192/creating-an-empty-pandas-dataframe-then-filling-it 

## Ideas: Maybe pull tables with pd.read_html still and then send each column to a list with tolist(). Append those lists to a master list or dictionary, and then make a combined df with the data?

In [None]:
# started some testing on scraping to lists

# /html/body/div[2]/table[5]/thead/tr/th[1]
url = "https://www.olympedia.org/editions/16"
# Load the page first: url used to be assigned but never visited, so
# browser.html held whatever page an earlier cell had left open.
browser.visit(url)

html = browser.html
html_soup = soup(html, 'html.parser')

# set table reference
# NOTE(review): index 5 is the sixth <table> in the document, which may not be
# the same element as table[5] in the xpath above - confirm.
year_results_table = html_soup.find_all('table')[5]

# get the table headers (text of every <th> in the table).
# This replaces the unfinished `for year_results_table` line, which was a
# syntax error and stopped the whole cell from running.
table_headers = [
    header_cell.get_text(strip=True)
    for header_cell in year_results_table.find_all('th')
]

# Remaining plan (not implemented yet):
# set the headers as dictionary keys
# append column values to the key:values
# go to the next page and repeat
# turn the dictionary into a dataframe
