In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import pandas as pd
import numpy as np
import pdfplumber
import tabula
from Olympic_PED_use.src import functions as fn

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

from bs4 import BeautifulSoup
import certifi
import urllib3
import re
from csv import DictReader, DictWriter

In [3]:
from time import sleep
from random import randint
import requests
from requests import get

In [4]:
import pandas as pd
pd.set_option('display.max_rows', 2200)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 10000)

#### Function to scrape and return results as a dataframe

In [5]:
def wiki_scraper(content):
    a = []
    b = []
    c = []
    d = []
    e = []
    f = []
    g = []
    tables = []
    for table in content:
        tables.append(table)
        for table_ in tables:
            rows = table_.find_all('tr')
            for row in rows:
                cells = row.find_all('td')
                if len(cells) == 7:
                    a.append(cells[0].text)
                    b.append(cells[1].text)
                    c.append(cells[2].text)
                    d.append(cells[3].text)
                    e.append(cells[4].text)
                    f.append(cells[5].text)
                    g.append(cells[6].text)
    df = pd.DataFrame(a, columns=['name'])
    df['country'] = b
    df['event'] = c
    df['date_of_violation'] = d
    df['substance'] = e
    df['sanction'] = f
    df['references'] = g
    
    return df

#### Retrieving contents of doping cases in athletics Wikipedia page

In [6]:
url = 'https://en.wikipedia.org/wiki/List_of_doping_cases_in_athletics'
req = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                         ca_certs=certifi.where())
res = req.request('GET', url)
soup = BeautifulSoup(res.data, 'html.parser')
contents = soup.find_all('table', class_='wikitable sortable')   

#### Formatting the values in the dataframe by removing '\n' 

In [7]:
def col_format(df):
    df['name'] = [x.strip('\n') for x in df['name']]
    df['country'] = [x.strip('\n') for x in df['country'].values]
    df['date_of_violation'] = [x.strip('\n') for x in df['date_of_violation'].values]
    df['event'] = [x.strip('\n') for x in df['event'].values]
    df['substance'] = [x.strip('\n') for x in df['substance'].values]
    df['sanction'] = [x.strip('\n') for x in df['sanction'].values]
    df['references'] = [x.strip('\n') for x in df['references'].values]
    return df

#### Creating doping dataframe and adding a 'flagged' column specifying whether or not the athlete has tested positive for PED use during the Games.

In [8]:
doping = wiki_scraper(contents)

doping = col_format(doping)

doping['flagged'] = [1 for x in doping.index]

#### Reading in the Athlete events csv file containing Olympic athletes and general information

In [9]:
athlete_df = pd.read_csv('../data/athlete_events.csv')
athlete_df.columns = [x.lower() for x in athlete_df.columns]

#### Merging the athlete and doping dataframes by name

In [10]:
df = pd.merge(athlete_df, doping, on='name', how='left')
df = df.drop(['sanction', 'references', 'country', 'date_of_violation', 'games'], axis=1)
df = df[df['sport']=='Athletics']
df = df[df['year'] > 2003]
df = df[df['year'] < 2017]
df

Unnamed: 0,id,name,sex,age,height,weight,team,noc,year,season,city,sport,event_x,medal,event_y,substance,flagged
98,34,Jamale (Djamel-) Aarrass (Ahrass-),M,30.0,187.0,76.0,France,FRA,2012,Summer,London,Athletics,"Athletics Men's 1,500 metres",,,,
135,49,Moonika Aava,F,24.0,168.0,65.0,Estonia,EST,2004,Summer,Athina,Athletics,Athletics Women's Javelin Throw,,,,
136,49,Moonika Aava,F,28.0,168.0,65.0,Estonia,EST,2008,Summer,Beijing,Athletics,Athletics Women's Javelin Throw,,,,
148,55,Antonio Abadia Beci,M,26.0,170.0,65.0,Spain,ESP,2016,Summer,Rio de Janeiro,Athletics,"Athletics Men's 5,000 metres",,,,
163,67,Mariya Vasilyevna Abakumova (-Tarabina),F,22.0,179.0,80.0,Russia,RUS,2008,Summer,Beijing,Athletics,Athletics Women's Javelin Throw,Silver,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275330,135547,Viktoriya Viktorovna Zyabkina,F,19.0,174.0,62.0,Kazakhstan,KAZ,2012,Summer,London,Athletics,Athletics Women's 200 metres,,,,
275331,135547,Viktoriya Viktorovna Zyabkina,F,23.0,174.0,62.0,Kazakhstan,KAZ,2016,Summer,Rio de Janeiro,Athletics,Athletics Women's 100 metres,,,,
275332,135547,Viktoriya Viktorovna Zyabkina,F,23.0,174.0,62.0,Kazakhstan,KAZ,2016,Summer,Rio de Janeiro,Athletics,Athletics Women's 200 metres,,,,
275333,135547,Viktoriya Viktorovna Zyabkina,F,23.0,174.0,62.0,Kazakhstan,KAZ,2016,Summer,Rio de Janeiro,Athletics,Athletics Women's 4 x 100 metres Relay,,,,


#### Creating dataframe for the 2004 Games held in Athens

In [56]:
df_04 = df[df['year']==2004]
df_04 = df_04.drop_duplicates().reset_index(drop=True)
df_04.name = df_04.name.str.replace(r"\(.*\)","")


#### Next, I am going to scrape the tables from Olympic.org that contain the different events, athletes per event, ranking, and result (time/distance). I am going to start with the 100m Dash

In [57]:
url = 'https://www.olympic.org/athens-2004/athletics/100m-men'
req = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                         ca_certs=certifi.where())
res = req.request('GET', url)
soup = BeautifulSoup(res.data, 'html.parser')
contents = soup.find_all(class_='table4')  

In [58]:
mens_100m_04 = fn.athens_scraper(contents)
mens_100m_04 = fn.content_cleaner(mens_100m_04)
mens_100m_04.columns = ['rank', 'name', 'result']
mens_100m_04.insert(3, "event_x", "Athletics Men's 100 metres")
mens_100m_04

Unnamed: 0,rank,name,result,event_x
0,1.0,Justin Gatlin,9.85,Athletics Men's 100 metres
1,2.0,Francis Obikwelu,9.86,Athletics Men's 100 metres
2,3.0,Maurice Greene,9.87,Athletics Men's 100 metres
3,4.0,Shawn Crawford,9.89,Athletics Men's 100 metres
4,5.0,Asafa Powell,9.94,Athletics Men's 100 metres
5,6.0,Kim Collins,10.0,Athletics Men's 100 metres
6,7.0,Obadele Thompson,10.1,Athletics Men's 100 metres


In [59]:
mens_100m_04.name = [x.strip() for x in mens_100m_04.name]

In [60]:
mens_100m_04

Unnamed: 0,rank,name,result,event_x
0,1.0,Justin Gatlin,9.85,Athletics Men's 100 metres
1,2.0,Francis Obikwelu,9.86,Athletics Men's 100 metres
2,3.0,Maurice Greene,9.87,Athletics Men's 100 metres
3,4.0,Shawn Crawford,9.89,Athletics Men's 100 metres
4,5.0,Asafa Powell,9.94,Athletics Men's 100 metres
5,6.0,Kim Collins,10.0,Athletics Men's 100 metres
6,7.0,Obadele Thompson,10.1,Athletics Men's 100 metres


#### This only matched 3 of the 7 values in the result column. There are names in the athletes df that include first, middle, and last names. I am going to transform the names in the dataframe to only be first and last names. First, I will create a list to hold the first and last names, then replace the names in the dataframe with the names from the list.

In [61]:
names = []
for x in df_04.name:
    split = x.split()
    x = split[0] + ' ' + split[-1]
    names.append(x)


df_04['name'] = [x for x in names]

In [62]:
df_04 = pd.merge(df_04, mens_100m_04[['name', 'result', 'event_x']], how='left', on=['name', 'event_x'])
df_04.result.value_counts()

9.85     1
9.94     1
9.89     1
9.87     1
10.10    1
9.86     1
10.00    1
Name: result, dtype: int64

In [63]:
def olympic_query(event):
    url = 'https://www.olympic.org/athens-2004/athletics/' + str(event)
    req = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                             ca_certs=certifi.where())
    res = req.request('GET', url)
    soup = BeautifulSoup(res.data, 'html.parser')
    contents = soup.find_all(class_ = 'table4')
    return contents

In [64]:
womens_100m_04 = olympic_query('100m-women')
womens_100m_04 = fn.athens_scraper(womens_100m_04)
womens_100m_04 = fn.content_cleaner(womens_100m_04)
womens_100m_04.columns = ['rank', 'name', 'result']
womens_100m_04.insert(3, "event_x", "Athletics Women's 100 metres")
womens_100m_04.name = [x.strip() for x in womens_100m_04.name]
womens_100m_04 = womens_100m_04.replace('Yuliya Nestsiarenka', 'Yuliya Nesterenko')
womens_100m_04 = womens_100m_04.replace('Debbie Ferguson-Mckenzie', 'Deborah Ferguson-McKenzie')
womens_100m_04 = womens_100m_04.replace('Latasha Colander', 'LaTasha Colander-Richardson')
womens_100m_04

Unnamed: 0,rank,name,result,event_x
0,1.0,Yuliya Nesterenko,10.93,Athletics Women's 100 metres
1,2.0,Lauryn Williams,10.96,Athletics Women's 100 metres
2,3.0,Veronica Campbell-Brown,10.97,Athletics Women's 100 metres
3,4.0,Ivet Lalova,11.0,Athletics Women's 100 metres
4,5.0,Aleen Bailey,11.05,Athletics Women's 100 metres
5,6.0,Sherone Simpson,11.07,Athletics Women's 100 metres
6,7.0,Deborah Ferguson-McKenzie,11.16,Athletics Women's 100 metres
7,8.0,LaTasha Colander-Richardson,11.18,Athletics Women's 100 metres


In [65]:
df_04 = pd.merge(df_04, womens_100m_04[['name', 'result', 'event_x']], how='left', on=['name', 'event_x'])
df_04.head(100)

Unnamed: 0,id,name,sex,age,height,weight,team,noc,year,season,city,sport,event_x,medal,event_y,substance,flagged,result_x,result_y
0,49,Moonika Aava,F,24.0,168.0,65.0,Estonia,EST,2004,Summer,Athina,Athletics,Athletics Women's Javelin Throw,,,,,,
1,95,Georgia Abatzidou,F,35.0,155.0,43.0,Greece,GRE,2004,Summer,Athina,Athletics,Athletics Women's Marathon,,,,,,
2,96,Carlos Balladares,M,18.0,168.0,60.0,Nicaragua,NCA,2004,Summer,Athina,Athletics,Athletics Men's 100 metres,,,,,,
3,256,Abdihakim Abdirahman,M,27.0,178.0,61.0,United States,USA,2004,Summer,Athina,Athletics,"Athletics Men's 10,000 metres",,,,,,
4,397,Mara Diguez,F,29.0,163.0,46.0,Spain,ESP,2004,Summer,Athina,Athletics,Athletics Women's Marathon,,,,,,
5,428,Elvan Abeylegesse,F,21.0,159.0,40.0,Turkey,TUR,2004,Summer,Athina,Athletics,"Athletics Women's 1,500 metres",,,,,,
6,428,Elvan Abeylegesse,F,21.0,159.0,40.0,Turkey,TUR,2004,Summer,Athina,Athletics,"Athletics Women's 5,000 metres",,,,,,
7,488,Ibrahim Aboubaker,M,21.0,190.0,65.0,Qatar,QAT,2004,Summer,Athina,Athletics,Athletics Men's Triple Jump,,,,,,
8,600,Nagmeldin Abubakr,M,18.0,172.0,63.0,Sudan,SUD,2004,Summer,Athina,Athletics,Athletics Men's 400 metres,,,,,,
9,601,Sanna Abubkheet,F,19.0,157.0,52.0,Palestine,PLE,2004,Summer,Athina,Athletics,Athletics Women's 800 metres,,,,,,


In [66]:
df_04.result_y.value_counts()

10.93    1
11.05    1
10.96    1
10.97    1
11.16    1
11.07    1
11.18    1
Name: result_y, dtype: int64

In [67]:
df_04

Unnamed: 0,id,name,sex,age,height,weight,team,noc,year,season,city,sport,event_x,medal,event_y,substance,flagged,result_x,result_y
0,49,Moonika Aava,F,24.0,168.0,65.0,Estonia,EST,2004,Summer,Athina,Athletics,Athletics Women's Javelin Throw,,,,,,
1,95,Georgia Abatzidou,F,35.0,155.0,43.0,Greece,GRE,2004,Summer,Athina,Athletics,Athletics Women's Marathon,,,,,,
2,96,Carlos Balladares,M,18.0,168.0,60.0,Nicaragua,NCA,2004,Summer,Athina,Athletics,Athletics Men's 100 metres,,,,,,
3,256,Abdihakim Abdirahman,M,27.0,178.0,61.0,United States,USA,2004,Summer,Athina,Athletics,"Athletics Men's 10,000 metres",,,,,,
4,397,Mara Diguez,F,29.0,163.0,46.0,Spain,ESP,2004,Summer,Athina,Athletics,Athletics Women's Marathon,,,,,,
5,428,Elvan Abeylegesse,F,21.0,159.0,40.0,Turkey,TUR,2004,Summer,Athina,Athletics,"Athletics Women's 1,500 metres",,,,,,
6,428,Elvan Abeylegesse,F,21.0,159.0,40.0,Turkey,TUR,2004,Summer,Athina,Athletics,"Athletics Women's 5,000 metres",,,,,,
7,488,Ibrahim Aboubaker,M,21.0,190.0,65.0,Qatar,QAT,2004,Summer,Athina,Athletics,Athletics Men's Triple Jump,,,,,,
8,600,Nagmeldin Abubakr,M,18.0,172.0,63.0,Sudan,SUD,2004,Summer,Athina,Athletics,Athletics Men's 400 metres,,,,,,
9,601,Sanna Abubkheet,F,19.0,157.0,52.0,Palestine,PLE,2004,Summer,Athina,Athletics,Athletics Women's 800 metres,,,,,,


In [75]:
import requests

url = "file:///Users/JasonWong/Desktop/GetAllCases.html"

headers = {
          'Username': 'jsonwong',
          'Password': '2020jsonwong=api'}

response = requests.request("GET", url, headers=headers)

print(response.text)

InvalidSchema: No connection adapters were found for 'file:///Users/JasonWong/Desktop/GetAllCases.html'

In [74]:
curl 'file:///Users/JasonWong/Desktop/GetAllCases.html'

SyntaxError: invalid syntax (<ipython-input-74-0843ab1f78c9>, line 1)

In [79]:
import urllib

page = urllib.open("file:///Users/JasonWong/Desktop/GetAllCases.html").read()
print(page)

AttributeError: module 'urllib' has no attribute 'open'