# EDA:
#### This notebook will load in the different datasets to be used for the project. There will also be cells scraping tables.

Unzipping the athletes.csv which holds all Olympic Athletes and results from 1896 to 2016

In [6]:
!unzip '../data/31029_40943_compressed_athlete_events.csv.zip'

Archive:  ../data/31029_40943_compressed_athlete_events.csv.zip
  inflating: athlete_events.csv      


In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

Importing libraries for data cleaning and exploration

In [2]:
import pandas as pd
import numpy as np
import pdfplumber
import tabula
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

Importing libraries for scraping tables and scraping a table showing number of doping positives per Olympic Game

In [50]:
from Olympic_PED_use.src import functions as fn

In [3]:
from bs4 import BeautifulSoup
import certifi
import urllib3
import re
from csv import DictReader, DictWriter
url = 'https://olympstats.com/2019/01/18/all-olympic-doping-positives-the-count-by-games/'
req = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                         ca_certs=certifi.where())
res = req.request('GET', url)
soup = BeautifulSoup(res.data, 'html.parser')
contents = soup.find_all(class_= 'table-responsive')    
ped_by_country_df = pd.read_html(str(contents))[0]
ped_by_country_df

Unnamed: 0,Year,City,###
0,2012.0,London,121
1,2008.0,Beijing,86
2,2000.0,Sydney,42
3,2004.0,Athínai,41
4,2006.0,Torino,19
5,2016.0,Rio de Janeiro,17
6,1996.0,Atlanta,13
7,1984.0,Los Angeles,12
8,2014.0,Sochi,12
9,1976.0,Montréal,11


Creating chart from pdf that shows the number of athletes doping per country in 2007

In [4]:
pdf_file = '../data/wada_07.pdf'
with pdfplumber.open(pdf_file) as pdf:
    page_9 = pdf.pages[2]
    rows = page_9.extract_table()
    print(rows)


[['Laboratory', None, 'N\nTotal', 'N\nAdverse \nAnalytical \nFindings*', '% Adverse'], ['1', 'Sydney, Australia', '7,457', '1 34', '1.80'], ['2', 'Seibersdorf, Austria', '4,595', '9 4', '2.05'], ['3', 'Ghent, Belgium', '6,800', '3 00', '4.41'], ['4', 'Rio de Janeiro, Brazil', '5,970', '5 2', '0.87'], ['5', 'Montreal, Canada', '15,094', '4 33', '2.87'], ['6', 'Beijing, China', '10,202', '5 2', '0.51'], ['7', 'Bogota, Colombia', '2,371', '7 3', '3.08'], ['8', 'Havana, Cuba', '2,448', '7 1', '2.90'], ['9', 'Prague, Czech Republic', '1,826', '1 00', '5.48'], ['10', 'Helsinki, Finland', '2,485', '6 0', '2.41'], ['11', 'Paris, France', '9,167', '4 61', '5.03'], ['12', 'Cologne, Germany', '13,313', '2 69', '2.02'], ['13', 'Kreischa, Germany', '6,158', '1 24', '2.01'], ['14', 'London, UK', '8,101', '7 3', '0.90'], ['15', 'Athens, Greece', '5,019', '8 9', '1.77'], ['16', 'Rome, Italy', '10,903', '2 69', '2.47'], ['17', 'Tokyo, Japan', '6,099', '1 5', '0.25'], ['18', 'Seoul, Korea', '2,833', '2 

In [5]:
table = [row for row in rows if ''.join([str(i) for i in row]) != '']

In [6]:
df2 = pd.DataFrame(table)

In [7]:
df2

Unnamed: 0,0,1,2,3,4
0,Laboratory,,N\nTotal,N\nAdverse \nAnalytical \nFindings*,% Adverse
1,1,"Sydney, Australia",7457,1 34,1.80
2,2,"Seibersdorf, Austria",4595,9 4,2.05
3,3,"Ghent, Belgium",6800,3 00,4.41
4,4,"Rio de Janeiro, Brazil",5970,5 2,0.87
5,5,"Montreal, Canada",15094,4 33,2.87
6,6,"Beijing, China",10202,5 2,0.51
7,7,"Bogota, Colombia",2371,7 3,3.08
8,8,"Havana, Cuba",2448,7 1,2.90
9,9,"Prague, Czech Republic",1826,1 00,5.48


Defining the columns for the dataframe and resetting the index

In [8]:
df2.columns=['Unnamed', 'Laboratory', 'Total',
                   'Adverse Analytical Findings', 
                   '% Adverse']

In [9]:
df2.drop([0,1,2], axis=0, inplace=True)

In [10]:
df2.drop(['Unnamed'], axis=1, inplace=True)

In [11]:
df2.reset_index(inplace=True, drop=True)

In [12]:
df2

Unnamed: 0,Laboratory,Total,Adverse Analytical Findings,% Adverse
0,"Ghent, Belgium",6800,3 00,4.41
1,"Rio de Janeiro, Brazil",5970,5 2,0.87
2,"Montreal, Canada",15094,4 33,2.87
3,"Beijing, China",10202,5 2,0.51
4,"Bogota, Colombia",2371,7 3,3.08
5,"Havana, Cuba",2448,7 1,2.90
6,"Prague, Czech Republic",1826,1 00,5.48
7,"Helsinki, Finland",2485,6 0,2.41
8,"Paris, France",9167,4 61,5.03
9,"Cologne, Germany",13313,2 69,2.02


Removing spaces from the column names and transforming them to be lowercase

In [13]:
df2['adverse_analytical_findings'] = df2['Adverse Analytical Findings']

In [14]:
df2.drop(['Adverse Analytical Findings'], axis=1, inplace=True)

In [15]:
df2.columns = [col.lower() for col in df2.columns]

In [16]:
df2

Unnamed: 0,laboratory,total,% adverse,adverse_analytical_findings
0,"Ghent, Belgium",6800,4.41,3 00
1,"Rio de Janeiro, Brazil",5970,0.87,5 2
2,"Montreal, Canada",15094,2.87,4 33
3,"Beijing, China",10202,0.51,5 2
4,"Bogota, Colombia",2371,3.08,7 3
5,"Havana, Cuba",2448,2.90,7 1
6,"Prague, Czech Republic",1826,5.48,1 00
7,"Helsinki, Finland",2485,2.41,6 0
8,"Paris, France",9167,5.03,4 61
9,"Cologne, Germany",13313,2.02,2 69


Scraping the 'list of doping cases in athletics' chart from wikipedia

In [17]:
url = 'https://en.wikipedia.org/wiki/List_of_doping_cases_in_athletics'
req = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                         ca_certs=certifi.where())
res = req.request('GET', url)
soup = BeautifulSoup(res.data, 'html.parser')
contents = soup.find_all('table', class_='wikitable sortable')    

In [18]:
len(contents)

26

In [19]:
def wiki_scraper(content):
    a = []
    b = []
    c = []
    d = []
    e = []
    f = []
    g = []
    tables = []
    for table in content:
        tables.append(table)
        for table_ in tables:
            rows = table_.find_all('tr')
            for row in rows:
                cells = row.find_all('td')
                if len(cells) == 7:
                    a.append(cells[0].text)
                    b.append(cells[1].text)
                    c.append(cells[2].text)
                    d.append(cells[3].text)
                    e.append(cells[4].text)
                    f.append(cells[5].text)
                    g.append(cells[6].text)
    df = pd.DataFrame(a, columns=['name'])
    df['country'] = b
    df['event'] = c
    df['date_of_violation'] = d
    df['substance'] = e
    df['sanction'] = f
    df['references'] = g
    
    return df

In [20]:
doping = wiki_scraper(contents)

In [21]:
def col_format(df):
    df['name'] = [x.strip('\n') for x in df['name']]
    df['country'] = [x.strip('\n') for x in df['country'].values]
    df['date_of_violation'] = [x.strip('\n') for x in df['date_of_violation'].values]
    df['event'] = [x.strip('\n') for x in df['event'].values]
    df['substance'] = [x.strip('\n') for x in df['substance'].values]
    df['sanction'] = [x.strip('\n') for x in df['sanction'].values]
    df['references'] = [x.strip('\n') for x in df['references'].values]
    return df

In [22]:
doping = col_format(doping)

In [23]:
doping

Unnamed: 0,name,country,event,date_of_violation,substance,sanction,references
0,Nunu Abashidze,Soviet Union,Shot put,1981,,,[1][2]
1,Ahmed Abd El Raouf,Egypt,Hammer throw,2008,Norandrosterone,2 years,[3][4]
2,Inga Abitova,Russia,Long distance,2009,Biological passport anomalies,2 years,[5][6][7]
3,Folashade Abugan,Nigeria,Sprinting,2010,Testosterone prohormone,2 years,[8][9][10]
4,Ibrahim Mohamed Aden,Somalia,Middle distance,1999,Ephedrine,Public warning,[11][12][13]
...,...,...,...,...,...,...,...
14482,Zhou Wei,China,Sprinting,2000,,2 years,[264]
14483,Zohar Zimro,Israel,Marathon,2012,,Public reprimand,[219]
14484,Yevgeniya Zinurova,Russia,Middle distance,2010,Biological passport abnormalities,2 years,[516]
14485,Khalid Zoubaa,France,Long distance,2007,EPO,3 years,[887]


Reading in the athletes.csv file

In [24]:
athlete_df = pd.read_csv('../data/athlete_events.csv')
athlete_df

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271111,135569,Andrzej ya,M,29.0,179.0,89.0,Poland-1,POL,1976 Winter,1976,Winter,Innsbruck,Luge,Luge Mixed (Men)'s Doubles,
271112,135570,Piotr ya,M,27.0,176.0,59.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Individual",
271113,135570,Piotr ya,M,27.0,176.0,59.0,Poland,POL,2014 Winter,2014,Winter,Sochi,Ski Jumping,"Ski Jumping Men's Large Hill, Team",
271114,135571,Tomasz Ireneusz ya,M,30.0,185.0,96.0,Poland,POL,1998 Winter,1998,Winter,Nagano,Bobsleigh,Bobsleigh Men's Four,


Checking the count of athletes in the chart from wiki to see how many athletes match up in the athletes dataframe. The spelling and punctuation may be different in the dataframes. I am going to try using fuzzywuzzy to see if there is a matched name ratio above 70 percent in the dataframes

In [None]:
names = []
for name in doping['name']:
    for n in athlete_df['Name']:
        if n == name:
            names.append(n)
print(len(names))
            

In [288]:
def lookup(x, values):
    for value in values:
        if value.lower() in x.lower():
            return value

Defining both dataframes as df1 and df2 for easier manipulation

In [466]:
df_1 = athlete_df
df_2 = doping

In [None]:
df_1 = df_2['name'].apply(lambda x: lookup(x, df_1['Name']))
df_1

In [467]:
df1 = pd.DataFrame(df_1)
df2 = pd.DataFrame(df_2)

In [42]:
for x in df1['name'].unique():
    if x != None:
        print(x)

Mariem Alaoui Selsouli
Deni
Gloria Amuche Nwosu
Marvin Anderson
Seema Antil
Lu
Mark Anthony
Masoud Azizi
Ndiss Kaba Badji
Gach
Michal Balner
Falk Balzer
Jarrod Bannister
Dieter Baumann
Vadim Bavikin
Chantal Beaugeant
Ass
Yahya Berrabah
Gwen Berry
Cr
Antonella Bevilacqua
Uwe Beyer
Rajendra Bahadur Bhandari
Quentin Bigot
Yohan Blake
Brahim Boulami
Brau
Okkert Brits
Alicia Brown
Erik de Bruin
Dean Capobianco
LaMark Carter
Arnaud Casquette
Hath
David Chaussinand
Abdelatif Chemlal
Cherry
Silvano Chesani
Christophe Cheval
Dimitrios Chondrokoukis
Clement Chukwu
Daniela Costian
Peter Dajia
Dimitrios Delifotis
Ronald Desruelles
Ahmed Mohamed
Alberico Di Cecco
Rashid Shafi Al-Dosari
Troy Douglas
Rasa Drazdauskait
Afa Ismail
Latifa Essarokh
Davidson Ezinwa
Osmond Ezinwa
Hamid Ezzine
Martin Fagan
Elena Fidatov
Yuliya Fomenko
Allodin Fothergill
Geronimo Goeloe
Thomas Goller
Trevor Graham
Abdelkader Hachlaf
Halima Hachlaf
Ahmad Hazer
Knut Hjeltnes
Huang Qun
Regina Jacobs
Helena Javornik
Olivera Jevt

In [25]:
athlete_df.columns = [x.lower() for x in athlete_df.columns]

In [26]:
doping['flagged'] = [1 for x in doping.index]

In [27]:
df = pd.merge(athlete_df, doping, on='name', how='left')

In [28]:
df

Unnamed: 0,id,name,sex,age,height,weight,team,noc,games,year,...,sport,event_x,medal,country,event_y,date_of_violation,substance,sanction,references,flagged
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,...,Basketball,Basketball Men's Basketball,,,,,,,,
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,...,Judo,Judo Men's Extra-Lightweight,,,,,,,,
2,3,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,...,Football,Football Men's Football,,,,,,,,
3,4,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,...,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold,,,,,,,
4,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NED,1988 Winter,1988,...,Speed Skating,Speed Skating Women's 500 metres,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275389,135569,Andrzej ya,M,29.0,179.0,89.0,Poland-1,POL,1976 Winter,1976,...,Luge,Luge Mixed (Men)'s Doubles,,,,,,,,
275390,135570,Piotr ya,M,27.0,176.0,59.0,Poland,POL,2014 Winter,2014,...,Ski Jumping,"Ski Jumping Men's Large Hill, Individual",,,,,,,,
275391,135570,Piotr ya,M,27.0,176.0,59.0,Poland,POL,2014 Winter,2014,...,Ski Jumping,"Ski Jumping Men's Large Hill, Team",,,,,,,,
275392,135571,Tomasz Ireneusz ya,M,30.0,185.0,96.0,Poland,POL,1998 Winter,1998,...,Bobsleigh,Bobsleigh Men's Four,,,,,,,,


In [150]:
df['sport'].value_counts()

Athletics                    42534
Gymnastics                   26815
Swimming                     23257
Shooting                     11454
Cycling                      10907
Fencing                      10735
Rowing                       10625
Cross Country Skiing          9133
Alpine Skiing                 8829
Wrestling                     7169
Football                      6745
Sailing                       6586
Equestrianism                 6344
Canoeing                      6174
Boxing                        6047
Speed Skating                 5613
Ice Hockey                    5516
Hockey                        5417
Biathlon                      4893
Basketball                    4536
Weightlifting                 3952
Water Polo                    3846
Judo                          3801
Handball                      3665
Art Competitions              3578
Volleyball                    3413
Bobsleigh                     3074
Tennis                        2862
Diving              

In [74]:
import pandas as pd
pd.set_option('display.max_rows', 2800)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 10000)

In [29]:
df = df.drop(['sanction', 'references'], axis=1)

In [30]:
df.isna().sum()

id                        0
name                      0
sex                       0
age                    9474
height                60261
weight                62957
team                      0
noc                       0
games                     0
year                      0
season                    0
city                      0
sport                     0
event_x                   0
medal                235094
country              270834
event_y              270834
date_of_violation    270834
substance            270834
flagged              270834
dtype: int64

In [31]:
sum(df['flagged']==1)

4560

In [32]:
not_flagged = []
for x in df['flagged']:
    if x != 1:
        not_flagged.append(x)
print(len(not_flagged))
        

270834


In [491]:
df['year'].value_counts()

1992    16739
1988    14931
2000    14368
2008    14217
1996    14214
2016    14120
2004    14043
2012    13468
1972    12008
1984    11846
1976    10552
1968    10503
1964     9504
1952     9358
1960     9235
1980     8975
1948     7480
1936     7421
1956     6434
1924     5693
1928     5574
2014     4891
2010     4402
2006     4382
1920     4292
2002     4125
1912     4040
1998     3619
1932     3321
1994     3188
1908     3101
1900     1936
1906     1733
1904     1301
1896      380
Name: year, dtype: int64

In [33]:
df = df[df['sport']=='Athletics']

In [34]:
df

Unnamed: 0,id,name,sex,age,height,weight,team,noc,games,year,season,city,sport,event_x,medal,country,event_y,date_of_violation,substance,flagged
26,8,"Cornelia ""Cor"" Aalten (-Strannood)",F,18.0,168.0,,Netherlands,NED,1932 Summer,1932,Summer,Los Angeles,Athletics,Athletics Women's 100 metres,,,,,,
27,8,"Cornelia ""Cor"" Aalten (-Strannood)",F,18.0,168.0,,Netherlands,NED,1932 Summer,1932,Summer,Los Angeles,Athletics,Athletics Women's 4 x 100 metres Relay,,,,,,
57,18,Timo Antero Aaltonen,M,31.0,189.0,130.0,Finland,FIN,2000 Summer,2000,Summer,Sydney,Athletics,Athletics Men's Shot Put,,,,,,
94,31,Evald rma (rman-),M,24.0,174.0,70.0,Estonia,EST,1936 Summer,1936,Summer,Berlin,Athletics,Athletics Men's Pole Vault,,,,,,
95,32,Olav Augunson Aarnes,M,23.0,,,Norway,NOR,1912 Summer,1912,Summer,Stockholm,Athletics,Athletics Men's High Jump,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275357,135553,Galina Ivanovna Zybina (-Fyodorova),F,29.0,168.0,80.0,Soviet Union,URS,1960 Summer,1960,Summer,Roma,Athletics,Athletics Women's Shot Put,,,,,,
275358,135553,Galina Ivanovna Zybina (-Fyodorova),F,33.0,168.0,80.0,Soviet Union,URS,1964 Summer,1964,Summer,Tokyo,Athletics,Athletics Women's Shot Put,Bronze,,,,,
275378,135561,Frantiek Zyka,M,26.0,,,Czechoslovakia,TCH,1928 Summer,1928,Summer,Amsterdam,Athletics,Athletics Men's Marathon,,,,,,
275380,135563,Olesya Nikolayevna Zykina,F,19.0,171.0,64.0,Russia,RUS,2000 Summer,2000,Summer,Sydney,Athletics,Athletics Women's 4 x 400 metres Relay,Bronze,,,,,


In [35]:
df = df[df['year'] > 2003]

In [36]:
df = df[df['year'] < 2017]

In [37]:
sum(df['name'].value_counts() > 1)

2133

In [497]:
df['season'].value_counts()

Summer    11340
Name: season, dtype: int64

In [38]:
df = df.drop(columns=['country', 'event_y', 'date_of_violation'], axis=1)

In [39]:
flagged_df = df[df['flagged']==1]

In [40]:
flagged_df = flagged_df.drop_duplicates().reset_index(drop=True)

In [41]:
len(flagged_df)

146

In [640]:
flagged_df['event_x'].value_counts()

Athletics Men's 200 metres                     11
Athletics Women's 400 metres                   10
Athletics Women's 20 kilometres Walk           10
Athletics Men's 100 metres                      9
Athletics Women's 4 x 400 metres Relay          7
Athletics Men's Long Jump                       6
Athletics Men's 20 kilometres Walk              6
Athletics Men's 3,000 metres Steeplechase       6
Athletics Women's Marathon                      5
Athletics Men's Triple Jump                     5
Athletics Women's Discus Throw                  5
Athletics Men's Marathon                        4
Athletics Women's 800 metres                    4
Athletics Men's 4 x 400 metres Relay            4
Athletics Men's 4 x 100 metres Relay            4
Athletics Men's 400 metres                      4
Athletics Women's 3,000 metres Steeplechase     4
Athletics Men's 50 kilometres Walk              3
Athletics Men's 110 metres Hurdles              3
Athletics Men's 800 metres                      3


In [42]:
df = df.drop(['games'], axis=1)

In [43]:
len(df)

11340

In [622]:
len(df[df['flagged']==1])

2281

In [244]:
df_04 = df[df['year']==2004]

In [245]:
df_04 = df_04.drop_duplicates().reset_index(drop=True)

In [246]:
sum(df_04['flagged']==1)

37

In [247]:
len(df_04)

2175

In [227]:
#df_04.set_index('name', inplace=True)
df_04

Unnamed: 0,id,name,sex,age,height,weight,team,noc,year,season,city,sport,event_x,medal,substance,flagged
0,49,Moonika Aava,F,24.0,168.0,65.0,Estonia,EST,2004,Summer,Athina,Athletics,Athletics Women's Javelin Throw,,,
1,95,Georgia Abatzidou,F,35.0,155.0,43.0,Greece,GRE,2004,Summer,Athina,Athletics,Athletics Women's Marathon,,,
2,96,Carlos Rodolfo Abaunza Balladares,M,18.0,168.0,60.0,Nicaragua,NCA,2004,Summer,Athina,Athletics,Athletics Men's 100 metres,,,
3,256,"Abdihakim ""Abdi"" Abdirahman",M,27.0,178.0,61.0,United States,USA,2004,Summer,Athina,Athletics,"Athletics Men's 10,000 metres",,,
4,397,Mara Abel Diguez,F,29.0,163.0,46.0,Spain,ESP,2004,Summer,Athina,Athletics,Athletics Women's Marathon,,,
5,428,Elvan Abeylegesse,F,21.0,159.0,40.0,Turkey,TUR,2004,Summer,Athina,Athletics,"Athletics Women's 1,500 metres",,,
6,428,Elvan Abeylegesse,F,21.0,159.0,40.0,Turkey,TUR,2004,Summer,Athina,Athletics,"Athletics Women's 5,000 metres",,,
7,488,Ibrahim Mohamedin Aboubaker,M,21.0,190.0,65.0,Qatar,QAT,2004,Summer,Athina,Athletics,Athletics Men's Triple Jump,,,
8,600,Nagmeldin Ali Abubakr,M,18.0,172.0,63.0,Sudan,SUD,2004,Summer,Athina,Athletics,Athletics Men's 400 metres,,,
9,601,Sanna Abubkheet,F,19.0,157.0,52.0,Palestine,PLE,2004,Summer,Athina,Athletics,Athletics Women's 800 metres,,,


In [228]:
df_04.index

RangeIndex(start=0, stop=2175, step=1)

In [248]:
df_04.name = df_04.name.str.replace(r"\(.*\)","")

In [249]:
df_04.name

0                                        Moonika Aava
1                                   Georgia Abatzidou
2                   Carlos Rodolfo Abaunza Balladares
3                         Abdihakim "Abdi" Abdirahman
4                                    Mara Abel Diguez
5                                   Elvan Abeylegesse
6                                   Elvan Abeylegesse
7                         Ibrahim Mohamedin Aboubaker
8                               Nagmeldin Ali Abubakr
9                                     Sanna Abubkheet
10                                 Fartun Abukar Omar
11                                    Shigeru Aburaya
12                           Amelia Lynn "Amy" Acuff 
13                                 Luke Kendall Adams
14                       Valerie Kasanita Adams-Vili 
15                            Elisngela Maria Adriano
16                            Elisngela Maria Adriano
17                                    Khoudir Aggoune
18                          

In [234]:
df_04.head(10)

Unnamed: 0,id,name,sex,age,height,weight,team,noc,year,season,city,sport,event_x,medal,substance,flagged
0,49,Moonika Aava,F,24.0,168.0,65.0,Estonia,EST,2004,Summer,Athina,Athletics,Athletics Women's Javelin Throw,,,
1,95,Georgia Abatzidou,F,35.0,155.0,43.0,Greece,GRE,2004,Summer,Athina,Athletics,Athletics Women's Marathon,,,
2,96,Carlos Rodolfo Abaunza Balladares,M,18.0,168.0,60.0,Nicaragua,NCA,2004,Summer,Athina,Athletics,Athletics Men's 100 metres,,,
3,256,"Abdihakim ""Abdi"" Abdirahman",M,27.0,178.0,61.0,United States,USA,2004,Summer,Athina,Athletics,"Athletics Men's 10,000 metres",,,
4,397,Mara Abel Diguez,F,29.0,163.0,46.0,Spain,ESP,2004,Summer,Athina,Athletics,Athletics Women's Marathon,,,
5,428,Elvan Abeylegesse,F,21.0,159.0,40.0,Turkey,TUR,2004,Summer,Athina,Athletics,"Athletics Women's 1,500 metres",,,
6,428,Elvan Abeylegesse,F,21.0,159.0,40.0,Turkey,TUR,2004,Summer,Athina,Athletics,"Athletics Women's 5,000 metres",,,
7,488,Ibrahim Mohamedin Aboubaker,M,21.0,190.0,65.0,Qatar,QAT,2004,Summer,Athina,Athletics,Athletics Men's Triple Jump,,,
8,600,Nagmeldin Ali Abubakr,M,18.0,172.0,63.0,Sudan,SUD,2004,Summer,Athina,Athletics,Athletics Men's 400 metres,,,
9,601,Sanna Abubkheet,F,19.0,157.0,52.0,Palestine,PLE,2004,Summer,Athina,Athletics,Athletics Women's 800 metres,,,


In [252]:
names = []
for x in df_04.name:
    split = x.split()
    x = split[0] + ' ' + split[-1]
    names.append(x)


In [253]:
df_04['name'] = [x for x in names]

In [275]:
sum(df_04.name=='Aleen Bailey')

3

In [274]:
womens_100m_04

Unnamed: 0,rank,name,result
0,1.0,Yuliya Nestsiarenka,10.93
1,2.0,Lauryn Williams,10.96
2,3.0,Veronica Campbell-Brown,10.97
3,4.0,Ivet Lalova,11.0
4,5.0,Aleen Bailey,11.05
5,6.0,Sherone Simpson,11.07
6,7.0,Debbie Ferguson-Mckenzie,11.16
7,8.0,Latasha Colander,11.18


In [273]:
sum(mens_100m_04.name.isin(df_04.name))

0

In [None]:
in_both = [x for x in df_04.name if x.isin(mens_100m_04.name)]

In [280]:
df_04.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2175 entries, 0 to 2174
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         2175 non-null   int64  
 1   name       2175 non-null   object 
 2   sex        2175 non-null   object 
 3   age        2175 non-null   float64
 4   height     2174 non-null   float64
 5   weight     2174 non-null   float64
 6   team       2175 non-null   object 
 7   noc        2175 non-null   object 
 8   year       2175 non-null   int64  
 9   season     2175 non-null   object 
 10  city       2175 non-null   object 
 11  sport      2175 non-null   object 
 12  event_x    2175 non-null   object 
 13  medal      180 non-null    object 
 14  substance  37 non-null     object 
 15  flagged    37 non-null     float64
dtypes: float64(4), int64(2), object(10)
memory usage: 272.0+ KB


In [278]:
womens_100m_04.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   rank    8 non-null      object
 1   name    8 non-null      object
 2   result  8 non-null      object
dtypes: object(3)
memory usage: 320.0+ bytes


In [287]:
for x in mens_100m_04.name:
    print(type(x))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


In [675]:
mens_100m_04['name']

0       Justin Gatlin   
1    Francis Obikwelu   
2      Maurice Greene   
3      Shawn Crawford   
4        Asafa Powell   
5         Kim Collins   
6    Obadele Thompson   
Name: name, dtype: object

In [332]:
df_04.insert(15, 'result', 'Nan')

In [334]:
df_04.head()

Unnamed: 0,id,name,sex,age,height,weight,team,noc,year,season,city,sport,event_x,medal,substance,result,flagged
0,49,Moonika Aava,F,24.0,168.0,65.0,Estonia,EST,2004,Summer,Athina,Athletics,Athletics Women's Javelin Throw,,,Nan,
1,95,Georgia Abatzidou,F,35.0,155.0,43.0,Greece,GRE,2004,Summer,Athina,Athletics,Athletics Women's Marathon,,,Nan,
2,96,Carlos Balladares,M,18.0,168.0,60.0,Nicaragua,NCA,2004,Summer,Athina,Athletics,Athletics Men's 100 metres,,,Nan,
3,256,Abdihakim Abdirahman,M,27.0,178.0,61.0,United States,USA,2004,Summer,Athina,Athletics,"Athletics Men's 10,000 metres",,,Nan,
4,397,Mara Diguez,F,29.0,163.0,46.0,Spain,ESP,2004,Summer,Athina,Athletics,Athletics Women's Marathon,,,Nan,


In [345]:
df_1.where(df_1.values==df_2.values).notna()


ValueError: Array conditional must be same shape as self

In [344]:
test_2

In [314]:
pd.merge(df_04, mens_100m_04, left_on='name', right_on='name', how='left')

Unnamed: 0,id,name,sex,age,height,weight,team,noc,year,season,city,sport,event_x_x,medal,substance,flagged,rank,result,event_x_y
0,49,Moonika Aava,F,24.0,168.0,65.0,Estonia,EST,2004,Summer,Athina,Athletics,Athletics Women's Javelin Throw,,,,,,
1,95,Georgia Abatzidou,F,35.0,155.0,43.0,Greece,GRE,2004,Summer,Athina,Athletics,Athletics Women's Marathon,,,,,,
2,96,Carlos Balladares,M,18.0,168.0,60.0,Nicaragua,NCA,2004,Summer,Athina,Athletics,Athletics Men's 100 metres,,,,,,
3,256,Abdihakim Abdirahman,M,27.0,178.0,61.0,United States,USA,2004,Summer,Athina,Athletics,"Athletics Men's 10,000 metres",,,,,,
4,397,Mara Diguez,F,29.0,163.0,46.0,Spain,ESP,2004,Summer,Athina,Athletics,Athletics Women's Marathon,,,,,,
5,428,Elvan Abeylegesse,F,21.0,159.0,40.0,Turkey,TUR,2004,Summer,Athina,Athletics,"Athletics Women's 1,500 metres",,,,,,
6,428,Elvan Abeylegesse,F,21.0,159.0,40.0,Turkey,TUR,2004,Summer,Athina,Athletics,"Athletics Women's 5,000 metres",,,,,,
7,488,Ibrahim Aboubaker,M,21.0,190.0,65.0,Qatar,QAT,2004,Summer,Athina,Athletics,Athletics Men's Triple Jump,,,,,,
8,600,Nagmeldin Abubakr,M,18.0,172.0,63.0,Sudan,SUD,2004,Summer,Athina,Athletics,Athletics Men's 400 metres,,,,,,
9,601,Sanna Abubkheet,F,19.0,157.0,52.0,Palestine,PLE,2004,Summer,Athina,Athletics,Athletics Women's 800 metres,,,,,,


#### Next I will be importing the the datasets from the anti-doping database. I am going to be focusing on the years 2004-2016 for athletes caught using PED's. 

https://www.dopinglist.com/index.php?searchform=advanced&action=search&search=search&offset=0&slist=&ssport=74&scountry=&syear=2004&stype=&searchstring=&athleteRoles=&esearch=&suspensionSearch=&sgender=&organizations=&submit=Search#

In [35]:
from urllib.request import urlopen


In [48]:
url = 'https://www.olympic.org/athens-2004/athletics/marathon-men'
req = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                         ca_certs=certifi.where())
res = req.request('GET', url)
soup = BeautifulSoup(res.data, 'html.parser')
contents = soup.find_all(class_='table4')  

In [51]:
mens_marathon04 = fn.athens_scraper(contents)
mens_marathon04 = fn.content_cleaner(mens_marathon04)

In [52]:
mens_marathon04 

Unnamed: 0,rank,name,result
0,1.,Stefano Baldini,2:10:55
1,2.,Mebrahtom Keflezighi,2:11:29
2,3.,Vanderlei De Lima,2:12:11
3,4.,Jon Brown,2:12:26
4,5.,Shigeru Aburaya,2:13:11
...,...,...,...
76,77.,Alfredo Arevalo,2:34:02
77,78.,Antonio Zeferino,2:36:22
78,79.,Valery Pisarev,2:40:10
79,80.,Zepherinus Joseph,2:44:19


In [53]:
url = 'https://www.olympic.org/athens-2004/athletics/marathon-women#'
req = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                         ca_certs=certifi.where())
res = req.request('GET', url)
soup = BeautifulSoup(res.data, 'html.parser')
contents = soup.find_all(class_='table4')  

In [55]:
womens_marathon_04 = fn.athens_scraper(contents)

womens_marathon_04 = fn.content_cleaner(womens_marathon_04)

In [56]:
womens_marathon_04.columns = ['rank', 'name', 'result']

In [57]:
womens_marathon_04

Unnamed: 0,rank,name,result
0,1.,Mizuki Noguchi,2:26:20
1,2.,Catherine Ndereba,2:26:32
2,3.,Deena Kastor,2:27:20
3,4.,Elfenesh Alemu,2:28:15
4,5.,Reiko Tosa,2:28:44
...,...,...,...
62,62.,Ana Dias,3:08:11
63,63.,Inga Juodeskiene,3:09:18
64,64.,Mamokete Lechela,3:11:56
65,65.,Agueda Amaral,3:18:25


In [270]:
url = 'https://www.olympic.org/athens-2004/athletics/100m-men'
req = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                         ca_certs=certifi.where())
res = req.request('GET', url)
soup = BeautifulSoup(res.data, 'html.parser')
contents = soup.find_all(class_='table4')  

In [282]:
mens_100m_04 = fn.athens_scraper(contents)
mens_100m_04 = fn.content_cleaner(mens_100m_04)
mens_100m_04.columns = ['rank', 'name', 'result']
mens_100m_04.insert(3, "event_x", "Athletics Men's 100 metres")
mens_100m_04

Unnamed: 0,rank,name,result,event_x
0,1.0,Justin Gatlin,9.85,Athletics Men's 100 metres
1,2.0,Francis Obikwelu,9.86,Athletics Men's 100 metres
2,3.0,Maurice Greene,9.87,Athletics Men's 100 metres
3,4.0,Shawn Crawford,9.89,Athletics Men's 100 metres
4,5.0,Asafa Powell,9.94,Athletics Men's 100 metres
5,6.0,Kim Collins,10.0,Athletics Men's 100 metres
6,7.0,Obadele Thompson,10.1,Athletics Men's 100 metres


In [261]:
url = 'https://www.olympic.org/athens-2004/athletics/100m-women'
req = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                         ca_certs=certifi.where())
res = req.request('GET', url)
soup = BeautifulSoup(res.data, 'html.parser')
contents = soup.find_all(class_='table4')

In [262]:
womens_100m_04 = fn.athens_scraper(contents)
womens_100m_04 = fn.content_cleaner(womens_100m_04)
womens_100m_04.columns = ['rank', 'name', 'result']
womens_100m_04

Unnamed: 0,rank,name,result
0,1.0,Yuliya Nestsiarenka,10.93
1,2.0,Lauryn Williams,10.96
2,3.0,Veronica Campbell-Brown,10.97
3,4.0,Ivet Lalova,11.0
4,5.0,Aleen Bailey,11.05
5,6.0,Sherone Simpson,11.07
6,7.0,Debbie Ferguson-Mckenzie,11.16
7,8.0,Latasha Colander,11.18


In [88]:
womens_100m_04.set_index('name', inplace=True)

In [89]:
womens_100m_04

Unnamed: 0_level_0,rank,result
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Yuliya Nestsiarenka,1.0,10.93
Lauryn Williams,2.0,10.96
Veronica Campbell-Brown,3.0,10.97
Ivet Lalova,4.0,11.0
Aleen Bailey,5.0,11.05
Sherone Simpson,6.0,11.07
Debbie Ferguson-Mckenzie,7.0,11.16
Latasha Colander,8.0,11.18


In [92]:
womens_100m_04.loc[womens_100m_04.index.isin(df_04.index)]

Unnamed: 0_level_0,rank,result
name,Unnamed: 1_level_1,Unnamed: 2_level_1


In [119]:
df_04.combine_first(mens_100m_04)

Unnamed: 0_level_0,age,city,event_x,flagged,height,id,medal,noc,rank,result,season,sex,sport,substance,team,weight,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Aaron Cleare,21.0,Athina,Athletics Men's 4 x 400 metres Relay,,185.0,21967.0,,BAH,,,Summer,M,Athletics,,Bahamas,84.0,2004.0
Aaron Egbele,25.0,Athina,Athletics Men's 4 x 100 metres Relay,,181.0,31551.0,Bronze,NGR,,,Summer,M,Athletics,,Nigeria,79.0,2004.0
Abbas Samimi,27.0,Athina,Athletics Men's Discus Throw,,203.0,104984.0,,IRI,,,Summer,M,Athletics,,Iran,115.0,2004.0
Abdal Salam Al-Dabaji,25.0,Athina,Athletics Men's 800 metres,,176.0,1853.0,,PLE,,,Summer,M,Athletics,,Palestine,64.0,2004.0
Abdelatif Chemlal,22.0,Athina,"Athletics Men's 3,000 metres Steeplechase",1.0,170.0,20146.0,,MAR,,,Summer,M,Athletics,Norandrosterone,Morocco,60.0,2004.0
Abdelhakim Maazouz,28.0,Athina,"Athletics Men's 3,000 metres Steeplechase",,176.0,72801.0,,ALG,,,Summer,M,Athletics,,Algeria,62.0,2004.0
Abderrahim Al-Goumri,28.0,Athina,"Athletics Men's 5,000 metres",,167.0,42078.0,,MAR,,,Summer,M,Athletics,,Morocco,60.0,2004.0
Abderrahim El Haouzy,29.0,Athina,Athletics Men's 4 x 400 metres Relay,,187.0,31892.0,,FRA,,,Summer,M,Athletics,,France,72.0,2004.0
Abderrahmane Hammad Zaheer,27.0,Athina,Athletics Men's High Jump,,189.0,45314.0,,ALG,,,Summer,M,Athletics,,Algeria,70.0,2004.0
"Abdihakim ""Abdi"" Abdirahman",27.0,Athina,"Athletics Men's 10,000 metres",,178.0,256.0,,USA,,,Summer,M,Athletics,,United States,61.0,2004.0


In [426]:
url = 'https://www.olympic.org/athens-2004/athletics/5000m-men'
req = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                         ca_certs=certifi.where())
res = req.request('GET', url)
soup = BeautifulSoup(res.data, 'html.parser')
contents = soup.find_all(class_='table4')

In [428]:
mens_5000m_04 = athens_scraper(contents)
mens_5000m_04 = fn.content_cleaner(mens_5000m_04)
mens_5000m_04.columns = ['rank', 'name', 'mens_5000m_results04']
mens_5000m_04

Unnamed: 0,rank,name,mens_5000m_results04
0,1.0,Kenenisa Bekele,13:21.16
1,1.0,Ali Saidi-Sief,13:18.94
2,2.0,Gebre Egziabher Gebremariam,13:21.20
3,2.0,Eliud Kipchoge Rotich,13:19.01
4,3.0,Dejene Berhanu,13:19.42
5,3.0,Hicham El Guerrouj,13:21.87
6,4.0,John Kibowen,13:19.65
7,4.0,Craig Mottram,13:21.88
8,5.0,Abderrahim Goumri,13:20.03
9,5.0,Abraham Chebii,13:22.30


In [429]:
url = 'https://www.olympic.org/athens-2004/athletics/5000m-women'
req = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                         ca_certs=certifi.where())
res = req.request('GET', url)
soup = BeautifulSoup(res.data, 'html.parser')
contents = soup.find_all(class_='table4')

In [430]:
womens_5000m_04 = athens_scraper(contents)
womens_5000m_04 = fn.content_cleaner(womens_5000m_04)
womens_5000m_04.columns = ['rank', 'name', 'womens_5000m_results04']
womens_5000m_04

Unnamed: 0,rank,name,womens_5000m_results04
0,1.0,Meseret Defar,14:45.65
1,2.0,Isabella Ochichi,14:48.19
2,3.0,Tirunesh Dibaba,14:51.83
3,4.0,Yelena Zadorozhnaya,14:55.52
4,5.0,Joanne Pavey,14:57.87
5,6.0,Gulnara Samitova,15:02.30
6,7.0,Irina Mikitenko,15:03.36
7,8.0,Yingjie Sun,15:07.23
8,9.0,Huina Xing,15:07.41
9,10.0,Sentayehu Ejigu,15:09.55


In [441]:
url = 'https://www.olympic.org/athens-2004/athletics/10000m-men'
req = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                         ca_certs=certifi.where())
res = req.request('GET', url)
soup = BeautifulSoup(res.data, 'html.parser')
mens_10000_contents = soup.find_all(class_='table4')

In [446]:
mens_10000m_04 = athens_scraper(mens_10000_contents)
mens_10000m_04 = fn.content_cleaner(mens_10000m_04)
mens_10000m_04.columns = ['rank', 'name', 'mens_10000m_results04']
mens_10000m_04

Unnamed: 0,rank,name,mens_10000m_results04
0,1.0,Kenenisa Bekele,27:05.10
1,2.0,Sileshi Sihine,27:09.39
2,3.0,Zersenay Tadese,27:22.57
3,4.0,Boniface Kiprop Toroitich,27:25.48
4,5.0,Haile Gebrselassie,27:27.70
5,6.0,John Cheruiyot Korir,27:41.91
6,7.0,Moses Mosop,27:46.61
7,8.0,Ismail Sghyr,27:57.09
8,9.0,Jose Manuel Martinez,27:57.61
9,10.0,Fabiano Joseph,28:01.94


In [447]:
url = 'https://www.olympic.org/athens-2004/athletics/10000m-women'
req = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                         ca_certs=certifi.where())
res = req.request('GET', url)
soup = BeautifulSoup(res.data, 'html.parser')
womens_10000_contents = soup.find_all(class_='table4')

In [449]:
womens_10000m_04 = athens_scraper(womens_10000_contents)
womens_10000m_04 = fn.content_cleaner(womens_10000m_04)
womens_10000m_04.columns = ['rank', 'name', 'womens_10000m_results04']
womens_10000m_04

Unnamed: 0,rank,name,womens_10000m_results04
0,1.0,Huina Xing,30:24.36
1,2.0,Ejegayehu Dibaba,30:24.98
2,3.0,Derartu Tulu,30:26.42
3,4.0,Werknesh Kidane,30:28.30
4,5.0,Lornah Kiplagat,30:31.92
5,6.0,Yingjie Sun,30:54.37
6,7.0,Jelena Prokopcuka,31:04.10
7,8.0,Lidiya Grigoryeva,31:04.62
8,9.0,Lucy Wangui Kabuu,31:05.90
9,10.0,Helena Javornik,31:06.63


In [122]:
df_04.columns

Index(['id', 'sex', 'age', 'height', 'weight', 'team', 'noc', 'year', 'season', 'city', 'sport', 'event_x', 'medal', 'substance', 'flagged', 'rank', 'result'], dtype='object')

In [136]:
def remove_middle_name(row):
    for x in df_04.index:
        split = x.split()
        if len(split) > 2:
            x = x.strip(split[1])

In [189]:
df_04

Unnamed: 0_level_0,id,sex,age,height,weight,team,noc,year,season,city,sport,event_x,medal,substance,flagged
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Moonika Aava,49,F,24.0,168.0,65.0,Estonia,EST,2004,Summer,Athina,Athletics,Athletics Women's Javelin Throw,,,
Georgia Abatzidou,95,F,35.0,155.0,43.0,Greece,GRE,2004,Summer,Athina,Athletics,Athletics Women's Marathon,,,
Carlos Rodolfo Abaunza Balladares,96,M,18.0,168.0,60.0,Nicaragua,NCA,2004,Summer,Athina,Athletics,Athletics Men's 100 metres,,,
"Abdihakim ""Abdi"" Abdirahman",256,M,27.0,178.0,61.0,United States,USA,2004,Summer,Athina,Athletics,"Athletics Men's 10,000 metres",,,
Mara Abel Diguez,397,F,29.0,163.0,46.0,Spain,ESP,2004,Summer,Athina,Athletics,Athletics Women's Marathon,,,
Elvan Abeylegesse,428,F,21.0,159.0,40.0,Turkey,TUR,2004,Summer,Athina,Athletics,"Athletics Women's 1,500 metres",,,
Elvan Abeylegesse,428,F,21.0,159.0,40.0,Turkey,TUR,2004,Summer,Athina,Athletics,"Athletics Women's 5,000 metres",,,
Ibrahim Mohamedin Aboubaker,488,M,21.0,190.0,65.0,Qatar,QAT,2004,Summer,Athina,Athletics,Athletics Men's Triple Jump,,,
Nagmeldin Ali Abubakr,600,M,18.0,172.0,63.0,Sudan,SUD,2004,Summer,Athina,Athletics,Athletics Men's 400 metres,,,
Sanna Abubkheet,601,F,19.0,157.0,52.0,Palestine,PLE,2004,Summer,Athina,Athletics,Athletics Women's 800 metres,,,


In [692]:
url = 'https://olympics.api.pressassociation.io/v3/games'
req = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                         ca_certs=certifi.where())
res = requests.request('GET', url)
print(res.text)
#soup = BeautifulSoup(res.data, 'html.parser')
#womens_10000_contents = soup.find_all(class_='table4')

<h1>596 Service Not Found</h1>


In [188]:
import requests

url = "https://api.dopinglist.com/search/GetAllCases"

querystring = {"limit":"100"}

headers = {
    'accept': "application/json",
    'apikey': "AIzaSyAxS-fiVv0YzBbx3fAMSmqutK-35rTVK7I"
    }

response = requests.request("GET", url, headers=headers, params=querystring)

print(response.text)

<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>401 Unauthorized</title>
</head><body>
<h1>Unauthorized</h1>
<p>This server could not verify that you
are authorized to access the document
requested.  Either you supplied the wrong
credentials (e.g., bad password), or your
browser doesn't understand how to supply
the credentials required.</p>
<p>Additionally, a 401 Unauthorized
error was encountered while trying to use an ErrorDocument to handle the request.</p>
<hr>
<address>Apache/2.4.18 (Ubuntu) Server at api.dopinglist.com Port 443</address>
</body></html>



In [694]:
import requests

url = "https://olympics.api.pressassociation.io/v3/games/games/discipline/discipline/#"

headers = {'accept': 'application/json'}

response = requests.request("GET", url, headers=headers)

print(response.text)

<h1>596 Service Not Found</h1>


In [168]:
br = mechanize.Browser()
br.set_handle_robots(False)

In [None]:
api_url = '{0}account'.format(api_url_base)

    response = requests.get(api_url, headers=headers)

    if response.status_code == 200:
        return json.loads(response.content.decode('utf-8'))
    else:
        return None

In [177]:
import requests

url = "https://api.dopinglist.com/search/GetAllCases"

headers = {'accept': 'application/json',
          'Username': 'jsonwong',
          'Password': '2020jsonwong=api'}

response = requests.request("GET", url, headers=headers)

print(response.text)

<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>401 Unauthorized</title>
</head><body>
<h1>Unauthorized</h1>
<p>This server could not verify that you
are authorized to access the document
requested.  Either you supplied the wrong
credentials (e.g., bad password), or your
browser doesn't understand how to supply
the credentials required.</p>
<p>Additionally, a 401 Unauthorized
error was encountered while trying to use an ErrorDocument to handle the request.</p>
<hr>
<address>Apache/2.4.18 (Ubuntu) Server at api.dopinglist.com Port 443</address>
</body></html>



In [187]:
import mechanize
from bs4 import BeautifulSoup
import urllib 
import http.cookiejar

cook = http.cookiejar.CookieJar()
req = mechanize.Browser()
req.set_cookiejar(cook)
req.set_handle_robots(False)
headers = {'username': 'jsonwong',
           'password': '2020jsonwong=api'}
req.open("https://api.dopinglist.com/search/GetAllCases")

req.select_form(nr=0)
req.form['username'] = 'jsonwong'
req.form['password'] = '2020jsonwong=api'
req.submit()

print(req.response().read())

httperror_seek_wrapper: HTTP Error 401: Unauthorized

In [194]:
api_url_base = 'https://api.dopinglist.com/search/GetAllCases'
def get_account_info():

    api_url = '{0}account'.format(api_url_base)

    response = requests.get(api_url_base, auth=('jsonwong', '2020jsonwong=api'))

    if response.status_code == 200:
        return json.loads(response.content.decode('utf-8'))
    else:
        return None

In [195]:
account_info = get_account_info()

if account_info is not None:
    print("Here's your info: ")
    for k, v in account_info['account'].items():
        print('{0}:{1}'.format(k, v))

else:
    print('[!] Request Failed')

[!] Request Failed
