In [2]:
import pandas as pd
import numpy as np

import requests
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


In [3]:
# league_dict: dictionary with all years to be scrapped for the league along with unique URL CODE for season
# code: integer with unique URL CODE for league
# league: string with Full League Name
# gender: string "M" or "F"


def get_data(league_dict, code, league, gender):
    
    driver = webdriver.Chrome(r'D:\Justin\Downloads\chromedriver_win32\chromedriver')  # Optional argument, if not specified will search path.
    delay = 3

    data_types = [['stats', 'standard'], ['shooting', 'shooting'],['passing', 'passing'],
                  ['passing_types','passing_types'],['gca', 'gca'],['defense', 'defense'],['possession', 'possession'],
                  ['misc', 'misc']]
    
    df_list = []
    for year in league_dict:
        print('NEW YEAR: ', year)
        year_df = pd.DataFrame()
        for type in data_types:
            #print(code)
            url = "https://fbref.com/en/comps/"+ str(code) + "/" + str(league_dict[year]) + "/" + type[0] + "/"
            #print(url)
            try:
                driver.get(url)
                myElem = WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.ID, 'stats_'+type[1])))
                df = pd.read_html(myElem.get_attribute('outerHTML'), header = (0,1))[0]
#                 print(type[1], len(df))
                for col in df.columns:
                    year_col = col
                    if 'Unnamed' in col[0]:
                        year_col =  col[1]
                    if year_col not in year_df.columns:
                        year_df[year_col] = df[col]
            except Exception as e:
                print('not found')
                print(year, type)
        
        try:
            year_df.drop(labels = list(range(25, len(year_df), 26)), axis = 0, inplace = True)
            year_df.drop(labels = 'Matches', axis = 1, inplace = True)
        except:
            print('idfk')

        year_df['Season'] = str(year)
        year_df['League'] = league
        year_df['Gender'] = gender
        df_list.append(year_df)
        
    driver.quit()
    full_df = pd.concat(df_list, axis = 0, join = "outer", sort = False)
    return full_df

In [4]:
###### LEAGUE DICTIONARY WHERE KEY = Year  and VALUE = URL CODE for season
belgium_ids = {'2020-2021': ''
              ,'2019-2020': 3216
              ,'2018-2019': 2246
              ,'2017-2018': 1619
              ,'2016-2017': 1514
              ,'2015-2016': 1455
              ,'2014-2015': 721
              ,'2013-2014': 657
              ,'2012-2013': 591
              ,'2011-2012': 523
              ,'2010-2011': 456
              ,'2009-2010': 388
              ,'2008-2009': 329
              ,'2007-2008': 273
              ,'2006-2007': 222
              ,'2005-2006': 177
              ,'2004-2005': 140
              ,'2003-2004': 106}       
dutch_ids = {'2020-2021': ''
            ,'2019-2020': 3269
            ,'2018-2019': 2231
            ,'2017-2018': 1625
            ,'2016-2017': 1520
            ,'2015-2016': 1462
            ,'2014-2015': 728
            ,'2013-2014': 664
            ,'2012-2013': 597
            ,'2011-2012': 529
            ,'2010-2011': 462
            ,'2009-2010': 394
            ,'2008-2009': 333
            ,'2007-2008': 277
            ,'2006-2007': 224
            ,'2005-2006': 179
            ,'2004-2005': 142
            ,'2003-2004': 108
            ,'2002-2003': 80
            ,'2001-2002': 61
            ,'2000-2001': 46}
russia_ids = {'2020-2021': ''
            ,'2019-2020': 3285
            ,'2018-2019': 2371
            ,'2017-2018': 1646
            ,'2016-2017': 1541
            ,'2015-2016': 1482
            ,'2014-2015': 748
            ,'2013-2014': 683
            ,'2012-2013': 617
            ,'2011-2012': 551
            ,'2010': 446
            ,'2009': 378
            ,'2008': 320
            ,'2007': 264
            ,'2006': 214
            ,'2005': 171
            ,'2004': 135
            ,'2003': 102
            ,'2002': 77
            ,'2001': 58}
korea_ids = {'2020': ''
            ,'2019': 3930
            ,'2018': 1771
            ,'2017': 1575
            ,'2016': 1508
            ,'2015': 1230
            ,'2014': 713}
australia_ids = {'2019-2020': ''
            ,'2018-2019': 2381
            ,'2017-2018': 1617
            ,'2016-2017': 1512
            ,'2015-2016': 1453
            ,'2014-2015': 713
            ,'2013-2014': 655
            ,'2012-2013': 588
            ,'2011-2012': 520
            ,'2010-2011': 453
            ,'2009-2010': 385
            ,'2008-2009': 327
            ,'2007-2008': 271
            ,'2006-2007': 220
            ,'2005-2006': 175}

In [5]:
belgium = get_data(belgium_ids, 37, "Belgian First Division A", "M")
belgium.head()

NEW YEAR:  2020-2021
NEW YEAR:  2019-2020


KeyboardInterrupt: 

In [5]:
belgium.to_csv('belgium_scraped.csv', sep='|', index=False)

In [6]:
dutch = get_data(dutch_ids, 23, 'Dutch Eredivise', "M")
dutch.head()

NEW YEAR:  2020-2021
NEW YEAR:  2019-2020
NEW YEAR:  2018-2019
NEW YEAR:  2017-2018
NEW YEAR:  2016-2017
NEW YEAR:  2015-2016
NEW YEAR:  2014-2015
NEW YEAR:  2013-2014
NEW YEAR:  2012-2013
NEW YEAR:  2011-2012
NEW YEAR:  2010-2011
NEW YEAR:  2009-2010
NEW YEAR:  2008-2009
NEW YEAR:  2007-2008
NEW YEAR:  2006-2007
NEW YEAR:  2005-2006
NEW YEAR:  2004-2005
NEW YEAR:  2003-2004
NEW YEAR:  2002-2003
NEW YEAR:  2001-2002
NEW YEAR:  2000-2001


Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,"(Playing Time, MP)","(Playing Time, Starts)","(Playing Time, Min)",...,"(Performance, Off)","(Performance, Crs)","(Performance, Int)","(Performance, TklW)","(Performance, PKwon)","(Performance, PKcon)","(Performance, OG)",Season,League,Gender
0,1,Dirk Abels,nl NED,DF,Sparta R'dam,23,1997,3,1,148,...,0,3,1,2,,,0,2020-2021,Dutch Eredivise,M
1,2,Zakaria Aboukhlal,nl NED,"MF,FW",AZ Alkmaar,20,2000,2,2,170,...,1,1,1,0,,,0,2020-2021,Dutch Eredivise,M
2,3,Rohat Agca,nl NED,MF,Heracles Almelo,18,2001,2,0,35,...,0,0,0,1,,,0,2020-2021,Dutch Eredivise,M
3,4,Hamdi Akujobi,nl NED,MF,Heerenveen,20,2000,1,0,45,...,0,0,0,2,,,0,2020-2021,Dutch Eredivise,M
4,5,Edson Álvarez,mx MEX,MF,Ajax,22,1997,4,4,291,...,0,0,4,4,,,0,2020-2021,Dutch Eredivise,M


In [7]:
dutch.to_csv('dutch_scraped.csv', sep='|', index=False)

In [8]:
russia = get_data(russia_ids, 30, 'Russian Premier League', "M")
russia.head()

NEW YEAR:  2020-2021
NEW YEAR:  2019-2020
NEW YEAR:  2018-2019
NEW YEAR:  2017-2018
NEW YEAR:  2016-2017
NEW YEAR:  2015-2016
NEW YEAR:  2014-2015
NEW YEAR:  2013-2014
idfk
NEW YEAR:  2012-2013
idfk
NEW YEAR:  2011-2012
idfk
NEW YEAR:  2010
idfk
NEW YEAR:  2009
idfk
NEW YEAR:  2008
idfk
NEW YEAR:  2007
idfk
NEW YEAR:  2006
idfk
NEW YEAR:  2005
idfk
NEW YEAR:  2004
idfk
NEW YEAR:  2003
idfk
NEW YEAR:  2002
idfk
NEW YEAR:  2001
idfk


Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,"(Playing Time, MP)","(Playing Time, Starts)","(Playing Time, Min)",...,"(Performance, Off)","(Performance, Crs)","(Performance, Int)","(Performance, TklW)","(Performance, PKwon)","(Performance, PKcon)","(Performance, OG)",Season,League,Gender
0,1,Oliver Abildgaard,dk DEN,MF,Rubin Kazan,24,1996,10,10,900,...,0,4,25,18,,,0,2020-2021,Russian Premier League,M
1,2,Arsen Adamov,ru RUS,DF,Akhmat Grozny,20,1999,2,0,14,...,0,0,1,1,,,0,2020-2021,Russian Premier League,M
2,3,Guram Adzhoyev,ru RUS,FW,Arsenal Tula,25,1995,3,0,12,...,0,0,0,0,,,0,2020-2021,Russian Premier League,M
3,4,Gamid Agalarov,ru RUS,FW,Ufa,20,2000,5,1,111,...,0,0,1,0,,,0,2020-2021,Russian Premier League,M
4,5,Mikhail Ageev,ru RUS,FW,Loko Moscow,20,2000,1,0,1,...,0,0,0,0,,,0,2020-2021,Russian Premier League,M


In [9]:
russia.to_csv('russia_scraped.csv', sep='|', index=False)

In [10]:
korea = get_data(korea_ids, 55, 'K League Classic', "M")
korea.head()

NEW YEAR:  2020
NEW YEAR:  2019
NEW YEAR:  2018
NEW YEAR:  2017
NEW YEAR:  2016
NEW YEAR:  2015
NEW YEAR:  2014


Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,"(Playing Time, MP)","(Playing Time, Starts)","(Playing Time, Min)",...,"(Performance, Off)","(Performance, Crs)","(Performance, Int)","(Performance, TklW)","(Performance, PKwon)","(Performance, PKcon)","(Performance, OG)",Season,League,Gender
0,1,Adriano,br BRA,FW,FC Seoul,32,1987,7,4,238,...,,,,,,,0,2020,K League Classic,M
1,2,Elías Aguilar,cr CRC,"FW,MF",Incheon United,28,1991,14,13,1092,...,,,,,,,0,2020,K League Classic,M
2,3,Ikrom Alibayev,uz UZB,MF,FC Seoul,26,1994,11,7,660,...,,,,,,,0,2020,K League Classic,M
3,4,Terry Antonis,au AUS,MF,Suw Bluewings,26,1993,15,6,693,...,,,,,,,0,2020,K League Classic,M
4,5,Rustam Ashurmatov,uz UZB,DF,Gwangju FC,23,1996,21,21,1889,...,,,,,,,0,2020,K League Classic,M


In [11]:
korea.to_csv('korea_scraped.csv', sep='|', index=False)

In [12]:
australia = get_data(australia_ids, 65, 'A-League', "M")
australia.head()

NEW YEAR:  2019-2020
NEW YEAR:  2018-2019
NEW YEAR:  2017-2018
NEW YEAR:  2016-2017
NEW YEAR:  2015-2016
NEW YEAR:  2014-2015
NEW YEAR:  2013-2014
idfk
NEW YEAR:  2012-2013
idfk
NEW YEAR:  2011-2012
idfk
NEW YEAR:  2010-2011
idfk
NEW YEAR:  2009-2010
idfk
NEW YEAR:  2008-2009
idfk
NEW YEAR:  2007-2008
idfk
NEW YEAR:  2006-2007
idfk
NEW YEAR:  2005-2006
idfk


Unnamed: 0,Rk,Player,Nation,Pos,Squad,Age,Born,"(Playing Time, MP)","(Playing Time, Starts)","(Playing Time, Min)",...,"(Performance, Off)","(Performance, Crs)","(Performance, Int)","(Performance, TklW)","(Performance, PKwon)","(Performance, PKcon)","(Performance, OG)",Season,League,Gender
0,1,Idrus Abdulahi,au AUS,DF,Melb City,15,2003,1,0,7,...,0,1,0,0,,,0,2019-2020,A-League,M
1,2,Yarad Abetew,au AUS,DF,Adelaide,20,1999,1,1,45,...,0,0,0,1,0.0,0.0,0,2019-2020,A-League,M
2,3,Matt Acton,au AUS,GK,Melb Victory,27,1992,5,5,450,...,0,0,0,0,,,0,2019-2020,A-League,M
3,4,Mohamed Adam,au AUS,"FW,MF",W Sydney,18,2000,19,5,606,...,4,17,2,7,0.0,0.0,0,2019-2020,A-League,M
4,5,Rahmat Akbari,au AUS,MF,Brisbane,19,2000,5,2,194,...,0,0,8,3,0.0,0.0,0,2019-2020,A-League,M


In [13]:
australia.to_csv('australia_scraped.csv', sep='|', index=False)

In [None]:
w