* 1- Install Selenium with pip: <br> sudo pip3 install selenium
(Python 3.8)

* 2- Download ChromeDriver 86.0.4240.22 (for Chrome browser version 86)<br>
or download Geckodriver (for Firefox => buggy as of July 2019)

* 3- Move the executable file (chromedriver.exe) to /usr/local/bin so that the system can find it (on Linux)<br>
or pass its path explicitly (here called PATH_DRIVER)

Tutorials followed <br>
https://towardsdatascience.com/mastering-the-art-of-web-scraping-with-selenium-and-python-part-1-2-90a216199873 <br>
https://www.reddit.com/r/learnpython/comments/9zzfvl/how_do_i_scrape_a_website_with_a_login_page/ <br>
https://towardsdatascience.com/web-scraping-using-selenium-python-8a60f4cf40ab <br>
https://www.scrapehero.com/how-to-prevent-getting-blacklisted-while-scraping/

Documentation consulted <br>
https://selenium-python.readthedocs.io/locating-elements.html<br>
https://www.w3.org/2018/10/26-webdriver-minutes.html#item06<br>
https://selenium-python.readthedocs.io/waits.html
    
ElementNotInteractableException bug in geckodriver, potential causes:<br> 
    1- https://bugzilla.mozilla.org/show_bug.cgi?id=1445227 / https://github.com/mozilla/geckodriver/issues/1414
    <br>
    2- A waiting time needs to be added (implicit wait or explicit wait) #driver.implicitly_wait(400)<br>
    3- Hidden input + duplicated input element

In [1]:
## import relevant libraries
from datetime import datetime
from time import sleep
import pandas as pd
import numpy as np

from selenium import webdriver
from selenium.common import exceptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
## frequently used functions

def login_mpg(driver,email,password):
    '''Log in to the MPG website.

    driver : Selenium WebDriver instance
    email, password : strings typed into the login form

    Opens the login page, maximizes the window, fills the e-mail and
    password inputs of the first form and clicks its submit button.
    '''
    driver.get('https://mpg.football/?type=login')
    driver.maximize_window()

    def displayed(xpath):
        ''' Lazily yield the displayed elements matching xpath '''
        ## THE PAGE HOLDS 2 IDENTICAL ELEMENTS THAT ARE NOT DISTINGUISHABLE BY XPATH,
        ## so every match is fetched and only the displayed ones are used.
        ## find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which
        ## was removed in Selenium 4.
        for element in driver.find_elements(By.XPATH, xpath):
            if element.is_displayed():
                yield element

    for email_input in displayed("//form[1]//input[@type='email']"):
        email_input.send_keys(email)
    for password_input in displayed("//form[1]//input[@type='password']"):
        password_input.send_keys(password)
    for submit_button in displayed("//form[1]//button[@type='submit']"):
        submit_button.click()

def currrent_date_time():
    ''' Return the current local date and time formatted as day-month-year-hour-minute-second '''
    timestamp_format = "%d-%m-%Y-%H-%M-%S"
    now = datetime.now()
    return now.strftime(timestamp_format)

In [3]:
def scrap_match_data(LEAGUE_CODE, SEASON, NB_PLAYER, NB_DAY, USER_LOGIN, USER_PASSWORD, PATH_DRIVER,TIME_OUT=10): 
    '''Scrape the match results of a league from https://mpg.football/ 
       LEAGUE_CODE, USER_LOGIN, USER_PASSWORD, PATH_DRIVER : string
       SEASON, NB_PLAYER, NB_DAY : integer 
       TIME_OUT : seconds to wait for a match page to show the team names
       Returns a df indexed by 'match_id' with cols :
       ['player_home','player_away','score_home','score_away','bonus_home','bonus_away']

       Match ids are built as "<SEASON>_<day>_<match>" for day in 1..NB_DAY and
       match in 1..int(NB_PLAYER/2). When a match page does not show the team
       names within TIME_OUT seconds, the remaining matches of that day are skipped.
       The browser is always closed, even when scraping fails.
    '''   
    TEAM_CLASS = 'index__team___2teXs'
    BONUS_XPATH = ("//div[@class='index__bonusContent___ALDPF']"
                   "/div[@class='index__bonusBloc___pae9P' and position()={position}]"
                   "//span[@class='index__root___1mxjA index__span___2ImFl index__bonusLabelMobile___3rGrs index__uppercase___3C7cm']")
    SCORE_XPATH = "//div[@class='index__score___300rq animated slideInUp']"

    def bonus_label(driver, position):
        '''Return the text of the first bonus label found in the bonus bloc
        at the given position, or "" when the bloc holds no label.'''
        labels = driver.find_elements(By.XPATH, BONUS_XPATH.format(position=position))
        ## Taking the first label also covers the case of several matches, which
        ## previously left the bonus variable unassigned (UnboundLocalError).
        return labels[0].text if labels else ""

    def extract_match_data(driver):
        '''driver : Selenium driver showing a match detail page
        returns : list of strings
        [player_home, player_away, score_home, score_away, bonus_home, bonus_away]'''
        ## player name
        player_elements = driver.find_elements(By.CLASS_NAME, TEAM_CLASS)
        player_home = player_elements[0].text
        player_away = player_elements[1].text

        ## bonus : 2nd position is home player, 3rd position is away player
        bonus_home = bonus_label(driver, 2)
        bonus_away = bonus_label(driver, 3)

        ## SCORE
        score_elements = driver.find_elements(By.XPATH, SCORE_XPATH)
        score_home = score_elements[0].text
        score_away = score_elements[1].text
        return [player_home,player_away,score_home,score_away,bonus_home,bonus_away]
    
    driver = webdriver.Chrome(PATH_DRIVER)
    try:
        ## login 
        login_mpg(driver,USER_LOGIN,USER_PASSWORD)

        ## initialize matches DF
        matches = pd.DataFrame(columns = ['match_id','player_home','player_away','score_home','score_away','bonus_home','bonus_away'])
        matches.set_index('match_id',inplace=True)

        ## data parsing and retrieval
        NB_MATCH_PER_DAY=int(NB_PLAYER/2)
        for dayi in range(1,NB_DAY+1): 
            for matchj in range(1,NB_MATCH_PER_DAY+1): 
                match_id = str(SEASON) + "_" + str(dayi) + "_" + str(matchj)
                URL = "https://mpg.football/league/" + LEAGUE_CODE + "/results/detail/" + match_id + "?lang=fr-FR"
                driver.get(URL)
                try :
                    WebDriverWait(driver, TIME_OUT).until(EC.visibility_of_element_located((By.CLASS_NAME, TEAM_CLASS)))
                except exceptions.TimeoutException :
                    ## team names never showed up : skip the remaining matches of this day
                    break

                matches.loc[match_id] = extract_match_data(driver)
    finally:
        ## always release the browser, even when login or scraping raised
        driver.quit()
    return matches


def display_bonus_usage(matches,
                        bonus_name=('ZAHIA', 'UBER EATS', 'LA VALISE À NANARD', 'SUAREZ', "CHAPRON ROUGE",'MIROIR ALLIANZ', "TONTON PAT'"),
                        bonus_total=(1, 3, 1, 1, 1, 1, 1),
                        *, styled=False):
    ''' Compute, for every player and every bonus, how many bonuses were used.

    matches : df containing columns 'player_home','bonus_home','player_away','bonus_away'
              (an empty string or NaN in a bonus column means "no bonus used")
    bonus_name : sequence of strings containing the bonus names. 
    Ex in English : ['THE NAUGHTY WAG', 'RED BULL', 'WTF SUITCASE', 'SUAREZ', "CRAZY REF'",'MIRROR', "UNCLE PAT'"]
    bonus_total : sequence of integers, number of bonuses of each kind available
                  to a player, in the same order as bonus_name
    styled : when True, return a pandas Styler that colors in orange the
             'usage' cells different from '100.0%'; when False (default)
             return the plain dataframe

    Returns a df (or its Styler) sorted by player with cols :
    ['player_name','bonus_name','total_nber','total_used','usage']
    Raises ValueError when bonus_name and bonus_total have different lengths.
    '''
    bonus_name = list(bonus_name)
    bonus_total = list(bonus_total)
    if len(bonus_name) != len(bonus_total):
        raise ValueError("bonus_name and bonus_total must have the same length "
                         "(got %d and %d)" % (len(bonus_name), len(bonus_total)))

    def build_reference_table(bonus_name, bonus_total, matches):
        ''' REFERENCE TABLE (CARTESIAN PRODUCT OF ALL PLAYERS AND BONUSES) '''
        ## Adding a constant 'key' column on both df and merging on it <=> cartesian product
        bonus_ref = pd.DataFrame({"bonus_name": bonus_name, "total_nber": bonus_total})
        bonus_ref["key"] = 1

        ## pd.concat replaces Series.append, which was removed in pandas 2.0
        player_names = pd.concat([matches["player_away"], matches["player_home"]]).unique()
        players_ref = pd.DataFrame({"player_name": player_names})
        players_ref["key"] = 1

        return pd.merge(bonus_ref, players_ref, on='key')[['bonus_name','total_nber',"player_name"]]

    def etl_matches(matches):
        ''' DATA WRANGLING TO HAVE A DF WITH 3 COLUMNS 'player_name','bonus used','count' '''
        home = matches[['player_home','bonus_home']].rename(
            columns={'player_home': 'player_name', 'bonus_home': 'bonus used'})
        away = matches[['player_away','bonus_away']].rename(
            columns={'player_away': 'player_name', 'bonus_away': 'bonus used'})
        bonus = pd.concat([home, away])

        ## DELETE ROWS WITH NO BONUS
        bonus = bonus.replace("", np.nan).dropna(subset=["bonus used"])

        ## AGGREGATION : the constant 'count' column is what the count aggregator counts
        bonus = bonus.assign(count=1)
        bonus = bonus.groupby(['player_name','bonus used']).count()
        return bonus.reset_index()    # flatten the multiindex generated by the groupby

    def merge(bonus, ref):
        ''' REFERENCE TABLE MERGED WITH SCRAPED DATA '''
        merged = pd.merge(ref, bonus, how='left',
                          left_on = ['bonus_name',"player_name"],
                          right_on = ['bonus used',"player_name"])
        merged = merged[["player_name","bonus_name","total_nber","count"]]
        merged = merged.rename(columns={"count": "total_used"})
        ## no matching row <=> bonus never used, so the count is 0
        return merged.fillna({"total_used": 0})

    def compute_indicators(bonus):
        ''' COMPUTE USAGE INDICATOR AND FORMATTING '''
        usage_ratio = bonus["total_used"] / bonus["total_nber"]
        bonus = bonus.assign(
            total_used=pd.to_numeric(bonus["total_used"], downcast='integer'),   # integer formatting
            usage=usage_ratio.map(lambda x: "{0:.1f}%".format(x*100)),           # % formatting
        )
        ## stable sort : within a player, bonuses keep the order of bonus_name
        return bonus.sort_values(by='player_name', kind='mergesort')

    def color_not_fully_used(val):
        """ Takes a scalar and returns a string with 
        the css property `'color: orange'` for strings != '100.0%', black otherwise."""
        color = 'orange' if val != '100.0%' else 'black'
        return 'color: %s' % color

    ref = build_reference_table(bonus_name, bonus_total, matches)
    bonus = etl_matches(matches)
    bonus = merge(bonus, ref)
    bonus = compute_indicators(bonus) 

    if styled:
        styler = bonus.style
        ## Styler.map is the newer name of Styler.applymap; use whichever exists
        style_cells = getattr(styler, "map", None) or styler.applymap
        return style_cells(color_not_fully_used, subset=['usage'])
    return bonus

In [4]:
def scrap_team_data(LEAGUE_CODE, NB_PLAYER, USER_LOGIN, USER_PASSWORD, PATH_DRIVER,TIME_OUT=10,
                    PLAYER_NAME="les Cocos Singapouriens"): 
    '''Scrape the footballers listed on the "ranking/teams" page of a league on https://mpg.football/ 
       LEAGUE_CODE, USER_LOGIN, USER_PASSWORD, PATH_DRIVER : string
       NB_PLAYER : integer (not used by this function, kept for interface compatibility)
       TIME_OUT : maximum number of seconds to wait for the page to render
       PLAYER_NAME : string written in the 'player' column of every row
       Returns a df with cols : ['footballer','team','position','price','player','dt_extraction']
       Raises ValueError when the scraped columns do not have the same length.
       The browser is always closed, even when scraping fails.
    '''   
    FOOTBALLER_XPATH = "//div[@class='index__playerTitleTextStyle___1jw0j']"
    TEAM_POSITION_XPATH = "//div[@class='index__textNormal___xx1mi']"
    PRICE_XPATH = "//div[@class='index__textBold___3SH8v']"

    def texts(driver, xpath):
        ''' Return the text of every element matching xpath '''
        return [el.text for el in driver.find_elements(By.XPATH, xpath)]

    def page_is_rendered(driver):
        ''' True once footballers are listed and each has one price, one team and one position '''
        nb_footballer = len(driver.find_elements(By.XPATH, FOOTBALLER_XPATH))
        return (nb_footballer > 0
                and len(driver.find_elements(By.XPATH, PRICE_XPATH)) == nb_footballer
                and len(driver.find_elements(By.XPATH, TEAM_POSITION_XPATH)) == 2 * nb_footballer)

    def extract_team_data(driver):
        '''driver : Selenium driver showing the ranking/teams page
        returns : df with cols ['footballer','team','position','price'] '''
        ## footballer
        footballer = texts(driver, FOOTBALLER_XPATH)

        ## team and position : the elements alternate team, position, team, position, ...
        team_position = texts(driver, TEAM_POSITION_XPATH)
        team = team_position[0::2]
        position= team_position[1::2]

        ## price
        price = texts(driver, PRICE_XPATH)

        ## explicit check instead of assert, which is stripped when python runs with -O
        if not (len(footballer) == len(team) == len(position) == len(price)):
            raise ValueError("inconsistent scraped data : %d footballers, %d teams, %d positions, %d prices"
                             % (len(footballer), len(team), len(position), len(price)))

        return pd.DataFrame(data = {"footballer":footballer,"team":team,"position":position,"price":price})

    driver = webdriver.Chrome(PATH_DRIVER)
    try:
        ## login 
        login_mpg(driver,USER_LOGIN,USER_PASSWORD)

        ## data parsing and retrieval
        URL = "https://mpg.football/league/" + LEAGUE_CODE + "/ranking/teams"
        driver.get(URL)
        try:
            ## explicit wait bounded by TIME_OUT instead of a fixed sleep
            WebDriverWait(driver, TIME_OUT).until(page_is_rendered)
        except exceptions.TimeoutException:
            ## best effort : extract whatever is on the page ; extract_team_data
            ## raises ValueError if what was rendered is inconsistent
            pass

        teams = extract_team_data(driver)
        teams['player'] = PLAYER_NAME
        teams['dt_extraction'] = currrent_date_time()
    finally:
        ## always release the browser, even when login or scraping raised
        driver.quit()
    
    return teams

In [5]:
## INPUT FROM USER
LEAGUE_CODE="LNYBEV5F"
SEASON=3  # 3rd season
NB_PLAYER=8
NB_DAY=14 # last day to extract (days 1 to NB_DAY included are scraped)
USER_LOGIN='XXX'
USER_PASSWORD='XXX'
PATH_DRIVER = "C:/Users/WV6098/Downloads/Data_Scraping/chromedriver.exe" # path where the chrome driver is installed 

# Opens a browser, logs in and scrapes every match of the season (see scrap_match_data)
matches = scrap_match_data(LEAGUE_CODE, SEASON, NB_PLAYER, NB_DAY, USER_LOGIN, USER_PASSWORD, PATH_DRIVER)

### Save the data scraped from MPG (file name is suffixed with the current date and time)
# Comment out if the data has already been scraped and saved :
matches.to_csv("data/matches/matches"+currrent_date_time()+".csv", sep=';')

# Uncomment if the data is already scraped and saved (adapt the file name to the saved file ;
# the file is written with sep=';' and 'match_id' as index) :
#matches=pd.read_csv("data/matches/matches.csv", sep=';', index_col='match_id') 

matches.head()

Unnamed: 0_level_0,player_home,player_away,score_home,score_away,bonus_home,bonus_away
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3_1_1,les Cocos Singapouriens,Nolan Power,3,4,,
3_1_2,Flyers,The Last,0,1,,
3_1_3,Bebzer,FC Football,3,1,,
3_1_4,Manchester Bigcity,Woka woka,4,0,,
3_2_1,Nolan Power,Manchester Bigcity,4,2,,


In [7]:
# Opens a browser, logs in and scrapes the ranking/teams page (see scrap_team_data)
teams = scrap_team_data(LEAGUE_CODE, NB_PLAYER, USER_LOGIN, USER_PASSWORD, PATH_DRIVER)
# every row carries the same extraction timestamp, so the first one names the file
dt_extraction=teams['dt_extraction'][0]
# NOTE(review): latin1 cannot encode every character - confirm no footballer name falls outside it
teams.to_csv("data/teams/teams"+dt_extraction+".csv", sep=';',encoding='latin1')
teams.head()

Unnamed: 0,footballer,team,position,price,player,dt_extraction
0,Larsonneur,Brest,Gardien,39,les Cocos Singapouriens,28-12-2020-18-10-55
1,Ntumba,Dijon,Gardien,1,les Cocos Singapouriens,28-12-2020-18-10-55
2,Gomis,Rennes,Gardien,22,les Cocos Singapouriens,28-12-2020-18-10-55
3,Kamara,Marseille,Def. Cen.,16,les Cocos Singapouriens,28-12-2020-18-10-55
4,GonzÁLez,Marseille,Def. Cen.,15,les Cocos Singapouriens,28-12-2020-18-10-55


In [8]:
# Preview the bonus usage table ; requires `matches` to be defined by the scraping cell above
display_bonus_usage(matches).head()

NameError: name 'matches' is not defined

In [None]:
## TROUBLESHOOTING with driver 
## Logs in, opens the ranking/teams page and prints, for every element matching
## one of the scraped classes, whether it is displayed and what text it holds.
## The browser is left open for manual inspection : call driver.quit() when done.

driver = webdriver.Chrome(PATH_DRIVER)
login_mpg(driver,USER_LOGIN,USER_PASSWORD)

URL = "https://mpg.football/league/" + LEAGUE_CODE + "/ranking/teams"
driver.get(URL)
sleep(1)
classes = ["index__playerTitleTextStyle___1jw0j","index__textNormal___xx1mi","index__textBold___3SH8v"]
xpaths = ["//div[@class='"+ class_element+"']" for class_element in classes]

for xpath in xpaths:
    ## query the page once per xpath instead of once per element ; the former
    ## unconditional `break` made all the prints below unreachable
    for i, element in enumerate(driver.find_elements(By.XPATH, xpath)):
        if element.is_displayed():
            print('YES, element no '+ str(i))
            
        print('text is ' + element.text)
        print('-------')