# Introduction

In this notebook we retrieve Ligue 1 results from the ligue1.com website and save them as CSV files.

# Import

In [1]:
import lxml.html as lh
import lxml.etree as et
import urllib.request as ulib
import pandas as pd
from selenium import webdriver
import time
import pickle
import traceback
import logging
import os.path

# Params

In [2]:
# Folder that receives the generated CSV files and the scraper log file.
root_path = 'L:/Dev/Sandbox/Apps Development/SoccerStats/historical result files/France'

# Competition key used throughout the notebook -> path segment used in the website URLs.
dic_competition_url = {
    'ligue1': 'ligue1',
    'coupe-de-la-ligue': 'coupeLigue',
    'trophee-des-champions': 'tropheeChampions',
}

# presumably the known week/stage ids per competition -- not referenced by the
# visible functions; TODO confirm whether it is still needed.
week_counter = {
    'coupe-de-la-ligue': [47, 48, 56, 57, 58, 59, 60],
    'trophee-des-champions': [-1],
}

## Logging

In [3]:
# Module-wide logger: DEBUG and above go to a log file stored next to the CSV output.
logger = logging.getLogger('myapp')
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
log_path = os.path.abspath(os.path.join(root_path, 'ligue1comScoreRetriever.log'))

# logging.getLogger returns the same logger object every time this cell is run, so
# blindly calling addHandler would stack one more FileHandler per re-run and every
# record would then be written several times. Reuse the handler if it is already there.
hdlr = None
for existing_handler in logger.handlers:
    if isinstance(existing_handler, logging.FileHandler) and existing_handler.baseFilename == log_path:
        hdlr = existing_handler
        break
if hdlr is None:
    hdlr = logging.FileHandler(log_path)
    logger.addHandler(hdlr)
hdlr.setFormatter(formatter)

In [4]:
#logger.error("test error")
#logger.warning("this is a warning")
#logger.info("this is just info")
#logger.debug("this is debug")        

# Functions

## [save_obj] && [load_obj] save/load object in file

In [5]:
def save_obj(obj, name ):
    """Pickle `obj` to 'obj/<name>.pkl' (path relative to the current working directory).

    The target directory is created when it does not exist yet, so the function
    also works on a fresh checkout where the 'obj' folder has never been made.

    Parameters
    ----------
    obj : any picklable object
    name : str
        File name without the '.pkl' extension.
    """
    path = 'obj/' + name + '.pkl'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    """Return the object previously pickled in 'obj/<name>.pkl'.

    The path is relative to the current working directory; `name` is the file
    name without the '.pkl' extension.
    """
    pickle_path = 'obj/' + name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        # NOTE: unpickling can execute arbitrary code -- only load files you created yourself.
        loaded = pickle.load(handle)
    return loaded

## Create season dictionary

In [6]:
#dic = {'2017/2018': '101', '2016/2017': '100', '2015/2016': '84', '2014/2015': '83', '2013/2014': '82', '2012/2013': '81', '2011/2012': '80', '2010/2011': '79', '2009/2010': '78', '2008/2009': '77', '2007/2008': '76', '2006/2007': '75', '2005/2006': '74', '2004/2005': '73', '2003/2004': '72', '2002/2003': '71', '2001/2002': '70', '2000/2001': '69', '1999/2000': '68', '1998/1999': '67', '1997/1998': '66', '1996/1997': '65', '1995/1996': '64', '1994/1995': '63', '1993/1994': '62', '1992/1993': '61', '1991/1992': '60', '1990/1991': '59', '1989/1990': '58', '1988/1989': '57', '1987/1988': '56', '1986/1987': '55', '1985/1986': '54', '1984/1985': '53', '1983/1984': '52', '1982/1983': '51', '1981/1982': '50', '1980/1981': '49', '1979/1980': '48', '1978/1979': '47', '1977/1978': '46', '1976/1977': '45', '1975/1976': '44', '1974/1975': '43', '1973/1974': '42', '1972/1973': '41', '1971/1972': '40', '1970/1971': '39', '1969/1970': '38', '1968/1969': '37', '1967/1968': '36', '1966/1967': '35', '1965/1966': '34', '1964/1965': '33', '1963/1964': '32', '1962/1963': '31', '1961/1962': '30', '1960/1961': '29', '1959/1960': '28', '1958/1959': '27', '1957/1958': '26', '1956/1957': '25', '1955/1956': '24', '1954/1955': '23', '1953/1954': '22', '1952/1953': '21', '1951/1952': '20', '1950/1951': '19', '1949/1950': '18', '1948/1949': '17', '1947/1948': '16', '1946/1947': '15', '1945/1946': '14', '1938/1939': '7', '1937/1938': '6', '1936/1937': '5', '1935/1936': '4', '1934/1935': '3', '1933/1934': '2', '1932/1933': '1'}
#save_obj(dic, 'season_dic')

## Load season dictionary

In [7]:
# Season label -> website season id, read from 'obj/season_dic.pkl'.
# The cell that builds and saves this dictionary is commented out above, so the
# pickle file must already exist on disk for this line to succeed.
season_dic = load_obj('season_dic')

## [selenium_url_to_tree] download url final output as a tree

We want to retrieve the HTML of each URL as an ElementTree object so that we can parse it later. The problem is that the pages we are interested in use JavaScript to generate part of their HTML, so the basic `urlopen` approach won't work in that case; instead we load the page in a real browser through Selenium.


In [8]:
# get url html as elementtree using selenium.webdriver
def selenium_url_to_tree(driver, url, wait_seconds=5):
    """Load `url` in the given selenium driver and return the rendered page as an lxml tree.

    Parameters
    ----------
    driver : selenium webdriver instance (already opened by the caller)
    url : str
        Page to load.
    wait_seconds : int or float, optional
        Pause between requesting the page and reading its source, to leave the
        page's scripts time to run. Defaults to 5, the delay previously hard-coded.

    Returns
    -------
    The element returned by lxml.html.fromstring for the driver's page source.
    """
    driver.get(url)
    # fixed pause rather than an explicit wait condition: simple, but slow pages may need more
    time.sleep(wait_seconds)
    return lh.fromstring(driver.page_source)

# HOW IT WORKS....
# open driver: that will open firefox window
#driver = webdriver.Firefox()

# construct the trees only once
#results_tree = selenium_url_to_tree(driver,week_url)

# close firefox window once done
#driver.quit()

## [handle_accent] get rid of weird accent

In [9]:
# we need to find a much better way to handle all that (using unicode)
def handle_accent(s):
    """Replace known mis-encoded accent sequences in `s` with plain ASCII letters.

    The replacements are applied one after the other, in the order listed below.
    """
    # Order matters: the bare "ã" rule turns remaining "ã" into "a", which is what
    # lets the following "a«" / "a¢" / "a§" rules match.
    replacements = (
        ("ã¨", "e"),
        ("ã©", "e"),
        ("ã¯", "i"),
        ("ã", "a"),
        ("a«", "e"),
        ("a¢", "a"),
        ("a§", "c"),
    )
    for broken, plain in replacements:
        s = s.replace(broken, plain)
    return s

## [get_week_url] construct week url from season_id and week_id

In [10]:
def get_week_url(competition, season_id, week_id):
    """Build the calendar/results URL for one week of one season.

    `competition` must be a key of dic_competition_url; `season_id` and `week_id`
    are converted with str() and substituted into the URL fragment.
    """
    url = 'http://www.ligue1.com/' + dic_competition_url[competition] + '/calendrier_resultat#sai=%season_id%&jour=%week_id%'
    # fill the placeholders in the same order as before: season first, then week
    for placeholder, value in (("%season_id%", season_id), ("%week_id%", week_id)):
        url = url.replace(placeholder, str(value))
    return url

## [get_seasons_dictionary] construct dictionary of (season, season_id)

In [11]:
def get_seasons_dictionary(tree):
    """Build the {season label: season id} dictionary from the page's season selector.

    Parameters
    ----------
    tree : parsed page; must support findall() and yield elements exposing
        text_content() and attrib (lxml HTML elements do).

    Returns
    -------
    dict
        Text of each <option> under <select name="saison"> mapped to its 'value'
        attribute. Empty when the selector is not found.
    """
    # use the tree that was passed in (the previous code read an unrelated global `results_tree`)
    options = tree.findall('.//select[@name="saison"]/option')
    return {option.text_content(): option.attrib['value'] for option in options}

## [get_coupeligue_stages_dictionary] construct dictionary {id,name}

In [12]:
def get_coupeligue_stages_dictionary(tree):
    """Map each stage id to its stage name, read from the page's stage selector.

    Keys are the 'value' attributes (strings) of the <option> elements found under
    div#journee_select / select#journee; values are the option texts.
    """
    stage_options = tree.findall('.//div[@id="journee_select"]/select[@id="journee"]/option')
    return {option.attrib['value']: option.text_content() for option in stage_options}

## [get_week_nb] get number of week

In [13]:
def get_week_nb(tree):
    """Count the entries of the week list whose text starts with 'Week'.

    Looks at the <li> items under <ul class="limite_hauteur">; the text is tested
    as-is (no stripping), so an entry with leading whitespace is not counted.
    """
    week_count = 0
    for item in tree.findall('.//ul[@class="limite_hauteur"]/li'):
        if item.text_content()[:4] == 'Week':
            week_count += 1
    return week_count

## [get_scores] get scores for a single week in a season

In [14]:
# Column names used when turning scraped rows into a pandas DataFrame.
# The order matches the rows built by get_scores (the 'week' column receives the stage label).
cols = ['season', 'week', 'date', 'time', 'team_home', 'team_away', 'score_home', 'score_away', 'pen_home', 'pen_away']

def get_scores(tree, competition, season, week_id):
    """Extract the games listed on one calendar/results page.

    Parameters
    ----------
    tree : parsed page, as returned by selenium_url_to_tree
    competition : str
        Competition key ('ligue1', 'coupe-de-la-ligue' or 'trophee-des-champions');
        it decides how a lone <span> inside the score cell is interpreted.
    season : str
        Season label, a key of season_dic (e.g. '2016/2017').
    week_id : int
        Week/stage identifier of the page: 82 is labelled "Play-off", -1 "Final",
        any other value above 40 is looked up in the page's stage selector, and
        everything else becomes "Week<n>".

    Returns
    -------
    list of list
        One row per parsed game:
        [season, stage, date, time, team_home, team_away,
         score_home, score_away, pen_home, pen_away].
        Scores are kept as the strings found on the page; pen_home / pen_away are
        None when no penalty score was found.

    Rows whose kick-off time or score cannot be parsed are logged and skipped, so
    one broken row never aborts the whole page.
    """
    # get tables html element from the tree
    tables = tree.findall('.//div[@id="tableaux_rencontres"]//table')

    # get stage name from week id
    if week_id == 82:
        # ligue1 playoffs
        stage = "Play-off"
    elif week_id == -1:
        # trophee des champs
        stage = "Final"
    elif week_id > 40:
        # coupe de la ligue
        stage = get_coupeligue_stages_dictionary(tree)[str(week_id)]
    else:
        # ligue 1
        stage = "Week" + str(week_id)

    # log current process
    logger.info("Processing season {0}, {1}".format(season, stage))

    # store results
    data = []

    for table in tables:

        # get date: words 3, 4 and 5 of the table caption, joined in reverse order
        # (presumably day / month / year -- confirm against the live page)
        caption = table.find('.//caption').text_content().split()
        date = caption[5] + "-" + caption[4] + "-" + caption[3]

        for tr in table.findall('.//tbody/tr'):

            # kick-off time; named game_time so the `time` module is not shadowed
            try:
                game_time = tr.find('.//td[@class="horaire "]').text_content().strip()
            except Exception:
                # no regular time cell: the row is skipped in every case, silently
                # when it is a postponed game and with an error in the log otherwise
                try:
                    postponed_label = tr.find('.//td[@class="horaire reporte "]').text_content().strip()
                    if postponed_label != 'Postponed':
                        logger.error("time value not correct: {0}".format(postponed_label))
                except Exception:
                    # unknown error: log and continue
                    logger.error("Unknown exception while fetching time")
                continue

            try:
                team_home = tr.find('.//td[@class="domicile"]').text_content().strip()
                team_away = tr.find('.//td[@class="exterieur"]').text_content().strip()

                score = tr.find('.//td[@class="stats"]/a').text_content().strip()
                spanElem = tr.findall('.//td[@class="stats"]/a/span')

                # work out which <span>, if any, carries the penalty shoot-out score
                pen_span = None
                if len(spanElem) == 1:
                    # 1 span element: could be extra time indicator or penalty kick...
                    # - trophee des champions: always the penalty score (never any extra time)
                    # - coupe de la ligue since 2016/2017: no more extra time, so penalty score too
                    if (competition == 'trophee-des-champions'
                            or (competition == 'coupe-de-la-ligue'
                                and int(season_dic[season]) >= int(season_dic['2016/2017']))):
                        pen_span = spanElem[0]
                elif len(spanElem) == 2:
                    # 2 span elements: extra time indicator + penalty kicks, the latter comes second
                    pen_span = spanElem[1]

                scorePen = None
                if pen_span is not None:
                    scorePen = pen_span.text_content().strip()
                    # the span text is part of the link text: remove it from the main score
                    score = score.replace(scorePen, '')
                    scorePen = scorePen.replace('on pens', '')
                    scorePen = scorePen.replace(' ', '')

                score = score.replace('a.e.t.', '')
                score = score.replace(' ', '')

                score_parts = score.split('-')
                score_home = score_parts[0]
                score_away = score_parts[1]

                pen_home = None
                pen_away = None
                if scorePen is not None:
                    pen_parts = scorePen.split('-')
                    pen_home = pen_parts[0]
                    pen_away = pen_parts[1]

            except Exception:
                # better to not block on any game error: log the full traceback and skip the row.
                # format_exc() returns the traceback text (print_exc() returns None).
                logger.error("Exception while fetching main game data: {0}".format(traceback.format_exc()))
                continue

            #link = tr.find('.//td[@class="video"]/a').attrib['href']
            data.append([season, stage, date, game_time, team_home, team_away, score_home, score_away, pen_home, pen_away])

    return data

## [process_single_week] process a single week of data (web connection is wrapped inside)
## => "live" data

In [15]:
def process_single_week(competition, season, week_id):
    """Scrape one week of one season with a dedicated Firefox session.

    Parameters
    ----------
    competition : str
        Key of dic_competition_url.
    season : str
        Key of season_dic (e.g. '2012/2013').
    week_id : int
        Week/stage identifier passed to get_week_url and get_scores.

    Returns
    -------
    list of list
        The rows returned by get_scores for that page.
    """
    # get season id from season dictionary
    season_id = season_dic[season]
    # open driver: that will open firefox window
    driver = webdriver.Firefox()
    try:
        # get the tree from week url and process that week
        tree = selenium_url_to_tree(driver, get_week_url(competition, season_id, week_id))
        return get_scores(tree, competition, season, week_id)
    finally:
        # always close the firefox window, even when loading or parsing failed
        driver.quit()

## [season_df_to_csv] helper to save a season results from dataframe to csv file, with proper file path

In [16]:
def season_df_to_csv(df, competition, season):
    """Write a season DataFrame to <root_path>/<competition>/<competition>_<yyyy>_<yyyy>.csv.

    The two halves of the season label (split on '/') are joined with an
    underscore to form the file name; the DataFrame index is not written.
    """
    season_parts = season.split('/')
    csv_name = competition + '_' + season_parts[0] + '_' + season_parts[1] + '.csv'
    target_path = os.path.join(root_path, competition, csv_name)
    df.to_csv(target_path, index=False)

## [process_ligue1_seasons] process set of ligue1 seasons (historical data)

In [17]:
def process_ligue1_seasons(season_to_process):
    """Scrape every week of each given ligue1 season and save one CSV per season.

    Relies on a module-level `driver` (an open selenium webdriver) being defined
    before the call. A season is only saved when the number of scraped games
    matches the number expected from the week count; otherwise an error is
    logged and the season is skipped.
    """
    competition = 'ligue1'
    first_week_id = 1

    for season in season_to_process:
        season_id = season_dic[season]

        # the first week page is used twice: to read the number of weeks in the
        # season and as the first page of results
        first_tree = selenium_url_to_tree(driver, get_week_url(competition, season_id, first_week_id))
        week_nb = get_week_nb(first_tree)

        data = list(get_scores(first_tree, competition, season, first_week_id))

        # remaining weeks
        for week_id in range(2, week_nb + 1):
            week_tree = selenium_url_to_tree(driver, get_week_url(competition, season_id, week_id))
            data.extend(get_scores(week_tree, competition, season, week_id))

        # sanity check: with every team meeting every other team twice,
        # week_nb weeks imply week_nb/2 + 1 teams and week_nb * teams / 2 games
        nb_team = (week_nb/2) + 1
        nb_game = (week_nb * nb_team)/2

        if len(data) == nb_game:
            # check passed: save the season as csv through a pandas dataframe
            season_df_to_csv(pd.DataFrame(data, columns=cols), competition, season)
        else:
            logger.error("error while processing season {0}: data len ({1}) doesn't match number of game ({2}) ".format(season, len(data), nb_game))

## [process_playoffs] process play-off pages (works only for ligue1 competition)

In [18]:
def process_playoffs(season_to_process):
    """Scrape the ligue1 play-off page (week id 82) of each given season and save it as CSV.

    Relies on a module-level `driver` (an open selenium webdriver) being defined
    before the call. Seasons whose page cannot be parsed, contains no game, or
    contains more games than the allowed maximum are logged and skipped.
    """
    competition = 'ligue1'
    playoff_week_id = 82
    # there shouldn't be more than 10 games.. (usually maybe 6 games max, not sure...)
    nb_game = 10

    for season in season_to_process:

        # get season id
        season_id = season_dic[season]

        # get the tree from the play-off url (get_week_url needs the competition as first argument)
        tree = selenium_url_to_tree(driver, get_week_url(competition, season_id, playoff_week_id))

        try:
            # process play-off page (id=82)
            data = get_scores(tree, competition, season, playoff_week_id)
        except Exception:
            logger.error("error while processing play-off web page for season {0}: skip..".format(season))
            continue

        # check on data table
        if len(data) == 0:
            logger.error("no data to process for season {0}".format(season))
            continue
        elif len(data) > nb_game:
            logger.error("error while processing season {0}: data len ({1}) exceeds the max number of game we have set ({2}) ".format(season, len(data), nb_game))
            continue

        # if check was passed, we are going to save data as panda dataframe
        df = pd.DataFrame(data, columns=cols)

        # save df as csv file
        # NOTE(review): this is the same competition/season pair -- hence the same file
        # path -- as the one written by process_ligue1_seasons, so the regular-season
        # CSV gets overwritten. Confirm whether play-offs should go to their own file.
        season_df_to_csv(df, competition, season)

## [process_coupeligue_seasons] process set of coupe de la ligue seasons (historical data)

In [19]:
def process_coupeligue_seasons(season_to_process):
    """Scrape every stage of each given coupe de la ligue season and save one CSV per season.

    Relies on a module-level `driver` (an open selenium webdriver) being defined
    before the call. The list of stages is read from the stage selector of the
    first page (week id 47).
    """
    competition = 'coupe-de-la-ligue'
    first_week_id = 47

    for season in season_to_process:

        print('processing season {0}'.format(season))

        # get season id
        season_id = season_dic[season]

        # get the tree from first week url
        tree = selenium_url_to_tree(driver, get_week_url(competition, season_id, first_week_id))

        # get list of week_id
        week_id_list = [int(x) for x in get_coupeligue_stages_dictionary(tree).keys()]

        # process first week url which we already have
        data = list(get_scores(tree, competition, season, first_week_id))

        # browse remaining weeks of data: skip the page already processed by its id
        # rather than by its position, so the result does not depend on the order
        # of the entries in the selector
        remaining_week_ids = [week_id for week_id in week_id_list if week_id != first_week_id]
        for week_id in remaining_week_ids:
            tree = selenium_url_to_tree(driver, get_week_url(competition, season_id, week_id))
            data += get_scores(tree, competition, season, week_id)

        # check on data table
        # TO DO

        # save data as csv file through a pandas dataframe
        df = pd.DataFrame(data, columns=cols)
        season_df_to_csv(df, competition, season)

## [process_trophee_seasons] process set of trophee des champs seasons (historical data)

In [20]:
def process_trophee_seasons(season_to_process):
    """Scrape the trophee des champions page of each given season and save one CSV per season.

    Relies on a module-level `driver` (an open selenium webdriver) being defined
    before the call. Only one page (week id -1) is read per season.
    """
    competition = 'trophee-des-champions'
    # the only week id: trophee des champions is just one game
    final_week_id = -1

    for season in season_to_process:
        print('processing season {0}'.format(season))

        page_url = get_week_url(competition, season_dic[season], final_week_id)
        page_tree = selenium_url_to_tree(driver, page_url)
        rows = get_scores(page_tree, competition, season, final_week_id)

        # check on data table
        # TO DO

        # save the rows as csv file through a pandas dataframe
        season_df_to_csv(pd.DataFrame(rows, columns=cols), competition, season)

# Unit tests

## unit test for process_single_week function

In [21]:
def assert_scores(score, score_home, score_away, pen_home, pen_away):
    """Tell whether a scraped row carries the expected score and penalty score.

    `score` is a row as built by get_scores: positions 6/7 hold the home/away
    score and positions 8/9 the home/away penalty score, all as strings (or None
    for the penalties). Expected values are converted with str() before the
    comparison, except that two None penalties mean "no penalty score expected".
    """
    if pen_home is None and pen_away is None:
        expected_pen_home, expected_pen_away = None, None
    else:
        expected_pen_home, expected_pen_away = str(pen_home), str(pen_away)

    goals_match = score[6] == str(score_home) and score[7] == str(score_away)
    pens_match = score[8] == expected_pen_home and score[9] == expected_pen_away
    return goals_match and pens_match

def test_process_single_week():
    """Check a few known results against the live website, one page per test case.

    Each case gives the page to scrape and, for some row indexes of the result,
    the expected (score_home, score_away, pen_home, pen_away).
    """
    cases = [
        ('coupe-de-la-ligue', '2012/2013', 47, [
            (1, 3, 3, 5, 4),
            (2, 0, 0, 5, 4),
            (6, 2, 1, None, None),
        ]),
        ('coupe-de-la-ligue', '2016/2017', 47, [
            (0, 2, 5, None, None),
            (2, 0, 0, 5, 3),
            (7, 1, 1, 1, 3),
        ]),
        ('trophee-des-champions', '2001/2002', -1, [
            (0, 1, 4, None, None),
        ]),
        ('trophee-des-champions', '1995/1996', -1, [
            (0, 2, 2, 6, 5),
        ]),
        ('ligue1', '2008/2009', 1, [
            (4, 1, 0, None, None),
        ]),
    ]

    for competition, season, week_id, expected_rows in cases:
        data = process_single_week(competition, season, week_id)
        for row_index, score_home, score_away, pen_home, pen_away in expected_rows:
            assert assert_scores(data[row_index], score_home, score_away, pen_home, pen_away)

    print("all good!")

## Unit test for get_coupeligue_stages_dictionary function

In [22]:
def test_get_coupeligue_stages_dictionary():
    """Check the stage names read from two live coupe de la ligue pages (season 1994/1995)."""
    cases = [
        # test season 1994/1995, 1st Round
        ('http://www.ligue1.com/coupeLigue/calendrier_resultat#sai=63&jour=47', 47, '1st Round'),
        # test season 1994/1995, Round of 32
        ('http://www.ligue1.com/coupeLigue/calendrier_resultat#sai=63&jour=56', 56, 'Round of 32'),
    ]

    driver = webdriver.Firefox()
    try:
        for url, week_id, expected_stage in cases:
            tree = selenium_url_to_tree(driver, url)
            stage = get_coupeligue_stages_dictionary(tree)[str(week_id)]
            assert stage == expected_stage

        print("all good!")
    finally:
        # close the firefox window even when an assertion fails
        driver.quit()

## Run unit tests

In [23]:
#test_process_single_week()
#test_get_coupeligue_stages_dictionary()

# Test the API

## test tree creation via selenium plugin

In [24]:
#driver = webdriver.Firefox()
#tree = selenium_url_to_tree(driver,'http://www.ligue1.com/coupeLigue/calendrier_resultat#sai=63&jour=47')
#driver.quit()

## test process a single week with different competition inputs

In [25]:
# get season id from season dictionary
#season_id = season_dic['2012/2013']
# open driver: that will open firefox window
#driver = webdriver.Firefox()
# get the tree from week url
#tree = selenium_url_to_tree(driver,get_week_url('coupe-de-la-ligue', season_id, 47))
# we are done with firefox...
#driver.quit()

## process set of seasons/play-offs

In [26]:
#season_to_process = sorted(season_dic.keys())[-24:-1]

# coupe de la ligue: the 23 season labels that precede the last one in sorted order
all_seasons = sorted(season_dic)
season_to_process_coupeligue = all_seasons[-24:-1]

# trophee des champions: 1995/1996, then every season from 1997/1998 to 2016/2017
# (1996/1997 is not in the list)
season_to_process_trophee = ['1995/1996'] + [
    '{0}/{1}'.format(start_year, start_year + 1) for start_year in range(1997, 2017)
]

# manual: ligue1 seasons from 2009/2010 to 2016/2017
season_to_process = [
    '{0}/{1}'.format(start_year, start_year + 1) for start_year in range(2009, 2017)
]

for template, seasons in (
    ('seasons to process for ligue1:\n {0}\n', season_to_process),
    ('seasons to process for coupe de la ligue:\n {0}\n', season_to_process_coupeligue),
    ('seasons to process for trophee des champion:\n {0}\n', season_to_process_trophee),
):
    print(template.format(seasons))

seasons to process for ligue1:
 ['2009/2010', '2010/2011', '2011/2012', '2012/2013', '2013/2014', '2014/2015', '2015/2016', '2016/2017']

seasons to process for coupe de la ligue:
 ['1994/1995', '1995/1996', '1996/1997', '1997/1998', '1998/1999', '1999/2000', '2000/2001', '2001/2002', '2002/2003', '2003/2004', '2004/2005', '2005/2006', '2006/2007', '2007/2008', '2008/2009', '2009/2010', '2010/2011', '2011/2012', '2012/2013', '2013/2014', '2014/2015', '2015/2016', '2016/2017']

seasons to process for trophee des champion:
 ['1995/1996', '1997/1998', '1998/1999', '1999/2000', '2000/2001', '2001/2002', '2002/2003', '2003/2004', '2004/2005', '2005/2006', '2006/2007', '2007/2008', '2008/2009', '2009/2010', '2010/2011', '2011/2012', '2012/2013', '2013/2014', '2014/2015', '2015/2016', '2016/2017']



In [27]:
# open driver: that will open firefox window
#driver = webdriver.Firefox()

# handle ligue 1 competition
# process all seasons
#process_ligue1_seasons(season_to_process)
# process all ligue 1 play-offs (if any)
#process_playoffs(season_to_process)

# handle other french competitions available on www.ligue1.com
#process_coupeligue_seasons(season_to_process)
#process_trophee_seasons(season_to_process_trophee)

# we are done with firefox...
#driver.quit()

processing season 1995/1996
processing season 1997/1998
processing season 1998/1999
processing season 1999/2000
processing season 2000/2001
processing season 2001/2002
processing season 2002/2003
processing season 2003/2004
processing season 2004/2005
processing season 2005/2006
processing season 2006/2007
processing season 2007/2008
processing season 2008/2009
processing season 2009/2010
processing season 2010/2011
processing season 2011/2012
processing season 2012/2013
processing season 2013/2014
processing season 2014/2015
processing season 2015/2016
processing season 2016/2017


## Inspect dataframe

In [28]:
#dtypeCount =[df.iloc[:,i].apply(type).value_counts() for i in range(df.shape[1])]
#dtypeCount