# Introduction

The goal of this notebook is to analyse worldfootball.net urls and save down results page into tree structure. This will be used later on to get results and save it under a more readable format (csv).

The trees will be saved on the drive under a specific folder hierarchy:
zone -> competition -> season -> stage.html

for instance:
france -> ligue1 -> 2016-2017 -> Week01.html

Stage name is then the tree file name.

# Import

In [1]:
import lxml.html as lh
import lxml.etree as et
from selenium import webdriver
import time
import pickle
import traceback
import logging
import os.path
import configparser
import os
import re
import datetime

# Params

In [2]:
root_path = 'L:/Dev/Sandbox/Apps Development/SoccerWebApp/PySoccer/WorldFootballDotNet/'
conf_path = os.path.join(root_path, '_conf','worldfootball.ini')
log_path = os.path.join(root_path, '_logs')
website_url = 'https://www.worldfootball.net'

# Logging

In [3]:
logger = logging.getLogger('myapp')
timestamp = datetime.datetime.now().strftime('%H%M%S')
hdlr = logging.FileHandler(os.path.join(log_path,'worldfootballnet_TreeSaver_' + timestamp + '.log'))
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr) 
logger.setLevel(logging.DEBUG)

In [4]:
def logWrapper(message, level="info", print_console=True):
    if(print_console):
        print(message)
    if(level=="info"):
        logger.info(message)
    elif(level=="warning"):
        logger.warning(message)
    elif(level=="debug"):
        logger.debug(message)
    elif(level=="error"):
        logger.error(message)

In [5]:
#logger.error("test error")
#logger.warning("this is a warning")
#logger.info("this is just info")
#logger.debug("this is debug")
#logWrapper('hello world')
#logWrapper('message')

# Conf

In [6]:
class ConfHandler():

    dicSections = {}
    
    def __init__(self,file_full_path):

        if not os.path.exists(file_full_path):
            raise Exception('path not correct: {0}'.format(file_full_path))
        
        config = configparser.ConfigParser()
        config.read(file_full_path)
        
        for s in config.sections():
            dicParams = {}
            for p in config[s]:
                dicParams[p] = config[s][p]
            self.dicSections[s] = dicParams

    def get(self,section,param, is_param_list = False):
        if section not in self.dicSections.keys():
            return None
        if param not in self.dicSections[section].keys():
            return None
        if self.dicSections[section][param]=='':
            return None
        if(is_param_list):
            return self.dicSections[section][param].replace(' ','').split(',')
        return self.dicSections[section][param]
    
    def get_section(self,section):
        if section not in self.dicSections.keys():
            return None
        return self.dicSections[section]
    
    def get_sections(self):
        return sorted(list(self.dicSections.keys()))

In [7]:
# test conf
conf = ConfHandler(conf_path)

# return setting within section
print(conf.get('france','ligue1'))

# france section only
dic = conf.get_section('france')
print(dic)

# all sections
sections = conf.get_sections()
print(sections)

# filter section
print(conf.get('general', 'section_filter_for_tree_saving', True))

# this section doesn't exist, should return None:
print(conf.get('cup', 'club_cup_list', True))

# this setting doesn't exist within existing section, should return None:
print(conf.get('general', 'club_cup_list', True))

# setting exists, but is empty -> should return None
print(conf.get('general', 'test_empty_param'))

# setting exists, but is empty -> should return None
print(conf.get('general', 'test_empty_param', True))

fra-ligue-1
{'ligue1': 'fra-ligue-1', 'coupe-de-france': 'fra-coupe-de-france', 'coupe-de-la-ligue': 'fra-coupe-de-la-ligue', 'trophee-des-champions': 'fra-trophee-des-champions'}
['england', 'europe', 'france', 'general', 'germany', 'italy', 'netherlands', 'portugal', 'spain', 'world']
None
None
None
None
None


# Functions

## [selenium_url_to_tree] download url final output as a tree

In [8]:
# get url html as elementtree using selenium.webdriver
def selenium_url_to_tree(driver, url):
    driver.get(url)
    time.sleep(int(conf.get('general', 'connection_waiting_time')))
    htmlSource = driver.page_source
    tree = lh.fromstring(htmlSource)
    return tree

# HOW IT WORKS....
# open driver: that will open firefox window
#driver = webdriver.Firefox()

# construct the trees only once
#results_tree = selenium_url_to_tree(driver,week_url)

# close firefox window once done
#driver.quit()

In [9]:
# return html page as a tree.
# the tree object will be cached to drive so next call won't require web call
def get_tree_from_url(driver, filename, url):
    
    logWrapper("Trying to load url {0}...".format(url), level='info')
                            
    # file doesn't exist, we will have to do a web call
    if(not os.path.isfile(filename)):
        
        logWrapper("File don't exist: {0}".format(filename), level='info')
        
        # create folder if not done already
        if not os.path.exists(os.path.dirname(filename)):
            try:
                os.makedirs(os.path.dirname(filename))
            except OSError as exc: # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise
        
        # load up url in web browser
        logWrapper("Loading url in web browser: {0}".format(url), level='info')
        tree = selenium_url_to_tree(driver,url)
        
        # save it down to the drive
        tree.getroottree().write(filename)
        
        if(not os.path.isfile(filename)):
            raise
        
    # return the loaded tree
    return lh.fromstring(open(filename, 'r').read())

In [10]:
def get_result_url_from_tree(tree):
    return tree.find('.//div[@class="navibox2"]//div[@class="data"]/ul[2]/li/a').attrib['href']
    

In [11]:
# construct dictionary of {season: url}
def get_seasons_dic_from_tree(tree):
    options = tree.findall('.//select[@name="saison"]/option')
    dic = {}
    for option in options:
        season = option.text_content().replace('/','-')
        dic[season] = option.attrib['value']
    return dic

In [12]:
# construct dictionary of {stage: url}
def get_stages_dic_from_tree(tree):
    
    dic = {}
    
    is_cup = False
    
    # assume we are looking at a 'league' tree
    options = tree.findall('.//select[@name="runde"]/option')
    
    if len(options) == 0:
        # now trying to see if we are looking at a 'cup' tree
        options = tree.findall('.//select[@name="phase"]/option')
        
        if len(options) == 0:
            raise
        else:
            is_cup = True
    
    for option in options:
        stage = option.text_content()
        if(is_cup):
            dic[stage] = option.attrib['value']
        else:
            stage = stage[:stage.find('.')]
            dic['Week' + str('%02d' % int(stage))] = option.attrib['value']
    return dic

In [13]:
# use regex to find season in an url (ie: 2016-2017)
def match_season(url):
    matchObj = re.match(r'.*([1-3][0-9]{3}-[1-3][0-9]{3}).*', url)
    if not matchObj:
        # sometime we get only year instead of season
        matchObj = re.match(r'.*([1-3][0-9]{3}).*', url)
        if not matchObj:
            raise
    return matchObj.group(1)

In [14]:
# use regex to find stage in an url (ie: '1' or '38' or 'final')
def match_stage_in_url(url):
    
    # league url
    matchObj = re.match(r'.*[1-3][0-9]{3}-[1-3][0-9]{3}-spieltag/(\b\d{1,2}\b)/', url)
    if matchObj:
        return 'Week' + str('%02d' % int(matchObj.group(1)))
    
    # qualification url: https://www.worldfootball.net/schedule/wm-quali-suedamerika-2010-spieltag/18/
    matchObj = re.match(r'.*-[1-3][0-9]{3}-spieltag/(\b\d{1,2}\b)/', url)
    if matchObj:
        return 'Week' + str('%02d' % int(matchObj.group(1)))
    
    # round url: 
    matchObj = re.match(r'.*-[1-3][0-9]{3}-(.*)-runde/\b\d{1,2}\b/', url)
    if matchObj:
        return 'Round ' + matchObj.group(1).upper()
    
    # group url: https://www.worldfootball.net/schedule/wm-quali-europa-2016-2017-gruppe-a/0/
    matchObj = re.match(r'.*-[1-3][0-9]{3}-gruppe-(.*)/\b\d{1,2}\b/', url)
    if matchObj:
        return 'Group ' + matchObj.group(1).upper()
    
    # round group url: https://www.worldfootball.net/schedule/wm-quali-asien-2015-2017-3-runde-gruppe-a/2/
    matchObj = re.match(r'.*-[1-3][0-9]{3}-(.*)-runde-gruppe-(.*)/\b\d{1,2}\b/', url)
    if matchObj:
        return 'Round {0} Group {1}'.format(matchObj.group(1).upper(), matchObj.group(2).upper())
    
    # relegation url: https://www.worldfootball.net/schedule/wm-quali-asien-2011-2013-relegation/2/
    matchObj = re.match(r'.*-[1-3][0-9]{3}-relegation/\b\d{1,2}\b/', url)
    if matchObj:
        return 'Relegation'
    
    # cup url: final
    matchObj = re.match(r'.*-[1-3][0-9]{3}-(.*)/\b\d{1,2}\b/', url)
    if matchObj:
        stage = matchObj.group(1)
        if('finale' in stage or 'endspiel' in stage or 'endrunde' in stage or 'finalrunde' in stage):
            return 'Final'
        elif('entscheidungsspiel' in stage):
            return 'Replays'
        else:
            raise ValueError("Couldn't match stage name in url: {0}".format(url))

In [15]:
# return selected stage from the tree
def get_selected_stage_from_tree(tree):
    option = tree.find('.//select[@name="runde"]/option[@class="wahl"]')
    stage = option.text_content()[:option.text_content().find('.')]
    return 'Week' + str('%02d' % int(stage))

In [16]:
def check_stage_name(stage):
    
    stage_allowed_names = ['Week%%', 'Final', 'Semi-finals', 'Quarter-finals', 'Round %%', 'Round of%%', 'Group%%', '3rd place', 'Relegation']
    
    for pattern in stage_allowed_names:
        if('%%' in pattern):
            if pattern.replace('%%','') in stage:
                return
        else:
            if pattern == stage:
                return
    raise ValueError('stage name not allowed: {0}'.format(stage))

In [17]:
check_stage_name('Group A')
check_stage_name('Round of 16')
check_stage_name('3rd place')

In [18]:
def format_stage_name(stage):
    dic = {
        '3td place': '3rd place',
        'Third place': '3rd place',
        'Final round': 'Final',
        'Caribbean Quarter-finals': 'Quarter-finals',
        'Caribbean Semi-finals': 'Semi-finals',
        'Caribbean Final': 'Final'
        
    }
    
    if(stage in dic.keys()):
        return dic[stage]
    return stage

In [19]:
def get_sections_to_process(conf):
    sections = conf.get_sections()
    ignore_zones = conf.get('general', 'ignore_sections_for_tree_saving', True)
    zone_filter = conf.get('general', 'section_filter_for_tree_saving', True)
    if zone_filter != None:
        sections = list(set(sections) & set(zone_filter))
    elif ignore_zones != None:
        sections = list(set(sections) - set(ignore_zones))
    return sections

In [20]:
# run the whole tree saver process
def run_tree_saver(conf):

    season_min = int(conf.get('general', 'season_min'))
    season_max = int(conf.get('general', 'season_max'))
    
    sections = get_sections_to_process(conf)
    print('sections: {0}'.format(sections))        
    
    # open driver
    driver = webdriver.Firefox()
    
    for section in sections:

        #if(section in ignore_zones):
        #    continue
        
        logWrapper('##########################################################')
        logWrapper('processing section {0}'.format(section))
        logWrapper('##########################################################')
        
        dic = conf.get_section(section)

        for competition in dic.keys():

            logWrapper('**********************************************')
            logWrapper('processing competition {0}'.format(competition))
            logWrapper('**********************************************')
            
            # competition overview url
            ovw_url = website_url + '/competition/' + dic[competition] + '/'

            competition_path = os.path.join(root_path, section, competition)

            # get tree associated to overview url (from drive or from web)
            ovw_tree_file_full_path = os.path.join(competition_path, competition + '_overview.html')
            ovw_tree = get_tree_from_url(driver, ovw_tree_file_full_path, ovw_url)

            # get result url from overview url, that's the current result url (ie: current season, current stage)
            curr_result_url = website_url + get_result_url_from_tree(ovw_tree)

            # get tree associated to current result url
            curr_tree_full_path = os.path.join(competition_path, competition + '_current_result.html')
            curr_tree = get_tree_from_url(driver, curr_tree_full_path , curr_result_url)

            # get dic {season:url} from current result tree
            dic_seasons = get_seasons_dic_from_tree(curr_tree)
            
            for season in dic_seasons.keys():

                try:
                    # check if we process this season (ie: not too old)
                    if(int(season[:4]) < season_min or int(season[:4]) > season_max):
                        logWrapper('ignoring season {0}'.format(season))
                        continue
                    
                    logWrapper('--------------------------------')
                    logWrapper('processing season {0}'.format(season))
                    logWrapper('--------------------------------')
                    
                    season_path = os.path.join(competition_path, season)
                    
                    # default url when season is loaded, usually the final stage or last week of a season
                    season_default_url = website_url + dic_seasons[season]
                    season_default_full_path = os.path.join(season_path, 'season_default.html')
                    last_stage_tree = get_tree_from_url(driver, season_default_full_path , season_default_url)
                                        
                    # get dic {stage:url} from last stage tree
                    dic_stages = get_stages_dic_from_tree(last_stage_tree)

                    for stage_as_in_url in dic_stages.keys():

                        try:
                            logWrapper('processing {0}, season {1}, stage {2}'.format(competition, season, stage_as_in_url))

                            # this url is usually the last stage url for a given season (or the 'final' url for a cup competition)
                            curr_stage_url = website_url + dic_stages[stage_as_in_url]
                            #logWrapper('we are looking for this url: {0}'.format(curr_stage_url))

                            # get tree associated to current stage url
                            curr_stage_full_path = os.path.join(season_path, stage_as_in_url + '.html')
                            #logWrapper('should resolve to path: {0}'.format(curr_stage_full_path))

                            last_stage_tree = get_tree_from_url(driver, curr_stage_full_path , curr_stage_url)
                        except Exception as e:
                            logWrapper("Exception while processing stage {0}: {1}".format(stage, e), level='error')
                            continue
                    
                except Exception as e:
                    logWrapper("Exception while processing season {0}: {1}".format(season, e), level='error')
                    continue
                
    # close driver
    driver.quit()
    logWrapper("Process finished")
                    

In [22]:
run_tree_saver(ConfHandler(conf_path))

sections: ['spain', 'europe']
##########################################################
processing section spain
##########################################################
**********************************************
processing competition la-liga
**********************************************
Trying to load url https://www.worldfootball.net/competition/esp-primera-division/...
Trying to load url https://www.worldfootball.net/schedule/esp-primera-division-2017-2018-spieltag/2/...
ignoring season 2017-2018
--------------------------------
processing season 2016-2017
--------------------------------
Trying to load url https://www.worldfootball.net/schedule/esp-primera-division-2016-2017-spieltag_2/38/...
processing la-liga, season 2016-2017, stage Week01
Trying to load url https://www.worldfootball.net/schedule/esp-primera-division-2016-2017-spieltag_2/1/...
processing la-liga, season 2016-2017, stage Week02
Trying to load url https://www.worldfootball.net/schedule/esp-primera-divisio