# Introduction

The goal of this notebook is to crawl worldfootball.net URLs and save each results page to disk as an HTML tree. These saved pages will be used later to extract the results and store them in a more readable format (CSV).

# Import

In [1]:
import lxml.html as lh
import lxml.etree as et
from selenium import webdriver
import time
import pickle
import traceback
import logging
import os.path
import configparser
import os
import re
import datetime

# Params

In [2]:
# Folder that holds the configuration, the log files and the cached html pages.
# NOTE(review): this is an absolute, machine-specific path; it has to be edited
# before the notebook can run on another machine.
root_path = 'L:/Dev/Sandbox/Apps Development/SoccerWebApp/PySoccer/WorldFootballDotNet/'
# ini file read by ConfHandler (sections and their competition entries)
conf_path = os.path.join(root_path, '_conf','worldfootball.ini')
# base url prepended to every relative link taken from the downloaded pages
website_url = 'https://www.worldfootball.net'

# Logging

In [3]:
# Configure the 'myapp' logger with a single DEBUG-level file handler.
logger = logging.getLogger('myapp')

# logging.getLogger returns the same object on every call, so re-running this
# cell would otherwise stack a new handler on top of the old ones and every
# message would be written several times. Drop (and close) any previous handler.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# one log file per run, suffixed with the time (HHMMSS) the cell was executed
timestamp = datetime.datetime.now().strftime('%H%M%S')
hdlr = logging.FileHandler(os.path.join(root_path,'worldfootballnet_TreeSaver_' + timestamp + '.log'))
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.DEBUG)

In [4]:
def logWrapper(message, level="info", print_console=True):
    """Send `message` to the module logger and, optionally, to the console.

    message       -- text to record
    level         -- one of "info", "warning", "debug" or "error"; any other
                     value is not written to the logger
    print_console -- when truthy the message is also printed
    """
    if print_console:
        print(message)

    # the supported level names are exactly the names of the logger methods
    if level in ("info", "warning", "debug", "error"):
        getattr(logger, level)(message)

In [5]:
#logger.error("test error")
#logger.warning("this is a warning")
#logger.info("this is just info")
#logger.debug("this is debug")
#logWrapper('hello world')
#logWrapper('message')

# Conf

In [6]:
class ConfHandler():
    """Load an ini file once and expose its content as plain dictionaries.

    The parsed content is stored per instance in `dicSections`, a dictionary
    {section name: {parameter name: raw string value}}.
    """

    def __init__(self,file_full_path):
        """Parse `file_full_path`.

        Raises FileNotFoundError (an Exception subclass) when the path does
        not exist.
        """
        if not os.path.exists(file_full_path):
            raise FileNotFoundError('path not correct: {0}'.format(file_full_path))

        config = configparser.ConfigParser()
        config.read(file_full_path)

        # Instance attribute on purpose: a class-level dictionary would be
        # shared by every ConfHandler, so sections from one file would leak
        # into handlers built from another file.
        self.dicSections = {
            section: dict(config[section]) for section in config.sections()
        }

    def get(self,section,param, is_param_list = False):
        """Return the value of `param` in `section`.

        With `is_param_list` set, all spaces are removed from the raw value
        and it is split on commas, so a list of strings is returned.
        Raises KeyError when the section or the parameter is unknown.
        """
        value = self.dicSections[section][param]
        if is_param_list:
            return value.replace(' ','').split(',')
        return value

    def get_section(self,section):
        """Return the {parameter: value} dictionary of `section`."""
        return self.dicSections[section]

    def get_sections(self):
        """Return the section names sorted alphabetically."""
        return sorted(self.dicSections)

In [7]:
# test conf
#conf = ConfHandler(conf_path)
#print(conf.get('france','ligue1'))
#dic = conf.get_section('france')
#print(dic)
#sections = conf.get_sections()
#print(sections)
#print(conf.get('cup', 'club_cup_list', True))

# Functions

## [selenium_url_to_tree] download url final output as a tree

In [8]:
# get url html as elementtree using selenium.webdriver
def selenium_url_to_tree(driver, url, wait_seconds=2):
    """Load `url` in the browser driven by `driver` and parse the page.

    driver       -- selenium webdriver (only `get` and `page_source` are used)
    url          -- address to load
    wait_seconds -- pause between the load and the read of the page source;
                    defaults to the 2 seconds that used to be hard-coded.
                    Pass 0 to skip the pause.

    Returns the result of lxml.html.fromstring on the page source.
    """
    driver.get(url)
    if wait_seconds > 0:
        time.sleep(wait_seconds)
    return lh.fromstring(driver.page_source)

# HOW IT WORKS....
# open driver: that will open firefox window
#driver = webdriver.Firefox()

# construct the trees only once
#results_tree = selenium_url_to_tree(driver,week_url)

# close firefox window once done
#driver.quit()

In [9]:
# return html page as a tree.
# the tree object will be cached to drive so next call won't require web call
def get_tree_from_url(driver, filename, url):
    """Return the page at `url` as a tree, using `filename` as an on-disk cache.

    driver   -- selenium webdriver, only used when the cache file is missing
    filename -- full path of the cache file
    url      -- address to download when the cache file is missing

    The returned tree is always parsed from the cache file, so a first call
    and a later cached call go through the same code path.
    Raises IOError when the page could not be saved to `filename`.
    """
    # file doesn't exist, we will have to do a web call
    if not os.path.isfile(filename):

        # create folder if not done already; exist_ok covers the case where the
        # folder appears between the check and the creation
        folder = os.path.dirname(filename)
        if folder:
            os.makedirs(folder, exist_ok=True)

        # load up url in web browser
        tree = selenium_url_to_tree(driver, url)

        # save it down to the drive
        tree.getroottree().write(filename)

        if not os.path.isfile(filename):
            raise IOError('could not save {0} to {1}'.format(url, filename))

    # return the loaded tree (the file is closed as soon as it has been read)
    with open(filename, 'r') as cached_file:
        return lh.fromstring(cached_file.read())

In [10]:
def get_result_url_from_tree(tree):
    """Return the href of the results link found in the navigation box of `tree`.

    The link is the anchor of the second list inside the "data" div of the
    "navibox2" div. Raises AttributeError when no such anchor exists.
    """
    link_xpath = './/div[@class="navibox2"]//div[@class="data"]/ul[2]/li/a'
    anchor = tree.find(link_xpath)
    return anchor.attrib['href']
    

In [11]:
# construct dictionary of {season: url}
def get_seasons_dic_from_tree(tree):
    """Map every entry of the "saison" drop-down of `tree` to its url.

    Keys are the option labels with '/' replaced by '-' (so they can be used
    as folder names); values are the options' "value" attributes.
    """
    return {
        entry.text_content().replace('/','-'): entry.attrib['value']
        for entry in tree.findall('.//select[@name="saison"]/option')
    }

In [12]:
# construct dictionary of {stage: url}
def get_stages_dic_from_tree(tree, is_cup = False):
    """Map every stage listed in the stage drop-down of `tree` to its url.

    tree   -- parsed results page
    is_cup -- cup pages use the "phase" drop-down and keep the option label
              as key; league pages use the "runde" drop-down and the key is
              'WeekNN', NN being the number in front of the first '.' of the
              label (the whole label when it contains no '.')

    Raises ValueError when a league label does not start with a number.
    """
    select_name = 'phase' if is_cup else 'runde'
    options = tree.findall('.//select[@name="{0}"]/option'.format(select_name))

    dic = {}
    for option in options:
        label = option.text_content()
        if is_cup:
            dic[label] = option.attrib['value']
        else:
            # partition keeps the full label when there is no '.', whereas
            # slicing with find() (-1 when absent) would drop the last digit
            week_number = int(label.partition('.')[0])
            dic['Week{0:02d}'.format(week_number)] = option.attrib['value']
    return dic

In [13]:
# use regex to find season in an url (ie: 2016-2017)
def match_season(url):
    """Return the season found in `url`.

    A 'YYYY-YYYY' season is looked for first; when there is none, a single
    'YYYY' year is accepted instead. Raises ValueError when neither is found.
    """
    matchObj = re.match(r'.*([1-3][0-9]{3}-[1-3][0-9]{3}).*', url)
    if not matchObj:
        # sometime we get only year instead of season
        matchObj = re.match(r'.*([1-3][0-9]{3}).*', url)
    if not matchObj:
        # a bare `raise` is only valid inside an except block; raise an
        # explicit error that names the offending url instead
        raise ValueError('no season found in url: {0}'.format(url))
    return matchObj.group(1)

In [14]:
# use regex to find stage in an url (ie: '1' or '38' or 'final')
def match_stage(url, is_cup):
    """Return the stage part of a results `url`.

    url    -- results page address
    is_cup -- league urls ('...YYYY-YYYY-spieltag/N/') give the week number N
              as a string; cup urls ('...-YYYY-<stage>/N/') give <stage>

    Raises ValueError when `url` does not follow the expected pattern.
    """
    if is_cup:
        pattern = r'.*-[1-3][0-9]{3}-(.*)/\b\d{1,2}\b/'
    else:
        pattern = r'.*[1-3][0-9]{3}-[1-3][0-9]{3}-spieltag/(\b\d{1,2}\b)/'

    matchObj = re.match(pattern, url)
    if not matchObj:
        # a bare `raise` is only valid inside an except block; raise an
        # explicit error that names the offending url instead
        raise ValueError('no stage found in url: {0}'.format(url))
    return matchObj.group(1)

In [15]:
# return selected stage from the tree
def get_selected_stage_from_tree(tree):
    """Return the stage currently selected in the "runde" drop-down as 'WeekNN'.

    NN is the number in front of the first '.' of the selected option's label
    (the whole label when it contains no '.').
    Raises AttributeError when no option carries the "wahl" class and
    ValueError when the label does not start with a number.
    """
    option = tree.find('.//select[@name="runde"]/option[@class="wahl"]')
    # partition keeps the full label when there is no '.', whereas slicing
    # with find() (-1 when absent) would drop the last digit
    week_number = int(option.text_content().partition('.')[0])
    return 'Week{0:02d}'.format(week_number)

In [16]:
# run the whole tree saver process
def run_tree_saver(conf):
    """Download and cache every results page of every configured competition.

    conf -- ConfHandler. Its 'general' section provides 'ignore_sections' and
            'club_cup_list' (comma separated lists) and 'season_min' (first
            season year to process). Every section not listed in
            'ignore_sections' maps a competition name to the competition id
            used in the website urls.

    Pages are saved under root_path/<section>/<competition>/[<season>/] and
    are only downloaded when the file is not there yet. A failure on one
    season or one stage is logged and the process moves on to the next one.
    The browser is always closed, even when an unexpected error stops the run.
    """
    sections = conf.get_sections()
    ignore_sections = conf.get('general', 'ignore_sections', True)
    club_cup_list = conf.get('general', 'club_cup_list', True)
    season_min = int(conf.get('general', 'season_min'))

    # open driver
    driver = webdriver.Firefox()

    try:
        for section in sections:

            if(section in ignore_sections):
                continue

            logWrapper('processing section {0}'.format(section))

            dic = conf.get_section(section)

            for competition in dic.keys():

                logWrapper('processing competition {0}'.format(competition))

                is_cup = dic[competition] in club_cup_list
                if(is_cup):
                    logWrapper('this is a cup competition')

                # competition overview url
                ovw_url = website_url + '/competition/' + dic[competition] + '/'

                competition_path = os.path.join(root_path, section, competition)

                # get tree associated to overview url (from drive or from web)
                ovw_tree_file_full_path = os.path.join(competition_path, competition + '_overview.html')
                ovw_tree = get_tree_from_url(driver, ovw_tree_file_full_path, ovw_url)

                # get result url from overview url, that's the current result url (ie: current season, current stage)
                curr_result_url = website_url + get_result_url_from_tree(ovw_tree)

                # get tree associated to current result url
                curr_tree_full_path = os.path.join(competition_path, competition + '_current_result.html')
                curr_tree = get_tree_from_url(driver, curr_tree_full_path , curr_result_url)

                # get dic {season:url} from current result tree
                dic_seasons = get_seasons_dic_from_tree(curr_tree)

                for season in dic_seasons.keys():

                    try:
                        # check if we process this season (ie: not too old)
                        if(int(season[:4])<season_min):
                            logWrapper('ignoring season {0}'.format(season))
                            continue

                        logWrapper('processing season {0}'.format(season))

                        season_path = os.path.join(competition_path, season)

                        # this url is usually the last stage url for a given season (or the 'final' url for a cup competition)
                        stage_url = website_url + dic_seasons[season]

                        # get stage from the url itself
                        stage = match_stage(stage_url, is_cup)
                        if(is_cup):
                            if(stage=='finale'):
                                stage = 'Final'
                            elif('-runde' in stage):
                                stage = stage.replace('-runde', '. Round')
                        else:
                            stage = 'Week' + str('%02d' % int(stage))

                        # get tree associated to season last stage url
                        last_stage_full_path = os.path.join(season_path, stage + '.html')
                        last_stage_tree = get_tree_from_url(driver, last_stage_full_path , stage_url)

                        # get dic {stage:url} from last stage tree
                        dic_stages = get_stages_dic_from_tree(last_stage_tree, is_cup)

                        # a distinct loop variable keeps `stage` (the season's
                        # last stage) from being overwritten
                        for stage_name in dic_stages.keys():

                            try:
                                logWrapper('processing {0}, season {1}, stage {2}'.format(competition, season, stage_name))

                                # url of the results page of this particular stage
                                curr_stage_url = website_url + dic_stages[stage_name]

                                # download and cache the page; the tree itself is not needed here
                                curr_stage_full_path = os.path.join(season_path, stage_name + '.html')
                                get_tree_from_url(driver, curr_stage_full_path , curr_stage_url)
                            except Exception as e:
                                logWrapper("Exception while processing stage {0}: {1}".format(stage_name, e), level='error')
                                continue

                    except Exception as e:
                        logWrapper("Exception while processing season {0}: {1}".format(season, e), level='error')
                        continue
    finally:
        # close driver, also when an error escapes the loops above, so no
        # browser window is left open
        driver.quit()

    logWrapper("Process finished")
                    

In [17]:
# load conf
# Entry point of the notebook: parse the ini file located at conf_path, then
# download and cache the pages of every configured competition.
# NOTE: run_tree_saver opens a Firefox window through selenium and only performs
# web calls for pages that are not already saved on disk.
conf = ConfHandler(conf_path)
run_tree_saver(conf)

processing section europe
processing competition champions_league
this is a cup competition
processing season 2017-2018
L:/Dev/Sandbox/Apps Development/SoccerWebApp/PySoccer/WorldFootballDotNet/europe\champions_league\2017-2018\Final.html
processing champions_league, season 2017-2018, stage Qualified teams
processing champions_league, season 2017-2018, stage Group A
processing champions_league, season 2017-2018, stage Group B
processing champions_league, season 2017-2018, stage Group C
processing champions_league, season 2017-2018, stage Group D
processing champions_league, season 2017-2018, stage Group E
processing champions_league, season 2017-2018, stage Group F
processing champions_league, season 2017-2018, stage Group G
processing champions_league, season 2017-2018, stage Group H
processing champions_league, season 2017-2018, stage Round of 16
processing champions_league, season 2017-2018, stage Quarter-finals
processing champions_league, season 2017-2018, stage Semi-finals
proces