# Introduction

The goal of this notebook is to analyse worldfootball.net urls and save down results page into tree structure. This will be used later on to get results and save it under a more readable format (csv).

# Import

In [1]:
import lxml.html as lh
import lxml.etree as et
from selenium import webdriver
import time
import pickle
import traceback
import logging
import os.path
import configparser
import os
import re

# Params

In [2]:
root_path = 'L:/Dev/Sandbox/Apps Development/SoccerWebApp/PySoccer/WorldFootballDotNet/'
conf_path = os.path.join(root_path, '_conf','worldfootball.ini')
website_url = 'https://www.worldfootball.net'

# Logging

In [3]:
logger = logging.getLogger('myapp')
hdlr = logging.FileHandler(os.path.join(root_path,'worldfootballnet_TreeSaver.log'))
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr) 
logger.setLevel(logging.DEBUG)

# Conf

In [4]:
class ConfHandler():

    dicSections = {}
    
    def __init__(self,file_full_path):

        if not os.path.exists(file_full_path):
            raise Exception('path not correct: {0}'.format(file_full_path))
        
        config = configparser.ConfigParser()
        config.read(file_full_path)
        
        for s in config.sections():
            dicParams = {}
            for p in config[s]:
                dicParams[p] = config[s][p]
            self.dicSections[s] = dicParams

    def get(self,section,param, is_param_list = False):
        if(is_param_list):
            return self.dicSections[section][param].replace(' ','').split(',')
        return self.dicSections[section][param]
    
    def get_section(self,section):
        return self.dicSections[section]
    
    def get_sections(self):
        return sorted(list(self.dicSections.keys()))

In [5]:
# test conf
#conf = ConfHandler(conf_path)
#print(conf.get('france','ligue1'))
#dic = conf.get_section('france')
#print(dic)
#sections = conf.get_sections()
#print(sections)
#print(conf.get('cup', 'club_cup_list', True))

# Functions

## [selenium_url_to_tree] download url final output as a tree

In [6]:
# get url html as elementtree using selenium.webdriver
def selenium_url_to_tree(driver, url):
    driver.get(url)
    time.sleep(5)
    htmlSource = driver.page_source
    tree = lh.fromstring(htmlSource)
    return tree

# HOW IT WORKS....
# open driver: that will open firefox window
#driver = webdriver.Firefox()

# construct the trees only once
#results_tree = selenium_url_to_tree(driver,week_url)

# close firefox window once done
#driver.quit()

In [7]:
# return html page as a tree.
# the tree object will be cached to drive so next call won't require web call
def get_tree_from_url(driver, filename, url):
    
    # file doesn't exist, we will have to do a web call
    if(not os.path.isfile(filename)):
        
        # create folder if not done already
        if not os.path.exists(os.path.dirname(filename)):
            try:
                os.makedirs(os.path.dirname(filename))
            except OSError as exc: # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise
        
        # load up url in web browser
        tree = selenium_url_to_tree(driver,url)
        
        # save it down to the drive
        tree.getroottree().write(filename)
        
        if(not os.path.isfile(filename)):
            raise
        
    # return the loaded tree
    return lh.fromstring(open(filename, 'r').read())

In [8]:
def get_result_url_from_tree(tree):
    return tree.find('.//div[@class="navibox2"]//div[@class="data"]/ul[2]/li/a').attrib['href']
    

In [9]:
# construct dictionary of {season: url}
def get_seasons_dic_from_tree(tree):
    options = tree.findall('.//select[@name="saison"]/option')
    dic = {}
    for option in options:
        season = option.text_content().replace('/','-')
        dic[season] = option.attrib['value']
    return dic

In [10]:
# construct dictionary of {stage: url}
def get_stages_dic_from_tree(tree, is_cup = False):
    if(is_cup):
        select_name = 'phase'
    else:
        select_name = 'runde'
    options = tree.findall('.//select[@name="{0}"]/option'.format(select_name))
    dic = {}
    for option in options:
        stage = option.text_content()
        if(is_cup):
            dic[stage] = option.attrib['value']
        else:
            stage = stage[:stage.find('.')]
            dic['Week' + str('%02d' % int(stage))] = option.attrib['value']
    return dic

In [11]:
# use regex to find season in an url (ie: 2016-2017)
def match_season(url):
    matchObj = re.match(r'.*([1-3][0-9]{3}-[1-3][0-9]{3}).*', url)
    if not matchObj:
        # sometime we get only year instead of season
        matchObj = re.match(r'.*([1-3][0-9]{3}).*', url)
        if not matchObj:
            raise
    return matchObj.group(1)

In [12]:
# use regex to find stage in an url (ie: '1' or '38' or 'final')
def match_stage(url, is_cup):
    if(not is_cup):
        matchObj = re.match(r'.*[1-3][0-9]{3}-[1-3][0-9]{3}-spieltag/(\b\d{1,2}\b)/', url)
    else:
        matchObj = re.match(r'.*-[1-3][0-9]{3}-(.*)/\b\d{1,2}\b/', url)
    if not matchObj:
        raise
    return matchObj.group(1)

In [13]:
# return selected stage from the tree
def get_selected_stage_from_tree(tree):
    option = tree.find('.//select[@name="runde"]/option[@class="wahl"]')
    stage = option.text_content()[:option.text_content().find('.')]
    return 'Week' + str('%02d' % int(stage))

In [14]:
# load conf
conf = ConfHandler(conf_path)
sections = conf.get_sections()
club_cup_list = conf.get('cup', 'club_cup_list', True)

# open driver
driver = webdriver.Firefox()

for section in sections:
    
    if(section == 'cup'):
        continue
    
    print('processing section {0}'.format(section))
    
    dic = conf.get_section(section)
    
    for competition in dic.keys():
        
        print('processing competition {0}'.format(competition))
        
        is_cup = dic[competition] in club_cup_list
        if(is_cup):
            print('this is a cup competition')
        
        # competition overview url
        ovw_url = website_url + '/competition/' + dic[competition] + '/'
        
        competition_path = os.path.join(root_path, section, competition)
        
        # get tree associated to overview url (from drive or from web)
        ovw_tree_file_full_path = os.path.join(competition_path, competition + '_overview.html')
        ovw_tree = get_tree_from_url(driver, ovw_tree_file_full_path, ovw_url)
        
        # get result url from overview url, that's the current result url (ie: current season, current stage)
        curr_result_url = website_url + get_result_url_from_tree(ovw_tree)
        
        # get tree associated to current result url
        curr_tree_full_path = os.path.join(competition_path, competition + '_current_result.html')
        curr_tree = get_tree_from_url(driver, curr_tree_full_path , curr_result_url)
        
        # get dic {season:url} from current result tree
        dic_seasons = get_seasons_dic_from_tree(curr_tree)
        
        print(list(dic_seasons.keys())[:5])
        
        for season in dic_seasons.keys():
            
            print('processing season {0}'.format(season))
            
            season_path = os.path.join(competition_path, season)
            
            # this url is usually the last stage url for a given season (or the 'final' url for a cup competition)
            stage_url = website_url + dic_seasons[season]
            
            # get stage from the url itself
            stage = match_stage(stage_url, is_cup)
            if(is_cup):
                if(stage=='finale'):
                    stage = 'Final'
                elif('-runde' in stage):
                    stage = stage.replace('-runde', '. Round')
            else:
                stage = 'Week' + str('%02d' % int(stage))
            
            # get tree associated to season last stage url
            last_stage_full_path = os.path.join(season_path, stage + '.html')
            last_stage_tree = get_tree_from_url(driver, last_stage_full_path , stage_url)
            
            # get dic {stage:url} from last stage tree
            dic_stages = get_stages_dic_from_tree(last_stage_tree, is_cup)
            
            for stage in dic_stages.keys():
                
                print('processing stage {0}'.format(stage))
            
                # this url is usually the last stage url for a given season (or the 'final' url for a cup competition)
                curr_stage_url = website_url + dic_stages[stage]
                
                # get tree associated to current stage url
                curr_stage_full_path = os.path.join(season_path, stage + '.html')
                last_stage_tree = get_tree_from_url(driver, curr_stage_full_path , curr_stage_url)       
        
# close driver
driver.quit()

processing section france
processing competition ligue1
['2017-2018', '2016-2017', '2015-2016', '2014-2015', '2013-2014']
processing season 2017-2018
processing stage Week01
processing stage Week02
processing stage Week03
processing stage Week04
processing stage Week05
processing stage Week06
processing stage Week07
processing stage Week08
processing stage Week09
processing stage Week10
processing stage Week11
processing stage Week12
processing stage Week13
processing stage Week14
processing stage Week15
processing stage Week16
processing stage Week17
processing stage Week18
processing stage Week19
processing stage Week20
processing stage Week21
processing stage Week22
processing stage Week23
processing stage Week24
processing stage Week25
processing stage Week26
processing stage Week27
processing stage Week28
processing stage Week29
processing stage Week30
processing stage Week31
processing stage Week32
processing stage Week33
processing stage Week34
processing stage Week35
processing

RuntimeError: No active exception to reraise