# Introduction

The goal of this notebook is to load soccer results from the tree structure downloaded from the worldfootball.net website. The results are loaded and then saved as CSV so that the user can inspect them quickly if needed.

# Import

In [1]:
import lxml.html as lh
import lxml.etree as et
from selenium import webdriver
import time
import pickle
import traceback
import logging
import os.path
import configparser
import os
import re
import datetime
import errno

# Params

In [2]:
# Root folder holding the downloaded pages, the '_conf' folder and the '_logs' folder.
# The WORLDFOOTBALL_ROOT environment variable, when set, takes precedence over the
# built-in default so the notebook can run on another machine without being edited.
root_path = os.environ.get(
    'WORLDFOOTBALL_ROOT',
    'L:/Dev/Sandbox/Apps Development/SoccerWebApp/PySoccer/WorldFootballDotNet/')
# INI file read by ConfHandler
conf_path = os.path.join(root_path, '_conf', 'worldfootball.ini')
# folder receiving one log file per run of the logging cell
log_path = os.path.join(root_path, '_logs')

# Logging

In [3]:
logger = logging.getLogger('myapp')

# Re-running this cell must not stack a second file handler on the same logger
# (every record would then be written once per handler), so the file handlers
# created by a previous run of this cell are detached and closed first.
log_file_prefix = 'worldfootballnet_TreeLoader_'
for stale_hdlr in list(logger.handlers):
    if (isinstance(stale_hdlr, logging.FileHandler)
            and os.path.basename(stale_hdlr.baseFilename).startswith(log_file_prefix)):
        logger.removeHandler(stale_hdlr)
        stale_hdlr.close()

# FileHandler does not create missing folders, so make sure the log folder exists
os.makedirs(log_path, exist_ok=True)

# NOTE(review): the timestamp only carries the time of day (HHMMSS), so two runs
# at the same time on different days append to the same file - confirm this is intended.
timestamp = datetime.datetime.now().strftime('%H%M%S')
hdlr = logging.FileHandler(os.path.join(log_path, log_file_prefix + timestamp + '.log'))
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.DEBUG)

In [4]:
def logWrapper(message, level="info", print_console=True):
    """Send a message to the notebook logger and optionally echo it on stdout.

    message       -- object to log / print
    level         -- one of "info", "warning", "debug" or "error"; any other
                     value is not forwarded to the logger at all
    print_console -- when truthy, the message is also printed
    """
    if print_console:
        print(message)

    # the level name doubles as the name of the logger method to call
    if level in ("info", "warning", "debug", "error"):
        getattr(logger, level)(message)

In [5]:
#logger.error("test error")
#logger.warning("this is a warning")
#logger.info("this is just info")
#logger.debug("this is debug")
#logWrapper('hello world')
#logWrapper('message')

# Conf

In [6]:
class ConfHandler():
    """In-memory copy of an INI configuration file.

    The file is parsed once by the constructor; afterwards every section is
    available as a plain dict of {parameter name: raw string value}.
    """

    def __init__(self,file_full_path):
        """Parse `file_full_path`.

        Raises FileNotFoundError (an Exception subclass) when the path does not exist.
        """
        if not os.path.exists(file_full_path):
            raise FileNotFoundError('path not correct: {0}'.format(file_full_path))

        config = configparser.ConfigParser()
        config.read(file_full_path)

        # Per-instance storage. This used to be a class attribute, which made
        # every ConfHandler share (and pollute) the same dictionary.
        self.dicSections = {}
        for s in config.sections():
            self.dicSections[s] = dict(config[s])

    def get(self,section,param, is_param_list = False):
        """Return the value of `param` in `section`.

        With is_param_list=True the value is returned as a list: all spaces are
        removed from the value (including spaces inside an item) and the result
        is split on commas. Raises KeyError for an unknown section or parameter.
        """
        if(is_param_list):
            return self.dicSections[section][param].replace(' ','').split(',')
        return self.dicSections[section][param]

    def get_section(self,section):
        """Return the {parameter: value} dict of `section` (KeyError if unknown)."""
        return self.dicSections[section]

    def get_sections(self):
        """Return the section names, sorted alphabetically."""
        return sorted(list(self.dicSections.keys()))

In [7]:
# test conf
#conf = ConfHandler(conf_path)
#print(conf.get('france','ligue1'))
#dic = conf.get_section('france')
#print(dic)
#sections = conf.get_sections()
#print(sections)
#print(conf.get('cup', 'club_cup_list', True))

# Functions

## [selenium_url_to_tree] Download the final rendered output of a URL as a tree

In [8]:
# get url html as elementtree using selenium.webdriver
def selenium_url_to_tree(driver, url, wait_seconds=2):
    """Load `url` in a selenium driver and return the page parsed by lxml.html.

    driver       -- an already opened selenium webdriver (the caller opens and quits it)
    url          -- address to load
    wait_seconds -- pause between loading the page and reading its source
                    (defaults to 2, the previously hard-coded value);
                    presumably gives client-side rendering time to finish
    """
    driver.get(url)
    time.sleep(wait_seconds)
    return lh.fromstring(driver.page_source)

# HOW IT WORKS....
# open driver: that will open firefox window
#driver = webdriver.Firefox()

# construct the trees only once
#results_tree = selenium_url_to_tree(driver,week_url)

# close firefox window once done
#driver.quit()

In [9]:
# return html page as a tree.
def get_tree_from_file(filename, encoding=None):
    """Parse a saved HTML page and return it as an lxml.html tree.

    filename -- path of the HTML file
    encoding -- text encoding passed to open(); None keeps the platform default

    Logs an error and raises FileNotFoundError when `filename` is not an existing file.
    """
    if(not os.path.isfile(filename)):
        logWrapper("File doesn't exist: {0}".format(filename), level='error')
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), filename)

    # the context manager closes the handle even when parsing fails
    with open(filename, 'r', encoding=encoding) as html_file:
        return lh.fromstring(html_file.read())

In [10]:
# analyse a tree structure and guess its type (ie: league game? 2 legs cup game?)
def guess_tree_type(tree):
    """Classify a results page by the rows of its results table.

    Returns '2legs-cup' when the table contains rows of class "dunkel",
    'league' when it contains rows but none of that class, and raises
    ValueError when no row is found at all.
    """
    # rows only present on 2 legs cup pages (ie: champions league knock out games)
    two_legs_rows = tree.findall('.//div[@class="data"]//table[@class="standard_tabelle"]//tbody/tr[@class="dunkel"]')
    if two_legs_rows:
        return '2legs-cup'

    # any other populated table: standard league games or group stage games
    any_rows = tree.findall('.//div[@class="data"]//table[@class="standard_tabelle"]//tbody/tr')
    if any_rows:
        return 'league'

    raise ValueError("Couldn't guess tree type")

In [11]:
def extract_result_from_tree(tree, competition, season, stage):
    """Extract the results of a page with the extractor matching its layout.

    The layout is detected with guess_tree_type(); 'league' pages go to
    extract_result_from_league_tree and '2legs-cup' pages go to
    extract_result_from_2legscup_tree. Returns whatever the selected
    extractor returns.

    Raises ValueError for a tree type that has no extractor (in addition to
    anything guess_tree_type or the extractors raise).
    """
    tree_type = guess_tree_type(tree)

    if(tree_type=="league"):
        return extract_result_from_league_tree(tree, competition, season, stage)
    if(tree_type=="2legs-cup"):
        return extract_result_from_2legscup_tree(tree, competition, season, stage)

    # A bare `raise` here had no active exception to re-raise and produced an
    # unrelated RuntimeError; raise an explicit error carrying the same message.
    message = 'Unhandled tree type: {0}'.format(tree_type)
    logWrapper(message, level='error')
    raise ValueError(message)

In [12]:
# '3:1 (2:1)'
def extract_standard_score(matchObj):
    """Map the groups of a 'final (half time)' match, e.g. '3:1 (2:1)', to a score dict."""
    return {'home_score_ht':matchObj.group(3),
            'away_score_ht':matchObj.group(4),
            'home_score':matchObj.group(1),
            'away_score':matchObj.group(2),
            'home_score_et':None,
            'away_score_et':None,
            'home_pk':None,
            'away_pk':None}

# '1:3 (0:0, 1:1) aet'
def extract_score_et(matchObj):
    """Map the groups of an extra-time match to a score dict.

    The leading score is stored as the extra-time score; inside the brackets
    the first pair is the half-time score and the second pair the regular score.
    """
    return {'home_score_ht':matchObj.group(3),
            'away_score_ht':matchObj.group(4),
            'home_score':matchObj.group(5),
            'away_score':matchObj.group(6),
            'home_score_et':matchObj.group(1),
            'away_score_et':matchObj.group(2),
            'home_pk':None,
            'away_pk':None}

# '1:4 (0:0, 0:0, 0:0) pso'
def extract_score_pk(matchObj):
    """Map the groups of a penalty shoot-out match to a score dict.

    The leading score is stored as the penalty score; the bracketed pairs are,
    in order, the half-time, regular and extra-time scores.
    """
    return {'home_score_ht':matchObj.group(3),
            'away_score_ht':matchObj.group(4),
            'home_score':matchObj.group(5),
            'away_score':matchObj.group(6),
            'home_score_et':matchObj.group(7),
            'away_score_et':matchObj.group(8),
            'home_pk':matchObj.group(1),
            'away_pk':matchObj.group(2)}

# '1:3'
def extract_simple_score(matchObj):
    """Map the groups of a bare 'home:away' match to a score dict (no half-time data)."""
    return {'home_score_ht':None,
            'away_score_ht':None,
            'home_score':matchObj.group(1),
            'away_score':matchObj.group(2),
            'home_score_et':None,
            'away_score_et':None,
            'home_pk':None,
            'away_pk':None}

# (name, compiled pattern, extractor) triples, tried in this order.
# 'simple-score' matches the beginning of every other format, so it has to stay
# last; an explicit sequence makes that priority independent of dict ordering.
_SCORE_PATTERNS = (
    ('standard-score', re.compile(r'([0-9]{1,2}):([0-9]{1,2}) \(([0-9]{1,2}):([0-9]{1,2})\)'), extract_standard_score),
    ('score-extra-time', re.compile(r'([0-9]{1,2}):([0-9]{1,2}) \(([0-9]{1,2}):([0-9]{1,2}), ([0-9]{1,2}):([0-9]{1,2})\) aet'), extract_score_et),
    ('score-penalty', re.compile(r'([0-9]{1,2}):([0-9]{1,2}) \(([0-9]{1,2}):([0-9]{1,2}), ([0-9]{1,2}):([0-9]{1,2}), ([0-9]{1,2}):([0-9]{1,2})\) pso'), extract_score_pk),
    ('simple-score', re.compile(r'([0-9]{1,2}):([0-9]{1,2})'), extract_simple_score),
)

def _check_half_time_not_above_final(data, side):
    """Raise ValueError when the half-time score of `side` ('home'/'away') exceeds its regular score."""
    half_time = data[side + '_score_ht']
    final = data[side + '_score']
    # scores are kept as strings, so compare their numeric value:
    # as text '2' > '10' would wrongly be true
    if half_time and int(half_time) > int(final):
        raise ValueError("half time {0} score ({1}) can't be higher than final {0} score ({2})".format(side, half_time, final))

# analyse a score string, using a list of regular expressions corresponding to each possible score patterns
def analyse_score(score):
    """Parse a score string into a dict of score components.

    Supported formats: '3:1 (2:1)', '1:3 (0:0, 1:1) aet',
    '1:4 (0:0, 0:0, 0:0) pso' and '1:3'. Only the beginning of the string has
    to match, trailing text is ignored.

    Returns a dict with the keys home_score_ht, away_score_ht, home_score,
    away_score, home_score_et, away_score_et, home_pk and away_pk; values are
    the matched digit strings, or None when the format lacks that component.

    Raises ValueError when no format matches or when a half-time score is
    higher than the corresponding regular score.
    """
    for _name, pattern, extractor in _SCORE_PATTERNS:
        matchObj = pattern.match(score)
        if matchObj:
            data = extractor(matchObj)
            break
    else:
        # previously fell through to an UnboundLocalError on `data`
        raise ValueError("Unrecognised score format: {0!r}".format(score))

    _check_half_time_not_above_final(data, 'home')
    _check_half_time_not_above_final(data, 'away')

    return data

In [13]:
# test analyse_score
#print(analyse_score('3:1 (0:0)'))
#print(analyse_score('1:3 (0:0, 1:1) aet'))
#print(analyse_score('1:4 (0:1, 2:2, 3:3) pso'))
#print(analyse_score('1:3'))

In [14]:
# use regex to match date in string (ie: '05/07/2016')
def match_date(s):
    """Return the dd/mm/yyyy date contained in `s` (ie: '05/07/2016').

    Because the leading '.*' is greedy, the last date-like token of the string
    is returned when there are several.

    Raises ValueError when `s` contains no date.
    """
    matchObj = re.match(r'.*([0-3][0-9]/[0-1][0-9]/[1-3][0-9]{3}).*', s)
    if not matchObj:
        # a bare `raise` here had nothing to re-raise and surfaced as an
        # unrelated RuntimeError
        raise ValueError("No date (dd/mm/yyyy) found in: {0!r}".format(s))
    return matchObj.group(1)

In [15]:
# use regex to match time in string (ie: '17:05')
def match_time(s):
    """Return the hh:mm time contained in `s` (ie: '17:05'), or None when there is none."""
    found = re.match(r'.*([0-2][0-9]:[0-5][0-9]).*', s)
    return found.group(1) if found else None

In [16]:
def extract_result_from_league_tree(tree, competition, season, stage):
    """Extract one result row per table row of a league / group stage page.

    Each returned row is a list:
    [season, stage, date, time, home_team, away_team,
     home_score_ht, away_score_ht, home_score, away_score,
     home_score_et, away_score_et, home_pk, away_pk]

    A row with an empty date cell takes the date of the closest previous row
    that had one. `competition` is accepted but not used by this function.
    """
    # results table of the page
    results_table = tree.find('.//div[@class="data"]//table[@class="standard_tabelle"]')

    # log current process
    logWrapper("Processing season {0}, {1}".format(season, stage))

    # order in which the parsed score components are appended to each row
    score_columns = ('home_score_ht', 'away_score_ht', 'home_score', 'away_score',
                     'home_score_et', 'away_score_et', 'home_pk', 'away_pk')

    games = []
    last_seen_date = None

    for row in results_table.findall('.//tbody/tr'):

        # the date cell is only filled on the first game of a day
        date_text = row[0].text_content().strip()
        if date_text != '':
            last_seen_date = date_text

        kick_off = row[1].text_content().strip()
        home_team = row[2].text_content().strip()
        away_team = row[4].text_content().strip()

        parsed = analyse_score(row[5].text_content().strip())

        games.append([season, stage, last_seen_date, kick_off, home_team, away_team]
                     + [parsed[column] for column in score_columns])

    return games

In [17]:
def extract_result_from_2legscup_tree(tree, competition, season, stage):
    """Extract one result row per leg of a two-legged cup page.

    Each returned row is a list:
    [season, stage + ' - 1st Leg' or ' - 2nd Leg', date, time, home_team, away_team,
     home_score_ht, away_score_ht, home_score, away_score,
     home_score_et, away_score_et, home_pk, away_pk]

    Results come from the rows of class "hell"; dates and times come from the
    rows of class "dunkel". The n-th "dunkel" row fills results 2n (from its
    cell 1) and 2n+1 (from its cell 3). `competition` is accepted but not used.
    """
    # log current process
    logWrapper("Processing season {0}, {1}".format(season, stage))

    # order in which the parsed score components are appended to each row
    score_columns = ('home_score_ht', 'away_score_ht', 'home_score', 'away_score',
                     'home_score_et', 'away_score_et', 'home_pk', 'away_pk')

    games = []

    # first pass: one entry per result row, date/time left empty for now
    result_rows = tree.findall('.//div[@class="data"]//table[@class="standard_tabelle"]//tbody/tr[@class="hell"]')
    for row in result_rows:

        leg_label = row[0].text_content().strip()
        # anything not starting with '1st' is treated as the second leg
        leg_stage = stage + (' - 1st Leg' if leg_label[:3] == '1st' else ' - 2nd Leg')

        home_team = row[1].text_content().strip()
        away_team = row[3].text_content().strip()

        parsed = analyse_score(row[4].text_content().strip())

        games.append([season, leg_stage, None, None, home_team, away_team]
                     + [parsed[column] for column in score_columns])

    # second pass: each date/time row describes two consecutive results
    schedule_rows = tree.findall('.//div[@class="data"]//table[@class="standard_tabelle"]//tbody/tr[@class="dunkel"]')
    for pair_number, row in enumerate(schedule_rows):
        first_leg = 2 * pair_number
        second_leg = first_leg + 1

        first_leg_when = row[1].text_content().strip()
        games[first_leg][2] = match_date(first_leg_when)
        games[first_leg][3] = match_time(first_leg_when)

        second_leg_when = row[3].text_content().strip()
        games[second_leg][2] = match_date(second_leg_when)
        games[second_leg][3] = match_time(second_leg_when)

    return games

In [18]:
def construct_stage_tree_path(zone, competition, season, stage):
    """Return the path of a saved stage page: <root_path>/<zone>/<competition>/<season>/<stage>.html"""
    page_file = stage + '.html'
    return os.path.join(root_path, zone, competition, season, page_file)

In [19]:
def wrap_test(params):
    """Load the saved page described by `params` and return its extracted results.

    `params` is a dict with the keys 'zone', 'competition', 'season' and 'stage'.
    """
    page_path = construct_stage_tree_path(params['zone'], params['competition'], params['season'], params['stage'])
    page_tree = get_tree_from_file(page_path)
    return extract_result_from_tree(page_tree, params['competition'], params['season'], params['stage'])

In [20]:
def run_test():
    """Run a battery of checks on pages previously saved under root_path.

    Each check loads a saved page through wrap_test(), picks one game from the
    returned rows and asserts some of its columns (see the index list below).
    'Test NN --> OK' is printed after every passing check; a failing check
    raises AssertionError and stops the run.
    """
    # load conf
    # (the handler is not used by the checks below; constructing it still
    # raises when the configuration file is missing)
    conf = ConfHandler(conf_path)

    # 0:  season
    # 1:  stage
    # 2:  date
    # 3:  time
    # 4:  home_team
    # 5:  away_team
    # 6:  home_score_ht
    # 7:  away_score_ht
    # 8:  home_score
    # 9:  away_score
    # 10: home_score_et
    # 11: away_score_et
    # 12: home_pk
    # 13: away_pk

    # 1st example: league game - ligue1
    data = wrap_test({'zone':'france', 'competition':'ligue1', 'season':'2016-2017', 'stage':'Week04'})
    game = data[0]
    assert(game[4]=='Paris Saint-Germain' and game[6]=='0' and game[8]=='1')
    print('Test 01 --> OK')

    # 2nd example: cup game - coupe de france with extra time and penalty shots
    data = wrap_test({'zone':'france', 'competition':'coupe-de-france', 'season':'2014-2015', 'stage':'Round of 16'})
    game = data[1]
    assert(game[5]=='AJ Auxerre' and game[6]=='0' and game[7]=='0' and game[8]=='1' and game[9]=='1' and game[12]=='5' and game[13]=='6')
    print('Test 02 --> OK')

    # 3rd example: cup game - coupe de france with extra time only
    # (reuses the page loaded for the 2nd example)
    game = data[4]
    assert(game[4]=='AS Yzeure' and game[6]=='0' and game[7]=='0' and game[8]=='1' and game[9]=='1' and game[10]=='1' and game[11]=='3' and game[12]==None)
    print('Test 03 --> OK')

    # 4th example: 2 legs game - champions league
    data = wrap_test({'zone':'europe', 'competition':'champions-league', 'season':'2015-2016', 'stage':'Round of 16'})
    game = data[0]
    assert(game[1]=='Round of 16 - 1st Leg' and game[2]=='16/02/2016' and game[3]=='19:45' and game[4]=='Paris Saint-Germain' and game[8]=='2' and game[9]=='1' and game[6]=='1' and game[7]=='1')
    print('Test 04 --> OK')

    # 5th example: 2 legs game - champions league with extra time
    game = data[11]
    assert(game[1]=='Round of 16 - 2nd Leg' and game[2]=='16/03/2016' and game[3]=='19:45' and game[5]=='Juventus' and game[8]=='2' and game[9]=='2' and game[6]=='0' and game[7]=='2' and game[10]=='4' and game[11]=='2')
    print('Test 05 --> OK')

    # 6th example: 2 legs game - champions league with penalty shots
    game = data[13]
    assert(game[1]=='Round of 16 - 2nd Leg' and game[2]=='15/03/2016' and game[3]=='19:45' and game[5]=='PSV Eindhoven' and game[6]==game[7]==game[8]==game[9]==game[10]==game[11]=='0' and game[12]=='8' and game[13]=='7')
    print('Test 06 --> OK')

    # 7th example: champions league group stage
    data = wrap_test({'zone':'europe', 'competition':'champions-league', 'season':'2015-2016', 'stage':'Group A'})
    game = data[0]
    assert(game[1]=='Group A' and game[2]=='15/09/2015' and game[3]=='19:45' and game[4]=='Real Madrid' and game[8]=='4' and game[9]=='0' and game[6]=='1' and game[7]=='0')
    print('Test 07 --> OK')

    # 8th example: champions league group stage
    game = data[1]
    assert(game[1]=='Group A' and game[2]=='15/09/2015' and game[3]=='19:45' and game[4]=='Paris Saint-Germain' and game[8]=='2' and game[9]=='0' and game[6]=='1' and game[7]=='0')
    print('Test 08 --> OK')

    # 9th example: world cup qualifier CONCACAF
    # (no kick-off time and only a bare score expected for this game)
    data = wrap_test({'zone':'world', 'competition':'world-cup-qualifier-concacaf', 'season':'2000-2001', 'stage':'Caribbean Semi-finals'})
    game = data[2]
    assert(game[1]=='Caribbean Semi-finals - 1st Leg' and game[2]=='01/04/2000' and game[3]==None and game[4]=='Haiti' and game[8]=='9' and game[9]=='0' and game[6]==game[7]==game[10]==game[11]==game[12]==game[13]==None)
    print('Test 09 --> OK')


In [21]:
# run the battery of tests; every passing check prints 'Test NN --> OK',
# a failing one raises AssertionError
run_test()

Processing season 2016-2017, Week04
Test 01 --> OK
Processing season 2014-2015, Round of 16
Test 02 --> OK
Test 03 --> OK
Processing season 2015-2016, Round of 16
Test 04 --> OK
Test 05 --> OK
Test 06 --> OK
Processing season 2015-2016, Group A
Test 07 --> OK
Test 08 --> OK
Processing season 2000-2001, Caribbean Semi-finals
Test 09 --> OK
