# Introduction

The goal of this notebook is to load soccer results from the tree structure downloaded from the worldfootball.net website. The results are loaded and then saved as CSV so that the user can quickly look at them if needed.

# Import

In [1]:
import lxml.html as lh
import lxml.etree as et
from selenium import webdriver
import time
import pickle
import traceback
import logging
import os.path
import configparser
import os
import re
import datetime
import errno

# Params

In [2]:
# Root folder holding the downloaded pages, the configuration file and the logs.
# The historical location stays the default; set the PYSOCCER_ROOT_PATH
# environment variable to run the notebook elsewhere without editing this cell.
root_path = os.environ.get('PYSOCCER_ROOT_PATH',
                           'L:/Dev/Sandbox/Apps Development/SoccerWebApp/PySoccer/WorldFootballDotNet/')
# INI file read by ConfHandler.
conf_path = os.path.join(root_path, '_conf','worldfootball.ini')
# Folder receiving the log files.
log_path = os.path.join(root_path, '_logs')

# Logging

In [3]:
# File logger shared by the whole notebook; the log file name carries the
# time (HHMMSS) at which this cell was run.
logger = logging.getLogger('myapp')

# logging.getLogger returns the same object on every call, so re-running this
# cell used to stack one more FileHandler each time: every message was then
# written several times and the older files stayed open. Detach and close the
# file handlers left by a previous run before attaching the new one.
for old_hdlr in list(logger.handlers):
    if isinstance(old_hdlr, logging.FileHandler):
        logger.removeHandler(old_hdlr)
        old_hdlr.close()

timestamp = datetime.datetime.now().strftime('%H%M%S')
hdlr = logging.FileHandler(os.path.join(log_path,'worldfootballnet_TreeSaver_' + timestamp + '.log'))
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.DEBUG)

In [4]:
def logWrapper(message, level="info", print_console=True):
    """Send ``message`` to the notebook logger and, optionally, to the console.

    Args:
        message: text to log.
        level: one of "info", "warning", "debug" or "error"; any other value
            means the message is not written to the logger.
        print_console: when truthy, the message is also printed.
    """
    if print_console:
        print(message)

    # Level name -> logger method, scanned in the same order as the levels are tested.
    emitters = (
        ("info", logger.info),
        ("warning", logger.warning),
        ("debug", logger.debug),
        ("error", logger.error),
    )
    for level_name, emit in emitters:
        if level == level_name:
            emit(message)
            break

In [5]:
#logger.error("test error")
#logger.warning("this is a warning")
#logger.info("this is just info")
#logger.debug("this is debug")
#logWrapper('hello world')
#logWrapper('message')

# Conf

In [6]:
class ConfHandler():
    """In-memory view of an INI configuration file.

    The file is parsed once at construction. Every section is stored in
    ``self.dicSections`` as a plain dict mapping parameter name to its raw
    string value.
    """

    def __init__(self,file_full_path):
        """Parse the INI file located at ``file_full_path``.

        Raises:
            FileNotFoundError: if the path does not exist. This is a subclass
                of ``Exception``, so existing ``except Exception`` handlers
                keep working.
        """
        if not os.path.exists(file_full_path):
            raise FileNotFoundError('path not correct: {0}'.format(file_full_path))

        config = configparser.ConfigParser()
        config.read(file_full_path)

        # Per-instance storage. This used to be a class-level dict mutated
        # through ``self``, so the sections of every file ever loaded were
        # shared by (and leaked into) all ConfHandler instances.
        self.dicSections = {s: dict(config[s]) for s in config.sections()}

    def get(self,section,param, is_param_list = False):
        """Return the value of ``param`` in ``section``.

        With ``is_param_list`` set, all spaces are removed from the value and
        it is split on commas, giving a list of strings. Otherwise the raw
        string is returned. Raises ``KeyError`` for an unknown section or
        parameter.
        """
        value = self.dicSections[section][param]
        if is_param_list:
            return value.replace(' ','').split(',')
        return value

    def get_section(self,section):
        """Return the dict of parameters of ``section`` (``KeyError`` if unknown)."""
        return self.dicSections[section]

    def get_sections(self):
        """Return the section names, sorted alphabetically."""
        return sorted(self.dicSections.keys())

In [7]:
# test conf
#conf = ConfHandler(conf_path)
#print(conf.get('france','ligue1'))
#dic = conf.get_section('france')
#print(dic)
#sections = conf.get_sections()
#print(sections)
#print(conf.get('cup', 'club_cup_list', True))

# Functions

## [selenium_url_to_tree] download the final rendered output of a URL as a tree

In [8]:
# get url html as elementtree using selenium.webdriver
def selenium_url_to_tree(driver, url, wait_seconds=2):
    """Load ``url`` in a selenium driver and return the page as an lxml tree.

    Args:
        driver: an open selenium webdriver; it is left open for the caller to
            reuse and to quit.
        url: address of the page to load.
        wait_seconds: pause between loading the page and reading its source.
            Defaults to the 2 seconds that used to be hard-coded.

    Returns:
        The element produced by ``lxml.html.fromstring`` on the driver's page source.
    """
    driver.get(url)
    # Pause before reading the source so the browser can finish building the page.
    time.sleep(wait_seconds)
    return lh.fromstring(driver.page_source)

# HOW IT WORKS....
# open driver: that will open firefox window
#driver = webdriver.Firefox()

# construct the trees only once
#results_tree = selenium_url_to_tree(driver,week_url)

# close firefox window once done
#driver.quit()

In [9]:
# return html page as a tree.
def get_tree_from_file(filename):
    """Parse the HTML file ``filename`` and return it as an lxml tree.

    Raises:
        FileNotFoundError: if ``filename`` is not an existing file; the problem
            is also logged at error level.
    """
    if not os.path.isfile(filename):
        logWrapper("File doesn't exist: {0}".format(filename), level='error')
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), filename)

    # Context manager so the handle is closed as soon as the content is read;
    # it used to be left open until garbage collection.
    with open(filename, 'r') as html_file:
        return lh.fromstring(html_file.read())

In [74]:
def extract_result_from_tree(tree, competition_type, competition, season, stage):
    """Extract the results of one stage, dispatching on the competition type.

    Args:
        tree: parsed HTML page of the stage.
        competition_type: "league" or "cup".
        competition, season, stage: passed through to the extractor.

    Returns:
        The list of result rows built by ``extract_result_from_league_tree``.

    Raises:
        ValueError: if ``competition_type`` is not handled; the problem is also
            logged at error level.
    """
    # Cup pages go through the league extractor too, exactly as before.
    if competition_type in ("league", "cup"):
        return extract_result_from_league_tree(tree, competition, season, stage)

    # A bare ``raise`` used to be here; with no exception being handled it
    # failed with "RuntimeError: No active exception to reraise".
    message = 'Unhandled competition type: {0}'.format(competition_type)
    logWrapper(message, level='error')
    raise ValueError(message)

In [75]:
def analyse_score(score):
    """Split a score string into its individual components.

    Three layouts are recognised, tried in this order:
        '3:1 (2:1)'               -> final score (half-time score)
        '1:3 (0:0, 1:1) aet'      -> score after extra time (half time, full time)
        '1:4 (0:0, 0:0, 0:0) pso' -> penalty score (half time, full time, extra time)

    Args:
        score: the score text, already stripped of surrounding whitespace.

    Returns:
        A dict with the keys home_score_ht, away_score_ht, home_score,
        away_score, home_score_et, away_score_et, home_pk and away_pk. Values
        are the digit strings found in ``score``, or None for the parts the
        layout does not contain.

    Raises:
        ValueError: if no layout matches, or if a half-time score is higher
            than the corresponding full-time score.
    """
    regs = {'score': r'([0-9]{1,2}):([0-9]{1,2}) \(([0-9]{1,2}):([0-9]{1,2})\)',
            'score-extra-time': r'([0-9]{1,2}):([0-9]{1,2}) \(([0-9]{1,2}):([0-9]{1,2}), ([0-9]{1,2}):([0-9]{1,2})\) aet',
            'score-penalty': r'([0-9]{1,2}):([0-9]{1,2}) \(([0-9]{1,2}):([0-9]{1,2}), ([0-9]{1,2}):([0-9]{1,2}), ([0-9]{1,2}):([0-9]{1,2})\) pso'}

    # For every layout, the regex group in which each field is found.
    # Fields missing from a layout keep their None default.
    layouts = (
        ('score', {'home_score': 1, 'away_score': 2,
                   'home_score_ht': 3, 'away_score_ht': 4}),
        ('score-extra-time', {'home_score_et': 1, 'away_score_et': 2,
                              'home_score_ht': 3, 'away_score_ht': 4,
                              'home_score': 5, 'away_score': 6}),
        ('score-penalty', {'home_pk': 1, 'away_pk': 2,
                           'home_score_ht': 3, 'away_score_ht': 4,
                           'home_score': 5, 'away_score': 6,
                           'home_score_et': 7, 'away_score_et': 8}),
    )

    result = dict.fromkeys(('home_score_ht', 'away_score_ht',
                            'home_score', 'away_score',
                            'home_score_et', 'away_score_et',
                            'home_pk', 'away_pk'))

    for reg_name, groups in layouts:
        matchObj = re.match(regs[reg_name], score)
        if matchObj:
            for field, group_index in groups.items():
                result[field] = matchObj.group(group_index)
            break
    else:
        raise ValueError('could not match {0}'.format(score))

    # Sanity checks. The captured values are digit strings, so they must be
    # compared as integers: as strings, '2' > '10' is True and a valid score
    # such as '10:0 (2:0)' was rejected.
    if int(result['home_score_ht']) > int(result['home_score']):
        raise ValueError("half time home score ({0}) can't be higher than final home score ({1})".format(result['home_score_ht'], result['home_score']))

    if int(result['away_score_ht']) > int(result['away_score']):
        raise ValueError("half time away score ({0}) can't be higher than final away score ({1})".format(result['away_score_ht'], result['away_score']))

    return result

In [76]:
# test analyse_score
#print(analyse_score('3:1 (0:0)'))
#print(analyse_score('1:3 (0:0, 1:1) aet'))
#print(analyse_score('1:4 (0:1, 2:2, 3:3) pso'))

In [77]:
def extract_result_from_league_tree(tree, competition, season, stage):
    """Extract every game listed in the results table of a stage page.

    The table is looked up with the path
    './/div[@class="data"]//table[@class="standard_tabelle"]'. In each of its
    rows, cell 0 is read as the date, cell 1 as the time, cell 2 as the home
    team, cell 4 as the away team and cell 5 as the score text. A row with an
    empty date cell takes the date of the closest previous row that has one.

    Args:
        tree: parsed HTML page of the stage.
        competition: not used by this function.
        season, stage: copied at the start of every returned row and used in
            the log messages.

    Returns:
        A list with one row per game:
        [season, stage, date, time, home_team, away_team,
         home_score_ht, away_score_ht, home_score, away_score,
         home_score_et, away_score_et, home_pk, away_pk]

    Raises:
        ValueError: if the page has no results table, or if a score cannot be
            parsed by ``analyse_score``.
    """
    # get the results table from the tree
    table = tree.find('.//div[@class="data"]//table[@class="standard_tabelle"]')

    # log current process
    logWrapper("Processing season {0}, {1}".format(season, stage))

    # ``find`` returns None when nothing matches; fail with an explicit message
    # instead of an AttributeError on None further down.
    if table is None:
        message = "No results table found for season {0}, {1}".format(season, stage)
        logWrapper(message, level='error')
        raise ValueError(message)

    # store results
    data = []

    # date carried over to the rows whose date cell is empty
    current_date = None

    for tr in table.findall('.//tbody/tr'):

        date = tr[0].text_content().strip()
        if date != '':
            current_date = date
        else:
            date = current_date

        # not named ``time`` so that the ``time`` module is not shadowed
        kickoff_time = tr[1].text_content().strip()
        home_team = tr[2].text_content().strip()
        away_team = tr[4].text_content().strip()

        dic_scores = analyse_score(tr[5].text_content().strip())

        data.append([season, stage, date, kickoff_time, home_team, away_team,
                     dic_scores['home_score_ht'], dic_scores['away_score_ht'],
                     dic_scores['home_score'], dic_scores['away_score'],
                     dic_scores['home_score_et'], dic_scores['away_score_et'],
                     dic_scores['home_pk'], dic_scores['away_pk']])

    return data

In [78]:
def construct_stage_tree_path(zone, competition, season, stage):
    """Return the path of the saved page of a stage.

    The path is <root_path>/<zone>/<competition>/<season>/<stage>.html.
    """
    html_filename = stage + '.html'
    path_parts = (root_path, zone, competition, season, html_filename)
    return os.path.join(*path_parts)

In [79]:
def wrap_test(params):
    """Load the saved page described by ``params`` and extract its results.

    ``params`` is a dict with the keys 'zone', 'competition', 'season',
    'stage' and 'competition_type'.
    """
    stage_file = construct_stage_tree_path(
        params['zone'],
        params['competition'],
        params['season'],
        params['stage'],
    )
    stage_tree = get_tree_from_file(stage_file)
    return extract_result_from_tree(
        stage_tree,
        params['competition_type'],
        params['competition'],
        params['season'],
        params['stage'],
    )

In [82]:
# Smoke tests run against pages already saved under root_path.
# Row layout: [season, stage, date, time, home_team, away_team,
#              home_score_ht, away_score_ht, home_score, away_score,
#              home_score_et, away_score_et, home_pk, away_pk]

# load conf
conf = ConfHandler(conf_path)

# 1st example: league game - ligue1
data = wrap_test({
    'zone': 'france',
    'competition': 'ligue1',
    'season': '2016-2017',
    'stage': 'Week04',
    'competition_type': 'league',
})
game = data[0]
assert game[4] == 'Paris Saint-Germain'
assert game[6] == '0'
assert game[8] == '1'
print('Test 01 --> OK')

# 2nd example: cup game - coupe de france with extra time and penalty shots
data = wrap_test({
    'zone': 'france',
    'competition': 'coupe-de-france',
    'season': '2014-2015',
    'stage': 'Round of 16',
    'competition_type': 'cup',
})
game = data[1]
assert game[5] == 'AJ Auxerre'
assert game[6] == '0' and game[7] == '0'
assert game[8] == '1' and game[9] == '1'
assert game[12] == '5' and game[13] == '6'
print('Test 02 --> OK')

# 3rd example: cup game - coupe de france with extra time only
game = data[4]
assert game[4] == 'AS Yzeure'
assert game[6] == '0' and game[7] == '0'
assert game[8] == '1' and game[9] == '1'
assert game[10] == '1' and game[11] == '3'
assert game[12] is None
print('Test 03 --> OK')



Processing season 2016-2017, Week04
Test 01 --> OK
Processing season 2014-2015, Round of 16
Test 02 --> OK
Test 03 --> OK
