# Introduction

The goal of this notebook is to load soccer results from the tree of HTML pages previously downloaded from the worldfootball.net website. The results are parsed and then saved as CSV files so that the user can inspect them quickly if needed.

# Import

In [89]:
import lxml.html as lh
import lxml.etree as et
from selenium import webdriver
import time
import pickle
import traceback
import logging
import configparser
import os
import os.path
from os import listdir
from os.path import isfile, join
import re
import datetime
import errno
import pandas as pd

# Params

In [90]:
# NOTE(review): these are hard-coded absolute paths on a local drive; adjust them before
# running the notebook on another machine.
# Folder for the generated csv files (presumably written by the csv generation section -- confirm).
csv_path = 'L:/Dev/Sandbox/Apps Development/SoccerWebApp/PySoccer/csv/'
# Root of the downloaded page tree: <root_path>/<zone>/<competition>/<season>/<stage>.html
root_path = 'L:/Dev/Sandbox/Apps Development/SoccerWebApp/PySoccer/WorldFootballDotNet/'
# Ini file read by ConfHandler.
conf_path = os.path.join(root_path, '_conf','worldfootball.ini')
# Folder in which the logging cell creates its log file.
log_path = os.path.join(root_path, '_logs')

# Logging

In [91]:
# Module-wide logger writing to a file under log_path.
# NOTE(review): re-running this cell adds one more FileHandler to the same 'myapp' logger,
# so every message is then written once per handler.
logger = logging.getLogger('myapp')
# Only the time of day (HHMMSS) goes into the file name -- the date is not included.
timestamp = datetime.datetime.now().strftime('%H%M%S')
hdlr = logging.FileHandler(os.path.join(log_path,'worldfootballnet_TreeLoader_' + timestamp + '.log'))
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr) 
# Let every level through, down to DEBUG.
logger.setLevel(logging.DEBUG)

In [92]:
def logWrapper(message, level="info", print_console=True):
    """Send ``message`` to the module logger and optionally echo it on the console.

    message       -- text to log
    level         -- one of "info", "warning", "debug" or "error"; any other value
                     means nothing is written to the logger
    print_console -- when true, the message is also printed
    """
    if print_console:
        print(message)

    # Pair every supported level name with the matching logger method.
    emitters = (
        ("info", logger.info),
        ("warning", logger.warning),
        ("debug", logger.debug),
        ("error", logger.error),
    )
    for level_name, emit in emitters:
        if level == level_name:
            emit(message)
            break

In [93]:
#logger.error("test error")
#logger.warning("this is a warning")
#logger.info("this is just info")
#logger.debug("this is debug")
#logWrapper('hello world')
#logWrapper('message')

# Conf

In [94]:
class ConfHandler():
    """Read-only view over an ini file, loaded once at construction time.

    The file content is copied into ``self.dicSections``, a dict mapping each
    section name to a dict of ``{param: value}`` strings.
    """

    def __init__(self,file_full_path):
        """Load every section of the ini file located at ``file_full_path``.

        Raises Exception when the path does not exist.
        """
        if not os.path.exists(file_full_path):
            raise Exception('path not correct: {0}'.format(file_full_path))

        # The dict must be created per instance: as a class attribute it would be
        # shared, and sections from one file would leak into every other handler.
        self.dicSections = {}

        config = configparser.ConfigParser()
        config.read(file_full_path)

        for s in config.sections():
            self.dicSections[s] = {p: config[s][p] for p in config[s]}

    def get(self,section,param, is_param_list = False):
        """Return the value of ``param`` in ``section``.

        Returns None when the section or the param is unknown, or when the value
        is an empty string. With ``is_param_list`` the value is stripped of all
        spaces and split on commas, and a list of strings is returned.
        """
        params = self.dicSections.get(section)
        if params is None:
            return None
        value = params.get(param)
        if value is None or value == '':
            return None
        if is_param_list:
            return value.replace(' ','').split(',')
        return value

    def get_section(self,section):
        """Return the ``{param: value}`` dict of ``section``, or None when unknown."""
        return self.dicSections.get(section)

    def get_sections(self):
        """Return the sorted list of section names."""
        return sorted(self.dicSections.keys())

In [95]:
# Smoke test of ConfHandler against the real ini file (conf_path must exist).
conf = ConfHandler(conf_path)

# single setting inside a section
print(conf.get('france','ligue1'))

# whole 'france' section as a dict
dic = conf.get_section('france')
print(dic)

# sorted list of all section names
sections = conf.get_sections()
print(sections)

# this section doesn't exist, should return None:
print(conf.get('cup', 'club_cup_list', True))

# this setting doesn't exist within an existing section, should return None:
print(conf.get('general', 'club_cup_list', True))

fra-ligue-1
{'ligue1': 'fra-ligue-1', 'coupe-de-france': 'fra-coupe-de-france', 'coupe-de-la-ligue': 'fra-coupe-de-la-ligue', 'trophee-des-champions': 'fra-trophee-des-champions'}
['england', 'europe', 'france', 'general', 'germany', 'italy', 'netherlands', 'portugal', 'spain', 'world']
None
None


# Functions

In [96]:
# return html page as a tree.
def get_tree_from_file(filename):
    """Parse the html file ``filename`` and return the tree built by lxml.html.

    Raises FileNotFoundError (after logging an error) when the file does not exist.
    """
    if not os.path.isfile(filename):
        logWrapper("File doesn't exist: {0}".format(filename), level='error')
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), filename)

    # Context manager so the file handle is closed as soon as the content is read,
    # instead of staying open until garbage collection.
    with open(filename, 'r') as html_file:
        return lh.fromstring(html_file.read())

In [97]:
# analyse a tree structure and guess its type (ie: league game? 2 legs cup game?)
def guess_tree_type(tree):
    """Return '2legs-cup' or 'league' depending on the rows found in the result table.

    Raises ValueError when the result table has no row at all.
    """
    # rows of class "dunkel" only appear on 2 legs cup pages
    # (ie: champions league knock out games)
    dunkel_rows = tree.findall('.//div[@class="data"]//table[@class="standard_tabelle"]//tbody/tr[@class="dunkel"]')
    if dunkel_rows:
        return '2legs-cup'

    # any other page with rows: standard league games or group stage games
    all_rows = tree.findall('.//div[@class="data"]//table[@class="standard_tabelle"]//tbody/tr')
    if all_rows:
        return 'league'

    raise ValueError("Couldn't guess tree type")

In [98]:
def extract_result_from_tree(tree, zone, competition, season, stage):
    """Extract the list of game dicts from ``tree`` with the parser matching its type.

    The tree type comes from guess_tree_type(); zone, competition, season and
    stage are passed through to the selected parser.
    Raises ValueError when the tree type has no parser.
    """
    tree_type = guess_tree_type(tree)

    if tree_type == "league":
        return extract_result_from_league_tree(tree, zone, competition, season, stage)
    if tree_type == "2legs-cup":
        return extract_result_from_2legscup_tree(tree, zone, competition, season, stage)

    # A bare `raise` outside an except block only produces
    # "RuntimeError: No active exception to reraise"; raise a meaningful error instead.
    message = 'Unhandled tree type: {0}'.format(tree_type)
    logWrapper(message, level='error')
    raise ValueError(message)

## Score extraction logic

In [99]:
# pattern handled: '3:1 (2:1)' -> final score followed by the half time score
def extract_standard_score(matchObj):
    """Build the score dict from groups 1-2 (final score) and 3-4 (half time score)."""
    home_final, away_final, home_half, away_half = [matchObj.group(n) for n in (1, 2, 3, 4)]
    return dict(
        status='Completed',
        home_score_ht=home_half,
        away_score_ht=away_half,
        home_score=home_final,
        away_score=away_final,
        home_score_et=None,
        away_score_et=None,
        home_pk=None,
        away_pk=None,
    )

# pattern handled: '1:3 (0:0, 1:1) aet' -> score after extra time (half time, regular time)
def extract_score_et(matchObj):
    """Build the score dict from groups 1-2 (after extra time), 3-4 (half time), 5-6 (regular time)."""
    home_after_et, away_after_et = matchObj.group(1), matchObj.group(2)
    home_half, away_half = matchObj.group(3), matchObj.group(4)
    home_regular, away_regular = matchObj.group(5), matchObj.group(6)
    return dict(
        status='Completed',
        home_score_ht=home_half,
        away_score_ht=away_half,
        home_score=home_regular,
        away_score=away_regular,
        home_score_et=home_after_et,
        away_score_et=away_after_et,
        home_pk=None,
        away_pk=None,
    )

# pattern handled: '1:4 (0:0, 0:0, 0:0) pso' -> penalty shoot-out (half time, regular time, extra time)
def extract_score_pk(matchObj):
    """Build the score dict from groups 1-2 (penalties), 3-4 (half time), 5-6 (regular time), 7-8 (extra time)."""
    captured = [matchObj.group(n) for n in range(1, 9)]
    home_pens, away_pens = captured[0], captured[1]
    home_half, away_half = captured[2], captured[3]
    home_regular, away_regular = captured[4], captured[5]
    home_extra, away_extra = captured[6], captured[7]
    return dict(
        status='Completed',
        home_score_ht=home_half,
        away_score_ht=away_half,
        home_score=home_regular,
        away_score=away_regular,
        home_score_et=home_extra,
        away_score_et=away_extra,
        home_pk=home_pens,
        away_pk=away_pens,
    )
# pattern handled: '1:3' -> final score only
def extract_simple_score(matchObj):
    """Build the score dict from groups 1-2 (final score); every other score field is None."""
    home_final = matchObj.group(1)
    away_final = matchObj.group(2)
    return dict(
        status='Completed',
        home_score_ht=None,
        away_score_ht=None,
        home_score=home_final,
        away_score=away_final,
        home_score_et=None,
        away_score_et=None,
        home_pk=None,
        away_pk=None,
    )

# game without a usable score; matchObj is accepted for signature parity but not read
def handle_cancelled_game(matchObj):
    """Return a score dict with status 'Cancelled' and every score field set to None."""
    fields = ('status',
              'home_score_ht', 'away_score_ht',
              'home_score', 'away_score',
              'home_score_et', 'away_score_et',
              'home_pk', 'away_pk')
    cancelled = dict.fromkeys(fields)
    cancelled['status'] = 'Cancelled'
    return cancelled


# analyse a score string by trying, in order, one regular expression per known score pattern
def analyse_score(score, isException = False):
    """Turn a raw score string into a score dict.

    score       -- text of the score cell (ie: '3:1 (2:1)', '1:3 (0:0, 1:1) aet', 'dnp')
    isException -- accepted for the callers' sake; it is not read here

    The patterns are tried in the order listed and the first one matching the
    beginning of ``score`` wins (re.match does not anchor the end of the string).
    Raises ValueError when no pattern matches.
    """
    patterns = (
        # standard-score
        (r'([0-9]{1,2}):([0-9]{1,2}) \(([0-9]{1,2}):([0-9]{1,2})\)', extract_standard_score),
        # score-extra-time
        (r'([0-9]{1,2}):([0-9]{1,2}) \(([0-9]{1,2}):([0-9]{1,2}), ([0-9]{1,2}):([0-9]{1,2})\) aet', extract_score_et),
        # score-penalty
        (r'([0-9]{1,2}):([0-9]{1,2}) \(([0-9]{1,2}):([0-9]{1,2}), ([0-9]{1,2}):([0-9]{1,2}), ([0-9]{1,2}):([0-9]{1,2})\) pso', extract_score_pk),
        # simple-score
        (r'([0-9]{1,2}):([0-9]{1,2})', extract_simple_score),
        # cancelled-game
        (r'(annull|dnp|abor\.)', handle_cancelled_game),
    )

    for pattern, build_result in patterns:
        found = re.match(pattern, score)
        if found:
            return build_result(found)

    raise ValueError("Couldn't match score to any regular expression: {0}".format(score))

In [100]:
# Smoke test of analyse_score: one call per supported score pattern
# (half time, extra time, penalty shoot-out, final score only, two-digit score,
# the three "no game" markers, and a call with the exception flag set).
print(analyse_score('3:1 (0:0)'))
print(analyse_score('1:3 (0:0, 1:1) aet'))
print(analyse_score('1:4 (0:1, 2:2, 3:3) pso'))
print(analyse_score('1:3'))
print(analyse_score('10:2 (5:0)'))
print(analyse_score('dnp'))
print(analyse_score('annull'))
print(analyse_score('abor.'))
print(analyse_score('2:0 (1:3)', True))

{'status': 'Completed', 'home_score_ht': '0', 'away_score_ht': '0', 'home_score': '3', 'away_score': '1', 'home_score_et': None, 'away_score_et': None, 'home_pk': None, 'away_pk': None}
{'status': 'Completed', 'home_score_ht': '0', 'away_score_ht': '0', 'home_score': '1', 'away_score': '1', 'home_score_et': '1', 'away_score_et': '3', 'home_pk': None, 'away_pk': None}
{'status': 'Completed', 'home_score_ht': '0', 'away_score_ht': '1', 'home_score': '2', 'away_score': '2', 'home_score_et': '3', 'away_score_et': '3', 'home_pk': '1', 'away_pk': '4'}
{'status': 'Completed', 'home_score_ht': None, 'away_score_ht': None, 'home_score': '1', 'away_score': '3', 'home_score_et': None, 'away_score_et': None, 'home_pk': None, 'away_pk': None}
{'status': 'Completed', 'home_score_ht': '5', 'away_score_ht': '0', 'home_score': '10', 'away_score': '2', 'home_score_et': None, 'away_score_et': None, 'home_pk': None, 'away_pk': None}
{'status': 'Cancelled', 'home_score_ht': None, 'away_score_ht': None, 'ho

## Data extraction logic

In [101]:
# use regex to match date in string (ie: '05/07/2016')
def match_date(s):
    """Return the dd/mm/yyyy date found in ``s``.

    When ``s`` holds several dates, the last one is returned (the leading '.*' is greedy).
    Raises ValueError when ``s`` holds no date.
    """
    matchObj = re.match(r'.*([0-3][0-9]/[0-1][0-9]/[1-3][0-9]{3}).*', s)
    if not matchObj:
        # A bare `raise` here would only produce
        # "RuntimeError: No active exception to reraise"; report the real problem.
        raise ValueError("Couldn't find a date in: {0}".format(s))
    return matchObj.group(1)

In [102]:
# use regex to match time in string (ie: '17:05')
def match_time(s):
    """Return the hh:mm time found in ``s``, or None when there is none."""
    found = re.match(r'.*([0-2][0-9]:[0-5][0-9]).*', s)
    return found.group(1) if found else None

In [103]:
def is_matching_date(s):
    """Tell whether ``s`` contains a dd/mm/yyyy date (True / False)."""
    return re.match(r'.*([0-3][0-9]/[0-1][0-9]/[1-3][0-9]{3}).*', s) is not None

In [104]:
# list of games which need a special treatment (ie: the published score doesn't make sense
# because the result was decided afterwards)
# 2legs cup games must have date=time=None because the date/time is still unknown
# at the moment the exception list is looked up
_game_exception_fields = ('competition', 'season', 'stage', 'date', 'time', 'home_team', 'away_team')
game_exceptions = [
    dict(zip(_game_exception_fields, values))
    for values in (
        ('bundesliga', '1992-1993', 'Week32', '22/05/1993', '14:30', 'Bayer 05 Uerdingen', 'Eintracht Frankfurt'),
        ('bundesliga', '1994-1995', 'Week26', '15/04/1995', '14:30', 'Eintracht Frankfurt', 'Bayern München'),
        ('cup-winners-cup', '1987-1988', '2. Round', None, None, 'Hajduk Split', 'Olympique Marseille'),
        ('primeira-liga', '2007-2008', 'Week16', '13/01/2008', None, 'Os Belenenses', 'Naval 1° de Maio'),
        ('euro-qualifier', '2006-2007', 'Group F', '02/06/2007', '19:00', 'Denmark', 'Sweden'),
    )
]

def game_exception_contains(game_info):
    """Tell whether ``game_info`` is equal to one of the dicts of game_exceptions."""
    return any(known == game_info for known in game_exceptions)

In [105]:
def extract_result_from_league_tree(tree, zone, competition, season, stage):
    """Extract one dict per game from a 'league' page (league week or group stage).

    tree                             -- parsed html page
    zone, competition, season, stage -- copied as-is into every game dict

    Returns the list of game dicts, in page order. Rows with fewer than 6 cells
    are ignored. Raises ValueError (from analyse_score) on an unknown score format.
    """
    # first result table of the page
    table = tree.find('.//div[@class="data"]//table[@class="standard_tabelle"]')

    data = []

    # the date cell is only filled on the first row of each day: remember the last one seen
    current_date = None

    for tr in table.findall('.//tbody/tr'):

        # Cells 0 to 5 are read below, so at least 6 cells are needed (the previous
        # `< 5` guard let 5-cell rows through and failed on tr[5]).
        # Helps to ignore some special cases (ie: decisive matches in fa-cup final 1982/1983..)
        if len(tr) < 6:
            continue

        date = tr[0].text_content().strip()
        if date != '':
            current_date = date
        else:
            date = current_date

        # an empty time cell is stored as None
        # (local named kickoff_time so that the imported `time` module is not shadowed)
        kickoff_time = tr[1].text_content().strip()
        if kickoff_time == '':
            kickoff_time = None

        home_team = tr[2].text_content().strip()
        away_team = tr[4].text_content().strip()

        scores = tr[5].text_content().strip()
        isException = game_exception_contains({'competition': competition, 'season': season, 'stage': stage, 'date': date, 'time': kickoff_time, 'home_team': home_team, 'away_team': away_team})
        dic_scores = analyse_score(scores, isException)

        data.append({
            'zone': zone,
            'competition': competition,
            'season': season,
            'stage': stage,
            'status': dic_scores['status'],
            'date': date,
            'time': kickoff_time,
            'home_team': home_team,
            'away_team': away_team,
            'home_score_ht': dic_scores['home_score_ht'],
            'away_score_ht': dic_scores['away_score_ht'],
            'home_score': dic_scores['home_score'],
            'away_score': dic_scores['away_score'],
            'home_score_et': dic_scores['home_score_et'],
            'away_score_et': dic_scores['away_score_et'],
            'home_pk': dic_scores['home_pk'],
            'away_pk': dic_scores['away_pk']
        })
    return data

In [106]:
def extract_result_from_2legscup_tree(tree, zone, competition, season, stage):
    """Extract one dict per game from a '2legs-cup' page (ie: champions league knock out round).

    Rows of class "hell" carry the results; rows of class "dunkel" carry the
    date/time of the legs. Results are collected first, then the dates found in
    the "dunkel" rows are assigned to the games in the same order.

    Returns the list of game dicts; 'stage' is suffixed with ' - 1st Leg' or ' - 2nd Leg'.
    """
    data = []

    # --- first pass: one game per result row
    result_rows = tree.findall('.//div[@class="data"]//table[@class="standard_tabelle"]//tbody/tr[@class="hell"]')
    for row in result_rows:

        # rows with too few cells are special cases to ignore
        # (ie: decisive matches in fa-cup final 1982/1983..)
        if len(row) < 5:
            continue

        leg_label = row[0].text_content().strip()
        leg_suffix = ' - 1st Leg' if leg_label[:3] == '1st' else ' - 2nd Leg'
        curr_stage = stage + leg_suffix

        home_team = row[1].text_content().strip()
        away_team = row[3].text_content().strip()
        scores = row[4].text_content().strip()

        # date and time are not known yet, hence None in the exception lookup
        lookup = {'competition': competition, 'season': season, 'stage': stage, 'date': None, 'time': None, 'home_team': home_team, 'away_team': away_team}
        dic_scores = analyse_score(scores, game_exception_contains(lookup))

        game = {
            'zone': zone,
            'competition': competition,
            'season': season,
            'stage': curr_stage,
            'status': dic_scores['status'],
            'date': None,
            'time': None,
            'home_team': home_team,
            'away_team': away_team,
        }
        for field in ('home_score_ht', 'away_score_ht', 'home_score', 'away_score',
                      'home_score_et', 'away_score_et', 'home_pk', 'away_pk'):
            game[field] = dic_scores[field]
        data.append(game)

    # --- second pass: fill date/time, one game per date found
    date_rows = tree.findall('.//div[@class="data"]//table[@class="standard_tabelle"]//tbody/tr[@class="dunkel"]')
    index = 0
    for row in date_rows:
        # cell 1 holds the 1st leg date/time, cell 3 the 2nd leg one
        leg_texts = (row[1].text_content().strip(), row[3].text_content().strip())
        for leg_text in leg_texts:
            # a cell without a date does not consume a game
            if is_matching_date(leg_text):
                data[index]['date'] = match_date(leg_text)
                data[index]['time'] = match_time(leg_text)
                index += 1

    return data

In [107]:
def construct_stage_tree_path(zone, competition, season, stage):
    """Return the path of the html file of a stage: <root_path>/<zone>/<competition>/<season>/<stage>.html"""
    stage_filename = stage + '.html'
    return os.path.join(root_path, zone, competition, season, stage_filename)

In [108]:
def get_data_from_tree(zone, competition, season, stage):
    """Load the html file of the given stage and return the list of game dicts extracted from it."""
    stage_path = construct_stage_tree_path(zone, competition, season, stage)
    stage_tree = get_tree_from_file(stage_path)
    return extract_result_from_tree(stage_tree, zone, competition, season, stage)

In [109]:
# test get_data_from_tree
#data = get_data_from_tree('europe', 'cup-winners-cup', '1987-1988', '2. Round')

# Test

In [110]:
def wrap_test_stage(stage_data, assert_data, descr):
    """Load one stage from disk and compare a few of its games with expected values.

    stage_data  -- dict with the keys 'zone', 'competition', 'season' and 'stage'
    assert_data -- list of (game_index, expected) pairs; ``expected`` maps field
                   names to the value the game at ``game_index`` must hold
    descr       -- label printed before the checks

    Prints one failure line per field that differs, or TEST PASSED for the pair.
    """
    print('running test {0}'.format(descr))
    data = get_data_from_tree(stage_data['zone'], stage_data['competition'], stage_data['season'], stage_data['stage'])
    for entry in assert_data:
        game = data[entry[0]]
        expected_data = entry[1]
        test_passed = True
        for key in expected_data.keys():
            # Plain comparison instead of `assert`: assert statements are removed when
            # python runs with -O, which would make every check pass silently.
            if not (expected_data[key] == game[key]):
                print('\t# TEST FAILED: data is different from expected data: {0} != {1}'.format(game[key], expected_data[key]))
                test_passed = False
        if test_passed:
            print('\tTEST PASSED')

In [111]:
def run_test():
    """Run the regression checks of the parsers against pages stored under root_path.

    Each wrap_test_stage call names a stage (zone / competition / season / stage),
    lists (game index, expected fields) pairs for that stage and gives a label
    for the printed report. The expected values are fixtures tied to the
    downloaded files: the corresponding html files must exist on disk.
    """
    
    # 1st example: league game - ligue1
    wrap_test_stage(
        {'zone':'france', 'competition':'ligue1', 'season':'2016-2017', 'stage':'Week04'},
        [
            (0, {'home_team': 'Paris Saint-Germain', 'home_score_ht': '0', 'home_score': '1'})
        ],
        '#1')
    
    # 2nd example: cup game - coupe de france with extra time and penalty shots
    # 3rd example: cup game - coupe de france with extra time only
    wrap_test_stage(
        {'zone':'france', 'competition':'coupe-de-france', 'season':'2014-2015', 'stage':'Round of 16'},
        [
            (1, {'away_team': 'AJ Auxerre', 'home_score_ht': '0', 'away_score_ht': '0', 'home_score': '1', 'away_score': '1', 'home_pk': '5', 'away_pk': '6'}),
            (4, {'home_team': 'AS Yzeure', 'home_score_ht': '0', 'away_score_ht': '0', 'home_score': '1', 'away_score': '1', 'home_score_et': '1', 'away_score_et': '3', 'home_pk': None})
        ],
        '#2 and #3')
    
    # 4th example: 2 legs game - champions league
    # 5th example: 2 legs game - champions league with extra time
    # 6th example: 2 legs game - champions league with penalty shots
    wrap_test_stage(
        {'zone':'europe', 'competition':'champions-league', 'season':'2015-2016', 'stage':'Round of 16'},
        [
            (0, {'stage': 'Round of 16 - 1st Leg', 'date': '16/02/2016', 'time': '19:45', 'home_team': 'Paris Saint-Germain', 'home_score_ht': '1', 'away_score_ht': '1', 'home_score': '2', 'away_score': '1'}),
            (11, {'stage': 'Round of 16 - 2nd Leg', 'date': '16/03/2016', 'time': '19:45', 'away_team': 'Juventus', 'home_score_ht': '0', 'away_score_ht': '2', 'home_score': '2', 'away_score': '2', 'home_score_et': '4', 'away_score_et': '2'}),
            (13, {'stage': 'Round of 16 - 2nd Leg', 'date': '15/03/2016', 'time': '19:45', 'away_team': 'PSV Eindhoven', 'home_score_ht': '0', 'away_score_ht': '0', 'home_score': '0', 'away_score': '0', 'home_score_et': '0', 'away_score_et': '0', 'home_pk': '8', 'away_pk': '7'})
        ],
        '#4, #5 and #6')
    
    # 7th example: champions league group stage
    # 8th example: champions league group stage
    wrap_test_stage(
        {'zone':'europe', 'competition':'champions-league', 'season':'2015-2016', 'stage':'Group A'},
        [
            (0, {'stage': 'Group A', 'date': '15/09/2015', 'time': '19:45', 'home_team': 'Real Madrid', 'home_score_ht': '1', 'away_score_ht': '0', 'home_score': '4', 'away_score': '0'}),
            (1, {'stage': 'Group A', 'date': '15/09/2015', 'time': '19:45', 'home_team': 'Paris Saint-Germain', 'home_score_ht': '1', 'away_score_ht': '0', 'home_score': '2', 'away_score': '0'})
        ],
        '#7, #8')
    
    # 9th example: world cup qualifier CONCACAF
    wrap_test_stage(
        {'zone':'world', 'competition':'world-cup-qualifier-concacaf', 'season':'2000-2001', 'stage':'Caribbean Semi-finals'},
        [
            (2, {'stage': 'Caribbean Semi-finals - 1st Leg', 'date': '01/04/2000', 'time': None, 'home_team': 'Haiti', 'home_score_ht': None, 'away_score_ht': None, 'home_score': '9', 'away_score': '0', 'home_score_et': None, 'away_score_et': None, 'home_pk': None, 'away_pk': None})
        ],
        '#9')
    
    # 10th example: england fa-cup, final 1982/1983 with additional decisive game since first game was a draw (no penalty shots back then?)
    wrap_test_stage(
        {'zone':'england', 'competition':'fa-cup', 'season':'1982-1983', 'stage':'Final'},
        [
            (0, {'stage': 'Final', 'date': '21/05/1983', 'time': None, 'home_team': 'Manchester United', 'home_score_ht': '0', 'away_score_ht': '1', 'home_score': '2', 'away_score': '2', 'home_score_et': '2', 'away_score_et': '2', 'home_pk': None, 'away_pk': None})
        ],
        '#10')

    # 11th example: germany bundesliga, 1992/1993 Week32, Eintracht Frankfurt loses the game after breaking regulations
    wrap_test_stage(
        {'zone':'germany', 'competition':'bundesliga', 'season':'1992-1993', 'stage':'Week32'},
        [
            (3, {'stage': 'Week32', 'date': '22/05/1993', 'time': '14:30', 'home_team': 'Bayer 05 Uerdingen', 'away_team': 'Eintracht Frankfurt', 'home_score_ht': '1', 'away_score_ht': '3', 'home_score': '2', 'away_score': '0', 'home_score_et': None, 'away_score_et': None, 'home_pk': None, 'away_pk': None})
        ],
        '#11')
    
    # 12th example: germany bundesliga, 1994/1995 Week26, FC Bayern Munich loses the game after breaking regulations
    wrap_test_stage(
        {'zone':'germany', 'competition':'bundesliga', 'season':'1994-1995', 'stage':'Week26'},
        [
            (4, {'stage': 'Week26', 'date': '15/04/1995', 'time': '14:30', 'home_team': 'Eintracht Frankfurt', 'away_team': 'Bayern München', 'home_score_ht': '2', 'away_score_ht': '2', 'home_score': '2', 'away_score': '0', 'home_score_et': None, 'away_score_et': None, 'home_pk': None, 'away_pk': None})
        ],
        '#12')
    
    # 13th example: germany cup, 2nd round 1979/1980 with additional decisive game
    wrap_test_stage(
        {'zone':'germany', 'competition':'german-cup', 'season':'1979-1980', 'stage':'2. Round'},
        [
            (0, {'stage': '2. Round', 'date': '28/09/1979', 'time': None, 'home_team': 'VfB Stuttgart', 'away_team': 'SG Wattenscheid 09', 'home_score_ht': '7', 'away_score_ht': '1', 'home_score': '10', 'away_score': '2', 'home_score_et': None, 'away_score_et': None, 'home_pk': None, 'away_pk': None})
        ],
        '#13')
    
    # 14th example: champions league, Round of 16, 1982/1983, Dinamo Kiev vs KS 17 Nentori Tirana 2nd leg cancelled
    wrap_test_stage(
        {'zone':'europe', 'competition':'champions-league', 'season':'1982-1983', 'stage':'Round of 16'},
        [
            (2, {'stage': 'Round of 16 - 1st Leg', 'date': '20/10/1982', 'time': None, 'home_team': 'Dinamo Kiev', 'away_team': 'KS 17 Nentori Tirana', 'home_score_ht': None, 'away_score_ht': None, 'home_score': None, 'away_score': None, 'home_score_et': None, 'away_score_et': None, 'home_pk': None, 'away_pk': None})
        ],
        '#14')
    
    # 15th example: cup winners cup, Round 2, 1987/1988, Hadjuk Split disqualified because crowd events
    wrap_test_stage(
        {'zone':'europe', 'competition':'cup-winners-cup', 'season':'1987-1988', 'stage':'2. Round'},
        [
            (15, {'stage': '2. Round - 2nd Leg', 'date': '05/11/1987', 'time': None, 'home_team': 'Hajduk Split', 'away_team': 'Olympique Marseille', 'home_score_ht': '1', 'away_score_ht': '0', 'home_score': '0', 'away_score': '3', 'home_score_et': None, 'away_score_et': None, 'home_pk': None, 'away_pk': None})
        ],
        '#15')
    
    # 16th example: portugal primeira liga, Week16, 2007/2008, Os Belenenses vs Naval 1° de Maio 
    wrap_test_stage(
        {'zone':'portugal', 'competition':'primeira-liga', 'season':'2007-2008', 'stage':'Week16'},
        [
            (3, {'stage': 'Week16', 'date': '13/01/2008', 'time': None, 'home_team': 'Os Belenenses', 'away_team': 'Naval 1° de Maio', 'home_score_ht': '1', 'away_score_ht': '1', 'home_score': '0', 'away_score': '3', 'home_score_et': None, 'away_score_et': None, 'home_pk': None, 'away_pk': None})
        ],
        '#16')
    
    # 17th example: euro qualifier, Group F, 2006/2007, Denmark Sweden: match abandoned after referee was attacked by a fan
    wrap_test_stage(
        {'zone':'europe', 'competition':'euro-qualifier', 'season':'2006-2007', 'stage':'Group F'},
        [
            (18, {'stage': 'Group F', 'date': '02/06/2007', 'time': '19:00', 'home_team': 'Denmark', 'away_team': 'Sweden', 'home_score_ht': '1', 'away_score_ht': '3', 'home_score': '0', 'away_score': '3', 'home_score_et': None, 'away_score_et': None, 'home_pk': None, 'away_pk': None})
        ],
        '#17')

    

In [112]:
# run the whole battery of tests (reads the stored html pages under root_path)
run_test()

running test #1
	TEST PASSED
running test #2 and #3
	TEST PASSED
	TEST PASSED
running test #4, #5 and #6
	TEST PASSED
	TEST PASSED
	TEST PASSED
running test #7, #8
	TEST PASSED
	TEST PASSED
running test #9
	TEST PASSED
running test #10
	TEST PASSED
running test #11
	TEST PASSED
running test #12
	TEST PASSED
running test #13
	TEST PASSED
running test #14
	TEST PASSED
running test #15
	TEST PASSED
running test #16
	TEST PASSED
running test #17
	TEST PASSED


# Remove potential stage duplicates

In [113]:
def stage_data_equivalent(stage1, stage2):
    """Tell whether two stages hold the same games, in the same order.

    stage1, stage2 -- lists of game dicts, as returned by the tree parsers

    Games are compared pairwise with game_data_equivalent(), i.e. ignoring
    their 'stage' field. Two empty lists are equivalent.
    """
    if len(stage1) != len(stage2):
        return False
    # Every pair must be checked: the earlier loop compared index 0 over and over
    # and stopped one short, so stages differing after their first game (and any
    # two single-game stages) were reported as equivalent.
    return all(game_data_equivalent(game1, game2) for game1, game2 in zip(stage1, stage2))

def game_data_equivalent(game1, game2):
    """Tell whether two game dicts hold the same values, leaving 'stage' aside.

    Returns False when the dicts differ in size or don't share the same keys.
    """
    if len(game1) != len(game2):
        return False
    keys_to_check = set(game1) - {'stage'}
    # Same size does not mean same keys: compare the key sets first so that a
    # missing key yields False rather than a KeyError.
    if keys_to_check != set(game2) - {'stage'}:
        return False
    return all(game1[key] == game2[key] for key in keys_to_check)

In [114]:
# stage names known to be wrong or truncated: when two files hold the same data,
# the one carrying such a name is the one to drop
erroneous_stage_name = ['9', '10', 'Round of 1', 'Quarter-final', 'Semi-final', 'Fina', 'finale', 'finale_2']

def decide_stage_name_to_remove(stage1, stage2):
    """Pick which of two duplicate stage names must be removed.

    - when one name starts with 'Caribbean' and contains the other one, the
      other (shorter) name is removed (world-cup qualifier concacaf special case);
    - otherwise the first name listed in erroneous_stage_name is removed;
    - otherwise stage1 is removed.
    """
    # special case for world-cup qualif concacaf: keep the 'Caribbean ...' name
    if stage1.startswith('Caribbean') and stage2 in stage1:
        return stage2
    if stage2.startswith('Caribbean') and stage1 in stage2:
        return stage1

    # general case: stage1 is looked up first, and is also the fallback
    for candidate in (stage1, stage2):
        if candidate in erroneous_stage_name:
            return candidate
    return stage1

In [115]:
# full paths of the files flagged for removal by the duplicate search
files_to_remove = []

def remove_duplicate_everywhere():
    """Reset files_to_remove, then search every zone folder of root_path for duplicates.

    Entries of root_path which are files, or whose name starts with '_'
    (ie: _conf, _logs), are not zones and are left aside.
    """
    global files_to_remove
    files_to_remove = []

    zones = []
    for entry in listdir(root_path):
        if isfile(join(root_path, entry)):
            continue
        if entry.startswith('_'):
            continue
        zones.append(entry)

    for zone in zones:
        print('processing zone {0}'.format(zone))
        remove_duplicate_in_zone(zone)
        
def remove_duplicate_in_zone(zone):
    """Search every competition folder of ``zone`` for duplicated stage files."""
    zone_path = os.path.join(root_path, zone)
    for entry in listdir(zone_path):
        pass
    competitions = [entry for entry in listdir(zone_path) if not isfile(join(zone_path, entry))]
    for competition in competitions:
        remove_duplicate_in_competition(zone, competition)

def remove_duplicate_in_competition(zone, competition):
    """Search every season folder of ``competition`` for duplicated stage files."""
    competition_path = os.path.join(root_path, zone, competition)
    # the folder is listed completely before the first season is processed
    seasons = list(filter(lambda entry: not isfile(join(competition_path, entry)),
                          listdir(competition_path)))
    for season in seasons:
        remove_duplicate_in_season(zone, competition, season)

# main logic of duplicate removal
def remove_duplicate_in_season(zone, competition, season):
    """Flag the broken or duplicated stage files of one season in files_to_remove.

    Files are sorted by size and each one is looked at together with the next
    bigger one:
    - a file smaller than ``min_file_size`` bytes is flagged (most probably a
      page which was not saved properly);
    - two neighbours whose sizes differ by less than ``precision`` bytes and
      whose stage names contain one another are parsed; when they hold the same
      games, the name picked by decide_stage_name_to_remove() is flagged.
    Files named season_default* are never flagged nor compared.

    Nothing is deleted here: full paths are only appended to files_to_remove.
    """
    global files_to_remove

    precision = 400
    min_file_size = 25000

    path = os.path.join(root_path, zone, competition, season)

    # list of (filename, size), smallest first
    season_data = sorted(
        ((name, os.path.getsize(os.path.join(path, name))) for name in listdir(path)),
        key=lambda tup: tup[1])

    def is_cache_file(name):
        # season_default.html is used as cache by the tree saver process
        return name[:14] == 'season_default'

    def flag_for_removal(name):
        # files_to_remove holds full paths; never list the same file twice,
        # otherwise the actual removal would fail on the second os.remove
        full_path = os.path.join(path, name)
        if full_path not in files_to_remove:
            files_to_remove.append(full_path)

    for i, (filename, size) in enumerate(season_data):

        # already flagged as duplicate of the previous file: the lookup has to use
        # the full path, a bare file name never matches the stored entries
        if os.path.join(path, filename) in files_to_remove:
            continue

        # the cache file must be kept, whatever its size
        if is_cache_file(filename):
            continue

        # if file too small, most probably a tree not saved properly:
        # flag it and don't try to parse it
        if size < min_file_size:
            flag_for_removal(filename)
            continue

        # the biggest file has no neighbour to be compared with
        if i + 1 >= len(season_data):
            continue

        next_filename, next_size = season_data[i + 1]
        if is_cache_file(next_filename):
            continue

        # strip the '.html' extension
        stage = filename[:-5]
        next_stage = next_filename[:-5]

        # if next file is same size within a certain precision
        # and next file name is a subset of current file name (or the other way around)
        if abs(size - next_size) < precision and (stage.lower() in next_stage.lower() or next_stage.lower() in stage.lower()):
            # then load both stages as trees, extract data and compare
            data = get_data_from_tree(zone, competition, season, stage)
            next_data = get_data_from_tree(zone, competition, season, next_stage)

            if stage_data_equivalent(data, next_data):
                flag_for_removal(decide_stage_name_to_remove(stage, next_stage) + '.html')

In [116]:
def print_files_to_remove_info():
    """Print how many paths are queued in files_to_remove, then each path on its own line."""
    header = '\nthere are {0} files to remove:'.format(len(files_to_remove))
    print(header)
    for queued_path in files_to_remove:
        print(queued_path)

# actually remove all these files
def perform_removal(files=None):
    """Delete the files queued for removal.

    Args:
        files: optional iterable of file paths to delete. When omitted, the
            notebook-level ``files_to_remove`` list is used.

    The same path can be queued more than once, so paths are de-duplicated
    (first-seen order kept) and a path that no longer exists is reported and
    skipped instead of aborting the remaining deletions.
    """
    if files is None:
        files = files_to_remove

    # dict.fromkeys drops repeated paths while preserving their order
    for f in dict.fromkeys(files):
        print('deleting {0}'.format(f))
        try:
            os.remove(f)
        except FileNotFoundError:
            print('already removed, skipping {0}'.format(f))

In [117]:
# Run the duplicate detection, then print the paths queued in files_to_remove.
# NOTE(review): deletion itself appears to happen only in perform_removal() - confirm.
remove_duplicate_everywhere()
print_files_to_remove_info()

processing zone england
processing zone europe
processing zone france
processing zone germany
processing zone italy
processing zone netherlands
processing zone portugal
processing zone spain
processing zone world

there are 0 files to remove:


In [118]:
# Destructive step: uncomment to delete (os.remove) every file listed above.
#perform_removal()

# CSV file generation

In [119]:
# Entries skipped by csv_generation. An entry with both 'competition' and
# 'season' skips that single season; an entry with only 'competition' skips
# every season of that competition.
competition_to_ignore = [
    # world cup qualifier africa seems buggy on the worldfootball.net website
    {'competition': 'world-cup-qualifier-africa', 'season': '1992-1993'},
    {'competition': 'world-cup-qualifier-africa', 'season': '2000-2001'},
    {'competition': 'world-cup-qualifier-africa', 'season': '2015-2017'},
    {'competition': 'world-cup-qualifier-asia', 'season': '2015-2017'},
    {'competition': 'world-cup-qualifier-europe', 'season': '2016-2017'},
    {'competition': 'world-cup-qualifier-south-america', 'season': '2015-2017'},
    {'competition': 'world-cup-qualifier-concacaf'},
    {'competition': 'world-cup-qualifier-oceania'}
]

def _select_zones(zones_from_path, ignore_zones, zone_filter):
    """Return the zones to process, in the order they appear in zones_from_path.

    ignore_zones and zone_filter may each be None (option not configured).
    When both are given, a zone must pass both: not ignored AND in the filter.
    """
    zones = list(zones_from_path)
    if ignore_zones is not None:
        ignored = set(ignore_zones)
        zones = [z for z in zones if z not in ignored]
    if zone_filter is not None:
        wanted = set(zone_filter)
        zones = [z for z in zones if z in wanted]
    return zones


def csv_generation(root_path):
    """Walk root_path/<zone>/<competition>/<season>/<stage>.html and write one CSV.

    For every stage file (except 'season_default.html') the rows returned by
    get_data_from_tree are collected, then written to
    <csv_path>/worldfootball.csv with the columns listed in ``cols``.

    Args:
        root_path: folder holding one sub-folder per zone; sub-folders whose
            name starts with '_' (e.g. _conf, _logs) are not zones.

    Selection rules:
        * zones: restricted by the 'ignore_sections_for_csv_generation' and
          'section_filter_for_csv_generation' conf options (both optional,
          both applied when present);
        * seasons: the first four characters of the folder name are read as
          the starting year and must lie within [season_min, season_max];
          folders whose name does not start with a year are skipped;
        * competition_to_ignore entries are skipped.

    Any exception is logged (with traceback) rather than propagated, so a
    failure leaves the previous CSV file, if any, untouched.
    """

    # load conf
    conf = ConfHandler(conf_path)
    ignore_zones = conf.get('general', 'ignore_sections_for_csv_generation', True)
    zone_filter = conf.get('general', 'section_filter_for_csv_generation', True)

    season_min = int(conf.get('general', 'season_min'))
    season_max = int(conf.get('general', 'season_max'))

    # for all zone (folder) in root_path (except those starting with '_', ie: _conf, _logs)
    zones_from_path = [f for f in listdir(root_path) if not isfile(join(root_path, f)) and f[0]!='_']

    # always defined, even when neither option is configured; order is kept
    # so the CSV row order is reproducible from one run to the next
    zones = _select_zones(zones_from_path, ignore_zones, zone_filter)

    data = []
    cols = ['zone', 'competition', 'season', 'stage', 'status', 'date', 'time', 'home_team', 'away_team',
            'home_score_ht', 'away_score_ht', 'home_score', 'away_score',
            'home_score_et', 'away_score_et', 'home_pk', 'away_pk']

    try:

        for zone in zones:

            # for all competition (folder) in zone
            zone_path = os.path.join(root_path, zone)
            competitions = [f for f in listdir(zone_path) if not isfile(join(zone_path, f))]
            for competition in competitions:

                # whole-competition exclusions do not depend on the season: decide once
                if {'competition': competition} in competition_to_ignore:
                    logWrapper('ignoring whole competition {0}, as part of the list of competition to ignore'.format(competition))
                    continue

                # for all season (folder) in competition
                comp_path = os.path.join(zone_path, competition)
                seasons = [f for f in listdir(comp_path) if not isfile(join(comp_path, f))]
                for season in seasons:

                    # a stray folder must not abort the whole generation
                    try:
                        season_start = int(season[:4])
                    except ValueError:
                        logWrapper('ignoring folder {0}: name does not start with a year'.format(season), level="warning")
                        continue

                    if season_start < season_min or season_start > season_max:
                        continue

                    if {'competition': competition, 'season': season} in competition_to_ignore:
                        logWrapper('ignoring season {0}, as part of the list of competition to ignore'.format(season))
                        continue

                    # for all stage (.html files) in season (except 'season_default.html')
                    season_path = os.path.join(comp_path, season)
                    stages = [f[:-5] for f in listdir(season_path) if isfile(join(season_path, f)) and f[-5:]=='.html' and f!='season_default.html']
                    for stage in stages:
                        logWrapper('processing {0}, {1}, {2}, {3}'.format(zone, competition, season, stage))
                        data.extend(get_data_from_tree(zone, competition, season, stage))

        # write to csv
        df = pd.DataFrame(data, columns=cols)
        df.to_csv(os.path.join(csv_path, 'worldfootball.csv'), index=False)

    except Exception as e:
        # best-effort: report the failure as an error and keep the traceback in the log file
        logWrapper('Exception: {0}'.format(e), level="error")
        logWrapper(traceback.format_exc(), level="error", print_console=False)

In [120]:
# Regenerate worldfootball.csv (written under csv_path) from the trees stored below root_path.
csv_generation(root_path)

processing europe, champions-league, 1979-1980, 1. Round
processing europe, champions-league, 1979-1980, endspiel
processing europe, champions-league, 1979-1980, Final
processing europe, champions-league, 1979-1980, Quarter-finals
processing europe, champions-league, 1979-1980, Round of 16
processing europe, champions-league, 1979-1980, Semi-finals
processing europe, champions-league, 1980-1981, 1. Round
processing europe, champions-league, 1980-1981, endspiel
processing europe, champions-league, 1980-1981, Final
processing europe, champions-league, 1980-1981, Quarter-finals
processing europe, champions-league, 1980-1981, Round of 16
processing europe, champions-league, 1980-1981, Semi-finals
processing europe, champions-league, 1981-1982, 1. Round
processing europe, champions-league, 1981-1982, endspiel
processing europe, champions-league, 1981-1982, Final
processing europe, champions-league, 1981-1982, Quarter-finals
processing europe, champions-league, 1981-1982, Round of 16
process