# Introduction

In this notebook we retrieve past Ligue 1 results from the ESPN FC website (espnfcasia.com).

# Imports

In [1]:
import lxml.html as lh
import lxml.etree as et
import urllib.request as ulib
import pandas as pd
from selenium import webdriver
import time

# Function to get Tree from url

In [2]:
def selenium_url_to_tree(driver, url, wait_seconds=5):
    """Load `url` in a selenium webdriver and return the page parsed by lxml.

    Parameters
    ----------
    driver
        Object exposing `get(url)` and `page_source`
        (e.g. a `webdriver.Firefox()` instance).
    url : str
        Address of the page to load.
    wait_seconds : float, optional
        Pause between requesting the page and reading its source.
        Defaults to 5, the value that used to be hard-coded.

    Returns
    -------
    The root element produced by `lxml.html.fromstring` for the page source.
    """
    driver.get(url)
    # fixed pause before reading the source -- presumably to let the page's
    # scripts finish rendering; TODO confirm whether an explicit wait is better
    time.sleep(wait_seconds)
    return lh.fromstring(driver.page_source)

# Get Leagues urls

In [3]:
# root address of the site being scraped
espnfc_url = 'http://www.espnfcasia.com'

# scores page used as starting point for each league; the last 8 characters
# of every url are the page date written as YYYYMMDD
league_start_urls = dict([
    ('Premier League', 'http://www.espnfcasia.com/english-premier-league/23/scores?date=20010818'),
    ('Liga', 'http://www.espnfcasia.com/spanish-primera-division/15/scores?date=20000930'),
    ('Ligue 1', 'http://www.espnfcasia.com/french-ligue-1/9/scores?date=20010317'),
])

# open driver: that will open firefox window
#driver = webdriver.Firefox()

# construct the trees only once
#tree = selenium_url_to_tree(driver,espnfc_url)

# close firefox window once done
#driver.quit()

#main_div = tree.xpath('.//*[@class="desktop-nav-item leagues"]')[0]
#print(main_div)

#li_divs = main_div.xpath('.//*[@class="column-content"]')
#print("there are %s leagues" % (len(li_divs)))

#for li in li_divs:
#    href = li.xpath('.//a/@href')
#    league_name = li.xpath('.//a/text()')
#    print('%s : %s' % (league_name, href))

# Params

In [4]:
# league to scrape; used as the key looked up in league_start_urls
league = 'Ligue 1'

# date of the scores page to fetch, written as YYYYMMDD; it replaces the
# last 8 characters of the league's start url
date_str = '20170311'

# Start URL

In [5]:
# The start url comes from the espnfc website: follow
# Leagues -> French Ligue1 -> Scores & Fixtures, then choose the latest year
# possible in the dropdown menu at the top right.
#
# Scores pages follow this pattern:
#   http://www.espnfcasia.com/%competition%/%id%/scores?date=%date%
# For instance, the first date available for French Ligue 1 is 17th March 2001:
#   http://www.espnfcasia.com/french-ligue-1/9/scores?date=20010317

# take the league's url and swap its trailing 8-character date for date_str
start_url = league_start_urls[league][:-8] + date_str
print(start_url)

http://www.espnfcasia.com/french-ligue-1/9/scores?date=20170311


# Get tree related to start url (using selenium method)

In [6]:
# open driver: that will open firefox window
#driver = webdriver.Firefox()

# construct the trees only once
#tree = selenium_url_to_tree(driver,start_url)

# close firefox window once done
#driver.quit()

# Get tree related to start url (simple method using urlopen)

In [7]:
def get_tree(url):
    """Download `url` with urllib and return the document parsed by lxml.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    The tree returned by `lxml.html.parse` for the downloaded page.
    """
    # the context manager closes the HTTP response once parsing is done,
    # instead of leaving the connection open until garbage collection
    with ulib.urlopen(url) as response:
        return lh.parse(response)

# Get next fixtures urls

In [8]:
def get_next_fixture_urls(tree):
    """Return the href of every link found in the page's inline date selector.

    `tree` is any object with an lxml-style `xpath` method; the result of the
    query is returned untouched.
    """
    date_links_query = './/*[@class="inline-date-selector"]/ul//a/@href'
    return tree.xpath(date_links_query)

# Get season from current url

In [9]:
def get_season(tree):
    """Return the text of the selected option of the score date picker, spaces removed.

    Raises IndexError when the page has no selected option.
    """
    selected_labels = tree.xpath(
        './/*[@id="score-date-picker"]//option[@selected="selected"]/text()'
    )
    season_label = selected_labels[0]
    return season_label.replace(" ", "")

# Get date from current url

In [10]:
def get_date(url):
    """Format the last 8 characters of `url` (YYYYMMDD) as 'YYYY-MM-DD'."""
    stamp = url[-8:]
    # year = first 4 characters, day = last 2, month = whatever sits in between
    return '-'.join((stamp[:4], stamp[4:-2], stamp[-2:]))

# Get data from current url

In [11]:
def get_data(tree):
    """Extract one row per game from the score boxes of a parsed scores page.

    Parameters
    ----------
    tree
        Object with an lxml-style `xpath` method holding the parsed page.

    Returns
    -------
    list of [team_home, team_away, score_home, score_away]
        One entry per score box exposing at least two team names and two
        scores, in page order. Values are the text nodes found in the page,
        so scores are strings.

    Score boxes lacking two team names or two scores are skipped instead of
    raising IndexError; presumably these are games not played yet -- TODO
    confirm against a page containing upcoming fixtures.
    """
    data = []

    # one score box per game listed on the page
    for box in tree.xpath('.//*[@class="score-box"]'):
        teams = box.xpath('.//*[@class="team-name"]/span/text()')
        scores = box.xpath('.//*[@class="team-scores"]//span/text()')

        # incomplete box: no result to record
        if len(teams) < 2 or len(scores) < 2:
            continue

        # home side comes first for both names and scores
        data.append([teams[0], teams[1], scores[0], scores[1]])

    return data

#print(df)

# Recursive function which will analyse x dates for given league, starting with start_url (first date available on espnfc website)

In [12]:
# dataframe params: df starts empty and receives one row per game
cols = ['Season', 'Date', 'HomeTeam', 'AwayTeam', 'HomeScore', 'AwayScore']
df = pd.DataFrame([],columns=cols)
# row label used for the next df.loc[i] insertion; incremented after each row
i = 0

# analysis depth: analyse_league checks this counter as its exit condition
depth = 10

def analyse_league(url, visited=None):
    """Scrape the scores page at `url`, then recurse into the pages it links to.

    Each game found is written to the module-level dataframe `df` at row
    label `i`, and `i` is incremented per row. The module-level counter
    `depth` is decremented once per page fetched and is shared by the whole
    recursion, so no page is fetched once it has reached 0.

    Parameters
    ----------
    url : str
        Scores page to analyse; its last 8 characters must be the page date
        (YYYYMMDD), as expected by get_date.
    visited : set, optional
        URLs already analysed during this crawl. Callers normally omit it; it
        is created on the first call and passed down the recursion so that a
        page linked from several others is only scraped once.
    """
    # both names are rebound below: without this declaration Python treats
    # them as locals and the first read raises UnboundLocalError
    global depth, i

    if visited is None:
        visited = set()

    # exit condition: page budget used up, or page already analysed
    if depth <= 0 or url in visited:
        return
    visited.add(url)

    tree = get_tree(url)
    next_fixture_urls = get_next_fixture_urls(tree)
    season = get_season(tree)
    date = get_date(url)
    # get_data works on the parsed tree, not on the url string
    data = get_data(tree)

    # add to current df
    for game in data:
        df.loc[i] = [season, date] + list(game)
        i = i + 1

    # to control the recursion depth
    depth = depth - 1

    # recursive call
    for next_fixture_url in next_fixture_urls:
        if depth <= 0:
            # budget exhausted by a deeper call: no point walking the rest
            break
        analyse_league(next_fixture_url, visited)
    

In [13]:
# scrape a single page: the one selected through start_url
url = start_url
tree = get_tree(url)

# everything of interest on that page
next_fixture_urls = get_next_fixture_urls(tree)
season = get_season(tree)
date = get_date(url)
data = get_data(tree)

# one dataframe row per game, prefixed with the page's season and date
for d in data:
    l = [season, date, *d]
    df.loc[i] = l
    i += 1

print(df)

      Season        Date           HomeTeam   AwayTeam HomeScore AwayScore
0  2016/2017  2017-03-11          AS Monaco   Bordeaux         2         1
1  2016/2017  2017-03-11  AS Nancy Lorraine      Lille         1         2
2  2016/2017  2017-03-11           Guingamp     Bastia         5         0
3  2016/2017  2017-03-11        Montpellier     Nantes         2         3
4  2016/2017  2017-03-11       Stade Rennes  Dijon FCO         1         1
