In [3]:
# IMPORTS
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import re

In [40]:
# League pages used to test the data gathering.
# Every URL stops right after "Runde-", so appending a round number yields the page
# of that round. The round page holds the standings table and is also the starting
# point for the links to the detailed match reports, from which further data and
# labels are extracted.

# URL of a competition's round page; the competition id fills the placeholder.
OEFB_ROUND_URL = "https://www.oefb.at/bewerbe/Bewerb/{}/Runde-"

ligen_urls = {
    liga: OEFB_ROUND_URL.format(bewerb_id)
    for liga, bewerb_id in (
        #('2 Klasse Yspertal/AV', 208719),
        ('2 Klasse Ybbstal/AV', 208687),
        ('1 Klasse Nord', 208682),
        ('1 Klasse Ost', 208710),
        ('1 Klasse Waldviertel', 208684),
        ('1 Klasse Süd', 208697),
        ('1 Klasse West', 208693),
        ('1 Klasse Nordwest', 208713),
        ('Gebietsliga Nord/Nordwest', 208717),
        ('Gebietsliga Nordwest/Waldviertel', 208680),
        ('Gebietsliga Süd/Südost', 208695),
        ('Gebietsliga West', 208708),
        ('2 Landesliga Ost', 208714),
        ('2 Landesliga West', 208715),
        ('1 Landesliga', 208685),
    )
}

In [41]:
def wait_for_full_load(driver, path, how='xpath', timeout=None, poll_interval=1):
    '''
    Block until at least one element matching ``path`` is present on the page.

    Parameters
    ----------
    driver : browser driver exposing ``find_elements(by, value)``.
    path : str
        Locator value; interpreted according to ``how``.
    how : {'xpath', 'css', 'id'}
        Locator strategy used to look up ``path``.
    timeout : float or None
        Maximum number of seconds to wait. ``None`` (default) waits
        indefinitely.
    poll_interval : float
        Seconds to sleep between two lookups.

    Raises
    ------
    ValueError
        If ``how`` is not one of the supported strategies.
    TimeoutError
        If ``timeout`` is set and no element showed up in time.
    '''
    strategies = {'xpath': By.XPATH, 'css': By.CSS_SELECTOR, 'id': By.ID}
    if how not in strategies:
        # Fail early with a clear message instead of tripping over an unbound
        # local inside the polling loop.
        raise ValueError(f"Unsupported locator strategy {how!r}; expected one of {sorted(strategies)}")
    by = strategies[how]

    deadline = None if timeout is None else time.monotonic() + timeout
    # find_elements returns an empty list while nothing matches yet.
    while not driver.find_elements(by, path):
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(f"No element matching {how} {path!r} appeared within {timeout} seconds")
        time.sleep(poll_interval)

In [None]:
# Scrape every league in `ligen_urls` round by round and collect one row per played
# match (league, round, date, both teams with their standing/points BEFORE the match,
# kick-off time and attendance) into the DataFrame `df`.

# Column layout of `df`; every collected row follows exactly this order.
DF_COLUMNS = ['Liga','Round','Matchday Date','Hometeam','Hometeam Standing','Hometeam points','Awayteam','Awayteam Standing','Awayteam points','Matchday Time','Fans watching']

# Highest round number requested per league; rounds without match reports are skipped.
MAX_ROUNDS = 25

# Common XPath prefix of all fields read from a match report page.
MATCH_XPATH = '//*[@id="app-instance-5595842886476943452-1475120657383348073"]/div'


def _resolve_team_key(name, standings, role):
    '''
    Return the key of `standings` that belongs to the team `name`.

    The name shown on the match report can be shorter than the one in the standings
    table (e.g. "Scheibbs" vs. "Scheibbs *"), so when there is no exact match the
    first key containing `name` is used. `role` ("Hometeam"/"Awayteam") only labels
    the printed diagnostics. Raises KeyError if no key matches at all.
    '''
    if name in standings:
        return name
    print(f"{role} {name} not found in dict keys\n")
    candidates = [team for team in standings if name in team]
    if not candidates:
        raise KeyError(f"{role} {name!r} matches none of the teams in the standings: {list(standings)}")
    print(f"{role} has dict key name {candidates[0]}!\n")
    return candidates[0]


# rows are collected in a plain list and turned into the DataFrame once at the end
match_rows = []

# start the session
driver = webdriver.Safari()
#driver.maximize_window()

try:
    # get some URL to accept the cookies on that website
    driver.get(ligen_urls["1 Klasse West"])
    cookies_xpath_preferences = """//*[@id="app-instance-1582128784421-0"]/div/div/div/form/div[4]/input[1]"""
    wait_for_full_load(driver, cookies_xpath_preferences)
    driver.find_element(By.XPATH, cookies_xpath_preferences).click()

    # iterate over each league
    for liga, base_url in ligen_urls.items():
        print("#"*10,liga.upper(),"#"*10,'\n')

        # standings are per league: never carry them over from the previous league
        d_act = None

        for round_no in range(1, MAX_ROUNDS + 1):
            print('-'*5,f"RUNDE {round_no}",'-'*5,'\n')
            # open current round website
            driver.get(base_url + str(round_no))

            # if the matches have already been played there exists a link to the match summary = Spielbericht
            elems = driver.find_elements(By.XPATH, "//a[@title='Spielbericht']")

            print(f'Es konnten in der aktuellen Runde {len(elems)} Spielberichte gefunden werden!\n')

            # if no game was played in that round we continue with the next round
            if not elems:
                continue

            # ATTENTION: the table on the round page already shows the standings AFTER this
            # round was played, so it is only usable as input for the NEXT round.
            # Everything needed from this page is read before navigating to the match reports.
            stats_teams = [elem.get_attribute('title') for elem in driver.find_elements(By.CLASS_NAME, "m_g_team_only_1")]
            stats_points = [elem.text for elem in driver.find_elements(By.CLASS_NAME, "m_g_points_1")]
            hrefs = [elm.get_attribute('href') for elm in elems]

            # no standings known yet for this league (normally round 1):
            # everybody has 0 points and sits in the middle position (//2) of the table
            if round_no == 1 or d_act is None:
                d_act = {team: [len(stats_teams)//2, 0] for team in stats_teams}

            # save the stats as input for the next round
            d_next = {team: [pos + 1, stats_points[pos]] for pos, team in enumerate(stats_teams)}

            for href in hrefs:
                # open the match report
                driver.get(href)

                # round of the season and date of the matchday
                round_label = driver.find_element(By.XPATH, f"{MATCH_XPATH}/div[1]").text
                matchday_date = driver.find_element(By.XPATH, f"{MATCH_XPATH}/div[2]").text

                # team playing at home; the name as shown on the report is what gets stored
                hometeam = driver.find_element(By.XPATH, f"{MATCH_XPATH}/div[3]/a[1]/span").text
                home_place, home_points = d_act[_resolve_team_key(hometeam, d_act, "Hometeam")]

                # team playing away
                awayteam = driver.find_element(By.XPATH, f"{MATCH_XPATH}/div[3]/a[2]/span").text
                away_place, away_points = d_act[_resolve_team_key(awayteam, d_act, "Awayteam")]

                # kick-off time and number of fans that watched the game
                matchday_time = driver.find_element(By.XPATH, f"{MATCH_XPATH}/div[4]/div[3]/div[1]/span[2]").text
                fans_watching = driver.find_element(By.XPATH, f"{MATCH_XPATH}/div[4]/div[3]/div[3]/span[2]").text

                match_rows.append([
                    liga, round_label, matchday_date,
                    hometeam, home_place, home_points,
                    awayteam, away_place, away_points,
                    matchday_time, fans_watching,
                ])

            # set the actual standings as input for the next round
            d_act = d_next
finally:
    # Build the frame first so rows scraped before a failure are kept,
    # then close the browser even if the scrape raised.
    df = pd.DataFrame(match_rows, columns=DF_COLUMNS)
    driver.quit()

1
########## 2 KLASSE YBBSTAL/AV ########## 

----- RUNDE 1 ----- 

Es konnten in der aktuellen Runde 7 Spielberichte gefunden werden!

Hometeam Scheibbs not found in dict keys

Hometeam has dict key name Scheibbs *!

----- RUNDE 2 ----- 

Es konnten in der aktuellen Runde 7 Spielberichte gefunden werden!

----- RUNDE 3 ----- 

Es konnten in der aktuellen Runde 7 Spielberichte gefunden werden!

Hometeam Scheibbs not found in dict keys

Hometeam has dict key name Scheibbs *!

----- RUNDE 4 ----- 

Es konnten in der aktuellen Runde 7 Spielberichte gefunden werden!



In [12]:
# NOTE(review): `df_store` is not assigned in any cell visible here (the scraping cell
# produces `df`), and this cell's execution count is lower than the cells before it.
# Presumably it comes from a cell that was run earlier or is defined elsewhere — confirm
# that it still exists after Restart Kernel -> Run All.
df_store