In [1]:
import os
import re

import pandas as pd
import numpy as np

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

from bs4 import BeautifulSoup

from tqdm.auto import tqdm
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Logging tweak aimed at the 'WDM' logger (presumably webdriver_manager's).
# NOTE(review): this variable is set after webdriver_manager was imported above
# and holds the string "false" - confirm the installed version reads it at
# call time and accepts that value.
os.environ['WDM_LOG_LEVEL'] = "false"

import logging
# NOTSET on a non-root logger means "defer to the parent logger's level".
logging.getLogger('WDM').setLevel(logging.NOTSET)

### DATA SOURCE: https://www.tff.org/default.aspx?pageID=520

In [2]:
def get_driver():
    """Create the Chrome WebDriver used for the scraping session.

    The chromedriver binary path comes from ``ChromeDriverManager().install()``
    and is handed to Selenium through a ``Service`` object.

    Returns:
        The ``webdriver.Chrome`` instance built with the options set below.
    """
    chrome_options = webdriver.ChromeOptions()

    # Headless mode is not enabled here.
    command_line_flags = (
        '--log-level=3',
        "--silent",
        '--disable-gpu',
        '--ignore-certificate-errors',
        '--ignore-ssl-errors',
    )
    for flag in command_line_flags:
        chrome_options.add_argument(flag)

    experimental_settings = {
        "excludeSwitches": ["enable-automation"],
        'useAutomationExtension': False,
    }
    for setting_name, setting_value in experimental_settings.items():
        chrome_options.add_experimental_option(setting_name, setting_value)

    chromedriver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(chromedriver_path), options=chrome_options)

In [3]:
def select_year_range(driver, year_range):
    """Pick a season on the currently loaded page and submit the search.

    Performs four clicks in order: the second ``li`` of the level-1 menu,
    the season combo box, the ``div`` whose text contains ``year_range``,
    and the ``input`` in the last cell of the last row of the element whose
    id contains ``_PageView2``.

    Args:
        driver: Object exposing Selenium's ``find_element`` API.
        year_range: Text to look for in the option list, e.g. ``'2019-2020'``.

    Returns:
        The same ``driver`` that was passed in.
    """
    # Select Tab
    tab_bar = driver.find_element(by="css selector", value="div[class='levelwrap level1']")
    tab_items = tab_bar.find_elements(by="tag name", value="li")
    tab_items[1].click()

    # Click Dropdown
    season_box = driver.find_element(by="css selector", value="div.ComboBox_TFF_Ana[id*='_rdbSezonSec']")
    season_box.click()

    # Select Option
    # The XPath begins with "//", so although the lookup is issued on the combo
    # box element, the match is searched for in the whole document.
    season_xpath = f"//div[contains(text(), '{year_range}')]"
    season_box.find_element(by="xpath", value=season_xpath).click()

    # Click Search
    search_panel = driver.find_element(by="css selector", value="div[id*='_PageView2']")
    final_row = search_panel.find_elements(by="tag name", value="tr")[-1]
    final_cell = final_row.find_elements(by="tag name", value="td")[-1]
    final_cell.find_element(by="tag name", value="input").click()

    return driver

In [4]:
def parse_data(driver, stadium_list):
    """Walk every page of the results grid and collect matches at the given stadiums.

    Args:
        driver: Selenium WebDriver currently showing page 1 of the results grid.
        stadium_list: Iterable of stadium name fragments. A row is kept when any
            fragment occurs, case-insensitively, in the row's stadium cell
            (seventh ``td``).

    Returns:
        Tuple ``(driver, row_data)``. ``row_data`` is a list of dicts with keys
        ``"date"`` (``pd.to_datetime`` of the fifth and sixth cells joined by a
        space, parsed day-first) and ``"stadium"`` (raw text of the seventh cell).

    Raises:
        ValueError: If the grid footer text does not contain the expected
            ``Next » |  1 / N, Total`` pager fragment.
    """
    grid_selector = "fieldset > div.RadGrid_TFF_Contents > table"

    # Lower-case the search terms once instead of once per table row.
    wanted_names = [stad.lower() for stad in stadium_list]

    row_data = []

    soup = BeautifulSoup(driver.page_source, "lxml")
    page_result_text = soup.select(f"{grid_selector} > tfoot")[0].text

    # Raw string: "\|" and "\d" are regex escapes, not valid Python string escapes.
    pager_match = re.search(r"Next » \|  1 / (\d+), Total", page_result_text)
    if pager_match is None:
        raise ValueError(
            f"Could not read the page count from the grid footer: {page_result_text!r}"
        )
    last_page = int(pager_match.group(1))

    print("Total Pages:", last_page)

    for current_page in range(1, last_page + 1):

        print("page", current_page)

        soup = BeautifulSoup(driver.page_source, "lxml")
        rows = soup.select(f"{grid_selector} > tbody")[0].find_all("tr")

        for row in rows:
            sub_rows = row.find_all("td")

            # Rows without a stadium column (index 6) cannot be match rows.
            if len(sub_rows) < 7:
                continue

            stadium_text = sub_rows[6].text
            stadium_lower = stadium_text.lower()

            if any(name in stadium_lower for name in wanted_names):
                row_data.append({
                    "date" : pd.to_datetime(sub_rows[4].text + " " + sub_rows[5].text, dayfirst=True),
                    "stadium" : stadium_text
                })

        # Click next page - only while another page is left to read.
        # NOTE(review): no explicit wait follows the click; this relies on the
        # grid being refreshed by the time page_source is read again - confirm.
        if current_page < last_page:
            (
                driver.find_element(by="tag name", value="tfoot")
                .find_elements(by="tag name", value="a")[-1]
                .click()
            )

    return driver, row_data

In [5]:
# Seasons to scrape; each string is matched against the option text of the
# season drop-down by select_year_range.
year_ranges=['2019-2020', '2020-2021', '2021-2022', '2022-2023']
stadium_list = pd.read_csv("tr_ist_stadiumns.csv")['stadium_names'].values

url="https://www.tff.org/default.aspx?pageID=520"
driver=get_driver()

data=[]
try:
    driver.get(url)

    for yr in year_ranges:

        print("Scraping data for", yr)
        driver = select_year_range(driver=driver, year_range=yr)
        driver, row_data = parse_data(driver=driver, stadium_list=stadium_list)
        data.extend(row_data)
        print()
finally:
    # Close the browser even when a season fails part-way, so no Chrome or
    # chromedriver process is left running.
    driver.quit()

print("Completed!")


Scraping data for 2019-2020
Total Pages: 13
page 1
page 2
page 3
page 4
page 5
page 6
page 7
page 8
page 9
page 10
page 11
page 12
page 13

Scraping data for 2020-2021
Total Pages: 17
page 1
page 2
page 3
page 4
page 5
page 6
page 7
page 8
page 9
page 10
page 11
page 12
page 13
page 14
page 15
page 16
page 17

Scraping data for 2021-2022
Total Pages: 16
page 1
page 2
page 3
page 4
page 5
page 6
page 7
page 8
page 9
page 10
page 11
page 12
page 13
page 14
page 15
page 16

Scraping data for 2022-2023
Total Pages: 14
page 1
page 2
page 3
page 4
page 5
page 6
page 7
page 8
page 9
page 10
page 11
page 12
page 13
page 14

Completed!


In [34]:
# Turn the scraped records (one dict per match, keys "date" and "stadium") into a DataFrame.
df = pd.DataFrame(data)

In [35]:
# Sanity check: number of rows that exactly repeat an earlier row.
df.duplicated().sum()

0

In [36]:
# Show the distinct raw stadium spellings in sorted order to spot variants of the same venue.
np.sort(df['stadium'].unique())

array(['ALİ SAMİ YEN SPOR KOMPLEKSİ - İSTANBUL',
       'ALİ SAMİ YEN SPOR KOMPLEKSİ TÜRK TELEKOM STADYUMU ',
       'ATATÜRK OLİMPİYAT - İSTANBUL', 'BAŞAKŞEHİR FATİH TERİM',
       'BAŞAKŞEHİR FATİH TERİM - İSTANBUL - BAŞAKŞEHİR',
       'ESENYURT NECMİ KADIOĞLU STADI - İSTANBUL - ESENYUR',
       'RECEP TAYYİP ERDOĞAN STADYUMU - İSTANBUL',
       'VODAFONE PARK - İSTANBUL', 'VODAFONE PARK - İSTANBUL - BEŞİKTAŞ',
       'ÜLKER STADYUMU FB ŞÜKRÜ SARACOĞLU SPOR KOMPLEKSİ -',
       'ÜMRANİYE BELEDİYESİ ŞEHİR STADI - İSTANBUL',
       'ÜMRANİYE BELEDİYESİ ŞEHİR STADI - İSTANBUL - ÜMRAN'], dtype=object)

In [37]:
# Collapse the raw spellings into one canonical label per venue.
# Keys are the spellings left after the two regex steps below; values are
# the labels they are mapped to (upper-cased by the final step).
stadium_aliases = {
    'ESENYURT NECMİ KADIOĞLU STADYUMU - İSTANBUL': 'NECMİ KADIOĞLU - İSTANBUL',
    'ALİ SAMİ YEN SPOR KOMPLEKSİ - İSTANBUL': 'Nef Stadyumu - ISTANBUL',
    'ALİ SAMİ YEN SPOR KOMPLEKSİ TÜRK TELEKOM STADYUMU ': 'Nef Stadyumu - ISTANBUL',
    'ÜLKER STADYUMU FB ŞÜKRÜ SARACOĞLU SPOR KOMPLEKSİ -': "ÜLKER STADYUMU",
    'BAŞAKŞEHİR FATİH TERİM': 'BAŞAKŞEHİR FATİH TERİM - İSTANBUL',
    'ÜMRANİYE BELEDİYESİ ŞEHİR STADYUMU - İSTANBUL': 'ÜMRANİYE STADYUMU - İSTANBUL',
}

df['stadium'] = (
    df['stadium']
    # Drop a trailing district suffix (" - ÜMRAN", " - BEŞİKTAŞ", ...).
    # An explicit alternation is used instead of a character class, which
    # would strip ANY trailing run of those letters (e.g. " - ANKARA").
    # \w* also accepts the untruncated district names.
    .replace(r" - (?:ÜMRAN|BEŞİKTAŞ|ESENYUR|BAŞAKŞEHİR)\w*$", "", regex=True)
    # "STADI" -> "STADYUMU"; strings already spelled "STADYUMU" do not contain "STADI".
    .replace("STADI", "STADYUMU", regex=True)
    # Whole-value replacements (no regex).
    .replace(stadium_aliases)
    .str.upper()
)

In [38]:
# Distinct normalised stadium labels, in order of first appearance in df.
stadiums = df['stadium'].unique()
stadiums

array(['RECEP TAYYİP ERDOĞAN STADYUMU - İSTANBUL', 'ÜLKER STADYUMU',
       'VODAFONE PARK - İSTANBUL', 'BAŞAKŞEHİR FATİH TERİM - İSTANBUL',
       'NEF STADYUMU - ISTANBUL', 'ATATÜRK OLİMPİYAT - İSTANBUL',
       'NECMİ KADIOĞLU - İSTANBUL', 'ÜMRANİYE STADYUMU - İSTANBUL'],
      dtype=object)

In [39]:
# Get coordinates of each stadium.
# One Nominatim geocoder is created here and reused for every lookup;
# "istanbul_stadiums" is the user agent string it is constructed with.
geolocator = Nominatim(user_agent="istanbul_stadiums")

In [40]:
# Look up every stadium label once and keep the coordinates of the hits.
# Requests go through geopy's RateLimiter so consecutive calls are at least
# one second apart; with its defaults a failing call is retried and, if it
# still fails, yields None instead of aborting the whole loop.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

stad_info=[]
for stad in tqdm(stadiums):

    location = geocode(stad)

    if location:
        stad_info.append({
            "stadium" : stad,
            "stad_lat" : location.latitude,
            "stad_long" : location.longitude,
            "stad_address" : location.address
        })

    else:
        # No result (or lookup failed): print the label so it can be fixed by hand.
        print(stad)

  0%|          | 0/8 [00:00<?, ?it/s]

In [41]:
# One row per stadium that returned a geocoding result: label, latitude, longitude, address.
stad_df = pd.DataFrame(stad_info)
stad_df

Unnamed: 0,stadium,stad_lat,stad_long,stad_address
0,RECEP TAYYİP ERDOĞAN STADYUMU - İSTANBUL,41.032745,28.972365,"Recep Tayyip Erdoğan Stadyumu, Yunus Kaptan So..."
1,ÜLKER STADYUMU,40.987636,29.036942,"Ülker Stadyumu, 3, Recep Peker Caddesi, Kızılt..."
2,VODAFONE PARK - İSTANBUL,41.039444,28.994412,"Vodafone Park, 1, Dolmabahçe Caddesi, Akaretle..."
3,BAŞAKŞEHİR FATİH TERİM - İSTANBUL,41.122807,28.809385,"İstanbul Başakşehir Fatih Terim Stadyumu, Ordu..."
4,NEF STADYUMU - ISTANBUL,41.103426,28.991038,"Nef Stadyumu, Metin Oktay Caddesi, Huzur Mahal..."
5,ATATÜRK OLİMPİYAT - İSTANBUL,41.074473,28.765661,"Atatürk Olimpiyat Stadyumu, Olimpiyat Bulvarı,..."
6,NECMİ KADIOĞLU - İSTANBUL,41.024317,28.697846,"Necmi Kadıoğlu Stadı, Okurlar Caddesi, Yunus E..."
7,ÜMRANİYE STADYUMU - İSTANBUL,41.058762,29.109409,"Ümraniye Şehir Stadyumu, Semerkant Sokağı, Hek..."


In [42]:
# Attach stadium coordinates to every match. The join is an inner join, so a
# match whose stadium has no row in stad_df is dropped. The result is ordered
# by kick-off date and given a fresh 0..n-1 index.
df_merged = (
    pd.merge(left=df, right=stad_df, on="stadium", how="inner")
    .sort_values(by="date")
    .reset_index(drop=True)
)
df_merged

Unnamed: 0,date,stadium,stad_lat,stad_long,stad_address
0,2019-08-18 21:45:00,RECEP TAYYİP ERDOĞAN STADYUMU - İSTANBUL,41.032745,28.972365,"Recep Tayyip Erdoğan Stadyumu, Yunus Kaptan So..."
1,2019-08-19 20:00:00,ÜLKER STADYUMU,40.987636,29.036942,"Ülker Stadyumu, 3, Recep Peker Caddesi, Kızılt..."
2,2019-08-23 20:30:00,VODAFONE PARK - İSTANBUL,41.039444,28.994412,"Vodafone Park, 1, Dolmabahçe Caddesi, Akaretle..."
3,2019-08-24 21:45:00,BAŞAKŞEHİR FATİH TERİM - İSTANBUL,41.122807,28.809385,"İstanbul Başakşehir Fatih Terim Stadyumu, Ordu..."
4,2019-08-25 21:49:00,NEF STADYUMU - ISTANBUL,41.103426,28.991038,"Nef Stadyumu, Metin Oktay Caddesi, Huzur Mahal..."
...,...,...,...,...,...
457,2023-06-06 17:00:00,NEF STADYUMU - ISTANBUL,41.103426,28.991038,"Nef Stadyumu, Metin Oktay Caddesi, Huzur Mahal..."
458,2023-06-06 20:00:00,BAŞAKŞEHİR FATİH TERİM - İSTANBUL,41.122807,28.809385,"İstanbul Başakşehir Fatih Terim Stadyumu, Ordu..."
459,2023-06-07 00:00:00,ÜLKER STADYUMU,40.987636,29.036942,"Ülker Stadyumu, 3, Recep Peker Caddesi, Kızılt..."
460,2023-06-07 20:00:00,NECMİ KADIOĞLU - İSTANBUL,41.024317,28.697846,"Necmi Kadıoğlu Stadı, Okurlar Caddesi, Yunus E..."


In [43]:
# Write the merged match table to CSV, leaving out the DataFrame index.
df_merged.to_csv("istanbul_football_superleague.csv", index=False)