In [139]:
import pandas as pd
import requests
import requests
from bs4 import BeautifulSoup
import time
import random
from io import StringIO
# temporary save to sqlite3
import sqlite3
import hashlib
from datetime import datetime

In [127]:
#Create headers to circumvent non-browser activity blocking (pd.read_html blocked)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
}

#int'l tournaments without market-value data
blacklist_league_codes = {
    "CL", "EL", "UCOL", "CLI", "AFCL", "ACL", "CCL", "KLUB",
    "EM24", "EMQ", "CAM4", "WMQ4", "WM22", "AM23", "AFCN",
    "GC23", "19YL", "CLIY", "berater"
}

In [None]:
def commit_changes(df, table_name, db = "transfermarkt.db"):
    conn = sqlite3.connect(db)

    # Save DataFrame to a new table (schema) named "players"
    df.to_sql(table, conn, if_exists="append", index=False)

    # Close connection
    conn.close()

def load_keys(table, primary_key, db = "transfermarkt.db"):
    conn = sqlite3.connect(db)

    # SQL query to fetch data
    query = f"SELECT DISTINCT {', '.join(primary_key)}, hash_key FROM {table}"

    # Load data into a DataFrame
    df = pd.read_sql_query(query, conn)

    # Close the connection
    conn.close()
    
    return df

# history = load_keys('league', ['link'])

def hash_row(row):
    row_string = ''.join(str(value) for value in row.values)
    return hashlib.sha256(row_string.encode()).hexdigest()


def get_soup(url):
    time.sleep(random.uniform(1, 5))
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return BeautifulSoup(response.text, "html.parser")
    else:
        print(f"Request failed with status: {response.status_code}")
        return None

### Competitions

In [184]:
def get_competitions():
    url = "https://www.transfermarkt.com/wettbewerbe/national"
    soup = get_soup(url)
    if not soup:
        return []

    competitions = []

    # Look for all <ul class="tm-button-list"> sections
    sections = soup.find_all("ul", class_="tm-button-list")
    for section in sections:
        links = section.find_all("a", class_="tm-button-list__list-item")
        for link in links:
            name = link.get("title")
            href = link.get("href")
            if name and href and 'agent' not in name.lower() and href.split("/")[-1] not in blacklist_league_codes:
                full_url = f"https://www.transfermarkt.com{href}"
                competitions.append((name, full_url))

    return competitions

comp_data = get_competitions()
comps =  pd.DataFrame(comp_data, columns =  ['name', 'link'])
comps['hash_key'] = comps.apply(hash_row, axis=1)
comps = comps.drop_duplicates(subset = ["hash_key"]).reset_index(drop = True)
comps['effective_start_date'] = datetime.today().date()
comps['effective_end_date'] = pd.to_datetime('2099-12-31').date()

comps.head()

In [191]:
def get_teams(url):
    print(url)
    response = requests.get(url, headers=headers)
    time.sleep(random.uniform(1, 5))
    
    if response.status_code == 200:
        
        try:
            data = pd.read_html(StringIO(response.text))
            if len(data) > 0:
                df = [i for i in data if 'total market value' in str(i).lower()][0]
                df["link"] = url
                return df
        
        except:
            print(f"Table not found: {url}")
    else:
        print(f"Request failed with status code {response.status_code} ({url})")
        
league_urls = [i[1] for i in comp_data]
teams_df = pd.DataFrame()
for url in league_urls:
    teams_df = pd.concat([teams_df, get_teams(url)])
    
teams_df = teams_df.dropna(subset = 'Club.1')

https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1
https://www.transfermarkt.com/laliga/startseite/wettbewerb/ES1
https://www.transfermarkt.com/bundesliga/startseite/wettbewerb/L1
https://www.transfermarkt.com/serie-a/startseite/wettbewerb/IT1
https://www.transfermarkt.com/ligue-1/startseite/wettbewerb/FR1
https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1
https://www.transfermarkt.com/laliga/startseite/wettbewerb/ES1
https://www.transfermarkt.com/serie-a/startseite/wettbewerb/IT1
https://www.transfermarkt.com/bundesliga/startseite/wettbewerb/L1
https://www.transfermarkt.com/ligue-1/startseite/wettbewerb/FR1
https://www.transfermarkt.com/liga-portugal/startseite/wettbewerb/PO1
https://www.transfermarkt.com/eredivisie/startseite/wettbewerb/NL1
https://www.transfermarkt.com/jupiler-pro-league/startseite/wettbewerb/BE1
https://www.transfermarkt.com/super-liga-srbije/startseite/wettbewerb/SER1
https://www.transfermarkt.com/super-league-1/startseite/w

### Teams

In [239]:
url = "https://www.transfermarkt.com/laliga/startseite/wettbewerb/ES1"
headers = {
    "User-Agent": "Mozilla/5.0",
}

league_urls = teams_df.link.unique()

team_links = []

for idx, league_url in enumerate(league_urls):
    if idx // 5 == idx / 5:
        print(f"{round(100*idx/len(league_urls))}% complete")
    print(league_url)
    response = requests.get(league_url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the table containing the teams
    temp = soup.select('table.items tbody tr')
    for row in soup.select('table.items tbody tr'):
        link_tag = row.select_one('td.hauptlink a')
        if link_tag:
            team_name = link_tag.text.strip()
            relative_link = link_tag['href']
            #filter out top scorer links
            if r"profil/spieler" not in relative_link:
                full_link = "https://www.transfermarkt.com" + relative_link

                team_links.append((league_url, team_name, full_link))

#     # Output
#     for league, team, link in team_links:
#         print(f"{team}: {link}")
#     if counter > 4:
#         break

0% complete
https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1
Removing /mohamed-salah/profil/spieler/148455
Removing /mohamed-salah/profil/spieler/148455
Removing /alexander-isak/profil/spieler/349066
Removing /alexander-isak/profil/spieler/349066
Removing /erling-haaland/profil/spieler/418560
Removing /erling-haaland/profil/spieler/418560
Removing /chris-wood/profil/spieler/108725
Removing /chris-wood/profil/spieler/108725
Removing /bryan-mbeumo/profil/spieler/413039
Removing /bryan-mbeumo/profil/spieler/413039
https://www.transfermarkt.com/laliga/startseite/wettbewerb/ES1
Removing /robert-lewandowski/profil/spieler/38253
Removing /robert-lewandowski/profil/spieler/38253
Removing /kylian-mbappe/profil/spieler/342229
Removing /kylian-mbappe/profil/spieler/342229
Removing /ante-budimir/profil/spieler/46413
Removing /ante-budimir/profil/spieler/46413
Removing /raphinha/profil/spieler/411295
Removing /raphinha/profil/spieler/411295
Removing /oihan-sancet/profil/spiele

Removing /amirhossein-hosseinzadeh/profil/spieler/542380
Removing /amirhossein-hosseinzadeh/profil/spieler/542380
Removing /ali-alipour/profil/spieler/249360
Removing /ali-alipour/profil/spieler/249360
Removing /ramin-rezaeian/profil/spieler/188312
Removing /ramin-rezaeian/profil/spieler/188312
Removing /mohammad-amin-kazemian/profil/spieler/477113
Removing /mohammad-amin-kazemian/profil/spieler/477113
Removing /mehdi-limouchi/profil/spieler/882100
Removing /mehdi-limouchi/profil/spieler/882100
https://www.transfermarkt.com/qatar-stars-league/startseite/wettbewerb/QSL
Removing /roger-guedes/profil/spieler/348263
Removing /roger-guedes/profil/spieler/348263
Removing /akram-afif/profil/spieler/336646
Removing /akram-afif/profil/spieler/336646
Removing /baghdad-bounedjah/profil/spieler/209992
Removing /baghdad-bounedjah/profil/spieler/209992
Removing /rafa-mujica/profil/spieler/361504
Removing /rafa-mujica/profil/spieler/361504
Removing /pelle-van-amersfoort/profil/spieler/262613
Removing

In [256]:
teams_df = pd.DataFrame(team_links, columns = ['league_link', 'team', 'team_link']).drop_duplicates()

def createDetailedURL(url):
    return url.replace('startseite', 'kader') + r"/plus/1"

teams_df["team_link_detailed"] = teams_df['team_link'].apply(lambda x: createDetailedURL(x))

#Test
print(teams_df[teams_df.team == 'Liverpool FC'].league_link.iloc[0])
print('https://www.transfermarkt.com/fc-liverpool/startseite/verein/31/saison_id/2024')
print(teams_df[teams_df.team == 'Liverpool FC'].team_link.iloc[0])
print(teams_df[teams_df.team == 'Liverpool FC'].team_link_detailed.iloc[0])

https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1
https://www.transfermarkt.com/fc-liverpool/startseite/verein/31/saison_id/2024
https://www.transfermarkt.com/fc-liverpool/startseite/verein/31/saison_id/2024
https://www.transfermarkt.com/fc-liverpool/kader/verein/31/saison_id/2024/plus/1


### Old

In [None]:
# from sqlalchemy import create_engine, Column, String, Integer, Float, Date, ForeignKey
# from sqlalchemy.ext.declarative import declarative_base

# Base = declarative_base()

# class League(Base):
#     __tablename__ = "league"
#     link = Column(String, primary_key=True)
#     name = Column(String)

# import sqlite3

# # Connect to SQLite database (creates it if it doesn't exist)
# conn = sqlite3.connect("transfermarkt.db")

# # Save DataFrame to a new table (schema) named "players"
# comps.to_sql("league", conn, if_exists="replace", index=False)

# # Optional: verify schema creation
# print("Schema created and data saved successfully.")

# # Close connection
# conn.close()


# from sqlalchemy import create_engine

# # Define your credentials
# db_user = "your_username"
# db_password = "your_password"
# db_host = "localhost"         # Or your RDS endpoint if hosted on AWS
# db_port = "5432"
# db_name = "your_db_name"

# # Use f-string to create the engine
# engine = create_engine(f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}")