In [59]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re

# =========================
# Paths
# =========================
# NOTE: all paths below are absolute, user-specific Windows paths; adjust them
# before running this on another machine.

# Wyscout inputs (read with pd.read_parquet in the script section below).
PATH_WY_TRANSFERS = r"C:\Users\falkj\Documents\Player_trading\data\male_transfers_data.parquet"
PATH_WY_PLAYERS = r"C:\Users\falkj\Documents\Player_trading\data\players_wyscout.parquet"
PATH_WY_COMPETITIONS = r"C:\Users\falkj\Documents\Player_trading\data\competitions_wyscout.parquet"

# Transfermarkt inputs: league links, the Wyscout<->Transfermarkt player id
# mapping, and the team table used to resolve club names.
PATH_TM_LEAGUES = r"C:\Users\falkj\Documents\Player_trading\data\tm_league_links.parquet"
PATH_WY_TM_MAP = r"C:\Users\falkj\Documents\Player_trading\data\wy_tm_players_mapping.parquet"
PATH_TM_TEAMS = r"C:\Users\falkj\Documents\Player_trading\data\tm_teams.parquet"

# Destination parquet file for the collected transfer rows.
OUTPUT_TM_TRANSFERS = r"C:\Users\falkj\Documents\Player_trading\data\tm_transfers.parquet"

# =========================
# HTTP
# =========================
# Headers sent with every requests.get() call in this file; the User-Agent
# string presents the client as desktop Chrome 120 on Windows.
BASE_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0 Safari/537.36"
    )
}

# Base URL that the per-player "/transfer/history/player/<id>" endpoint is
# appended to.
API_BASE = "https://tmapi-alpha.transfermarkt.technology"
# Default pause, in seconds (passed to time.sleep), between consecutive requests.
REQUEST_SLEEP = 2.5  # be conservative


# =========================
# Utility helpers
# =========================
def _as_int(x):
    if x is None:
        return None
    try:
        if isinstance(x, str) and x.strip() == "":
            return None
        return int(float(x))  # handles "11.0" too
    except Exception:
        return None


def _compact_to_number(compact_dict):
    """
    TM API compact examples:
      {'prefix':'‚Ç¨','content':'1.20','suffix':'M'}
      {'prefix':'','content':'Free Transfer','suffix':''}
      {'prefix':'‚Ç¨','content':'25.00','suffix':'K'}
      {'prefix':'','content':'?','suffix':''}
    """
    if not isinstance(compact_dict, dict):
        return None

    content = (compact_dict.get("content") or "").strip()
    suffix = (compact_dict.get("suffix") or "").strip().lower()

    if not content or content in {"-", "?", "unknown"}:
        return None

    lowered = content.lower()
    if "free" in lowered or "loan" in lowered or "end of loan" in lowered:
        return 0

    # numeric
    content = content.replace(",", "")
    try:
        base = float(content)
    except ValueError:
        return None

    mult = 1
    if suffix == "k":
        mult = 1_000
    elif suffix == "m":
        mult = 1_000_000
    elif suffix in {"bn", "b"}:
        mult = 1_000_000_000

    return int(base * mult)


def _money_value(obj):
    """
    Handles:
      {'value': 1200000, 'currency': 'EUR', 'compact': {...}}
      {'value': None, 'currency': '', 'compact': {'content':'Free Transfer'}}
    """
    if not isinstance(obj, dict):
        return None

    v = obj.get("value", None)
    if isinstance(v, (int, float)):
        return int(v)

    if v is None:
        return _compact_to_number(obj.get("compact"))

    # sometimes value is string
    try:
        return int(float(str(v)))
    except Exception:
        return _compact_to_number(obj.get("compact"))


def _extract_history_list(data):
    """
    'history' can be:
      - list of transfers
      - dict of transfers (id -> transfer)
      - dict grouped by status (e.g. {'terminated':[...], 'pending':[...]} )
    Normalize to a flat list[dict]
    """
    history = data.get("history")
    if history is None:
        return []

    if isinstance(history, list):
        return [t for t in history if isinstance(t, dict)]

    if isinstance(history, dict):
        # grouped lists?
        if any(isinstance(v, list) for v in history.values()):
            out = []
            for v in history.values():
                if isinstance(v, list):
                    out.extend([t for t in v if isinstance(t, dict)])
            return out
        # id -> dict
        return [t for t in history.values() if isinstance(t, dict)]

    return []


# =========================
# Build name mappings
# =========================
def build_tm_player_name_map(wy_players: pd.DataFrame, wy_tm_map: pd.DataFrame):
    """
    Map Transfermarkt player_id -> player display names using Wyscout data + mapping.

    wy_tm_map columns: wy_id, tm_id
    wy_players columns include: player_id, first_name, last_name, short_name, name

    Returns a dict ``{tm_id (int): {"player_short_name", "player_first_name",
    "player_last_name", "player_name"}}``. Mapping rows whose tm_id cannot be
    converted to an int are skipped. Names are None when the Wyscout player is
    unknown or the name is missing. If a tm_id (or a Wyscout player_id) occurs
    more than once, the last occurrence wins.

    Raises KeyError if a required column is absent.
    """
    name_cols = ["short_name", "first_name", "last_name", "name"]

    # Ids are converted value by value in plain Python. Writing the converted
    # ids back into a pandas column can turn None into NaN (float dtype), which
    # an "is None" check would not catch.
    names_by_wy_id = {}
    for record in wy_players[["player_id"] + name_cols].itertuples(index=False, name=None):
        wy_id = _as_int(record[0])
        if wy_id is None:
            continue
        names_by_wy_id[wy_id] = tuple(None if pd.isna(v) else v for v in record[1:])

    no_names = (None,) * len(name_cols)

    # Build a dict tm_id -> dict of names
    name_map = {}
    for wy_raw, tm_raw in zip(wy_tm_map["wy_id"], wy_tm_map["tm_id"]):
        tm_id = _as_int(tm_raw)
        if tm_id is None:
            continue
        short_name, first_name, last_name, full_name = names_by_wy_id.get(
            _as_int(wy_raw), no_names
        )
        name_map[tm_id] = {
            "player_short_name": short_name,
            "player_first_name": first_name,
            "player_last_name": last_name,
            "player_name": full_name,
        }

    return name_map


def build_tm_team_name_map(tm_teams: pd.DataFrame):
    """
    Map clubId(int) -> team_name.

    The id is taken from 'tm_id' (preferred) or 'team_id'; the name from the
    first of 'team_name', 'name', 'club_name' that exists. Rows whose id cannot
    be converted to an int are skipped. If an id occurs more than once, the
    last row wins.

    Raises ValueError when no id column or no name column is present.
    """
    columns = tm_teams.columns

    if "tm_id" in columns:
        id_col = "tm_id"
    elif "team_id" in columns:
        id_col = "team_id"
    else:
        raise ValueError("tm_teams must have 'tm_id' or 'team_id'")

    # find name column
    name_col = next((c for c in ("team_name", "name", "club_name") if c in columns), None)
    if name_col is None:
        raise ValueError("tm_teams must have a team name column (team_name/name/club_name)")

    # Ids are converted value by value in plain Python. Writing the converted
    # ids back into a pandas column can turn None into NaN (float dtype), which
    # an "is None" check would not catch.
    out = {}
    for raw_id, team_name in zip(tm_teams[id_col], tm_teams[name_col]):
        tid = _as_int(raw_id)
        if tid is None:
            continue
        out[tid] = team_name
    return out


def build_tm_competition_map(tm_leagues: pd.DataFrame):
    """
    Build a lookup from Transfermarkt competition code to league name/country.

    tm_leagues has: league_name, country, tm_link
    (e.g. /allsvenskan/startseite/wettbewerb/SE1). The competition code is the
    path segment following /wettbewerb/ (SE1, SE3N, GB1, ...).

    Returns comp_id(str) -> {'competition_name': league_name,
    'competition_country': country}. Rows without an extractable code are
    skipped; the input frame is not modified.
    """
    code_pattern = re.compile(r"/wettbewerb/([^/?]+)")

    def code_from_link(link):
        # Non-string links (None, NaN, ...) carry no code.
        if isinstance(link, str):
            found = code_pattern.search(link)
            if found:
                return found.group(1)
        return None

    leagues = tm_leagues.copy()
    leagues["competition_id"] = leagues["tm_link"].apply(code_from_link)

    return {
        league.get("competition_id"): {
            "competition_name": league.get("league_name"),
            "competition_country": league.get("country"),
        }
        for _, league in leagues.iterrows()
        if league.get("competition_id")
    }


# =========================
# Transfermarkt scraping
# =========================
def build_transfer_url(tm_league_link, season):
    """
    Return the transfermarkt.com transfers-page URL for a league and season.

    The league's "startseite" link is turned into its "transfers" counterpart
    and the season is passed as ``saison_id`` alongside the fixed
    ``leihe=1&intern=0`` query parameters.
    """
    path = tm_league_link.replace("startseite", "transfers")
    query = f"?saison_id={season}&leihe=1&intern=0"
    return "".join(("https://www.transfermarkt.com", path, query))


def scrape_player_ids_from_transfer_table(url):
    """
    Download a transfers page and return the unique player ids linked on it.

    Every anchor whose href contains '/profil/spieler/' is inspected and the
    numeric id after '/spieler/' is collected. The result is a list of ints
    without duplicates, in no particular order.

    Raises requests.exceptions.RequestException on network errors or a
    non-success HTTP status (via raise_for_status).
    """
    page = requests.get(url, headers=BASE_HEADERS, timeout=30)
    page.raise_for_status()

    anchors = BeautifulSoup(page.content, "html.parser").select(
        "a[href*='/profil/spieler/']"
    )
    id_matches = (
        re.search(r"/spieler/(\d+)", anchor.get("href", "")) for anchor in anchors
    )
    return list({int(found.group(1)) for found in id_matches if found})


# =========================
# API calls
# =========================
def fetch_player_transfer_history(player_id, max_retries=3):
    """
    Fetch the transfer-history JSON for one player from the TM API.

    Up to ``max_retries`` attempts are made. After a failed attempt that is
    not the last one, the function waits 5 * attempt seconds before retrying.

    Returns the decoded JSON payload of the first successful response.

    Raises RuntimeError when every attempt failed; the last underlying error
    is attached as the exception's cause.
    """
    url = f"{API_BASE}/transfer/history/player/{player_id}"
    last_error = None

    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.get(url, headers=BASE_HEADERS, timeout=30)
            resp.raise_for_status()
            return resp.json()
        # ValueError: an undecodable JSON body is not a RequestException on
        # older requests releases, so catch it explicitly to keep retrying.
        except (requests.exceptions.RequestException, ValueError) as e:
            last_error = e
            message = f"‚ö†Ô∏è API error for player {player_id} (attempt {attempt}/{max_retries}): {e}"
            if attempt >= max_retries:
                # No retry follows, so waiting here would only delay the error.
                print(f"{message} -> giving up")
                break
            wait = 5 * attempt
            print(f"{message} -> sleep {wait}s")
            time.sleep(wait)

    raise RuntimeError(f"Failed to fetch transfer history for player {player_id}") from last_error


# =========================
# Parsing with name filling
# =========================
def _describe_transfer_side(side, team_name_map, competition_map):
    """Resolve club id/name and competition id/name/country for one end of a transfer."""
    club_id = _as_int(side.get("clubId"))
    comp_id = side.get("competitionId") or None

    # Competition name/country come from the tm_leagues-based map when known.
    comp_name = comp_country = None
    if comp_id and comp_id in competition_map:
        known = competition_map[comp_id]
        comp_name = known["competition_name"]
        comp_country = known["competition_country"]

    # No mapped country: fall back to the API's own countryId value.
    if comp_country is None:
        comp_country = side.get("countryId")

    return club_id, team_name_map.get(club_id), comp_id, comp_name, comp_country


def parse_transfer_history(
    player_id,
    json_data,
    player_name_map,
    team_name_map,
    competition_map
):
    """
    Turn one player's transfer-history API response into a list of row dicts.

    Returns [] when ``json_data`` is not a dict or its 'data' entry is not a
    dict. Otherwise one row is produced per transfer, with these keys.

    Extra helpful columns (filled from ``player_name_map``):
      player_name, player_first_name, player_last_name, player_short_name

    Output columns REQUIRED by supervisors:
      player_id
      team_id_from, team_name_from
      team_id_to, team_name_to
      competition_id_from, competition_name_from, competition_country_from
      competition_id_to, competition_name_to, competition_country_to
      age_at_transfer
      transfer_fee
      transfer_value
      date
      remaining_contract_period
      contract_until_date
    """
    payload = json_data.get("data") if isinstance(json_data, dict) else None
    if not isinstance(payload, dict):
        return []

    # Player identity is the same for every row of this player.
    pid = _as_int(player_id)
    names = player_name_map.get(pid, {})
    identity = {
        # extra (helpful)
        "player_name": names.get("player_name"),
        "player_first_name": names.get("player_first_name"),
        "player_last_name": names.get("player_last_name"),
        "player_short_name": names.get("player_short_name"),
        # required
        "player_id": pid,
    }

    rows = []
    for transfer in _extract_history_list(payload):
        details = transfer.get("details") or {}

        (from_club_id, from_club_name,
         from_comp_id, from_comp_name, from_comp_country) = _describe_transfer_side(
            transfer.get("transferSource") or {}, team_name_map, competition_map
        )
        (to_club_id, to_club_name,
         to_comp_id, to_comp_name, to_comp_country) = _describe_transfer_side(
            transfer.get("transferDestination") or {}, team_name_map, competition_map
        )

        # Only the 'days' part of the remaining contract period is kept.
        remaining = details.get("remainingContractPeriod")
        remaining_days = remaining.get("days") if isinstance(remaining, dict) else None

        rows.append({
            **identity,

            "team_id_from": from_club_id,
            "team_name_from": from_club_name,

            "team_id_to": to_club_id,
            "team_name_to": to_club_name,

            "competition_id_from": from_comp_id,
            "competition_name_from": from_comp_name,
            "competition_country_from": from_comp_country,

            "competition_id_to": to_comp_id,
            "competition_name_to": to_comp_name,
            "competition_country_to": to_comp_country,

            "age_at_transfer": details.get("age"),
            "transfer_fee": _money_value(details.get("fee")),
            "transfer_value": _money_value(details.get("marketValue")),

            "date": details.get("date"),
            "remaining_contract_period": remaining_days,
            "contract_until_date": details.get("contractUntilDate"),
        })

    return rows


# =========================
# Pipeline
# =========================
def _collect_player_ids(tm_leagues_df, seasons, max_players, sleep_between_requests):
    """Scrape every league/season transfers page and return the set of player ids found."""
    player_ids = set()

    def limit_reached():
        # A falsy max_players (None or 0) means "no limit".
        return bool(max_players) and len(player_ids) >= max_players

    for _, row in tm_leagues_df.iterrows():
        for season in seasons:
            url = build_transfer_url(row["tm_link"], season)
            try:
                pids = scrape_player_ids_from_transfer_table(url)
            except Exception as e:
                # Best effort: one broken page must not abort the whole run.
                # Still pause, so a failing site is not hit again immediately.
                print("ERROR:", url, e)
                time.sleep(sleep_between_requests)
                continue

            for pid in pids:
                player_ids.add(pid)
                if limit_reached():
                    break

            print(f"  {row['tm_link']} {season}: {len(pids)} players (total collected: {len(player_ids)})")

            if limit_reached():
                return player_ids
            time.sleep(sleep_between_requests)

    return player_ids


def collect_transfers_for_leagues(
    tm_leagues_df,
    seasons,
    player_name_map,
    team_name_map,
    competition_map,
    max_players=None,
    sleep_between_requests=REQUEST_SLEEP
):
    """
    Collect transfer histories for all players found on league transfer pages.

    Step 1 scrapes the transfers page of every league in ``tm_leagues_df``
    (column 'tm_link') for every season in ``seasons`` and gathers the unique
    player ids, stopping once ``max_players`` ids have been collected (a falsy
    value means no limit). Step 2 fetches and parses each player's transfer
    history, processing the ids in ascending order.

    Failures on a single page or a single player are printed and skipped.
    ``sleep_between_requests`` is the pause in seconds between requests.

    Returns a DataFrame with one row per transfer; it has no columns when
    nothing was collected.
    """
    print("üîç Scraping transfer tables...")
    all_player_ids = _collect_player_ids(
        tm_leagues_df, seasons, max_players, sleep_between_requests
    )
    print(f"\n‚úÖ Unique players selected: {len(all_player_ids)}")

    all_rows = []
    total = len(all_player_ids)

    print("\nüì° Fetching player transfer histories...")
    for i, player_id in enumerate(sorted(all_player_ids), 1):
        try:
            json_data = fetch_player_transfer_history(player_id)
            rows = parse_transfer_history(
                player_id,
                json_data,
                player_name_map=player_name_map,
                team_name_map=team_name_map,
                competition_map=competition_map
            )
        except Exception as e:
            # Best effort: skip this player. The fetch has already backed off
            # between its own retries, so no extra pause is added here.
            print("ERROR player:", player_id, e)
            continue

        all_rows.extend(rows)
        print(f"[{i}/{total}] Player {player_id} -> {len(rows)} transfers")
        time.sleep(sleep_between_requests)

    return pd.DataFrame(all_rows)

    
# Load data
wy_transfers = pd.read_parquet(PATH_WY_TRANSFERS)
wy_players = pd.read_parquet(PATH_WY_PLAYERS)
wy_competitions = pd.read_parquet(PATH_WY_COMPETITIONS)  # not used for TM competition codes, kept for your project

tm_leagues = pd.read_parquet(PATH_TM_LEAGUES)
wy_tm_players_mapping = pd.read_parquet(PATH_WY_TM_MAP)
tm_teams = pd.read_parquet(PATH_TM_TEAMS)

# Build lookup maps
player_name_map = build_tm_player_name_map(wy_players, wy_tm_players_mapping)
team_name_map = build_tm_team_name_map(tm_teams)
competition_map = build_tm_competition_map(tm_leagues)

# Choose league subset (example)
tm_leagues_subset = tm_leagues[tm_leagues["country"] == "Sweden"]

# Run (test small first)
df_transfers = collect_transfers_for_leagues(
    tm_leagues_subset,
    seasons=[2021],
    player_name_map=player_name_map,
    team_name_map=team_name_map,
    competition_map=competition_map,
    max_players=10
)

# Ensure required columns exist (even if empty)
required_cols = [
    "player_id",
    "team_id_from", "team_name_from",
    "team_id_to", "team_name_to",
    "competition_id_from", "competition_name_from", "competition_country_from",
    "competition_id_to", "competition_name_to", "competition_country_to",
    "age_at_transfer",
    "transfer_fee", "transfer_value",
    "date",
    "remaining_contract_period",
    "contract_until_date"
]
for c in required_cols:
    if c not in df_transfers.columns:
        df_transfers[c] = None


def _calendar_date_strings(values: pd.Series) -> pd.Series:
    """Return 'YYYY-MM-DD' strings (or <NA>) for a column of date-like values."""
    text = values.astype("string")

    # Keep the calendar date exactly as written when the value starts with an
    # ISO date. Converting to UTC first would move a midnight timestamp with a
    # positive offset (e.g. "...T00:00:00+02:00") to the previous day.
    iso_day = text.str.extract(r"^\s*(\d{4}-\d{2}-\d{2})", expand=False)
    is_real_date = pd.to_datetime(iso_day, format="%Y-%m-%d", errors="coerce").notna()
    iso_day = iso_day.where(is_real_date)

    # Values in any other format go through the general (UTC-based) parser.
    fallback = (
        pd.to_datetime(values, errors="coerce", utc=True)
        .dt.date
        .astype("string")
    )
    return iso_day.fillna(fallback)


# Clean date columns (remove time + timezone)
for col in ["date", "contract_until_date"]:
    df_transfers[col] = _calendar_date_strings(df_transfers[col])

# Fix mixed types for parquet (country columns often contain strings + ints)
for col in ["competition_country_from", "competition_country_to"]:
    df_transfers[col] = df_transfers[col].astype("string")
    df_transfers.loc[df_transfers[col].isin(["0", "147"]), col] = pd.NA  # optional cleanup

df_transfers.to_parquet(OUTPUT_TM_TRANSFERS, index=False)

# Report only after the file has actually been written.
print("\nSaved:", OUTPUT_TM_TRANSFERS)
print(df_transfers.head(20))
print(df_transfers.shape)




üîç Scraping transfer tables...
  /allsvenskan/startseite/wettbewerb/SE1 2021: 376 players (total collected: 10)

‚úÖ Unique players selected: 10

üì° Fetching player transfer histories...
[1/10] Player 114691 -> 13 transfers
[2/10] Player 135209 -> 12 transfers
[3/10] Player 204858 -> 13 transfers
[4/10] Player 208905 -> 9 transfers
[5/10] Player 319490 -> 15 transfers
[6/10] Player 428050 -> 4 transfers
[7/10] Player 485395 -> 14 transfers
[8/10] Player 555058 -> 4 transfers
[9/10] Player 651269 -> 15 transfers
[10/10] Player 675847 -> 6 transfers

Saved: C:\Users\falkj\Documents\Player_trading\data\tm_transfers.parquet
         player_name player_first_name player_last_name player_short_name  \
0   Rasmus Lindkvist            Rasmus        Lindkvist      R. Lindkvist   
1   Rasmus Lindkvist            Rasmus        Lindkvist      R. Lindkvist   
2   Rasmus Lindkvist            Rasmus        Lindkvist      R. Lindkvist   
3   Rasmus Lindkvist            Rasmus        Lindkvist     