In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re

# =========================
# Paths
# =========================
# Local parquet inputs exported from Wyscout (WY). All of them are read in the
# MAIN section; PATH_WY_COMPETITIONS is loaded there but not used afterwards
# in this cell.
PATH_WY_TRANSFERS = r"C:\Users\falkj\Documents\Player_trading\data\male_transfers_data.parquet"
PATH_WY_PLAYERS = r"C:\Users\falkj\Documents\Player_trading\data\players_wyscout.parquet"
PATH_WY_COMPETITIONS = r"C:\Users\falkj\Documents\Player_trading\data\competitions_wyscout.parquet"

# Transfermarkt (TM) lookup tables: league links, WY<->TM player id mapping,
# and TM team names.
PATH_TM_LEAGUES = r"C:\Users\falkj\Documents\Player_trading\data\tm_league_links.parquet"
PATH_WY_TM_MAP = r"C:\Users\falkj\Documents\Player_trading\data\wy_tm_players_mapping.parquet"
PATH_TM_TEAMS = r"C:\Users\falkj\Documents\Player_trading\data\tm_teams.parquet"

# Destination parquet for the flattened transfer rows produced by this cell.
OUTPUT_TM_TRANSFERS = r"C:\Users\falkj\Documents\Player_trading\data\tm_transfers.parquet"

# =========================
# HTTP
# =========================
# Headers sent with every requests.get call in this file (API and HTML pages).
BASE_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0 Safari/537.36"
    )
}

# Base URL of the JSON API queried by fetch_player_transfer_history.
API_BASE = "https://tmapi-alpha.transfermarkt.technology"
# Seconds to sleep after each uncached transfer-history API request.
REQUEST_SLEEP = 2.5  # be conservative

# TM pseudo-clubs (important!)
# These ids are answered from this table instead of being scraped, and are
# injected into the team-name map so they always resolve.
SPECIAL_CLUB_NAMES = {
    515: "Without Club",
    123: "Retired",
}

# =========================
# Utility helpers
# =========================
def _as_int(x):
    if x is None:
        return None
    try:
        if isinstance(x, str) and x.strip() == "":
            return None
        return int(float(x))  # handles "11.0"
    except Exception:
        return None


def _compact_to_number(compact_dict):
    if not isinstance(compact_dict, dict):
        return None

    content = (compact_dict.get("content") or "").strip()
    suffix = (compact_dict.get("suffix") or "").strip().lower()

    if not content or content in {"-", "?", "unknown"}:
        return None

    lowered = content.lower()
    if "free" in lowered or "loan" in lowered or "end of loan" in lowered:
        return 0

    content = content.replace(",", "")
    try:
        base = float(content)
    except ValueError:
        return None

    mult = 1
    if suffix == "k":
        mult = 1_000
    elif suffix == "m":
        mult = 1_000_000
    elif suffix in {"bn", "b"}:
        mult = 1_000_000_000

    return int(base * mult)


def _money_value(obj):
    if not isinstance(obj, dict):
        return None

    v = obj.get("value", None)
    if isinstance(v, (int, float)):
        return int(v)

    if v is None:
        return _compact_to_number(obj.get("compact"))

    try:
        return int(float(str(v)))
    except Exception:
        return _compact_to_number(obj.get("compact"))


def _looks_like_transfer(d):
    """Strict filter so we don't accidentally treat feeSum/totalMarketValue as transfers."""
    return (
        isinstance(d, dict)
        and "id" in d
        and isinstance(d.get("transferSource"), dict)
        and isinstance(d.get("transferDestination"), dict)
        and isinstance(d.get("details"), dict)
    )


def _extract_history_list(data):
    """
    STRICT normalization:
    history is a dict with keys: terminated(list), pending(list), plus other keys like feeSum(dict), url(str), etc.
    We ONLY keep dicts that look like actual transfers.
    """
    history = data.get("history")
    if history is None:
        return []

    out = []

    # History can occasionally be list, keep strict dicts
    if isinstance(history, list):
        for t in history:
            if _looks_like_transfer(t):
                out.append(t)
        return out

    # Typical: dict with terminated/pending lists + misc fields
    if isinstance(history, dict):
        for v in history.values():
            if isinstance(v, list):
                for t in v:
                    if _looks_like_transfer(t):
                        out.append(t)
            elif _looks_like_transfer(v):
                out.append(v)
        return out

    return []


# =========================
# Fallback scraping caches
# =========================
# Module-level memo tables for the HTML fallback scrapers below, keyed by the
# integer TM id. A failed lookup is stored as None, so the same id is not
# requested again during this run.
CLUB_NAME_CACHE = {}
COUNTRY_NAME_CACHE = {}


def fetch_club_name_from_tm(club_id):
    """Resolve a TM club id to a display name, scraping the club page if needed.

    Lookup order: pseudo clubs in SPECIAL_CLUB_NAMES, then CLUB_NAME_CACHE,
    then the first ``<h1>`` of the club start page (preferring one carrying
    ``itemprop='name'``). The outcome, including ``None`` for a failed
    request, is stored in CLUB_NAME_CACHE.

    Returns:
        The whitespace-normalized club name, or ``None`` when the id is
        invalid, the request fails, or no usable heading is found.
    """
    cid = _as_int(club_id)
    if cid is None:
        return None

    # Pseudo clubs take precedence over anything cached or scraped.
    for table in (SPECIAL_CLUB_NAMES, CLUB_NAME_CACHE):
        if cid in table:
            return table[cid]

    scraped = None
    try:
        page = requests.get(
            f"https://www.transfermarkt.com/-/startseite/verein/{cid}",
            headers=BASE_HEADERS,
            timeout=30,
        )
        page.raise_for_status()
        soup = BeautifulSoup(page.text, "html.parser")

        heading = soup.select_one("h1[itemprop='name']") or soup.select_one("h1")
        scraped = heading.get_text(" ", strip=True) if heading else None

        if scraped:
            if scraped.lower().startswith("transfermarkt"):
                # Generic site heading rather than a club name.
                scraped = None
            else:
                scraped = re.sub(r"\s+", " ", scraped).strip()
    except Exception:
        scraped = None

    CLUB_NAME_CACHE[cid] = scraped
    return scraped


def fetch_country_name_from_tm(country_id):
    """Resolve a TM country id to a name by scraping the country start page.

    Ids that are missing, unparseable or ``0`` return ``None`` without any
    request. Results, including ``None`` for a failed request, are memoized
    in COUNTRY_NAME_CACHE.

    Returns:
        The whitespace-normalized text of the page's first ``<h1>``, or
        ``None`` when it cannot be determined.
    """
    cid = _as_int(country_id)
    if cid is None or cid == 0:
        return None

    try:
        return COUNTRY_NAME_CACHE[cid]
    except KeyError:
        pass  # not looked up yet

    label = None
    try:
        page = requests.get(
            f"https://www.transfermarkt.com/-/startseite/land_id/{cid}",
            headers=BASE_HEADERS,
            timeout=30,
        )
        page.raise_for_status()
        heading = BeautifulSoup(page.text, "html.parser").select_one("h1")

        label = heading.get_text(" ", strip=True) if heading else None
        if label:
            label = re.sub(r"\s+", " ", label).strip()
    except Exception:
        label = None

    COUNTRY_NAME_CACHE[cid] = label
    return label


# =========================
# Build lookup maps
# =========================
def build_tm_player_name_map(wy_players: pd.DataFrame, wy_tm_map: pd.DataFrame):
    """Build ``{tm_player_id: name fields}`` from the WY players table.

    Args:
        wy_players: needs columns ``player_id``, ``short_name``,
            ``first_name``, ``last_name`` and ``name``.
        wy_tm_map: needs columns ``wy_id`` and ``tm_id``.

    Returns:
        Dict keyed by integer TM id. Each value is a dict with keys
        ``player_short_name``, ``player_first_name``, ``player_last_name``
        and ``player_name``; all four are ``None`` when the WY id has no
        row in ``wy_players``. Mapping rows without a usable ``tm_id`` are
        skipped. On duplicate ids the last row wins.

    Ids are normalized value by value in plain Python. Doing it through
    ``Series.apply`` lets pandas turn ``None`` into NaN, which defeats
    ``is None`` checks and leaks NaN/float keys into the result.
    """
    names_by_wy_id = {}
    player_columns = zip(
        wy_players["player_id"],
        wy_players["short_name"],
        wy_players["first_name"],
        wy_players["last_name"],
        wy_players["name"],
    )
    for raw_id, short_name, first_name, last_name, full_name in player_columns:
        wy_id = _as_int(raw_id)
        if wy_id is None:
            continue
        names_by_wy_id[wy_id] = {
            "player_short_name": short_name,
            "player_first_name": first_name,
            "player_last_name": last_name,
            "player_name": full_name,
        }

    unmatched = {
        "player_short_name": None,
        "player_first_name": None,
        "player_last_name": None,
        "player_name": None,
    }

    name_map = {}
    for raw_wy_id, raw_tm_id in zip(wy_tm_map["wy_id"], wy_tm_map["tm_id"]):
        tm_id = _as_int(raw_tm_id)
        if tm_id is None:
            continue
        # A missing WY id normalizes to None, which is never a key here, so
        # it falls back to the all-None record. Copy so entries never share
        # one dict.
        name_map[tm_id] = dict(names_by_wy_id.get(_as_int(raw_wy_id), unmatched))
    return name_map


def build_tm_team_name_map(tm_teams: pd.DataFrame):
    """Build ``{tm_team_id: team name}`` from the TM teams table.

    The id is read from ``tm_id`` (preferred) or ``team_id``. The name is
    read from the first of ``team_name``, ``name``, ``club_name`` that
    exists. Rows without a usable id are skipped. The pseudo clubs in
    SPECIAL_CLUB_NAMES are added last and override table entries with the
    same id.

    Raises:
        ValueError: if no id column or no name column is present.

    Ids are normalized value by value in plain Python. Doing it through
    ``Series.apply`` lets pandas turn ``None`` into NaN, which defeats
    ``is None`` checks and leaks NaN/float keys into the result.
    """
    columns = tm_teams.columns

    if "tm_id" in columns:
        id_col = "tm_id"
    elif "team_id" in columns:
        id_col = "team_id"
    else:
        raise ValueError("tm_teams must have 'tm_id' or 'team_id'")

    name_col = next(
        (c for c in ("team_name", "name", "club_name") if c in columns),
        None,
    )
    if name_col is None:
        raise ValueError("tm_teams must have a team name column")

    out = {}
    for raw_id, team_name in zip(tm_teams[id_col], tm_teams[name_col]):
        team_id = _as_int(raw_id)
        if team_id is None:
            continue
        out[team_id] = team_name

    # inject pseudo clubs too
    out.update(SPECIAL_CLUB_NAMES)
    return out


def build_tm_competition_map(tm_leagues: pd.DataFrame):
    """Index league metadata by the competition id embedded in ``tm_link``.

    The id is the path segment following ``/wettbewerb/`` in each link.
    Rows whose link is not a string or carries no such segment are skipped.
    Later rows overwrite earlier ones with the same id.

    Returns:
        ``{competition_id: {"competition_name": ..., "competition_country": ...}}``
        with values taken from the ``league_name`` and ``country`` columns.
    """
    id_pattern = re.compile(r"/wettbewerb/([^/?]+)")

    def comp_id_from(link):
        found = id_pattern.search(link) if isinstance(link, str) else None
        return found.group(1) if found else None

    comp_ids = [comp_id_from(link) for link in tm_leagues["tm_link"]]

    comp_map = {}
    for comp_id, (_, row) in zip(comp_ids, tm_leagues.iterrows()):
        if comp_id:
            comp_map[comp_id] = {
                "competition_name": row.get("league_name"),
                "competition_country": row.get("country"),
            }
    return comp_map


def build_wy_to_tm_map(wy_tm_map: pd.DataFrame):
    df = wy_tm_map.copy()
    df["wy_id"] = df["wy_id"].apply(_as_int)
    df["tm_id"] = df["tm_id"].apply(_as_int)
    return dict(zip(df["wy_id"], df["tm_id"]))


# =========================
# API call (once)
# =========================
def fetch_player_transfer_history(tm_player_id):
    """Fetch the raw transfer-history JSON for one TM player.

    Issues a single GET against ``{API_BASE}/transfer/history/player/<id>``
    with BASE_HEADERS and a 30 second timeout.

    Returns:
        The decoded JSON payload, or ``None`` (after printing a notice) when
        the request fails, the status is an HTTP error, or the body is not
        valid JSON.
    """
    url = f"{API_BASE}/transfer/history/player/{tm_player_id}"
    try:
        resp = requests.get(url, headers=BASE_HEADERS, timeout=30)
        resp.raise_for_status()
        return resp.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not JSON (e.g. an HTML error
        # page): depending on the requests version the decode error is not a
        # RequestException and would otherwise abort the whole run.
        print(f"‚ö†Ô∏è Skipping TM player {tm_player_id}: {e}")
        return None


# =========================
# Parse transfers
# =========================
def parse_transfer_history(
    wy_player_id,
    tm_player_id,
    json_data,
    player_name_map,
    team_name_map,
    competition_map
):
    """Flatten one player's transfer-history payload into a list of row dicts.

    Args:
        wy_player_id: Wyscout player id, stored on every row as ``wy_player_id``.
        tm_player_id: TM player id, stored as ``player_id`` and used to look
            up the name fields in ``player_name_map``.
        json_data: decoded API response; transfers are read from
            ``json_data["data"]``.
        player_name_map: ``{tm_id: name-field dict}``.
        team_name_map: ``{club_id: name}``. It is updated in place whenever
            a missing name is recovered through the scraping fallback.
        competition_map: ``{competition_id: {"competition_name", "competition_country"}}``.

    Returns:
        One dict per transfer, or an empty list when ``json_data`` or its
        ``data`` entry is not a dict.

    Missing countries and club names are looked up through the HTML
    fallbacks, which may issue network requests.
    """
    if not isinstance(json_data, dict):
        return []

    payload = json_data.get("data")
    if not isinstance(payload, dict):
        return []

    def comp_id_of(side):
        # Blank or missing competition ids collapse to None.
        return (side.get("competitionId") or "").strip() or None

    def comp_details(comp_id):
        # (name, country) from the competition map, or (None, None).
        if comp_id and comp_id in competition_map:
            return (
                competition_map[comp_id]["competition_name"],
                competition_map[comp_id]["competition_country"],
            )
        return None, None

    def backfill_club_name(club_id, known_name):
        # Scrape only when the map had nothing and there is an id to look up.
        if known_name is not None or club_id is None:
            return known_name
        scraped = fetch_club_name_from_tm(club_id)
        if scraped:
            team_name_map[club_id] = scraped
        return scraped

    history = _extract_history_list(payload)

    # names from mapping keyed by TM id
    names = player_name_map.get(_as_int(tm_player_id), {})
    player_name = names.get("player_name")
    player_first = names.get("player_first_name")
    player_last = names.get("player_last_name")
    player_short = names.get("player_short_name")

    rows = []
    for transfer in history:
        source = transfer.get("transferSource") or {}
        destination = transfer.get("transferDestination") or {}
        details = transfer.get("details") or {}

        from_club_id = _as_int(source.get("clubId"))
        to_club_id = _as_int(destination.get("clubId"))

        from_comp_id = comp_id_of(source)
        to_comp_id = comp_id_of(destination)

        # competition map (best)
        from_comp_name, from_comp_country = comp_details(from_comp_id)
        to_comp_name, to_comp_country = comp_details(to_comp_id)

        # fallback: countryId -> country name
        if from_comp_country is None:
            from_comp_country = fetch_country_name_from_tm(source.get("countryId"))
        if to_comp_country is None:
            to_comp_country = fetch_country_name_from_tm(destination.get("countryId"))

        # team names: parquet -> fallback scrape -> pseudo club mapping
        # Both map lookups happen before either fallback runs.
        known_from = team_name_map.get(from_club_id)
        known_to = team_name_map.get(to_club_id)
        team_name_from = backfill_club_name(from_club_id, known_from)
        team_name_to = backfill_club_name(to_club_id, known_to)

        fee_value = _money_value(details.get("fee"))
        mv_value = _money_value(details.get("marketValue"))

        contract_period = details.get("remainingContractPeriod")
        if isinstance(contract_period, dict):
            remaining_days = contract_period.get("days")
        else:
            remaining_days = None

        row = {
            "wy_player_id": _as_int(wy_player_id),

            "player_name": player_name,
            "player_first_name": player_first,
            "player_last_name": player_last,
            "player_short_name": player_short,

            "player_id": _as_int(tm_player_id),

            "team_id_from": from_club_id,
            "team_name_from": team_name_from,

            "team_id_to": to_club_id,
            "team_name_to": team_name_to,

            "competition_id_from": from_comp_id,
            "competition_name_from": from_comp_name,
            "competition_country_from": from_comp_country,

            "competition_id_to": to_comp_id,
            "competition_name_to": to_comp_name,
            "competition_country_to": to_comp_country,

            "age_at_transfer": details.get("age"),
            "transfer_fee": fee_value,
            "transfer_value": mv_value,

            "date": details.get("date"),
            "remaining_contract_period": remaining_days,
            "contract_until_date": details.get("contractUntilDate"),
        }
        rows.append(row)

    return rows


# =========================
# Debug mapping
# =========================
def debug_player_mapping(wy_player_id, wy_to_tm_map, player_name_map):
    """Print the WY -> TM id mapping and the name fields found for one player.

    A warning line is printed when the WY id has no TM counterpart.
    """
    mapped_tm_id = wy_to_tm_map.get(_as_int(wy_player_id))
    name_fields = player_name_map.get(_as_int(mapped_tm_id))

    report = (
        ("\nDEBUG mapping",),
        ("WY player_id:", wy_player_id),
        ("TM player_id:", mapped_tm_id),
        ("Names:", name_fields),
    )
    for parts in report:
        print(*parts)

    if mapped_tm_id is None:
        print("‚ùå No TM mapping found!")


# =========================
# MAIN
# =========================
# Load the inputs, scrape the TM transfer history for every player that
# appears in the selected slice of WY transfers, and write the flattened rows
# to OUTPUT_TM_TRANSFERS.
wy_transfers = pd.read_parquet(PATH_WY_TRANSFERS)
wy_players = pd.read_parquet(PATH_WY_PLAYERS)
wy_competitions = pd.read_parquet(PATH_WY_COMPETITIONS)  # loaded but not used below

tm_leagues = pd.read_parquet(PATH_TM_LEAGUES)
wy_tm_players_mapping = pd.read_parquet(PATH_WY_TM_MAP)
tm_teams = pd.read_parquet(PATH_TM_TEAMS)

wy_to_tm_map = build_wy_to_tm_map(wy_tm_players_mapping)
player_name_map = build_tm_player_name_map(wy_players, wy_tm_players_mapping)
team_name_map = build_tm_team_name_map(tm_teams)
competition_map = build_tm_competition_map(tm_leagues)

# -------------------------
# Slice for group splitting
# -------------------------
START_IDX = 0
END_IDX = 10  # <-- change per machine

wy_transfers_slice = wy_transfers.iloc[START_IDX:END_IDX].copy()

# Normalize ids in plain Python (pandas would turn a None result into NaN)
# and de-duplicate while keeping first-seen order.
wy_player_ids_to_scrape = list(dict.fromkeys(
    pid
    for pid in map(_as_int, wy_transfers_slice["player_id"].dropna())
    if pid is not None
))

print(f"\nScraping transfer rows {START_IDX} ‚Üí {END_IDX}")
print(f"Unique WY players to scrape: {len(wy_player_ids_to_scrape)}")

player_cache = {}
all_rows = []

print("\nüì° Fetching player transfer histories...")

total = len(wy_player_ids_to_scrape)

for i, wy_player_id in enumerate(wy_player_ids_to_scrape, 1):
    # _as_int maps a NaN coming out of the mapping to None and a float id
    # (e.g. 90824.0) to a clean int before it is formatted into the API URL.
    tm_player_id = _as_int(wy_to_tm_map.get(wy_player_id))

    print(f"‚û°Ô∏è [{i}/{total}] Processing WY {wy_player_id} -> TM {tm_player_id}")

    if tm_player_id is None:
        print(f"   ‚ùå No TM mapping for WY player {wy_player_id}, skipping")
        continue

    # fetch once per TM player
    if tm_player_id not in player_cache:
        json_data = fetch_player_transfer_history(tm_player_id)
        player_cache[tm_player_id] = json_data
        time.sleep(REQUEST_SLEEP)
    else:
        json_data = player_cache[tm_player_id]

    if json_data is None:
        print(f"   ‚ùå No history found for TM {tm_player_id}, skipping")
        continue

    # The payload may be a non-dict, or carry "data": null; count zero
    # transfers in that case instead of crashing.
    payload = json_data.get("data") if isinstance(json_data, dict) else None
    api_count = len(_extract_history_list(payload)) if isinstance(payload, dict) else 0

    rows = parse_transfer_history(
        wy_player_id=wy_player_id,
        tm_player_id=tm_player_id,
        json_data=json_data,
        player_name_map=player_name_map,
        team_name_map=team_name_map,
        competition_map=competition_map
    )

    parsed_count = len(rows)

    if parsed_count != api_count:
        print(f"   ‚ö†Ô∏è COUNT MISMATCH: API={api_count}, parsed={parsed_count}")
    else:
        print(f"   ‚úÖ {parsed_count} transfers added")

    all_rows.extend(rows)

df_transfers = pd.DataFrame(all_rows)

# Clean date columns
# (an empty result has no columns at all, so check before touching them)
for col in ["date", "contract_until_date"]:
    if col in df_transfers.columns:
        df_transfers[col] = (
            pd.to_datetime(df_transfers[col], errors="coerce", utc=True)
            .dt.date
            .astype("string")
        )

# Make country columns safe for parquet and consistent (strings only)
for col in ["competition_country_from", "competition_country_to"]:
    if col in df_transfers.columns:
        df_transfers[col] = df_transfers[col].astype("string")

# Write first, so "Saved:" is only reported once the file actually exists.
df_transfers.to_parquet(OUTPUT_TM_TRANSFERS, index=False)

print("\nSaved:", OUTPUT_TM_TRANSFERS)
print(df_transfers.head(20))
print(df_transfers.shape)



Scraping transfer rows 0 ‚Üí 10
Unique WY players to scrape: 8

üì° Fetching player transfer histories...
‚û°Ô∏è [1/8] Processing WY 39558 -> TM 90824
   ‚úÖ 15 transfers added
‚û°Ô∏è [2/8] Processing WY 40619 -> TM 230748
   ‚úÖ 22 transfers added
‚û°Ô∏è [3/8] Processing WY 64543 -> TM 402041
   ‚úÖ 16 transfers added
‚û°Ô∏è [4/8] Processing WY 73017 -> TM 66205
   ‚úÖ 12 transfers added
‚û°Ô∏è [5/8] Processing WY 73161 -> TM 66618
   ‚úÖ 9 transfers added
‚û°Ô∏è [6/8] Processing WY 73300 -> TM 133160
   ‚úÖ 17 transfers added
‚û°Ô∏è [7/8] Processing WY 73305 -> TM 66224
   ‚úÖ 16 transfers added
‚û°Ô∏è [8/8] Processing WY 82868 -> TM 47074
   ‚úÖ 27 transfers added

Saved: C:\Users\falkj\Documents\Player_trading\data\tm_transfers.parquet
    wy_player_id           player_name player_first_name player_last_name  \
0          39558        Donjet Shkodra            Donjet          Shkodra   
1          39558        Donjet Shkodra            Donjet          Shkodra   
2          39558 