In [31]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

# Load the three parquet tables used in this notebook cell.
df = pd.read_parquet(r"C:\Users\falkj\Documents\Player_trading\data\male_transfers_data.parquet")
players = pd.read_parquet(r"C:\Users\falkj\Documents\Player_trading\data\players_wyscout.parquet")
competitions = pd.read_parquet(r"C:\Users\falkj\Documents\Player_trading\data\competitions_wyscout.parquet")

# Report the shape and column names of each table, in the order
# players -> competitions -> transfers.
for _table in (players, competitions, df):
    print("Rows, Columns:", _table.shape)
    print("Columns:", list(_table.columns))


# ------------------- Your existing code -------------------
# Request headers sent with every search request: a desktop-Chrome
# User-Agent string plus an English Accept-Language preference.
HEADERS = {
    "User-Agent": " ".join(
        [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/120.0.0.0 Safari/537.36",
        ]
    ),
    "Accept-Language": "en-US,en;q=0.9",
}


# A decimal number followed by a magnitude suffix, e.g. "1.50m", "500k",
# "1.2bn" or "500th.". Anchored on both ends (via fullmatch) so free text
# that merely contains the letter "m" or "k" is rejected instead of being
# handed to float().
_MARKET_VALUE_RE = re.compile(r"([+-]?(?:\d+\.?\d*|\.\d+))\s*(bn|m|k|th\.?)")

# Euro multiplier for each recognised suffix ("th" is keyed without its
# optional trailing dot).
_MARKET_VALUE_MULTIPLIERS = {
    "bn": 1_000_000_000,
    "m": 1_000_000,
    "k": 1_000,
    "th": 1_000,
}


def parse_market_value(value_text):
    """Convert a market-value label such as "€1.50m" into whole euros.

    The euro sign is removed, the text is lower-cased and stripped, and the
    remainder must be a number followed by one of the suffixes ``bn``,
    ``m``, ``k`` or ``th``/``th.``.

    Args:
        value_text: Raw label text, or ``None``.

    Returns:
        The value in euros as an ``int``, or ``None`` when the text is
        empty, is the "-" placeholder, or does not have the expected
        number-plus-suffix form. Malformed text never raises.
    """
    if not value_text:
        return None

    cleaned = value_text.replace("€", "").lower().strip()

    match = _MARKET_VALUE_RE.fullmatch(cleaned)
    if match is None:
        # Covers "-", "" and any text without a recognised suffix.
        return None

    number, suffix = match.groups()
    multiplier = _MARKET_VALUE_MULTIPLIERS[suffix.rstrip(".")]

    # round() rather than int(): the float product can land a hair below the
    # intended integer, and truncation would then lose one euro.
    return round(float(number) * multiplier)


def scrape_market_value_from_search(player_name, *, timeout=15):
    """Look up a player via Transfermarkt quick search and read the first hit.

    Only the first result row is inspected; no check is made that the name
    found actually matches ``player_name``, so callers should compare
    ``player_name_found`` themselves if accuracy matters.

    Args:
        player_name: Free-text name to search for.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        ``None`` when the page has no ``table.items`` element or that table
        has no ``odd``/``even`` rows. Otherwise a dict with the keys
        ``player_name_found``, ``market_value_raw`` and
        ``market_value_eur``; each value may itself be ``None`` when the
        corresponding cell is missing or cannot be parsed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For connection problems or a timeout.
    """
    # Pass the query through ``params`` so requests URL-encodes it; names
    # containing "&", "#", "+" or similar would otherwise corrupt the URL.
    response = requests.get(
        "https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche",
        params={"query": player_name},
        headers=HEADERS,
        # Without a timeout a stalled connection would block the scrape forever.
        timeout=timeout,
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")

    table = soup.find("table", class_="items")
    if table is None:
        return None

    # Result rows carry either the "odd" or the "even" class.
    rows = table.find_all("tr", class_=["odd", "even"])
    if not rows:
        return None

    first_row = rows[0]

    # Player name: first cell in the row carrying the "hauptlink" class.
    name_cell = first_row.find("td", class_="hauptlink")
    player_name_found = name_cell.get_text(strip=True) if name_cell else None

    # Market value: looked up by the full class string "rechts hauptlink".
    mv_cell = first_row.find("td", class_="rechts hauptlink")
    market_value_raw = mv_cell.get_text(strip=True) if mv_cell else None
    market_value_eur = parse_market_value(market_value_raw)

    return {
        "player_name_found": player_name_found,
        "market_value_raw": market_value_raw,
        "market_value_eur": market_value_eur,
    }


# Number of players to scrape. With 0, head() yields no rows and the loop
# below does not execute.
NUM_PLAYERS = 0

# ------------------- Scrape loop -------------------
# One dict per processed player, always with the same five keys.
results = []

# Take the subset once so the progress total reflects the rows that actually
# exist, even when NUM_PLAYERS is larger than the table.
players_to_scrape = players.head(NUM_PLAYERS)
total_players = len(players_to_scrape)

# enumerate() gives a 1-based position for the progress message; the
# DataFrame index label is not used because it need not be 0..n-1 (or even
# an integer).
for position, (_, row) in enumerate(players_to_scrape.iterrows(), start=1):
    first_name = row["first_name"]
    last_name = row["last_name"]

    # Start from an "empty" record; it is only filled in when the search
    # succeeds and returns data.
    record = {
        "first_name": first_name,
        "last_name": last_name,
        "player_name_found": None,
        "market_value_raw": None,
        "market_value_eur": None,
    }

    # Build the query from the name parts that are actually present, so a
    # missing value is not searched for as the literal text "None" or "nan".
    name_parts = [
        str(part).strip()
        for part in (first_name, last_name)
        if pd.notna(part) and str(part).strip()
    ]
    full_name = " ".join(name_parts)

    if not full_name:
        print(f"Skipping {position}/{total_players}: no name available")
        results.append(record)
        continue

    print(f"Scraping {position}/{total_players}: {full_name} ...")

    try:
        mv_info = scrape_market_value_from_search(full_name)
    except Exception as e:
        # Best effort: one failed lookup must not abort the whole run, so
        # the error is reported and the empty record is kept.
        print(f"Error scraping {full_name}: {e}")
    else:
        if mv_info:
            record.update(mv_info)

    results.append(record)

    # Be polite to Transfermarkt servers
    time.sleep(1.5)  # 1.5 seconds delay

# ------------------- Create new dataframe -------------------
# Build the output table from the scrape results. The explicit column list
# keeps the layout stable even when `results` is empty.
# NOTE(review): this cell never assigned `players_wyscout_with_mv` before
# using it, so it is built here from `results`; confirm whether a merge back
# onto `players` was intended instead.
players_wyscout_with_mv = pd.DataFrame(
    results,
    columns=[
        "first_name",
        "last_name",
        "player_name_found",
        "market_value_raw",
        "market_value_eur",
    ],
)

output_path = r"C:\Users\falkj\Documents\Player_trading\data\players_wyscout_with_market_value.csv"
if results:
    players_wyscout_with_mv.to_csv(output_path, index=False)
else:
    # Nothing was scraped; do not overwrite an existing export with an
    # empty file.
    print(f"No scrape results; not writing {output_path}")

print(players_wyscout_with_mv)



Rows, Columns: (50000, 14)
Columns: ['player_id', 'short_name', 'first_name', 'last_name', 'name', 'birth_date', 'height', 'weight', 'passport', 'birth_country', 'image_url', 'gender', 'foot', 'role']
Rows, Columns: (1793, 18)
Columns: ['competition_id', 'competition', 'name', 'country', 'division', 'gender', 'type', 'category', 'season_id', 'start_date', 'end_date', 'completed', 'season', 'youth', 'domestic_cup', 'domestic_league', 'international_cup', 'season_name']
Rows, Columns: (77782, 366)
Columns: ['player_id', 'gender', 'from_competition', 'to_competition', 'from_team_id', 'to_team_id', 'from_season', 'to_season', 'last_played_date', 'first_played_date', 'team_id', 'competition', 'season', 'position', 'Minutes', 'from_Hold-up play', 'from_Involvement', 'from_Providing teammates', 'from_Aerial threat', 'from_Poaching', 'from_Run quality', 'from_Pressing', 'from_Finishing', 'from_Box threat', 'from_Dribbling', 'from_Active defence', 'from_Progression', 'from_Intelligent defence',