In [11]:
import requests
from bs4 import BeautifulSoup
import re
import json
import time
import numpy as np


# Set up headers to mimic a real browser request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}

# Seconds to wait on a single request before giving up; without a timeout a
# stalled connection would block the cell indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# Listing pages are addressed through the ``page`` query parameter.
# (A hard-coded list of league-filtered URLs, "...&league=53", used to be
# assigned to ``urls`` here but was overwritten before use, so it is dropped.)
url_basic = "https://www.futbin.com/players?page="

# Pages 1 through 19 (the upper bound of ``range`` is exclusive)
urls = [url_basic + str(i) for i in range(1, 20)]

# Fetch the pages one by one, keeping only successful responses.
# Scraping stops at the first failure.
responses = []

for url in urls:
    # Sleep for a random float amount of seconds; the upper bound of the
    # delay is itself drawn at random between 1 and 3.
    time.sleep(np.random.uniform(0, np.random.uniform(1, 3)))
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        # Network-level failures (timeout, DNS, connection reset) are reported
        # the same way as HTTP failures instead of aborting with a traceback.
        print("Failed to fetch page with URL: " + url + ". Error: " + str(exc))
        break
    # Check if request was successful
    if response.status_code == 200:
        responses.append(response)
    else:
        print("Failed to fetch page with URL: " + url + ". Status code: " + str(response.status_code))
        break


In [18]:
def extract_info(player_container):
    """Extract the club, league, nation and playstyle+ identifiers of a player row.

    The values are read from the ``href`` of every link inside the row's
    ``table-player-sub-info`` div by matching the ``club=``, ``league=``,
    ``nation=`` and ``playstylesplus=`` query parameters. When several links
    carry the same parameter, the last one wins.

    Args:
        player_container: BeautifulSoup-style element for one player row
            (anything providing a compatible ``find`` method).

    Returns:
        A ``(club, league, nation, playstyle)`` tuple of strings. An entry is
        ``None`` when no link carries the matching parameter; all four are
        ``None`` when the row has no sub-info div at all.
    """
    patterns = {
        "club": re.compile(r'club=(\d+)'),
        "league": re.compile(r'league=(\d+)'),
        "nation": re.compile(r'nation=(\d+)'),
        "playstyle": re.compile(r'playstylesplus=([^&]+)'),
    }
    found = dict.fromkeys(patterns)  # every field starts out as None

    info_container = player_container.find("div", class_="table-player-sub-info")
    if info_container is None:
        # ``find`` returned nothing: report "unknown" rather than crashing.
        return found["club"], found["league"], found["nation"], found["playstyle"]

    for anchor in info_container.find_all("a"):
        href = anchor.get("href")
        if not href:
            # Anchors without a target cannot carry any of the parameters.
            continue
        for field, pattern in patterns.items():
            match = pattern.search(href)
            if match:
                found[field] = match.group(1)

    return found["club"], found["league"], found["nation"], found["playstyle"]

def extract_global_score(player_container):
    """Return the text of the row's ``player-rating-card-text`` div, stripped.

    The value is returned as text; no numeric conversion is performed.
    """
    rating_div = player_container.find("div", class_="player-rating-card-text")
    return rating_div.text.strip()

def extract_stats(player_container):
    """Read the six key stats of a player row as integers.

    For each stat the ``table-<stat>`` cell is located and the text of its
    ``table-key-stats`` div is converted with ``int``.

    Returns:
        A dict mapping "pace", "shooting", "passing", "dribbling",
        "defending" and "physicality" (in that order) to their int values.
    """
    def read_stat(stat_name):
        # Locate the stat's cell first, then the div holding the number.
        cell = player_container.find("td", class_="table-" + stat_name)
        value_div = cell.find("div", class_="table-key-stats")
        return int(value_div.text.strip())

    return {
        stat_name: read_stat(stat_name)
        for stat_name in ("pace", "shooting", "passing", "dribbling", "defending", "physicality")
    }

def extract_price(player_container):
    """Return the price shown in a player row as an integer number of coins.

    The text of the row's ``price`` div is lower-cased and interpreted as a
    number with an optional magnitude suffix: ``m`` multiplies by 1,000,000
    and ``k`` by 1,000. Commas are treated as thousands separators and
    removed before parsing (e.g. "1,500" -> 1500).

    Args:
        player_container: BeautifulSoup-style element for one player row.

    Returns:
        The price rounded to the nearest integer.

    Raises:
        ValueError: if the remaining text is not a valid number.
    """
    price_text = player_container.find("div", class_="price").text.strip().lower()
    # Thousands separators would otherwise make float() fail.
    price_text = price_text.replace(",", "")

    # Millions are checked before thousands, as in the original if/elif chain.
    for suffix, multiplier in (("m", 1000000), ("k", 1000)):
        if suffix in price_text:
            return round(float(price_text.replace(suffix, "")) * multiplier)
    return round(float(price_text))

def extract_position(player_container):
    """Return the row's main position with surrounding whitespace and '+' signs removed.

    The text is taken from the ``table-pos-main`` div.
    """
    raw_position = player_container.find("div", class_="table-pos-main").text
    return raw_position.strip().replace('+', '')

def extract_player_type(player_container):
    """Return the stripped text of the row's ``table-player-revision`` div."""
    revision_div = player_container.find("div", class_="table-player-revision")
    return revision_div.text.strip()

# Parse every fetched page into one record per player row.
players_dict = {}
for r in responses:
    soup = BeautifulSoup(r.content, "html.parser")
    players = soup.find_all("tr", class_="player-row")

    for p in players:
        name_container = p.find("div", class_="table-player-info")
        name = name_container.find("a").text.strip()

        club, league, nation, playstyle = extract_info(p)
        global_score = extract_global_score(p)
        stats = extract_stats(p)
        price = extract_price(p)
        position = extract_position(p)
        player_type = extract_player_type(p)

        # Lower-cased once; used for both special-card flags below.
        player_type_lower = player_type.lower()

        player_dict = {
            "name": name,
            "club": club,
            "league": league,
            "nation": nation,
            "playstyle": playstyle,
            "global_score": global_score,
            "stats": stats,
            "price": price,
            "position": position,
            "player_type": player_type,
            "is_icon": "icon" in player_type_lower,
            "is_hero": "hero" in player_type_lower,
        }

        # Several rows can share a player name. Keying on the bare name alone
        # would silently overwrite the earlier row, so later duplicates get a
        # numeric suffix ("Name (2)", "Name (3)", ...). The record's "name"
        # field always keeps the unmodified name.
        key = name
        duplicate_count = 1
        while key in players_dict:
            duplicate_count += 1
            key = name + " (" + str(duplicate_count) + ")"

        players_dict[key] = player_dict

In [19]:
# Persist the scraped players dictionary to disk as JSON
with open('players.json', 'w') as json_file:
    json.dump(players_dict, fp=json_file)

Read the players back from the saved JSON file and convert them to CSV

In [20]:
def get_cols_dict(dict_var):
    """Collect the leaf keys of a possibly nested dictionary.

    Dict-valued entries are descended into recursively and contribute only
    the keys found inside them; their own key is not included and nested
    keys carry no parent prefix.

    Returns:
        A list of keys in iteration order.
    """
    leaf_keys = []
    for name, value in dict_var.items():
        if isinstance(value, dict):
            leaf_keys.extend(get_cols_dict(value))
        else:
            leaf_keys.append(name)
    return leaf_keys

In [21]:
def flatten_dict(dict_var):
    """Flatten a nested dictionary into a single level, recursively.

    Every dict-valued entry is replaced by its own flattened entries, with
    keys joined to the parent key by an underscore, at any nesting depth
    (``{"a": {"b": {"c": 1}}}`` becomes ``{"a_b_c": 1}``). Non-dict values
    are copied unchanged. An empty nested dict contributes no entries.

    Args:
        dict_var: dictionary with string keys (keys are joined with ``+``).

    Returns:
        A new flat dictionary; ``dict_var`` itself is not modified.
    """
    flat_dict = {}
    for key, value in dict_var.items():
        if isinstance(value, dict):
            # Flatten the child first so deeper levels are already prefixed
            # with their own parents before this level's key is prepended.
            for sub_key, sub_value in flatten_dict(value).items():
                flat_dict[key + "_" + sub_key] = sub_value
        else:
            flat_dict[key] = value
    return flat_dict

In [22]:
import json
import pandas as pd

# Load the scraped players back from disk
with open("players.json", "r") as json_file:
    players = json.load(json_file)

# One flat record per player; the dictionary's keys are not needed here
players_list = [flatten_dict(player) for player in players.values()]

# Build the table in one go and write it out without the index column
df = pd.DataFrame(players_list)
df.to_csv("players.csv", index=False)