In [1]:
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
import time
import random

### Scrapowanie pojedynczego zawodnika z linku do zawodnika w SOFIFA

In [2]:
def scrape_player(url, season):
    data = {}
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')

        data["season"] = season

        data["name"] = soup.select_one(".profile.clearfix h1").text
        data["position"] = soup.select_one(".profile.clearfix span.pos").text

        birth_day_pattern = r"\((\w{3} \d{1,2}, \d{4})"
        height_pattern = r"(\d{2,3})cm"
        weight_pattern = r"(\d{2,3})kg"

        data["birth_day"] = re.search(birth_day_pattern, soup.select_one(".profile.clearfix p").text).group(1)
        data["height"] = int(re.search(height_pattern, soup.select_one(".profile.clearfix p").text).group(1))
        data["weight"] = int(re.search(weight_pattern, soup.select_one(".profile.clearfix p").text).group(1))


        data["overall_rating"] = int(soup.select_one('div.sub:-soup-contains("Overall")').parent.select_one('em').text)
        data["potential"] = int(soup.select_one('div.sub:-soup-contains("Potential")').parent.select_one('em').text)
        data["value"] = soup.select_one('div.sub:-soup-contains("Value")').parent.select_one('em').text
        data["wage"] = soup.select_one('div.sub:-soup-contains("Wage")').parent.select_one('em').text
        data["stat_crossing"] = int(soup.select_one('span:-soup-contains("Crossing")').find_previous_sibling().text)
        data["stat_finishing"] = int(soup.select_one('span:-soup-contains("Finishing")').find_previous_sibling().text)
        data["stat_heading_accuracy"] = int(soup.select_one('span:-soup-contains("Heading accuracy")').find_previous_sibling().text)
        data["stat_short_passing"] = int(soup.select_one('span:-soup-contains("Short passing")').find_previous_sibling().text)
        data["stat_volleys"] = int(soup.select_one('span:-soup-contains("Volleys")').find_previous_sibling().text)
        data["stat_dribbling"] = int(soup.select_one('span:-soup-contains("Dribbling")').find_previous_sibling().text)
        data["stat_curve"] = int(soup.select_one('span:-soup-contains("Curve")').find_previous_sibling().text)
        data["stat_fk_accuracy"] = int(soup.select_one('span:-soup-contains("FK Accuracy")').find_previous_sibling().text)
        data["stat_long_passing"] = int(soup.select_one('span:-soup-contains("Long passing")').find_previous_sibling().text)
        data["stat_ball_control"] = int(soup.select_one('span:-soup-contains("Ball control")').find_previous_sibling().text)
        data["stat_accelaration"] = int(soup.select_one('span:-soup-contains("Acceleration")').find_previous_sibling().text)
        data["stat_sprint_speed"] = int(soup.select_one('span:-soup-contains("Sprint speed")').find_previous_sibling().text)
        data["stat_agility"] = int(soup.select_one('span:-soup-contains("Agility")').find_previous_sibling().text)
        data["stat_reactions"] = int(soup.select_one('span:-soup-contains("Reactions")').find_previous_sibling().text)
        data["stat_balance"] = int(soup.select_one('span:-soup-contains("Balance")').find_previous_sibling().text)
        data["stat_reactions"] = int(soup.select_one('span:-soup-contains("Shot power")').find_previous_sibling().text)
        data["stat_jumping"] = int(soup.select_one('span:-soup-contains("Jumping")').find_previous_sibling().text)
        data["stat_stamina"] = int(soup.select_one('span:-soup-contains("Stamina")').find_previous_sibling().text)
        data["stat_strength"] = int(soup.select_one('span:-soup-contains("Strength")').find_previous_sibling().text)
        data["stat_long_shots"] = int(soup.select_one('span:-soup-contains("Long shots")').find_previous_sibling().text)
        data["stat_aggression"] = int(soup.select_one('span:-soup-contains("Aggression")').find_previous_sibling().text)
        data["stat_interceptions"] = int(soup.select_one('span:-soup-contains("Interceptions")').find_previous_sibling().text)
        data["stat_att_position"] = int(soup.select_one('span:-soup-contains("Att. Position")').find_previous_sibling().text)
        data["stat_vision"] = int(soup.select_one('span:-soup-contains("Vision")').find_previous_sibling().text)
        data["stat_penalties"] = int(soup.select_one('span:-soup-contains("Penalties")').find_previous_sibling().text)
        data["stat_composure"] = int(soup.select_one('span:-soup-contains("Composure")').find_previous_sibling().text)
        # data["stat_marking"] = int(soup.select_one('span:-soup-contains("Marking")').find_previous_sibling().text)
        data["stat_marking"] = int(soup.select_one('span:-soup-contains("Defensive awareness")').find_previous_sibling().text)
        data["stat_standing_tackle"] = int(soup.select_one('span:-soup-contains("Standing tackle")').find_previous_sibling().text)
        data["stat_sliding_tackle"] = int(soup.select_one('span:-soup-contains("Sliding tackle")').find_previous_sibling().text)
        data["stat_gk_diving"] = int(soup.select_one('span:-soup-contains("GK Diving")').find_previous_sibling().text)
        data["stat_gk_handling"] = int(soup.select_one('span:-soup-contains("GK Handling")').find_previous_sibling().text)
        data["stat_gk_kicking"] = int(soup.select_one('span:-soup-contains("GK Kicking")').find_previous_sibling().text)
        data["stat_gk_positioning"] = int(soup.select_one('span:-soup-contains("GK Positioning")').find_previous_sibling().text)
        data["stat_gk_reflexes"] = int(soup.select_one('span:-soup-contains("GK Reflexes")').find_previous_sibling().text)

        data["preferred_foot"] = soup.select_one('p:-soup-contains("Preferred foot")').text.split()[2]


        return data
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

### Test scrapowania pojedynczego zawodnika

In [4]:
scrape_player("https://sofifa.com/player/177475/tobias-sippel/220069/", "2019-2020")

{'season': '2019-2020',
 'name': '\xa0Tobias Sippel',
 'position': 'GK',
 'birth_day': 'Mar 22, 1988',
 'height': 183,
 'weight': 80,
 'overall_rating': 73,
 'potential': 73,
 'value': '€1M',
 'wage': '€16K',
 'stat_crossing': 14,
 'stat_finishing': 12,
 'stat_heading_accuracy': 13,
 'stat_short_passing': 33,
 'stat_volleys': 12,
 'stat_dribbling': 13,
 'stat_curve': 15,
 'stat_fk_accuracy': 14,
 'stat_long_passing': 26,
 'stat_ball_control': 19,
 'stat_accelaration': 50,
 'stat_sprint_speed': 42,
 'stat_agility': 48,
 'stat_reactions': 57,
 'stat_balance': 41,
 'stat_jumping': 64,
 'stat_stamina': 41,
 'stat_strength': 65,
 'stat_long_shots': 11,
 'stat_aggression': 36,
 'stat_interceptions': 15,
 'stat_att_position': 16,
 'stat_vision': 31,
 'stat_penalties': 18,
 'stat_composure': 41,
 'stat_marking': 12,
 'stat_standing_tackle': 15,
 'stat_sliding_tackle': 12,
 'stat_gk_diving': 74,
 'stat_gk_handling': 69,
 'stat_gk_kicking': 76,
 'stat_gk_positioning': 69,
 'stat_gk_reflexes': 77

### Scrapowanie zespołów

In [5]:
def scrape_players(base_url, season):

    headers = {'User-Agent': 'Mozilla/5.0'}
    offset = 0
    flag = True
    links = []
    while flag:
        url = base_url + "&offset=" + str(offset)
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            offset += 60
            soup = BeautifulSoup(response.content, 'html.parser')

            trs = soup.select("article tbody tr")
            for tr in trs:
                tds = tr.find_all("td")
                if len(tds) > 1:
                    links.append(tds[1].select_one("a").get("href"))
        else:
            flag = False
            print(f"Failed to retrieve the page. Status code: {response.status_code}")
    player_data = []
    for link in links:
        print(link)
        player_data.append(scrape_player("https://sofifa.com/" + link, season))
    return player_data

In [6]:
# sofifa_players_17_18 = scrape_players("https://sofifa.com/players?type=all&lg%5B0%5D=13&lg%5B1%5D=16&lg%5B2%5D=19&lg%5B3%5D=31&lg%5B4%5D=53&r=170099&set=true", "2017-2018")
# sofifa_players_18_19 = scrape_players("https://sofifa.com/players?type=all&lg%5B0%5D=13&lg%5B1%5D=16&lg%5B2%5D=19&lg%5B3%5D=31&lg%5B4%5D=53&r=180084&set=true", "2018-2019")
# sofifa_players_19_20 = scrape_players("https://sofifa.com/players?type=all&lg%5B0%5D=13&lg%5B1%5D=16&lg%5B2%5D=19&lg%5B3%5D=31&lg%5B4%5D=53&r=190075&set=true", "2019-2020")
# sofifa_players_20_21 = scrape_players("https://sofifa.com/players?type=all&lg%5B0%5D=13&lg%5B1%5D=16&lg%5B2%5D=19&lg%5B3%5D=31&lg%5B4%5D=53&r=200061&set=true", "2020-2021")
# sofifa_players_21_22 = scrape_players("https://sofifa.com/players?type=all&lg%5B0%5D=13&lg%5B1%5D=16&lg%5B2%5D=19&lg%5B3%5D=31&lg%5B4%5D=53&r=210064&set=true", "2021-2022")
# sofifa_players_22_23 = scrape_players("https://sofifa.com/players?type=all&lg%5B0%5D=13&lg%5B1%5D=16&lg%5B2%5D=19&lg%5B3%5D=31&lg%5B4%5D=53&r=220069&set=true", "2022-2023")
# sofifa_players_23_24 = scrape_players("https://sofifa.com/players?type=all&lg%5B0%5D=13&lg%5B1%5D=16&lg%5B2%5D=19&lg%5B3%5D=31&lg%5B4%5D=53&r=230054&set=true", "2023-2024")
sofifa_players_24_25 = scrape_players("https://sofifa.com/players?type=all&lg%5B0%5D=13&lg%5B1%5D=16&lg%5B2%5D=19&lg%5B3%5D=31&lg%5B4%5D=53&r=240050&set=true", "2024-2025")

Failed to retrieve the page. Status code: 404
/player/192985/kevin-de-bruyne/240050/
/player/239085/erling-haaland/240050/
/player/231747/kylian-mbappe/240050/
/player/231866/rodrigo-hernandez-cascante/240050/
/player/202126/harry-kane/240050/
/player/188545/robert-lewandowski/240050/
/player/192119/thibaut-courtois/240050/
/player/203376/virgil-van-dijk/240050/
/player/209331/mohamed-salah/240050/
/player/212831/alisson-ramses-becker/240050/
/player/239818/ruben-santos-gato-alves-dias/240050/
/player/192448/marc-andre-ter-stegen/240050/
/player/238794/vinicius-jose-de-oliveira-junior/240050/
/player/222665/martin-odegaard/240050/
/player/210257/ederson-santana-de-moraes/240050/
/player/218667/bernardo-mota-carvalho-e-silva/240050/
/player/231478/lautaro-martinez/240050/
/player/194765/antoine-griezmann/240050/
/player/200389/jan-oblak/240050/
/player/239053/federico-valverde/240050/
/player/252371/jude-bellingham/240050/
/player/234378/declan-rice/240050/
/player/237692/phil-foden/240

In [7]:
# df_sofifa_players_17_18 = pd.DataFrame(sofifa_players_17_18)
# df_sofifa_players_18_19 = pd.DataFrame(sofifa_players_18_19)
# df_sofifa_players_19_20 = pd.DataFrame(sofifa_players_19_20)
# df_sofifa_players_20_21 = pd.DataFrame(sofifa_players_20_21)
# df_sofifa_players_21_22 = pd.DataFrame(sofifa_players_21_22)
# df_sofifa_players_22_23 = pd.DataFrame(sofifa_players_22_23)
# df_sofifa_players_23_24 = pd.DataFrame(sofifa_players_23_24)
df_sofifa_players_24_25 = pd.DataFrame(sofifa_players_24_25)

In [8]:
# df_sofifa_players_17_18.to_csv("data/sofifa_players_17_18.csv", index=False)
# df_sofifa_players_18_19.to_csv("data/sofifa_players_18_19.csv", index=False)
# df_sofifa_players_19_20.to_csv("data/sofifa_players_19_20.csv", index=False)
# df_sofifa_players_20_21.to_csv("data/sofifa_players_20_21.csv", index=False)
# df_sofifa_players_21_22.to_csv("data/sofifa_players_21_22.csv", index=False)
# df_sofifa_players_22_23.to_csv("data/sofifa_players_22_23.csv", index=False)
df_sofifa_players_24_25.to_csv("data/sofifa_players_24_25.csv", index=False)