### Data collection
Source: https://sofifa.com/

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
# Scrape settings: the reference year used when filling in contract dates and
# the approximate number of player records to collect.
current_year = 2022
n_players = 10000

# Build the list of listing-page URLs to visit. The first page carries no
# offset; each following page is addressed by an offset growing in steps of 60
# (presumably the number of rows per listing page -- confirm against the site).
# n_players // 60 + 2 offset pages are requested in addition to the first one.
first_page = "https://sofifa.com/players?col=oa&sort=desc"
n_offset_pages = n_players // 60 + 2
urls = [first_page] + [
    first_page + '&offset=' + str(60 * page_no)
    for page_no in range(1, n_offset_pages + 1)
]

In [3]:
# helpers to turn the cells of one player table row into typed player stats
def _parse_money(text):
    """Convert a scraped money string such as '€78M', '€320K' or '€500' to a float.

    The first character of ``text`` is dropped (the currency symbol in the
    scraped cells). A trailing 'M' scales the number by 1,000,000 and a trailing
    'K' by 1,000; a number without suffix is converted as is. Text that cannot
    be parsed as a number yields NaN, so the resulting column stays numeric.
    """
    amount = text[1:].strip()
    multiplier = 1
    if amount.endswith('M'):
        multiplier, amount = 1000000, amount[:-1]
    elif amount.endswith('K'):
        multiplier, amount = 1000, amount[:-1]
    try:
        return float(amount) * multiplier
    except ValueError:
        return float('nan')


def get_player_stats(player_columns):
    """Extract the stats of one player from the cells of a player table row.

    Parameters
    ----------
    player_columns : sequence of parsed ``<td>`` cells
        Cell 1 holds the name link, the country flag image (its ``title`` is the
        country) and the best-position link; cells 2-4 hold age, overall rating
        and potential; cell 5 holds the team link and the contract text; cells
        6 and 7 hold the market value and the wage.

    Returns
    -------
    tuple
        ``(name, age, country, best_position, rat_ova, rat_pot, team,
        contract_start, contract_exp, value, wage)``. ``value`` and ``wage`` are
        always floats (NaN when the text cannot be parsed).

    Notes
    -----
    Reads the module-level ``current_year``: players on loan get it as contract
    start, free agents get it as contract expiration (with a start of 0).
    """
    name = player_columns[1].find_all('a')[0].text
    country = player_columns[1].find_all('img')[0].get('title')
    best_position = player_columns[1].find_all('a')[1].text
    age = int(player_columns[2].text)
    rat_ova = int(player_columns[3].text)
    rat_pot = int(player_columns[4].text)
    team = player_columns[5].find('a').text

    # The contract information is only available as markup text of the team cell.
    team_html = str(player_columns[5])
    if 'On Loan' in team_html:
        # The four characters right before the 'On Loan' marker are the expiry year.
        contract_exp = int(team_html.split('On Loan')[0].split('\n')[-1].split('<')[0].strip()[-4:])
        contract_start = current_year
    elif 'Free' in team_html:
        contract_exp = current_year
        contract_start = 0
    else:
        # Regular contracts are rendered as '<start> ~ <expiry>'.
        contract_exp = int(team_html.split('~')[1].split(' ')[1][:4])
        contract_start = int(team_html.split('~')[0][-5:].strip())

    # Amounts without an 'M'/'K' suffix (e.g. '€500') used to be kept as raw
    # strings, which produced mixed str/float columns; both now go through the
    # same numeric parser.
    value = _parse_money(player_columns[6].text)
    wage = _parse_money(player_columns[7].text)
    return name, age, country, best_position, rat_ova, rat_pot, team, contract_start, contract_exp, value, wage

In [4]:
# scrape the data
# Column names of the resulting frame; their order must match the tuple
# returned by get_player_stats.
stat_columns = ['name', 'age', 'country', 'position', 'overall', 'potential', 'team',
                'start', 'expiration', 'value', 'wage']
player_rows = []  # one stats tuple per scraped player

for page in urls: # iterate: urls/pages
    print(page)
    # The timeout (seconds) keeps a single stalled connection from hanging the
    # whole run; raise_for_status() stops on HTTP errors (e.g. rate limiting)
    # instead of trying to parse an error page.
    response = requests.get(page, timeout=30)
    response.raise_for_status()
    page_soup = BeautifulSoup(response.content, 'html.parser')
    # The player listing is the first table of the page; one <tr> per player.
    table_rows = page_soup.find_all('table')[0].find('tbody').find_all('tr')

    for player_row in table_rows: # iterate: players
        player_rows.append(get_player_stats(player_row.find_all('td')))

player_stats = pd.DataFrame(player_rows, columns=stat_columns)

https://sofifa.com/players?col=oa&sort=desc
https://sofifa.com/players?col=oa&sort=desc&offset=60
https://sofifa.com/players?col=oa&sort=desc&offset=120
https://sofifa.com/players?col=oa&sort=desc&offset=180
https://sofifa.com/players?col=oa&sort=desc&offset=240
https://sofifa.com/players?col=oa&sort=desc&offset=300
https://sofifa.com/players?col=oa&sort=desc&offset=360
https://sofifa.com/players?col=oa&sort=desc&offset=420
https://sofifa.com/players?col=oa&sort=desc&offset=480
https://sofifa.com/players?col=oa&sort=desc&offset=540
https://sofifa.com/players?col=oa&sort=desc&offset=600
https://sofifa.com/players?col=oa&sort=desc&offset=660
https://sofifa.com/players?col=oa&sort=desc&offset=720
https://sofifa.com/players?col=oa&sort=desc&offset=780
https://sofifa.com/players?col=oa&sort=desc&offset=840
https://sofifa.com/players?col=oa&sort=desc&offset=900
https://sofifa.com/players?col=oa&sort=desc&offset=960
https://sofifa.com/players?col=oa&sort=desc&offset=1020
https://sofifa.com/pl

https://sofifa.com/players?col=oa&sort=desc&offset=8820
https://sofifa.com/players?col=oa&sort=desc&offset=8880
https://sofifa.com/players?col=oa&sort=desc&offset=8940
https://sofifa.com/players?col=oa&sort=desc&offset=9000
https://sofifa.com/players?col=oa&sort=desc&offset=9060
https://sofifa.com/players?col=oa&sort=desc&offset=9120
https://sofifa.com/players?col=oa&sort=desc&offset=9180
https://sofifa.com/players?col=oa&sort=desc&offset=9240
https://sofifa.com/players?col=oa&sort=desc&offset=9300
https://sofifa.com/players?col=oa&sort=desc&offset=9360
https://sofifa.com/players?col=oa&sort=desc&offset=9420
https://sofifa.com/players?col=oa&sort=desc&offset=9480
https://sofifa.com/players?col=oa&sort=desc&offset=9540
https://sofifa.com/players?col=oa&sort=desc&offset=9600
https://sofifa.com/players?col=oa&sort=desc&offset=9660
https://sofifa.com/players?col=oa&sort=desc&offset=9720
https://sofifa.com/players?col=oa&sort=desc&offset=9780
https://sofifa.com/players?col=oa&sort=desc&offs

In [5]:
# remove duplicate records
# Flag fully repeated rows once, report their count and share, then drop them
# in place (the index is renumbered from 0).
duplicate_mask = player_stats.duplicated()
duplicate_report = 'duplicate records: {}/{:.1f}%'.format(np.sum(duplicate_mask),
                                                          100*np.mean(duplicate_mask))
print(duplicate_report)
player_stats.drop_duplicates(inplace=True, ignore_index=True)
player_stats

duplicate records: 105/1.0%


Unnamed: 0,name,age,country,position,overall,potential,team,start,expiration,value,wage
0,L. Messi,34,Argentina,RW,93,93,Paris Saint-Germain,2021,2023,78000000.0,320000.0
1,R. Lewandowski,32,Poland,ST,92,92,FC Bayern München,2014,2023,119500000.0,270000.0
2,K. Mbappé,22,France,ST,91,95,Paris Saint-Germain,2018,2022,194000000.0,230000.0
3,M. Salah,29,Egypt,RW,91,91,Liverpool,2017,2023,129000000.0,350000.0
4,K. De Bruyne,30,Belgium,CM,91,91,Manchester City,2015,2025,125500000.0,350000.0
...,...,...,...,...,...,...,...,...,...,...,...
10030,P. Pokorný,19,Slovakia,CDM,65,78,Real Sociedad B,2021,2024,1600000.0,1000.0
10031,T. Antonis,27,Australia,CM,65,65,Western Sydney Wanderers,2021,2024,750000.0,3000.0
10032,S. Gannon,29,Republic of Ireland,RWB,65,65,Shamrock Rovers,2020,2023,625000.0,1000.0
10033,L. Dalsaro,29,Uruguay,CDM,65,65,Cerro Largo Fútbol Club,2022,2026,600000.0,500


In [6]:
# save output
# The file name embeds the configured record count (n_players).
output_file = f'player_stats_{n_players}.csv'
player_stats.to_csv(output_file, encoding='utf-8')