Adapted from https://github.com/amanthedorkknight/fifa18-all-player-statistics/blob/master/2019/crawler.py

In [8]:
#!/usr/bin/env python3

# Adapted from https://github.com/amanthedorkknight/fifa18-all-player-statistics/blob/master/2019/crawler.py

import sys
from time import sleep
from random import randint
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup

import logging,sys
logging.basicConfig(stream=sys.stderr, level=logging.INFO)

# Build a full list of players. Set this to False for a rerun where we've already gathered this information.
get_basic_player_info = True
saved_player_list = 'basic_player_info.json.zip'

# If you want to write blocks of player records to the CSV more or less often,
# change player_block_size. Don't mess with processed_count.
player_block_size = 500
processed_count = 1

output_file = 'data.csv'

# Overwrite an existing output CSV (this switches to 'a' in the loop, to append)
to_csv_mode = 'w'

# Only write a header the first time we write to the CSV.
write_header = True

base_url = "https://sofifa.com/players?offset="
columns = ['ID', 'Name', 'Age', 'Photo', 'Nationality', 'Flag', 'Overall', 'Potential', 'Club', 'Club Logo', 'Value', 'Wage', 'Special']
data = pd.DataFrame(columns = columns)

In [9]:
# Get basic players information for all players
if (get_basic_player_info):
    for offset in range(0, 300):
        url = base_url + str(offset * 61)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'html.parser')
        table_body = soup.find('tbody')
        for row in table_body.findAll('tr'):
            td = row.findAll('td')
            picture = td[0].find('img').get('data-src')
            pid = td[0].find('img').get('id')
            nationality = td[1].find('a').get('title')
            flag_img = td[1].find('img').get('data-src')
            name = td[1].findAll('a')[1].text
            logging.info("Retrieving player info for {}, ID {}".format(name,pid))
            age = td[2].text.strip()
            overall = td[3].text.strip()
            potential = td[4].text.strip()
            club = td[5].find('a').text
            club_logo = td[5].find('img').get('data-src')
            value = td[6].text.strip()
            wage = td[7].text.strip()
            special = td[8].text.strip()
            player_data = pd.DataFrame([[pid, name, age, picture, nationality, flag_img, overall, potential, club, club_logo, value, wage, special]])
            player_data.columns = columns
            data = data.append(player_data, ignore_index=True)
    data = data.drop_duplicates()

# Write basic player info, so we don't have to retrieve it again in a rerun.
    data.to_json(saved_player_list,orient='split',index=False)
else:
# Try reading the players list from a saved file
    data = pd.read_json(saved_player_list,orient='split')
    logging.info("Successfully read {}".format(saved_player_list))

INFO:root:Retrieving player info for Borja Iglesias, ID 224179
INFO:root:Retrieving player info for D. Man, ID 239978
INFO:root:Retrieving player info for D. Arzani, ID 234867
INFO:root:Retrieving player info for L. Waldschmidt, ID 220085
INFO:root:Retrieving player info for A. Halilović, ID 216349
INFO:root:Retrieving player info for C. Nkunku, ID 232411
INFO:root:Retrieving player info for P. Foden, ID 237692
INFO:root:Retrieving player info for Savio, ID 8287
INFO:root:Retrieving player info for K. Havertz, ID 235790
INFO:root:Retrieving player info for Adama Traoré, ID 213956
INFO:root:Retrieving player info for S. Gnabry, ID 206113
INFO:root:Retrieving player info for João Félix, ID 242444
INFO:root:Retrieving player info for E. Håland, ID 239085
INFO:root:Retrieving player info for M. Kean, ID 236610
INFO:root:Retrieving player info for Fábio Silva, ID 252037
INFO:root:Retrieving player info for D. Malen, ID 231447
INFO:root:Retrieving player info for N. Domínguez, ID 237819
INFO

INFO:root:Retrieving player info for C. Romero, ID 232488
INFO:root:Retrieving player info for T. Müller, ID 189596
INFO:root:Retrieving player info for L. Goretzka, ID 209658
INFO:root:Retrieving player info for T. Kehrer, ID 226229
INFO:root:Retrieving player info for A. Marin, ID 244267
INFO:root:Retrieving player info for O. Zinchenko, ID 227813
INFO:root:Retrieving player info for Rúben Dias, ID 239818
INFO:root:Retrieving player info for M. Zaracho, ID 235926
INFO:root:Retrieving player info for M. Škriniar, ID 232363
INFO:root:Retrieving player info for S. Esposito, ID 247888
INFO:root:Retrieving player info for C. Eriksen, ID 190460
INFO:root:Retrieving player info for Renan Lodi, ID 251573
INFO:root:Retrieving player info for M. Gibbs-White, ID 236015
INFO:root:Retrieving player info for P. Cutrone, ID 237715
INFO:root:Retrieving player info for Junior Firpo, ID 241184
INFO:root:Retrieving player info for W. Zaha, ID 198717
INFO:root:Retrieving player info for Emerson, ID 2472

In [10]:
logging.info("Player info dataframe shape: {}".format(data.shape))

INFO:root:Player info dataframe shape: (180, 13)


In [11]:
# Get detailed player information from player page.
# 2020 updates: 'Release Clause' and 'DefensiveAwareness' added.
detailed_columns = ['ID', 'Preferred Foot', 'International Reputation', 'Weak Foot', 'Skill Moves', 'Work Rate', 'Body Type', 'Real Face', 'Player Positions', 'Position', 'Jersey Number', 'Joined', 'Loaned From', 'Contract Valid Until', 'Release Clause', 'Height', 'Weight', 'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure', 'Marking', 'DefensiveAwareness', 'StandingTackle', 'SlidingTackle', 'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes']
detailed_data = pd.DataFrame(index = range(0, data.count()[0]), columns = detailed_columns)
player_data_url = 'https://sofifa.com/player/'

In [19]:
for id in data.ID:
# It's rude to hammer a website with requests. Let's sleep for some random amount of time to slow things down.
    sleep(randint(1,5))

    skill_map = {}
    logging.info("Retrieving individual info for player ID {}. Processed {}/{}".format(id,processed_count,player_block_size))
    url = player_data_url + str(id)
    try:
        source_code = requests.get(url)
    except:
        logging.info("Unable to retrieve info for player ID {}! Skipping.".format(id))
        continue
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, 'html.parser')

# Metadata format is: first_name, last_name, flag (returns an empty string), position(s), 'Age', age_number, '('+birth_month, birth_day+',', birth_year+')'
    meta_data = soup.find('div', {'class': 'meta'}).text.split(' ')
    length = len(meta_data)
    flag_space_ind = meta_data.index('')
    age_string_ind = meta_data.index('Age')
# We'll capture all of the positions listed here now. We can decide what to do with them in analysis.
    skill_map['Player Positions'] = str.join(' ',meta_data[flag_space_ind+1:age_string_ind])
    weight = meta_data[length - 1]
    height = meta_data[length - 2].split('\'')[0] + '\'' + meta_data[length - 2].split('\'')[1].split('\"')[0]
    skill_map["Height"] = height
    skill_map['Weight'] = weight

# Players' attributes have been split across a few different classes, so we'll extract values from all of them.
    for col_idx in [5,6]:
        columns = soup.find('div', {'class': 'teams'}).find('div', {'class': 'columns'}).find_all('div', {'class': 'column col-{}'.format(col_idx)})
        for column in columns:
            skills = column.find_all('li')
            for skill in skills:
                if(skill.find('label') != None):
                    label = skill.find('label').text
                    value = skill.text.replace(label, '').strip()
                    skill_map[label] = value

# col-5 information (the team to which a player belongs, and the position they play) may not be populated.
# If it isn't, use the first position listed next to their name as Position.
    if('Position' not in skill_map.keys()):
        skill_map['Position'] = meta_data[3]


# Position scores have been moved outside of the 'article' tag to an 'aside' tag.
# Goalkeepers don't have these stats.
    if (skill_map['Position'] != 'GK'):
        positions = soup.find('aside').find_all('div', class_='columns')
        for position in positions:
            for pos_div in position.find_all('div', class_=re.compile('column col-sm-2 text-center p')):
                my_output = re.split(r'(\d+)',pos_div.text,maxsplit=1)
                my_output[0] = my_output[0][1:]

# There is a bonus awarded to some players. Capture it--we can decide what to do with it in analysis.
                skill_map[my_output[0]] = str.join('',my_output[1:3])

# The rest of the players' attributes have been split into a bunch of classes as well.
# Bonus work, tbd: Capture the class information regarding the players' skill levels in each area--
#  tags look like "bp3-intent-(success|warning|danger)". Could be useful summary info?
    sections = soup.find('article').find_all('div', class_=re.compile('bp3-callout '))[2:]
    for section in sections:
        items = section.find('ul').find_all('li')
        for item in items:
            value = int(re.findall(r'\d+', item.text)[0])
            name = ''.join(re.findall('[a-zA-Z]*', item.text))
            skill_map[str(name)] = value

    detailed_data = detailed_data.append({'ID': id},ignore_index=True)
    for key, value in skill_map.items():
        detailed_data.loc[detailed_data.ID == id, key] = value
    processed_count += 1

# Have we processed a full block of records? If so, merge and write them to the CSV.
    if (processed_count > player_block_size):
        logging.info("Merging player data...")
        full_data = pd.merge(data, detailed_data, how = 'inner', on = 'ID')
        logging.info("Writing player data to CSV...")
        full_data.to_csv(output_file, encoding='utf-8-sig',mode=to_csv_mode,header=write_header)
        processed_count = 1
        to_csv_mode = 'a'
        write_header = False
        detailed_data = pd.DataFrame(index = range(0, data.count()[0]), columns = detailed_columns)

INFO:root:Retrieving individual info for player ID 224179. Processed 2/500
INFO:root:Retrieving individual info for player ID 239978. Processed 3/500
INFO:root:Retrieving individual info for player ID 234867. Processed 4/500
INFO:root:Retrieving individual info for player ID 220085. Processed 5/500
INFO:root:Retrieving individual info for player ID 216349. Processed 6/500
INFO:root:Retrieving individual info for player ID 232411. Processed 7/500
INFO:root:Retrieving individual info for player ID 237692. Processed 8/500
INFO:root:Retrieving individual info for player ID 8287. Processed 9/500
INFO:root:Retrieving individual info for player ID 235790. Processed 10/500
INFO:root:Retrieving individual info for player ID 213956. Processed 11/500
INFO:root:Retrieving individual info for player ID 206113. Processed 12/500
INFO:root:Retrieving individual info for player ID 242444. Processed 13/500
INFO:root:Retrieving individual info for player ID 239085. Processed 14/500
INFO:root:Retrieving i

INFO:root:Retrieving individual info for player ID 242816. Processed 110/500
INFO:root:Retrieving individual info for player ID 226491. Processed 111/500
INFO:root:Retrieving individual info for player ID 205693. Processed 112/500
INFO:root:Retrieving individual info for player ID 246420. Processed 113/500
INFO:root:Retrieving individual info for player ID 230658. Processed 114/500
INFO:root:Retrieving individual info for player ID 225100. Processed 115/500
INFO:root:Retrieving individual info for player ID 246191. Processed 116/500
INFO:root:Retrieving individual info for player ID 243828. Processed 117/500
INFO:root:Retrieving individual info for player ID 242236. Processed 118/500
INFO:root:Retrieving individual info for player ID 235353. Processed 119/500
INFO:root:Retrieving individual info for player ID 229984. Processed 120/500
INFO:root:Retrieving individual info for player ID 252259. Processed 121/500
INFO:root:Retrieving individual info for player ID 231866. Processed 122/500

In [21]:
# One final merge and write to cover whatever we haven't yet processed
logging.info("Merging player data...")
full_data = pd.merge(data, detailed_data, how = 'inner', on = 'ID')
logging.info("Writing player data to CSV...")
full_data.to_csv(output_file, encoding='utf-8-sig',mode=to_csv_mode,header=write_header)

INFO:root:Merging player data...
INFO:root:Writing player data to CSV...


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17948 entries, 0 to 17947
Data columns (total 13 columns):
ID             17948 non-null int64
Name           17948 non-null object
Age            17948 non-null int64
Photo          17948 non-null object
Nationality    17948 non-null object
Flag           17948 non-null object
Overall        17948 non-null int64
Potential      17948 non-null int64
Club           17948 non-null object
Club Logo      17948 non-null object
Value          17948 non-null object
Wage           17948 non-null object
Special        17948 non-null int64
dtypes: int64(5), object(8)
memory usage: 1.8+ MB


In [20]:
meta_data

['Julian',
 'Brandt',
 '',
 'CAM',
 'LM',
 'RM',
 'Age',
 '23',
 '(May',
 '2,',
 '1996)',
 '6\'1"',
 '183lbs']

In [20]:
length = len(meta_data)
flag_space_ind = meta_data.index('')
age_string_ind = meta_data.index('Age')

In [21]:
flag_space_ind

2

In [22]:
age_string_ind

5

In [26]:
meta_data[flag_space_ind+1:age_string_ind]

['CF', 'ST']

In [27]:
skill_map['Player Positions'] = meta_data[flag_space_ind+1:age_string_ind]

In [42]:
pos_div.text

'\nRB50+2'

In [49]:
my_output = re.split(r'(\d+)',pos_div.text,maxsplit=1)
my_output

['\nRB', '50', '+2']

In [15]:
skill_map

{'Player Positions': ['RM', 'RW'],
 'Height': "6'0",
 'Weight': '181lbs',
 'Position': 'RW',
 'Jersey Number': '98',
 'Joined': 'Sep 5, 2016',
 'Contract Valid Until': '2022',
 'Preferred Foot': 'Left',
 'International Reputation': '1',
 'Weak Foot': '2',
 'Skill Moves': '3',
 'Work Rate': 'High/ Low',
 'Body Type': 'Normal',
 'Real Face': 'No',
 'Release Clause': '€8.8M',
 'LS': '71+2',
 'ST': '71+2',
 'RS': '71+2',
 'LW': '71+2',
 'LF': '71+2',
 'CF': '71+2',
 'RF': '71+2',
 'RW': '71+2',
 'LAM': '69+2',
 'CAM': '69+2',
 'RAM': '69+2',
 'LM': '70+2',
 'LCM': '62+2',
 'CM': '62+2',
 'RCM': '62+2',
 'RM': '70+2',
 'LWB': '54+2',
 'LDM': '50+2',
 'CDM': '50+2',
 'RDM': '50+2',
 'RWB': '54+2',
 'LB': '51+2',
 'LCB': '47+2',
 'CB': '47+2',
 'RCB': '47+2',
 'RB': '51+2',
 'Crossing': 66,
 'Finishing': 73,
 'HeadingAccuracy': 63,
 'ShortPassing': 66,
 'Volleys': 57,
 'Dribbling': 71,
 'Curve': 64,
 'FKAccuracy': 56,
 'LongPassing': 54,
 'BallControl': 73,
 'Acceleration': 86,
 'SprintSpeed'

In [16]:
detailed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17949 entries, 0 to 17948
Data columns (total 77 columns):
ID                          1 non-null float64
Preferred Foot              1 non-null object
International Reputation    1 non-null object
Weak Foot                   1 non-null object
Skill Moves                 1 non-null object
Work Rate                   1 non-null object
Body Type                   1 non-null object
Real Face                   1 non-null object
Position                    1 non-null object
Jersey Number               1 non-null object
Joined                      1 non-null object
Loaned From                 0 non-null float64
Contract Valid Until        1 non-null object
Release Clause              1 non-null object
Height                      1 non-null object
Weight                      1 non-null object
LS                          1 non-null float64
ST                          1 non-null float64
RS                          1 non-null float64
LW          

In [32]:
skill_map

{'Player Positions': ['ST'],
 'Height': "6'2",
 'Weight': '190lbs',
 'Position': 'ST',
 'Jersey Number': '9',
 'Joined': 'Aug 14, 2019',
 'Contract Valid Until': '2024',
 'Preferred Foot': 'Right',
 'International Reputation': '1',
 'Weak Foot': '3',
 'Skill Moves': '3',
 'Work Rate': 'High/ Medium',
 'Body Type': 'Normal',
 'Real Face': 'No',
 'Release Clause': '€70.7M',
 'LS': 82,
 'ST': 82,
 'RS': 82,
 'LW': 76,
 'LF': 80,
 'CF': 80,
 'RF': 80,
 'RW': 76,
 'LAM': 77,
 'CAM': 77,
 'RAM': 77,
 'LM': 74,
 'LCM': 72,
 'CM': 72,
 'RCM': 72,
 'RM': 74,
 'LWB': 55,
 'LDM': 58,
 'CDM': 58,
 'RDM': 58,
 'RWB': 55,
 'LB': 52,
 'LCB': 53,
 'CB': 53,
 'RCB': 53,
 'RB': 52,
 'Crossing': 50,
 'Finishing': 86,
 'HeadingAccuracy': 82,
 'ShortPassing': 71,
 'Volleys': 71,
 'Dribbling': 77,
 'Curve': 41,
 'FKAccuracy': 36,
 'LongPassing': 65,
 'BallControl': 80,
 'Acceleration': 65,
 'SprintSpeed': 69,
 'Agility': 64,
 'Reactions': 78,
 'Balance': 72,
 'ShotPower': 83,
 'Jumping': 70,
 'Stamina': 76,

In [33]:
skill_map['Player Positions'][0]

'ST'