In [1]:
!python --version

Python 3.7.5


# Web Scraping from sofifa.com

We will use the following libraries:
- `requests` to send HTTP requests to sofifa.com and get web pages programmatically.
- `BeautifulSoup` for parsing web pages and extracting information.
- `pandas` to create a tabular structure of the data we extract and save it locally.

In [2]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd

# Report the version of every third-party library this notebook depends on
print('Library versions:\n')
for label, module in (('requests', requests), ('BeautifulSoup', bs4), ('pandas', pd)):
    print(f'{label}: {module.__version__}')

Library versions:

requests: 2.22.0
BeautifulSoup: 4.8.2
pandas: 0.25.3


By default, sofifa.com displays only 15 attributes per player. If we select every attribute on the site, the page URL changes to include one `showCol` parameter per selected column. We will use that URL to programmatically fetch all the attributes of the players:

In [3]:
# Player-list URL with every attribute column switched on.
# Each column is requested through one `showCol[<position>]=<code>` query
# parameter; the square brackets are percent-encoded as %5B / %5D.
# The codes are listed in the order they appear in the query string.
BASE_URL = 'https://sofifa.com/players?' + '&'.join(
    f'showCol%5B{position}%5D={code}'
    for position, code in enumerate((
        'pi', 'ae', 'hi', 'wi', 'pf', 'oa', 'pt', 'bo', 'bp', 'gu',
        'jt', 'le', 'vl', 'wg', 'rc', 'ta', 'cr', 'fi', 'he', 'sh',
        'vo', 'ts', 'dr', 'cu', 'fr', 'lo', 'bl', 'to', 'ac', 'sp',
        'ag', 're', 'ba', 'tp', 'so', 'ju', 'st', 'sr', 'ln', 'te',
        'ar', 'in', 'po', 'vi', 'pe', 'cm', 'td', 'ma', 'sa', 'sl',
        'tg', 'gd', 'gh', 'gk', 'gp', 'gr', 'tt', 'bs', 'wk', 'sk',
        'aw', 'dw', 'ir', 'pac', 'sho', 'pas', 'dri', 'def', 'phy',
    ))
)

Also, only 60 players are displayed per page. We can choose which 60 players we get using the `offset` query parameter in the URL.

For example, `offset=0` will fetch the details of the first 60 players (player 1 - player 60), `offset=60` will fetch details of players 61 - 120, and so on. So we will keep increasing the offset by 60 and fetching a new list of players until we've got details of all the players.

In [4]:
# This method gets the details of one page of players (60 per page,
# according to the notebook text) starting at the specified offset
def get_players_from_offset(offset=0, timeout=30):
    """Fetch one page of the sofifa.com player table and parse every row.

    Parameters
    ----------
    offset : int, optional
        Value appended to `BASE_URL` as the `offset` query parameter.
        Defaults to 0 (the first page).
    timeout : float or tuple, optional
        Passed straight to `requests.get`; seconds to wait for the server
        before giving up. Defaults to 30.

    Returns
    -------
    list of dict
        One dict per `<tr>` found in the first `<tbody>` of the page, each
        built by `extract_information`. An empty list is returned when the
        page contains no `<tbody>` at all.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status code.
    requests.RequestException
        For connection problems or when `timeout` is exceeded.
    """
    # Form the URL
    url = f'{BASE_URL}&offset={offset}'

    # Make the HTTP call to get the page. Without an explicit timeout
    # `requests` may wait indefinitely on a stalled connection.
    response = requests.get(url, timeout=timeout)

    # Fail loudly on an error status instead of trying to parse an error page
    response.raise_for_status()

    # Parse the HTML page using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Get the table body that contains all the player attributes
    table_body = soup.find('tbody')
    if table_body is None:
        # No table on the page: report "no players" rather than crashing
        # with an AttributeError on None.
        return []

    # Each row in the table is a different player; `extract_information`
    # turns a single row into a dict of attributes.
    return [extract_information(row) for row in table_body.find_all('tr')]
        
    
# Extract the information of a single player from an HTML table row
def extract_information(row):
    """Turn one player row of the table into a flat dict of strings.

    `row` must offer `findAll('td')` returning the row's cells. Each cell's
    `.text` is copied under a fixed key; a few cells are read differently
    (see the reader helpers below). Keys are inserted in the order listed
    in `layout`.
    """
    cells = row.findAll('td')

    def plain(cell):
        # Default: the cell text exactly as found
        return cell.text

    def stripped(cell):
        # Cell text with surrounding whitespace removed
        return cell.text.strip()

    def second_link(cell):
        # Text of the second <a> inside the cell
        return cell.findAll('a')[1].text

    def first_link(cell):
        # Text of the first <a> inside the cell
        return cell.find('a').text

    # Keys that are not read with `plain`
    readers = {
        'name': second_link,
        'team': first_link,
        'weak_foot': stripped,
        'skill_moves': stripped,
        'international_reputation': stripped,
    }

    # (output key, index of the <td> it is read from), in output order
    layout = (
        ('id', 6), ('name', 1), ('age', 2),
        ('height', 7), ('weight', 8), ('preferred_foot', 9),
        ('overall_rating', 3), ('potential', 4), ('team', 5),
        ('best_overall', 10), ('best_position', 11), ('growth', 12),
        ('joined', 13), ('loan_date_end', 14),
        ('value', 15), ('wage', 16), ('release_clause', 17),

        ('total_attacking', 18), ('crossing', 19), ('finishing', 20),
        ('heading_accuracy', 21), ('short_passing', 22), ('volleys', 23),

        ('total_skill', 24), ('dribbling', 25), ('curve', 26),
        ('fk_accuracy', 27), ('long_passing', 28), ('ball_control', 29),

        ('total_movement', 30), ('acceleration', 31), ('sprint_speed', 32),
        ('agility', 33), ('reactions', 34), ('balance', 35),

        ('total_power', 36), ('shot_power', 37), ('jumping', 38),
        ('stamina', 39), ('strength', 40), ('long_shots', 41),

        ('mentality', 42), ('aggression', 43), ('interception', 44),
        ('positioning', 45), ('vision', 46), ('penalties', 47),
        ('composure', 48),

        ('total_defending', 49), ('marking', 50),
        ('standing_tackle', 51), ('sliding_tackle', 52),

        ('total_goalkeeping', 53), ('gk_diving', 54), ('gk_handling', 55),
        ('gk_kicking', 56), ('gk_positioning', 57), ('gk_reflexes', 58),

        ('total_stats', 59), ('base_stats', 60),

        ('weak_foot', 61), ('skill_moves', 62),
        ('attacking_work_rate', 63), ('defensive_work_rate', 64),

        ('international_reputation', 65),

        ('PAC', 66), ('SHO', 67), ('PAS', 68),
        ('DRI', 69), ('DEF', 70), ('PHY', 71),
    )

    return {key: readers.get(key, plain)(cells[index]) for key, index in layout}

In [5]:
# Try the scraper with its default offset (0); the returned list of
# player dicts is displayed as the cell output.
get_players_from_offset()

[{'id': '53352',
  'name': 'D. Bulman',
  'age': '40',
  'height': '5\'9"',
  'weight': '165lbs',
  'preferred_foot': 'Right',
  'overall_rating': '61',
  'potential': '61',
  'team': 'Crawley Town',
  'best_overall': '65',
  'best_position': 'CDM',
  'growth': '0',
  'joined': 'Jul 1, 2017',
  'loan_date_end': 'N/A',
  'value': '€0',
  'wage': '€1K',
  'release_clause': '€0',
  'total_attacking': '232',
  'crossing': '45',
  'finishing': '33',
  'heading_accuracy': '52',
  'short_passing': '65',
  'volleys': '37',
  'total_skill': '274',
  'dribbling': '55',
  'curve': '44',
  'fk_accuracy': '48',
  'long_passing': '62',
  'ball_control': '65',
  'total_movement': '258',
  'acceleration': '44',
  'sprint_speed': '47',
  'agility': '38',
  'reactions': '59',
  'balance': '70',
  'total_power': '343',
  'shot_power': '60',
  'jumping': '76',
  'stamina': '81',
  'strength': '72',
  'long_shots': '54',
  'mentality': '277',
  'aggression': '77',
  'interception': '63',
  'positioning': '