In [1]:
from datetime import datetime
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Chelsea FC page that lists the club's loaned-out players; this is the
# starting point for the scrape.
url = "https://www.chelseafc.com/en/teams/on-loan-players?pageTab=players"

In [3]:
# Fetch the loanee listing page and check the response.
# The timeout keeps the cell from hanging forever on a stalled connection,
# and raise_for_status() turns an HTTP error response into an exception here
# rather than letting later cells parse an error page.
r = requests.get(url, timeout=30)
r.raise_for_status()
print(r)

<Response [200]>


In [4]:
# Create the soup object from the response body.
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser library happens to be installed, so the parsed tree can differ
# between machines. 'html.parser' ships with Python and needs no extra package.
soup = BeautifulSoup(r.content, 'html.parser')

In [5]:
# Collect the href of every anchor that is a direct child of a
# <cfc-player-tile> element (itself a direct child of a div) on the loanee
# homepage; these are the links to the individual player pages.
player_links = []
for anchor in soup.select(' div > cfc-player-tile > a'):
    player_links.append(anchor.get('href'))

In [6]:
# Parallel lists filled while scraping: index k of every list describes the
# same player.

name = []
loan_club = []
dob = []
birthplace = []
position = []

# Loop through the player links and extract basic info for each player.
# i counts the players actually scraped; skipped links are not counted.

i = 0
print('Extraction Started:\n')

for player_link in player_links:

    # only want to consider male players: skip any link whose fourth
    # '/'-separated segment contains 'women'
    if 'women' in player_link.split('/')[3]:
        continue

    i += 1

    # add player extensions to chelsea website
    link = f'https://www.chelseafc.com/{player_link}'

    # Fetch the player page. The timeout prevents one stalled request from
    # hanging the whole extraction, and raise_for_status() stops on an HTTP
    # error instead of scraping an error page.
    response = requests.get(link, timeout=30)
    response.raise_for_status()

    # Explicit parser so the parsed tree does not depend on which optional
    # parser libraries are installed.
    sub_soup = BeautifulSoup(response.content, 'html.parser')

    # Read every field before appending anything, so a page with a missing
    # element cannot leave the lists with different lengths.
    player_name = sub_soup.select('div > h1')[0].text
    player_loan_club = sub_soup.select('div > span')[0].text
    player_dob = sub_soup.select('cfc-property-list > div > ul > li:nth-child(1) > span')[0].text
    player_birthplace = sub_soup.select('cfc-property-list > div > ul > li:nth-child(2) > span')[0].text
    player_position = sub_soup.select('cfc-property-list > div > ul > li:nth-child(3) > span')[0].text

    name.append(player_name)
    loan_club.append(player_loan_club)
    dob.append(player_dob)
    birthplace.append(player_birthplace)
    position.append(player_position)

    # Print out when every 5 players are completed. The check runs only after
    # a player was scraped, so a skipped link can no longer repeat the last
    # milestone or print "0 Loops Completed...".
    if i % 5 == 0:
        print(f'\t{i} Loops Completed...')

print("\nExtraction Completed.")

Extraction Started:

	5 Loops Completed...
	10 Loops Completed...
	15 Loops Completed...
	20 Loops Completed...

Extraction Completed.


In [7]:
# Assemble the scraped lists into one table. Each (header, values) pair
# becomes a column, in the order listed.
players_df = pd.DataFrame(
    dict(
        [
            ("Name", name),
            ("Loan Club", loan_club),
            ("Date of Birth", dob),
            ("Birthplace", birthplace),
            ("Position", position),
        ]
    )
)

In [8]:
# Convert date of birth to datetime. pd.to_datetime is used because casting
# to the unit-less 'datetime64' dtype is rejected by pandas 2.x.
# NOTE(review): this relies on pandas' default date parsing; if the site
# publishes day-first dates, pass dayfirst=True (or an explicit format).
players_df['Date of Birth'] = pd.to_datetime(players_df['Date of Birth'])

# get todays date
today = datetime.today()

# Calculate age in completed years and add it to the data frame.
# The age is the difference in calendar years, minus one for players whose
# birthday has not yet occurred this year. This avoids the 'timedelta64[Y]'
# cast (unsupported by pandas 2.x) and the off-by-one that a fixed
# days-per-year division can produce close to a birthday.
birth_dates = players_df['Date of Birth']
birthday_pending = (birth_dates.dt.month > today.month) | (
    (birth_dates.dt.month == today.month) & (birth_dates.dt.day > today.day)
)
players_df['Age'] = (
    today.year - birth_dates.dt.year - birthday_pending.astype('int')
).astype('int')

In [9]:
# Replace two loan-club names with their longer forms.
# The result is assigned back to the column. Calling replace(inplace=True) on
# a column selected from the frame is chained assignment: recent pandas
# versions warn about it, and with Copy-on-Write enabled it leaves players_df
# unchanged.
players_df['Loan Club'] = players_df['Loan Club'].replace(
    {
        'MK Dons': 'Milton Keynes Dons',
        'Alaves': 'Deportivo Alavés',
    }
)

In [10]:
# Write the table to CSV without the index column. The output directory is
# created first because to_csv raises an error when the target directory does
# not exist (e.g. on a fresh checkout).
output_path = Path('Data/loan_players.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
players_df.to_csv(output_path, index=False)