In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

In [2]:
# Read the source CSV into a DataFrame.
loan_players_csv = 'Data/loan_players.csv'
df = pd.read_csv(loan_players_csv)

In [3]:
# Keep only the rows whose 'Position' is 'Forward' and renumber the index
# from 0, because later cells look players up by position (0..len(df)-1).
is_forward = df['Position'] == 'Forward'
df = df[is_forward].reset_index(drop=True)

In [4]:
# Convert df to a nested dictionary of the form
# {column name: {row index: value}} (pandas' default 'dict' orientation).
players_dict = df.to_dict(orient='dict')

In [5]:
# define headers for request: a browser-like User-Agent sent with every call
headers = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
    )
}

# define base url for requests: the quick-search endpoint, to which the
# player query is appended. Uses https to match the stats URL built later
# in this notebook instead of sending the search in plaintext.
base_url = 'https://www.transfermarkt.co.uk/schnellsuche/ergebnis/schnellsuche?query='

In [6]:
# Search for every player and collect the link to their profile page.
# profile_links must end up with exactly one entry per row of df, in row
# order: the next cell uses the position in this list as the row index when
# attaching stats, so a silently skipped player would shift every later
# player's stats onto the wrong row.
profile_links = []
unmatched_players = []

for i in range(len(df)):

    # extract player info from players dictionary
    name = players_dict['Name'][i]
    club = players_dict['Loan Club'][i]

    # replace space in name with '+'
    query = name.replace(' ', '+')

    # define url to search for player on transfermarkt
    url = f'{base_url}{query}'
    # timeout so a stalled connection cannot hang the notebook forever;
    # raise_for_status so an HTTP error is reported as such instead of
    # showing up later as "no match"
    r = requests.get(url=url, headers=headers, timeout=30)
    r.raise_for_status()

    soup = BeautifulSoup(r.content)

    # match the club name literally: characters such as '.', '(' or '+'
    # in a club name must not be interpreted as regex syntax
    club_pattern = re.compile(re.escape(str(club)))

    for name_match in soup.select('table > tbody > tr'):
        anchors = name_match.select('a')
        # rows with fewer than three links cannot hold both the profile
        # link (2nd anchor) and the club link (3rd anchor); skip them
        if len(anchors) < 3:
            continue
        if club_pattern.search(anchors[2].text):
            profile_links.append(anchors[1].get('href'))
            print(f'\tLoop {i+1}/{len(df)} --- Completed')
            break
    else:
        # loop finished without a break: no result row mentioned the club
        unmatched_players.append(name)
        print(f'\tLoop {i+1}/{len(df)} --- No match found')

# Fail loudly instead of letting misaligned links corrupt the output data.
if unmatched_players:
    raise LookupError(
        f'No profile link found for: {unmatched_players}. '
        'profile_links no longer lines up with the rows of df, so stats '
        'would be attributed to the wrong players.'
    )

	Loop 1/3 --- Completed
	Loop 2/3 --- Completed
	Loop 3/3 --- Completed


In [7]:
# initialize dicts to store player data when scraping; each maps the
# player's position in profile_links to the raw text of one stat

apps = {}
goals = {}
assists = {}
own_goals = {}
sub_on = {}
sub_off = {}
yellow = {}
second_yellow = {}
red = {}
penalties = {}
mins_per_goal = {}
mins = {}

# (cell position in the totals row, destination dict, is a minutes value?)
footer_stats = [
    (2, apps, False),
    (3, goals, False),
    (4, assists, False),
    (5, own_goals, False),
    (6, sub_on, False),
    (7, sub_off, False),
    (8, yellow, False),
    (9, second_yellow, False),
    (10, red, False),
    (11, penalties, False),
    (12, mins_per_goal, True),
    (13, mins, True),
]
cells_needed = max(position for position, _, _ in footer_stats) + 1

print('Extraction Started\n')

# loop through player links and extract detailed stats for each player

for i, link in enumerate(profile_links):

    # use player profile link to get url to detailed stats for season
    detailed_stats = link.replace('profil', 'leistungsdaten')

    # define url to search for detailed player stats on transfermarkt
    url = f'https://www.transfermarkt.co.uk{detailed_stats}/saison/2021/plus/1#gesamt'
    # timeout so a stalled connection cannot hang the notebook forever;
    # raise_for_status so an error page is not parsed as if it held stats
    r = requests.get(url=url, headers=headers, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content)

    # read the totals (footer) row once instead of re-running the selector
    # for every stat; strip whitespace so '-' placeholders compare cleanly
    totals = [td.text.strip() for td in soup.select('#yw1 > table > tfoot > tr > td')]

    if len(totals) < cells_needed:
        # no usable totals row on the page: record the '-' placeholder,
        # which the cleaning step turns into 0, rather than crashing with
        # an IndexError halfway through the scrape
        print(f'\tLoop {i+1}/{len(profile_links)} --- No totals row found, recording "-"')
        totals = ['-'] * cells_needed

    # store the stats for this player
    for position, target, is_minutes in footer_stats:
        value = totals[position]
        if is_minutes:
            # drop thousands separators ('.' or ',') and the trailing
            # minute mark so the value can be converted with int()
            value = value.replace('.', '').replace(',', '').replace("'", '')
        target[i] = value

    print(f'\tLoop {i+1}/{len(profile_links)} --- Completed')

print('\nExtraction Completed')

Extraction Started

	Loop 1/3 --- Completed
	Loop 2/3 --- Completed
	Loop 3/3 --- Completed

Extraction Completed


In [8]:
# every stat dictionary whose raw scraped values need converting
dicts_to_clean = [
    apps, goals, assists, own_goals,
    sub_on, sub_off,
    yellow, second_yellow, red,
    penalties, mins_per_goal, mins,
]


def _stat_to_int(raw):
    """Return 0 for the '-' placeholder, otherwise ``int(raw)``."""
    return 0 if raw == '-' else int(raw)


# clean the detailed stats values extracted previously, in place
# (only values of existing keys are reassigned, so iterating is safe)
for stat_dict in dicts_to_clean:
    for player_idx in stat_dict:
        stat_dict[player_idx] = _stat_to_int(stat_dict[player_idx])

In [9]:
# Attach each cleaned stat dictionary to players_dict as a new column.
# NOTE(review): 'Substitions On' is misspelled (compare 'Substitutions Off'),
# but the key becomes a column name in the output, so it is kept unchanged
# here; rename it only together with whatever reads that column.
players_dict.update({
    'Appearances': apps,
    'Goals': goals,
    'Assists': assists,
    'Own Goals': own_goals,
    'Substitions On': sub_on,
    'Substitutions Off': sub_off,
    'Yellow Cards': yellow,
    'Second Yellow Cards': second_yellow,
    'Red Cards': red,
    'Penalty Goals': penalties,
    'Minutes per Goal': mins_per_goal,
    'Minutes Played': mins,
})

In [10]:
# Build a DataFrame from the combined dictionary and write it to CSV
# (the row index is written as the first column, pandas' default).
loan_forwards = pd.DataFrame(players_dict)
loan_forwards.to_csv('Data/loan_forwards.csv')