# Monday, December 2nd, 2024

Example using `requests` and `re` to pull data from a webpage:

In [1]:
url = r'https://www.transfermarkt.us/everton-fc/kader/verein/29/saison_id/2024/plus/1'

In [2]:
import requests, re

In [3]:
headers = {'User-Agent':'Mozilla/5.0'}

In [4]:
page = requests.get(url,headers=headers)
text = page.text

In [5]:
#print(text)

It looks like the HTML code `<tr class="odd">` or `<tr class="even">` preceeds information about each player, so let's try splitting on this code to generate blocks of player information.

In [6]:
s = 'Twelve is an even number, 14 is an odd number. 15 is even.'


re.findall(r'an (?:odd|even)', s)

['an even', 'an odd']

In [7]:
player_info_pattern = r'<tr class="(?:odd|even)">'

player_info_blocks = re.split(player_info_pattern, text)[1:]

In [8]:
help(re.split)

Help on function split in module re:

split(pattern, string, maxsplit=0, flags=0)
    Split the source string by the occurrences of the pattern,
    returning a list containing the resulting substrings.  If
    capturing parentheses are used in pattern, then the text of all
    groups in the pattern are also returned as part of the resulting
    list.  If maxsplit is nonzero, at most maxsplit splits occur,
    and the remainder of the string is returned as the final element
    of the list.



In [135]:
player_info_block = player_info_blocks[10]

print(player_info_block)


<td class="zentriert rueckennummer bg_Abwehr" title="Defender"><div class=rn_nummer>23</div></td><td class="posrela">
<table class="inline-table">
    <tr>
        <td rowspan="2">
                    </td>
        <td class="hauptlink">
            <a href="/seamus-coleman/profil/spieler/68390">
                Séamus Coleman<span title="Team captain" class="kapitaenicon-table icons_sprite">&nbsp;</span>            </a>
        </td>
    </tr>
    <tr>
        <td>
            Right-Back        </td>
    </tr>
</table>
</td><td class="zentriert">Oct 11, 1988 (36)</td><td class="zentriert"><img src="https://tmssl.akamaized.net//images/flagge/verysmall/72.png?lm=1520611569" title="Ireland" alt="Ireland" class="flaggenrahm

Player number information looks to be of the form: `<div class=rn_nummer>(SOME NUMBER HER)</div>`

In [137]:
player_number_pattern = r'<div class=rn_nummer>(\d+)</div>'
number = re.findall(player_number_pattern, player_info_block)[0]

print(number)

23


The structure for finding names is:
 - `<td class="hauptlink">`
 - `<a href=...>`
 - THE NAME
 - `</a>`

In [138]:
player_name_pattern = r'<td class="hauptlink">\s*<a href=.+>\s*(.+?)<'

player_name = re.findall(player_name_pattern, player_info_block)[0].strip()

print(player_name)

Séamus Coleman


Player position information looks to be of the form `<tr>(WHITE SPACE)<td>(WHITE SPACE)POSITION(WHITE SPACE)</td>(WHITE SPACE)</tr>`

In [136]:
player_position_pattern = r'<tr>\s*<td>\s*(.+)\s*</td>\s*</tr>'

position = re.findall(player_position_pattern, player_info_block)[0].strip()
print(position)

Right-Back


It looks like each piece of data is separated by the HTML `<td class="zentriert">`, so let's split on this string.

In [139]:
for piece in player_info_block.split(r'<td class="zentriert">')[1:]:
    print(piece)

Oct 11, 1988 (36)</td>
<img src="https://tmssl.akamaized.net//images/flagge/verysmall/72.png?lm=1520611569" title="Ireland" alt="Ireland" class="flaggenrahmen" /></td>
1,77m</td>
right</td>
Jan 1, 2009</td>
<a title="Sligo Rovers: Ablöse €70k" href="/sligo-rovers/startseite/verein/8780/saison_id/2008"><img src="https://tmssl.akamaized.net//images/wappen/verysmall/8780.png?lm=1498249702" title="Sligo Rovers" alt="Sligo Rovers" class="" /></a></td>
Jun 30, 2025</td><td class="rechts hauptlink"><a href="/seamus-coleman/marktwertverlauf/spieler/68390">€500k</a></td></tr>



The ordering of these pieces are:
 - Age
 - Nationality
 - Height 
 - Preferred foot
 - When they signed
 - Where they signed from and for how much
 - Contract expiration and market value

In [140]:
(age_info, 
nationality_info, 
height_info, 
foot_info, 
sign_date_info, 
sign_from_info, 
contract_info) = player_info_block.split(r'<td class="zentriert">')[1:]

In [141]:
print(age_info)

Oct 11, 1988 (36)</td>


In [142]:
birthdate_pattern = r'\w{3} \d+, \d{4}'
age_pattern = r'\((\d+)\)'

birthdate = re.findall(birthdate_pattern,age_info)[0]
age = re.findall(age_pattern, age_info)[0]

print(birthdate)
print(age)

Oct 11, 1988
36


In [143]:
print(nationality_info)

<img src="https://tmssl.akamaized.net//images/flagge/verysmall/72.png?lm=1520611569" title="Ireland" alt="Ireland" class="flaggenrahmen" /></td>


In [144]:
nationality_pattern = r'title="(\w+)"'

nationality = re.findall(nationality_pattern, nationality_info)[0]
print(nationality)

Ireland


In [145]:
print(height_info)

1,77m</td>


In [146]:
height_pattern = r'\d+,\d+m'

height = re.findall(height_pattern, height_info)[0]
print(height)

1,77m


In [147]:
print(foot_info)

right</td>


In [148]:
foot_pattern = r'(\w+)<'

foot = re.findall(foot_pattern, foot_info)[0]
print(foot)

right


In [149]:
print(sign_date_info)

Jan 1, 2009</td>


In [150]:
sign_date_pattern = r'\w{3} \d+, \d{4}'

sign_date = re.findall(sign_date_pattern, sign_date_info)[0]
print(sign_date)

Jan 1, 2009


In [151]:
print(sign_from_info)

<a title="Sligo Rovers: Ablöse €70k" href="/sligo-rovers/startseite/verein/8780/saison_id/2008"><img src="https://tmssl.akamaized.net//images/wappen/verysmall/8780.png?lm=1498249702" title="Sligo Rovers" alt="Sligo Rovers" class="" /></a></td>


In [152]:
sign_from_pattern = r'<a title="(.+): Ablöse (.+)" href'

sign_from, sign_fee = re.findall(sign_from_pattern, sign_from_info)[0]
print(sign_from)
print(sign_fee)

Sligo Rovers
€70k


In [153]:
print(contract_info)

Jun 30, 2025</td><td class="rechts hauptlink"><a href="/seamus-coleman/marktwertverlauf/spieler/68390">€500k</a></td></tr>



In [154]:
contract_end_pattern = r'\w{3} \d+, \d{4}'

contract_end = re.findall(contract_end_pattern, contract_info)[0]
print(contract_end)

market_value_pattern = r'(€.+?)<'

market_value = re.findall(market_value_pattern, contract_info)[0]
print(market_value)

Jun 30, 2025
€500k


Let's assemble all of these patterns and searches into a function.

In [8]:
player_name_pattern = r'<td class="hauptlink">\s*<a href=.+>\s*(.+?)<'
player_number_pattern = r'<div class=rn_nummer>(\d+)</div>'
player_position_pattern = r'<tr>\s*<td>\s*(.+)\s*</td>\s*</tr>'

birthdate_pattern = r'\w{3} \d+, \d{4}'
age_pattern = r'\((\d+)\)'
nationality_pattern = r'title="(\w+)"'
height_pattern = r'\d+,\d+m'
foot_pattern = r'(\w+)<'
sign_date_pattern = r'\w{3} \d+, \d{4}'
sign_from_pattern = r'<a title="(.+): Ablöse (.+)" href'
contract_end_pattern = r'\w{3} \d+, \d{4}'
market_value_pattern = r'(€.+?)<'

def get_player_info(player_info_block):
    (age_info, 
    nationality_info, 
    height_info, 
    foot_info, 
    sign_date_info, 
    sign_from_info, 
    contract_info) = player_info_block.split(r'<td class="zentriert">')[1:]

    name = re.findall(player_name_pattern, player_info_block)[0].strip()
    number = re.findall(player_number_pattern, player_info_block)[0]
    position = re.findall(player_position_pattern, player_info_block)[0].strip()
    
    birthdate = re.findall(birthdate_pattern,age_info)[0]
    age = re.findall(age_pattern, age_info)[0]
    
    nationality = re.findall(nationality_pattern, nationality_info)[0]
    
    height = re.findall(height_pattern, height_info)[0]
    
    foot = re.findall(foot_pattern, foot_info)[0]
    
    sign_date = re.findall(sign_date_pattern, sign_date_info)[0]
    
    sign_from, sign_fee = re.findall(sign_from_pattern, sign_from_info)[0]
    
    contract_end = re.findall(contract_end_pattern, contract_info)[0]
    market_value = re.findall(market_value_pattern, contract_info)[0]
    
    return (name, number, position, birthdate, age, nationality, height, foot,
            sign_date, sign_from, sign_fee, contract_end, market_value)

In [9]:
get_player_info(player_info_blocks[0])

('Jordan Pickford',
 '1',
 'Goalkeeper',
 'Mar 7, 1994',
 '30',
 'England',
 '1,85m',
 'left',
 'Jul 1, 2017',
 'Sunderland AFC',
 '€28.50m',
 'Jun 30, 2027',
 '€22.00m')

In [10]:
for player_info_block in player_info_blocks:
    print(get_player_info(player_info_block))

('Jordan Pickford', '1', 'Goalkeeper', 'Mar 7, 1994', '30', 'England', '1,85m', 'left', 'Jul 1, 2017', 'Sunderland AFC', '€28.50m', 'Jun 30, 2027', '€22.00m')
('João Virgínia', '12', 'Goalkeeper', 'Oct 10, 1999', '25', 'Portugal', '1,92m', 'left', 'Jul 1, 2020', 'Everton FC U23', '-', 'Jun 30, 2025', '€800k')
('Asmir Begovic', '31', 'Goalkeeper', 'Jun 20, 1987', '37', 'Canada', '1,99m', 'right', 'Aug 23, 2024', 'Queens Park Rangers', 'free transfer', 'Jun 30, 2025', '€300k')
('Jarrad Branthwaite', '32', 'Centre-Back', 'Jun 27, 2002', '22', 'England', '1,95m', 'left', 'Jan 13, 2020', 'Carlisle United', '€1.10m', 'Jun 30, 2027', '€42.00m')
("Jake O'Brien", '15', 'Centre-Back', 'May 15, 2001', '23', 'Ireland', '1,97m', 'right', 'Jul 30, 2024', 'Olympique Lyon', '€19.50m', 'Jun 30, 2028', '€15.00m')
('James Tarkowski', '6', 'Centre-Back', 'Nov 19, 1992', '32', 'England', '1,85m', 'right', 'Jul 2, 2022', 'Burnley FC', 'free transfer', 'Jun 30, 2026', '€13.00m')
('Michael Keane', '5', 'Centr

In [11]:
names = []
numbers = []
positions = []
birthdates = []
ages = []
nationalities = []
heights = []
foots = []
sign_dates = []
sign_froms = []
sign_fees = []
contract_ends = []
market_values = []

for player_info_block in player_info_blocks:
    (name, number, position, birthdate, age, nationality, height, foot,
    sign_date, sign_from, sign_fee, contract_end, market_value) = get_player_info(player_info_block)
    
    names.append(name)
    numbers.append(number)
    positions.append(position)
    birthdates.append(birthdate)
    ages.append(age)
    nationalities.append(nationality)
    heights.append(height)
    foots.append(foot)
    sign_dates.append(sign_date)
    sign_froms.append(sign_from)
    sign_fees.append(sign_fee)
    contract_ends.append(contract_end)
    market_values.append(market_value)

In [12]:
import pandas as pd

In [13]:
player_df = pd.DataFrame({'Name':names,
                          'Number':numbers,
                          'Position':positions,
                          'Birthdate':birthdates,
                          'Age':ages,
                          'Nationality':nationalities,
                          'Height':heights,
                          'Preferred foot':foots,
                          'Date signed':sign_dates,
                          'Transfer fee':sign_fees,
                          'Contract expiration':contract_end,
                          'Market value':market_values})

In [14]:
player_df

Unnamed: 0,Name,Number,Position,Birthdate,Age,Nationality,Height,Preferred foot,Date signed,Transfer fee,Contract expiration,Market value
0,Jordan Pickford,1,Goalkeeper,"Mar 7, 1994",30,England,"1,85m",left,"Jul 1, 2017",€28.50m,"Jun 30, 2027",€22.00m
1,João Virgínia,12,Goalkeeper,"Oct 10, 1999",25,Portugal,"1,92m",left,"Jul 1, 2020",-,"Jun 30, 2027",€800k
2,Asmir Begovic,31,Goalkeeper,"Jun 20, 1987",37,Canada,"1,99m",right,"Aug 23, 2024",free transfer,"Jun 30, 2027",€300k
3,Jarrad Branthwaite,32,Centre-Back,"Jun 27, 2002",22,England,"1,95m",left,"Jan 13, 2020",€1.10m,"Jun 30, 2027",€42.00m
4,Jake O'Brien,15,Centre-Back,"May 15, 2001",23,Ireland,"1,97m",right,"Jul 30, 2024",€19.50m,"Jun 30, 2027",€15.00m
5,James Tarkowski,6,Centre-Back,"Nov 19, 1992",32,England,"1,85m",right,"Jul 2, 2022",free transfer,"Jun 30, 2027",€13.00m
6,Michael Keane,5,Centre-Back,"Jan 11, 1993",31,England,"1,88m",right,"Jul 3, 2017",€28.50m,"Jun 30, 2027",€6.00m
7,Vitaliy Mykolenko,19,Left-Back,"May 29, 1999",25,Ukraine,"1,80m",left,"Jan 1, 2022",€23.50m,"Jun 30, 2027",€28.00m
8,Ashley Young,18,Left-Back,"Jul 9, 1985",39,England,"1,75m",both,"Jul 13, 2023",free transfer,"Jun 30, 2027",€500k
9,Nathan Patterson,2,Right-Back,"Oct 16, 2001",23,Scotland,"1,89m",right,"Jan 4, 2022",€14.00m,"Jun 30, 2027",€15.00m


In [15]:
player_df.loc[player_df['Age'].astype(int) > 30]

Unnamed: 0,Name,Number,Position,Birthdate,Age,Nationality,Height,Preferred foot,Date signed,Transfer fee,Contract expiration,Market value
2,Asmir Begovic,31,Goalkeeper,"Jun 20, 1987",37,Canada,"1,99m",right,"Aug 23, 2024",free transfer,"Jun 30, 2027",€300k
5,James Tarkowski,6,Centre-Back,"Nov 19, 1992",32,England,"1,85m",right,"Jul 2, 2022",free transfer,"Jun 30, 2027",€13.00m
6,Michael Keane,5,Centre-Back,"Jan 11, 1993",31,England,"1,88m",right,"Jul 3, 2017",€28.50m,"Jun 30, 2027",€6.00m
8,Ashley Young,18,Left-Back,"Jul 9, 1985",39,England,"1,75m",both,"Jul 13, 2023",free transfer,"Jun 30, 2027",€500k
10,Séamus Coleman,23,Right-Back,"Oct 11, 1988",36,Ireland,"1,77m",right,"Jan 1, 2009",€70k,"Jun 30, 2027",€500k
13,Abdoulaye Doucouré,16,Central Midfield,"Jan 1, 1993",31,Mali,"1,83m",right,"Sep 8, 2020",€22.00m,"Jun 30, 2027",€10.00m
15,Idrissa Gueye,27,Central Midfield,"Sep 26, 1989",35,Senegal,"1,74m",right,"Sep 1, 2022",€4.00m,"Jun 30, 2027",€2.00m


In [16]:
player_df.to_csv('Everton.csv')

# Wednesday, December 4th, 2024

## Summary of results from Monday:

In [1]:
import requests, re

In [2]:
url = r'https://www.transfermarkt.us/everton-fc/kader/verein/29/saison_id/2024/plus/1'

headers = {'User-Agent':'Mozilla/5.0'}
page = requests.get(url,headers=headers)
text = page.text

In [3]:
player_info_pattern = r'<tr class="(?:odd|even)">'
player_info_blocks = re.split(player_info_pattern, text)[1:]

In [4]:
player_name_pattern = r'<td class="hauptlink">\s*<a href=.+>\s*(.+?)<'
player_number_pattern = r'<div class=rn_nummer>(\d+)</div>'
player_position_pattern = r'<tr>\s*<td>\s*(.+)\s*</td>\s*</tr>'

birthdate_pattern = r'\w{3} \d+, \d{4}'
age_pattern = r'\((\d+)\)'
nationality_pattern = r'title="(\w+)"'
height_pattern = r'\d+,\d+m'
foot_pattern = r'(\w+)<'
sign_date_pattern = r'\w{3} \d+, \d{4}'
sign_from_pattern = r'<a title="(.+): Ablöse (.+)" href'
contract_end_pattern = r'\w{3} \d+, \d{4}'
market_value_pattern = r'(€.+?)<'

def get_player_info(player_info_block):
    (age_info, 
    nationality_info, 
    height_info, 
    foot_info, 
    sign_date_info, 
    sign_from_info, 
    contract_info) = player_info_block.split(r'<td class="zentriert">')[1:]

    name = re.findall(player_name_pattern, player_info_block)[0].strip()
    number = re.findall(player_number_pattern, player_info_block)[0]
    position = re.findall(player_position_pattern, player_info_block)[0].strip()
    
    birthdate = re.findall(birthdate_pattern,age_info)[0]
    age = re.findall(age_pattern, age_info)[0]
    
    nationality = re.findall(nationality_pattern, nationality_info)[0]
    
    height = re.findall(height_pattern, height_info)[0]
    
    foot = re.findall(foot_pattern, foot_info)[0]
    
    sign_date = re.findall(sign_date_pattern, sign_date_info)[0]
    
    sign_from, sign_fee = re.findall(sign_from_pattern, sign_from_info)[0]
    
    contract_end = re.findall(contract_end_pattern, contract_info)[0]
    market_value = re.findall(market_value_pattern, contract_info)[0]
    
    return (name, number, position, birthdate, age, nationality, height, foot,
            sign_date, sign_from, sign_fee, contract_end, market_value)

In [5]:
names = []
numbers = []
positions = []
birthdates = []
ages = []
nationalities = []
heights = []
foots = []
sign_dates = []
sign_froms = []
sign_fees = []
contract_ends = []
market_values = []

for player_info_block in player_info_blocks:
    (name, number, position, birthdate, age, nationality, height, foot,
    sign_date, sign_from, sign_fee, contract_end, market_value) = get_player_info(player_info_block)
    
    names.append(name)
    numbers.append(number)
    positions.append(position)
    birthdates.append(birthdate)
    ages.append(age)
    nationalities.append(nationality)
    heights.append(height)
    foots.append(foot)
    sign_dates.append(sign_date)
    sign_froms.append(sign_from)
    sign_fees.append(sign_fee)
    contract_ends.append(contract_end)
    market_values.append(market_value)

## Working with Pandas:

In [7]:
import pandas as pd

In [8]:
player_df = pd.DataFrame({'Name':names,
                          'Number':numbers,
                          'Position':positions,
                          'Birthdate':birthdates,
                          'Age':ages,
                          'Nationality':nationalities,
                          'Height':heights,
                          'Preferred foot':foots,
                          'Date signed':sign_dates,
                          'Transfer fee':sign_fees,
                          'Contract expiration':contract_end,
                          'Market value':market_values})

In [9]:
player_df

Unnamed: 0,Name,Number,Position,Birthdate,Age,Nationality,Height,Preferred foot,Date signed,Transfer fee,Contract expiration,Market value
0,Jordan Pickford,1,Goalkeeper,"Mar 7, 1994",30,England,"1,85m",left,"Jul 1, 2017",€28.50m,"Jun 30, 2027",€22.00m
1,João Virgínia,12,Goalkeeper,"Oct 10, 1999",25,Portugal,"1,92m",left,"Jul 1, 2020",-,"Jun 30, 2027",€800k
2,Asmir Begovic,31,Goalkeeper,"Jun 20, 1987",37,Canada,"1,99m",right,"Aug 23, 2024",free transfer,"Jun 30, 2027",€300k
3,Jarrad Branthwaite,32,Centre-Back,"Jun 27, 2002",22,England,"1,95m",left,"Jan 13, 2020",€1.10m,"Jun 30, 2027",€42.00m
4,Jake O'Brien,15,Centre-Back,"May 15, 2001",23,Ireland,"1,97m",right,"Jul 30, 2024",€19.50m,"Jun 30, 2027",€15.00m
5,James Tarkowski,6,Centre-Back,"Nov 19, 1992",32,England,"1,85m",right,"Jul 2, 2022",free transfer,"Jun 30, 2027",€13.00m
6,Michael Keane,5,Centre-Back,"Jan 11, 1993",31,England,"1,88m",right,"Jul 3, 2017",€28.50m,"Jun 30, 2027",€6.00m
7,Vitaliy Mykolenko,19,Left-Back,"May 29, 1999",25,Ukraine,"1,80m",left,"Jan 1, 2022",€23.50m,"Jun 30, 2027",€28.00m
8,Ashley Young,18,Left-Back,"Jul 9, 1985",39,England,"1,75m",both,"Jul 13, 2023",free transfer,"Jun 30, 2027",€500k
9,Nathan Patterson,2,Right-Back,"Oct 16, 2001",23,Scotland,"1,89m",right,"Jan 4, 2022",€14.00m,"Jun 30, 2027",€15.00m
