# Monday, December 2nd, 2024

Example using `requests` and `re` to pull data from a webpage:

In [1]:
# Transfermarkt page for Everton FC (the "kader" listing, saison_id 2024).
url = 'https://www.transfermarkt.us/everton-fc/kader/verein/29/saison_id/2024/plus/1'

In [2]:
import requests, re

In [5]:
# Send a browser-like User-Agent with the request.
# NOTE(review): presumably the site rejects the default requests User-Agent — confirm.
headers = {"User-Agent": "Mozilla/5.0"}

In [11]:
# A timeout keeps the cell from hanging indefinitely if the server never responds.
page = requests.get(url, headers=headers, timeout=30)
# Fail loudly on an HTTP error status instead of silently parsing an error page.
page.raise_for_status()
text = page.text

In [38]:
# print(text)  # uncomment to inspect the raw HTML (the output is very long)

It looks like the HTML code `<tr class="odd">` or `<tr class="even">` precedes the information about each player, so let's try splitting on this code to generate blocks of player information.

In [24]:
# Toy example: (?:odd|even) is a non-capturing group, so findall returns the
# whole match (e.g. "an even") rather than only the alternative inside the group.
s = 'Twelve is an even number, 14 is an odd number. 15 is even.'

odd_even_regex = re.compile(r'an (?:odd|even)')
odd_even_regex.findall(s)

['an even', 'an odd']

In [30]:
# Every player row opens with <tr class="odd"> or <tr class="even">.
player_info_pattern = r'<tr class="(?:odd|even)">'

# The first chunk is whatever comes before the first player row, so drop it.
row_regex = re.compile(player_info_pattern)
player_info_blocks = row_regex.split(text)[1:]

In [16]:
# Check how re.split treats capturing groups: captured text would be added to
# the result list, which is why the row pattern uses a non-capturing (?:...).
help(re.split)

Help on function split in module re:

split(pattern, string, maxsplit=0, flags=0)
    Split the source string by the occurrences of the pattern,
    returning a list containing the resulting substrings.  If
    capturing parentheses are used in pattern, then the text of all
    groups in the pattern are also returned as part of the resulting
    list.  If maxsplit is nonzero, at most maxsplit splits occur,
    and the remainder of the string is returned as the final element
    of the list.



In [57]:
# Pick a single block to explore by hand (index 10 is Séamus Coleman's row).
player_info_block = player_info_blocks[10]

print(player_info_block)


<td class="zentriert rueckennummer bg_Abwehr" title="Defender"><div class=rn_nummer>23</div></td><td class="posrela">
<table class="inline-table">
    <tr>
        <td rowspan="2">
                    </td>
        <td class="hauptlink">
            <a href="/seamus-coleman/profil/spieler/68390">
                Séamus Coleman<span title="Team captain" class="kapitaenicon-table icons_sprite">&nbsp;</span>            </a>
        </td>
    </tr>
    <tr>
        <td>
            Right-Back        </td>
    </tr>
</table>
</td><td class="zentriert">Oct 11, 1988 (36)</td><td class="zentriert"><img src="https://tmssl.akamaized.net//images/flagge/verysmall/72.png?lm=1520611569" title="Ireland" alt="Ireland" class="flaggenrahm

Player number information looks to be of the form: `<div class=rn_nummer>(SOME NUMBER HERE)</div>`

In [48]:
# The squad number is the run of digits inside <div class=rn_nummer>...</div>.
player_number_pattern = r'<div class=rn_nummer>(\d+)</div>'
# findall returns the captured digits for every match; keep the first one.
number = re.findall(player_number_pattern, player_info_block)[0]

print(number)

23


The structure for finding names is:
 - `<td class="hauptlink">`
 - `<a href=...>`
 - THE NAME
 - `</a>`

In [53]:
# The name is the first text after the opening <a href=...> tag that follows
# <td class="hauptlink">. The tag's attributes are matched with [^>]+ (not a
# greedy .+) so the match can never run past the end of the opening tag and
# pick up text from a later element on the same line.
player_name_pattern = r'<td class="hauptlink">\s*<a href=[^>]+>\s*(.+?)<'

# strip() removes the padding whitespace that surrounds the name in the HTML.
player_name = re.findall(player_name_pattern, player_info_block)[0].strip()

print(player_name)

Séamus Coleman


It looks like each piece of data is separated by the HTML `<td class="zentriert">`, so let's split on this string.

In [58]:
# Everything before the first delimiter (number and name markup) is skipped.
zentriert_pieces = player_info_block.split('<td class="zentriert">')[1:]
for piece in zentriert_pieces:
    print(piece)

Oct 11, 1988 (36)</td>
<img src="https://tmssl.akamaized.net//images/flagge/verysmall/72.png?lm=1520611569" title="Ireland" alt="Ireland" class="flaggenrahmen" /></td>
1,77m</td>
right</td>
Jan 1, 2009</td>
<a title="Sligo Rovers: Ablöse €70k" href="/sligo-rovers/startseite/verein/8780/saison_id/2008"><img src="https://tmssl.akamaized.net//images/wappen/verysmall/8780.png?lm=1498249702" title="Sligo Rovers" alt="Sligo Rovers" class="" /></a></td>
Jun 30, 2025</td><td class="rechts hauptlink"><a href="/seamus-coleman/marktwertverlauf/spieler/68390">€500k</a></td></tr>



The ordering of these pieces is:
 - Birthdate and age
 - Nationality
 - Height 
 - Preferred foot
 - When they signed
 - Where they signed from and for how much
 - Contract expiration and market value

In [61]:
# The split yields seven pieces for this block; give each one a name.
info_pieces = player_info_block.split('<td class="zentriert">')[1:]
(age_info, nationality_info, height_info, foot_info,
 sign_date_info, sign_from_info, contract_info) = info_pieces




In [69]:
# Birthdate: three word characters, a day, a comma and a four-digit year
# (e.g. "Oct 11, 1988"). Age: the digits inside the parentheses.
birthdate_pattern = r'\w{3} \d+, \d{4}'
age_pattern = r'\((\d+)\)'

birthdate_matches = re.findall(birthdate_pattern, age_info)
birthdate = birthdate_matches[0]
age_matches = re.findall(age_pattern, age_info)
age = age_matches[0]

print(birthdate, age, sep='\n')

Oct 11, 1988
36


In [65]:
# Raw piece that the birthdate and age patterns are matched against.
age_info

'Oct 11, 1988 (36)</td>'

In [70]:
# Inspect the raw nationality piece to see where the country name appears.
print(nationality_info)

<img src="https://tmssl.akamaized.net//images/flagge/verysmall/72.png?lm=1520611569" title="Ireland" alt="Ireland" class="flaggenrahmen" /></td>


In [73]:
# Country names can contain hyphens or spaces (e.g. "Bosnia-Herzegovina"),
# which \w+ cannot match, so capture everything up to the closing quote.
nationality_pattern = r'title="([^"]+)"'

# Keep the first title attribute found in the nationality piece.
nationality = re.findall(nationality_pattern, nationality_info)[0]
print(nationality)

Ireland


In [75]:
# Regular expressions used by get_player_info; each captures a single field.

# The name is the first text after the opening <a href=...> tag that follows
# <td class="hauptlink">. The tag's attributes are matched with [^>]+ (not a
# greedy .+) so the match can never run past the end of the opening tag.
player_name_pattern = r'<td class="hauptlink">\s*<a href=[^>]+>\s*(.+?)<'
player_number_pattern = r'<div class=rn_nummer>(\d+)</div>'

birthdate_pattern = r'\w{3} \d+, \d{4}'
age_pattern = r'\((\d+)\)'
# Country names can contain hyphens or spaces (e.g. "Bosnia-Herzegovina"),
# which \w+ cannot match, so capture everything up to the closing quote.
nationality_pattern = r'title="([^"]+)"'


def get_player_info(player_info_block):
    """Extract the basic details of one player from their block of HTML.

    Parameters
    ----------
    player_info_block : str
        HTML for a single player row, i.e. one element of the list obtained
        by splitting the page text on the row pattern.

    Returns
    -------
    tuple of str
        ``(name, number, birthdate, age, nationality)``. Every value is the
        text captured from the HTML, so the number and age are strings.
        The nationality is the first ``title`` attribute found in the
        nationality cell.

    Raises
    ------
    ValueError
        If the block does not split into exactly seven
        ``<td class="zentriert">`` cells.
    IndexError
        If one of the field patterns finds no match.
    """
    expected_cells = 7
    pieces = player_info_block.split('<td class="zentriert">')[1:]
    if len(pieces) != expected_cells:
        raise ValueError(
            f'expected {expected_cells} <td class="zentriert"> cells, '
            f'found {len(pieces)}'
        )

    # Only the first two cells are parsed here; the remaining five (height,
    # foot, signing date, previous club, contract) are currently unused.
    age_info, nationality_info = pieces[:2]

    # Name and number come before the first delimiter, so search the whole block.
    name = re.findall(player_name_pattern, player_info_block)[0].strip()
    number = re.findall(player_number_pattern, player_info_block)[0]

    birthdate = re.findall(birthdate_pattern, age_info)[0]
    age = re.findall(age_pattern, age_info)[0]

    nationality = re.findall(nationality_pattern, nationality_info)[0]

    return name, number, birthdate, age, nationality

In [78]:
# Spot-check the function on a different block (index 2).
get_player_info(player_info_blocks[2])

('Asmir Begovic', '31', 'Jun 20, 1987', '37', 'Canada')

In [79]:
# Use a distinct loop variable so the `player_info_block` explored by hand in
# the earlier cells is not overwritten with the last row of the table.
for block in player_info_blocks:
    print(get_player_info(block))

('Jordan Pickford', '1', 'Mar 7, 1994', '30', 'England')
('João Virgínia', '12', 'Oct 10, 1999', '25', 'Portugal')
('Asmir Begovic', '31', 'Jun 20, 1987', '37', 'Canada')
('Jarrad Branthwaite', '32', 'Jun 27, 2002', '22', 'England')
("Jake O'Brien", '15', 'May 15, 2001', '23', 'Ireland')
('James Tarkowski', '6', 'Nov 19, 1992', '32', 'England')
('Michael Keane', '5', 'Jan 11, 1993', '31', 'England')
('Vitaliy Mykolenko', '19', 'May 29, 1999', '25', 'Ukraine')
('Ashley Young', '18', 'Jul 9, 1985', '39', 'England')
('Nathan Patterson', '2', 'Oct 16, 2001', '23', 'Scotland')
('Séamus Coleman', '23', 'Oct 11, 1988', '36', 'Ireland')
('James Garner', '37', 'Mar 13, 2001', '23', 'England')
('Orel Mangala', '8', 'Mar 18, 1998', '26', 'Belgium')
('Abdoulaye Doucouré', '16', 'Jan 1, 1993', '31', 'Mali')
('Tim Iroegbunam', '42', 'Jun 30, 2003', '21', 'England')
('Idrissa Gueye', '27', 'Sep 26, 1989', '35', 'Senegal')
('Dwight McNeil', '7', 'Nov 22, 1999', '25', 'England')
('Iliman Ndiaye', '10',

In [80]:
# Parse every block once; each record is a
# (name, number, birthdate, age, nationality) tuple.
player_records = [get_player_info(block) for block in player_info_blocks]

# One list per column. Comprehension variables stay local, so the exploratory
# globals (player_info_block, number, birthdate, age, nationality) keep the
# values they were given in the earlier cells.
names = [record[0] for record in player_records]
numbers = [record[1] for record in player_records]
birthdates = [record[2] for record in player_records]
ages = [record[3] for record in player_records]
nationalities = [record[4] for record in player_records]

In [81]:
import pandas as pd

In [82]:
# The regexes capture the squad number and age as digit strings; store them as
# integers so that sorting and comparisons on these columns are numeric.
player_df = pd.DataFrame({'Name': names,
                          'Number': numbers,
                          'Birthdate': birthdates,
                          'Age': ages,
                          'Nationality': nationalities}
                         ).astype({'Number': int, 'Age': int})

In [83]:
# Display the assembled table.
player_df

Unnamed: 0,Name,Number,Birthdate,Age,Nationality
0,Jordan Pickford,1,"Mar 7, 1994",30,England
1,João Virgínia,12,"Oct 10, 1999",25,Portugal
2,Asmir Begovic,31,"Jun 20, 1987",37,Canada
3,Jarrad Branthwaite,32,"Jun 27, 2002",22,England
4,Jake O'Brien,15,"May 15, 2001",23,Ireland
5,James Tarkowski,6,"Nov 19, 1992",32,England
6,Michael Keane,5,"Jan 11, 1993",31,England
7,Vitaliy Mykolenko,19,"May 29, 1999",25,Ukraine
8,Ashley Young,18,"Jul 9, 1985",39,England
9,Nathan Patterson,2,"Oct 16, 2001",23,Scotland


In [85]:
# Keep only the players whose age is greater than 30.
is_over_30 = player_df['Age'].astype(int) > 30
player_df.loc[is_over_30]

Unnamed: 0,Name,Number,Birthdate,Age,Nationality
2,Asmir Begovic,31,"Jun 20, 1987",37,Canada
5,James Tarkowski,6,"Nov 19, 1992",32,England
6,Michael Keane,5,"Jan 11, 1993",31,England
8,Ashley Young,18,"Jul 9, 1985",39,England
10,Séamus Coleman,23,"Oct 11, 1988",36,Ireland
13,Abdoulaye Doucouré,16,"Jan 1, 1993",31,Mali
15,Idrissa Gueye,27,"Sep 26, 1989",35,Senegal
