In [26]:
import json

import requests
from bs4 import BeautifulSoup

In [28]:
# Team match-results page of the tournament; later cells read `response.content`.
url = 'https://www.espncricinfo.com/records/tournament/team-match-results/icc-men-s-t20-world-cup-2022-23-14450'
# requests applies no timeout unless one is given, so pass one explicitly to
# keep this cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx reply instead of parsing an error page further down.
response.raise_for_status()

In [30]:
# 1.b Parser Code
# Index of the <td> that holds the link to the match scorecard.
SCORECARD_CELL_INDEX = 6

# Step 1: Create a list to store match summary links
match_summary_links = []

# Step 2: Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

# Step 3: Selecting all rows we need from the target table
all_rows = soup.select('table.ds-w-full.ds-table.ds-table-xs.ds-table-auto > tbody > tr')

# Looping through each row to get the links
for row in all_rows:
    tds = row.find_all('td')
    # Skip rows that are too short to contain the scorecard cell.
    if len(tds) <= SCORECARD_CELL_INDEX:
        continue
    anchor = tds[SCORECARD_CELL_INDEX].find('a')
    # Skip rows whose scorecard cell carries no usable link instead of
    # failing on `None['href']`.
    if anchor is None or not anchor.get('href'):
        continue
    match_summary_links.append("https://www.espncricinfo.com" + anchor['href'])

# Display the collected match summary links
print("Match Summary Links:")
for link in match_summary_links:
    print(link)


Match Summary Links:
https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2022-23-1298134/england-vs-pakistan-final-1298179/full-scorecard
https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2022-23-1298134/england-vs-india-2nd-semi-final-1298178/full-scorecard
https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2022-23-1298134/new-zealand-vs-pakistan-1st-semi-final-1298177/full-scorecard
https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2022-23-1298134/india-vs-zimbabwe-42nd-match-group-2-1298176/full-scorecard
https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2022-23-1298134/bangladesh-vs-pakistan-41st-match-group-2-1298175/full-scorecard
https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2022-23-1298134/netherlands-vs-south-africa-40th-match-group-2-1298174/full-scorecard
https://www.espncricinfo.com/series/icc-men-s-t20-world-cup-2022-23-1298134/england-vs-sri-lanka-39th-match-group-1-1298173/full-scorecard
https://www.espncricinf

In [40]:
ESPN_BASE_URL = "https://www.espncricinfo.com"

# Minimum number of <td> cells a row must have to be read as a player row.
BATTING_ROW_MIN_CELLS = 8
BOWLING_ROW_MIN_CELLS = 11


def _innings_team_name(innings_div):
    """Return the team name shown in one innings entry of the "Match Flow" section.

    Reads the text of the first <span> inside the first <li> of the first <ul>
    of ``innings_div`` and removes the text " innings" from it.
    """
    heading = innings_div.find('ul').find('li').find('span')
    return heading.text.replace(" innings", "")


def _players_from_rows(rows, min_cells, team):
    """Build one ``{"name", "team", "link"}`` record per usable scorecard row.

    A row is used only if it has at least ``min_cells`` <td> cells and its
    first cell contains an <a> with an ``href``; every other row is skipped.
    """
    players = []
    for row in rows:
        tds = row.find_all('td')
        if len(tds) < min_cells:
            continue
        anchor = tds[0].find('a')
        # A row can be wide enough and still have no player link; skip it
        # rather than dereferencing None.
        if anchor is None or not anchor.get('href'):
            continue
        players.append({
            "name": anchor.text.strip(),
            "team": team,
            "link": ESPN_BASE_URL + anchor['href'],
        })
    return players


def extract_players_data(match_url, timeout=30):
    """Collect the players listed on one match scorecard page.

    Args:
        match_url: URL of the scorecard page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        A list of dicts with the keys ``"name"``, ``"team"`` and ``"link"``:
        first-innings batters, second-innings batters, first-innings bowlers,
        second-innings bowlers, in that order. The list stops early (possibly
        empty) when fewer than two batting tables or fewer than four
        ``ds-table`` tables are found. Returns ``None`` (after printing a
        message) when the page has no "Match Flow" heading.
    """
    response = requests.get(match_url, timeout=timeout)
    match_soup = BeautifulSoup(response.content, 'html.parser')

    # Find the "Match Flow" section
    match_flow = match_soup.find(
        'span',
        class_='ds-text-title-xs ds-font-bold ds-text-typo',
        string=lambda x: x and "Match Flow" in x,
    )
    if match_flow is None:
        print("Match Flow not found in", match_url)
        return None

    # The innings entries live in the <div> siblings that follow the heading's
    # grandparent <div>.
    match_flow_div = match_flow.find_parent('div')
    sibling_divs = match_flow_div.find_parent('div').find_next_siblings('div')

    first_innings_div = sibling_divs[0].find('div')
    second_innings_div = first_innings_div.find_next_sibling('div')
    team1 = _innings_team_name(first_innings_div)
    team2 = _innings_team_name(second_innings_div)

    players_links = []

    # For batting players
    batting_tables = match_soup.select('div > table.ci-scorecard-table')
    if len(batting_tables) < 2:  # Same early exit as for the bowling tables below
        return players_links
    players_links += _players_from_rows(
        batting_tables[0].select('tbody > tr'), BATTING_ROW_MIN_CELLS, team1)
    players_links += _players_from_rows(
        batting_tables[1].select('tbody > tr'), BATTING_ROW_MIN_CELLS, team2)

    # For bowling players
    # NOTE(review): indices 1 and 3 are read as the two bowling tables, so this
    # selector presumably matches batting and bowling tables alternately;
    # confirm against the live page markup.
    bowling_tables = match_soup.select('div > table.ds-table')
    if len(bowling_tables) < 4:  # Check if bowling tables are present
        return players_links  # Return the collected player links so far

    # The side bowling in an innings is the one that is not batting, hence the
    # swapped team arguments.
    players_links += _players_from_rows(
        bowling_tables[1].select('tbody > tr'), BOWLING_ROW_MIN_CELLS, team2)
    players_links += _players_from_rows(
        bowling_tables[3].select('tbody > tr'), BOWLING_ROW_MIN_CELLS, team1)

    return players_links

# Call the function for each match URL
all_players_data = []
for match_url in match_summary_links:
    players_data = extract_players_data(match_url)
    # extract_players_data returns None when a page has no "Match Flow"
    # section; list.extend(None) would raise TypeError and abort the whole
    # run, so such matches are skipped here.
    if players_data:
        all_players_data.extend(players_data)

# Debug: Print all players data
print("All Players Data:", all_players_data)

All Players Data: [{'name': 'Mohammad Rizwan\xa0†', 'team': 'Pakistan', 'link': 'https://www.espncricinfo.com/cricketers/mohammad-rizwan-323389'}, {'name': 'Babar Azam\xa0(c)', 'team': 'Pakistan', 'link': 'https://www.espncricinfo.com/cricketers/babar-azam-348144'}, {'name': 'Mohammad Haris', 'team': 'Pakistan', 'link': 'https://www.espncricinfo.com/cricketers/mohammad-haris-1205559'}, {'name': 'Shan Masood', 'team': 'Pakistan', 'link': 'https://www.espncricinfo.com/cricketers/shan-masood-233901'}, {'name': 'Iftikhar Ahmed', 'team': 'Pakistan', 'link': 'https://www.espncricinfo.com/cricketers/iftikhar-ahmed-480603'}, {'name': 'Shadab Khan', 'team': 'Pakistan', 'link': 'https://www.espncricinfo.com/cricketers/shadab-khan-922943'}, {'name': 'Mohammad Nawaz', 'team': 'Pakistan', 'link': 'https://www.espncricinfo.com/cricketers/mohammad-nawaz-348148'}, {'name': 'Mohammad Wasim', 'team': 'Pakistan', 'link': 'https://www.espncricinfo.com/cricketers/mohammad-wasim-1185538'}, {'name': 'Shaheen

In [76]:
def _profile_field(player_soup, label):
    """Return the value shown for ``label`` on a player profile page.

    Looks up the <p> whose text is exactly ``label``, moves to the next <span>
    after it and then to the next <p> after that span, and returns that
    paragraph's text. Returns "N/A" when any of the three elements is missing.
    """
    label_p = player_soup.find('p', string=label)
    if label_p is None:
        return "N/A"
    span_tag = label_p.find_next('span')
    if span_tag is None:
        return "N/A"
    value_p = span_tag.find_next('p')
    return value_p.text if value_p is not None else "N/A"


# Preview: download every player's profile page and print the parsed record.
for player in all_players_data:
    # Explicit timeout so one unresponsive profile page cannot stall the loop.
    player_response = requests.get(player['link'], timeout=30)
    player_soup = BeautifulSoup(player_response.content, 'html.parser')

    # First paragraph of the bio block; query it once and reuse the result.
    bio_p = player_soup.select_one('div.ci-player-bio-content > p')

    final_data = {
        "name": player['name'],
        "team": player['team'],
        "battingStyle": _profile_field(player_soup, "Batting Style"),
        "bowlingStyle": _profile_field(player_soup, "Bowling Style"),
        "playingRole": _profile_field(player_soup, "Playing Role"),
        "description": bio_p.text if bio_p is not None else "N/A",
    }
    print(final_data)

{'name': 'Mohammad Rizwan\xa0†', 'team': 'Pakistan', 'battingStyle': 'Right hand Bat', 'bowlingStyle': 'Right arm Medium', 'playingRole': 'Wicketkeeper Batter', 'description': "For several years, it appeared Mohammad Rizwan's international career would only happen in a parallel universe, racking up domestic runs even as he struggled to get a game in the national side. But for someone who played an international for the first time in two years in January 2019, Mohammad Rizwan was spoken of remarkably frequently. Most often, he was used as a stick to threaten current first-choice Pakistan wicketkeeper and captain Sarfaraz Ahmed, but the Peshawar native had qualities of his own that suggested he might have been unfortunate not to play for Pakistan more often. "}
{'name': 'Babar Azam\xa0(c)', 'team': 'Pakistan', 'battingStyle': 'Right hand Bat', 'bowlingStyle': 'Right arm Offbreak', 'playingRole': 'Batter', 'description': "A right-hand, top-order batsman known for his discipline and level-

In [88]:
def _profile_field(player_soup, label):
    """Return the value shown for ``label`` on a player profile page.

    Looks up the <p> whose text is exactly ``label``, moves to the next <span>
    after it and then to the next <p> after that span, and returns that
    paragraph's text. Returns "N/A" when any of the three elements is missing.
    """
    label_p = player_soup.find('p', string=label)
    if label_p is None:
        return "N/A"
    span_tag = label_p.find_next('span')
    if span_tag is None:
        return "N/A"
    value_p = span_tag.find_next('p')
    return value_p.text if value_p is not None else "N/A"


def _scrape_profile(link):
    """Download one player profile page and return its four parsed fields."""
    # Explicit timeout so one unresponsive profile page cannot stall the run.
    player_response = requests.get(link, timeout=30)
    player_soup = BeautifulSoup(player_response.content, 'html.parser')

    # First paragraph of the bio block; query it once and reuse the result.
    bio_p = player_soup.select_one('div.ci-player-bio-content > p')

    return {
        "battingStyle": _profile_field(player_soup, "Batting Style"),
        "bowlingStyle": _profile_field(player_soup, "Bowling Style"),
        "playingRole": _profile_field(player_soup, "Playing Role"),
        "description": bio_p.text if bio_p is not None else "N/A",
    }


all_players_data_list = []

# The same profile link may occur more than once in all_players_data; each
# page is downloaded once and reused. One record is still written per entry.
profiles_by_link = {}

for player in all_players_data:
    link = player['link']
    if link not in profiles_by_link:
        profiles_by_link[link] = _scrape_profile(link)

    # Key order matches the record layout: name, team, then the profile fields.
    all_players_data_list.append({
        "name": player['name'],
        "team": player['team'],
        **profiles_by_link[link],
    })

# Save the output to a JSON file
# json.dump escapes non-ASCII characters by default; the explicit encoding just
# keeps the file independent of the platform's default text encoding.
with open('players_data.json', 'w', encoding='utf-8') as json_file:
    json.dump(all_players_data_list, json_file, indent=4)

print("Player data has been saved to players_data.json.")

Player data has been saved to players_data.json.
