In [1]:
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import DriverManager
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementNotInteractableException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from webdriver_manager.firefox import DriverManager
from selenium.webdriver.firefox.options import Options

from datetime import datetime, timedelta
from time import sleep

from pprint import pprint

import json

import re

In [2]:
# Timeout, in seconds, passed to every WebDriverWait in this notebook.
DELAY = 35

# Run Firefox without opening a visible browser window.
firefox_options = Options()
firefox_options.add_argument('--headless')

# Single browser session for the notebook; get_all_matches_within_dates reads
# this module-level `driver` and passes it on to the other scraping functions.
driver = webdriver.Firefox(options=firefox_options)

driver.maximize_window()

In [3]:
def _names_overlap(first, second):
    """Return True when both names are non-empty and one contains the other."""
    # The emptiness check matters: "" is a substring of every string, so an
    # empty cell would otherwise "match" every batter/player.
    return bool(first) and bool(second) and (first in second or second in first)


def _last_overlapping_name(candidates, displayed_name):
    """Return the last entry of ``candidates`` that overlaps ``displayed_name``, else None."""
    found = None
    for candidate in candidates:
        if _names_overlap(candidate, displayed_name):
            found = candidate
    return found


def _strip_role_markers(text):
    """Remove the " (wk)" / " (c)" markers from a scorecard cell and trim it."""
    return text.replace(" (wk)", "").replace(" (c)", "").strip()


def _run_out_fielder(dismissal_text):
    """Extract the fielder from "run out (Name)" or "run out (sub [Name])".

    Returns None when the text does not have that shape.
    """
    outer = re.match(r"run out \((.*)\)", dismissal_text)
    if outer is None:
        return None
    name = outer.group(1)
    if "[" in name:
        inner = re.match(r"sub \[(.*)\]", name)
        if inner is None:
            return None
        name = inner.group(1)
    # NOTE(review): a text such as "run out (A/B)" is kept as the single name
    # "A/B", exactly as before - confirm how shared run-outs should be credited.
    return name.strip() or None


def _substitute_fielder(prefix, dismissal_text):
    """Extract the fielder from "<prefix> (sub)Name b Bowler" (prefix is "c" or "st").

    Returns None when the text does not have that shape.
    """
    found = re.match(prefix + r" \(sub\)(.*) b .*", dismissal_text)
    if found is None:
        return None
    return found.group(1).strip() or None


def _credit_fielder(player_stats, fielder_name, field_to_change):
    """Add one ``field_to_change`` to the player matching ``fielder_name``.

    A fielder matching no known player is recorded as a new substitute entry.
    When several players match, only names ending with ``fielder_name`` are
    kept; if that still does not leave exactly one player nothing is credited
    and an error is printed.
    """
    matches = [name for name in player_stats if _names_overlap(fielder_name, name)]

    if not matches:
        player_stats[fielder_name] = {
            "player_name": fielder_name,
            "substitute": True,
            "catches": 1 if field_to_change == "catches" else 0,
            "run_outs": 1 if field_to_change == "run_outs" else 0,
            "stumpings": 1 if field_to_change == "stumpings" else 0,
        }
        return

    if len(matches) > 1:
        pairs_for_report = [(fielder_name, name) for name in matches]
        matches = [name for name in matches if name.endswith(fielder_name)]
        if len(matches) > 1:
            print("ERROR: 2 players share a name and are not distinguishable.")
            print(pairs_for_report)
        elif not matches:
            print(f"ERROR: Could not decide which player {fielder_name} refers to.")
            print(pairs_for_report)

    if len(matches) == 1:
        entry = player_stats[matches[0]]
        entry[field_to_change] = entry.get(field_to_change, 0) + 1


def get_extra_stats(driver, match_json):
    """Credit run-outs and substitute catches/stumpings using the Cricbuzz scorecard.

    Reads ``match_number``, ``run_out_batters``, ``batters_caught_by_sub`` and
    ``batters_stumped_by_sub`` from ``match_json["match_stats"]``, looks the
    match up on the Cricbuzz series page, and updates
    ``match_json["player_stats"]`` in place.

    Args:
        driver: Selenium WebDriver used to load the pages.
        match_json: dict holding the "match_stats" and "player_stats" entries
            described above.

    Returns:
        None. If the match cannot be found on the series page an error is
        printed and ``match_json`` is left untouched.
    """
    match_stats = match_json["match_stats"]
    match_number = match_stats["match_number"]
    run_out_batters = match_stats["run_out_batters"]
    batters_caught_by_sub = match_stats["batters_caught_by_sub"]
    batters_stumped_by_sub = match_stats["batters_stumped_by_sub"]
    player_stats = match_json["player_stats"]

    driver.get("https://www.cricbuzz.com/cricket-series/6732/icc-cricket-world-cup-2023/matches")

    WebDriverWait(driver, DELAY).until(EC.presence_of_element_located((By.XPATH, '//*/a[@class="text-hvr-underline"]')))
    sleep(2)

    # The digits around the number are excluded so that e.g. match 1 does not
    # match "10th Match". The last matching link wins, as before.
    url = None
    link_pattern = re.compile(f"[^0-9]*{match_number}[^0-9]*Match.*")
    for match_link in driver.find_elements(By.XPATH, '//*/a[@class="text-hvr-underline"]'):
        if link_pattern.match(match_link.text):
            url = match_link.get_attribute("href")

    if not url:
        # Without a scorecard URL there is nothing to parse; stop here instead
        # of waiting on (and scraping) the series page.
        print("ERROR: Match not found.")
        return

    driver.get(url.replace("live-cricket-scores", "live-cricket-scorecard"))

    WebDriverWait(driver, DELAY).until(EC.presence_of_element_located((By.XPATH, '//*/h1[@class="cb-nav-hdr cb-font-18 line-ht24"]')))
    sleep(2)

    for row in driver.find_elements(By.XPATH, '//*/div[@class="cb-col cb-col-100 cb-scrd-itms"]'):
        # Cell 0 is the batter name, cell 1 the dismissal text; the remaining
        # cells (runs, balls, 4s, 6s, strike rate) are not needed here.
        cells = row.find_elements(By.TAG_NAME, "div")
        if len(cells) < 2:
            continue

        batter_name = _strip_role_markers(cells[0].text)
        if not batter_name:
            continue

        was_run_out = _last_overlapping_name(run_out_batters, batter_name) is not None
        was_caught = _last_overlapping_name(batters_caught_by_sub, batter_name) is not None
        was_stumped = _last_overlapping_name(batters_stumped_by_sub, batter_name) is not None
        if not (was_run_out or was_caught or was_stumped):
            continue

        dismissal_text = _strip_role_markers(cells[1].text)

        # Each parser returns None when the cell is not the expected dismissal
        # text, in which case the row is skipped rather than crashing.
        # NOTE(review): presumably this happens for the same player's bowling
        # row, which would share the row class - confirm against the page.
        names_and_fields = []
        if was_run_out:
            names_and_fields.append((_run_out_fielder(dismissal_text), "run_outs"))
        if was_caught:
            names_and_fields.append((_substitute_fielder("c", dismissal_text), "catches"))
        if was_stumped:
            names_and_fields.append((_substitute_fielder("st", dismissal_text), "stumpings"))

        for fielder_name, field_to_change in names_and_fields:
            if fielder_name is not None:
                _credit_fielder(player_stats, fielder_name, field_to_change)

In [4]:
# Dismissals for which no bowler or fielder receives a bonus.
_NO_CREDIT_DISMISSALS = (
    "retired",
    "obstructing field",
    "absent hurt",
    "hit wicket",
    "handled ball",
    "hit ball twice",
)


def _clean_player_name(raw_name):
    """Strip the captain ("*") and wicket-keeper ("†") markers from a scorecard name."""
    return raw_name.replace("*", "").replace("†", "").strip()


def _overs_to_float(overs_text):
    """Convert an overs figure such as "36.2" (36 overs, 2 balls) to a float number of overs."""
    parts = overs_text.strip().split(".")
    overs = float(parts[0])
    if len(parts) > 1:
        overs += float(parts[1]) / 6
    return overs


def _number_or_none(text, cast):
    """Return ``cast(text)``, or None when the cell does not hold a number."""
    try:
        return cast(text.strip())
    except ValueError:
        return None


def _int_if_numeric(text):
    """Return ``text`` as an int when it is all digits, otherwise the stripped text."""
    text = text.strip()
    return int(text) if text.isdigit() else text


def _new_team_stats():
    """Return an empty per-team summary record."""
    return {
        "team_name": None,
        "total": None,
        "wickets": None,
        "overs_str": None,
        "overs_float": None,
        "players": set(),
    }


def _record_dismissal(dismissal, batter_name, player_stats,
                      run_out_batters, batters_caught_by_sub, batters_stumped_by_sub):
    """Apply one batter's dismissal text to the bowler/fielder tallies.

    The dismissal kind is taken from the start of the text ("b ", "lbw ",
    "c & b ", "c ", "st "), so a name that merely contains those characters is
    not mistaken for a dismissal keyword. Run-outs and catches/stumpings by
    someone not in ``player_stats`` are only queued in the three lists.
    """
    lowered = dismissal.lower()

    if any(kind in lowered for kind in _NO_CREDIT_DISMISSALS):
        return

    if lowered.startswith("lbw ") or lowered.startswith("b ") or lowered.startswith("c & b "):
        # Everything after the first "b " is the bowler, so multi-word names
        # such as "Shaheen Shah Afridi" are kept whole.
        bowler = _clean_player_name(dismissal[dismissal.find("b ") + 2:])
        matches = [name for name in player_stats if bowler in name]
        if len(matches) != 1:
            print("ERROR: Player not found or 2 players share a name and are not distinguishable.")
            return
        entry = player_stats[matches[0]]
        if lowered.startswith("lbw "):
            entry["wickets_taken_lbw"] += 1
        elif lowered.startswith("c & b "):
            entry["catches"] += 1
        else:
            entry["wickets_taken_bowled"] += 1
        return

    if lowered.startswith("c ") or lowered.startswith("st "):
        is_stumping = lowered.startswith("st ")
        fielder = dismissal[3 if is_stumping else 2:].replace("*", "").replace("†", "")
        bowler_at = fielder.find(" b ")
        if bowler_at != -1:
            fielder = fielder[:bowler_at]
        fielder = fielder.strip()

        matches = [name for name in player_stats if fielder in name]
        if not matches:
            # Not one of the listed players: treated as a substitute and
            # resolved later from the other scorecard source.
            if is_stumping:
                batters_stumped_by_sub.append(batter_name)
            else:
                batters_caught_by_sub.append(batter_name)
            return

        if len(matches) > 1:
            pairs_for_report = [(fielder, name) for name in matches]
            matches = [name for name in matches if name.endswith(fielder)]
            if len(matches) > 1:
                print("ERROR: 2 players share a name and are not distinguishable.")
                print(pairs_for_report)

        if len(matches) == 1:
            player_stats[matches[0]]["stumpings" if is_stumping else "catches"] += 1
        return

    if "run out" in lowered:
        run_out_batters.append(batter_name)
    elif "not out" in lowered or dismissal.strip() == "":
        player_stats[batter_name]["not_out"] = True
    else:
        print(f"Unknown dismissal type: {dismissal}")


def get_player_stats(driver, scorecard_link, teams, winner):
    """Scrape one scorecard page into per-player and per-team statistics.

    The scorecard table is walked twice: the first pass registers every player
    listed in the batting sections, the second fills in batting figures,
    bowling figures and dismissal credits (which need all players to exist).

    Args:
        driver: Selenium WebDriver used to load ``scorecard_link``.
        scorecard_link: URL of the scorecard page.
        teams: the two team names of the match; used to switch to the other
            team when a bowling section starts.
        winner: name of the winning team (or None); compared with each
            player's team to set "on_winning_team".

    Returns:
        A 6-tuple ``(player_stats, team_1_stats, team_2_stats,
        run_out_batters, batters_caught_by_sub, batters_stumped_by_sub)``.
        ``player_stats`` maps player name to a stats dict; the last three are
        lists of batter names whose dismissal still needs a fielder resolved.
    """
    driver.get(scorecard_link)
    WebDriverWait(driver, DELAY).until(EC.presence_of_element_located((By.XPATH, '//*/td[@class="ScoreCardBanner2"]')))

    run_out_batters = []
    batters_caught_by_sub = []
    batters_stumped_by_sub = []

    player_stats = {}
    team_1_stats = _new_team_stats()
    team_2_stats = _new_team_stats()

    top_level_table = driver.find_elements(By.XPATH, '//*/table[@class="Table_D"]')[0]
    top_level_table = top_level_table.find_element(By.TAG_NAME, "table")

    # Banner rows look like "<team> (<overs> overs) <score>".
    for tr in top_level_table.find_elements(By.TAG_NAME, "tr"):
        tr_text = tr.text.strip()
        if tr_text == "":
            continue
        m = re.match(r"(.*) \((.*)\) (.*)", tr_text)
        if m is None:
            continue

        # The first summary row fills team 1, any later one fills team 2.
        team_stats = team_2_stats if team_1_stats["team_name"] else team_1_stats

        overs_str = m.group(2)
        overs_float = _overs_to_float(overs_str.partition(" overs")[0])

        score = m.group(3)
        if "/" not in score:
            # A bare total means the side was all out.
            total = int(score)
            wickets = 10
        else:
            # Converted to int so both branches produce the same types.
            # NOTE(review): the order "total/wickets" is kept from the
            # previous code - confirm it matches what the page prints.
            total, wickets = (_int_if_numeric(part) for part in score.split("/"))

        team_stats["team_name"] = m.group(1)
        team_stats["total"] = total
        team_stats["wickets"] = wickets
        team_stats["overs_str"] = overs_str
        team_stats["overs_float"] = overs_float

    scorecard = driver.find_elements(By.XPATH, '//*/table[@class="Scorecard"]')[0]

    for pass_thru in ["first", "second"]:
        mode = None
        curr_team = None
        # Reset per pass so the flag is always defined and never carries over.
        ignored_first_bowling = False

        for tr in scorecard.find_elements(By.TAG_NAME, "tr"):
            tr_text = tr.text.strip()
            cells = tr.find_elements(By.TAG_NAME, "td")

            # The first bold cell of a row, if any, decides the section.
            for td in cells:
                if "TextBlackBold8" not in (td.get_attribute("class") or ""):
                    continue
                td_text = td.text.strip()
                if "Total" in td_text or "Wickets" in td_text:
                    mode = "Ignore"
                    ignored_first_bowling = False
                elif "Bowling" in td_text and not ignored_first_bowling:
                    # The first "Bowling" header after a total is skipped;
                    # the next one starts the bowling figures.
                    mode = "Ignore"
                    ignored_first_bowling = True
                elif "Bowling" in td_text:
                    mode = "Bowling"
                    # change to the bowling team
                    other_teams = [team for team in teams if team != curr_team]
                    if other_teams:
                        curr_team = other_teams[0]
                else:
                    # Any other bold row is a batting header: the team name
                    # is the row text up to the " R" column heading (and up to
                    # any " (" suffix).
                    tr_text = tr_text[:tr_text.find(" R")]
                    if "(" in tr_text:
                        tr_text = tr_text[:tr_text.find(" (")]
                    mode = "Batting"
                    curr_team = tr_text
                break

            if mode == "Ignore":
                continue

            if curr_team and mode == "Batting":
                if tr_text.startswith(curr_team) or tr_text.startswith("Extras"):
                    continue

                line = [td.text.strip() for td in cells]
                # Skip rows without a player name, as the bowling section does.
                if not line or line[0] == "":
                    continue
                player_name = _clean_player_name(line[0])
                if not player_name:
                    continue

                if pass_thru == "first":
                    player_stats[player_name] = {
                        "player_name": player_name,
                        "is_captain": "*" in line[0],
                        "substitute": False,
                        "team": curr_team,
                        "on_winning_team": curr_team == winner,
                        "catches": 0,
                        "wickets_taken_bowled": 0,
                        "wickets_taken_lbw": 0,
                        "run_outs": 0,
                        "stumpings": 0,
                        "not_out": False
                    }
                    for team_stats in [team_1_stats, team_2_stats]:
                        if team_stats["team_name"] == curr_team:
                            team_stats["players"].add(player_name)
                    continue

                # Second pass. Columns:
                # 0: Batter Name
                # 1: Method of getting out or "not out"
                # 2: Runs made
                # 3: Balls Faced
                # 4: 4s
                # 5: 6s
                # 6: Strike Rate
                # 7: Percent of Total Runs made by this Batter
                entry = player_stats[player_name]
                if len(line) > 1:
                    _record_dismissal(line[1], player_name, player_stats,
                                      run_out_batters, batters_caught_by_sub,
                                      batters_stumped_by_sub)
                if len(line) > 2:
                    runs = _number_or_none(line[2], int)
                    entry["runs"] = runs
                    entry["batted"] = runs is not None
                for column, key in ((3, "balls_faced"), (4, "4s"), (5, "6s")):
                    if len(line) > column:
                        entry[key] = _number_or_none(line[column], int)
                if len(line) > 6:
                    entry["strike_rate"] = _number_or_none(line[6], float)

            elif curr_team and mode == "Bowling":
                if pass_thru == "first":
                    continue

                line = [td.text.strip() for td in cells]
                if not line or line[0] == "" or line[0] == 'Bowling':
                    continue

                # Columns:
                # 0: Bowler
                # 1: Overs
                # 2: Maidens
                # 3: Runs Conceded
                # 4: Wickets Taken
                # 5: Economy Rate
                # 6: Percent of Wickets taken by this Bowler
                bowler_name = _clean_player_name(line[0])
                entry = player_stats.get(bowler_name)
                if entry is None:
                    print(f"ERROR: Bowler {bowler_name} was not found among the listed players.")
                    continue

                if len(line) > 1:
                    entry["overs_bowled_str"] = line[1]
                    try:
                        entry["overs_bowled_float"] = _overs_to_float(line[1])
                    except ValueError:
                        entry["overs_bowled_float"] = None
                for column, key in ((2, "maidens"), (3, "runs_conceded"), (4, "wickets_taken")):
                    if len(line) > column:
                        entry[key] = _number_or_none(line[column], int)
                if len(line) > 5:
                    entry["economy_rate"] = _number_or_none(line[5], float)

    return player_stats, team_1_stats, team_2_stats, \
                run_out_batters, batters_caught_by_sub, batters_stumped_by_sub

In [5]:
def _split_result(result_text, teams):
    """Split a result cell into ``(winner, loser, margin)``.

    For "<team> won <margin>" the winner and margin are taken from the text and
    the loser is the other entry of ``teams``. When the text has no " won"
    (for example a match without a winner) winner and loser are None and the
    whole text is returned as the margin.
    """
    won_at = result_text.find(" won")
    if won_at == -1:
        return None, None, result_text

    winner = result_text[:won_at].strip()
    margin = result_text[won_at + 5:].strip()
    other_teams = [team for team in teams if team != winner]
    # Only name a loser when the winner is really one of the two listed teams.
    loser = other_teams[0] if winner in teams and other_teams else None
    return winner, loser, margin


def get_all_matches_within_dates(start_date, end_date):
    """Collect every series match played between two dates, with full stats.

    Loads the series fixture table with the module-level ``driver``, keeps the
    rows whose date lies in ``[start_date, end_date]`` (both inclusive), and
    for each of them scrapes the scorecard via ``get_player_stats`` and, when
    run-outs or substitute dismissals were found, ``get_extra_stats``.

    Args:
        start_date: first date to include (datetime).
        end_date: last date to include (datetime).

    Returns:
        ``{"all_matches": [...]}`` where every item has a "match_stats" dict
        and a "player_stats" dict. Team player sets are converted to lists so
        the result can be passed to ``json.dump``.
    """
    driver.get("http://www.howstat.com/cricket/Statistics/Series/SeriesStats_ODI.asp?SeriesCode=1117")

    table = WebDriverWait(driver, DELAY).until(EC.presence_of_element_located((By.XPATH, '//*/table[@class="TableLined"]')))

    all_matches = []

    for i, tr in enumerate(table.find_elements(By.TAG_NAME, "tr")):
        if i == 0:
            # header row
            continue

        # 0: Match Number
        # 1: Date
        # 2: Teams
        # 3: Ground
        # 4: Result
        # 5: Scorecard Link
        cells = tr.find_elements(By.TAG_NAME, "td")
        if len(cells) < 6:
            # Not a complete match row; there is no date to compare.
            continue

        match_number = int(cells[0].text.strip())
        match_date = datetime.strptime(cells[1].text.strip(), "%d/%m/%Y")
        if not (start_date <= match_date <= end_date):
            continue

        # The remaining cells are only read for rows inside the date window.
        teams = cells[2].text.strip().split(" v ")
        ground = cells[3].text.strip()
        winner, loser, margin = _split_result(cells[4].text.strip(), teams)
        scorecard_link = cells[5].find_element(By.TAG_NAME, "a").get_attribute("href")

        all_matches.append({
            "match_stats": {
                "match_number": match_number,
                "match_date": match_date.strftime("%Y-%m-%d"),
                "teams": teams,
                "ground": ground,
                "winner": winner,
                "loser": loser,
                "margin": margin,
                "scorecard_link": scorecard_link
            }
        })

    # Scorecards are visited only after the fixture table has been fully read,
    # because navigating away invalidates the table's elements.
    for match_json in all_matches:
        match_stats = match_json["match_stats"]
        player_stats, team_1_stats, team_2_stats, \
        run_out_batters, batters_caught_by_sub, \
        batters_stumped_by_sub = get_player_stats(driver,
                                         match_stats["scorecard_link"],
                                         match_stats["teams"],
                                         match_stats["winner"])

        # Sets are not JSON serialisable.
        team_1_stats["players"] = list(team_1_stats["players"])
        team_2_stats["players"] = list(team_2_stats["players"])

        match_json["player_stats"] = player_stats
        match_stats["team_1_stats"] = team_1_stats
        match_stats["team_2_stats"] = team_2_stats
        match_stats["run_out_batters"] = run_out_batters
        match_stats["batters_caught_by_sub"] = batters_caught_by_sub
        match_stats["batters_stumped_by_sub"] = batters_stumped_by_sub

        if run_out_batters != [] or batters_caught_by_sub != [] or batters_stumped_by_sub != []:
            get_extra_stats(driver, match_json)

    return {"all_matches": all_matches}

In [6]:
%%time

# Inclusive date window (day/month/year) of the matches to scrape.
start_date = datetime.strptime("5/10/2023", "%d/%m/%Y")
end_date = datetime.strptime("10/10/2023", "%d/%m/%Y")

# all matches is a python dict
all_matches = get_all_matches_within_dates(start_date, end_date)

# Written as "<start>_<end>.json"; the context manager makes sure the file is
# flushed and closed even if serialisation fails part-way.
output_path = f"""{start_date.strftime("%Y-%m-%d")}_{end_date.strftime("%Y-%m-%d")}.json"""
with open(output_path, "w") as output_file:
    json.dump(all_matches, output_file)

CPU times: user 3.58 s, sys: 426 ms, total: 4.01 s
Wall time: 1min 9s


In [7]:
# TYPES OF DISMISSALS - HOWSTAT

# 1 Marvan Atapattu	retired out	201	 259	 27	 0	77.61	 36.22%
# 2 Javed Omar	c Jayasuriya b Vaas	7	 16	 1	 0	43.75	 7.78%
# 3 Mehrab Hossain	run out	23	 70	 3	 0	32.86	 25.56%
# 4 Habibul Bashar	b Vaas	4	 6	 1	 0	66.67	 4.44%
# 5 Mohammed Sharif	c & b Muralitharan	19	 13	 3	 1	146.15	 5.79%
# 6 Haris Rauf	st †Edwards b Ackermann	16	 14	 2	 1	114.29	 5.59%
# 7 Saqib Zulfiqar	lbw b Shaheen Shah Afridi	10	 18	 1	 0	55.56	 4.88%
# 8 Leonard Hutton	obstructing field	27	 	 0	 0	 	 16.46%
# 9 Steve Waugh*	handled ball	47	 150	 4	 1	31.33	 12.02%
# 10 Fanyan Mughal	hit ball twice	8	 15	 0	 1	53.33	 5.13%
# 11 James McCollum	retired hurt	12	 26	 2	 0	46.15	 3.31%
# 12 Wasim Jaffer	retired ill	138	 229	 17	 0	60.26	 22.62%
# 13 Ramnaresh Sarwan*	absent hurt
# 14 Mujeeb Ur Rahman	hit wicket b Taskin Ahmed	4	 8	 0	 0	50.00	 1.63%
# 15 Paul van Meekeren	not out	4	 3	 1	 0	133.33	 1.79%

# ADDITIONAL DETAILS - CRICBUZZ
# 1 run out (sub [GJ Pratt])
# 2 run out (Najibullah)
# 3 c (sub)Phehlukwayo b Maharaj
# 4 st (sub)Srikar Bharat b Axar