In [1]:
# selenium 4
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Start a Chrome session; the service is given whatever
# ChromeDriverManager().install() returns (presumably the chromedriver path).
chrome_service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service)
driver.maximize_window()

# Session-wide timeouts: implicit element wait of 10, page-load limit of 5.
# NOTE(review): handle_popup also uses an explicit WebDriverWait on this same
# driver - confirm the combination with the implicit wait is intended.
driver.implicitly_wait(10)
driver.set_page_load_timeout(5)


In [2]:
# Match list to scrape; only the first five rows of the CSV are kept.
df = pd.read_csv('match_results.csv').head(5)


def handle_popup():
    """Dismiss the overlay whose cancel button has id ``wzrk-cancel``, if present.

    Waits up to 15 seconds for the button to become clickable and clicks it.
    When the button never shows up, a notice is printed and the function
    returns normally so the caller can carry on scraping.

    Uses the module-level ``driver``.
    """
    # Local import: WebDriverWait.until() signals "never appeared" with
    # TimeoutException (it swallows NoSuchElementException while polling),
    # so that is the exception that must be handled here.
    from selenium.common.exceptions import TimeoutException

    cancel_button_locator = (By.XPATH, '//*[@id="wzrk-cancel"]')
    try:
        # Wait for clickability rather than mere presence, since the element
        # is clicked immediately afterwards.
        cancel_button = WebDriverWait(driver, 15).until(
            EC.element_to_be_clickable(cancel_button_locator)
        )
        cancel_button.click()
    except (TimeoutException, NoSuchElementException):
        print('No Such element')


In [3]:
def process_batting_table(xpath):
    """Scrape one batting scorecard table from the current page.

    Parameters
    ----------
    xpath : str
        XPath that locates the batting ``<table>`` element via the
        module-level ``driver``.

    Returns
    -------
    pandas.DataFrame
        The parsed table with all-NaN rows/columns removed, the
        ``'Unnamed: 1'`` column renamed to ``'Dismissal'``, the last four
        rows discarded, and a ``player_links`` column holding the ``href``
        of the matching link elements.

    Raises
    ------
    ValueError
        If fewer links than remaining rows are found (pandas refuses the
        length-mismatched column).
    """
    from io import StringIO

    table_html = driver.find_element(By.XPATH, xpath).get_attribute('outerHTML')

    # Passing literal HTML to read_html is deprecated (pandas >= 2.1);
    # a file-like wrapper works on old and new versions alike.
    batting = pd.read_html(StringIO(table_html))[0]
    batting = (batting
               .dropna(axis=0, how='all')
               .dropna(axis=1, how='all')
               .rename(columns={'Unnamed: 1': 'Dismissal'})
               )
    # The last four remaining rows are treated as junk (per the original
    # author's note); dropping them can leave new all-NaN columns behind.
    batting = batting.iloc[:-4].dropna(axis=1, how='all')

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    batting_soup = BeautifulSoup(table_html, 'html.parser')
    link_tags = batting_soup.find_all(
        class_='ds-inline-flex ds-items-start ds-leading-none')
    player_links = [tag['href'] for tag in link_tags]

    # Assumes the links appear in the same order as the kept rows - TODO confirm.
    # Surplus links beyond the number of rows are ignored.
    return batting.assign(player_links=player_links[:len(batting)])


def process_bowling_table(xpath):
    """Scrape one bowling scorecard table from the current page.

    Parameters
    ----------
    xpath : str
        XPath that locates the bowling ``<table>`` element via the
        module-level ``driver``.

    Returns
    -------
    pandas.DataFrame
        The parsed table without the rows whose ``BOWLING`` cell contains a
        digit, with all-NaN rows/columns removed, plus a ``player_links``
        column holding the ``href`` of the matching link elements.

    Raises
    ------
    ValueError
        If fewer links than remaining rows are found (pandas refuses the
        length-mismatched column).
    """
    from io import StringIO

    table_html = driver.find_element(By.XPATH, xpath).get_attribute('outerHTML')

    # Passing literal HTML to read_html is deprecated (pandas >= 2.1);
    # a file-like wrapper works on old and new versions alike.
    bowling = pd.read_html(StringIO(table_html))[0]

    # Rows whose BOWLING cell contains a digit are discarded (presumably
    # non-player rows - verify against the page). na=False keeps the mask
    # strictly boolean so `~` cannot fail on NaN cells; such rows are then
    # removed by the all-NaN row drop below if they are entirely empty.
    has_digit = bowling['BOWLING'].str.contains(r'\d', regex=True, na=False)
    bowling = (bowling[~has_digit]
               .dropna(axis=0, how='all')
               .dropna(axis=1, how='all')
               )

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    bowling_soup = BeautifulSoup(table_html, 'html.parser')
    link_tags = bowling_soup.find_all(
        class_='ds-inline-flex ds-items-start ds-leading-none')
    player_links = [tag['href'] for tag in link_tags]

    # Assumes the links appear in the same order as the kept rows - TODO confirm.
    # Surplus links beyond the number of rows are ignored.
    return bowling.assign(player_links=player_links[:len(bowling)])


In [4]:
# Tables are matched on their exact class attribute, so the spacing inside
# these strings (double space, trailing space) is significant.
_batting_table_xpath = "//table[@class='ds-w-full ds-table ds-table-md ds-table-auto  ci-scorecard-table']"
_bowling_table_xpath = '//table[@class="ds-w-full ds-table ds-table-md ds-table-auto "]'

# XPath positions are 1-based: [1] is used for team one, [2] for team two.
team_one_batting_xpath = "(" + _batting_table_xpath + ")[1]"
team_two_batting_xpath = "(" + _batting_table_xpath + ")[2]"
team_one_bowling_xpath = "(" + _bowling_table_xpath + ")[1]"
team_two_bowling_xpath = "(" + _bowling_table_xpath + ")[2]"


In [5]:
# Per-innings frames collected across all matches (two entries per match each).
batting_outputs = []
bowling_outputs = []

for match_index, (_, match) in enumerate(df.iterrows()):
    driver.get(match['score_card_link'])

    # The popup is only dealt with on the second page load (index 1).
    if match_index == 1:
        handle_popup()

    teams = (match['team_one'], match['team_two'])

    # Scrape both batting tables first, then both bowling tables.
    batting_frames = [
        process_batting_table(team_one_batting_xpath),
        process_batting_table(team_two_batting_xpath),
    ]
    bowling_frames = [
        process_bowling_table(team_one_bowling_xpath),
        process_bowling_table(team_two_bowling_xpath),
    ]

    # Tag every frame with the team it is attributed to and the match identifier.
    for frames in (batting_frames, bowling_frames):
        for team, frame in zip(teams, frames):
            frame['team_inning'] = team
            frame['match_id'] = match['score_card']

    # Nothing is stored for a match unless all four tables were scraped.
    bowling_outputs.extend(bowling_frames)
    batting_outputs.extend(batting_frames)


In [10]:
# Stack all collected batting frames and write them out without the index column.
batting_all = pd.concat(batting_outputs)
batting_all.to_csv('CollectedBattingData.csv', index=None)

In [9]:
# Stack all collected bowling frames and write them out without the index column.
bowling_all = pd.concat(bowling_outputs)
bowling_all.to_csv('CollectedBowlingData.csv', index=None)