### Insights 

**PHASE 1**:

- How effective was the DRS call in extending the survival at the crease?

- No. of referrals innings wise

- Who was the batting partner who has probably assisted the most in DRS?

**PHASE 2**:

- How many recognized batsmen were left? And was there a missed opportunity due to DRS being recklessly taken earlier?

- Missed reviews by teams: did the number of remaining reviews have a say?


### Data points (Phase 1)
-  Match in Series 
-  Series Name to produce facets 
-  Match Venue
-  Match Date (Month_Year)
-  Over of referral
-  Innings of referral in game
- Team taking review
- Team Batting/Bowling
- Umpire at time of review
- Batsman at time of review
- Outcome of review 
- Innings wise dismissal data
- Active wicket partnership
- Innings wise referral data (scraped from match notes, needs a bit of formatting)
- Commentary of that particular referral ball


### Data points (Phase 2)

- No. of recognized batsmen still to come vs. missed opportunities for them


### Required libraries

In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from collections import defaultdict

### Initialisations

In [2]:
# Filesystem location of the ChromeDriver executable handed to Selenium.
chromedriver = 'C:/Users/k.shridhar/Documents/chromedriver.exe'

### Scrape cricinfo match notes for DRS events

In [3]:
def scrape_cricinfo_match_notes(cricinfo_match_notes_url):
    '''Open a Cricinfo scorecard URL in Chrome and return the rendered page as soup.

    cricinfo_match_notes_url: URL of the Cricinfo scorecard page to load.

    Returns a BeautifulSoup object built from the browser's page source with
    the 'html.parser' backend.

    The browser is always shut down, even when loading or parsing fails.
    '''
    # NOTE(review): recent Selenium 4 releases dropped the executable_path
    # keyword in favour of a Service object -- confirm against the installed
    # Selenium version before upgrading.
    driver = webdriver.Chrome(executable_path=chromedriver)
    try:
        driver.get(cricinfo_match_notes_url)
        cricinfo_matchnotes_soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Without this, a failed page load would leave a Chrome process behind.
        driver.quit()
    return cricinfo_matchnotes_soup
    
# cricinfo_match_notes_url='https://www.espncricinfo.com/series/19430/scorecard/1152846/england-vs-australia-1st-test-icc-world-test-championship-2019-2021'

# cricinfo_matchnotes_soup=scrape_cricinfo_match_notes(cricinfo_match_notes_url)

### Process cricinfo match notes to obtain day wise referral with outcome

In [4]:
def process_cricinfo_match_notes(cricinfo_matchnotes_soup):
    '''Extract the bulleted note lists that follow the "Match Notes" heading.

    cricinfo_matchnotes_soup: parsed Cricinfo scorecard page.

    Returns a plain list of the 'bulleted-list' <ul> elements found under the
    element two steps after the heading, in the reverse of their page order
    (presumably so that the first day comes first -- TODO confirm).
    '''
    heading = cricinfo_matchnotes_soup.find('h1', text='Match Notes')
    notes_container = heading.next_element.next
    day_lists = notes_container.find_all('ul', {'class': 'bulleted-list'})
    # Copy into a fresh list and flip the order.
    return list(day_lists)[::-1]
    
# all_days=process_cricinfo_match_notes(soup)

In [5]:
def _parse_review_note(review):
    '''Split one "Over X.Y: Review by ..." note into its fields.

    Returns a tuple (over, team, umpire, batsman, outcome) of strings.
    '''
    over = review.split('Over')[1].split(':')[0].strip()
    # Everything between the first and second colon carries the review details.
    detail = review.split(':')[1].strip()
    team = detail.split('by ')[1].split(',')[0]
    umpire = detail.split('Umpire - ')[1].split(',')[0]
    # "<batsman name> (<outcome>)"
    batsman_part = detail.split('Batsman -')[1].strip()
    batsman = batsman_part.split('(')[0].strip()
    outcome = batsman_part.split('(')[1].split(')')[0].strip()
    return over, team, umpire, batsman, outcome


def create_innings_df(all_days):
    '''Process the list of day wise reviews and return a neater dataframe.

    all_days: sequence of days, each an iterable of note items exposing
        ``.text``. Items whose text contains "innings" are treated as innings
        headers, items whose text starts with "Over" as review notes; all
        other items are ignored. Review notes that appear before the first
        innings header are discarded.

    Returns ``(innings_df, innings_list_updated)``:
        innings_df: one row per review with the columns 'innings', 'reviews',
            'Over', 'Review_team', 'Review_batsman', 'Review_umpire',
            'Review_outcome', 'Umpires_call' and 'index' (a running number
            that is also used as the dataframe index).
        innings_list_updated: the distinct short (<= 30 characters) innings
            headers, in reverse order of first appearance.

    Raises ValueError when the notes contain no innings header at all.
    '''
    innings_list = []
    day_wise_reviews = []
    for day in all_days:
        texts = [item.text for item in day]
        innings_list.append([t for t in texts if "innings" in t and len(t) <= 30])
        day_wise_reviews.extend(
            t for t in texts if t.startswith("Over") or "innings" in t)

    header_idxs = [i for i, text in enumerate(day_wise_reviews) if 'innings' in text]
    if not header_idxs:
        raise ValueError("No innings header found in the match notes")

    # Each header owns the notes up to the next header (or the end of the list).
    innings_reviews = defaultdict(list)
    segment_ends = header_idxs[1:] + [len(day_wise_reviews)]
    for start, end in zip(header_idxs, segment_ends):
        # extend, not assign: the same innings header can show up again (for
        # example when an innings spans several days) and assigning would
        # throw away the reviews collected under its earlier occurrence.
        innings_reviews[day_wise_reviews[start]].extend(day_wise_reviews[start + 1:end])

    idf = pd.DataFrame.from_dict(innings_reviews, orient='index')
    idf.reset_index(inplace=True)
    idf.fillna('', inplace=True)
    innings_df = idf.melt(id_vars='index', value_name='reviews')
    # The 'variable' column (position of the review within its innings) adds no value.
    innings_df.drop(columns='variable', inplace=True)
    innings_df.columns = ['innings', 'reviews']
    # Padding cells created by from_dict are empty strings; drop them. The
    # copy makes the column assignments below act on an independent frame
    # instead of on a slice of the melted one.
    innings_df = innings_df[innings_df.reviews != ''].copy()

    # Augment innings_df with the fields parsed out of each review note.
    parsed = [_parse_review_note(review) for review in innings_df.reviews]
    innings_df['Over'] = [fields[0] for fields in parsed]
    innings_df['Review_team'] = [fields[1] for fields in parsed]
    innings_df['Review_batsman'] = [fields[3] for fields in parsed]
    innings_df['Review_umpire'] = [fields[2] for fields in parsed]
    innings_df['Review_outcome'] = [fields[4] for fields in parsed]
    innings_df['Umpires_call'] = innings_df['Review_outcome'].apply(lambda x: "Umpire" in x)
    innings_df['index'] = range(len(innings_df.reviews))

    # Flatten the per-day header lists, drop repeats (keeping first
    # appearance) and flip the order.
    flattened = [name for day_names in innings_list for name in day_names]
    innings_list_updated = list(dict.fromkeys(flattened))
    innings_list_updated.reverse()

    innings_df.set_index('index', inplace=True, drop=False)
    return innings_df, innings_list_updated

# innings_df,innings_list=create_innings_df(all_days)

### Break in partnerships

In [6]:
def analyze_partnership_breaks(cricinfo_matchnotes_soup,innings_df,innings_list):
    '''Flag reviews that coincide with a fall of wicket and tabulate the wickets.

    cricinfo_matchnotes_soup: parsed Cricinfo scorecard page; its
        'wrap dnb' divs containing "Fall of wickets:" are read.
    innings_df: review dataframe with 'innings', 'Over' and 'index' columns;
        a boolean 'Partnership_broken' column is added to it in place.
    innings_list: innings names; walked in reverse and paired by position
        with the fall-of-wicket sections found on the page.

    Returns ``(innings_df, innings_fow_df)`` where innings_fow_df has the
    columns 'innings', 'active_partnership' (1-based wicket number) and
    'Over' (over at which that wicket fell).
    '''
    fow_sections = []
    for div in cricinfo_matchnotes_soup.find_all('div', {"class": "wrap dnb"}):
        if "Fall of wickets:" in div.text:
            fow_sections.append(div.text)

    innings_fow = defaultdict(list)
    for position, innings_name in enumerate(reversed(innings_list)):
        pieces = fow_sections[position].split(':')[1].strip().split(',')
        # Every second comma-separated piece looks like " <over> ov)"; keep
        # just the over number.
        innings_fow[innings_name] = [
            piece.split(')')[0].strip().split(' ')[0] for piece in pieces[1::2]
        ]

    # One row per innings, one column per wicket -> long format.
    fow_wide = pd.DataFrame.from_dict(innings_fow, orient='index')
    fow_wide = fow_wide.reset_index()
    fow_wide = fow_wide.fillna('')
    innings_fow_df = fow_wide.melt(id_vars='index', value_name='wickets')
    # Column positions start at 0; wicket numbers start at 1.
    innings_fow_df['variable'] = innings_fow_df['variable'] + 1
    innings_fow_df.columns = ['innings', 'active_partnership', 'Over']
    innings_fow_df = innings_fow_df[innings_fow_df.Over != '']

    # Reviews taken on the same over entry as a wicket are marked as
    # partnership-breaking.
    matched = pd.merge(innings_df, innings_fow_df, on=['innings', 'Over'], how='inner')
    innings_df['Partnership_broken'] = False
    innings_df.loc[matched['index'], 'Partnership_broken'] = True
    return innings_df, innings_fow_df

# innings_df_updated,innings_fow_df=analyze_partnership_breaks(soup,innings_df,innings_list)


In [7]:
def parse_cricbuzz_commentary(cricbuzz_match_url):
    '''Open a Cricbuzz match URL in Chrome and return the rendered page as soup.

    cricbuzz_match_url: URL of the Cricbuzz match page to load.

    Returns a BeautifulSoup object built from the browser's page source with
    the 'html.parser' backend.

    The browser is always shut down, even when loading or parsing fails.
    '''
    # NOTE(review): recent Selenium 4 releases dropped the executable_path
    # keyword in favour of a Service object -- confirm against the installed
    # Selenium version before upgrading.
    driver = webdriver.Chrome(executable_path=chromedriver)
    try:
        driver.get(cricbuzz_match_url)
        cricbuzz_match_soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Without this, a failed page load would leave a Chrome process behind.
        driver.quit()
    return cricbuzz_match_soup

# cricbuzz_match_url='https://www.cricbuzz.com/cricket-scores/20715/eng-vs-aus-1st-test-the-ashes-2019'

# cricbuzz_match_soup=parse_cricbuzz_commentary(cricbuzz_match_url)

In [8]:
def process_cricbuzz_commentary(cricbuzz_match_soup,innings_list):
    '''Process cricbuzz soup and create the ball-by-ball commentary dataframe.

    cricbuzz_match_soup: parsed Cricbuzz match page. Over labels are read
        from the 'cb-col cb-col-8 text-bold' spans and commentary lines from
        the 'cb-col cb-col-90 cb-com-ln' paragraphs; the two lists are paired
        by position.
    innings_list: innings names, one per innings segment in page order.

    An over label of '0.1' closes an innings segment. Adjacent '0.1' entries
    (presumably a re-bowled first delivery after a wide or no-ball -- TODO
    confirm) are treated as a single boundary, so they cannot shift the
    innings labels of everything that follows. Entries after the final '0.1'
    are not included in the result.

    Returns a dataframe with the columns 'innings', 'Over' and 'Commentary'.

    Raises IndexError when there are more innings segments than names in
    innings_list, or fewer commentary lines than over labels.
    '''
    commentary_text = [
        c.text for c in cricbuzz_match_soup.find_all('p', {'class': 'cb-col cb-col-90 cb-com-ln'})
    ]
    over_text = [
        o.text for o in cricbuzz_match_soup.find_all('span', {'class': 'cb-col cb-col-8 text-bold'})
    ]

    first_ball_idxs = [index for index, value in enumerate(over_text) if value == '0.1']
    # Keep only the last index of every run of adjacent '0.1' entries.
    inngs_breaks = [
        idx for pos, idx in enumerate(first_ball_idxs)
        if pos + 1 == len(first_ball_idxs) or first_ball_idxs[pos + 1] != idx + 1
    ]

    current_innings = []
    current_commentary = []
    current_over = []
    end = 0
    for i, b in enumerate(inngs_breaks):
        innings_name = innings_list[i]
        # Index the source lists directly instead of re-slicing them for
        # every ball.
        for position in range(end, b + 1):
            current_innings.append(innings_name)
            current_over.append(over_text[position])
            current_commentary.append(commentary_text[position])
        end = b + 1
    cricbuzz_commentary_df = pd.DataFrame(
        {'innings': current_innings, 'Over': current_over, 'Commentary': current_commentary})
    return cricbuzz_commentary_df

# cricbuzz_commentary_df=process_cricbuzz_commentary(cricbuzz_match_soup,innings_list)

### Compile all tables

In [9]:
# page_url='https://www.espncricinfo.com/scores/series/19430/season/2019/icc-world-test-championship'
# r1=requests.get(page_url)
# bs_main=BeautifulSoup(r1.text,'html.parser')
# urllist=[]
# for link in bs_main.find_all('a',href=True,text='SCORECARD'):
#     urllist.append('https://www.espncricinfo.com'+link['href'])
# pd.DataFrame(urllist).to_csv('url.csv')

In [10]:
def compile_referral_data(cricinfo_match_notes_url,cricbuzz_match_url):
    '''Compile all data from given cricinfo match notes url and cricbuzz match URL.

    cricinfo_match_notes_url: Cricinfo scorecard URL (source of the review
        notes and the fall-of-wicket data).
    cricbuzz_match_url: Cricbuzz match URL (source of the ball-by-ball
        commentary); its last path segment is stored in the 'match' column.

    Returns a dataframe with one row per commentary entry, carrying the
    active partnership number and, where a review was taken on that entry,
    the review details. Missing values are replaced by empty strings.

    Raises ValueError when no commentary entry matches any fall of wicket.
    '''
    cricinfo_matchnotes_soup = scrape_cricinfo_match_notes(cricinfo_match_notes_url)
    cricbuzz_match_soup = parse_cricbuzz_commentary(cricbuzz_match_url)
    all_days = process_cricinfo_match_notes(cricinfo_matchnotes_soup)
    innings_df, innings_list = create_innings_df(all_days)
    innings_df_updated, innings_fow_df = analyze_partnership_breaks(
        cricinfo_matchnotes_soup, innings_df, innings_list)
    cricbuzz_commentary_df = process_cricbuzz_commentary(cricbuzz_match_soup, innings_list)

    # Attach the wicket number to the entry on which it fell, then carry it
    # down to the rows that follow until the next wicket entry.
    innings_state = pd.merge(
        cricbuzz_commentary_df, innings_fow_df, on=['innings', 'Over'], how='left').ffill()

    # Rows above the first matched wicket have nothing to fill from; give
    # them the next partnership number.
    has_partnership = ~pd.isnull(innings_state.active_partnership)
    last_fow_idx = min(innings_state[has_partnership].index)
    if last_fow_idx >= 1:
        last_fow_val = innings_state.loc[last_fow_idx, 'active_partnership'] + 1
        # Single .loc call with row and column: the former
        # .loc[rows]['active_partnership'] = ... wrote to a temporary object
        # and left innings_state unchanged.
        innings_state.loc[0:last_fow_idx - 1, 'active_partnership'] = last_fow_val

    reviews_match = pd.merge(innings_state, innings_df_updated, how='left')
    reviews_match.fillna('', inplace=True)
    reviews_match['match'] = cricbuzz_match_url.split('/')[-1]

    return reviews_match
    
    

In [11]:
# Table of match URLs; the commented-out driver cells further down read its
# 'Cricinfo_URL' and 'Cricbuzz_URL' columns.
url_list = pd.read_csv('URLs.csv')

### Store referral data

In [12]:
# for row in range(url_list.shape[0]):
#     if row>=7:
#         cricinfo_match_notes_url=(url_list.iloc[row]['Cricinfo_URL'])
#         cricbuzz_match_url=(url_list.iloc[row]['Cricbuzz_URL'])
#         print("Match",cricbuzz_match_url.split('/')[-1])
#         reviews_match=compile_referral_data(cricinfo_match_notes_url,cricbuzz_match_url)
#         reviews_match.to_csv('data/reviews_match_{0}.csv'.format(cricbuzz_match_url.split('/')[-1]),index=False)
#         print("Match parsed",cricbuzz_match_url.split('/')[-1])

In [13]:
def compile_referral_data_debug(cricinfo_match_notes_url,cricbuzz_match_url):
    '''Run only the scraping and review-parsing stages and return every intermediate.

    Takes the same two URLs as compile_referral_data but stops after
    create_innings_df, so each stage's output can be inspected.

    Returns ``(cricinfo soup, cricbuzz soup, day-wise notes, review
    dataframe, innings list)``.
    '''
    notes_soup = scrape_cricinfo_match_notes(cricinfo_match_notes_url)
    commentary_soup = parse_cricbuzz_commentary(cricbuzz_match_url)
    day_notes = process_cricinfo_match_notes(notes_soup)
    reviews_df, innings_names = create_innings_df(day_notes)
    return notes_soup, commentary_soup, day_notes, reviews_df, innings_names
    

### Run step by step to avoid blocking of IPs

In [14]:
# row=0
# cricinfo_match_notes_url=(url_list.iloc[row]['Cricinfo_URL'])
# cricbuzz_match_url=(url_list.iloc[row]['Cricbuzz_URL'])

# reviews_match=compile_referral_data(cricinfo_match_notes_url,cricbuzz_match_url)
# reviews_match.to_csv('data/reviews_match_{0}.csv'.format(cricbuzz_match_url.split('/')[-1]),index=False)
# print("Match parsed",cricbuzz_match_url.split('/')[-1])

# ##FOR DEBUGGING USE BELOW SNIPPET


# #cricinfo_matchnotes_soup,cricbuzz_match_soup,all_days,innings_df,innings_list=compile_referral_data_debug(cricinfo_match_notes_url,cricbuzz_match_url)

Fixes needed:
- Active partnership is sometimes incremented an extra time, due to "retired not out" entries, etc.