## Web Scraping from cricinfo

- Reference article: https://medium.com/swlh/web-scraping-cricinfo-data-c134fce79a33

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np

In [53]:
def extract_batting_data(series_id, match_id, timeout=30):
    """Scrape the batting scorecards of one match from ESPNcricinfo.

    The scorecard page is downloaded and parsed, then one record is built per
    batsman found in the first two batting tables, followed by one "DNB"
    record per name listed in the first two ``<tfoot>`` sections.

    Parameters
    ----------
    series_id, match_id
        Identifiers inserted into the scorecard URL (converted with ``str``).
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name, Desc, Runs, Balls, 4s, 6s, SR, Team``. Scraped values
        are kept as text; rows without statistics (short rows and DNB
        entries) carry integer ``0`` placeholders. ``Team`` is 1 for the
        first table/footer and 2 for the second.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    URL = 'https://www.espncricinfo.com/series/' + str(series_id) + '/scorecard/' + str(match_id)
    print(URL)
    page = requests.get(URL, timeout=timeout)
    # Fail loudly on an error page instead of silently returning an empty frame.
    page.raise_for_status()
    bs = BeautifulSoup(page.content, 'lxml')

    columns = ["Name", "Desc", "Runs", "Balls", "4s", "6s", "SR", "Team"]
    # Rows are collected in a plain list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    records = []

    def clean_name(raw):
        # Drop the "(c)" captain marker and everything after it, then collapse
        # any run of non-word characters (e.g. the keeper dagger) to a space.
        return re.sub(r"\W+", ' ', raw.split("(c)")[0]).strip()

    # Every second <tbody> starting at index 0, at most two of them.
    for team, table in enumerate(bs.find_all('tbody')[0:4:2], start=1):
        # Only every second <tr> is read; the rows in between are skipped.
        for row in table.find_all('tr')[::2]:
            cols = [td.text.strip() for td in row.find_all('td')]
            # Rows with fewer than two cells cannot supply Name and Desc.
            if len(cols) < 2 or cols[0] == 'Extras':
                continue
            if len(cols) > 7:
                # Index 4 is deliberately skipped (a hidden cell in the markup).
                stats = [cols[2], cols[3], cols[5], cols[6], cols[7]]
            else:
                stats = [0, 0, 0, 0, 0]
            records.append([clean_name(cols[0]), cols[1]] + stats + [team])

    # "Did not bat" names: every <span> inside every <div> of the first two
    # <tfoot> elements. Slicing copes with pages that have fewer than two.
    for team, tfoot in enumerate(bs.find_all("tfoot")[:2], start=1):
        for div in tfoot.find_all("div"):
            for span in div.find_all('span'):
                name = clean_name(span.text.strip())
                if name:
                    records.append([name, "DNB", 0, 0, 0, 0, 0, team])

    return pd.DataFrame(records, columns=columns)

In [55]:
# Scrape the batting scorecard for series 8048 / match 1136561 and preview
# the first two rows.
dff = extract_batting_data(series_id = 8048, match_id = 1136561)
dff.head(2)

https://www.espncricinfo.com/series/8048/scorecard/1136561


Unnamed: 0,Name,Desc,Runs,Balls,4s,6s,SR,Team
0,Rohit Sharma,c Rayudu b Watson,15,18,1,1,83.33,1
1,Evin Lewis,lbw b Chahar,0,2,0,0,0.0,1


In [54]:
def extract_bowling_data(series_id, match_id, timeout=30):
    """Scrape the bowling figures of one match from ESPNcricinfo.

    The scorecard page is downloaded and parsed, then one record is built per
    row of the first two bowling tables.

    Parameters
    ----------
    series_id, match_id
        Identifiers inserted into the scorecard URL (converted with ``str``).
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name, Overs, Maidens, Runs, Wickets, Econ, Dots, 4s, 6s,
        Wd, Nb, Team``. Scraped values are kept as text. ``Team`` is 2 for
        rows of the first bowling table and 1 for the second.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    URL = 'https://www.espncricinfo.com/series/' + str(series_id) + '/scorecard/' + str(match_id)
    page = requests.get(URL, timeout=timeout)
    # Fail loudly on an error page instead of silently returning an empty frame.
    page.raise_for_status()
    bs = BeautifulSoup(page.content, 'lxml')

    columns = ['Name', 'Overs', 'Maidens', 'Runs', 'Wickets',
               'Econ', 'Dots', '4s', '6s', 'Wd', 'Nb', 'Team']
    n_scraped = len(columns) - 1  # every column except Team comes from the page

    # Rows are collected in a plain list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    records = []
    # Every second <tbody> starting at index 1, at most two of them.
    for i, table in enumerate(bs.find_all('tbody')[1:4:2]):
        # The first bowling table is labelled team 2 and the second team 1
        # (presumably because the bowlers belong to the side not batting in
        # that innings — confirm against the batting labels).
        team = 2 if i == 0 else 1
        for row in table.find_all('tr'):
            cols = [td.text.strip() for td in row.find_all('td')]
            # Skip rows that do not carry a full set of bowling figures.
            if len(cols) < n_scraped:
                continue
            records.append(cols[:n_scraped] + [team])

    return pd.DataFrame(records, columns=columns)
    

In [56]:
# Scrape the bowling figures for the same match and preview the first two
# rows. Note: this rebinds `dff`, replacing the batting frame assigned earlier.
dff = extract_bowling_data(series_id = 8048, match_id = 1136561)
dff.head(2)

Unnamed: 0,Name,Overs,Maidens,Runs,Wickets,Econ,Dots,4s,6s,Wd,Nb,Team
0,Deepak Chahar,3,0,14,1,4.66,12,2,0,0,0,2
1,Shane Watson,4,0,29,2,7.25,12,1,2,1,0,2


In [None]:
# Download and parse one scorecard page so its markup can be explored
# interactively in the cells below.
series_id, match_id = 8048, 1136561
URL = ''.join([
    'https://www.espncricinfo.com/series/',
    str(series_id),
    '/scorecard/',
    str(match_id),
])
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'lxml')

In [23]:
# Collect every <tbody> element of the parsed page.
table_body=soup.find_all('tbody')

In [24]:
# Number of <tbody> elements found on the page.
len(table_body)

9

In [25]:
# Elements 0 and 2 of table_body — the same slice extract_batting_data iterates over.
table_body[0:4:2]

[<tbody><tr><td class="batsman-cell text-truncate out"><a class="small" data-hover="" href="/player/rohit-sharma-34102" rel="" target="_self" title="View full profile of Rohit Sharma">Rohit Sharma<!-- --> <span>(c)</span></a></td><td class="text-left"><span class="cursor-pointer"><i class="espn-icon icon-caret-sm2-down-after icon-sm text-danger font-weight-bold small pr-1" id="caret-0"></i>c Rayudu b Watson</span></td><td class="font-weight-bold">15</td><td>18</td><td style="display:none">-</td><td>1</td><td>1</td><td>83.33</td></tr><tr><td class="p-0 border-0 d-none out" colspan="9"></td></tr><tr><td class="batsman-cell text-truncate out"><a class="small" data-hover="" href="/player/evin-lewis-431901" rel="" target="_self" title="View full profile of Evin Lewis">Evin Lewis<!-- --> </a></td><td class="text-left"><span class="cursor-pointer"><i class="espn-icon icon-caret-sm2-down-after icon-sm text-danger font-weight-bold small pr-1" id="caret-1"></i>lbw b Chahar</span></td><td class="

In [27]:
# Take the first <tbody> and list all of its <tr> rows.
table = table_body[0]
rows = table.find_all('tr')

In [51]:
# First <a> inside the first <td> of rows[4]; in the recorded output this is
# the player-profile link.
rows[4].find_all('td')[0].find_all('a')[0]

<a class="small" data-hover="" href="/player/ishan-kishan-720471" rel="" target="_self" title="View full profile of Ishan Kishan">Ishan Kishan<!-- --> <span>†</span></a>

In [28]:

#link = a_38['href']

13

In [19]:
# NOTE(review): a plain string passed to find_all is matched against tag
# names, so this looks for <match-body> elements. If a CSS class was meant,
# class_='match-body' would be needed — confirm the intent.
match_body=soup.find_all('match-body')

In [21]:
# NOTE(review): this searches for tags named <teams>; the recorded output is
# an empty list. If a CSS class was meant, class_='teams' would be needed —
# confirm the intent.
teams = soup.find_all('teams')
teams

[]