In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np

In [13]:
def get_batting_records(pages):
    batsmen_df = pd.DataFrame(columns=["name","team","span","matches","innings","not_out","runs", "highest_score", 
                                       "average", "balls_faced","strike_rate","100s","50s","duck","4s","6s"])
    for page_num in range(1, pages+1):
        URL = 'https://www.espncricinfo.com/records/trophy/batting-most-runs-career/indian-premier-league-117'
        page = requests.get(URL)
        bs = BeautifulSoup(page.content, 'lxml')

        table_body = bs.find_all('tbody')
        
        for i, table in enumerate(table_body[0:4:2]):
            rows = table.find_all('tr')
            for row in rows:
                cols = row.find_all('td')
                cols = [x.text.strip() for x in cols]

                # Pre-processing for name and team
                name = cols[0].split("(")[0].strip()
                team = cols[0].split("(")[1].strip()[:-1]

                new_row = pd.Series([name, team, cols[1], cols[2], cols[3], cols[4], cols[5], cols[6], cols[7], 
                                     cols[8], cols[9], cols[10], cols[11], cols[12], cols[13], cols[14]], 
                                    index=batsmen_df.columns)
                
                batsmen_df = pd.concat([batsmen_df, new_row.to_frame().T], ignore_index=True)
                
    return batsmen_df

# Example usage
# pages = 1  # Set the number of pages you want to scrape
# df = get_batting_records(pages)
# print(df)


In [5]:
batting_records = get_batting_records(40)

In [6]:
batting_records


Unnamed: 0,name,team,span,matches,innings,not_out,runs,highest_score,average,balls_faced,strike_rate,100s,50s,duck,4s,6s
0,V Kohli,RCB,2008-2024,252,244,37,8004,113*,38.66,6065,131.97,8,55,10,705,272
1,S Dhawan,DC/DCH/MI/PBKS/SRH,2008-2024,222,221,29,6769,106*,35.25,5324,127.14,2,51,11,768,152
2,RG Sharma,DCH/MI,2008-2024,257,252,29,6628,109*,29.72,5054,131.14,2,43,17,599,280
3,DA Warner,DC/SRH,2009-2024,184,184,22,6565,126,40.52,4697,139.77,4,62,11,663,236
4,SK Raina,CSK/GL,2008-2021,205,200,30,5528,100*,32.51,4043,136.73,1,39,8,506,203
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,Gurkeerat Singh,KXIP/RCB,2012-2020,41,32,8,511,65,21.29,422,121.09,0,2,4,55,11
5996,OA Shah,KKR/Kochi/RR,2010-2013,23,22,7,506,76,33.73,389,130.07,0,4,0,34,23
5997,PC Valthaty,KXIP/RR,2009-2013,23,23,1,505,120*,22.95,418,120.81,1,2,1,60,20
5998,SW Billings,CSK/DC/KKR,2016-2022,30,27,1,503,56,19.34,388,129.63,0,3,3,40,20


In [12]:
def get_bowling_records(pages):
    bowler_df = pd.DataFrame(columns=["name","team","span","matches","innings","overs","maidens","runs","wickets",
                                      "best_bowling_figure","average","economy","strike_rate","4-fer","5-fer"])
    
    for page_num in range(1, pages + 1):
        URL = 'https://www.espncricinfo.com/records/trophy/bowling-most-wickets-career/indian-premier-league-117'
        page = requests.get(URL)
        bs = BeautifulSoup(page.content, 'lxml')

        table_body = bs.find_all('tbody')
        
        for i, table in enumerate(table_body[0:4:2]):
            rows = table.find_all('tr')
            for row in rows:
                cols = row.find_all('td')
                cols = [x.text.strip() for x in cols]

                # Pre-processing for name and team
                name = cols[0].split("(")[0].strip()
                team = cols[0].split("(")[1].strip()[:-1]

                new_row = pd.Series([name, team, cols[1], cols[2], cols[3], cols[4], cols[5], cols[6], cols[7], 
                                     cols[8], cols[9], cols[10], cols[11], cols[12], cols[13]], 
                                    index=bowler_df.columns)
                
                bowler_df = pd.concat([bowler_df, new_row.to_frame().T], ignore_index=True)
                
    return bowler_df

# Example usage
# pages = 1  # Set the number of pages you want to scrape
# df = get_bowling_records(pages)
# print(df)


In [8]:
bowling_records = get_bowling_records(40)

In [9]:
bowling_records

Unnamed: 0,name,team,span,matches,innings,overs,maidens,runs,wickets,best_bowling_figure,average,economy,strike_rate,4-fer,5-fer
0,YS Chahal,MI/RCB/RR,2013-2024,160,159,3521,586.5,4,4602,205,5/40,22.44,7.84,17.17,6
1,PP Chawla,CSK/KKR/KXIP/MI,2008-2024,192,191,3850,641.4,2,5108,192,4/17,26.60,7.96,20.05,2
2,DJ Bravo,CSK/GL/MI,2008-2022,161,158,3119,519.5,3,4360,183,4/22,23.82,8.38,17.04,2
3,B Kumar,PWI/SRH,2011-2024,176,176,3910,651.4,14,4929,181,5/19,27.23,7.56,21.60,2
4,SP Narine,KKR,2012-2024,177,175,4075,679.1,3,4571,180,5/19,25.39,6.73,22.63,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,MR Marsh,DC/DCH/PWI/RPS/SRH,2010-2024,42,34,560,93.2,1,795,37,4/25,21.48,8.51,15.13,2
3996,M Markande,MI/RR/SRH,2018-2024,37,37,720,120.0,0,1069,37,4/15,28.89,8.90,19.45,2
3997,MS Gony,CSK/DCH/GL/KXIP,2008-2017,44,44,888,148.0,3,1287,37,3/31,34.78,8.69,24.00,0
3998,GJ Maxwell,DC/KXIP/MI/RCB,2012-2024,134,79,944,157.2,1,1303,37,2/15,35.21,8.28,25.51,0


In [10]:
from pathlib import Path
filepath_bowling = Path('bowling_records.csv') 
filepath_batting = Path('batting_records.csv') 

bowling_records.to_csv(filepath_bowling,index=False)
batting_records.to_csv(filepath_batting,index=False)

In [11]:
batting_records.head(200)


Unnamed: 0,name,team,span,matches,innings,not_out,runs,highest_score,average,balls_faced,strike_rate,100s,50s,duck,4s,6s
0,V Kohli,RCB,2008-2024,252,244,37,8004,113*,38.66,6065,131.97,8,55,10,705,272
1,S Dhawan,DC/DCH/MI/PBKS/SRH,2008-2024,222,221,29,6769,106*,35.25,5324,127.14,2,51,11,768,152
2,RG Sharma,DCH/MI,2008-2024,257,252,29,6628,109*,29.72,5054,131.14,2,43,17,599,280
3,DA Warner,DC/SRH,2009-2024,184,184,22,6565,126,40.52,4697,139.77,4,62,11,663,236
4,SK Raina,CSK/GL,2008-2021,205,200,30,5528,100*,32.51,4043,136.73,1,39,8,506,203
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,SR Tendulkar,MI,2008-2013,78,78,11,2334,100*,34.83,1948,119.81,1,13,4,295,29
196,RA Tripathi,KKR/RPS/RR/SRH,2017-2024,95,93,11,2236,93,27.26,1605,139.31,0,12,4,223,84
197,R Dravid,RCB/RR,2008-2013,89,82,5,2174,75*,28.23,1882,115.51,0,11,3,268,28
198,KS Williamson,GT/SRH,2015-2024,79,77,17,2128,89,35.46,1694,125.61,0,18,2,185,64
