In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
matches_raw_dataset =  pd.read_csv("../datasets/matches.csv")
deliveries_raw_dataset =  pd.read_csv("../datasets/deliveries.csv")

In [3]:
deliveries_raw_dataset.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,extras_type,is_wicket,player_dismissed,dismissal_kind,fielder
0,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,legbyes,0,,,
1,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
2,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,wides,0,,,
3,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,
4,335982,1,Kolkata Knight Riders,Royal Challengers Bangalore,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,,0,,,


# Creating Players Dataset

In [4]:
all_players = pd.concat([
    deliveries_raw_dataset['batter'],
    deliveries_raw_dataset['bowler'],
    deliveries_raw_dataset['non_striker']
])
unique_players = all_players.dropna().unique()

players_df = pd.DataFrame({
    'player_id': range(1, len(unique_players) + 1),
    'player_name': unique_players,
    'country': None,
    'played_international':None,
    'ipl_time_span':None,
    'no_of_ipl_matches':None,
    'bowling_style':None,
    'batting_style':None,
    'player_role':None
})


players_df.head()

Unnamed: 0,player_id,player_name,country,played_international,ipl_time_span,no_of_ipl_matches,bowling_style,batting_style,player_role
0,1,SC Ganguly,,,,,,,
1,2,BB McCullum,,,,,,,
2,3,RT Ponting,,,,,,,
3,4,DJ Hussey,,,,,,,
4,5,Mohammad Hafeez,,,,,,,


# Scraping Player Data

In [5]:
import requests
from bs4 import BeautifulSoup

In [6]:
# Define the URL
url = "https://www.espncricinfo.com/records/trophy/indian-premier-league-117"

# Send a GET request
response = requests.get(url)
if response.status_code == 200:
    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the "Player averages" section by its title
    player_averages_section = soup.find('span', string='Player averages')
    if player_averages_section:
        # Locate the parent container
        parent_div = player_averages_section.find_parent('div', class_="ds-w-full ds-bg-fill-content-prime ds-overflow-hidden ds-rounded-xl ds-border ds-border-line ds-mb-2")
        if parent_div:
            # Find all links within the section
            links = parent_div.find_all('a', href=True)
            
            # Prepare data for DataFrame
            team_links = []
            for link in links:
                team_name = link.find('span').text.strip() if link.find('span') else None
                full_link = "https://www.espncricinfo.com" + link['href']
                if team_name:  # Avoid empty team names
                    team_links.append({'Team': team_name, 'Link': full_link})
            
            # Create a DataFrame
            team_links_df = pd.DataFrame(team_links)
        else:
            print("Could not locate the parent container for 'Player averages'.")
    else:
        print("'Player averages' section not found.")
else:
    print(f"Failed to fetch the webpage. Status code: {response.status_code}")



In [7]:
team_links_df.head()

Unnamed: 0,Team,Link
0,Chennai Super Kings,https://www.espncricinfo.com/records/trophy/av...
1,Deccan Chargers,https://www.espncricinfo.com/records/trophy/av...
2,Delhi Daredevils,https://www.espncricinfo.com/records/trophy/av...
3,Gujarat Lions,https://www.espncricinfo.com/records/trophy/av...
4,Gujarat Titans,https://www.espncricinfo.com/records/trophy/av...


In [17]:
def get_data_from_table(response):
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', {'class': 'ds-w-full ds-table ds-table-xs ds-table-auto ds-w-full ds-overflow-scroll ds-scrollbar-hide'})
    
    # Check if the table exists
    if not table:
        return pd.DataFrame(columns=['Player', 'Span', 'Mat'])
    
    # Extract table rows
    rows = table.find('tbody').find_all('tr')
    
    # Extract required data
    data = []
    for row in rows:
        cols = row.find_all('td')
        player = cols[0].text.strip()
        span = cols[1].text.strip()
        mat = cols[2].text.strip()
        data.append([player, span, mat])
    
    return pd.DataFrame(data, columns=['Player', 'Span', 'Mat'])

# Master DataFrame to collect all data
master_df = pd.DataFrame(columns=['Player', 'Span', 'Mat', 'Team'])

# Loop through each team's link
for index, row in team_links_df.iterrows():
    team = row['Team']
    url = row['Link']
    
    # Fetch the webpage
    response = requests.get(url)
    
    # Extract data from the table
    team_df = get_data_from_table(response)
    
    # Add the team name to the DataFrame
    team_df['Team'] = team
    
    # Append to the master DataFrame
    master_df = pd.concat([master_df, team_df], ignore_index=True)

# Display the consolidated DataFrame
print(master_df)

                 Player       Span Mat                 Team
0            P Amarnath  2008-2008   6  Chennai Super Kings
1           Akash Singh  2023-2023   6  Chennai Super Kings
2                MM Ali  2021-2024  48  Chennai Super Kings
3            S Anirudha  2008-2013  19  Chennai Super Kings
4       KB Arun Karthik  2010-2010   1  Chennai Super Kings
...                 ...        ...  ..                  ...
2767          DA Warner  2014-2021  95  Sunrisers Hyderabad
2768  Washington Sundar  2022-2024  18  Sunrisers Hyderabad
2769           CL White  2013-2013  13  Sunrisers Hyderabad
2770      KS Williamson  2015-2022  76  Sunrisers Hyderabad
2771       Yuvraj Singh  2016-2017  22  Sunrisers Hyderabad

[2772 rows x 4 columns]


In [27]:
master_df_cleaned = master_df.drop_duplicates(subset=['Player', 'Span', 'Mat'], keep='first')
print(master_df_cleaned)
master_df_cleaned.to_csv('../datasets/players_summary_dataset.csv', index=False)

                 Player       Span Mat                 Team
0            P Amarnath  2008-2008   6  Chennai Super Kings
1           Akash Singh  2023-2023   6  Chennai Super Kings
2                MM Ali  2021-2024  48  Chennai Super Kings
3            S Anirudha  2008-2013  19  Chennai Super Kings
4       KB Arun Karthik  2010-2010   1  Chennai Super Kings
...                 ...        ...  ..                  ...
1381          DA Warner  2014-2021  95  Sunrisers Hyderabad
1382  Washington Sundar  2022-2024  18  Sunrisers Hyderabad
1383           CL White  2013-2013  13  Sunrisers Hyderabad
1384      KS Williamson  2015-2022  76  Sunrisers Hyderabad
1385       Yuvraj Singh  2016-2017  22  Sunrisers Hyderabad

[1386 rows x 4 columns]


In [28]:
# Iterate through each row in master_df_cleaned
for index, row in master_df_cleaned.iterrows():
    player = row["Player"]  # Get the player name
    span = row["Span"]      # Get the Span
    mat = row["Mat"]        # Get the number of matches
    
    # Check if the player exists in players_df
    if player in players_df["player_name"].values:
        # Locate the matching row in players_df
        player_index = players_df[players_df["player_name"] == player].index[0]
        
        # Update the relevant columns in players_df
        players_df.at[player_index, "ipl_time_span"] = span
        players_df.at[player_index, "no_of_ipl_matches"] = mat

# Display updated players_df
print(players_df)


     player_id      player_name country played_international ipl_time_span  \
0            1       SC Ganguly    None                 None     2011-2012   
1            2      BB McCullum    None                 None     2018-2018   
2            3       RT Ponting    None                 None     2013-2013   
3            4        DJ Hussey    None                 None     2008-2010   
4            5  Mohammad Hafeez    None                 None     2008-2008   
..         ...              ...     ...                  ...           ...   
727        728      M Siddharth    None                 None     2024-2024   
728        729         MP Yadav    None                 None     2024-2024   
729        730         S Joseph    None                 None     2024-2024   
730        731       N Thushara    None                 None     2024-2024   
731        732      V Kaverappa    None                 None     2024-2024   

    no_of_ipl_matches bowling_style batting_style player_role  

In [29]:
players_df.to_csv('../datasets/players_dataset.csv', index=False)