In [3]:
import requests
from bs4 import BeautifulSoup

def get_player_info(player_id):
    """
    Scrapes player information from Cricbuzz profile page
    
    Args:
        player_id (str): Player ID from Cricbuzz URL
        
    Returns:
        dict: Dictionary containing role, batting style and bowling style
    """
    
    # Construct the URL
    url = f"https://www.cricbuzz.com/profiles/{player_id}"
    
    try:
        # Send GET request
        response = requests.get(url)
        response.raise_for_status()  # Raise exception for bad status codes
        
        # Parse HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find the div containing personal information
        info_div = soup.find('div', class_='cb-hm-rght')
        
        # Initialize dictionary to store player info
        player_info = {}
        
        # Extract role, batting style and bowling style
        div_elements = info_div.find_all('div', class_='cb-col')
        
        for i in range(len(div_elements)-1):
            current_div = div_elements[i]
            next_div = div_elements[i+1]
            
            if 'Role' in current_div.text:
                player_info['role'] = next_div.text.strip()
            elif 'Batting Style' in current_div.text:
                player_info['batting_style'] = next_div.text.strip()
            elif 'Bowling Style' in current_div.text:
                player_info['bowling_style'] = next_div.text.strip()
                
        return player_info
        
    except requests.RequestException as e:
        print(f"Error fetching data: {e}")
        return None
    except Exception as e:
        print(f"Error processing data: {e}")
        return None

# Example usage
if __name__ == "__main__":
    player_id = "11813"
    player_info = get_player_info(player_id)
    
    if player_info:
        print("Player Information:")
        print(f"Role: {player_info.get('role', 'Not found')}")
        print(f"Batting Style: {player_info.get('batting_style', 'Not found')}")
        print(f"Bowling Style: {player_info.get('bowling_style', 'Not found')}")

Player Information:
Role: Batsman
Batting Style: Right Handed Bat
Bowling Style: Right-arm offbreak


In [1]:
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
import time

# MongoDB Atlas connection
client = MongoClient("mongodb+srv://harsh8423:8423047004@cluster0.1xbklyu.mongodb.net/cricket")
db = client["cricket"]
players_collection = db["players"]

def get_player_info(player_id):
    """
    Scrapes player information from Cricbuzz profile page
    """
    url = f"https://www.cricbuzz.com/profiles/{player_id}"
    
    try:
        response = requests.get(url)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.content, 'html.parser')
        info_div = soup.find('div', class_='cb-hm-rght')
        
        if not info_div:
            print(f"No information found for player {player_id}")
            return None
            
        player_info = {}
        div_elements = info_div.find_all('div', class_='cb-col')
        
        for i in range(len(div_elements)-1):
            current_div = div_elements[i]
            next_div = div_elements[i+1]
            
            if 'Role' in current_div.text:
                player_info['role'] = next_div.text.strip()
            elif 'Batting Style' in current_div.text:
                player_info['batting_style'] = next_div.text.strip()
            elif 'Bowling Style' in current_div.text:
                player_info['bowling_style'] = next_div.text.strip()
                
        return player_info
        
    except requests.RequestException as e:
        print(f"Error fetching data for player {player_id}: {e}")
        return None
    except Exception as e:
        print(f"Error processing data for player {player_id}: {e}")
        return None

def update_all_players_info():
    """
    Updates player information in MongoDB for all existing players in the database
    """
    print("Starting update for all players in database")
    
    # Get all player IDs from database
    all_players = list(players_collection.find({}, {"_id": 1, "name": 1}))
    total_players = len(all_players)
    
    print(f"Found {total_players} players in database")
    
    # Counter for successful updates
    updates_count = 0
    failed_count = 0
    
    for index, player in enumerate(all_players, 1):
        player_id = player['_id']
        player_name = player.get('name', 'Unknown')
        
        print(f"\nProcessing {index}/{total_players}: {player_name} (ID: {player_id})")
        
        # Get player info from Cricbuzz
        player_info = get_player_info(str(player_id))
        
        if player_info:
            try:
                # Update player document
                result = players_collection.update_one(
                    {"_id": player_id},
                    {"$set": {
                        "role": player_info.get('role', ''),
                        "batting_style": player_info.get('batting_style', ''),
                        "bowling_style": player_info.get('bowling_style', '')
                    }}
                )
                
                if result.modified_count > 0:
                    print(f"✓ Updated {player_name}")
                    updates_count += 1
                else:
                    print(f"- No changes needed for {player_name}")
                    
            except Exception as e:
                print(f"× Error updating MongoDB for {player_name}: {e}")
                failed_count += 1
        else:
            print(f"× No Cricbuzz data found for {player_name}")
            failed_count += 1
            
        # Add a small delay to avoid overwhelming the server
        time.sleep(1)
    
    print("\nUpdate complete!")
    print(f"Total players processed: {total_players}")
    print(f"Successfully updated: {updates_count}")
    print(f"Failed updates: {failed_count}")

# Example usage
if __name__ == "__main__":
    update_all_players_info()

Starting update for all players in database
Found 810 players in database

Processing 1/810: Ruturaj Gaikwad (ID: 11813)
- No changes needed for Ruturaj Gaikwad

Processing 2/810: Andre Siddarth C (ID: 1427488)
- No changes needed for Andre Siddarth C

Processing 3/810: Shaik Rasheed (ID: 22571)
- No changes needed for Shaik Rasheed

Processing 4/810: Rahul Tripathi (ID: 9012)
- No changes needed for Rahul Tripathi

Processing 5/810: Shivam Dube (ID: 11195)
- No changes needed for Shivam Dube

Processing 6/810: Rachin Ravindra (ID: 11177)
- No changes needed for Rachin Ravindra

Processing 7/810: Deepak Hooda (ID: 9427)
- No changes needed for Deepak Hooda

Processing 8/810: Vijay Shankar (ID: 8204)
- No changes needed for Vijay Shankar

Processing 9/810: Ramakrishna Ghosh (ID: 32835)
- No changes needed for Ramakrishna Ghosh

Processing 10/810: Ravindra Jadeja (ID: 587)
- No changes needed for Ravindra Jadeja

Processing 11/810: Anshul Kamboj (ID: 14598)
- No changes needed for Anshul

In [2]:
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
import time

# MongoDB Atlas connection
client = MongoClient("mongodb+srv://harsh8423:8423047004@cluster0.1xbklyu.mongodb.net/cricket")
db = client["cricket"]
players_collection = db["players"]

def get_player_info(player_id):
    """
    Scrapes player information from Cricbuzz profile page with added delay for HTML loading
    """
    url = f"https://www.cricbuzz.com/profiles/{player_id}"
    
    try:
        response = requests.get(url)
        response.raise_for_status()
        
        # Wait for 3 seconds to ensure HTML is loaded
        time.sleep(3)
        
        soup = BeautifulSoup(response.content, 'html.parser')
        info_div = soup.find('div', class_='cb-hm-rght')
        
        if not info_div:
            print(f"No information found for player {player_id}")
            return None
            
        player_info = {}
        div_elements = info_div.find_all('div', class_='cb-col')
        
        for i in range(len(div_elements)-1):
            current_div = div_elements[i]
            next_div = div_elements[i+1]
            
            if 'Role' in current_div.text:
                player_info['role'] = next_div.text.strip()
            elif 'Batting Style' in current_div.text:
                player_info['batting_style'] = next_div.text.strip()
            elif 'Bowling Style' in current_div.text:
                player_info['bowling_style'] = next_div.text.strip()
                
        return player_info
        
    except requests.RequestException as e:
        print(f"Error fetching data for player {player_id}: {e}")
        return None
    except Exception as e:
        print(f"Error processing data for player {player_id}: {e}")
        return None

def update_missing_info_players():
    """
    Updates information only for players with missing or empty role/batting_style
    """
    print("Starting update for players with missing information")
    
    # Find players with missing information
    query = {
        "$or": [
            {"role": {"$in": ["--", "", None]}},
            {"batting_style": {"$in": ["--", "", None]}},
            {"role": {"$exists": False}},
            {"batting_style": {"$exists": False}}
        ]
    }
    
    players_to_update = list(players_collection.find(query, {"_id": 1, "name": 1, "role": 1, "batting_style": 1}))
    total_players = len(players_to_update)
    
    print(f"Found {total_players} players with missing information")
    
    # # Counter for successful updates
    # updates_count = 0
    # failed_count = 0
    
    # for index, player in enumerate(players_to_update, 1):
    #     player_id = player['_id']
    #     player_name = player.get('name', 'Unknown')
    #     current_role = player.get('role', 'Not set')
    #     current_batting_style = player.get('batting_style', 'Not set')
        
    #     print(f"\nProcessing {index}/{total_players}: {player_name} (ID: {player_id})")
    #     print(f"Current Role: {current_role}")
    #     print(f"Current Batting Style: {current_batting_style}")
        
    #     # Get player info from Cricbuzz
    #     player_info = get_player_info(str(player_id))
        
    #     if player_info:
    #         try:
    #             # Only update fields that have new information
    #             update_fields = {}
                
    #             if player_info.get('role'):
    #                 update_fields['role'] = player_info['role']
    #             if player_info.get('batting_style'):
    #                 update_fields['batting_style'] = player_info['batting_style']
    #             if player_info.get('bowling_style'):
    #                 update_fields['bowling_style'] = player_info['bowling_style']
                
    #             if update_fields:
    #                 result = players_collection.update_one(
    #                     {"_id": player_id},
    #                     {"$set": update_fields}
    #                 )
                    
    #                 if result.modified_count > 0:
    #                     print(f"✓ Updated {player_name} with new information:")
    #                     for field, value in update_fields.items():
    #                         print(f"  - {field}: {value}")
    #                     updates_count += 1
    #                 else:
    #                     print(f"- No changes made for {player_name}")
    #             else:
    #                 print(f"- No new information found for {player_name}")
                    
    #         except Exception as e:
    #             print(f"× Error updating MongoDB for {player_name}: {e}")
    #             failed_count += 1
    #     else:
    #         print(f"× No Cricbuzz data found for {player_name}")
    #         failed_count += 1
    
    # print("\nUpdate complete!")
    # print(f"Total players with missing info: {total_players}")
    # print(f"Successfully updated: {updates_count}")
    # print(f"Failed updates: {failed_count}")

# Example usage
if __name__ == "__main__":
    update_missing_info_players()

Starting update for players with missing information
Found 93 players with missing information


In [3]:
from pymongo import MongoClient

# MongoDB Atlas connection
client = MongoClient("mongodb+srv://harsh8423:8423047004@cluster0.1xbklyu.mongodb.net/cricket")
db = client["cricket"]
players_collection = db["players"]

def classify_bowling_type():
    """
    Analyzes bowling_style field and adds bowling_type classification
    """
    print("Starting bowling type classification")
    
    # Find all players with non-empty bowling_style
    query = {
        "bowling_style": {"$exists": True, "$ne": "", "$ne": None}
    }
    
    players = list(players_collection.find(query, {"_id": 1, "name": 1, "bowling_style": 1}))
    total_players = len(players)
    
    print(f"Found {total_players} players with bowling style information")
    
    # Define bowling type categories
    spin_keywords = ['legbreak', 'offbreak', 'wrist-spin', 'orthodox']
    pace_keywords = ['fast-medium', 'medium', 'fast']
    
    # Counters for updates
    updates_count = 0
    no_match_count = 0
    
    for index, player in enumerate(players, 1):
        player_id = player['_id']
        player_name = player.get('name', 'Unknown')
        bowling_style = player.get('bowling_style', '').lower()
        
        print(f"\nProcessing {index}/{total_players}: {player_name}")
        print(f"Current bowling style: {bowling_style}")
        
        # Get the last word of bowling style
        last_word = bowling_style.split()[-1] if bowling_style else ''
        
        try:
            bowling_type = None
            
            # Check for spin bowling
            if any(keyword in bowling_style for keyword in spin_keywords):
                bowling_type = 'spin'
                print(f"Classified as spin bowler (matched: {last_word})")
                
            # Check for pace bowling
            elif any(keyword in bowling_style for keyword in pace_keywords):
                bowling_type = 'pace'
                print(f"Classified as pace bowler (matched: {last_word})")
            
            # Update the document if bowling type was determined
            if bowling_type:
                result = players_collection.update_one(
                    {"_id": player_id},
                    {"$set": {"bowling_type": bowling_type}}
                )
                
                if result.modified_count > 0:
                    print(f"✓ Updated {player_name} - Set bowling_type: {bowling_type}")
                    updates_count += 1
                else:
                    print(f"- No changes needed for {player_name}")
            else:
                print(f"× Could not classify bowling type for {player_name}")
                no_match_count += 1
                
        except Exception as e:
            print(f"× Error updating {player_name}: {e}")
            no_match_count += 1
    
    print("\nClassification complete!")
    print(f"Total players processed: {total_players}")
    print(f"Successfully classified and updated: {updates_count}")
    print(f"Unclassified bowlers: {no_match_count}")
    
    # Print some examples of unclassified bowling styles for review
    print("\nSample of unclassified bowling styles:")
    unclassified = players_collection.find({
        "bowling_style": {"$exists": True, "$ne": ""},
        "bowling_type": {"$exists": False}
    }).limit(5)
    
    for player in unclassified:
        print(f"- {player.get('name')}: {player.get('bowling_style')}")

# Example usage
if __name__ == "__main__":
    classify_bowling_type()

Starting bowling type classification
Found 810 players with bowling style information

Processing 1/810: Rahul Tripathi
Current bowling style: right-arm medium
Classified as pace bowler (matched: medium)
✓ Updated Rahul Tripathi - Set bowling_type: pace

Processing 2/810: Deepak Hooda
Current bowling style: right-arm offbreak
Classified as spin bowler (matched: offbreak)
✓ Updated Deepak Hooda - Set bowling_type: spin

Processing 3/810: Anshul Kamboj
Current bowling style: right-arm fast-medium
Classified as pace bowler (matched: fast-medium)
✓ Updated Anshul Kamboj - Set bowling_type: pace

Processing 4/810: Vansh Bedi
Current bowling style: 
× Could not classify bowling type for Vansh Bedi

Processing 5/810: Dhruv Jurel
Current bowling style: 
× Could not classify bowling type for Dhruv Jurel

Processing 6/810: Akash Madhwal
Current bowling style: right-arm fast-medium
Classified as pace bowler (matched: fast-medium)
✓ Updated Akash Madhwal - Set bowling_type: pace

Processing 7/810: