In [1]:
import pandas as pd
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import string
import re

# Ensure required NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

# Custom stopwords specific to cricket commentary
custom_stopwords = {
    'runs', 'run', 'bowler', 'batsman', 'batting', 'bowling',
    'wicket', 'innings', 'match', 'player', 'team', 'score',
    'field', 'fielder', 'boundary',
    'comes', 'goes', 'gets', 'got', 'taking', 'taken', 'makes', 'made',
    'going', 'comes', 'well', 'right', 'left',
    'great', 'nice', 'now', 'just', 'one', 'two', 'three',
    'four', 'six', 'single', 'double'
}

# Load JSON file
file_path = "commentarydata.json"  # Replace with your actual file path
with open(file_path, 'r') as file:
    data = json.load(file)

# Normalize both innings
inning1 = pd.json_normalize(data, record_path=['commentary_innings1'])
inning2 = pd.json_normalize(data, record_path=['commentary_innings2'])

# Add a column to distinguish between innings
inning1['inning'] = 1
inning2['inning'] = 2

# Combine both innings into a single DataFrame
commentary = pd.concat([inning1, inning2], ignore_index=True)

# Combine NLTK stopwords with custom stopwords (excluding 'off')
stop_words = set(stopwords.words('english')) | custom_stopwords - {'off'}

def remove_first_phrase(text):
    """Remove the first phrase up to the first comma"""
    parts = text.split(',', 1)
    if len(parts) > 1:
        return parts[1].strip()
    return text

# Function to preprocess and tokenize commText
def preprocess_and_tokenize(text, n=1):
    # Remove the first phrase (bowler to batsman)
    text = remove_first_phrase(text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove numeric numbers (both standalone and within words)
    text = re.sub(r'\b\d+\b|\d+', '', text)
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords and empty strings
    tokens = [word for word in tokens if word not in stop_words and word.strip()]
    
    # Generate n-grams
    if n > 1:
        ngrams = nltk.ngrams(tokens, n)
        return list(ngrams)
    return tokens

# Count word or n-gram frequency
def count_word_frequency(df, n=1):
    all_tokens = []
    
    for text in df['commText']:
        tokens = preprocess_and_tokenize(text, n)
        all_tokens.extend(tokens)
    
    return Counter(all_tokens)

# Count frequencies for unigrams and bigrams
unigram_frequency = count_word_frequency(commentary, n=1)
bigram_frequency = count_word_frequency(commentary, n=2)

# Convert Counter to DataFrame for better readability
unigram_df = pd.DataFrame(unigram_frequency.items(), columns=['Word', 'Count']).sort_values(by='Count', ascending=False)
bigram_df = pd.DataFrame([(' '.join(gram), count) for gram, count in bigram_frequency.items()], 
                         columns=['Bigram', 'Count']).sort_values(by='Count', ascending=False)

# Output results
print("Top 10 Unigrams:")
print(unigram_df.head(10))
print("\nTop 10 Bigrams:")
print(bigram_df.head(20))

# Print a few example processed comments to verify the first phrase removal
# print("\nExample processed comments:")
# for text in commentary['commText'][:3]:
#     print("Original:", text)
#     print("Processed:", remove_first_phrase(text))
#     print()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Top 10 Unigrams:
         Word  Count
41     length  14010
66       ball  12304
16    outside  11927
65      short  10805
40       back   9920
5         leg   9573
215      wide   7728
55       full   7371
42   delivery   6782
30       deep   6655

Top 10 Bigrams:
                Bigram  Count
42         back length   3031
4           square leg   2806
43     length delivery   2765
364       short length   2559
520        good length   2462
481        length ball   2453
183     deep midwicket   2387
663     backward point   2358
706     length outside   2206
1153         third man   1579
1573   backward square   1366
62           full toss   1341
82          middle leg   1325
1592       slower ball   1259
868        extra cover   1225
1145  delivery outside   1175
72          short ball   1158
1572     deep backward   1072
146           fine leg   1068
49         deep square   1068


In [2]:
import pandas as pd
import json
from pymongo import MongoClient

# MongoDB Atlas connection
client = MongoClient("mongodb+srv://harsh8423:8423047004@cluster0.1xbklyu.mongodb.net/cricket")
db = client["cricket"]

# Collections
matches_collection = db["matches"]
players_collection = db["players"]

# Load JSON file
file_path = "commentarydata.json"  # Replace with your actual file path
with open(file_path, 'r') as file:
    data = json.load(file)

# Fetch all players and matches data once
players_data = list(players_collection.find({}, {'_id': 1, 'bowling_type': 1, 'batting_style': 1, 'bowling_style': 1}))
matches_data = list(matches_collection.find({}, {'_id': 1, 'venue.name': 1}))

# Convert players data to dictionaries for faster lookup
players_dict = {str(player['_id']): player for player in players_data}
matches_dict = {str(match['_id']): match for match in matches_data}

# Since data is a list, we'll process each match in the list
all_commentary = []

for match in data:
    # For innings 1
    if 'commentary_innings1' in match:
        for ball in match['commentary_innings1']:
            ball_data = {
                '_id': match['_id'],
                'commText': ball.get('commText', ''),
                'overNumber': ball.get('overNumber', None),
                'batTeamName': ball.get('batTeamName', ''),
                'inningruns': ball.get('inningruns', None),
                'batsman_name': ball.get('batsman', {}).get('batName', ''),
                'bowler_name': ball.get('bowler', {}).get('bowlName', ''),
                'batsman_id': int(ball.get('batsman', {}).get('batId')) if ball.get('batsman', {}).get('batId') is not None else None,
                'bowler_id': int(ball.get('bowler', {}).get('bowlId')) if ball.get('bowler', {}).get('bowlId') is not None else None,
                'event': ball.get('event', ''),
                'is_wicket': ball.get('isWicket', False),
                'wicket_type': ball.get('wicketType', ''),
                'shot_played': ball.get('shotPlayed', ''),
                'ball_length': ball.get('ballLength', ''),
                'ball_line': ball.get('ballLine', ''),
                'batting_control': ball.get('battingControl', ''),
                'bowling_variation': ball.get('bowlingVariation', ''),
                'shot_direction': ball.get('shotDirection', ''),
                'inning': 1
            }
            all_commentary.append(ball_data)

    # For innings 2
    if 'commentary_innings2' in match:
        for ball in match['commentary_innings2']:
            ball_data = {
                '_id': match['_id'],
                'commText': ball.get('commText', ''),
                'overNumber': ball.get('overNumber', None),
                'batTeamName': ball.get('batTeamName', ''),
                'inningruns': ball.get('inningruns', None),
                'batsman_name': ball.get('batsman', {}).get('batName', ''),
                'bowler_name': ball.get('bowler', {}).get('bowlName', ''),
                'batsman_id': int(ball.get('batsman', {}).get('batId')) if ball.get('batsman', {}).get('batId') is not None else None,
                'bowler_id': int(ball.get('bowler', {}).get('bowlId')) if ball.get('bowler', {}).get('bowlId') is not None else None,
                'event': ball.get('event', ''),
                'is_wicket': ball.get('isWicket', False),
                'wicket_type': ball.get('wicketType', ''),
                'shot_played': ball.get('shotPlayed', ''),
                'ball_length': ball.get('ballLength', ''),
                'ball_line': ball.get('ballLine', ''),
                'batting_control': ball.get('battingControl', ''),
                'bowling_variation': ball.get('bowlingVariation', ''),
                'shot_direction': ball.get('shotDirection', ''),
                'inning': 2
            }
            all_commentary.append(ball_data)

# Convert to DataFrame
commentary = pd.DataFrame(all_commentary)

def process_commentary(commentary):
    
    # Convert IDs to strings for consistent comparison
    commentary['batsman_id'] = commentary['batsman_id'].astype(str)
    commentary['bowler_id'] = commentary['bowler_id'].astype(str)
    commentary['_id'] = commentary['_id'].astype(str)

        # Function to extract runs scored
    def extract_runs(comm_text):
        if "SIX" in comm_text or "six" in comm_text:
            return 6
        elif "FOUR" in comm_text or "four" in comm_text:
            return 4
        elif "1 run" in comm_text:
            return 1
        elif "2 runs" in comm_text:
            return 2
        elif "3 runs" in comm_text:
            return 3
        elif "no run" in comm_text:
            return 0
        else:
            return 0
        return None

    def wicket_type(comm_text):
        # Array of wicket type keywords for easy future additions
        wicket_keywords = ['runout', 'stumped', 'lbw', 'hit wicket']
        
        # Check for special combinations first
        text_lower = comm_text.lower()
        if 'caught' in text_lower and 'edge' in text_lower:
            return 'edge&caught'
        elif 'played on' in text_lower and 'bowled' in text_lower:
            return 'played_on'
        
        # Check for individual keywords
        if 'caught' in text_lower:
            return 'caught'
        if 'bowled' in text_lower:
            return 'bowled'
        
        # Check other wicket types
        for wicket in wicket_keywords:
            if wicket in text_lower:
                return wicket
        
        return 'wicket'

    def extract_shot(comm_text):
        shot_keywords = ['pull', 'cut', 'drive', 'sweep', 'flick', 'hook', 'loft','slog', 'punch','down the ground','across the line','scoop']
        if 'goes down' in comm_text.lower() or 'whip' in comm_text.lower():
            return 'loft'
        if 'nudge' in comm_text.lower() or 'tap' in comm_text.lower() or 'darted' in comm_text.lower() or 'paddle' in comm_text.lower() or 'tuck' in comm_text.lower() or 'pushed' in comm_text.lower() or 'knock' in comm_text.lower() or 'clipped' in comm_text.lower() or 'steer' in comm_text.lower() or 'dab' in comm_text.lower():
            return 'pushed slightly'
        if 'defend' in comm_text.lower() or 'defence' in comm_text.lower() or 'blocked' in comm_text.lower():
            return 'defend'
        for shot in shot_keywords:
            if shot in comm_text.lower():
                return shot
        return None

    def extract_direction(comm_text):
        direction_keywords = ['cover', 'square', 'point']
        
        for direction in direction_keywords:
            if direction in comm_text.lower():
                return direction
        if 'fine leg' in comm_text.lower() or 'fine-leg' in comm_text.lower() or 'fine' in comm_text.lower():
            return 'fine-leg'
        if 'third-man' in comm_text.lower() or 'third man' in comm_text.lower():
            return 'third-man'
        if 'mid-wicket' in comm_text.lower() or 'midwicket' in comm_text.lower() or 'mid wicket' in comm_text.lower():
            return 'mid-wicket'
        if 'mid-on' in comm_text.lower() or 'mid on' in comm_text.lower():
            return 'mid-wicket'
        if 'mid-off' in comm_text.lower() or 'mid off' in comm_text.lower():
            return 'long-off'
        if 'long-on' in comm_text.lower() or 'long on' in comm_text.lower():
            return 'long-on'
        if 'long-off' in comm_text.lower() or 'long off' in comm_text.lower():
            return 'long-off'
        if 'defend' in comm_text.lower() or 'defence' in comm_text.lower() or 'blocked' in comm_text.lower() or 'wide' in comm_text.lower():
            return 'blocked'
        if  'keeper' in comm_text.lower():
            return 'keeper'
        return None

    def extract_length(comm_text):
        length_keywords = ['short', 'full', 'yorker', 'bouncer','half-volley','good length','tossed up']
        
        if 'full toss' in comm_text.lower() or 'full-toss' in comm_text.lower():
            return 'full-toss'
        for length in length_keywords:
            if length in comm_text.lower():
                return length
        if 'back of a length' in comm_text.lower() or 'short of a length' in comm_text.lower():
            return 'back of a length'
        elif 'length delivery' in comm_text.lower() or 'length ball' in comm_text.lower():
            return 'length ball'
        
        return None

    def extract_ball_line(comm_text):
        text_lower = comm_text.lower()
        line_keywords = {
            'outside-off': ['outside off', 'outside-off'],
            'off-stump': ['off stump line', 'off-stump', 'off stump', 'around off','middle and off'],
            'middle-stump': ['middle stump', 'middle-stump','on middle', 'on stump', 'around middle','middle and leg', 'body'],
            'leg-stump': ['leg stump', 'leg-stump', 'outside leg','on leg','leg bye', 'around leg','down leg', 'down-leg'],
            'wide': ['wide']
        }
        variations = ['seam', 'swing', 'seam-away', 'seam-in', 'seam-up',]
        for line, variations in line_keywords.items():
            for variation in variations:
                if variation in text_lower:
                    return line
            if ('swing' not in text_lower or 'seam' not in text_lower) and 'pads' in text_lower:
                return 'leg-stump'
        return None

    def extract_batting_control(row):
        text_lower = row['commText'].lower()
        control_keywords = {
            'no-control': ['missed', 'misses', 'edge', 'edges'],
            'beaten': ['beaten', 'beats']
        }

        if row['event'] == 'WICKET':
            return 'no-control'
        
        for control, keywords in control_keywords.items():
            for keyword in keywords:
                if keyword in text_lower:
                    return control
        return 'in-control'



    def extract_bowling_variation(comm_text):
        text_lower = comm_text.lower()
        
        # Check for combined variations first
        if 'seam' in text_lower and 'away' in text_lower:
            return 'seam-away'
        if 'seaming' in text_lower and 'in' in text_lower:
            return 'seam-in'
        
        variation_keywords = {
            'seam-up': ['seam up', 'seam-up'],
            'out-swing': ['swinging away','outswinger'],
            'in-swing': ['swinging in','inswinger'],
            'swing': ['swinging'],
            'googly': ['googly'],
            'slower': ['slower', 'lack of pace'],
            'cutter': ['cutter'],
            'yorker': ['yorker'],
            'bouncer': ['bouncer'],
            'off-break': ['off-break'],
            'carrom ball': ['carrom'],
            'flipper': ['flipper'],
            'leg-break': ['leg-break'],
            'off-spin': ['off-spin'],
            'leg-spin': ['leg spin'],
            'knuckle ball': ['knuckle ball'],
            'quick': ['quick'],
            'arm ball': ['arm ball'],
            'slider':['slider'],
            'skidding':['skidding', 'skidded']

        }
        
        for variation, keywords in variation_keywords.items():
            for keyword in keywords:
                if keyword in text_lower:
                    return variation
        return 'normal'
    
    # Add new fields from MongoDB data
    def get_bowler_type(bowler_id):
        player = players_dict.get(str(bowler_id))
        return player.get('bowling_type') if player else None

    def get_batting_hand(batsman_id):
        player = players_dict.get(str(batsman_id))
        if player and player.get('batting_style'):
            return 'right' if player['batting_style'].startswith('Right') else 'left'
        return None

    def get_bowling_hand(bowler_id):
        player = players_dict.get(str(bowler_id))
        if player and player.get('bowling_style'):
            if player['bowling_style'].startswith('Right'):
                return 'right'
            elif player['bowling_style'].startswith('Left'):
                return 'left'
        return None

    def get_venue(match_id):
        match = matches_dict.get(str(match_id))
        return match.get('venue', {}).get('name') if match else None

    # Extract runs and apply transformations
    commentary['runs'] = commentary['commText'].apply(extract_runs)
    
    # Apply the new transformations
    commentary['bowler_type'] = commentary['bowler_id'].apply(get_bowler_type)
    commentary['batter_hand'] = commentary['batsman_id'].apply(get_batting_hand)
    commentary['bowler_hand'] = commentary['bowler_id'].apply(get_bowling_hand)
    commentary['venue'] = commentary['_id'].apply(get_venue)

    # Identify if it's a wicket ball
    commentary['is_wicket'] = commentary['event'] == 'WICKET'
    
    # Apply wicket type only if it's a wicket ball
    commentary['wicket_type'] = commentary.apply(
        lambda row: wicket_type(row['commText']) if row['event'] == 'WICKET' else 'N/A',
        axis=1
    )
    
    # Apply existing transformations
    commentary['shot_played'] = commentary['commText'].apply(extract_shot)
    commentary['ball_length'] = commentary['commText'].apply(extract_length)
    commentary['ball_line'] = commentary['commText'].apply(extract_ball_line)
    commentary['batting_control'] = commentary.apply(extract_batting_control, axis=1)
    commentary['bowling_variation'] = commentary['commText'].apply(extract_bowling_variation)
    commentary['shot_direction'] = commentary['commText'].apply(extract_direction)
    
    return commentary

# Process the commentary DataFrame
commentary = process_commentary(commentary)

# Update column order to include new fields
column_order = [
    '_id', 'commText', 'overNumber', 'batTeamName',
    'batsman_name', 'bowler_name', 'batsman_id', 'bowler_id',
    'batter_hand', 'bowler_hand', 'bowler_type', 'venue',
    'is_wicket', 'wicket_type', 'shot_played', 'ball_length',
    'ball_line', 'batting_control', 'bowling_variation', 'shot_direction',
    'inning','runs'
]
commentary = commentary[column_order]
# # Remove unwanted columns
# columns_to_remove = [
#         'event','inningruns','batsman_name', 'bowler_name', 'batsman_id', 'bowler_id',
#     ]
# commentary.drop(columns=columns_to_remove, inplace=True)

In [3]:
def refine_commentary_fields(commentary):
    """
    Refine values in the filtered commentary based on specific cricket-related conditions
    """
    # Create a copy to avoid modifying the original DataFrame
    refined_df = commentary.copy()
    
    # Apply condition 1: Update shot_direction based on batting control and wicket type
    mask_control = (refined_df['batting_control'] != 'in-control') & (refined_df['shot_direction'].isnull())
    mask_stumped = refined_df['wicket_type'] == 'stumped'
    refined_df.loc[mask_control | mask_stumped, 'shot_direction'] = 'keeper'
    
    # Apply condition 2: Update shot_played based on direction and runs
    mask_shot_null = refined_df['shot_played'].isnull() & refined_df['shot_direction'].notnull()
    mask_low_runs = refined_df['runs'].isin([0, 1, 2])
    refined_df.loc[mask_shot_null & mask_low_runs, 'shot_played'] = 'pushed slightly'
    
    # Apply condition 3: Update ball_line based on shot direction and bowling variation
    mask_offside_direction = refined_df['shot_direction'].isin(['cover', 'point', 'third-man'])
    mask_not_away = ~refined_df['bowling_variation'].isin(['seam-away', 'out-swing'])
    mask_valid_line = refined_df['ball_line'].isnull()
    refined_df.loc[mask_valid_line & mask_offside_direction & mask_not_away, 'ball_line'] = 'outside-off'
    
    # Apply condition 4: Update ball_length based on multiple conditions
    # Condition 4.1: outside-off drive
    mask_outside_off_drive = (refined_df['ball_line'] == 'outside-off') & (refined_df['shot_played'] == 'drive')
    
    # Condition 4.2: specific shots
    mask_specific_shots = refined_df['shot_played'].isin(['slog', 'scoop', 'sweep'])
    
    # Condition 4.3: swing variations
    mask_swing = refined_df['bowling_variation'].isin(['in-swing', 'out-swing'])
    
    # Condition 4.4: point/third-man cut
    mask_cut_direction = refined_df['shot_direction'].isin(['point', 'third-man'])
    mask_cut_shot = refined_df['shot_played'] == 'cut'
    
    # Apply length updates
    mask_valid_length = refined_df['ball_length'].isnull()
    refined_df.loc[mask_valid_length & (mask_outside_off_drive | mask_specific_shots | mask_swing), 'ball_length'] = 'full'
    refined_df.loc[mask_valid_length & mask_cut_direction & mask_cut_shot, 'ball_length'] = 'back of a length'
    
    # Apply condition 5: Update ball_line based on additional conditions
    # Condition 5.1: off-stump line
    mask_off_direction = refined_df['shot_direction'].isin(['mid-off', 'long-off'])
    mask_pushed = refined_df['shot_played'] == 'pushed slightly'
    mask_not_short = refined_df['ball_length'] != 'short'
    mask_valid_line = refined_df['ball_line'].isnull()
    refined_df.loc[mask_valid_line & mask_off_direction & mask_pushed & mask_not_short, 'ball_line'] = 'off-stump'
    
    # Condition 5.2: leg-stump line for fine-leg direction
    mask_fine_leg = refined_df['shot_direction'] == 'fine-leg'
    mask_leg_shots = refined_df['shot_played'].isin(['pull', 'flick', 'sweep'])
    refined_df.loc[mask_valid_line & mask_fine_leg & mask_leg_shots, 'ball_line'] = 'leg-stump'
    
    # Condition 5.3: leg-stump line for square direction
    mask_square = refined_df['shot_direction'] == 'square'
    mask_not_short_back = ~refined_df['ball_length'].isin(['short', 'back of a length'])
    refined_df.loc[mask_valid_line & mask_square & mask_not_short_back, 'ball_line'] = 'leg-stump'
    
    return refined_df

# Apply the refinements to the filtered commentary
refined_commentary = refine_commentary_fields(commentary)


# Filter rows where specific columns are not None
commentary_before = commentary[
    # (commentary['bowler_id']==265)
    # (commentary['batsman_id']==1413)
    (commentary['shot_played'].notnull())&
    (commentary['ball_length'].notnull())&
    (commentary['ball_line'].notnull())&
    # (commentary['batting_control'] != 'in-control')&
    # (commentary['bowling_variation'] !='normal')
    (commentary['shot_direction'].notnull())
    # ((commentary['runs']==0) | (commentary['runs']==1) | (commentary['runs']==2)) & 
    # (commentary['is_wicket']!=True)
]


# Filter rows where specific columns are not None
commentary_after = refined_commentary[
    # (refined_commentary['bowler_id']==265)
    # (refined_commentary['batsman_id']==1413)
    (refined_commentary['shot_played'].notnull())&
    (refined_commentary['ball_length'].notnull())&
    (refined_commentary['ball_line'].notnull())&
    # (refined_commentary['batting_control'].notnull)
#     (refined_commentary['bowling_variation'] !='normal')
    (refined_commentary['shot_direction'].notnull())
#     ((refined_commentary['runs']==0) | (refined_commentary['runs']==1) | (refined_commentary['runs']==2)) & 
#     (refined_commentary['is_wicket']!=True)
]

# Print the filtered rows
row_count = len(commentary_before)
print(f"Number of rows in commentary_before: {row_count}")

# Print the filtered rows
row_count = len(commentary_after)
print(f"Number of rows in refined_commentary: {row_count}")
commentary_after.tail(25)

# Get first 20 rows and save to CSV
refined_commentary.head(20).to_csv('commentary_20_rows.csv', index=False)


Number of rows in commentary_before: 9083
Number of rows in refined_commentary: 19434


In [4]:
from pymongo import MongoClient
import pandas as pd

# MongoDB Atlas connection
client = MongoClient("mongodb+srv://harsh8423:8423047004@cluster0.1xbklyu.mongodb.net/cricket")
db = client["cricket"]  # Replace with your database name

# Collections
matches_collection = db["matches"]
teams_collection = db["teams"]
players_collection = db["players"]
scorecards_collection = db["scorecards"]
commentaries_collection = db["commentaries"]

def analyze_venue_matches(venue_name, commentary_df):
    # Filter commentary data for the specific venue
    venue_data = commentary_df[commentary_df['venue'] == venue_name].copy()

    # print(venue_data)
    
    # Helper function to calculate metrics for a group of data
    def calculate_metrics(data):
        total_runs = data['runs'].sum()
        total_balls = len(data)
        total_wickets = data['is_wicket'].sum()
        economy_rate = (total_runs * 6) / total_balls if total_balls > 0 else 0
        strike_rate = (total_runs * 100) / total_balls if total_balls > 0 else 0
        balls_per_wicket=total_balls/total_wickets if total_balls > 0 else 0
        runs_per_wicket=total_runs/total_wickets if total_balls > 0 else 0
        
        return {
            'total_runs': total_runs,
            'total_balls': total_balls,
            'total_wickets': total_wickets,
            'runs_per_wicket': round(runs_per_wicket,2),
            'balls_per_wicket':round(balls_per_wicket,2),
            'economy_rate': round(economy_rate, 2),
            'strike_rate': round(strike_rate, 2)
        }
    
    # 1. Overall analysis by bowler_type
    overall_stats = []
    for bowler_type in ['pace', 'spin']:
        type_data = venue_data[venue_data['bowler_type'] == bowler_type]
        metrics = calculate_metrics(type_data)
        metrics['bowler_type'] = bowler_type
        metrics['innings'] = 'Overall'
        overall_stats.append(metrics)
    
    # 2. Analysis by innings and bowler_type
    innings_stats = []
    for inning in [1, 2]:
        inning_data = venue_data[venue_data['inning'] == inning]
        for bowler_type in ['pace', 'spin']:
            type_data = inning_data[inning_data['bowler_type'] == bowler_type]
            metrics = calculate_metrics(type_data)
            metrics['bowler_type'] = bowler_type
            metrics['innings'] = f'Inning {inning}'
            innings_stats.append(metrics)
    
    # Combine all stats
    all_stats = overall_stats + innings_stats
    
    # Convert to DataFrame and normalize
    stats_df = pd.DataFrame(all_stats)
    
    # Add venue information
    stats_df['venue'] = venue_name
    
    # Reorder columns for better readability
    column_order = [
        'venue', 'innings', 'bowler_type', 'total_runs', 'total_balls',
        'total_wickets', 'runs_per_wicket', 'balls_per_wicket', 'economy_rate', 'strike_rate'
    ]
    stats_df = stats_df[column_order]
    
    # Fetch the last 10 matches at this venue from MongoDB
    matches = list(
        matches_collection.find(
            {"venue.name": venue_name},
            {"matchStartTimestamp": 1, "tossResults": 1, "result": 1}
        ).sort("matchStartTimestamp", -1).limit(10)
    )
    
    # Create matches DataFrame
    matches_df = pd.json_normalize(matches)
    
    return matches_df,stats_df

# 2. Fetch the last 10 matches between both teams
def analyze_team_head_to_head(team1, team2):
    # Query the last 10 matches between two teams
    matches = list(
        matches_collection.find(
            {
                "$or": [
                    {"team1.name": team1, "team2.name": team2},
                    {"team1.name": team2, "team2.name": team1}
                ]
            },
            {"matchStartTimestamp": 1, "result": 1}
        ).sort("matchStartTimestamp", -1).limit(10)
    )
    
    # Normalize the data
    df = pd.json_normalize(matches)
    return df


def analyze_players(team1, team2, venue_name):
    # Get player IDs from both teams
    team1_data = teams_collection.find_one({"team_name": team1}, {"players": 1})
    team2_data = teams_collection.find_one({"team_name": team2}, {"players": 1})
    
    # Combine player IDs
    player_ids = team1_data["players"] + team2_data["players"]

    # Dictionary to store venue matches for each player
    venue_matches_by_player = {}
    
    # First, get all venue matches for each player
    for player_id in player_ids:
        player_data = players_collection.find_one({"_id": player_id}, {"matches": 1})
        if player_data and "matches" in player_data:
            # Find matches at the specified venue
            venue_matches = list(matches_collection.find({
                "_id": {"$in": player_data["matches"]},
                "venue.name": venue_name
            }))
            venue_matches_by_player[player_id] = [match["_id"] for match in venue_matches][:5]
    
    # Get recent matches for each player
    player_recent_matches = {}
    for player_id in player_ids:
        player_data = players_collection.find_one({"_id": player_id}, {"matches": 1})
        if player_data and "matches" in player_data:
            match_ids = sorted(player_data["matches"], reverse=True)[:10]
            player_recent_matches[player_id] = match_ids

    batting_rows = []
    bowling_rows = []

    # Process recent matches
    for player_id, match_ids in player_recent_matches.items():
        for match_id in match_ids:
            scorecard = scorecards_collection.find_one(
                {"_id": match_id},
                {"firstInnings": 1, "secondInnings": 1}
            )
            if scorecard:
                for innings_key in ["firstInnings", "secondInnings"]:
                    innings = scorecard.get(innings_key, [])
                    for index, row in enumerate(innings):
                        if row.get("playerId") == str(player_id):
                            row_copy = row.copy()
                            row_copy['stats_type'] = 'recent'
                            row_copy['match_id'] = match_id
                            if row.get("type") == "Batting" and row.get('balls', 0) > 0:
                                row_copy["batting_position"] = index + 1
                                batting_rows.append(row_copy)
                            elif row.get("type") == "Bowling":
                                bowling_rows.append(row_copy)

    # Process venue matches
    for player_id, venue_match_ids in venue_matches_by_player.items():
        for match_id in venue_match_ids:
            scorecard = scorecards_collection.find_one(
                {"_id": match_id},
                {"firstInnings": 1, "secondInnings": 1}
            )
            if scorecard:
                for innings_key in ["firstInnings", "secondInnings"]:
                    innings = scorecard.get(innings_key, [])
                    for index, row in enumerate(innings):
                        if row.get("playerId") == str(player_id):
                            row_copy = row.copy()
                            row_copy['stats_type'] = 'venue'
                            row_copy['match_id'] = match_id
                            if row.get("type") == "Batting" and row.get('balls', 0) > 0:
                                row_copy["batting_position"] = index + 1
                                batting_rows.append(row_copy)
                            elif row.get("type") == "Bowling":
                                bowling_rows.append(row_copy)

    # Normalize the batting and bowling data
    batting_df = pd.DataFrame(batting_rows)
    bowling_df = pd.DataFrame(bowling_rows)

    # Calculate fantasy points for batting
    def calculate_batting_points(row):
        points = row['runs']  # +1 per run
        points += row['fours']  # +1 per four
        points += row['sixes'] * 2  # +2 per six
        
        if row['runs'] >= 100:
            points += 16  # Century bonus
        elif row['runs'] >= 50:
            points += 8  # Half-century bonus
        elif row['runs'] >= 30:
            points += 4  # 30-run bonus

        if row['dismissal'] == 'Duck' and row['balls'] > 0:
            points -= 2  # Duck penalty
        
        # Strike rate points
        if row['balls'] >= 10:
            strike_rate = (row['runs'] / row['balls']) * 100
            if strike_rate > 170:
                points += 6
            elif 150.01 <= strike_rate <= 170:
                points += 4
            elif 130 <= strike_rate <= 150:
                points += 2
            elif 60 <= strike_rate < 70:
                points -= 2
            elif 50 <= strike_rate < 60:
                points -= 4
            elif strike_rate < 50:
                points -= 6
        
        return points

    if not batting_df.empty:
        batting_df['fantasy_points'] = batting_df.apply(calculate_batting_points, axis=1)

    # Calculate fantasy points for bowling
    def calculate_bowling_points(row):
        points = row['wickets'] * 25  # +25 per wicket

        if row['wickets'] >= 5:
            points += 16  # 5-wicket haul bonus
        elif row['wickets'] == 4:
            points += 8  # 4-wicket haul bonus
        elif row['wickets'] == 3:
            points += 4  # 3-wicket haul bonus

        points += row['maidens'] * 12  # +12 per maiden over

        # Economy rate points
        if row['overs'] >= 2:
            economy_rate = row['runs'] / row['overs']
            if economy_rate < 5:
                points += 6
            elif 5 <= economy_rate < 6:
                points += 4
            elif 6 <= economy_rate <= 7:
                points += 2
            elif 10 <= economy_rate < 11:
                points -= 2
            elif 11 <= economy_rate < 12:
                points -= 4
            elif economy_rate >= 12:
                points -= 6
        
        return points

    if not bowling_df.empty:
        bowling_df['fantasy_points'] = bowling_df.apply(calculate_bowling_points, axis=1)

    return batting_df, bowling_df




def analyze_commentary(team1, team2, refined_commentary):
    
    # Get player IDs for both teams
    team1_data = teams_collection.find_one({"team_name": team1}, {"players": 1})
    team2_data = teams_collection.find_one({"team_name": team2}, {"players": 1})
    
    # Assume `players` is a list of integers representing player IDs
    player_ids = team1_data["players"] + team2_data["players"]
    player_ids_str = [str(player_id) for player_id in player_ids]
    print(type(player_ids_str[0]))

    # Filter commentary rows where batsman_id or bowler_id matches player IDs
    batting_commentary = refined_commentary[refined_commentary["batsman_id"].isin(player_ids_str)]
    bowling_commentary = refined_commentary[(refined_commentary["bowler_id"].isin(player_ids_str))]
    
    
    return batting_commentary, bowling_commentary



# Main Function
if __name__ == "__main__":
    # Example inputs
    venue_name = "Narendra Modi Stadium"
    team1 = "Gujarat Titans"
    team2 = "Chennai Super Kings"
    
    # 1. Venue Analysis
    venue_df, venue_stats_df = analyze_venue_matches(venue_name, refined_commentary)
    venue_df['toss_decision'] = venue_df['tossResults.decision']
    venue_df['winByRuns'] = venue_df['result.winByRuns']
    columns_to_remove_venue = ['matchStartTimestamp','result.resultType','result.winningTeam','result.winningteamId','result.winningMargin','result.winByRuns','result.winByInnings','tossResults.tossWinnerId','tossResults.tossWinnerName','tossResults.decision']
    venue_df.drop(columns=columns_to_remove_venue, inplace=True)
    
    # 2. Head-to-Head Analysis
    h2h_df = analyze_team_head_to_head(team1, team2)
    h2h_df['winningTeam'] = h2h_df['result.winningTeam']
    h2h_df['winningMargin'] = h2h_df['result.winningMargin']
    h2h_df['winByRuns'] = h2h_df['result.winByRuns']
    columns_to_remove_h2h = ['matchStartTimestamp','result.resultType','result.winningTeam','result.winningteamId','result.winningMargin','result.winByRuns','result.winByInnings']
    h2h_df.drop(columns=columns_to_remove_h2h, inplace=True)
    
    # # 3. Player Analysis
    batting_df, bowling_df = analyze_players(team1, team2, venue_name)
    
    # 4. Commentary Analysis
    batting_commentary, bowling_commentary = analyze_commentary(team1, team2, refined_commentary)

    



<class 'str'>


In [5]:
batting_commentary.tail(5)

Unnamed: 0,_id,commText,overNumber,batTeamName,batsman_name,bowler_name,batsman_id,bowler_id,batter_hand,bowler_hand,...,is_wicket,wicket_type,shot_played,ball_length,ball_line,batting_control,bowling_variation,shot_direction,inning,runs
46623,47605,"Boult to Shubman Gill, no run, 134kph, anothe...",2.2,GT,Shubman Gill,Trent Boult,11808,8117,right,left,...,False,,punch,,outside-off,in-control,normal,point,2,0
46624,47605,"Boult to Shubman Gill, no run, cuts back in o...",2.1,GT,Shubman Gill,Trent Boult,11808,8117,right,left,...,False,,defend,,outside-off,in-control,normal,cover,2,0
46631,47605,"Boult to Shubman Gill, FOUR, strays onto the ...",0.6,GT,Shubman Gill,Trent Boult,11808,8117,right,left,...,False,,flick,,leg-stump,in-control,normal,fine-leg,2,4
46632,47605,"Boult to Shubman Gill, no run, length, on the...",0.5,GT,Shubman Gill,Trent Boult,11808,8117,right,left,...,False,,,,,in-control,normal,,2,0
46633,47605,"Boult to Shubman Gill, no run, dropped. Now, ...",0.4,GT,Shubman Gill,Trent Boult,11808,8117,right,left,...,False,,pushed slightly,,leg-stump,in-control,normal,square,2,0


In [6]:
import pandas as pd

def process_match_summary(venue_df, h2h_df):
    """Process basic match information from venue and head-to-head dataframes"""

    match_analysis_report = {}

    # 1. Venue Analysis
    venue_analysis = []
    for _, row in venue_df.iterrows():
        toss_decision = row["toss_decision"]
        win_by_runs = row["winByRuns"]
        winner = "Batting Team" if win_by_runs else "Bowling Team"
        
        venue_analysis.append({
            "toss_decision": toss_decision,
            "winner": winner
        })
    match_analysis_report["venue_analysis"] = venue_analysis

    venue_stats=[]
    for _, row in venue_stats_df.iterrows():
        innings=row['innings']
        bowler_type=row['bowler_type']
        total_runs=row['total_runs']	
        total_balls	=row['total_balls']
        total_wickets=row['total_wickets']	
        runs_per_wicket	=row['runs_per_wicket']
        balls_per_wicket=row['balls_per_wicket']	
        economy_rate=row['economy_rate']	
        strike_rate=row['strike_rate']

        venue_stats.append({
            'innings':innings,
            'bowler_type':bowler_type,
            'total_runs':total_runs,
            'total_balls':total_balls,
            'total_wickets':total_wickets,
            'runs_per_wicket':runs_per_wicket,
            'balls_per_wicket':balls_per_wicket,	
            'economy_rate':economy_rate,
            'strike_rate':strike_rate
        })

        match_analysis_report["venue_stats"] = venue_stats
    # 2. Head-to-Head Analysis
    h2h_analysis = []
    for _, row in h2h_df.iterrows():
        winning_team = row["winningTeam"]
        winning_margin = row["winningMargin"]
        win_by_runs = row["winByRuns"]
        result = f"{winning_team} won by {winning_margin} runs" if win_by_runs else f"{winning_team} won by {winning_margin} wickets"
        h2h_analysis.append(result)
    match_analysis_report["head_to_head_analysis"] = h2h_analysis
        
    return match_analysis_report


def generate_bowler_analysis(bowling_df, commentary_df, team1_ids, team2_ids):
    """
    Generate a comprehensive analysis of bowling statistics for each player by combining
    bowling performance from `bowling_df` and detailed analysis from `commentary_df`.
    
    Args:
    - bowling_df (pd.DataFrame): Dataframe with player bowling performance.
    - commentary_df (pd.DataFrame): Dataframe with commentary analysis.

    Returns:
    - dict: Comprehensive analysis for each bowler.
    """
    bowler_analysis = {}

    # Process Bowling Stats from `bowling_df`
    for _, player in bowling_df.groupby('playerId'):
        player_id = int(player['playerId'].iloc[0])
        player_name = player['name'].iloc[0]

        # Fetch player details from players_collection
        player_details = players_collection.find_one({"_id": player_id})
        
        # Fetch team details from teams_collection
        team_info = teams_collection.find_one({"players": player_id})
        
        # Extract required fields from player_details
        role = player_details.get('role', '')
        batting_style = player_details.get('batting_style', '')
        bowling_style = player_details.get('bowling_style', '')
        team_name = team_info.get('team_name', '') if team_info else ''

        matches = []
        total_runs = 0
        total_wickets = 0
        total_balls = 0
        total_fantasy_points=0

        for _, innings in player.iterrows():
            # Convert overs to balls (e.g., 4.3 overs = 4*6 + 3 = 27 balls)
            overs_whole = int(innings['overs'])
            overs_decimal = round((innings['overs'] - overs_whole) * 10)  # Get decimal part
            balls = (overs_whole * 6) + overs_decimal

            match_stats = {
                'runs': innings['runs'],
                'overs': innings['overs'],
                'wickets': innings['wickets'],
                'economy': innings['economy'],
                'fantasy_points': innings['fantasy_points'],
                'match_id': innings['match_id'],
                'stats_type': innings['stats_type']
            }
            matches.append(match_stats)

            total_runs += innings['runs']
            total_wickets += innings['wickets']
            total_balls += balls
            total_fantasy_points +=innings['fantasy_points']

        # Initialize bowler's data
        bowler_analysis[player_id] = {
            'name': player_name,
            'role': role,
            'batting_style': batting_style,
            'bowling_style': bowling_style,
            'team_name': team_name,
            'matches': matches,
            'total_runs': total_runs,
            'total_wickets': total_wickets,
            'total_balls': total_balls,
            'runs_per_wicket': total_runs / total_wickets if total_wickets > 0 else float('inf'),
            'balls_per_wicket': total_balls / total_wickets if total_wickets > 0 else float('inf'),
            'total_fantasy_points': total_fantasy_points
        }

    # Process Detailed Bowling Analysis from `commentary_df`
    commentary_df['bowler_id'] = commentary_df['bowler_id'].astype(int)

    for bowler_id in commentary_df['bowler_id'].unique():
        bowler_data = commentary_df[commentary_df['bowler_id'] == bowler_id]
        bowler_name = bowler_data['bowler_name'].iloc[0]

        # Determine the opponent team
        if player_id in team1_ids:
            opponent_players = team2_ids
        elif player_id in team2_ids:
            opponent_players = team1_ids
        else:
            continue

        # Phase-wise analysis
        phases = {
            'powerplay': bowler_data[(bowler_data['overNumber'] >= 0) & (bowler_data['overNumber'] <= 6)],
            'middle': bowler_data[(bowler_data['overNumber'] > 6) & (bowler_data['overNumber'] <= 16)],
            'death': bowler_data[(bowler_data['overNumber'] > 16) & (bowler_data['overNumber'] <= 20)]
        }

        phase_stats = {}
        for phase_name, phase_data in phases.items():
            total_runs = phase_data['runs'].sum()
            total_balls = len(phase_data)
            total_wickets = len(phase_data[phase_data['is_wicket'] == True])

            phase_stats[phase_name] = {
                'runs': total_runs,
                'balls': total_balls,
                'wickets': total_wickets,
                'runs_per_wicket': total_runs / total_wickets if total_wickets > 0 else float('inf'),
                'balls_per_wicket': total_balls / total_wickets if total_wickets > 0 else float('inf'),
                'economy': (total_runs * 6) / total_balls if total_balls > 0 else 0
            }

        leftVSright={
            'left_arm': bowler_data[bowler_data['bowler_hand']=='left'],
            'right_arm': bowler_data[bowler_data['bowler_hand']=='right']
        }
        leftVright={}
        for bowl_hand, data in leftVSright.items():
            total_runs = data['runs'].sum()
            total_balls = len(data)
            strike_rate = (total_runs / total_balls * 100) if total_balls > 0 else 0
            wickets = len(data[data['is_wicket'] == True])
            leftVright[bowl_hand] = {
                'runs': total_runs,
                'balls': total_balls,
                'strike_rate': strike_rate,
                'wickets': wickets
            }

        # Wicket analysis
        wicket_data = bowler_data[bowler_data['is_wicket'] == True]
        filtered_wicket_stats = {
            str(tuple(row)): count
            for row, count in wicket_data.groupby(["ball_length", "ball_line", "bowling_variation"]).size().items()
            if count >= 2  # Add only if count >= 2
        }


        # Boundary analysis
        boundaries = bowler_data[bowler_data['runs'].isin([4, 6])]
        boundary_percentage = (len(boundaries) / len(bowler_data)) * 100 if len(bowler_data) > 0 else 0

        # Group boundary details and filter conditions
        boundary_details = boundaries.groupby(["ball_length", "ball_line", "bowling_variation"]).size()
        filtered_boundary_stats = {
            str(tuple(row)): count
            for row, count in boundary_details.items()
            if count >= 3  # Add only if count >= 3
        }

        # If no boundary group satisfies count >= 3, include groups with count >= 2
        if not filtered_boundary_stats:
            filtered_boundary_stats = {
                str(tuple(row)): count
                for row, count in boundary_details.items()
                if count >= 2
            }

        boundary_analysis = {
            "details": filtered_boundary_stats,
            "percentage": boundary_percentage
        }
        # Bowling variation frequency
        variation_frequency = bowler_data['bowling_variation'].value_counts().to_dict()


        # Analysis against opponents
        opponent_analysis = {}
        for opponent_id in opponent_players:
            # Get all interactions between current player and opponent
            interactions = bowler_data[
                (bowler_data['batsman_id'] == opponent_id)
            ]


            if len(interactions) >= 18 or interactions['is_wicket'].any():

                batter_name = interactions['batsman_name'].iloc[0] if not interactions.empty else "Unknown"

                total_balls = len(interactions)
                total_runs = interactions['runs'].sum()
                strike_rate = (total_runs / total_balls * 100) if total_balls > 0 else 0
                wickets = interactions['is_wicket'].sum()

                opponent_stats = {
                    'balls': total_balls,
                    'runs': total_runs,
                    'strike_rate': strike_rate,
                    'wickets': wickets
                }

                # Detailed wicket analysis if multiple wickets
                if wickets > 0:
                    wicket_rows = interactions[interactions['is_wicket'] == True]
                    
                    # Fill null values with 'unknown' before grouping
                    wicket_rows_filled = wicket_rows.fillna({
                        'wicket_type': 'unknown',
                        'shot_played': 'unknown',
                        'ball_line': 'unknown',
                        'ball_length': 'unknown'
                    })
                    
                    wicket_patterns = wicket_rows_filled.groupby(
                        ['wicket_type', 'shot_played', 'ball_line', 'ball_length'],
                        dropna=False  # Include groups with null values
                    ).size().to_dict()
                    
                    opponent_stats['wicket_patterns'] = {
                        str(k): v for k, v in wicket_patterns.items()
                    }

                opponent_analysis[batter_name] = opponent_stats

        # Update or merge detailed analysis into existing entry
        if bowler_id in bowler_analysis:
            bowler_analysis[bowler_id].update({
                'name': bowler_name,
                'phase_stats': phase_stats,
                'left_vs_right': leftVright,
                'wicket_analysis': filtered_wicket_stats,
                'boundary_analysis': boundary_analysis,
                'variation_frequency': variation_frequency,
                'opponent_analysis': opponent_analysis  # New field added
            })
        else:
            bowler_analysis[bowler_id] = {
                'name': bowler_name,
                'phase_stats': phase_stats,
                'left_vs_right': leftVright,
                'wicket_analysis': filtered_wicket_stats,
                'boundary_analysis': boundary_analysis,
                'variation_frequency': variation_frequency,
                'opponent_analysis': opponent_analysis  # New field added
            }

    return bowler_analysis





def generate_batting_analysis(batting_df, commentary_df, team1_ids, team2_ids):
    """
    Generate a comprehensive analysis of batting statistics for each player by combining
    batting performance from `batting_df` and detailed commentary analysis from `commentary_df`.
    Now includes analysis of performance against specific opponents.

    Args:
    - batting_df (pd.DataFrame): Dataframe with player batting performance.
    - commentary_df (pd.DataFrame): Dataframe with commentary analysis.

    Returns:
    - dict: Comprehensive analysis for each batsman.
    """
    player_analysis = {}

    # Process Batting Stats from `batting_df`
    for _, player in batting_df.groupby('playerId'):
        player_id = int(player['playerId'].iloc[0])
        player_name = player['name'].iloc[0]

        # Fetch player details from players_collection
        player_details = players_collection.find_one({"_id": player_id})
        
        # Fetch team details from teams_collection
        team_info = teams_collection.find_one({"players": player_id})
        
        # Extract required fields from player_details
        role = player_details.get('role', '')
        batting_style = player_details.get('batting_style', '')
        bowling_style = player_details.get('bowling_style', '')
        team_name = team_info.get('team_name', '') if team_info else ''

        matches = []
        total_runs = 0
        total_balls = 0
        total_fantasy_points=0

        for _, innings in player.iterrows():
            match_stats = {
                'runs': innings['runs'],
                'balls': innings['balls'],
                'strike_rate': innings['strikeRate'],
                'fantasy_points': innings['fantasy_points'],
                'match_id': innings['match_id'],
                'stats_type': innings['stats_type'],
                'batting_position': innings['batting_position']
            }
            matches.append(match_stats)
            total_runs += innings['runs']
            total_balls += innings['balls']
            total_fantasy_points +=innings['fantasy_points']

        # Initialize player's data
        player_analysis[player_id] = {
            'name': player_name,
            'role': role,
            'batting_style': batting_style,
            'bowling_style': bowling_style,
            'team_name': team_name,
            'matches': matches,
            'total_runs': total_runs,
            'total_balls': total_balls,
            'average_runs': total_runs / len(matches) if matches else 0,
            'overall_strike_rate': (total_runs / total_balls * 100) if total_balls > 0 else 0,
            'total_fantasy_points': total_fantasy_points,

        }

    commentary_df['batsman_id'] = commentary_df['batsman_id'].astype(int)
    # Process Detailed Commentary Analysis from `commentary_df`
    for player_id in commentary_df['batsman_id'].unique():
        player_data = commentary_df[commentary_df['batsman_id'] == (player_id)]

        player_name = player_data['batsman_name'].iloc[0]


        # Determine the opponent team
        if player_id in team1_ids:
            opponent_players = team2_ids
        elif player_id in team2_ids:
            opponent_players = team1_ids
        else:
            continue

        # Phase-wise analysis
        phases = {
            'powerplay': player_data[(player_data['overNumber'] >= 0) & (player_data['overNumber'] <= 6)],
            'middle': player_data[(player_data['overNumber'] > 6) & (player_data['overNumber'] <= 16)],
            'death': player_data[(player_data['overNumber'] > 16) & (player_data['overNumber'] <= 20)]
        }
        phase_stats = {}
        for phase_name, phase_data in phases.items():
            total_runs = phase_data['runs'].sum()
            total_balls = len(phase_data)
            strike_rate = (total_runs / total_balls * 100) if total_balls > 0 else 0
            phase_stats[phase_name] = {
                'runs': total_runs,
                'balls': total_balls,
                'strike_rate': strike_rate
            }
            

        spinVSpace={
            'spin': player_data[player_data['bowler_type']=='spin'],
            'pace': player_data[player_data['bowler_type']=='pace']
        }
        spinVpace={}
        for bowl_type, data in spinVSpace.items():
            total_runs = data['runs'].sum()
            total_balls = len(data)
            strike_rate = (total_runs / total_balls * 100) if total_balls > 0 else 0
            wickets = len(data[data['is_wicket'] == True])
            spinVpace[bowl_type] = {
                'runs': total_runs,
                'balls': total_balls,
                'strike_rate': strike_rate,
                'wickets': wickets
            }

        leftVSright={
            'left_arm': player_data[player_data['bowler_hand']=='left'],
            'right_arm': player_data[player_data['bowler_hand']=='right']
        }
        leftVright={}
        for bowl_hand, data in leftVSright.items():
            total_runs = data['runs'].sum()
            total_balls = len(data)
            strike_rate = (total_runs / total_balls * 100) if total_balls > 0 else 0
            wickets = len(data[data['is_wicket'] == True])
            leftVright[bowl_hand] = {
                'runs': total_runs,
                'balls': total_balls,
                'strike_rate': strike_rate,
                'wickets': wickets
            }


        # Analysis against opponents
        opponent_analysis = {}
        for opponent_id in opponent_players:
            # Get all interactions between current player and opponent
            interactions = player_data[
                (player_data['bowler_id'] == str(opponent_id))
            ]
            
            if len(interactions) >= 18 or interactions['is_wicket'].any():

                bowler_name = interactions['bowler_name'].iloc[0] if not interactions.empty else "Unknown"

                total_balls = len(interactions)
                total_runs = interactions['runs'].sum()
                strike_rate = (total_runs / total_balls * 100) if total_balls > 0 else 0
                wickets = interactions['is_wicket'].sum()

                opponent_stats = {
                    'balls': total_balls,
                    'runs': total_runs,
                    'strike_rate': strike_rate,
                    'wickets': wickets
                }

                if wickets > 0:
                    wicket_rows = interactions[interactions['is_wicket'] == True]
                    
                    # Fill null values with 'unknown' before grouping
                    wicket_rows_filled = wicket_rows.fillna({
                        'wicket_type': 'unknown',
                        'shot_played': 'unknown',
                        'ball_line': 'unknown',
                        'ball_length': 'unknown'
                    })
                    
                    wicket_patterns = wicket_rows_filled.groupby(
                        ['wicket_type', 'shot_played', 'ball_line', 'ball_length'],
                        dropna=False  # Include groups with null values
                    ).size().to_dict()
                    
                    opponent_stats['wicket_patterns'] = {
                        str(k): v for k, v in wicket_patterns.items()
                    }


                opponent_analysis[bowler_name] = opponent_stats

        control_stats = player_data['batting_control'].value_counts().to_dict()
        scoring_zone = player_data['shot_direction'].value_counts().to_dict()

        # Wicket analysis
        wicket_data = player_data[player_data['is_wicket'] == True]
        wicket_stats = {
            str(tuple(row)): count
            for row, count in wicket_data.groupby(["shot_played", "ball_length", "ball_line", "shot_direction"]).size().items()
            if count >= 2
        }

        # Boundary analysis
        boundaries = player_data[player_data['runs'].isin([4, 6])]
        boundary_percentage = (len(boundaries) / len(player_data)) * 100 if len(player_data) > 0 else 0

        boundary_details = boundaries.groupby(["shot_played", "ball_length", "ball_line"]).size()
        boundary_stats = {
            str(tuple(row)): count
            for row, count in boundary_details.items()
            if count >= 3
        }

        if not boundary_stats:
            boundary_stats = {
                str(tuple(row)): count
                for row, count in boundary_details.items()
                if count >= 2
            }

        # Update or merge detailed analysis into existing entry
        if player_id in player_analysis:
            player_analysis[player_id].update({
                'name': player_name,
                'phase_stats': phase_stats,
                'spin_vs_pace': spinVpace,
                'left_vs_right': leftVright,
                'wicket_analysis': wicket_stats,
                'batting_control': control_stats,
                'scoring_zone': scoring_zone,
                'boundary_analysis': {
                    "details": boundary_stats,
                    "percentage": boundary_percentage
                },
                'opponent_analysis': opponent_analysis  # New field added
            })
        else:
            player_analysis[player_id] = {
                'name': player_name,
                'phase_stats': phase_stats,
                'wicket_analysis': wicket_stats,
                'spin_vs_pace': spinVpace,
                'left_vs_right': leftVright,
                'batting_control': control_stats,
                'scoring_zone': scoring_zone,
                'boundary_analysis': {
                    "details": boundary_stats,
                    "percentage": boundary_percentage
                },
                'opponent_analysis': opponent_analysis  # New field added
            }

    return player_analysis



# Usage example:
def analyze_match(venue_df, h2h_df, batting_df,bowling_df, batting_commentary, bowling_commentary):
    """Analyze complete match data and return results dictionary"""
    team1 = "Gujarat Titans"
    team2 = "Chennai Super Kings"
    team1_ids = teams_collection.find_one({"team_name": team1}, {"players": 1})
    team2_ids = teams_collection.find_one({"team_name": team2}, {"players": 1})
    results = {
        'match_summary': process_match_summary(venue_df, h2h_df),
        'bowler_analysis': generate_bowler_analysis(bowling_df, bowling_commentary,team1_ids['players'],team2_ids['players']),
        'batting_analysis': generate_batting_analysis(batting_df, batting_commentary,team1_ids['players'],team2_ids['players']),
    }
    return results

result= analyze_match(venue_df, h2h_df, batting_df,bowling_df, batting_commentary, bowling_commentary)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  commentary_df['bowler_id'] = commentary_df['bowler_id'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  commentary_df['batsman_id'] = commentary_df['batsman_id'].astype(int)


In [8]:
import json
import numpy as np 

def preprocess_for_json(obj):
    """
    Recursively convert non-serializable types in a dictionary or list to standard Python types.
    """
    if isinstance(obj, dict):
        new_dict = {}
        for key, value in obj.items():
            # Ensure all keys are strings
            if not isinstance(key, (str, int, float, bool, type(None))):
                key = str(key)
            new_dict[key] = preprocess_for_json(value)
        return new_dict
    elif isinstance(obj, list):
        return [preprocess_for_json(item) for item in obj]
    elif isinstance(obj, (np.int64, np.int32)):
        return int(obj)
    elif isinstance(obj, (np.float64, np.float32)):
        return float(obj)
    else:
        return obj


# Preprocess the report
preprocessed_result = preprocess_for_json(result)
# Specify the file path
output_file = "match_analysis_claude.json"

# Save the report as a JSON file
with open(output_file, "w") as json_file:
    json.dump(preprocessed_result, json_file, indent=4)

print(f"Match analysis report saved to {output_file}")



Match analysis report saved to match_analysis_claude.json


In [7]:
import json
from groq import Groq
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

class CricketAnalysisBot:
    def __init__(self, api_key):
        """Initialize the bot with Groq API key"""
        self.client = Groq(api_key=api_key)
        self.summary_file = "player_summaries.json"
        
    def create_player_prompt(self, player_id, batting_data, bowling_data):
        """
        Create a detailed prompt for AI to summarize player statistics.
        
        Args:
        - player_id (int): Player ID
        - batting_data (dict): Batting statistics
        - bowling_data (dict): Bowling statistics
        
        Returns:
        - str: AI prompt
        """
        def convert_to_serializable(data):
            """Recursively convert numpy types in dictionaries to Python native types."""
            if isinstance(data, dict):
                return {key: convert_to_serializable(value) for key, value in data.items()}
            elif isinstance(data, list):
                return [convert_to_serializable(value) for value in data]
            elif isinstance(data, (np.integer, np.int64)):
                return int(data)
            elif isinstance(data, (np.floating, np.float64)):
                return float(data)
            else:
                return data

        prompt = f"Analyze the following cricket statistics for {batting_data.get('name', '') or bowling_data.get('name', '')}:\n\n"

        # Add batting data if available
        if batting_data:
            prompt += "BATTING STATISTICS:\n"
            prompt += f"- Matches:\n{json.dumps(convert_to_serializable(batting_data.get('matches', [])), indent=2)}\n"
            prompt += f"- Total Runs: {batting_data.get('total_runs', 0)}\n"
            prompt += f"- Total Balls: {batting_data.get('total_balls', 0)}\n"
            prompt += f"- Average Runs: {batting_data.get('average_runs', 0)}\n"
            prompt += f"- Overall Strike Rate: {batting_data.get('overall_strike_rate', 0)}\n"
            prompt += f"- Phase Stats:\n{json.dumps(convert_to_serializable(batting_data.get('phase_stats', {})), indent=2)}\n"
            prompt += f"- Wicket Analysis ('shot_played', 'ball_length', 'ball_line','shot_direction'):\n{json.dumps(convert_to_serializable(batting_data.get('wicket_analysis', {})), indent=2)}\n"
            prompt += f"- Batting Control:\n{json.dumps(convert_to_serializable(batting_data.get('batting_control', {})), indent=2)}\n"
            prompt += f"- Scoring Zones:\n{json.dumps(convert_to_serializable(batting_data.get('scoring_zone', {})), indent=2)}\n"
            prompt += f"- Boundary Analysis ('shot_played', 'ball_length', 'ball_line'):\n{json.dumps(convert_to_serializable(batting_data.get('boundary_analysis', {})), indent=2)}\n"
            prompt += f"- Opponent team bolwers head to head Analysis:\n{json.dumps(convert_to_serializable(batting_data.get('opponent_analysis', {})), indent=2)}\n"

        # Add bowling data if available
        if bowling_data:
            prompt += "\nBOWLING STATISTICS:\n"
            prompt += f"- Matches:\n{json.dumps(convert_to_serializable(bowling_data.get('matches', [])), indent=2)}\n"
            prompt += f"- Total Runs Conceded: {bowling_data.get('total_runs', 0)}\n"
            prompt += f"- Total Wickets: {bowling_data.get('total_wickets', 0)}\n"
            prompt += f"- Total Balls Bowled: {bowling_data.get('total_balls', 0)}\n"
            prompt += f"- Runs per Wicket: {bowling_data.get('runs_per_wicket', 0)}\n"
            prompt += f"- Balls per Wicket: {bowling_data.get('balls_per_wicket', 0)}\n"
            prompt += f"- Phase Stats:\n{json.dumps(convert_to_serializable(bowling_data.get('phase_stats', {})), indent=2)}\n"
            prompt += f"- Wicket Analysis ('ball_length', 'ball_line','bowling_variation'):\n{json.dumps(convert_to_serializable(bowling_data.get('wicket_analysis', {})), indent=2)}\n"
            prompt += f"- Boundary Analysis ('ball_length', 'ball_line','bowling_variation'):\n{json.dumps(convert_to_serializable(bowling_data.get('boundary_analysis', {})), indent=2)}\n"
            prompt += f"- Variation Frequency:\n{json.dumps(convert_to_serializable(bowling_data.get('variation_frequency', {})), indent=2)}\n"
            prompt += f"- Opponent team players head to head Analysis:\n{json.dumps(convert_to_serializable(bowling_data.get('opponent_analysis', {})), indent=2)}\n"

        prompt += "\nProvide a detailed, human-readable breakdown of the player's performance based on the following statistics. For each field, convert the data into an easy-to-analyze format. The output should emphasize clarity and readability, making it easy to quickly interpret the player's performance at a glance. Avoid summarizing the data; present the statistics as described in their respective formats."

        
        return prompt



    def get_ai_summary(self, prompt):
        """Get AI-generated summary using Groq"""
        try:
            messages = [
                {
                    "role": "system",
                    "content": '''
                        Provide a detailed, human-readable breakdown of the player's performance based on the following statistics. For each field, convert the data into an easy-to-analyze format as described below:

                        1. Matches: Summarize each match's performance in the format "Runs(Balls) SR: StrikeRate" (e.g., 23(10) SR: 230, 40(20) SR: 200).
                        2. batting Phase Stats: Summarize runs, balls, and strike rates for each phase in the format "Runs(Balls) SR: StrikeRate" for Powerplay, Middle Overs, and Death Overs.
                        3. Total Batting Statistics: Provide the player's total runs, balls faced, average runs, and overall strike rate in clear terms.
                        4. batsman's Wicket Analysis: Summarize dismissal patterns in the proper statistical format .
                        5. batsman's Boundary Analysis: List boundary details in the proper statistical format and include the boundary percentage that batsman scores.
                        8. Shot selection: Summarize the frequency of shot_played in the format "shot pyed by batsmen: Count" (e.g., drive: 12, cut: 8).
                        6. Bowling Matches: Summarize each match's bowling performance in the format "Runs(Overs) Econ: EconomyRate Wkts: Wickets" (e.g., 24(4) Econ: 6.0 Wkts: 2).
                        7. Bowling Phase Stats: Summarize runs, balls, economy, and wickets for each phase (Powerplay, Middle Overs, Death Overs) in a human-readable format.
                        4. bowler's Analysis: Summarize dismissal patterns in the proper statistical format .
                        5. bowler's Boundary Analysis: List boundary details in the proper statistical format and include the boundary percentage that bowler conceed.
                        8. Bowling Variations: Summarize the frequency of bowling variations in the format "Variation: Count" (e.g., Yorker: 12, Slower: 8).
                        9. Overall Bowling Statistics: Provide total runs conceded, total balls bowled, total wickets, runs per wicket, and balls per wicket in a concise format.

                        The output should emphasize clarity and readability, making it easy to quickly interpret the player's performance at a glance. Avoid summarizing the data; present the statistics as described in their respective formats.
                        generate in large and detailed manner.
                    '''
                },
                {
                    "role": "user",
                    "content": f"Context: {prompt}"
                }
            ]
            
            response = self.client.chat.completions.create(
                model='llama-3.1-8b-instant',
                messages=messages,
                temperature=0.2,
                max_tokens=1500
            )
            
            return response.choices[0].message.content
            
        except Exception as e:
            return f"Error generating summary: {str(e)}"
    
    def save_summary(self, player_id, player_name, summary):
        """Save or update player summary in JSON file"""
        try:
            # Load existing summaries
            if os.path.exists(self.summary_file):
                with open(self.summary_file, 'r') as f:
                    summaries = json.load(f)
            else:
                summaries = {}
            
            # Update or add new summary
            summaries[player_id] = {
                "name": player_name,
                "summary": summary
            }
            
            # Save updated summaries
            with open(self.summary_file, 'w') as f:
                json.dump(summaries, f, indent=4)
                
        except Exception as e:
            print(f"Error saving summary: {str(e)}")

    def analyze_players(self, batting_analysis, bowling_analysis):
        """Analyze all players in the provided data"""
        # Get all unique player IDs
        player_ids = set(list(batting_analysis.keys()) + list(bowling_analysis.keys()))
        
        for player_id in player_ids:
            batting_data = batting_analysis.get(player_id, {})
            bowling_data = bowling_analysis.get(player_id, {})
            
            # Skip if no data available
            if not batting_data and not bowling_data:
                continue
            
            # Get player name from e  ither batting or bowling data
            player_name = batting_data.get('name') or bowling_data.get('name')
            
            # Create prompt and get AI summary
            prompt = self.create_player_prompt(player_id, batting_data, bowling_data)
            summary = self.get_ai_summary(prompt)
            
            # Save the summary
            self.save_summary(player_id, player_name, summary)
            print(f"Processed analysis for {player_name}")
        


def main():
    
    # Initialize and run the bot
    bot = CricketAnalysisBot(api_key=os.getenv("GROQ_API_KEY"))
    bot.analyze_players(result['batting_analysis'], result['bowler_analysis'])

if __name__ == "__main__":
    main()

NameError: name 'np' is not defined

In [132]:
import json

# Specify the file name where summaries are stored
summary_file = "player_summaries.json"

def load_summaries(summary_file):
    """Load player summaries from the JSON file."""
    if os.path.exists(summary_file):
        with open(summary_file, 'r') as f:
            summaries = json.load(f)
            return summaries
    else:
        print(f"Summary file '{summary_file}' not found.")
        return {}

# Load the summaries
summaries = load_summaries(summary_file)

# Example: Display the summaries
for player_id, player_summary in summaries.items():
    print(f"Player ID: {player_id}")
    print(f"Name: {player_summary['name']}")
    print(f"Summary: {player_summary['summary']}")
    print("-" * 50)


Player ID: 13184
Name: Mukesh Choudhary
Summary: **Mukesh Choudhary's Performance Breakdown**

**Batting Statistics**

### Matches

* Match 1: 6(9) SR: 66.67
* Match 2: 3(5) SR: 60.0
* Match 3: 4(4) SR: 100.0

### Total Batting Statistics

* Total Runs: 13
* Total Balls: 18
* Average Runs: 4.33
* Overall Strike Rate: 72.22

### Batting Phase Stats

* **Powerplay**: 0(0) SR: 0
* **Middle**: 4(4) SR: 100.0
* **Death**: 2(2) SR: 100.0

### Wicket Analysis

* No wickets taken, no data available

### Batting Control

* In-control: 4
* No-control: 2

### Scoring Zones

* Mid-wicket: 1
* Keeper: 1
* Blocked: 1
* Point: 1
* Long-on: 1

### Boundary Analysis

* Boundary percentage: 16.67%

### Shot Selection

* No shot selection data available

**Bowling Statistics**

### Matches

* Match 1: 48(4.0) Econ: 12.0 Wkts: 2
* Match 2: 33(4.0) Econ: 8.2 Wkts: 3
* Match 3: 21(4.0) Econ: 5.2 Wkts: 3
* Match 4: 33(4.0) Econ: 8.2 Wkts: 4
* Match 5: 39(3.0) Econ: 13.0 Wkts: 1
* Match 6: 20(3.0) Econ: 6.7 W