In [82]:
import pandas as pd
import re
from collections import defaultdict, Counter

class EventParser:
    """Parse event-summary sentences into event dicts and tag shot assists."""

    # Compiled once at class creation rather than on every parse call.
    # Capture groups: 1 matchday, 2 outcome, 3 action type, 4 origin zone,
    # 5 acting player, 6/7 optional receiving player and zone, 8 team,
    # 9 minute, 10 second (captured but not used).
    _SUMMARY_PATTERN = re.compile(
        r"Matchday (\d+): (Successful|Unsuccessful) ([\w\s]+?) in zone (\w+) by player ([\w\s]+?)"
        r"(?: to player ([\w\s]+?) in zone (\w+))? for team ([\w\s]+) at minute (\d+):(\d+)"
    )

    @staticmethod
    def parse_summary(summary):
        """Turn one summary sentence into an event dict.

        Args:
            summary: The sentence to parse. Non-string values (for example
                the float NaN that pandas yields for an empty CSV cell) are
                treated as unparseable instead of raising ``TypeError``.

        Returns:
            A dict with the keys ``matchday``, ``success``, ``action_type``,
            ``zone_from``, ``player_from``, ``player_to``, ``zone_to``,
            ``team``, ``minute`` and ``assist``, or ``None`` when the text
            does not match the expected sentence format.
        """
        if not isinstance(summary, str):
            return None
        match = EventParser._SUMMARY_PATTERN.search(summary)
        if match is None:
            return None
        return {
            'matchday': int(match.group(1)),
            'success': match.group(2),
            'action_type': match.group(3),
            'zone_from': match.group(4),
            'player_from': match.group(5).strip(),
            # Events without a receiving player get the "N/A" placeholder.
            'player_to': match.group(6).strip() if match.group(6) else "N/A",
            # Without a destination zone the event stays in its origin zone.
            'zone_to': match.group(7).strip() if match.group(7) else match.group(4),
            'team': match.group(8).strip(),
            'minute': int(match.group(9)),
            'assist': None,  # Filled in later by identify_assists
        }

    @staticmethod
    def identify_assists(docs):
        """Tag shots that directly follow a same-team pass with the passer.

        An event whose action type starts with ``'Shot'`` receives
        ``assist = <passer>`` when the event immediately before it is a
        ``'Pass'`` by the same team at most one minute apart.

        Args:
            docs: List of event dicts in chronological order; the dicts are
                modified in place.

        Returns:
            The same ``docs`` list.
        """
        for current_action, next_action in zip(docs, docs[1:]):
            if (
                current_action['action_type'] == 'Pass'
                and next_action['action_type'].startswith('Shot')
                and current_action['team'] == next_action['team']
                and abs(next_action['minute'] - current_action['minute']) <= 1
            ):
                next_action['assist'] = current_action['player_from']
        return docs

# Load the CSV file holding one event summary sentence per row
file_path = 'csv-summaries/f24-100-2022-2288345-eventdetails.csv'
data = pd.read_csv(file_path)

# Parse the data: each summary is parsed exactly once, and rows for which the
# parser returns None (no match) are dropped.
events = [event for event in map(EventParser.parse_summary, data['Summary']) if event]
# Mark shots that directly follow a same-team pass with the passer's name.
events = EventParser.identify_assists(events)

# Initialize data structures to store counts
team_counts = defaultdict(lambda: defaultdict(int))  # team -> statistic name -> count
player_pairs = defaultdict(int)  # sorted (player, player) tuple -> successful passes
goal_times = []
goal_zones = defaultdict(list)
chance_zones = defaultdict(list)
chance_creators = defaultdict(list)
first_half_goals = 0
second_half_goals = 0
shots_per_period = [0] * 6  # six 15-minute buckets; the last one also takes later minutes
player_shots = defaultdict(int)
player_involvement = Counter()

# Process events to gather required information
for event in events:
    team = event['team']
    action = event['action_type']
    success = event['success'] == 'Successful'

    # Count player involvement (acting player, plus the receiver if any)
    player_involvement[event['player_from']] += 1
    if event['player_to'] != 'N/A':
        player_involvement[event['player_to']] += 1

    if action == 'Pass':
        if success:
            team_counts[team]['successful_passes'] += 1
        else:
            team_counts[team]['unsuccessful_passes'] += 1
    elif action == 'Offside Pass':
        team_counts[team]['offside_passes'] += 1
    elif action == 'Foul':
        team_counts[team]['fouls'] += 1
    elif action == 'Corner Awarded' and success:
        team_counts[team]['corners_awarded'] += 1
    elif action.startswith('Shot'):
        team_counts[team]['shots'] += 1
        player_shots[event['player_from']] += 1
        # Clamp at both ends: minute 0 would otherwise produce index -1 and
        # be counted in the last bucket, and minutes past 90 must not run
        # off the end of the list.
        period_index = max(0, min((event['minute'] - 1) // 15, 5))
        shots_per_period[period_index] += 1
    elif action in ['Tackle', 'Interception', 'Clearance']:
        team_counts[team]['defensive_actions'] += 1
    elif action == 'Goal':
        team_counts[team]['goals'] += 1
        goal_times.append(event['minute'])
        goal_zones[team].append(event['zone_from'])
        if event['minute'] <= 45:
            first_half_goals += 1
        else:
            second_half_goals += 1

    # Track successful passes between players; sorting the names makes
    # A -> B and B -> A count towards the same pair.
    if action == 'Pass' and success and event['player_to'] != 'N/A':
        pair = tuple(sorted([event['player_from'], event['player_to']]))
        player_pairs[pair] += 1

    # Track chances created: the assist is stored on the shot event, so the
    # recorded zone is the zone the shot was taken from.
    if event['assist']:
        chance_creators[team].append(event['assist'])
        chance_zones[team].append(event['zone_from'])
# Goal scorers (in event order) and goals per team, collected in one pass
scorers = []
goals_scored_by_team = defaultdict(int)
for event in events:
    if event['action_type'] != 'Goal':
        continue
    scorers.append(event['player_from'])
    goals_scored_by_team[event['team']] += 1

# (player, event count) for the player appearing in most events, if any
if player_involvement:
    (most_involved_player,) = player_involvement.most_common(1)
else:
    most_involved_player = None

# ((player, player), count) for the pair with most successful passes, if any
most_successful_passes_pair = None
if player_pairs:
    most_successful_passes_pair = max(player_pairs.items(), key=lambda pair_count: pair_count[1])

# Name of the player with most shots, if any shot was recorded
most_shots_player = None
if player_shots:
    most_shots_player = max(player_shots, key=lambda name: player_shots[name])

# Zone counts for Jeppe Grønning's own actions. Counter.most_common orders
# equal counts by first appearance, so ties are resolved deterministically.
jeppe_zone_counts = Counter(
    event['zone_from'] for event in events if event['player_from'] == 'Jeppe Grønning'
)

# Prepare answers to specific questions
answers = {
    "successful_passes": {team: counts['successful_passes'] for team, counts in team_counts.items()},
    "unsuccessful_passes": {team: counts['unsuccessful_passes'] for team, counts in team_counts.items()},
    # default= keeps an empty team_counts from raising ValueError; the answer is then None.
    "most_offside_passes": max(team_counts.items(), key=lambda item: item[1]['offside_passes'], default=(None, None))[0],
    "fouls": {team: counts['fouls'] for team, counts in team_counts.items()},
    "most_corners_awarded": max(team_counts.items(), key=lambda item: item[1]['corners_awarded'], default=(None, None))[0],
    "shots": {team: counts['shots'] for team, counts in team_counts.items()},
    "chances_created": {team: len(creators) for team, creators in chance_creators.items()},
    "result": goals_scored_by_team,
    "defensive_actions": {team: counts['defensive_actions'] for team, counts in team_counts.items()},
    "most_successful_passes_pair": most_successful_passes_pair,
    "scorers": scorers,
    "goal_times": goal_times,
    "goal_zones": goal_zones,
    "chance_zones": chance_zones,
    "chance_creators": chance_creators,
    "first_half_goals": first_half_goals,
    "second_half_goals": second_half_goals,
    "most_shots_period": shots_per_period,  # the full list of six per-period shot counts
    "aalborg_performance": team_counts['Aalborg BK'],
    "viborg_performance": team_counts['Viborg FF'],
    "most_involved_player": most_involved_player,
    "most_shots_player": most_shots_player,
    # Based on shot counts per period: periods 0-2 against periods 3-5; a tie reports the second half.
    "most_chances_half": "First Half" if sum(shots_per_period[:3]) > sum(shots_per_period[3:]) else "Second Half",
    "jeppe_gronning_zone": jeppe_zone_counts.most_common(1)[0][0] if jeppe_zone_counts else None
}

# Adjust answers for natural language responses
result_string = ", ".join([f"{team}: {goals} goals" for team, goals in answers['result'].items()])
goal_zones_string = ", ".join([f"{team}: {', '.join(zones)}" for team, zones in answers['goal_zones'].items()])
chance_zones_string = ", ".join([f"{team}: {', '.join(zones)}" for team, zones in answers['chance_zones'].items()])
chance_creators_string = ", ".join([f"{team}: {', '.join(creators)}" for team, creators in answers['chance_creators'].items()])


def _both_teams(counts, unit):
    """Format one count per team; a team missing from counts is shown as 0."""
    return ", ".join(f"{team}: {counts.get(team, 0)} {unit}" for team in ('Aalborg BK', 'Viborg FF'))


def _performance(team, stats):
    """Summarise all counted statistics of one team in a single sentence."""
    return (
        f"{team} had {stats['goals']} successful goals, {stats['successful_passes']} successful passes, "
        f"{stats['unsuccessful_passes']} unsuccessful passes, {stats['shots']} shots, "
        f"{stats['defensive_actions']} successful defensive actions, {stats['fouls']} fouls, "
        f"{stats['corners_awarded']} corners awarded, and {stats['offside_passes']} offside passes"
    )


# Chances created: derived from the computed counts instead of a fixed sentence.
chances_created = answers['chances_created']
top_chances = max(chances_created.values(), default=0)
chance_leaders = [team for team, count in chances_created.items() if count == top_chances]
if not chance_leaders:
    chances_answer = "No chances were created"
elif len(chance_leaders) == 1:
    chances_answer = f"{chance_leaders[0]} created the most chances with {top_chances} chances"
elif len(chance_leaders) == 2:
    chances_answer = f"Both teams created {top_chances} chances each"
else:
    chances_answer = f"{', '.join(chance_leaders)} created {top_chances} chances each"

# Busiest 15-minute period: pick the bucket with the highest shot count
# (the earliest one on ties) rather than always reporting the last bucket.
period_labels = ["0-15", "15-30", "30-45", "45-60", "60-75", "75-90"]
shots_by_period = answers['most_shots_period']
if any(shots_by_period):
    busiest_period = max(range(len(shots_by_period)), key=shots_by_period.__getitem__)
    most_shots_period_answer = (
        f"The most shots were taken in the period {period_labels[busiest_period]} minutes "
        f"with {shots_by_period[busiest_period]} shots"
    )
else:
    most_shots_period_answer = "No shots were taken"

# Both of these are None when nothing was recorded; avoid subscripting None.
best_pair = answers['most_successful_passes_pair']
if best_pair is None:
    best_pair_answer = "No successful passes between players were recorded"
else:
    (pair_player_a, pair_player_b), pair_passes = best_pair
    best_pair_answer = f"{pair_player_a} and {pair_player_b} with {pair_passes} successful passes"

top_involved = answers['most_involved_player']
if top_involved is None:
    most_involved_answer = "No player involvement was recorded"
else:
    most_involved_answer = f"{top_involved[0]} was most involved in the events with {top_involved[1]} events"

# Questions and answers are matched by position in the two lists.
qa_data_natural_language = {
    "Question": [
        "How many successful passes did each team have?",
        "How many unsuccessful passes were made by each team?",
        "Which team had the most offside passes?",
        "How many fouls were committed by each team?",
        "Which team was awarded the most corners?",
        "How many shots did each team have?",
        "Which team created the most chances?",
        "What was the result of the match?",
        "How many successful defensive actions did each team have?",
        "Which players had most successful passes between each other?",
        "Which players scored the goals?",
        "When were the goals scored?",
        "In which zones were the goals scored for each team?",
        "In which zones were the chances created for each team?",
        "Which players created the chances for each team?",
        "How many goals were scored in the first half of the match?",
        "How many goals were scored in the second half of the match?",
        "In a time period of 15 minutes, when was the most shots taken?",
        "How did Aalborg BK perform?",
        "How did Viborg FF perform?",
        "Which player was most involved in the events?",
        "Which player had most shots?",
        "In which half were the most chances created?",
        "In which zone did Jeppe Grønning have most actions?"
    ],
    "Answer": [
        _both_teams(answers['successful_passes'], "successful passes"),
        _both_teams(answers['unsuccessful_passes'], "unsuccessful passes"),
        f"{answers['most_offside_passes']} had the most offside passes",
        _both_teams(answers['fouls'], "fouls"),
        f"{answers['most_corners_awarded']} was awarded the most corners",
        _both_teams(answers['shots'], "shots"),
        chances_answer,
        result_string,
        _both_teams(answers['defensive_actions'], "successful defensive actions"),
        best_pair_answer,
        f"Goals were scored by {', '.join(answers['scorers'])}",
        f"Goals were scored in the minutes: {', '.join(map(str, answers['goal_times']))}",
        goal_zones_string,
        chance_zones_string,
        chance_creators_string,
        f"{answers['first_half_goals']} goals were scored in the first half",
        f"{answers['second_half_goals']} goals were scored in the second half",
        most_shots_period_answer,
        _performance("Aalborg BK", answers['aalborg_performance']),
        _performance("Viborg FF", answers['viborg_performance']),
        most_involved_answer,
        f"The player with the most shots was {answers['most_shots_player']}",
        f"The most chances were created in the {answers['most_chances_half']}",
        f"Jeppe Grønning had most actions in zone {answers['jeppe_gronning_zone']}"
    ]
}

# Destination of the question/answer table
output_file_path_natural_language = 'match_analysis_results_natural_language.csv'

# Build the two-column table and write it out without the index column
qa_df_natural_language = pd.DataFrame(data=qa_data_natural_language)
qa_df_natural_language.to_csv(path_or_buf=output_file_path_natural_language, index=False)

# Last expression of the cell: echoes the written path
output_file_path_natural_language


'match_analysis_results_natural_language.csv'

In [81]:
import pandas as pd
import re
from collections import defaultdict, Counter
import os

class EventParser:
    """Parse event-summary sentences into event dicts and tag shot assists."""

    # Compiled once at class creation rather than on every parse call.
    # Capture groups: 1 matchday, 2 outcome, 3 action type, 4 origin zone,
    # 5 acting player, 6/7 optional receiving player and zone, 8 team,
    # 9 minute, 10 second (captured but not used).
    _SUMMARY_PATTERN = re.compile(
        r"Matchday (\d+): (Successful|Unsuccessful) ([\w\s]+?) in zone (\w+) by player ([\w\s]+?)"
        r"(?: to player ([\w\s]+?) in zone (\w+))? for team ([\w\s]+) at minute (\d+):(\d+)"
    )

    @staticmethod
    def parse_summary(summary):
        """Turn one summary sentence into an event dict.

        Args:
            summary: The sentence to parse. Non-string values (for example
                the float NaN that pandas yields for an empty CSV cell) are
                treated as unparseable instead of raising ``TypeError``.

        Returns:
            A dict with the keys ``matchday``, ``success``, ``action_type``,
            ``zone_from``, ``player_from``, ``player_to``, ``zone_to``,
            ``team``, ``minute`` and ``assist``, or ``None`` when the text
            does not match the expected sentence format.
        """
        if not isinstance(summary, str):
            return None
        match = EventParser._SUMMARY_PATTERN.search(summary)
        if match is None:
            return None
        return {
            'matchday': int(match.group(1)),
            'success': match.group(2),
            'action_type': match.group(3),
            'zone_from': match.group(4),
            'player_from': match.group(5).strip(),
            # Events without a receiving player get the "N/A" placeholder.
            'player_to': match.group(6).strip() if match.group(6) else "N/A",
            # Without a destination zone the event stays in its origin zone.
            'zone_to': match.group(7).strip() if match.group(7) else match.group(4),
            'team': match.group(8).strip(),
            'minute': int(match.group(9)),
            'assist': None,  # Filled in later by identify_assists
        }

    @staticmethod
    def identify_assists(docs):
        """Tag shots that directly follow a same-team pass with the passer.

        An event whose action type starts with ``'Shot'`` receives
        ``assist = <passer>`` when the event immediately before it is a
        ``'Pass'`` by the same team at most one minute apart.

        Args:
            docs: List of event dicts in chronological order; the dicts are
                modified in place.

        Returns:
            The same ``docs`` list.
        """
        for current_action, next_action in zip(docs, docs[1:]):
            if (
                current_action['action_type'] == 'Pass'
                and next_action['action_type'].startswith('Shot')
                and current_action['team'] == next_action['team']
                and abs(next_action['minute'] - current_action['minute']) <= 1
            ):
                next_action['assist'] = current_action['player_from']
        return docs

# Load the first 18 CSV files from the specified directory
directory_path = 'csv-summaries/'  # Update this path if necessary

# List and sort the files based on their filenames
file_paths = sorted([os.path.join(directory_path, file) for file in os.listdir(directory_path) if file.endswith('.csv')])[:18]

all_events = []
for file_path in file_paths:
    data = pd.read_csv(file_path)
    # Parse each summary exactly once; rows for which the parser returns
    # None (no match) are dropped.
    events = [event for event in map(EventParser.parse_summary, data['Summary']) if event]
    # Assists are detected per file, before merging, so a pass at the end of
    # one file can never be paired with a shot at the start of the next.
    events = EventParser.identify_assists(events)
    all_events.extend(events)

# Initialize data structures to store counts
team_counts = defaultdict(lambda: defaultdict(int))  # team -> statistic name -> count
player_pairs = defaultdict(int)  # sorted (player, player) tuple -> successful passes
goal_times = []
goal_zones = defaultdict(list)
chance_zones = defaultdict(list)
chance_creators = defaultdict(list)
first_half_goals = 0
second_half_goals = 0
shots_per_period = [0] * 6  # six 15-minute buckets; the last one also takes later minutes
player_shots = defaultdict(int)
player_involvement = Counter()
player_interceptions = Counter()
player_cards = Counter()
team_saves = defaultdict(int)

# Process events to gather required information
for event in all_events:
    team = event['team']
    action = event['action_type']
    success = event['success'] == 'Successful'

    # Count player involvement (acting player, plus the receiver if any)
    player_involvement[event['player_from']] += 1
    if event['player_to'] != 'N/A':
        player_involvement[event['player_to']] += 1

    if action == 'Pass':
        if success:
            team_counts[team]['successful_passes'] += 1
        else:
            team_counts[team]['unsuccessful_passes'] += 1
    elif action == 'Offside Pass':
        team_counts[team]['offside_passes'] += 1
    elif action == 'Foul':
        team_counts[team]['fouls'] += 1
    elif action == 'Corner Awarded' and success:
        team_counts[team]['corners_awarded'] += 1
    elif action.startswith('Shot'):
        team_counts[team]['shots'] += 1
        player_shots[event['player_from']] += 1
        # Clamp at both ends: minute 0 would otherwise produce index -1 and
        # be counted in the last bucket, and minutes past 90 must not run
        # off the end of the list.
        period_index = max(0, min((event['minute'] - 1) // 15, 5))
        shots_per_period[period_index] += 1
    elif action in ['Tackle', 'Interception', 'Clearance']:
        team_counts[team]['defensive_actions'] += 1
        if action == 'Interception':
            player_interceptions[event['player_from']] += 1
    elif action == 'Goal':
        team_counts[team]['goals'] += 1
        goal_times.append(event['minute'])
        goal_zones[team].append(event['zone_from'])
        if event['minute'] <= 45:
            first_half_goals += 1
        else:
            second_half_goals += 1
    elif action == 'Card':
        player_cards[event['player_from']] += 1
    elif action == 'Save':
        team_saves[team] += 1

    # Track successful passes between players; sorting the names makes
    # A -> B and B -> A count towards the same pair.
    if action == 'Pass' and success and event['player_to'] != 'N/A':
        pair = tuple(sorted([event['player_from'], event['player_to']]))
        player_pairs[pair] += 1

    # Track chances created: the assist is stored on the shot event, so the
    # recorded zone is the zone the shot was taken from.
    if event['assist']:
        chance_creators[team].append(event['assist'])
        chance_zones[team].append(event['zone_from'])

# Goal scorers (in event order) and goals per team, collected in one pass
scorers = []
goals_scored_by_team = defaultdict(int)
for event in all_events:
    if event['action_type'] != 'Goal':
        continue
    scorers.append(event['player_from'])
    goals_scored_by_team[event['team']] += 1

# (player, event count) for the player appearing in most events, if any
if player_involvement:
    (most_involved_player,) = player_involvement.most_common(1)
else:
    most_involved_player = None

# ((player, player), count) for the pair with most successful passes, if any
most_successful_passes_pair = None
if player_pairs:
    most_successful_passes_pair = max(player_pairs.items(), key=lambda pair_count: pair_count[1])

# Name of the player with most shots, if any shot was recorded
most_shots_player = None
if player_shots:
    most_shots_player = max(player_shots, key=lambda name: player_shots[name])

def aggregate_data_for_range(matchday_start, matchday_end, events=None):
    """Aggregate per-team and per-player counts over a range of matchdays.

    Args:
        matchday_start: First matchday to include (inclusive).
        matchday_end: Last matchday to include (inclusive).
        events: Optional iterable of event dicts to aggregate. When omitted,
            the module-level ``all_events`` list is used.

    Returns:
        A dict with these entries:
            ``team_goals``: team -> number of 'Goal' events.
            ``player_shots``: player -> events whose action starts with 'Shot'.
            ``team_saves``: team -> number of 'Save' events.
            ``player_interceptions``: player -> number of 'Interception' events.
            ``team_corners``: team -> successful 'Corner Awarded' events.
            ``player_cards``: Counter of player -> 'Card' events.
            ``player_involvement``: Counter of player -> events the player
                acted in or received.
            ``teams_played``: team -> {matchday_start} for every team with at
                least one event on the first matchday of the range.
    """
    if events is None:
        events = all_events

    filtered_events = [event for event in events if matchday_start <= event['matchday'] <= matchday_end]
    team_goal_counts = defaultdict(int)
    player_shots_counts = defaultdict(int)
    team_saves_counts = defaultdict(int)
    player_interceptions_counts = defaultdict(int)
    team_corners_counts = defaultdict(int)
    player_cards_counts = Counter()
    player_involvement_counts = Counter()
    teams_played = defaultdict(set)

    for event in filtered_events:
        team = event['team']
        player = event['player_from']
        action = event['action_type']

        # Same involvement rule as the match-level loop: the acting player
        # and, when there is one, the receiving player both count.
        player_involvement_counts[player] += 1
        if event['player_to'] != 'N/A':
            player_involvement_counts[event['player_to']] += 1

        if action == 'Goal':
            team_goal_counts[team] += 1
        elif action.startswith('Shot'):
            player_shots_counts[player] += 1
        elif action == 'Save':
            team_saves_counts[team] += 1
        elif action == 'Interception':
            player_interceptions_counts[player] += 1
        elif action == 'Corner Awarded' and event['success'] == 'Successful':
            team_corners_counts[team] += 1
        elif action == 'Card':
            player_cards_counts[player] += 1

        # Only the first matchday of the range feeds the "teams played" answer.
        if event['matchday'] == matchday_start:
            teams_played[team].add(event['matchday'])

    return {
        'team_goals': team_goal_counts,
        'player_shots': player_shots_counts,
        'team_saves': team_saves_counts,
        'player_interceptions': player_interceptions_counts,
        'team_corners': team_corners_counts,
        'player_cards': player_cards_counts,
        'player_involvement': player_involvement_counts,
        'teams_played': teams_played,
    }

# Example usage: Aggregating data for matchday range 1 to 3
matchday_start = 1
matchday_end = 3
aggregated_data = aggregate_data_for_range(matchday_start, matchday_end)

# Find teams with the most goals (a list, because several teams can tie)
max_goals = max(aggregated_data['team_goals'].values(), default=0)
teams_with_max_goals = [team for team, goals in aggregated_data['team_goals'].items() if goals == max_goals]

# Brøndby IF goals per matchday within the range. Every matchday on which the
# team has at least one event gets an entry, starting at 0, so goalless
# matches are still reported. Event dicts carry no opponent information, so
# only the team's own goals can be given here.
brondby_goals_by_matchday = {}
for event in all_events:
    if event['team'] != 'Brøndby IF':
        continue
    if not (matchday_start <= event['matchday'] <= matchday_end):
        continue
    goals_so_far = brondby_goals_by_matchday.setdefault(event['matchday'], 0)
    if event['action_type'] == 'Goal':
        brondby_goals_by_matchday[event['matchday']] = goals_so_far + 1

# Specific questions
# NOTE(review): the three team_counts-based answers below cover every loaded
# event, not only the matchday range - confirm the loaded files match the range.
questions_answers = {
    "most_goals_team": teams_with_max_goals,
    "most_shots_player": max(aggregated_data['player_shots'], key=aggregated_data['player_shots'].get, default=None),
    "most_saves_team": max(aggregated_data['team_saves'], key=aggregated_data['team_saves'].get, default=None),
    "most_interceptions_player": max(aggregated_data['player_interceptions'], key=aggregated_data['player_interceptions'].get, default=None),
    "most_corners_team": max(aggregated_data['team_corners'], key=aggregated_data['team_corners'].get, default=None),
    "most_cards_player": max(aggregated_data['player_cards'], key=aggregated_data['player_cards'].get, default=None),
    "top_10_cards_players": aggregated_data['player_cards'].most_common(10),
    "teams_matchday_X": list(aggregated_data['teams_played'].keys()),
    "fc_midtjylland_performance": team_counts['FC Midtjylland'] if 'FC Midtjylland' in team_counts else None,
    "most_offside_passes_team": max(team_counts.items(), key=lambda item: item[1]['offside_passes'], default=(None,))[0],
    "most_successful_passes_team": max(team_counts.items(), key=lambda item: item[1]['successful_passes'], default=(None,))[0],
    "odense_chances_created": len([event for event in all_events if matchday_start <= event['matchday'] <= matchday_end and event['team'] == 'Odense Boldklub' and event['assist']]),
    "brondby_results": brondby_goals_by_matchday
}

# Convert the aggregated data into a DataFrame for analysis.
# Questions and answers are matched by position in the two lists.
qa_data = {
    "Question": [
        "Which team has scored most goals from Matchday 1 to Matchday 3?",
        "Which player has had most shots from Matchday 1 to Matchday 3?",
        "Which team has most saves from Matchday 1 to Matchday 3?",
        "Which player has most interceptions from Matchday 1 to Matchday 3?",
        "Which team has most corners awarded from Matchday 1 to Matchday 3?",
        "Which player received most cards from Matchday 1 to Matchday 3?",
        "Which 10 players received most cards from Matchday 1 to Matchday 3?",
        "Which teams played each other in Matchday 1?",
        "How did FC Midtjylland perform from Matchday 1 to Matchday 3?",
        "Which team had the most offside passes from Matchday 1 to Matchday 3?",
        "Which team had the most successful passes from Matchday 1 to Matchday 3?",
        "How many chances did Odense Boldklub create from Matchday 1 to Matchday 3?",
        "What was the result of Brøndby IF's matches from Matchday 1 to Matchday 3?"
    ],
    "Answer": [
        # The top scorers are a list (ties possible); join it like the other
        # multi-value answers so the CSV holds plain text, not a list repr.
        ", ".join(questions_answers["most_goals_team"]),
        questions_answers["most_shots_player"],
        questions_answers["most_saves_team"],
        questions_answers["most_interceptions_player"],
        questions_answers["most_corners_team"],
        questions_answers["most_cards_player"],
        ", ".join([f"{player}: {count}" for player, count in questions_answers["top_10_cards_players"]]),
        ", ".join(questions_answers["teams_matchday_X"]),
        f"FC Midtjylland had {questions_answers['fc_midtjylland_performance']['goals']} successful goals, {questions_answers['fc_midtjylland_performance']['shots']} shots, {questions_answers['fc_midtjylland_performance']['defensive_actions']} successful defensive actions, {questions_answers['fc_midtjylland_performance']['fouls']} fouls, {questions_answers['fc_midtjylland_performance']['corners_awarded']} corners awarded, and {questions_answers['fc_midtjylland_performance']['offside_passes']} offside passes" if questions_answers['fc_midtjylland_performance'] else "No data for FC Midtjylland",
        questions_answers["most_offside_passes_team"],
        questions_answers["most_successful_passes_team"],
        questions_answers["odense_chances_created"],
        ", ".join([f"Matchday {md}: {result}" for md, result in questions_answers["brondby_results"].items()])
    ]
}

# Destination of the question/answer table
output_file_path = 'match_analysis_results_from_matchday_1_to_3.csv'

# Build the two-column table and write it out without the index column
qa_df = pd.DataFrame(data=qa_data)
qa_df.to_csv(path_or_buf=output_file_path, index=False)

# Last expression of the cell: echoes the written path
output_file_path

'match_analysis_results_from_matchday_1_to_3.csv'