In [7]:
import pandas as pd
import re
from collections import defaultdict

class EventParser:
    @staticmethod
    def parse_summary(summary):
        pattern = (
            r"Matchday (\d+): (Successful|Unsuccessful) ([\w\s]+?) in zone (\w+) by player ([\w\s]+?)"
            r"(?: to player ([\w\s]+?) in zone (\w+))? for team ([\w\s]+) at minute (\d+):(\d+)"
        )
        match = re.search(pattern, summary)
        if match:
            action_type = match.group(3)
            return {
                'matchday': int(match.group(1)),
                'success': match.group(2),
                'action_type': action_type,
                'zone_from': match.group(4),
                'player_from': match.group(5).strip(),
                'player_to': match.group(6).strip() if match.group(6) else "N/A",
                'zone_to': match.group(7).strip() if match.group(7) else match.group(4),
                'team': match.group(8).strip(),
                'minute': int(match.group(9)),
                'assist': None  # Initially, assist is None
            }
        return None

    @staticmethod
    def identify_assists(docs):
        for i in range(len(docs) - 1):
            current_action = docs[i]
            next_action = docs[i + 1]
            if current_action['action_type'] == 'Pass' and \
               next_action['action_type'].startswith('Shot') and \
               current_action['team'] == next_action['team'] and \
               abs(next_action['minute'] - current_action['minute']) <= 1:
                next_action['assist'] = current_action['player_from']
        return docs

# Load the CSV file
file_path = '/Users/jesperpilegaard/Desktop/Superliga 2022-2023/csv-summaries/f24-100-2022-2288345-eventdetails.csv'
data = pd.read_csv(file_path)

# Parse the data
events = [EventParser.parse_summary(summary) for summary in data['Summary'] if EventParser.parse_summary(summary)]
events = EventParser.identify_assists(events)

# Initialize data structures to store counts
team_counts = defaultdict(lambda: defaultdict(int))
player_pairs = defaultdict(int)
goal_times = []
goal_zones = defaultdict(list)
chance_zones = defaultdict(list)
chance_creators = defaultdict(list)
first_half_goals = 0
second_half_goals = 0
shots_per_period = [0] * 6
player_shots = defaultdict(int)

# Process events to gather required information
for event in events:
    team = event['team']
    action = event['action_type']
    success = event['success'] == 'Successful'
    
    if action == 'Pass':
        if success:
            team_counts[team]['successful_passes'] += 1
        else:
            team_counts[team]['unsuccessful_passes'] += 1
    elif action == 'Offside Pass':
        team_counts[team]['offside_passes'] += 1
    elif action == 'Foul':
        team_counts[team]['fouls'] += 1
    elif action == 'Corner Awarded':
        team_counts[team]['corners_awarded'] += 1
    elif action.startswith('Shot'):
        team_counts[team]['shots'] += 1
        player_shots[event['player_from']] += 1
        period_index = min((event['minute'] - 1) // 15, 5)
        shots_per_period[period_index] += 1
    elif action in ['Tackle', 'Interception', 'Clearance']:
        team_counts[team]['defensive_actions'] += 1
    elif action == 'Goal':
        goal_times.append(event['minute'])
        goal_zones[team].append(event['zone_from'])
        if event['minute'] <= 45:
            first_half_goals += 1
        else:
            second_half_goals += 1
    
    # Track successful passes between players
    if action == 'Pass' and success and event['player_to'] != 'N/A':
        pair = tuple(sorted([event['player_from'], event['player_to']]))
        player_pairs[pair] += 1
    
    # Track chances created
    if event['assist']:
        chance_creators[team].append(event['assist'])
        chance_zones[team].append(event['zone_from'])

# Identify players who scored goals
scorers = [event['player_from'] for event in events if event['action_type'] == 'Goal']

# Determine the most involved player
most_involved_player = max(player_pairs, key=player_pairs.get) if player_pairs else None

# Determine the pair with most successful passes, ensuring it handles None values properly
most_successful_passes_pair = max(player_pairs.items(), key=lambda item: item[1]) if player_pairs else None

# Identify the player with the most shots
most_shots_player = max(player_shots, key=player_shots.get) if player_shots else None

# Calculate the match result
goals_scored_by_team = defaultdict(int)
for event in events:
    if event['action_type'] == 'Goal':
        goals_scored_by_team[event['team']] += 1

# Prepare answers to specific questions
answers = {
    "successful_passes": {team: counts['successful_passes'] for team, counts in team_counts.items()},
    "unsuccessful_passes": {team: counts['unsuccessful_passes'] for team, counts in team_counts.items()},
    "most_offside_passes": max(team_counts.items(), key=lambda item: item[1]['offside_passes'])[0],
    "fouls": {team: counts['fouls'] for team, counts in team_counts.items()},
    "most_corners_awarded": max(team_counts.items(), key=lambda item: item[1]['corners_awarded'])[0],
    "shots": {team: counts['shots'] for team, counts in team_counts.items()},
    "chances_created": {team: len(creators) for team, creators in chance_creators.items()},
    "result": goals_scored_by_team,
    "defensive_actions": {team: counts['defensive_actions'] for team, counts in team_counts.items()},
    "most_successful_passes_pair": most_successful_passes_pair,
    "scorers": scorers,
    "goal_times": goal_times,
    "goal_zones": goal_zones,
    "chance_zones": chance_zones,
    "chance_creators": chance_creators,
    "first_half_goals": first_half_goals,
    "second_half_goals": second_half_goals,
    "most_shots_period": shots_per_period,
    "aalborg_performance": team_counts['Aalborg BK'],
    "viborg_performance": team_counts['Viborg FF'],
    "most_involved_player": most_involved_player,
    "most_shots_player": most_shots_player,
    "most_chances_half": "First Half" if sum(shots_per_period[:3]) > sum(shots_per_period[3:]) else "Second Half",
    "jeppe_gronning_zone": max(set(event['zone_from'] for event in events if event['player_from'] == 'Jeppe Grønning'), key=(lambda zone: sum(event['zone_from'] == zone for event in events if event['player_from'] == 'Jeppe Grønning'))) if any(event['player_from'] == 'Jeppe Grønning' for event in events) else None
}

# Adjust answers for natural language responses
result_string = ", ".join([f"{team}: {goals} goals" for team, goals in answers['result'].items()])
goal_zones_string = ", ".join([f"{team}: {', '.join(zones)}" for team, zones in answers['goal_zones'].items()])
chance_zones_string = ", ".join([f"{team}: {', '.join(zones)}" for team, zones in answers['chance_zones'].items()])
chance_creators_string = ", ".join([f"{team}: {', '.join(creators)}" for team, creators in answers['chance_creators'].items()])

qa_data_natural_language = {
    "Question": [
        "How many successful passes did each team have?",
        "How many unsuccessful passes were made by each team?",
        "Which team had the most offside passes?",
        "How many fouls were committed by each team?",
        "Which team was awarded the most corners?",
        "How many shots did each team have?",
        "Which team created the most chances?",
        "What was the result of the match?",
        "How many fouls did each team have?",
        "How many successful defensive actions did each team have?",
        "Which players had most successful passes between each other?",
        "Which players scored the goals?",
        "When were the goals scored?",
        "In which zones were the goals scored for each team?",
        "In which zones were the chances created for each team?",
        "Which players created the chances for each team?",
        "How many goals were scored in the first half of the match?",
        "How many goals were scored in the second half of the match?",
        "In a time period of 15 minutes, when was the most shots taken?",
        "How did Aalborg BK perform?",
        "How did Viborg FF perform?",
        "Which player was most involved in the events?",
        "Which player had most shots?",
        "In which half were the most chances created?",
        "In which zone did Jeppe Grønning have most actions?"
    ],
    "Answer": [
        f"Aalborg BK: {answers['successful_passes']['Aalborg BK']} successful passes, Viborg FF: {answers['successful_passes']['Viborg FF']} successful passes",
        f"Aalborg BK: {answers['unsuccessful_passes']['Aalborg BK']} unsuccessful passes, Viborg FF: {answers['unsuccessful_passes']['Viborg FF']} unsuccessful passes",
        f"{answers['most_offside_passes']} had the most offside passes",
        f"Aalborg BK: {answers['fouls']['Aalborg BK']} fouls, Viborg FF: {answers['fouls']['Viborg FF']} fouls",
        f"{answers['most_corners_awarded']} was awarded the most corners",
        f"Aalborg BK: {answers['shots']['Aalborg BK']} shots, Viborg FF: {answers['shots']['Viborg FF']} shots",
        f"Both teams created 8 chances each",
        result_string,
        f"Aalborg BK: {answers['fouls']['Aalborg BK']} fouls, Viborg FF: {answers['fouls']['Viborg FF']} fouls",
        f"Aalborg BK: {answers['defensive_actions']['Aalborg BK']} successful defensive actions, Viborg FF: {answers['defensive_actions']['Viborg FF']} successful defensive actions",
        f"{answers['most_successful_passes_pair'][0][0]} and {answers['most_successful_passes_pair'][0][1]} with {answers['most_successful_passes_pair'][1]} successful passes",
        f"Goals were scored by {', '.join(answers['scorers'])}",
        f"Goals were scored in the minutes: {', '.join(map(str, answers['goal_times']))}",
        goal_zones_string,
        chance_zones_string,
        chance_creators_string,
        f"{answers['first_half_goals']} goals were scored in the first half",
        f"{answers['second_half_goals']} goals were scored in the second half",
        f"The most shots were taken in the period 75-90 minutes with {answers['most_shots_period'][5]} shots",
        f"Aalborg BK had {answers['aalborg_performance']['successful_passes']} successful passes, {answers['aalborg_performance']['unsuccessful_passes']} unsuccessful passes, {answers['aalborg_performance']['shots']} shots, {answers['aalborg_performance']['defensive_actions']} successful defensive actions, {answers['aalborg_performance']['fouls']} fouls, {answers['aalborg_performance']['corners_awarded']} corners awarded, and {answers['aalborg_performance']['offside_passes']} offside passes",
        f"Viborg FF had {answers['viborg_performance']['successful_passes']} successful passes, {answers['viborg_performance']['unsuccessful_passes']} unsuccessful passes, {answers['viborg_performance']['shots']} shots, {answers['viborg_performance']['defensive_actions']} successful defensive actions, {answers['viborg_performance']['fouls']} fouls, {answers['viborg_performance']['corners_awarded']} corners awarded, and {answers['viborg_performance']['offside_passes']} offside passes",
        f"{answers['most_involved_player'][0]} and {answers['most_involved_player'][1]} were most involved in the events",
        f"The player with the most shots was {answers['most_shots_player']}",
        f"The most chances were created in the {answers['most_chances_half']}",
        f"Jeppe Grønning had most actions in zone {answers['jeppe_gronning_zone']}"
    ]
}

# Create a DataFrame
qa_df_natural_language = pd.DataFrame(qa_data_natural_language)

# Save the DataFrame to a CSV file
output_file_path_natural_language = 'match_analysis_results_natural_language.csv'
qa_df_natural_language.to_csv(output_file_path_natural_language, index=False)

output_file_path_natural_language



'match_analysis_results_natural_language.csv'

In [19]:
import pandas as pd

# Load the CSV files
match_analysis = pd.read_csv('/Users/jesperpilegaard/Desktop/Superliga 2022-2023/ground_truth.csv')
query_results = pd.read_csv('/Users/jesperpilegaard/Desktop/Superliga 2022-2023/query_results_gpt-3.5_multi.csv')

# Print the column names to verify the presence of 'Question'
print("Columns in match_analysis:", match_analysis.columns)
print("Columns in query_results:", query_results.columns)

# Correct the phrasing in query_results to match match_analysis
query_results.loc[query_results['Question'] == 'When was the goals scored?', 'Question'] = 'When were the goals scored?'

# Merge the dataframes based on the Question column
merged_df = pd.merge(match_analysis, query_results, on='Question')

# Rename the columns for clarity
merged_df = merged_df.rename(columns={
    'Answer_x': 'Expected Answer',
    'Answer_y': 'GPT-3.5-turbo Answer',
    'Response Time (s)': 'Response Time'
})

# Save the merged dataframe to a new CSV file
merged_df.to_csv('merged_analysis_results_gpt-3.5_multi.csv', index=False)

print("Merging complete. The merged file is saved as 'merged_analysis_results.csv'.")


Columns in match_analysis: Index(['Question;Expected answer;;;;;'], dtype='object')
Columns in query_results: Index(['Question', 'Answer', 'Response Time (s)'], dtype='object')


KeyError: 'Question'

In [12]:
import pandas as pd
import numpy as np

# Load the CSV file
df = pd.read_csv('merged_analysis_results_phi3.csv')

# Column names based on actual column names in your CSV
ground_truth_column = 'Expected Answer'
predicted_answer_column = 'Phi3 Answer'

# Preprocess text
def preprocess(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = ''.join(e for e in text if e.isalnum() or e.isspace())
    return text.strip()

df[ground_truth_column] = df[ground_truth_column].apply(preprocess)
df[predicted_answer_column] = df[predicted_answer_column].apply(preprocess)

# Calculate F1-Score for each row
def calculate_f1(ground_truth, predicted_answer):
    if not ground_truth and not predicted_answer:
        return 1.0
    if not ground_truth or not predicted_answer:
        return 0.0
    ground_truth_tokens = set(ground_truth.split())
    predicted_answer_tokens = set(predicted_answer.split())
    common_tokens = ground_truth_tokens & predicted_answer_tokens
    if len(common_tokens) == 0:
        return 0.0
    precision = len(common_tokens) / len(predicted_answer_tokens)
    recall = len(common_tokens) / len(ground_truth_tokens)
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

df['f1_score'] = df.apply(lambda row: calculate_f1(row[ground_truth_column], row[predicted_answer_column]), axis=1)

# Overall metrics
average_f1_score = df['f1_score'].mean()
average_response_time = df['Response Time'].mean()

print(f'Average Response Time: {average_response_time:.4f} seconds')
print(f'Average F1-Score: {average_f1_score:.4f}')

# Save the DataFrame to a CSV file
df.to_csv('evaluation_results_phi3.csv', index=False)
print("Evaluation results saved to evaluation_results.csv")

# Display the DataFrame
from IPython.display import display
display(df)


Average Response Time: 980.5208 seconds
Average F1-Score: 0.1215
Evaluation results saved to evaluation_results.csv


Unnamed: 0,Question,Expected Answer,Phi3 Answer,Response Time,f1_score
0,How many successful passes did each team have?,aalborg bk 333 successful passes viborg ff 410...,to determine the number of successful passes f...,459.775963,0.043478
1,How many unsuccessful passes were made by each...,aalborg bk 101 unsuccessful passes viborg ff 9...,to determine the number of unsuccessful passes...,151.559957,0.037037
2,Which team had the most offside passes?,viborg ff had the most offside passes,the team that had the most offside pass events...,88.198881,0.333333
3,How many fouls were committed by each team?,aalborg bk 18 fouls viborg ff 17 fouls,unfortunately the provided information does no...,74.020198,0.031746
4,Which team was awarded the most corners?,aalborg bk was awarded the most corners,the team that was awarded the most corners is ...,230.177609,0.08547
5,How many shots did each team have?,aalborg bk 12 shots viborg ff 13 shots,to determine the number of successful and unsu...,770.807669,0.013514
6,Which team created the most chances?,both teams created 8 chances each,to determine which team created the most chanc...,11266.451282,0.040404
7,What was the result of the match?,viborg ff 2 goals aalborg bk 1 goals,as an ai i cannot provide realtime results or ...,80.270446,0.0
8,How many fouls did each team have?,aalborg bk 18 fouls viborg ff 17 fouls,unfortunately the provided information does no...,76.429128,0.033898
9,How many successful defensive actions did each...,aalborg bk 58 successful defensive actions vib...,to determine the number of successful defensiv...,1935.613834,0.043165
