# Set Up

In [60]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain_core.documents import Document
from dotenv import load_dotenv
import os
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
import pandas as pd
from langchain.agents import AgentType
from langchain_community.tools.pubmed.tool import PubmedQueryRun
from langchain_community.tools.youtube.search import YouTubeSearchTool
import openai
import pandas as pd
from langchain.agents import Tool
from pathlib import Path
from langchain_core.tools import tool
from typing import Dict, List
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.prompts import ChatPromptTemplate
from langchain.chains.sequential import SimpleSequentialChain
# Location of the per-effort metrics CSV. Set the HIA_DATA_PATH environment
# variable to run the notebook on another machine; the fallback is the path
# this notebook was originally written against.
DATA_PATH = Path(os.environ.get(
    "HIA_DATA_PATH",
    r"C:\Users\j.mundondo\OneDrive - Statsports\Desktop\statsportsdoc\Projects\frequency_chat_PH\data\individual_efforts\david_only_metrics.csv",
))
df = pd.read_csv(DATA_PATH)

In [61]:
df.dtypes

Unnamed: 0                                  int64
High Intensity Activity Type               object
Start Time                                 object
End Time                                   object
Time Since Last                            object
Duration                                   object
Distance                                  float64
Magnitude                                 float64
Avg Metabolic Power                       float64
Dynamic Stress Load                       float64
Duration_seconds                          float64
Long_sprint                                 int64
Preceding High Intensity Activity Type     object
dtype: object

# Calculate the Actual Start Time at Sub-Second Granularity

In [62]:
from datetime import datetime, timedelta

def standardise_start_times(df):
    """
    Derive a sub-second start timestamp for every action from its end time and duration.

    'Start Time' supplies the date (its first 11 characters, 'dd/mm/YYYY ') and,
    when it also carries an 'HH:MM' part, is used to detect actions that end
    after midnight.

    Args:
    df: DataFrame with 'Start Time' ('dd/mm/YYYY HH:MM' strings), 'End Time'
        ('HH:MM:SS' strings) and 'Duration' (colon-separated, e.g. 'MM:SS.s')
        columns.

    Returns:
    A copy of df in which 'End Time' is a datetime column, a new
    'Calculated Start Time' column (End Time - Duration) has been inserted at
    position 3, the 'Unnamed: 0' column (if present) has been dropped and the
    index has been reset. The input frame itself is not modified.
    """
    def parse_duration(duration_str):
        # Fold the colon-separated fields base-60 so that 'SS.s', 'MM:SS.s'
        # and 'HH:MM:SS.s' all convert to seconds.
        total = 0.0
        for field in duration_str.split(':'):
            total = total * 60 + float(field)
        return total

    # Work on a copy so the caller's frame is left untouched
    df = df.copy()

    start_text = df['Start Time']

    # 'End Time' carries no date of its own, so borrow the date from 'Start Time'
    end_time = pd.to_datetime(start_text.str[:11] + df['End Time'],
                              format='%d/%m/%Y %H:%M:%S')

    # An action that starts just before midnight and ends just after it would
    # otherwise be stamped on the start date, i.e. almost 24 h too early.
    # Unparseable start strings become NaT and never trigger the correction.
    start_minute = pd.to_datetime(start_text.str[:16],
                                  format='%d/%m/%Y %H:%M', errors='coerce')
    crossed_midnight = (start_minute - end_time) > pd.Timedelta(hours=12)
    df['End Time'] = end_time.where(~crossed_midnight,
                                    end_time + pd.Timedelta(days=1))

    # Convert durations to timedelta
    durations = pd.to_timedelta(df['Duration'].apply(parse_duration), unit='s')

    # Precise start time = end time - duration
    df.insert(3, 'Calculated Start Time', df['End Time'] - durations)
    if 'Unnamed: 0' in df.columns:
        df = df.drop('Unnamed: 0', axis=1)
    return df.reset_index(drop=True)

df = standardise_start_times(df)
df

Unnamed: 0,High Intensity Activity Type,Start Time,Calculated Start Time,End Time,Time Since Last,Duration,Distance,Magnitude,Avg Metabolic Power,Dynamic Stress Load,Duration_seconds,Long_sprint,Preceding High Intensity Activity Type
0,Acceleration,07/01/2025 19:58,2025-01-07 19:58:34.300,2025-01-07 19:58:35,00:08.6,00:00.7,0.74,5.39,85.89,0.00,0.7,0,First Action
1,Deceleration,07/01/2025 19:58,2025-01-07 19:58:35.600,2025-01-07 19:58:37,00:02.0,00:01.4,0.63,3.34,2.39,0.00,1.4,0,Acceleration
2,Acceleration,07/01/2025 20:00,2025-01-07 20:00:13.600,2025-01-07 20:00:15,00:01.7,00:01.4,5.06,3.69,37.35,0.09,1.4,0,Deceleration
3,Deceleration,07/01/2025 20:00,2025-01-07 20:00:17.300,2025-01-07 20:00:19,00:04.1,00:01.7,1.31,3.51,27.38,0.03,1.7,0,Acceleration
4,Acceleration,07/01/2025 20:01,2025-01-07 20:01:10.700,2025-01-07 20:01:14,00:07.0,00:03.3,25.68,5.07,107.82,4.71,3.3,0,Deceleration
...,...,...,...,...,...,...,...,...,...,...,...,...,...
181,Acceleration,07/01/2025 21:49,2025-01-07 21:49:55.200,2025-01-07 21:49:58,00:05.9,00:02.8,13.16,3.86,69.68,1.29,2.8,0,Acceleration
182,Deceleration,07/01/2025 21:49,2025-01-07 21:49:58.300,2025-01-07 21:50:00,00:03.7,00:01.7,1.73,4.92,9.43,0.04,1.7,0,Acceleration
183,Deceleration,07/01/2025 21:50,2025-01-07 21:50:20.100,2025-01-07 21:50:22,00:12.9,00:01.9,5.04,3.07,34.48,0.18,1.9,0,Deceleration
184,Acceleration,07/01/2025 21:50,2025-01-07 21:50:22.100,2025-01-07 21:50:23,00:03.2,00:00.9,3.32,3.66,44.44,0.06,0.9,0,Deceleration


In [63]:
def add_time_since_last_any_action(df):
    """
    Add a 'Time Since Last Any Action' column: the gap between the previous
    row's 'End Time' and this row's 'Calculated Start Time'.

    The gap is rendered as an 'MM:SS.s' string and inserted at column
    position 6; the first row has no predecessor and gets None. A copy is
    returned, the input frame is not modified.
    """
    def _format_gap(gap):
        # Missing gap (first row) stays empty
        if pd.isna(gap):
            return None
        total = gap.total_seconds()
        return f"{int(total//60):02d}:{total%60:04.1f}"

    result = df.copy()

    # (next start - this end), moved down one row, is
    # (this start - previous end) for every row
    next_start_minus_end = result['Calculated Start Time'].shift(-1) - result['End Time']
    result['Placeholder'] = next_start_minus_end.shift()

    formatted = result['Placeholder'].apply(_format_gap)
    result.insert(6, 'Time Since Last Any Action', formatted)

    # The raw timedelta column was only a scratch value
    return result.drop('Placeholder', axis=1)
df = add_time_since_last_any_action(df)
df

Unnamed: 0,High Intensity Activity Type,Start Time,Calculated Start Time,End Time,Time Since Last,Duration,Time Since Last Any Action,Distance,Magnitude,Avg Metabolic Power,Dynamic Stress Load,Duration_seconds,Long_sprint,Preceding High Intensity Activity Type
0,Acceleration,07/01/2025 19:58,2025-01-07 19:58:34.300,2025-01-07 19:58:35,00:08.6,00:00.7,,0.74,5.39,85.89,0.00,0.7,0,First Action
1,Deceleration,07/01/2025 19:58,2025-01-07 19:58:35.600,2025-01-07 19:58:37,00:02.0,00:01.4,00:00.6,0.63,3.34,2.39,0.00,1.4,0,Acceleration
2,Acceleration,07/01/2025 20:00,2025-01-07 20:00:13.600,2025-01-07 20:00:15,00:01.7,00:01.4,01:36.6,5.06,3.69,37.35,0.09,1.4,0,Deceleration
3,Deceleration,07/01/2025 20:00,2025-01-07 20:00:17.300,2025-01-07 20:00:19,00:04.1,00:01.7,00:02.3,1.31,3.51,27.38,0.03,1.7,0,Acceleration
4,Acceleration,07/01/2025 20:01,2025-01-07 20:01:10.700,2025-01-07 20:01:14,00:07.0,00:03.3,00:51.7,25.68,5.07,107.82,4.71,3.3,0,Deceleration
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181,Acceleration,07/01/2025 21:49,2025-01-07 21:49:55.200,2025-01-07 21:49:58,00:05.9,00:02.8,00:16.2,13.16,3.86,69.68,1.29,2.8,0,Acceleration
182,Deceleration,07/01/2025 21:49,2025-01-07 21:49:58.300,2025-01-07 21:50:00,00:03.7,00:01.7,00:00.3,1.73,4.92,9.43,0.04,1.7,0,Acceleration
183,Deceleration,07/01/2025 21:50,2025-01-07 21:50:20.100,2025-01-07 21:50:22,00:12.9,00:01.9,00:20.1,5.04,3.07,34.48,0.18,1.9,0,Deceleration
184,Acceleration,07/01/2025 21:50,2025-01-07 21:50:22.100,2025-01-07 21:50:23,00:03.2,00:00.9,00:00.1,3.32,3.66,44.44,0.06,0.9,0,Deceleration


In [64]:
df['High Intensity Activity Type'].value_counts()

Acceleration    89
Deceleration    86
Sprint          11
Name: High Intensity Activity Type, dtype: int64

In [65]:
df['Dynamic Stress Load'].max()

6.77

In [66]:
len(df[df['High Intensity Activity Type']=='Sprint'])

11

In [67]:
def find_most_common_actions(relevant_colum: str,top_n: int = 3) -> Dict:
    """
    Report the most frequent values in one column of the notebook-level `df`.

    Args:
        relevant_colum: Name of the column to count values in, e.g. for
            "most common action" questions this is `High Intensity Activity Type`
        top_n: How many of the most frequent values to report (default: 3)

    Returns:
        Dictionary with
        - top_actions: value -> {'count', 'percentage'}, where the percentage
          is relative to the total number of rows in `df`
        - total_actions: number of rows in `df`
        - unique_actions: number of distinct values reported by value_counts()

    Example Questions:
        * What's the most common action?
        * What are the top 3 most frequent activities?
        * Which high intensity actions happen most often?
        * Show me the action frequency distribution
    """
    n_rows = len(df)
    frequency = df[relevant_colum].value_counts()

    # value_counts() is already sorted most-frequent first
    summary = {}
    for label, n in frequency.head(top_n).items():
        summary[label] = {
            'count': int(n),
            'percentage': float((n/n_rows * 100))
        }

    return {
        'top_actions': summary,
        'total_actions': n_rows,
        'unique_actions': len(frequency)
    }
result = find_most_common_actions(relevant_colum='High Intensity Activity Type', top_n=1)
print(result)

{'top_actions': {'Acceleration': {'count': 89, 'percentage': 47.8494623655914}}, 'total_actions': 186, 'unique_actions': 3}


# Get Start Time Granularity

# Q1 : What are the most common sequences of high-intensity activities, including accelerations, sprints, and decelerations?  

In [68]:
#@tool
from langchain.tools import tool
import pandas as pd

@tool
def most_common_event_sequences(
    n_actions: int = 3,
    max_time_between: float = 10.0
) -> Dict:
    """
    Find most common sequences of high-intensity activities in the DataFrame.

    A sequence is a run of `n_actions` consecutive rows in which every gap
    ('Time Since Last Any Action') after the first row is at most
    `max_time_between` seconds. Overlapping windows are all counted.

    Args:
        n_actions: Number of consecutive actions to look for in sequences (default: 3)
        max_time_between: Maximum time in seconds between actions to be considered a sequence (default: 10.0)

    Returns:
        Dictionary containing sequences, their counts, and percentages
        (plain Python values, most frequent first). All three lists are
        empty when no sequence of the requested length exists.

    Example Questions Where this function would be useful :
        * What are the most common 2 action sequences of high-intensity activities in 5 seconds timespans?

    """
    def parse_time_to_seconds(time_str):
        # A missing gap (e.g. the very first action) counts as 0 s, so it
        # never breaks a chain.
        if pd.isna(time_str):
            return 0
        minutes, seconds = time_str.split(':')
        return float(minutes) * 60 + float(seconds)

    # Note: uses the notebook-level `df`
    gaps = df['Time Since Last Any Action'].apply(parse_time_to_seconds)
    labels = df['High Intensity Activity Type']

    sequences = []
    current_sequence = []
    for gap, label in zip(gaps, labels):
        # Too long since the previous action: start a new chain at this row
        if gap > max_time_between:
            current_sequence = []

        current_sequence.append(label)

        # Record the sliding window formed by the last n_actions of the chain
        if len(current_sequence) >= n_actions:
            sequences.append(tuple(current_sequence[-n_actions:]))

    if not sequences:
        return {'sequences': [], 'counts': [], 'percentages': []}

    sequence_counts = pd.Series(sequences).value_counts()
    percentages = (sequence_counts / len(sequences) * 100).round(2)

    # .tolist() yields built-in int/float instead of numpy scalars, so the
    # result is JSON-serialisable and prints cleanly in the tool observation.
    return {
        'sequences': list(sequence_counts.index),
        'counts': sequence_counts.tolist(),
        'percentages': percentages.tolist()
    }
#most_common_event_sequences(df, n_actions=4)

### Function Version 

In [72]:
def most_common_event_sequences_test(df, n_actions=3, max_time_between=10.0):
    """
    Count the most common runs of `n_actions` consecutive high-intensity activities.

    A chain is restarted whenever a row's 'Time Since Last Any Action' exceeds
    `max_time_between` seconds; every sliding window of `n_actions` rows inside
    a chain is counted. Returns a DataFrame with 'Sequence', 'Count' and
    'Percentage' columns, most frequent first.
    """
    def _to_seconds(stamp):
        # Missing gap (first row) is treated as 0 s
        if pd.isna(stamp):
            return 0
        mins, secs = stamp.split(':')
        return float(mins) * 60 + float(secs)

    gaps = df['Time Since Last Any Action'].apply(_to_seconds)
    labels = df['High Intensity Activity Type']

    windows = []
    run = []
    for gap, label in zip(gaps, labels):
        # Gap too long: this row opens a fresh chain
        if gap > max_time_between:
            run = []
        run.append(label)
        if len(run) >= n_actions:
            windows.append(tuple(run[-n_actions:]))

    # Tally identical windows
    tally = pd.Series(windows).value_counts()
    share = (tally.values / len(windows) * 100).round(2)

    return pd.DataFrame({
        'Sequence': tally.index,
        'Count': tally.values,
        'Percentage': share
    })

result = most_common_event_sequences_test(df,n_actions=2, max_time_between=8)
print(result)

                       Sequence  Count  Percentage
0  (Acceleration, Deceleration)     36       44.44
1  (Deceleration, Acceleration)     23       28.40
2        (Sprint, Deceleration)     10       12.35
3        (Acceleration, Sprint)      7        8.64
4  (Deceleration, Deceleration)      5        6.17


# Q2 : How often do players accelerate and then immediately transition into a sprint?  

In [52]:
@tool
def consecutive_action_frequency(
    actions: List[str],
    max_time_between: float = 5.0
) -> Dict:
    """
    Calculate how often a specific sequence of high-intensity activities occurs in exact order within a given time window.

    Args:
        actions: List of action types in the specific order to look for (e.g. ['Sprint', 'Deceleration'])
        max_time_between: Maximum seconds between actions to be considered part of the same sequence (default: 5.0)

    Returns:
        Dictionary containing:
        - sequence: String showing the sequence analyzed (e.g. "Sprint → Deceleration")
        - total_occurrences: Number of times this exact sequence occurred
        - total_opportunities: Number of times the first action occurred
        - percentage_of_opportunities: Percentage of times first action led to full sequence
        - average_time_between: Average time between actions in successful sequences
          (0 when there are no sequences or the sequence has a single action)

    Raises:
        ValueError: If `actions` is empty.

    Example Questions Where this function would be useful:
        * How often do players decelerate immediately after sprinting?
        * What percentage of accelerations are followed by sprints within 2 seconds?
        * When a player does a sprint, how likely are they to decelerate within 3 seconds?
        * What's the typical time gap between an acceleration and a sprint when they occur together?
        * Out of all sprints, what percentage lead to a deceleration within 1 second?
    """
    if not actions:
        raise ValueError("actions must contain at least one action type")

    def parse_time_to_seconds(time_str):
        # A missing gap is treated as 0 s
        if pd.isna(time_str):
            return 0
        minutes, seconds = time_str.split(':')
        return float(minutes) * 60 + float(seconds)

    # Note: uses the notebook-level `df`
    time_seconds = df['Time Since Last Any Action'].apply(parse_time_to_seconds)
    action_types = df['High Intensity Activity Type']

    # Every occurrence of the first action is an opportunity for the sequence
    total_starts = int((action_types == actions[0]).sum())

    # One list of gaps per complete sequence found
    gaps_per_match = []
    # The range bound guarantees i + j stays inside the frame
    for i in range(len(df) - len(actions) + 1):
        if action_types.iloc[i] != actions[0]:
            continue
        gaps = []
        for j in range(1, len(actions)):
            if (action_types.iloc[i + j] != actions[j] or
                    time_seconds.iloc[i + j] > max_time_between):
                break
            gaps.append(float(time_seconds.iloc[i + j]))
        else:
            # No break: every follow-up action matched in order and in time
            gaps_per_match.append(gaps)

    sequence_count = len(gaps_per_match)

    # Average of the per-sequence mean gaps. A single-action "sequence" has no
    # gaps, so it is skipped here instead of dividing by zero.
    mean_gaps = [sum(gaps) / len(gaps) for gaps in gaps_per_match if gaps]

    return {
        'sequence': ' → '.join(actions),
        'total_occurrences': sequence_count,
        'total_opportunities': total_starts,
        'percentage_of_opportunities': float((sequence_count / total_starts * 100 if total_starts > 0 else 0)),
        'average_time_between': sum(mean_gaps) / len(mean_gaps) if mean_gaps else 0
    }

### Function Version 

In [55]:
def consecutive_action_frequency_test(df, actions, max_time_between=5.0):
    """
    Count how often `actions` occurs as consecutive rows of `df`, in order,
    with every gap ('Time Since Last Any Action') after the first row at most
    `max_time_between` seconds.

    Args:
        df: DataFrame with 'High Intensity Activity Type' and
            'Time Since Last Any Action' ('MM:SS.s' strings) columns
        actions: Non-empty list of action types, in the order to look for
        max_time_between: Maximum seconds allowed between consecutive actions

    Returns:
        Dictionary with 'sequence', 'total_occurrences', 'total_opportunities'
        (occurrences of the first action), 'percentage_of_opportunities' and
        'average_time_between' (mean of the per-sequence mean gaps; 0 when no
        sequence was found or the sequence has a single action).

    Raises:
        ValueError: If `actions` is empty.
    """
    if not actions:
        raise ValueError("actions must contain at least one action type")

    def parse_time_to_seconds(time_str):
        # A missing gap is treated as 0 s
        if pd.isna(time_str):
            return 0
        minutes, seconds = time_str.split(':')
        return float(minutes) * 60 + float(seconds)

    time_seconds = df['Time Since Last Any Action'].apply(parse_time_to_seconds)
    action_types = df['High Intensity Activity Type']

    # Every occurrence of the first action is an opportunity for the sequence
    total_starts = int((action_types == actions[0]).sum())

    # One list of gaps per complete sequence found
    gaps_per_match = []
    # The range bound guarantees i + j stays inside the frame
    for i in range(len(df) - len(actions) + 1):
        if action_types.iloc[i] != actions[0]:
            continue
        gaps = []
        for j in range(1, len(actions)):
            if (action_types.iloc[i + j] != actions[j] or
                    time_seconds.iloc[i + j] > max_time_between):
                break
            gaps.append(float(time_seconds.iloc[i + j]))
        else:
            # No break: every follow-up action matched in order and in time
            gaps_per_match.append(gaps)

    sequence_count = len(gaps_per_match)

    # A single-action "sequence" has no gaps; skip it rather than divide by zero
    mean_gaps = [sum(gaps) / len(gaps) for gaps in gaps_per_match if gaps]

    return {
        'sequence': ' → '.join(actions),
        'total_occurrences': sequence_count,
        'total_opportunities': total_starts,
        'percentage_of_opportunities': float((sequence_count / total_starts * 100 if total_starts > 0 else 0)),
        'average_time_between': sum(mean_gaps) / len(mean_gaps) if mean_gaps else 0
    }

result = consecutive_action_frequency_test(df,actions=["Sprint", "Deceleration"], max_time_between=2)
print(result)

{'sequence': 'Sprint → Deceleration', 'total_occurrences': 10, 'total_opportunities': 11, 'percentage_of_opportunities': 90.9090909090909, 'average_time_between': -1.3699999999999997}


# Q3 : How frequently do players decelerate right after completing a sprint?  

In [9]:
# dealt with by ---> def consecutive_action_frequency():

# Q4: How often do players sprint, decelerate, change direction and then immediately transition into another sprint?


In [10]:
# Temporarily deprecated due to the lack of change-of-direction metrics

In [11]:
# Separately: an N-action filter (gets N actions)

# Q5: For sprints greater than 40m, how often do players decelerate, change direction and then immediately transition into another sprint?


# Q6: What is the most common HIA output after sprints of >50m?


In [12]:
@tool
def analyze_actions_after_distance(
    initial_action: str,
    distance_threshold: float,
    max_time_between: float = 2.0
) -> Dict:
    """
    Analyze what actions commonly follow specific high-intensity movements that exceed a distance threshold.

    Args:
        initial_action: The type of action to filter for (e.g., 'Sprint', 'Acceleration')
        distance_threshold: Minimum distance in meters for the initial action to be considered
        max_time_between: Maximum seconds to consider an action as "following" (default: 2.0)

    Returns:
        Dictionary containing:
        - following_actions: List of actions that occur after the filtered initial action
        - counts: Number of times each following action occurs
        - percentages: Percentage distribution of following actions
        All three lists are empty when nothing qualifies.

    Example Questions Where this function would be useful:
        * For sprints greater than 40m, what actions typically follow?
        * After long accelerations (>5m), how often do players transition to sprints?
        * What's the most common follow-up action after extended sprints?
        * For accelerations over 10m, do players tend to decelerate or maintain speed?
        * What percentage of long sprints (>30m) are followed by decelerations?
        * After high-distance movements, what patterns emerge in subsequent actions?
    """
    # Note: uses the notebook-level `df`
    action_types = df['High Intensity Activity Type']
    gaps = df['Time Since Last Any Action']

    # Rows that are the requested action and longer than the threshold
    qualifies = (action_types == initial_action) & (df['Distance'] > distance_threshold)

    following_actions = []

    # Iterate over row POSITIONS (not index labels) so that "the next row" is
    # correct even when the frame does not have a default 0..n-1 index.
    for pos in qualifies.to_numpy().nonzero()[0]:
        nxt = pos + 1
        if nxt >= len(df):  # the qualifying action is the last row
            continue
        time_str = gaps.iloc[nxt]
        if pd.isna(time_str):
            continue
        # Parse 'MM:SS.s' to seconds
        minutes, seconds = time_str.split(':')
        if float(minutes) * 60 + float(seconds) <= max_time_between:
            following_actions.append(action_types.iloc[nxt])

    if not following_actions:
        return {
            'following_actions': [],
            'counts': [],
            'percentages': []
        }

    frequencies = pd.Series(following_actions).value_counts()
    percentages = (frequencies / len(following_actions) * 100).round(2)

    # .tolist() yields built-in int/float instead of numpy scalars
    return {
        'following_actions': list(frequencies.index),
        'counts': frequencies.tolist(),
        'percentages': percentages.tolist()
    }

### Function Version 

In [13]:
def analyze_actions_after_distance_test(df, initial_action, distance_threshold, max_time_between=2.0):
    """
    Tally which action directly follows each `initial_action` whose 'Distance'
    exceeds `distance_threshold`, counting only followers whose
    'Time Since Last Any Action' is at most `max_time_between` seconds.

    Args:
        df: DataFrame with 'High Intensity Activity Type', 'Distance' and
            'Time Since Last Any Action' ('MM:SS.s' strings) columns
        initial_action: Action type to filter for (e.g. 'Sprint')
        distance_threshold: Distance the initial action must exceed
        max_time_between: Maximum gap in seconds before the following action

    Returns:
        Dictionary with 'following_actions', 'counts' and 'percentages' lists
        (most frequent first); all empty when nothing qualifies.
    """
    action_types = df['High Intensity Activity Type']
    gaps = df['Time Since Last Any Action']

    # Rows that are the requested action and longer than the threshold
    qualifies = (action_types == initial_action) & (df['Distance'] > distance_threshold)

    following_actions = []

    # Iterate over row POSITIONS (not index labels) so that "the next row" is
    # correct even when the frame does not have a default 0..n-1 index.
    for pos in qualifies.to_numpy().nonzero()[0]:
        nxt = pos + 1
        if nxt >= len(df):  # the qualifying action is the last row
            continue
        time_str = gaps.iloc[nxt]
        if pd.isna(time_str):
            continue
        # Parse 'MM:SS.s' to seconds
        minutes, seconds = time_str.split(':')
        if float(minutes) * 60 + float(seconds) <= max_time_between:
            following_actions.append(action_types.iloc[nxt])

    if not following_actions:
        return {
            'following_actions': [],
            'counts': [],
            'percentages': []
        }

    frequencies = pd.Series(following_actions).value_counts()
    percentages = (frequencies / len(following_actions) * 100).round(2)

    # .tolist() yields built-in int/float instead of numpy scalars
    return {
        'following_actions': list(frequencies.index),
        'counts': frequencies.tolist(),
        'percentages': percentages.tolist()
    }
analyze_actions_after_distance_test(df, 'Sprint', 40, 2.0)

{'following_actions': ['Deceleration'], 'counts': [1], 'percentages': [100.0]}

# Q7: How often do players have to complete long sprints?


In [14]:
# @tool
# def action_frequency_with_distance(
#    action_type: str,
#    distance_threshold: float,
#    greater_than_or_less_than: str = 'greater'
# ) -> Dict:
#    """
#    Analyze how often specific high-intensity actions occur that meet certain distance criteria.

#    Args:
#        action_type: Type of movement to analyze (e.g., 'Sprint', 'Acceleration', 'Deceleration')
#        distance_threshold: Distance threshold in meters to filter actions
#        greater_than_or_less_than: Whether to look for actions 'greater' or 'less' than threshold (default: 'greater')

#    Returns:
#        Dictionary containing:
#        - action_type: The type of action analyzed
#        - distance_criteria: Description of the distance filter applied
#        - total_occurrences: Number of actions meeting the criteria
#        - percentage_of_all_actions: What percentage these actions make up of all activities
#        - percentage_of_action_type: What percentage these actions make up of this specific action type
#        - average_time_between: Mean time between these actions
#        - average_distance: Average distance covered in these actions

#    Example Questions Where this function would be useful:
#        * How often do players complete sprints longer than 30 meters?
#        * What percentage of accelerations are shorter than 5 meters?
#        * How frequent are long-distance sprints in the data?
#        * What's the typical time gap between sprints over 20 meters?
#        * How many times do players perform short bursts (<10m) versus extended movements?
#        * What portion of all movements are high-distance accelerations?
#        * How often do players need to sprint distances greater than 40 meters?
#    """
#    # Total number of this action type
#    total_actions = df[df['High Intensity Activity Type'] == action_type].shape[0]
   
#    # Filter by distance
#    if greater_than_or_less_than == 'greater':
#        filtered_actions = df[
#            (df['High Intensity Activity Type'] == action_type) & 
#            (df['Distance'] > distance_threshold)
#        ]
#    else:
#        filtered_actions = df[
#            (df['High Intensity Activity Type'] == action_type) & 
#            (df['Distance'] < distance_threshold)
#        ]
   
#    filtered_count = len(filtered_actions)
   
#    # Calculate average time between these actions
#    if filtered_count > 1:
#        times_between = pd.to_timedelta(
#            filtered_actions['Time Since Last'].str.strip()
#        ).mean()
#    else:
#        times_between = None
   
#    return {
#        'action_type': action_type,
#        'distance_criteria': f"{greater_than_or_less_than} than {distance_threshold}m",
#        'total_occurrences': filtered_count,
#        'percentage_of_all_actions': float((filtered_count / len(df) * 100)),
#        'percentage_of_action_type': float((filtered_count / total_actions * 100)) if total_actions > 0 else 0,
#        'average_time_between': str(times_between) if times_between else None,
#        'average_distance': float(filtered_actions['Distance'].mean()) if filtered_count > 0 else 0
#    }
@tool
def action_frequency_with_distance(
    action_type: str,
    distance_threshold: float,
    greater_than_or_less_than: str = 'greater'
) -> Dict:
    """
    Analyze how often specific high-intensity actions occur that meet certain distance criteria.

    Args:
        action_type: Type of movement to analyze (e.g., 'Sprint', 'Acceleration', 'Deceleration')
        distance_threshold: Distance threshold in meters to filter actions
        greater_than_or_less_than: Whether to look for actions 'greater' or 'less' than threshold (default: 'greater').
            Matching is case-insensitive and also accepts e.g. 'greater than' / 'less than'.

    Returns:
        Dictionary containing:
        - action_type: The type of action analyzed
        - distance_criteria: Description of the distance filter applied
        - total_occurrences: Number of actions meeting the criteria
        - percentage_of_all_actions: What percentage these actions make up of all activities
        - percentage_of_action_type: What percentage these actions make up of this specific action type
        - average_time_between: Mean of the 'Time Since Last' values of the matching actions,
          formatted as MM:SS.ss (None when fewer than two actions match)
        - average_distance: Average distance covered in these actions

    Raises:
        ValueError: If greater_than_or_less_than is neither a 'greater' nor a 'less' value.

    Example Questions Where this function would be useful:
        * How often do players complete sprints longer than 30 meters?
        * What percentage of accelerations are shorter than 5 meters?
        * How frequent are long-distance sprints in the data?
        * What's the typical time gap between sprints over 20 meters?
        * How many times do players perform short bursts (<10m) versus extended movements?
        * What portion of all movements are high-distance accelerations?
        * How often do players need to sprint distances greater than 40 meters?
    """
    # Normalise the direction. Previously anything other than the exact string
    # 'greater' (e.g. 'Greater' or 'greater than') silently selected 'less'.
    direction = str(greater_than_or_less_than).strip().lower()
    if direction.startswith('greater'):
        direction = 'greater'
    elif direction.startswith('less'):
        direction = 'less'
    else:
        raise ValueError(
            "greater_than_or_less_than must be 'greater' or 'less', "
            f"got {greater_than_or_less_than!r}"
        )

    # Note: uses the notebook-level `df`
    is_type = df['High Intensity Activity Type'] == action_type

    # Total number of this action type
    total_actions = int(is_type.sum())

    # Filter by distance
    if direction == 'greater':
        in_range = df['Distance'] > distance_threshold
    else:
        in_range = df['Distance'] < distance_threshold
    filtered_actions = df[is_type & in_range]

    filtered_count = len(filtered_actions)

    # Average 'Time Since Last' of the matching actions (needs at least two)
    if filtered_count > 1:
        def parse_time_to_seconds(time_str):
            # A missing value is counted as 0 s
            if pd.isna(time_str):
                return 0
            minutes, seconds = time_str.split(':')
            return float(minutes) * 60 + float(seconds)

        times_between = filtered_actions['Time Since Last'].apply(parse_time_to_seconds).mean()
        times_between = f"{int(times_between//60):02d}:{(times_between%60):05.2f}"
    else:
        times_between = None

    return {
        'action_type': action_type,
        'distance_criteria': f"{direction} than {distance_threshold}m",
        'total_occurrences': filtered_count,
        'percentage_of_all_actions': float((filtered_count / len(df) * 100)),
        'percentage_of_action_type': float((filtered_count / total_actions * 100)) if total_actions > 0 else 0,
        'average_time_between': times_between,
        'average_distance': float(filtered_actions['Distance'].mean()) if filtered_count > 0 else 0
    }

### Function Version 

In [15]:
def action_frequency_with_distance_test(df, action_type, distance_threshold, greater_than_or_less_than='greater'):
    """
    Summarise how often `action_type` occurs with a 'Distance' above or below
    `distance_threshold`.

    Args:
        df: DataFrame with 'High Intensity Activity Type', 'Distance' and
            'Time Since Last' (colon-separated strings such as 'MM:SS.s') columns
        action_type: Action type to analyse (e.g. 'Sprint')
        distance_threshold: Distance threshold to filter on
        greater_than_or_less_than: 'greater' or 'less' (case-insensitive;
            'greater than' / 'less than' are accepted too)

    Returns:
        Dictionary with 'action_type', 'distance_criteria', 'total_occurrences',
        'percentage_of_all_actions', 'percentage_of_action_type',
        'average_time_between' (string form of the mean 'Time Since Last' as a
        Timedelta; None when fewer than two actions match or no value is
        available) and 'average_distance' (rounded to 2 decimals).

    Raises:
        ValueError: If greater_than_or_less_than is neither a 'greater' nor a 'less' value.
    """
    # Normalise the direction. Previously anything other than the exact string
    # 'greater' (e.g. 'Greater' or 'greater than') silently selected 'less'.
    direction = str(greater_than_or_less_than).strip().lower()
    if direction.startswith('greater'):
        direction = 'greater'
    elif direction.startswith('less'):
        direction = 'less'
    else:
        raise ValueError(
            "greater_than_or_less_than must be 'greater' or 'less', "
            f"got {greater_than_or_less_than!r}"
        )

    is_type = df['High Intensity Activity Type'] == action_type

    # Total number of this action type
    total_actions = int(is_type.sum())

    # Filter by distance
    if direction == 'greater':
        in_range = df['Distance'] > distance_threshold
    else:
        in_range = df['Distance'] < distance_threshold
    filtered_actions = df[is_type & in_range]

    filtered_count = len(filtered_actions)

    # Average 'Time Since Last' of the matching actions (needs at least two)
    times_between = None
    if filtered_count > 1:
        def parse_time_to_seconds(time_str):
            # Missing values stay missing so the mean skips them
            if pd.isna(time_str):
                return float('nan')
            # Fold colon-separated fields base-60 ('MM:SS.s', 'HH:MM:SS.s', ...)
            # explicitly instead of relying on pd.to_timedelta's string
            # heuristics, matching how the other helpers in this notebook
            # read these columns.
            total = 0.0
            for field in str(time_str).strip().split(':'):
                total = total * 60 + float(field)
            return total

        mean_seconds = filtered_actions['Time Since Last'].apply(parse_time_to_seconds).mean()
        if pd.notna(mean_seconds):
            times_between = pd.to_timedelta(mean_seconds, unit='s')

    return {
        'action_type': action_type,
        'distance_criteria': f"{direction} than {distance_threshold}m",
        'total_occurrences': filtered_count,
        'percentage_of_all_actions': float((filtered_count / len(df) * 100)),
        'percentage_of_action_type': float((filtered_count / total_actions * 100)) if total_actions > 0 else 0,
        # `is not None` rather than truthiness: a zero mean is a valid result
        'average_time_between': str(times_between) if times_between is not None else None,
        'average_distance': float(filtered_actions['Distance'].mean().round(2)) if filtered_count > 0 else 0
    }
action_frequency_with_distance_test(df, 'Sprint', 40, 'greater')

{'action_type': 'Sprint',
 'distance_criteria': 'greater than 40m',
 'total_occurrences': 1,
 'percentage_of_all_actions': 0.5376344086021506,
 'percentage_of_action_type': 9.090909090909092,
 'average_time_between': None,
 'average_distance': 43.34}

# Q8: How often do players have to complete multiple long sprints in a short period?


In [16]:
# Actions in a time period

# Q9: How often do players have to go through a series of high-intensity activities and then complete a long sprint?


In [17]:
# Final Action

# Q10: What happens most often after a series of high-intensity activities?


In [18]:
#N series detection

# LLM

In [19]:
# Create the Groq LLM instance.
# The API key is read from the environment (optionally populated from a local,
# untracked .env file) so that no secret is stored in the notebook.
# NOTE(review): the key that was previously hard-coded here has been exposed
# and should be revoked and replaced.
load_dotenv()
llm = ChatGroq(
    api_key=os.environ["GROQ_API_KEY"],  # raises KeyError if the key is not configured
    # Other Groq models tried previously: "mixtral-8x7b-32768",
    # "Llama-3-Groq-8B-Tool-Use", "Llama-3-Groq-70B-Tool-Use"
    model="llama3-8b-8192",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

# Initiate agent

In [20]:
# Custom analysis tools exposed to the agent alongside its built-in Python tool
tools = [
    most_common_event_sequences,
    consecutive_action_frequency,
    analyze_actions_after_distance,
    action_frequency_with_distance
]

agent = create_pandas_dataframe_agent(
    llm=llm,
    df=df,
    agent_type="tool-calling",
    extra_tools=tools,
    verbose=True,
    # The agent can execute model-generated Python against `df`; only run this
    # in a trusted, sandboxed environment.
    allow_dangerous_code=True,
    # AgentExecutor options must go through agent_executor_kwargs; extra
    # top-level keyword arguments are not forwarded to the executor.
    agent_executor_kwargs={"handle_parsing_errors": True},
)



In [21]:
x=agent.invoke("What are the most common 2 action sequences of high-intensity activities in 5 seconds timespans?")
display(x)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `most_common_event_sequences` with `{'n_actions': 2, 'max_time_between': 5}`


[0m[33;1m[1;3m{'sequences': [('Acceleration', 'Deceleration'), ('Deceleration', 'Acceleration'), ('Sprint', 'Deceleration'), ('Acceleration', 'Sprint'), ('Deceleration', 'Deceleration')], 'counts': [34, 19, 10, 6, 2], 'percentages': [47.89, 26.76, 14.08, 8.45, 2.82]}[0m[32;1m[1;3mBased on the output of the tool call, the most common 2 action sequences of high-intensity activities in 5 seconds timespans are:

* Acceleration -> Deceleration (47.89%)
* Deceleration -> Acceleration (26.76%)

These two sequences account for approximately 74.65% of all 2-action sequences in the data.[0m

[1m> Finished chain.[0m


{'input': 'What are the most common 2 action sequences of high-intensity activities in 5 seconds timespans?',
 'output': 'Based on the output of the tool call, the most common 2 action sequences of high-intensity activities in 5 seconds timespans are:\n\n* Acceleration -> Deceleration (47.89%)\n* Deceleration -> Acceleration (26.76%)\n\nThese two sequences account for approximately 74.65% of all 2-action sequences in the data.'}

In [26]:
x=agent.invoke("How often do players have to go through a series of high intensity activities and then complete a long sprint?")
display(x)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `analyze_actions_after_distance` with `{'initial_action': 'Acceleration', 'distance_threshold': 30, 'max_time_between': 2}`


[0m[36;1m[1;3m{'following_actions': [], 'counts': [], 'percentages': []}[0m[32;1m[1;3m
Invoking: `action_frequency_with_distance` with `{'action_type': 'Sprint', 'distance_threshold': 40, 'greater_than_or_less_than': 'greater'}`


[0m[33;1m[1;3m{'action_type': 'Sprint', 'distance_criteria': 'greater than 40.0m', 'total_occurrences': 1, 'percentage_of_all_actions': 0.5376344086021506, 'percentage_of_action_type': 9.090909090909092, 'average_time_between': None, 'average_distance': 43.34}[0m[32;1m[1;3mBased on the results, it seems that there is only one instance of a sprint that is greater than 40 meters. This suggests that players do not frequently go through a series of high-intensity activities and then complete a long sprint. The average distance covered in these long sprints 

{'input': 'How often do players have to go through a series of high intensity activities and then complete a long sprint?',
 'output': 'Based on the results, it seems that there is only one instance of a sprint that is greater than 40 meters. This suggests that players do not frequently go through a series of high-intensity activities and then complete a long sprint. The average distance covered in these long sprints is approximately 43.34 meters.\n\nTo further analyze this, I would like to call another tool to get more information about the preceding high-intensity activities before these long sprints. Can I call the tool "consecutive_action_frequency" with the following parameters: {"actions": ["Acceleration", "Deceleration"], "max_time_between": 2}?\n\nPlease provide the tool call id for this new tool call.'}

In [23]:
analyze_actions_after_distance_test(df, 'Sprint', 20, 2.0)

{'following_actions': ['Deceleration'], 'counts': [5], 'percentages': [100.0]}

In [None]:
consecutive_action_frequency_test(df, ['Deceleration', 'Acceleration'], max_time_between=2.0)