In [1]:
import pandas as pd
import numpy as np
import gzip
import json
import os
import yaml
from tqdm import tqdm
import re

In [None]:
# Code to take a peak at the data structure

file_path = 'C:/Users/josep/OneDrive/Desktop/Erdos/anonymized/data/1669407306-9f1295bf-4615-43d8-beab-b2027cb3ed97.jsonl.gz'

logs = []
with gzip.open(file_path, 'rt', encoding='utf-8') as file:
        for line in file:
            event = json.loads(line)
            if event.get("event_type") == "combat_state_update":
                combat = event.get("data")
                for actor in combat.get("combatants", []):
                    if actor["type"] == "monster":
                        logs.append(actor)
                        
logs[1]

In [19]:
# Main function that parses each combat session

from collections import defaultdict
combat_data = defaultdict(lambda: {
     'start_time': None, 
     'player_ids': set(),
     'player_info': {} 
})
def process_file(file_path):
    monsters_found = False
    players_found = False
    with gzip.open(file_path, 'rt', encoding='utf-8') as file:
        for line in file:
            event = json.loads(line)
            if event.get("event_type") == "combat_state_update":
                combat = event.get("data")
                for actor in combat.get("combatants", []):
                    if actor["type"] == "monster":
                        monsters_found = True
                    elif actor["type"] == "player":
                        players_found = True
                    if monsters_found and players_found:
                        break  # Break from the inner loop if both found
                if monsters_found and players_found:
                    break  # Break from the outer loop if both found

    # If no monsters or players were found, return immediately
    if not monsters_found or not players_found:
        return None  # Or an appropriate indicator that the file was skipped
    
    

    last_human_readable = {}

    with gzip.open(file_path, 'rt', encoding='utf-8') as file:
        for line in file:
            event = json.loads(line)
            combat_id = event.get("combat_id")

            # Checking for combat start and getting timestamp (dont think we actually need timestamp)
            if event.get("event_type") == "combat_start":
                combat_data[combat_id]['start_time'] = event.get("timestamp")

            # counting players who joined initiative and counting spell slots
            #elif event.get("event_type") == "command" and event.get("command_name") == "init join":
              #  player_id = event.get("author_id")
              #  player_name = event.get("caster", {}).get("name", event.get("author_name"))
              #  combat_data[combat_id]['player_ids'].add(player_id)
              #  combat_data[combat_id]['player_info'][player_name] = {'hp_ratio': None}
              #  classes_dict = event['caster']['levels']['classes']
              #  combat_data[combat_id]['player_info'][player_name]['class'] = list(classes_dict.items())
            elif event.get("event_type") == "command" and event.get("command_name") == "init join":
                player_id = event.get("author_id")
                player_name = event.get("caster", {}).get("name", event.get("author_name"))
                combat_data[combat_id]['player_ids'].add(player_id)
                classes_dict = event['caster']['levels']['classes']
                combat_data[combat_id]['player_info'][player_name] = {
                    'hp_ratio': None,
                    'class': list(classes_dict.items()),
                    'slots': event['caster']['spellbook']['slots'],
                    'max_slots': event['caster']['spellbook']['max_slots'],
                    'ac': event['caster']['ac'],  # Adding AC value
                    'stats': event['caster']['stats']  # Adding stats
                }

                # Initialize slot sums 
                if 'total_slots' not in combat_data[combat_id]:
                    combat_data[combat_id]['total_slots'] = defaultdict(int)
                    combat_data[combat_id]['total_max_slots'] = defaultdict(int)

                # Summing slots and max_slots
                for slot, value in event['caster']['spellbook']['slots'].items():
                    combat_data[combat_id]['total_slots'][slot] += value

                for slot, value in event['caster']['spellbook']['max_slots'].items():
                    combat_data[combat_id]['total_max_slots'][slot] += value

            elif event.get("event_type") == "combat_state_update":
                combat = event.get("data")
                for actor in combat.get("combatants", []):
                    if actor["type"] == "monster":
                        # Ensure 'monsters' is initialized as a list if it doesn't already exist
                        if 'monsters' not in combat_data[combat_id]:
                            combat_data[combat_id]['monsters'] = []

                        # Define a helper function to check if the monster is already added
                        def monster_exists(monster_list, monster_id):
                            return any(monster['monster_id'] == monster_id for monster in monster_list)

                        # Check if this monster is already added, to avoid duplication
                        if not monster_exists(combat_data[combat_id]['monsters'], actor["id"]):
                            # If the monster is not in the list, add it with all its details
                            combat_data[combat_id]['monsters'].append({
                                'monster_id': actor["id"],
                                'monster_code': actor["name"],
                                'monster_name': actor['monster_name'],
                                'level': actor['levels']['total_level']
                            })


                #trying to get that human readable part
                human_readable = event.get("human_readable")

                if human_readable:
                    last_human_readable[combat_id] = human_readable
  
    # Grab player names from combat_data dictionary
    player_names = ', '.join(str(key) for key in combat_data[combat_id]['player_info'].keys())
    if len(player_names) != 0:
        player_names = player_names.split(',')
        player_names = [name.lstrip() for name in player_names]

    


    if last_human_readable[combat_id]:
        # print(last_human_readable[combat_id])

        # Iterating through player names and finding their health in the human_readable string
        if len(player_names) != 0:
            for player in player_names:
                try: 
                    pattern = rf"{player} <(\d+)/(\d+) HP>"

                    match = re.search(pattern, last_human_readable[combat_id])

                    if match:
                        current_hp, max_hp = match.groups()

                        if player in combat_data[combat_id]['player_info'] and current_hp and max_hp:
                            combat_data[combat_id]['player_info'][player]['hp_ratio'] = [int(current_hp), int(max_hp)]
                        else:
                            continue
                except re.error:
                    print(f"Skipping due to an error with pattern: {pattern}")
                    continue


#     # Processing the last human_readable to extract HP ratios (this is not working ....)
#     for cid, human_readable in last_human_readable.items():
#         pattern = r'\d+: ([\w\s.-]+) <(\d+/\d+ HP)>'
#         matches = re.findall(pattern, human_readable)

#         for name, hp_ratio in matches:
#             if name in combat_data[cid]['player_info']:
#                 combat_data[cid]['player_info'][name]['hp_ratio'] = hp_ratio

#     # Printing the last human_readable string for each combat ID
#     for cid, human_readable in last_human_readable.items():
#         print(f"Last 'human_readable' for {cid}: {human_readable}")

    # # Creating the DataFrame from collected data
    # processed_data = []
    # for cid, data in combat_data.items():
    #     processed_data.append({
    #         "combat_id": cid,
    #         "start_time": data['start_time'],
    #         "num_player_actors": len(data['player_ids']),
    #         "player_info": data['player_info'],
    #         "num_monster_actors": len(data['monster_ids']),
    #         "monster_ids": list(data['monster_ids']),
    #         "monster_names": list(data['monster_names']),
    #     })

    # return pd.DataFrame(processed_data)
    return combat_data
    
# Define a file path
file_path = 'C:/Users/josep/OneDrive/Desktop/Erdos/anonymized/data/1669407306-9f1295bf-4615-43d8-beab-b2027cb3ed97.jsonl.gz'

# Process the file and create DataFrame
combat_data = process_file(file_path)

# # Export to CSV
# csv_file_path = 'C:/Users/josep/OneDrive/Desktop/Erdos/anonymized/data/combat_analysis_with_hp.csv'
# df.to_csv(csv_file_path, index=False)

# print(f"Data exported to CSV file at: {csv_file_path}")

combat_data

defaultdict(<function __main__.<lambda>()>,
            {'1669407306-9f1295bf-4615-43d8-beab-b2027cb3ed97': {'start_time': 1669407306.4081872,
              'player_ids': {'127363826277379389',
               '137220621341913368',
               '139194158998008781',
               '319173686662838782',
               '812784442415776715'},
              'player_info': {'Zara': {'hp_ratio': [433, 462],
                'class': [('Warlock', 19)],
                'slots': {'1': 0,
                 '2': 0,
                 '3': 0,
                 '4': 0,
                 '5': 4,
                 '6': 0,
                 '7': 0,
                 '8': 0,
                 '9': 0},
                'max_slots': {'1': 0,
                 '2': 0,
                 '3': 0,
                 '4': 0,
                 '5': 4,
                 '6': 0,
                 '7': 0,
                 '8': 0,
                 '9': 0},
                'ac': 25,
                'stats': {'prof_bonus': 6,
       

In [20]:
# Function that takes the above function and iterates it over a directory of files

directory_path = 'C:\\Users\\josep\\OneDrive\\Desktop\\Erdos\\anonymized\\data'
files = [os.path.join(directory_path, f) for f in os.listdir(directory_path) if f.endswith('.jsonl.gz')]
# files = files[:5000]
def process_combat_files(file_paths):
    combat_datas = {}  # This will store the combined data from all files

    for file_path in tqdm(file_paths, desc="Processing files"):
        # Process each file to get its combat data
        combat_data_latest = process_file(file_path)
        
        # If combat_data_latest is None (file was skipped), continue to the next file
        if combat_data_latest is None:
            continue

        # Merge the data into combat_datas
        # Assuming combat_data_latest contains data keyed by combat_id
        for combat_id, data in combat_data_latest.items():
            if combat_id in combat_datas:
                # If combat_id already exists, merge or update data as needed
                # This part depends on how you want to handle duplicate combat_ids
                # For simplicity, let's just update the existing data with the new one
                combat_datas[combat_id].update(data)
            else:
                # Add the new combat_id and its data to combat_datas
                combat_datas[combat_id] = data

    return combat_datas


final_combat_data = process_combat_files(files)


Processing files:  52%|█████▏    | 12780/24748 [05:28<04:49, 41.38it/s]

Skipping due to an error with pattern: Zeppelin Waverunner (TD <(\d+)/(\d+) HP>
Skipping due to an error with pattern: SRS) <(\d+)/(\d+) HP>


Processing files:  72%|███████▏  | 17936/24748 [07:38<02:28, 45.91it/s] 

Skipping due to an error with pattern: Laz (Ripper lasso <(\d+)/(\d+) HP>


Processing files:  73%|███████▎  | 17949/24748 [07:38<03:20, 33.88it/s]

Skipping due to an error with pattern: Laz (Ripper lasso <(\d+)/(\d+) HP>


Processing files:  73%|███████▎  | 18081/24748 [07:42<02:14, 49.58it/s]

Skipping due to an error with pattern: Laz (Ripper lasso <(\d+)/(\d+) HP>


Processing files:  73%|███████▎  | 18109/24748 [07:42<02:24, 46.07it/s]

Skipping due to an error with pattern: Laz (Ripper lasso <(\d+)/(\d+) HP>


Processing files:  76%|███████▋  | 18873/24748 [08:07<03:01, 32.29it/s]

Skipping due to an error with pattern: Laz (Ripper lasso <(\d+)/(\d+) HP>


Processing files:  77%|███████▋  | 19070/24748 [08:12<03:23, 27.90it/s]

Skipping due to an error with pattern: Laz (Ripper lasso <(\d+)/(\d+) HP>


Processing files:  79%|███████▉  | 19540/24748 [08:25<02:42, 32.10it/s]

Skipping due to an error with pattern: Laz (Ripper lasso <(\d+)/(\d+) HP>


Processing files:  79%|███████▉  | 19622/24748 [08:26<02:10, 39.25it/s]

Skipping due to an error with pattern: Rosemary Gwindlelin (dnd realm <(\d+)/(\d+) HP>
Skipping due to an error with pattern: ) <(\d+)/(\d+) HP>


Processing files:  80%|████████  | 19827/24748 [08:32<01:29, 55.04it/s]

Skipping due to an error with pattern: Rosemary Gwindlelin (dnd realm <(\d+)/(\d+) HP>
Skipping due to an error with pattern: ) <(\d+)/(\d+) HP>


Processing files:  81%|████████  | 20090/24748 [08:40<01:44, 44.75it/s]

Skipping due to an error with pattern: Rosemary Gwindlelin (dnd realm <(\d+)/(\d+) HP>
Skipping due to an error with pattern: ) <(\d+)/(\d+) HP>
Skipping due to an error with pattern: ??? <(\d+)/(\d+) HP>


Processing files:  81%|████████▏ | 20167/24748 [08:43<02:31, 30.26it/s]

Skipping due to an error with pattern: Rosemary Gwindlelin (dnd realm <(\d+)/(\d+) HP>
Skipping due to an error with pattern: ) <(\d+)/(\d+) HP>


Processing files:  83%|████████▎ | 20561/24748 [08:57<01:50, 38.03it/s]

Skipping due to an error with pattern: Rosemary Gwindlelin (dnd realm <(\d+)/(\d+) HP>
Skipping due to an error with pattern: ) <(\d+)/(\d+) HP>


Processing files:  84%|████████▍ | 20795/24748 [09:04<01:25, 46.23it/s]

Skipping due to an error with pattern: ??? <(\d+)/(\d+) HP>


Processing files:  85%|████████▍ | 20928/24748 [09:07<01:08, 55.65it/s]

Skipping due to an error with pattern: Rosemary Gwindlelin (dnd realm <(\d+)/(\d+) HP>
Skipping due to an error with pattern: ) <(\d+)/(\d+) HP>


Processing files:  85%|████████▍ | 21031/24748 [09:10<01:46, 34.97it/s]

Skipping due to an error with pattern: Rosemary Gwindlelin (dnd realm <(\d+)/(\d+) HP>
Skipping due to an error with pattern: ) <(\d+)/(\d+) HP>


Processing files:  86%|████████▌ | 21197/24748 [09:15<01:02, 56.39it/s]

Skipping due to an error with pattern: Rosemary Gwindlelin (dnd realm <(\d+)/(\d+) HP>
Skipping due to an error with pattern: ) <(\d+)/(\d+) HP>


Processing files:  86%|████████▌ | 21335/24748 [09:20<01:44, 32.64it/s]

Skipping due to an error with pattern: Rosemary Gwindlelin (dnd realm <(\d+)/(\d+) HP>
Skipping due to an error with pattern: ) <(\d+)/(\d+) HP>


Processing files:  87%|████████▋ | 21609/24748 [09:29<01:33, 33.49it/s]

Skipping due to an error with pattern: Rosemary Gwindlelin (dnd realm <(\d+)/(\d+) HP>
Skipping due to an error with pattern: ) <(\d+)/(\d+) HP>


Processing files:  88%|████████▊ | 21710/24748 [09:32<01:15, 40.02it/s]

Skipping due to an error with pattern: ???? <(\d+)/(\d+) HP>


Processing files:  89%|████████▉ | 22114/24748 [09:43<01:02, 42.48it/s]

Skipping due to an error with pattern: Rosemary Gwindlelin (dnd realm <(\d+)/(\d+) HP>
Skipping due to an error with pattern: ) <(\d+)/(\d+) HP>


Processing files:  91%|█████████▏| 22595/24748 [09:58<00:48, 44.52it/s]

Skipping due to an error with pattern: Rosemary Gwindlelin (dnd realm <(\d+)/(\d+) HP>
Skipping due to an error with pattern: ) <(\d+)/(\d+) HP>


Processing files:  91%|█████████▏| 22609/24748 [09:59<01:02, 34.02it/s]

Skipping due to an error with pattern: Rosemary Gwindlelin (dnd realm <(\d+)/(\d+) HP>
Skipping due to an error with pattern: ) <(\d+)/(\d+) HP>


Processing files:  92%|█████████▏| 22838/24748 [10:06<01:03, 30.08it/s]

Skipping due to an error with pattern: Rosemary Gwindlelin (dnd realm <(\d+)/(\d+) HP>
Skipping due to an error with pattern: ) <(\d+)/(\d+) HP>


Processing files: 100%|██████████| 24748/24748 [11:16<00:00, 36.58it/s]


In [21]:
final_combat_data[list(final_combat_data.keys())[np.random.randint(0,len(final_combat_data))]] # Randomly selecting an item from the list to see if the function is working properly

{'start_time': 1662989777.8962593,
 'player_ids': {'483720663430960073'},
 'player_info': {'Durak Ironhide': {'hp_ratio': [0, 98],
   'class': [('Cleric', 6), ('Druid', 2)],
   'slots': {'1': 4,
    '2': 3,
    '3': 3,
    '4': 2,
    '5': 0,
    '6': 0,
    '7': 0,
    '8': 0,
    '9': 0},
   'max_slots': {'1': 4,
    '2': 3,
    '3': 3,
    '4': 2,
    '5': 0,
    '6': 0,
    '7': 0,
    '8': 0,
    '9': 0},
   'ac': 19,
   'stats': {'prof_bonus': 3,
    'strength': 12,
    'dexterity': 13,
    'constitution': 18,
    'intelligence': 8,
    'wisdom': 16,
    'charisma': 8}}},
 'total_slots': defaultdict(int,
             {'1': 4,
              '2': 3,
              '3': 3,
              '4': 2,
              '5': 0,
              '6': 0,
              '7': 0,
              '8': 0,
              '9': 0}),
 'total_max_slots': defaultdict(int,
             {'1': 4,
              '2': 3,
              '3': 3,
              '4': 2,
              '5': 0,
              '6': 0,
             

In [22]:
len(final_combat_data)

19069

In [23]:
from tqdm import tqdm
import numpy as np

def calculate_averages_for_encounters(encounters_dict):
    monster_levels = []  # List to accumulate monster levels
    player_healths = []  # List to accumulate player health percentages

    for encounter_name in tqdm(encounters_dict.keys(), desc="Calculating averages"):
        encounter_data = encounters_dict[encounter_name]

        # Accumulate monster levels
        total_level = sum(monster['level'] for monster in encounter_data['monsters'])
        monster_levels.append(total_level) if encounter_data['monsters'] else 0

        # Handle missing or None hp_ratio
        player_infos = encounter_data['player_info'].values()
        total_health_ratio = sum(player['hp_ratio'][0] for player in player_infos if player.get('hp_ratio'))
        total_max_health = sum(player['hp_ratio'][1] for player in player_infos if player.get('hp_ratio'))
        
        # Calculate and accumulate average health left over, handling division by zero
        if total_max_health > 0:
            player_healths.append(total_health_ratio / total_max_health * 100)
        else:
            player_healths.append(0)  # Default value or handle as appropriate

    # Convert lists to NumPy arrays for final calculation
    monster_levels_array = np.array(monster_levels)
    player_healths_array = np.array(player_healths)

    # Calculate overall averages
    monster_average = np.mean(monster_levels_array)
    player_average = np.mean(player_healths_array)

    return monster_levels, player_healths, monster_average, player_average


mls,phs, monster_average, player_average = calculate_averages_for_encounters(final_combat_data)

Calculating averages: 100%|██████████| 19069/19069 [00:00<00:00, 288907.61it/s]


In [24]:

final_combat_data[list(final_combat_data.keys())[0]]

{'start_time': 1669407306.4081872,
 'player_ids': {'127363826277379389',
  '137220621341913368',
  '139194158998008781',
  '319173686662838782',
  '812784442415776715'},
 'player_info': {'Zara': {'hp_ratio': [433, 462],
   'class': [('Warlock', 19)],
   'slots': {'1': 0,
    '2': 0,
    '3': 0,
    '4': 0,
    '5': 4,
    '6': 0,
    '7': 0,
    '8': 0,
    '9': 0},
   'max_slots': {'1': 0,
    '2': 0,
    '3': 0,
    '4': 0,
    '5': 4,
    '6': 0,
    '7': 0,
    '8': 0,
    '9': 0},
   'ac': 25,
   'stats': {'prof_bonus': 6,
    'strength': 10,
    'dexterity': 16,
    'constitution': 20,
    'intelligence': 12,
    'wisdom': 12,
    'charisma': 14}},
  'Hugh': {'hp_ratio': [334, 367],
   'class': [('Paladin', 16), ('Barbarian', 4)],
   'slots': {'1': 4,
    '2': 3,
    '3': 3,
    '4': 2,
    '5': 0,
    '6': 0,
    '7': 0,
    '8': 0,
    '9': 0},
   'max_slots': {'1': 4,
    '2': 3,
    '3': 3,
    '4': 2,
    '5': 0,
    '6': 0,
    '7': 0,
    '8': 0,
    '9': 0},
   'ac': 26,


In [25]:
monster_average, player_average

(17.156287692065657, 73.5423537543945)