In [1]:
import pandas as pd
import numpy as np
import gzip
import json
import os
import yaml
from tqdm import tqdm
import re

In [None]:
# Peek at the data structure: gather every monster combatant that appears in
# the combat_state_update events of one sample log file, then show one of them.

file_path = 'C:/Erdos/Project/DnDFireballProject/anonymized/data/1669407306-9f1295bf-4615-43d8-beab-b2027cb3ed97.jsonl.gz'

logs = []
with gzip.open(file_path, 'rt', encoding='utf-8') as file:
    for line in file:
        event = json.loads(line)
        # Only state snapshots carry the combatant list.
        if event.get("event_type") != "combat_state_update":
            continue
        combat = event.get("data")
        for actor in combat.get("combatants", []):
            if actor["type"] != "monster":
                continue
            logs.append(actor)

# Display the second collected monster record as the cell output.
logs[1]

In [2]:
# Main function that parses each combat session

from collections import defaultdict
# Module-level accumulator keyed by combat_id. A missing key is filled with a
# fresh record: no start time yet, an empty set of player ids, and an empty
# mapping of per-player details.
combat_data = defaultdict(
    lambda: dict(start_time=None, player_ids=set(), player_info={})
)
def _new_combat_record():
    """Return an empty per-combat record with fresh containers on every call."""
    return {'start_time': None, 'player_ids': set(), 'player_info': {}}


def _extract_hp_ratio(player_name, human_readable):
    """Return [current_hp, max_hp] for "<name> <cur/max HP>" in the summary, else None."""
    # Escape the name so characters such as '(', '.' or '+' are matched literally
    # instead of being interpreted as regex syntax.
    match = re.search(rf"{re.escape(player_name)} <(\d+)/(\d+) HP>", human_readable)
    if match is None:
        return None
    current_hp, max_hp = match.groups()
    return [int(current_hp), int(max_hp)]


def process_file(file_path):
    """Parse one gzipped JSONL combat log into per-combat summaries.

    The file is read once. Each line is a JSON event; three kinds are used:

    * ``combat_start`` - its ``timestamp`` becomes the combat's ``start_time``.
    * ``command`` with ``command_name == "init join"`` - registers the author
      id in ``player_ids`` and creates a ``player_info`` entry keyed by the
      caster name (falling back to ``author_name``) holding ``hp_ratio``
      (initially None) and ``class`` (list of ``(class_name, level)`` pairs,
      empty when the event carries no caster/levels/classes data).
    * ``combat_state_update`` - every combatant of type ``monster`` is recorded
      once per combat (by its ``id``) under ``monsters``; the event's
      ``human_readable`` text, when present, is remembered as the latest
      summary for that combat.

    After reading, each player's ``hp_ratio`` is set to ``[current, max]`` from
    the last remembered summary of their combat, if the text contains
    ``"<player name> <cur/max HP>"``.

    All state is local to the call, so the result only contains combats found
    in ``file_path`` and repeated calls do not affect each other.

    Parameters
    ----------
    file_path : str or path-like
        Path to a gzip-compressed, UTF-8 encoded JSON-lines file.

    Returns
    -------
    defaultdict or None
        Mapping ``combat_id -> record``. A record has ``start_time``,
        ``player_ids``, ``player_info`` and, only if monsters were seen for
        that combat, ``monsters``. ``None`` is returned when the file's
        ``combat_state_update`` events do not contain at least one monster
        and at least one player combatant.

    Raises
    ------
    OSError, json.JSONDecodeError
        If the file cannot be read or a non-blank line is not valid JSON.
    KeyError
        If a monster combatant lacks ``id``, ``name``, ``monster_name`` or
        ``levels['total_level']``.
    """
    combat_data = defaultdict(_new_combat_record)
    seen_monster_ids = defaultdict(set)  # combat_id -> ids already recorded
    last_human_readable = {}             # combat_id -> latest summary text
    monsters_found = False
    players_found = False

    with gzip.open(file_path, 'rt', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            event = json.loads(line)
            combat_id = event.get("combat_id")
            event_type = event.get("event_type")

            if event_type == "combat_start":
                combat_data[combat_id]['start_time'] = event.get("timestamp")

            # Players who joined initiative.
            elif event_type == "command" and event.get("command_name") == "init join":
                # `caster` may be absent or null; treat both as "no details".
                caster = event.get("caster") or {}
                player_name = caster.get("name", event.get("author_name"))
                classes = (caster.get("levels") or {}).get("classes") or {}
                record = combat_data[combat_id]
                record['player_ids'].add(event.get("author_id"))
                record['player_info'][player_name] = {
                    'hp_ratio': None,
                    'class': list(classes.items()),
                }

            elif event_type == "combat_state_update":
                combat = event.get("data") or {}
                for actor in combat.get("combatants", []):
                    actor_type = actor.get("type")
                    if actor_type == "player":
                        players_found = True
                    elif actor_type == "monster":
                        monsters_found = True
                        # Snapshots repeat the same combatants; keep each once.
                        if actor["id"] in seen_monster_ids[combat_id]:
                            continue
                        seen_monster_ids[combat_id].add(actor["id"])
                        combat_data[combat_id].setdefault('monsters', []).append({
                            'monster_id': actor["id"],
                            'monster_code': actor["name"],
                            'monster_name': actor['monster_name'],
                            'level': actor['levels']['total_level']
                        })

                human_readable = event.get("human_readable")
                if human_readable:
                    last_human_readable[combat_id] = human_readable

    # Files without both a monster and a player combatant are skipped.
    if not (monsters_found and players_found):
        return None

    # Fill in HP ratios from the final summary of every combat in the file.
    for cid, summary in last_human_readable.items():
        if cid not in combat_data:
            continue  # membership test avoids creating an empty record
        for player_name, info in combat_data[cid]['player_info'].items():
            if not isinstance(player_name, str):
                continue  # no usable name to search for
            hp_ratio = _extract_hp_ratio(player_name, summary)
            if hp_ratio is not None:
                info['hp_ratio'] = hp_ratio

    return combat_data
    
# Define a file path
file_path = 'C:/Erdos/Project/DnDFireballProject/anonymized/data/1669407306-9f1295bf-4615-43d8-beab-b2027cb3ed97.jsonl.gz'

# Parse that single file as a smoke test. The result is stored under its own
# name, separate from the module-level `combat_data` dict, so that a skipped
# file (process_file returns None) cannot overwrite that dict with None.
single_file_combat_data = process_file(file_path)

# # Export to CSV
# csv_file_path = 'C:/Users/josep/OneDrive/Desktop/Erdos/anonymized/data/combat_analysis_with_hp.csv'
# df.to_csv(csv_file_path, index=False)

# print(f"Data exported to CSV file at: {csv_file_path}")

# Display the parsed result (None if the file was skipped).
single_file_combat_data

defaultdict(<function __main__.<lambda>()>,
            {'1669407306-9f1295bf-4615-43d8-beab-b2027cb3ed97': {'start_time': 1669407306.4081872,
              'player_ids': {'127363826277379389',
               '137220621341913368',
               '139194158998008781',
               '319173686662838782',
               '812784442415776715'},
              'player_info': {'Zara': {'hp_ratio': [433, 462],
                'class': [('Warlock', 19)]},
               'Hugh': {'hp_ratio': [334, 367],
                'class': [('Paladin', 16), ('Barbarian', 4)]},
               'Shirea': {'hp_ratio': [157, 257], 'class': [('Fighter', 18)]},
               'Prince': {'hp_ratio': [194, 194], 'class': [('Fighter', 19)]},
               'Neel Aoi': {'hp_ratio': [99, 141],
                'class': [('Paladin', 11), ('Warlock', 7)]}},
              'monsters': [{'monster_id': '2c508b97-cb05-4637-87bd-9ec82c73b0c4',
                'monster_code': 'KR2',
                'monster_name': 'Kraken',
    

In [3]:
# Function that takes the above function and iterates it over a directory of files

# Directory holding the gzipped JSONL combat logs, and the cap on how many of
# them are processed in one run.
directory_path = 'C:\\Erdos\\Project\\data'
MAX_FILES = 5000

# os.listdir() returns names in arbitrary order, so sort before truncating:
# otherwise the subset of files that gets processed can differ between runs.
files = sorted(
    os.path.join(directory_path, f)
    for f in os.listdir(directory_path)
    if f.endswith('.jsonl.gz')
)[:MAX_FILES]
def process_combat_files(file_paths):
    """Run process_file over many log files and combine the results.

    Parameters
    ----------
    file_paths : iterable of str
        Paths handed one by one to ``process_file``; progress is shown with
        a tqdm bar labelled "Processing files".

    Returns
    -------
    dict
        Mapping ``combat_id -> record``. Files for which ``process_file``
        returns None are ignored. When a combat_id is seen again, the stored
        record is updated key-by-key with the newer record (``dict.update``),
        so newer values replace older ones for the same key.
    """
    merged = {}

    for path in tqdm(file_paths, desc="Processing files"):
        per_file = process_file(path)

        # Skipped file: nothing to merge.
        if per_file is None:
            continue

        for cid, record in per_file.items():
            if cid not in merged:
                # First sighting of this combat: store the record as-is.
                merged[cid] = record
            else:
                # Seen before: overwrite existing keys with the newer values.
                merged[cid].update(record)

    return merged


# Parse every selected log file and merge the per-combat records into one dict.
# The recorded run below took roughly three minutes for 5000 files.
final_combat_data = process_combat_files(files)


Processing files: 100%|██████████| 5000/5000 [02:55<00:00, 28.52it/s]


In [4]:
# Spot-check: display the record of one randomly chosen combat to see whether
# the parsing functions are working properly.
final_combat_data[
    list(final_combat_data)[np.random.randint(len(final_combat_data))]
]

{'start_time': 1656527597.5520818,
 'player_ids': {'163631059791733886'},
 'player_info': {'Elias Varkin': {'hp_ratio': [0, 201],
   'class': [('Rogue', 18)]}},
 'monsters': [{'monster_id': 'c1fe0ab0-fe0e-4272-8d72-d7d3bc7e59c2',
   'monster_code': 'CY1',
   'monster_name': 'Cyclops',
   'level': 6.0},
  {'monster_id': '91b90566-dea8-46e3-8d99-e86677370d71',
   'monster_code': 'CY2',
   'monster_name': 'Cyclops',
   'level': 6.0},
  {'monster_id': '1f61ddbc-6b93-481d-9d9d-6891f2fa62e2',
   'monster_code': 'SB3',
   'monster_name': 'Sahuagin Baron',
   'level': 5.0},
  {'monster_id': 'cb4863a7-8a4e-481a-833c-7f016c969e47',
   'monster_code': 'SB2',
   'monster_name': 'Sahuagin Baron',
   'level': 5.0},
  {'monster_id': '09805004-bf91-49b4-8024-c19b2da391d8',
   'monster_code': 'SB4',
   'monster_name': 'Sahuagin Baron',
   'level': 5.0},
  {'monster_id': '72471b83-0afc-4a78-ab14-ae590ed8eb68',
   'monster_code': 'SB1',
   'monster_name': 'Sahuagin Baron',
   'level': 5.0}]}

In [None]:
import yaml

# Persist the merged combat records to 'combat_data.yaml' in block style,
# writing non-ASCII characters as-is instead of escaping them.
# NOTE(review): the records hold Python sets ('player_ids') and tuples (the
# 'class' pairs); presumably yaml.dump writes these with Python-specific tags,
# which would tie the file to a loader that understands them - TODO confirm.
with open('combat_data.yaml', 'w', encoding='utf-8') as yaml_file:
    yaml.dump(final_combat_data, yaml_file, default_flow_style=False, allow_unicode=True)


In [67]:
import yaml

# Load the YAML file back into a dictionary.
# NOTE(review): FullLoader is presumably chosen over the safe loader so that
# the sets/tuples written by the dump cell round-trip - TODO confirm. Only use
# yaml.load on files this notebook produced itself, never on untrusted input.
# The recorded run of this cell ended in a KeyboardInterrupt, so loading the
# full file may be slow.
with open('combat_data.yaml', 'r', encoding='utf-8') as yaml_file:
    loaded_dict = yaml.load(yaml_file, Loader=yaml.FullLoader)

# loaded_dict now holds the reloaded data; show how many combats it contains.
len(loaded_dict)


KeyboardInterrupt: 