# Libraries

In [None]:
import numpy as np
import pandas as pd
import json
import glob
import ast
import gzip
import os
import yaml
from tqdm import tqdm
import re
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
import sqlite3

# Functions

In [None]:
def filter_std_player_classes_monsters(df):
    """Keep only encounters with standard-class parties and usable values.

    A row is kept when all of the following hold:

    * every class entry of every player in ``player_info`` names one of the
      standard classes listed below,
    * ``monsters_info`` is not the literal string ``"[]"``,
    * ``party_total_hpratio`` is not missing,
    * ``party_total_precombat_hp`` and ``party_total_postcombat_hp`` are both
      below 1e10.

    Args:
        df: DataFrame holding the columns named above. ``player_info`` must
            contain the string form of a list of dicts, each with a 'class'
            entry whose items start with the class name.

    Returns:
        The filtered DataFrame.
    """
    allowed_classes = {
        'Barbarian', 'Bard', 'Cleric', 'Druid', 'Fighter', 'Monk',
        'Paladin', 'Ranger', 'Rogue', 'Sorcerer', 'Warlock', 'Wizard', 'Blood Hunter'
    }

    def only_standard_classes(player_list_str):
        # The cell text is parsed once per row; all() stops at the first
        # class name that is not in the allowed set.
        members = ast.literal_eval(player_list_str)
        return all(
            entry[0].strip() in allowed_classes
            for member in members
            for entry in member['class']
        )

    # Player classes: evaluated with a progress bar.
    tqdm.pandas(desc="Filtering Players")
    standard_party = df['player_info'].progress_apply(only_standard_classes)
    result = df[standard_party]

    # Monsters: the encounter must list at least one.
    result = result[result['monsters_info'].ne("[]")]

    # Party HP ratio must be present.
    result = result[result['party_total_hpratio'].notna()]

    # Pre-combat party HP must stay below the 1e10 cutoff.
    result = result[result['party_total_precombat_hp'].lt(1e10)]

    # Post-combat party HP must stay below the 1e10 cutoff.
    result = result[result['party_total_postcombat_hp'].lt(1e10)]

    return result

# Workflow

In [None]:
# Load the combat data CSV and report its (rows, columns) shape.
combat_data_df = pd.read_csv(filepath_or_buffer="filtered_combat_data_df_04-01.csv")
print(combat_data_df.shape)

# Optional step (disabled): apply the class/monster/HP filter defined above.
# combat_data_filtered_df = filter_std_player_classes_monsters(combat_data_df)
# print(combat_data_filtered_df.shape)

In [None]:
# csv_file_path = './filtered_combat_data_df_04-01.csv'
# combat_data_filtered_df.to_csv(csv_file_path, index=False)

In [None]:
# Load the working frame. The 'Unnamed: 0' column is a leftover index column
# from an earlier export; a later cell rewrites this same file with
# index=False, after which the column no longer exists. errors='ignore'
# keeps this cell re-runnable in both situations instead of raising KeyError.
df = pd.read_csv('./filtered_combat_data_df_04-01.csv').drop(
    columns=['Unnamed: 0'], errors='ignore'
)

# df = combat_data_filtered_df
# # Display the first few rows of the dataset
# display(df.head())
# # Display summary statistics
# display(df.describe())
# # Check for missing values
# #display(df.isnull().sum())

In [None]:
print(df.shape)


filtered = df.copy()

# Keep only encounters whose party has fewer than 10 members.
filtered = filtered[filtered['party_size'] < 10]

# Ability scores that are range-checked for every player.
stats = ['strength', 'dexterity', 'constitution', 'intelligence', 'wisdom', 'charisma']


def _has_overleveled_player(players):
    """Return True if any player's summed class levels exceed 20."""
    return any(
        sum(class_lvl[1] for class_lvl in player['class']) > 20
        for player in players
    )


def _has_invalid_hp(players):
    """Return True if any player has no hp_ratio or a max HP above 350."""
    return any(
        player['hp_ratio'] is None or player['hp_ratio'][1] > 350
        for player in players
    )


def _has_invalid_ac(players):
    """Return True if any player has no AC or an AC above 38."""
    return any(
        player['ac'] is None or player['ac'] > 38
        for player in players
    )


def _has_invalid_stats(players):
    """Return True if any player has no stats or an ability score above 22."""
    return any(
        player['stats'] is None
        or any(key in stats and value > 22 for key, value in player['stats'].items())
        for player in players
    )


def _keep_encounter(player_info_str):
    """Return True when every player in the row passes all range checks.

    The row's player_info text is parsed once. The checks run in the order
    level, HP, AC, ability scores and stop at the first one that fails.
    """
    players = ast.literal_eval(player_info_str)
    return not (
        _has_overleveled_player(players)
        or _has_invalid_hp(players)
        or _has_invalid_ac(players)
        or _has_invalid_stats(players)
    )


# All four criteria are applied. Previously the rows collected by the HP check
# were discarded (the drop list was reset before being used), so the HP filter
# had no effect.
# astype(bool) keeps the mask boolean even when `filtered` is empty.
keep_mask = filtered['player_info'].apply(_keep_encounter).astype(bool)
filtered = filtered[keep_mask]

filtered.shape

In [None]:
# Head-count ratios between party and monsters, in both directions.
filtered['player_monster_ratio'] = filtered['party_size'].div(filtered['monster_number'])

filtered['monster_player_ratio'] = filtered['monster_number'].div(filtered['party_size'])


In [None]:
# Persist the filtered frame (without the index column) to disk.
csv_file_path = './filtered_combat_data_df_04-01.csv'
filtered.to_csv(path_or_buf=csv_file_path, index=False)

In [None]:
# Names of the int64/float64 columns, with 'start_time' excluded.
num_features = (
    filtered
    .select_dtypes(include=['int64', 'float64'])
    .drop(columns=['start_time'])
    .columns
)
print(num_features)

In [None]:
# Histograms for numerical features.
# Plotting is best-effort: a feature that cannot be drawn is reported and
# skipped so the remaining features are still plotted.
for feature in num_features:
    try:
        plt.figure(figsize=(10, 4))
        sns.histplot(filtered[feature], kde=True)
        plt.title(f'Distribution of {feature}')
        plt.show()
    except Exception as exc:
        # Catch Exception rather than everything so Ctrl-C / kernel interrupts
        # still stop the loop, and report why the plot failed.
        print(f"Failed {feature}: {exc!r}")
        # Discard the partially drawn figure so it does not stay open.
        plt.close()


In [None]:
# Pairwise correlation of the numerical features, drawn as an annotated
# heatmap (sessions with no damage are part of the data).
corr = filtered[num_features].corr()
corr_filled = corr  # NaN cells are left as they are; corr.fillna(0) is the alternative

fig, ax = plt.subplots(figsize=(20, 16))
sns.heatmap(corr_filled, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Feature Correlation Heatmap (Sessions without Damage Included)')
fig.savefig("corr_no_dmg_included.png", format='png')

plt.show()


In [None]:
from ast import literal_eval

# 'party_classes_with_level' stores each party as the string form of a list of
# tuples (presumably (class name, level) pairs - verify against the data).
# literal_eval turns that text back into Python objects.

# Parse the first row on its own as a quick format check. Positional access
# (.iloc) is used so this also works when df has no row labelled 0, e.g. when
# df is a frame from which rows were filtered out.
sample_composition = literal_eval(df['party_classes_with_level'].iloc[0])

# Parse the column for every row.
df['party_composition_parsed'] = df['party_classes_with_level'].apply(literal_eval)

# Now, let's create a simplified representation: count of each class in a combat
# This is a simplification. A more detailed analysis might consider levels, or the presence of specific classes.
def simplify_composition(composition):
    """Count how many entries of a party composition name each class.

    Args:
        composition: iterable of sequences whose first item is the class name.

    Returns:
        Dict mapping class name to its number of occurrences, keyed in
        first-seen order.
    """
    tally = {}
    for entry in composition:
        name = entry[0]  # first item is the class name
        tally[name] = tally.get(name, 0) + 1
    return tally

# Per-row class counts derived from the parsed party composition.
df['simple_composition'] = [
    simplify_composition(party) for party in df['party_composition_parsed']
]

# Preview the parsed party, its class counts and the party HP ratio side by side.
df.loc[:, ['party_composition_parsed', 'simple_composition', 'party_total_hpratio']].head()



In [None]:
# Group encounters by party composition and summarise the party HP ratio.
# A dict cannot be used as a group key, so each class-count dict is turned
# into the string form of its sorted (class, count) items. Parties with the
# same classes in the same quantities share a key; anything not in the count
# dict (such as levels) is not part of the key.
df['composition_str'] = [
    str(sorted(counts.items())) for counts in df['simple_composition']
]

# Mean, number of encounters and standard deviation of the HP ratio per
# composition, with descriptive column names.
composition_success = (
    df.groupby('composition_str')['party_total_hpratio']
    .agg(Average_HP_Ratio='mean', Encounter_Count='count', HP_Ratio_STD='std')
    .reset_index()
    .rename(columns={'composition_str': 'Composition'})
)

# Keep compositions seen more than once and rank them by average HP ratio,
# highest first.
composition_success_filtered = (
    composition_success
    .loc[composition_success['Encounter_Count'].gt(1)]
    .sort_values(by='Average_HP_Ratio', ascending=False)
)

# composition_success_filtered.head(10)


In [None]:
df['simple_composition']