# Libraries

In [None]:
import numpy as np
import pandas as pd
import json
import glob
import ast
import gzip
import os
import yaml
from tqdm import tqdm
import re
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt

# Functions

In [None]:
### Run this code on the unfiltered CSV that was extracted from the raw data



def filter_std_player_classes_monsters(df):
    """Return the rows of *df* that pass the basic encounter sanity filters.

    A row is kept only when all of the following hold:

    * every class of every player listed in ``player_info`` (a string holding
      a Python-literal list of dicts) is one of the standard classes below,
    * ``monsters_info`` is not the literal string ``"[]"``,
    * ``party_total_hpratio`` is not missing,
    * ``party_total_precombat_hp`` and ``party_total_postcombat_hp`` are both
      below 1e10.

    The input frame is not modified; a filtered frame is returned.
    """
    allowed_classes = frozenset((
        'Barbarian', 'Bard', 'Cleric', 'Druid', 'Fighter', 'Monk',
        'Paladin', 'Ranger', 'Rogue', 'Sorcerer', 'Warlock', 'Wizard', 'Blood Hunter',
    ))

    def are_all_classes_standard(player_list_str):
        # Each entry of a player's 'class' list is indexed at [0] for the
        # class name; names are stripped before the membership test.
        return all(
            class_info[0].strip() in allowed_classes
            for player in ast.literal_eval(player_list_str)
            for class_info in player['class']
        )

    # Registers `progress_apply` on pandas objects so the scan shows a bar.
    tqdm.pandas(desc="Filtering Players")
    only_standard = df['player_info'].progress_apply(are_all_classes_standard)
    kept = df[only_standard]

    # Encounters must list at least one monster.
    has_monsters = kept['monsters_info'] != "[]"
    kept = kept[has_monsters]

    # The party HP ratio must be present.
    kept = kept.dropna(subset=['party_total_hpratio'])

    # Discard rows whose pre-combat party HP is 1e10 or more.
    kept = kept[kept['party_total_precombat_hp'] < 1e10]

    # Discard rows whose post-combat party HP is 1e10 or more.
    kept = kept[kept['party_total_postcombat_hp'] < 1e10]

    return kept

# Workflow

In [None]:
# NOTE(review): `df_filtered` is not assigned at top level anywhere in the
# visible notebook; presumably it is the output of
# filter_std_player_classes_monsters(...) on the raw CSV -- confirm.
print(df_filtered.shape)


filtered = df_filtered.copy()

# Keep only parties with fewer than 10 members
filtered = filtered[filtered['party_size'] < 10]

# Ability scores that are range-checked for every player
stats = ['strength', 'dexterity', 'constitution', 'intelligence', 'wisdom', 'charisma']


def _has_implausible_player(player_info_str):
    """Return True if any player in a row fails one of the sanity checks.

    ``player_info_str`` is the string form of a list of player dicts.  The
    row is flagged when any player has:

    * a total level (summed over its ``class`` entries) above 20,
    * no ``hp_ratio`` or a max HP (``hp_ratio[1]``) above 350,
    * no ``ac`` or an AC above 38,
    * no ``stats`` or any of the six ability scores above 22.

    The checks run in that order and stop at the first failure.
    """
    players = ast.literal_eval(player_info_str)

    if any(sum(class_lvl[1] for class_lvl in p['class']) > 20 for p in players):
        return True
    if any(p['hp_ratio'] is None or p['hp_ratio'][1] > 350 for p in players):
        return True
    if any(p['ac'] is None or p['ac'] > 38 for p in players):
        return True
    return any(
        p['stats'] is None
        or any(key in stats and value > 22 for key, value in p['stats'].items())
        for p in players
    )


# Parse each row once and collect every offending index.  Previously the HP
# check's indices were thrown away (the list was reset before being dropped),
# so rows with missing or oversized HP were never removed.
rows_to_drop = [
    index
    for index, player_info_str in filtered['player_info'].items()
    if _has_implausible_player(player_info_str)
]

filtered = filtered.drop(rows_to_drop)

filtered.shape

In [None]:
# Derived features: party-size / monster-count ratio and its reciprocal
party_size = filtered['party_size']
monster_count = filtered['monster_number']

filtered['player_monster_ratio'] = party_size.div(monster_count)
filtered['monster_player_ratio'] = monster_count.div(party_size)


In [None]:
import ast


def _parse_class_list(value):
    """Return the party's class names as a list with whitespace stripped.

    Accepts either the string form of a list (as read from the CSV) or an
    already-parsed list, so re-running this cell does not fail.
    """
    names = ast.literal_eval(value) if isinstance(value, str) else value
    # Strip every name (not only 'Barbarian ') so the stored list always
    # agrees with the indicator column names created below.
    return [name.strip() for name in names]


filtered['party_total_class_composition'] = (
    filtered['party_total_class_composition'].apply(_parse_class_list)
)

# Get all unique classes
all_classes_str = {
    class_name
    for class_names in filtered['party_total_class_composition']
    for class_name in class_names
}

# One indicator column per class: 1 if the class appears in the party, else 0.
# Sorted so the column order is the same on every run.
for class_name in sorted(all_classes_str):
    filtered[class_name] = (
        filtered['party_total_class_composition']
        .apply(lambda class_names: class_name in class_names)
        .astype('int64')
    )



In [None]:
# Preview the first rows of the filtered frame, including the new columns
filtered.head()

In [None]:
# filtered.to_csv('filtered_24_4_5.csv', index = False)

In [None]:
# Load the scaled dataset used for plotting and modeling below.
# NOTE(review): hard-coded absolute Windows path; this cell only runs on a
# machine with this exact directory layout. The file name also differs from
# the 'filtered_24_4_5.csv' export above -- presumably it is produced by a
# separate scaling step; confirm.
x = pd.read_csv('C:\\Erdos\\Project\\DnDFireballProject\\is\\DnDFireballProject\\scaled_filtered_24_4_5.csv')

In [None]:
# Scatter of party HP ratio against weighted monster level, followed by the
# correlation between the two columns (shown as the cell output)
hp_ratio = x['party_total_hpratio']
monster_level = x['weighted_monster_level']

plt.scatter(hp_ratio, monster_level)
hp_ratio.corr(monster_level)

# Modeling

In [None]:
# Model on an independent copy so the loaded frame `x` stays untouched
df = x.copy(deep=True)
df.dtypes

In [None]:
# List the available column names before choosing model features
df.columns

In [None]:
# Columns fed to the multi-feature models, grouped by kind
features_to_include = [
    # encounter size and difficulty
    'party_size', 'monster_number', 'monster_total_level',
    # party-wide totals
    'party_total_ac', 'party_total_prof_bonus',
    'party_total_strength', 'party_total_dexterity', 'party_total_constitution',
    'party_total_intelligence', 'party_total_wisdom', 'party_total_charisma',
    # party / monster count ratios
    'player_monster_ratio', 'monster_player_ratio',
    # class indicator columns
    'Druid', 'Cleric', 'Wizard', 'Rogue', 'Warlock', 'Sorcerer',
    'Blood Hunter', 'Monk', 'Bard', 'Barbarian', 'Fighter', 'Paladin', 'Ranger',
    # level-weighted monster difficulty
    'weighted_monster_level',
]

## Baseline Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Naive reference model: predict the training-set mean of the target for
# every test row and report the resulting RMSE.

# Predictor (only needed to drive the split) and target
X = df[['monster_total_level']]
y = df['party_total_hpratio']

# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Constant prediction: the mean of the training targets, once per test row
mean_train = y_train.mean()
mean_predictions = np.repeat(mean_train, len(y_test))

# Root mean squared error of the constant prediction
mse = mean_squared_error(y_test, mean_predictions)
rmse = np.sqrt(mse)

print("Baseline Model RMSE:", rmse)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Single-feature linear regression: party HP ratio from total monster level.

# Prepare your data
X = df['monster_total_level'].values.reshape(-1, 1)  # Predictor as a 2-D column
y = df['party_total_hpratio']  # Target variable

# Split the data. random_state=42 matches the other model cells that use a
# 0.2 test split, so every RMSE is measured on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)

# Predicting on the test set
y_pred = model.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Labelled by model and feature so the output is not mistaken for the baseline
print("Linear Regression (monster_total_level) RMSE:", rmse)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Single-feature linear regression: party HP ratio from weighted monster level.

# Prepare your data
X = df['weighted_monster_level'].values.reshape(-1, 1)  # Predictor as a 2-D column

y = df['party_total_hpratio']  # Target variable

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)

# Predicting on the test set
y_pred = model.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Labelled by model and feature so the output is not mistaken for the baseline
print("Linear Regression (weighted_monster_level) RMSE:", rmse)


## Random Forest Regressor

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Random forest on the full feature list, evaluated on a held-out 20% split.

# Defining the target variable
target = 'party_total_hpratio'

# Preparing the features and target variable
X_rf = df[features_to_include].copy()
y = df[target].copy()

# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_rf, y, test_size=0.2, random_state=42)

# Creating the RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Training the model
regressor.fit(X_train, y_train)

# Predicting on the test set
y_pred = regressor.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
# Print explicitly: a bare `rmse` is only echoed when it is the last
# expression in the cell, which it is not here.
print("Random Forest RMSE:", rmse)

# Predicted vs. actual values on the test set
plt.scatter(y_test, y_pred)
plt.xlabel('Actual party_total_hpratio')
plt.ylabel('Predicted party_total_hpratio')
plt.show()

## PLS (Partial Least Squares Regression)

In [None]:
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error

# Partial Least Squares regression on the full feature list. The number of
# components is chosen by cross-validation on the training split, then the
# model is refit with that number and evaluated once on the test split.

X_PLA = df[features_to_include].copy()
Y = df[target]

# Splitting the dataset
# NOTE(review): this split (default test size, random_state=35) differs from
# the test_size=0.2 / random_state=42 split used by the other model cells, so
# the metrics below are computed on different held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X_PLA, Y, random_state=35)

# Using KFold for cross-validation to find the optimal number of components
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Never ask for more components than there are features
max_components = min(20, X_train.shape[1])
scores = []
for i in range(1, max_components + 1):
    candidate = PLSRegression(n_components=i)
    # Cross-validate on the training split only; the test split stays unseen
    score = -cross_val_score(candidate, X_train, y_train, cv=kf, scoring='neg_mean_squared_error').mean()
    scores.append(score)

# scores[k] belongs to k + 1 components
optimal_components = int(np.argmin(scores)) + 1

# Refit with the selected number of components so the reported metrics (and
# Y_pred) describe the tuned model rather than an arbitrary 4-component one.
pls = PLSRegression(n_components=optimal_components)
pls.fit(X_train, y_train)
Y_pred = pls.predict(X_test)

# Evaluation metrics for the tuned model on the held-out test split
test_mse = mean_squared_error(y_test, Y_pred)
print(f"R-squared: {r2_score(y_test, Y_pred)}")
print(f"Mean Squared Error: {test_mse}")
print(f"Root Mean Squared Error: {np.sqrt(test_mse)}")
print(f"Optimal number of components based on CV: {optimal_components}")



In [None]:
# Predicted (Y_pred) vs. actual (y_test) values from the preceding PLS cell
plt.scatter(y_test,Y_pred)