In [None]:
# Basic imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Model imports
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

# Data processing

We start by importing our processed data, dropping columns unhelpful for classification, and separating the features from the target column.

In [None]:
# Load the processed monster table and discard the columns that are not
# useful as model inputs.
dnd = pd.read_csv("dnd_monsters_processed.csv")
dnd = dnd.drop(columns=["Index", "name"])

# "cr" (challenge rating) is the prediction target; every other column is a feature.
target = dnd["cr"]
features = dnd.drop(columns=["cr"])

Here's what the complete dataset looks like.

In [None]:
# Bare expression: the notebook renders the DataFrame as the cell's output.
dnd

Here's a visual for how many monsters are in each challenge rating class.

In [None]:
# Bar chart of how many monsters have each challenge rating, ordered by rating.
ax = (
    dnd.cr
    .value_counts()
    .sort_index()
    .plot(kind="bar", figsize=(14, 8), title="Count of Challenge Ratings")
)
ax.set_xlabel("Challenge Ratings")
ax.set_ylabel("Count")
# Bare expression so the notebook shows the axes as the cell output.
ax

For our machine learning models we'll need to convert the features and targets to numpy arrays.

In [None]:
# Feature columns as a NumPy array.
X = features.to_numpy()
# Challenge-rating targets as a NumPy array.
y = target.to_numpy()

Since not all models can take real-numbered targets, we also create a shifted target vector that maps the fractional challenge ratings (1/8, 1/4, 1/2) to the integers 1, 2, and 3, and shifts the challenge ratings of 1 and above up by 3 to make room for them.

In [None]:
# Map every challenge rating onto its own class value:
#   0 -> 0, 1/8 -> 1, 1/4 -> 2, 1/2 -> 3, and any rating r >= 1 -> r + 3.
# A rating of 0 is handled explicitly; sending it through the generic
# "+ 3" branch would give 3, the same class as 1/2.
y_shift = []
for rating in y:
    if rating == 0:
        y_shift.append(0)
    elif rating == .125:
        y_shift.append(1)
    elif rating == .25:
        y_shift.append(2)
    elif rating == .5:
        y_shift.append(3)
    else:
        y_shift.append(rating + 3)

In addition to using the whole dataset, we thought that it would be interesting to partition the dataset into different categories and test how much influence each category had. When examining the data, three main partitions stood out: general traits, mobility capabilities, and ability scores. To examine their individual impacts we created an array of modified observations with different combinations of these categories.

In [None]:
# Column groups used to partition the feature set.
traits = ["size", "type", "alignment", "legendary"]
mobility = ["speed", "swim", "climb", "fly", "burrow"]
abilityscore = ["ac", "hp", "strength", "dex", "con", "intel", "wis", "cha"]

# names[k] is the label for X_mod[k].
names = [
    "No traits",
    "No mobility",
    "No ability score",
    "Only traits",
    "Only mobility",
    "Only ability score",
]

# Each entry is the feature table with one or two column groups removed.
# The "Only ..." variants drop the other two groups.
X_mod = [
    features.drop(columns=traits),
    features.drop(columns=mobility),
    features.drop(columns=abilityscore),
    features.drop(columns=mobility + abilityscore),
    features.drop(columns=abilityscore + traits),
    features.drop(columns=traits + mobility),
]

# Testing Tools

To run our experiments, we created a helper function that takes a model, the observations, target values, and a number of trials to run. This allowed us to easily run experiments per model and find an average score for them.

In [None]:
def test_model(model, X, y, trials):
    """Fit and score a model on repeated random splits and average the scores.

    Args:
        model: object exposing ``fit(X, y)`` and ``score(X, y)``.
        X: observations, passed to ``train_test_split``.
        y: target values, passed to ``train_test_split``.
        trials: number of independent split/fit/score rounds.

    Returns:
        Tuple ``(mean training score, mean test score)`` over all rounds.
        The score is whatever ``model.score`` reports.
    """
    def run_once():
        # A fresh random split every round; the model is refit on it.
        X_train, X_test, y_train, y_test = train_test_split(X, y)
        model.fit(X_train, y_train)
        fit_score = model.score(X_train, y_train)
        holdout_score = model.score(X_test, y_test)
        return fit_score, holdout_score

    rounds = [run_once() for _ in range(trials)]
    train_scores = [pair[0] for pair in rounds]
    test_scores = [pair[1] for pair in rounds]
    return (np.mean(train_scores), np.mean(test_scores))

# Printing wrapper around test_model
def print_test_model(model, X, y, trials):
    """Run ``test_model`` and print both mean scores to five decimals.

    Takes the same arguments as ``test_model``; returns nothing.
    """
    train_mean, test_mean = test_model(model, X, y, trials)
    report = (
        ("Accuracy on training set: {:.5f}", train_mean),
        ("Accuracy on test set: {:.5f}\n", test_mean),
    )
    for template, score in report:
        print(template.format(score))

We also created a helper function to test how the model performs when using the various feature partitions. Since the partitions were based on human judgment we kept this one simple.

In [None]:
def test_model_mod(model, names, X, y, trials):
    """Print averaged train/test scores for a model on each feature partition.

    Args:
        model: estimator handed through to ``print_test_model``.
        names: label for each partition, printed before its scores.
        X: sequence of observation sets, indexed in step with ``names``.
        y: target values shared by every partition.
        trials: number of random splits to average per partition.
    """
    for i, name in enumerate(names):
        print(name)
        # Use the caller's trial count; it was previously hard-coded to 100,
        # so the ``trials`` argument had no effect.
        print_test_model(model, X[i], y, trials)

Lastly, we created functions to determine which targets the model missed. Instead of running multiple trials, these functions run a model just once. The most notable feature here is the margin of error checker, which gives a nice picture of how far off the model was on average.

In [None]:
# Collects [predicted, true] rows for every prediction that differs from its target
def find_missed(model, X, y):
    """Return the model's wrong predictions on ``X`` as an array of pairs.

    Args:
        model: fitted object exposing ``predict(X)``.
        X: observations to predict.
        y: true target values, indexable and aligned with the predictions.

    Returns:
        ``np.array`` with one ``[predicted, true]`` row per position where the
        prediction is not exactly equal to the target (empty if none differ).
    """
    predictions = model.predict(X)
    mismatches = []
    for idx in range(len(y)):
        guess, actual = predictions[idx], y[idx]
        if guess != actual:
            mismatches.append([guess, actual])
    return np.array(mismatches)
        
def test_missed_model(model, X, y):
    """Fit a model on a single random split and collect its scores and misses.

    Args:
        model: object exposing ``fit``, ``score`` and ``predict``.
        X: observations, passed to ``train_test_split``.
        y: target values, passed to ``train_test_split``.

    Returns:
        Tuple ``(train score, train misses, test score, test misses)``, where
        each "misses" entry is the array produced by ``find_missed``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    model.fit(X_train, y_train)

    # Score both halves first, then gather the individual wrong predictions.
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    missed_on_train = find_missed(model, X_train, y_train)
    missed_on_test = find_missed(model, X_test, y_test)

    return (train_score, missed_on_train, test_score, missed_on_test)

# Finds the average margin of error for a model
def find_margin(missed):
    """Return the mean absolute difference between predicted and true values.

    Args:
        missed: sequence of ``(predicted, true)`` pairs, e.g. the array
            returned by ``find_missed``.

    Returns:
        The average of ``abs(predicted - true)`` over all pairs, or ``0.0``
        when there are no pairs (no misses means no error to average).
    """
    # Guard the empty case; dividing by len(missed) would otherwise raise
    # ZeroDivisionError when a model gets every prediction right.
    if len(missed) == 0:
        return 0.0
    return sum(abs(pred - true) for pred, true in missed) / len(missed)

# Helper print functions #

# Summarise the misses and list the first n of them
def print_missed(missed, n):
    """Print the miss count, the average margin of error, and the first ``n`` misses.

    Args:
        missed: sequence of ``(predicted, true)`` pairs.
        n: maximum number of individual misses to list.
    """
    miss_count = len(missed)
    print("Number of incorrect predictions: {}".format(miss_count))
    mean_margin = find_margin(missed)
    print("Average margin of error: {}".format(mean_margin))
    shown = missed[:n]
    for row in shown:
        pred, true = row
        print("Predicted {}, was {}".format(pred, true))
    # Blank line separates this report from whatever is printed next.
    print()
    
def print_test_missed_model(model, X, y, n):
    """Run ``test_missed_model`` once and print a report for each split.

    Args:
        model: estimator handed through to ``test_missed_model``.
        X: observations.
        y: target values.
        n: maximum number of individual misses to list per split.
    """
    train, train_missed, test, test_missed = test_missed_model(model, X, y)
    sections = (
        ("Accuracy on training set: {:.5f}", train, train_missed),
        ("Accuracy on test set: {:.5f}", test, test_missed),
    )
    for header, score, missed in sections:
        print(header.format(score))
        print_missed(missed, n)

For each model family, we start by running the test_model function on all the features to create a baseline for expected accuracy. Next, we do a sample run to get an idea of what the model predicts incorrectly. After that, we experiment with different model parameters as applicable. Finally, we run the model against the different feature partitions to see if those have any influence.

# KNN Classification and Regression

As a baseline, we started by trying k-nearest-neighbors (KNN) classification and regression schemes. Since a KNN classifier cannot use continuous target values, we had to use the shifted targets. Let's start with trying out classification.

In [None]:
# Baseline: 5-neighbor KNN classifier on the integer-shifted targets,
# with scores averaged over 100 random train/test splits.
knn = KNeighborsClassifier(5)
print_test_model(knn, X, y_shift, 100)

Ouch. Not good. Let's take a look at some of the misclassifications.

In [None]:
# Fresh classifier on a single random split; list up to 10 misclassifications
# for the training set and for the test set.
knn = KNeighborsClassifier(5)
print_test_missed_model(knn, X, y_shift, 10)

Pretty significant number of misclassifications and a high margin of error. Let's see how regression fares.

In [None]:
# Same experiment with a 5-neighbor KNN regressor, averaged over 100 splits.
# NOTE(review): for scikit-learn regressors `score` is R^2, so the printed
# "Accuracy" is not directly comparable to classifier accuracy — confirm.
knn_reg = KNeighborsRegressor(5)
print_test_model(knn_reg, X, y_shift, 100)

Already that looks much better. Let's see what the incorrect predictions are like this time.

In [None]:
# Single split with a fresh regressor. find_missed compares with exact
# equality, so any prediction that is not exactly the target counts as missed.
knn_reg = KNeighborsRegressor(5)
print_test_missed_model(knn_reg, X, y_shift, 10)

Note that we have many more incorrect predictions, but the margin of error has decreased by half. This smaller margin is the reason why regression scores much better overall, as the incorrect regression predictions are off by much less compared to the classification predictions. This implies that classification schemes are less appropriate for our task because they force non-continuous predictions and thus will have wider margins of error.

Since regression is more appropriate for determining challenge rating, we choose to focus on regression model families for the rest of our experiments. But first, let's see if we can improve the knn results by changing the number of neighbors.

In [None]:
# Sweep the neighbor count from 1 to 19 and remember which one gives the
# highest mean test score (20 random splits per setting).
test_max = float("-inf")  # regressor scores can be negative, so 0 is not a safe floor
best_pos = 0

for i in range(1, 20):
    knn_reg = KNeighborsRegressor(i)
    train_mean, test_mean = test_model(knn_reg, X, y_shift, 20)
    if test_mean > test_max:
        test_max = test_mean
        best_pos = i

print("Best pos: {}".format(best_pos))
# Report the best score found, not the score of the last setting tried.
print("Best test mean: {}".format(test_max))

Repeating this test indicated that the best results typically occur between 12 and 18 neighbors. Since the accuracy was pretty consistent across tests, we'll split the difference and use 15. Now let's test this against the different partitions.

In [None]:
# 15-neighbor regressor evaluated on each feature partition in X_mod.
knn_reg = KNeighborsRegressor(15)
test_model_mod(knn_reg, names, X_mod, y_shift, 100)

# Linear Regression

Our next set of tests uses the linear regression model. This ran much faster compared to knn regression, but had a slight hit to accuracy.

In [None]:
# Linear regression on the original (unshifted) challenge ratings,
# averaged over 100 random splits.
lr = LinearRegression()
print_test_model(lr, X, y, 100)

Now for the missed:

In [None]:
# Single split with a fresh model; list up to 10 inexact predictions per set.
lr = LinearRegression()
print_test_missed_model(lr, X, y, 10)

And now for the modified features.

In [None]:
# Linear regression evaluated on each feature partition in X_mod.
lr = LinearRegression()
test_model_mod(lr, names, X_mod, y, 100)

# MLP Regression

We now move on to the multilayer perceptron (MLP) model. Before we run the tests, we're going to ignore warnings. The MLPRegressor will warn us whenever the model doesn't converge, which gets really distracting. Later we'll adjust the maximum number of iterations for the model, which can also eliminate the convergence warnings, but for presentation purposes we'll create the filter beforehand.

In [None]:
import warnings
# Silence warnings for the rest of the session so MLPRegressor's convergence
# warnings do not clutter the output.
# NOTE(review): 'ignore' with no category hides every warning, not only
# convergence warnings — consider filtering by category instead.
warnings.filterwarnings('ignore')

Now to actually test the MLP. This was the slowest to run of them all, so we had to reduce the number of trials to get results in a reasonable amount of time. As a consequence there was some more variance compared to when we ran trials on the other models, but MLPs still had the highest accuracy overall.

In [None]:
# MLP regressor with default parameters, averaged over 20 random splits.
mlp = MLPRegressor()
print_test_model(mlp, X, y, 20)

It's a slight improvement over the other models. Let's see what it gets wrong...

In [None]:
# Single split with a fresh default MLP; list up to 10 inexact predictions per set.
mlp = MLPRegressor()
print_test_missed_model(mlp, X, y, 10)

Overall, it's a good baseline, but let's see if we can improve it by fiddling with some parameters. Many are solver-specific, but the following are universal parameters that we can adjust. 

First, increasing the number of hidden layers generally improved the training accuracy, but didn't do much for the test set. If anything, test set accuracy went down slightly, which may be an indication of overfitting.

In [None]:
# Try 1 to 5 stacked hidden layers of 100 units each (default is [100]).
hidden_layer_count = [[100] * depth for depth in range(1, 6)]
for layers in hidden_layer_count:
    print("Hidden layers: {}".format(layers))
    mlp = MLPRegressor(hidden_layer_sizes=layers)
    print_test_model(mlp, X, y, 20)

Increasing the number of hidden units in a layer followed the same trend, but the test set accuracy didn't suffer as much. In fact, by the end it improved slightly, though it's unclear how much of that came from random variance.

In [None]:
# Try a single hidden layer of 100, 200, ..., 1000 units (default is [100]).
hidden_layer_sizes = [[units] for units in range(100, 1001, 100)]
for layer in hidden_layer_sizes:
    print("Hidden layer: {}".format(layer))
    mlp = MLPRegressor(hidden_layer_sizes=layer)
    print_test_model(mlp, X, y, 20)

The default relu activation function works the best.

In [None]:
# Compare each activation function on an otherwise default MLP (default is relu).
activation = ["relu", "identity", "logistic", "tanh"]
for act_fn in activation:
    print("Activation: " + act_fn)
    mlp = MLPRegressor(activation=act_fn)
    print_test_model(mlp, X, y, 20)

The default adam solver has the best test accuracy, but lbfgs did have better training accuracy. The sgd solver doesn't work at all though.

In [None]:
# Compare each solver on an otherwise default MLP (default is adam).
solvers = ["adam", "lbfgs", "sgd"]
for solver_name in solvers:
    print("Solver: " + solver_name)
    mlp = MLPRegressor(solver=solver_name)
    print_test_model(mlp, X, y, 20)

Adjusting the alpha caused some fluctuations but didn't result in any significant improvement.

In [None]:
# Sweep the alpha parameter across powers of ten (default is 0.0001).
alphas = [
    0.0001, 0.001, 0.01, 0.1,
    1, 10, 100, 1000, 10000,
]
for alpha_value in alphas:
    print("Alpha: {}".format(alpha_value))
    mlp = MLPRegressor(alpha=alpha_value)
    print_test_model(mlp, X, y, 20)

Lastly, increasing the iteration count to allow for convergence trended towards higher overall accuracy, though this seems to be more of a question of consistency than actual improvement.

In [None]:
# Raise the iteration cap in steps of 200, from 200 to 1000 (default is 200).
count = list(range(200, 1001, 200))
for iteration_cap in count:
    print("count: {}".format(iteration_cap))
    mlp = MLPRegressor(max_iter=iteration_cap)
    print_test_model(mlp, X, y, 20)

As a whole, it looks like changing individual parameters can only improve accuracy by around 1% at most. Let's try combining the best-performing parameters.

In [None]:
# Combined settings: two hidden layers of 800 units and up to 1500 iterations,
# averaged over 20 random splits.
mlp = MLPRegressor(hidden_layer_sizes=[800, 800], max_iter=1500)
print_test_model(mlp, X, y, 20)

Maybe a bit better, but not by much. While we could likely fine-tune the parameters to eke out more overall accuracy, it seems unlikely that it'll improve by more than a percent or two. The biggest change appears to be the amount of time the model takes to train, which may not be worth it for such small improvements.

Let's go back to the default parameters and look at the partitioned features once more.

In [None]:
# Default MLP evaluated on each feature partition in X_mod; requests 10 trials
# per partition because MLP training is slow.
mlp = MLPRegressor()
test_model_mod(mlp, names, X_mod, y, 10)

# Conclusions

Of the models we tested, MLPs performed the best.

One possible explanation for why accuracy seems to be stuck in the 90-95% range is the distribution of challenge ratings. As we saw with the initial graph, there are far more monsters on the lower end of the spectrum than high-ranking ones.