In [1]:
import numpy as np
import pandas as pd
import numpy.random as rnd
import seaborn as sns
from matplotlib import animation
import pymc3 as pm
import arviz as az
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

## 1. Read datasets

In [2]:
# Read the data
file = "2krolls_plus_data.csv"
data = pd.read_csv(file)
data = data.iloc[:,1:]
data.head()

Unnamed: 0,roll,adv,stat,basis,type
0,13,advantage,STR,dexterity,save
1,16,advantage,ATTACK,stealth,check
2,14,advantage,INIT,melee,attack
3,18,advantage,DEX,perception,check
4,2,disadvantage,CHA,melee,attack


In [3]:
file2 = "TravisRollsDataset.csv"
data2 = pd.read_csv(file2)
data2 = data2.iloc[:,1:]
data2.head()

Unnamed: 0,season,episode,roll,mod,basis,type,adv,nat,adv_num,stat
0,3,1,13,0,wisdom,save,,False,0,WIS
1,3,1,17,0,investigation,check,,False,0,INT
2,3,1,4,2,dexterity,save,,False,0,DEX
3,3,2,16,7,melee,attack,,False,0,ATTACK
4,3,2,16,7,melee,attack,advantage,False,1,ATTACK


## 2. Data pre-processing

In [4]:
# create a column to delineate between simulated data and Travis' rolls
# 0 is a Travis roll
deter = [0] * len(data2)
data2["deter"] = deter

# 1 is a generated roll
deter = [1] * len(data)
data["deter"] = deter

In [5]:
# pick out shared columns
col_names = list(data.columns)
merge1 = data[col_names]
merge2 = data2[col_names]

# merge the datasets
total_data_rodeo = pd.concat([merge1, merge2], axis=0).reset_index()

# shuffle the data 3x in replicable way
rodeo = total_data_rodeo.sample(frac=1, random_state=1234567).reset_index(drop=True)
rodeo = rodeo.sample(frac=1, random_state=57389).reset_index(drop=True)
rodeo = rodeo.sample(frac=1, random_state=98754).reset_index(drop=True)
rodeo = rodeo[col_names]

In [6]:
# put rolls into bins
rodeo["new_rolls"] = pd.cut(rodeo.roll, bins=[0, 12, 20], labels=["0-11", "12-20"])
rodeo = rodeo.drop("roll", axis=1)

In [7]:
# sample 2/3 of the data
dnd_train, dnd_test = train_test_split(rodeo, test_size=1/3, random_state = 2263748)

In [8]:
#dnd_train = dnd_train.set_index("deter")
# dummy code the variables to get the word list
dnd_dum = pd.get_dummies(dnd_train)

column_list = list(dnd_dum.columns)
column_list.remove("deter")
#column_list.remove("new_rolls")

In [9]:
agg_dict = {}
for column in column_list:
    agg_dict[column] = "sum"

In [10]:
# group by determination, sum all other columns
groupies = pd.DataFrame(dnd_dum.groupby("deter").agg(agg_dict))

dummies = groupies.T
dummies.head()

deter,0,1
adv_advantage,10.0,1051.0
adv_disadvantage,2.0,282.0
stat_ATTACK,36.0,428.0
stat_CHA,8.0,80.0
stat_CON,5.0,38.0


## 3. Generate Priors

In [11]:
# calculate the priors
count = dnd_train.groupby("deter").size()
print("There are", count.iloc[0], "Travis rolls and", count.iloc[1], "Roll20 generated rolls in our training set." )

nrolls = np.array([count.iloc[0], count.iloc[1]])

# take sums
total_rolls = sum(nrolls)

# find proportion of edible and poisonous mushrooms to all
priors = nrolls/total_rolls
print("\nThe prior probability of a roll being from Travis is", round(priors[0], 4), "and that of being Roll20 generated is", round(priors[1], 4))


There are 113 Travis rolls and 1333 Roll20 generated rolls in our training set.

The prior probability of a roll being from Travis is 0.0781 and that of being Roll20 generated is 0.9219


In [12]:
# find the sum of the wordlist
dum_sums = dummies.sum(0)

# number of distinct words in the data
v = len(dummies.index)
# smoothing parameter
α = 1.5

In [13]:
dummies.head()

deter,0,1
adv_advantage,10.0,1051.0
adv_disadvantage,2.0,282.0
stat_ATTACK,36.0,428.0
stat_CHA,8.0,80.0
stat_CON,5.0,38.0


## 4. Generate Likelihoods

In [14]:
# find the likelihood of each trait in the list
proll = np.zeros([len(dummies.index), len(dummies.columns)])
for i in range(len(dummies.index)):
    for j in range(len(dummies.columns)):
        proll[i,j] = (dummies.iloc[i,j] + α)/(dum_sums[j] + v*α)
proll = pd.DataFrame(proll)
proll.head(5)

Unnamed: 0,0,1
0,0.022308,0.159797
1,0.00679,0.043043
2,0.072745,0.065209
3,0.018429,0.012374
4,0.012609,0.005997


In [15]:
# process the test data -- take dummies
dnd_test_dum = pd.get_dummies(dnd_test).reset_index(drop=True).copy()

# drop column we will predict
dnd_test_dum = dnd_test_dum.drop(columns=["deter"], inplace=False)
dnd_test_dum.head()

Unnamed: 0,adv_advantage,adv_disadvantage,stat_ATTACK,stat_CHA,stat_CON,stat_DEX,stat_INIT,stat_INT,stat_STR,stat_WIS,...,basis_sleight of hand,basis_stealth,basis_strength,basis_thieves’ tools,basis_wisdom,type_attack,type_check,type_save,new_rolls_0-11,new_rolls_12-20
0,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
4,1,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1


## 5. Calculate Posteriors

In [16]:
def calculate_posteriors(dummy_dataset, likelihood_src):
    # create dummy dataframes filled with 1s
    nrows = len(dummy_dataset.index)
    ncols = len(dummy_dataset.columns)
    likelihood_travis = pd.DataFrame(index = range(nrows), 
                        columns = range(ncols))
    likelihood_travis = likelihood_travis.fillna(value=1)

    likelihood_rand = pd.DataFrame(index = range(nrows), 
                        columns = range(ncols))
    likelihood_rand = likelihood_rand.fillna(value=1)

    likelihoods = pd.DataFrame(index = range(nrows), 
                        columns= range(len(likelihood_src.columns)))

    # replace 1 with probability of that predictor
    # time consuming
    for i in range(nrows):
        for j in range(ncols):
                if dummy_dataset.iloc[i,j] == 1:
                    likelihood_rand.iloc[i,j] = likelihood_src.iloc[j,0]
    # replace 1 with probability of that predictor
    # not a quick calculation
                    likelihood_travis.iloc[i,j] = likelihood_src.iloc[j,1]
        # aggregate likelihoods by taking the product
        likelihoods.iloc[i,0] = np.prod(likelihood_rand.iloc[i,:])
        likelihoods.iloc[i,1] = np.prod(likelihood_travis.iloc[i,:])

    #Find the posterior probability of each class assignment
    postprob = likelihoods.copy()

    for j in range(len(likelihoods.columns)):
        postprob.iloc[:,j] = likelihoods.iloc[:,j] * priors[j]
    postprob = pd.DataFrame(postprob)
    postprob.columns = ["deter_0", "deter_1"]
    postprob['prediction'] = postprob.apply(lambda x: \
                1 if x['deter_1'] > x['deter_0'] else 0, axis=1)
    
    return postprob

In [17]:
posterior_test = calculate_posteriors(dnd_test_dum, proll)
posterior_test.head()

Unnamed: 0,deter_0,deter_1,prediction
0,5.71596e-09,2.12269e-07,1
1,7.81887e-08,4.04518e-06,1
2,1.35226e-08,7.756e-07,1
3,3.48241e-09,1.25902e-07,1
4,4.5126e-08,1.79629e-06,1


In [18]:
conf_matrix = pd.DataFrame(confusion_matrix(dnd_test.deter, 
                        posterior_test.prediction, labels = [0,1]), 
                        columns = ["Predicted Travis", "Predicted Simulation"])
conf_matrix.index = conf_matrix.index.to_series().replace({0: "True Travis", 1: "True Simulation"})
conf_matrix

Unnamed: 0,Predicted Travis,Predicted Simulation
True Travis,0,57
True Simulation,0,667


## 6. Travis Only

In [19]:
# run previously built model only on Travis data
travis_data = data2.copy()
travis_data["deter"] = 0

# bin rolls
travis_data["new_rolls"] = pd.cut(travis_data.roll, bins=[0, 12, 20], labels=["0-11", "12-20"])
travis_data = travis_data.drop(["roll", "season", "episode", "mod", "nat", "adv_num"], axis=1)

In [20]:
# process the test data -- take dummies
travis_dum = pd.get_dummies(travis_data).reset_index(drop=True)

# drop column we will predict
travis_dum = travis_dum.drop(columns=["deter"], inplace=False)
travis_dum.head()

Unnamed: 0,basis_arcana,basis_athletics,basis_charisma,basis_constitution,basis_deception,basis_dexterity,basis_initiative,basis_insight,basis_intimidate,basis_intimidation,...,stat_DEX,stat_INIT,stat_INT,stat_STR,stat_WIS,stat_intimidate,stat_investigate,stat_thieves’ tools,new_rolls_0-11,new_rolls_12-20
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [21]:
posterior_travis = calculate_posteriors(travis_dum, proll)
posterior_travis

Unnamed: 0,deter_0,deter_1,prediction
0,1.3352e-09,2.88998e-09,1
1,9.34638e-09,8.58181e-09,0
2,7.14657e-09,2.81564e-08,1
3,3.21436e-09,1.56066e-09,0
4,1.96416e-10,9.18174e-11,0
...,...,...,...
165,2.00055e-09,9.53259e-10,0
166,3.21436e-09,1.15353e-09,0
167,2.00055e-09,9.53259e-10,0
168,1.14827e-08,3.40718e-08,1


In [22]:
conf_matrix = pd.DataFrame(confusion_matrix(travis_data.deter, 
                        posterior_travis.prediction, labels = [0,1]), 
                        columns = ["Predicted Travis", "Predicted Simulation"])
conf_matrix.index = conf_matrix.index.to_series().replace({0: "True Travis", 1: "True Simulation"})
conf_matrix

Unnamed: 0,Predicted Travis,Predicted Simulation
True Travis,79,91
True Simulation,0,0


## 7. All Data Together

In [23]:
# run model on all data together
rodeo = rodeo.copy()

# process the test data -- take dummies
all_dum = pd.get_dummies(rodeo).reset_index(drop=True)

# drop column we will predict
all_dum = all_dum.drop(columns=["deter"], inplace=False)
all_dum.head()

Unnamed: 0,adv_advantage,adv_disadvantage,stat_ATTACK,stat_CHA,stat_CON,stat_DEX,stat_INIT,stat_INT,stat_STR,stat_WIS,...,basis_sleight of hand,basis_stealth,basis_strength,basis_thieves’ tools,basis_wisdom,type_attack,type_check,type_save,new_rolls_0-11,new_rolls_12-20
0,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
2,1,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,1,0,0,1
3,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,1


In [24]:
posterior_all = calculate_posteriors(all_dum, proll)
posterior_all

Unnamed: 0,deter_0,deter_1,prediction
0,3.78959e-09,2.93589e-07,1
1,5.14169e-07,4.64101e-06,1
2,4.61288e-09,5.85014e-08,1
3,1.88298e-08,1.16714e-06,1
4,2.22622e-08,8.92918e-07,1
...,...,...,...
2165,3.75198e-09,1.70862e-07,1
2166,2.21608e-09,4.56569e-08,1
2167,2.80854e-08,1.48443e-06,1
2168,2.90849e-08,1.13571e-06,1


In [25]:
conf_matrix = pd.DataFrame(confusion_matrix(rodeo.deter, 
                                    posterior_all.prediction, 
                                    labels = [0,1]), 
                                columns = ["Predicted Travis", "Predicted Simulation"])
conf_matrix.index = conf_matrix.index.to_series().replace({0: "True Travis", 1: "True Simulation"})
conf_matrix

Unnamed: 0,Predicted Travis,Predicted Simulation
True Travis,2,168
True Simulation,0,2000
