In [128]:
import numpy as np
import pandas as pd
import numpy.random as rnd
import seaborn as sns
from matplotlib import animation
import pymc3 as pm
import arviz as az
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [129]:
# Read the data

file = "5krolls_plus_data.csv"

data = pd.read_csv(file)
data.head()

Unnamed: 0,id,roll,adv,basis,type
0,0,20,none,initiative,
1,1,15,none,melee,attack
2,2,2,none,melee,attack
3,3,16,advantage,perception,check
4,4,15,none,melee,attack


In [130]:
file2 = "TravisRollsDataset.csv"

data2 = pd.read_csv(file2)
data2.head()

Unnamed: 0,id,season,episode,roll,mod,basis,type,adv,critical
0,2,3,1,13,0,wisdom,save,none,none
1,3,3,1,17,0,investigation,check,none,none
2,4,3,1,4,2,dexterity,save,none,none
3,6,3,2,16,7,melee,attack,none,none
4,7,3,2,16,7,melee,attack,advantage,none


In [131]:
# create a column to delineate between simulated data and Travis' rolls
# 0 is a Travis roll
deter = [0] * len(data2)
data2["deter"] = deter

# 1 is a generated roll
deter = [1] * len(data)
data["deter"] = deter

In [132]:
# # make list of column names
# 
# col_names.remove("Unnamed: 0")
# col_names.remove("basis_type")
# col_names

In [133]:
# pick out shared columns
col_names = list(data.columns)
merge1 = data[col_names]
merge2 = data2[col_names]

# merge the datasets
total_data_rodeo = pd.concat([merge1, merge2], axis=0).reset_index()

# shuffle the data 3x in replicable way
rodeo = total_data_rodeo.sample(frac=1, random_state=1234567).reset_index(drop=True)
rodeo = rodeo.sample(frac=1, random_state=57389).reset_index(drop=True)
rodeo = rodeo.sample(frac=1, random_state=98754).reset_index(drop=True)
rodeo = rodeo[col_names]
print(rodeo.shape)
rodeo.head()

(5170, 6)


Unnamed: 0,id,roll,adv,basis,type,deter
0,3613,5,none,initiative,,1
1,4254,20,none,melee,attack,1
2,3641,18,none,constitution,check,1
3,1281,8,none,perception,check,1
4,1542,16,none,melee,attack,1


In [134]:
series_vals = [str(x) for x in rodeo['roll'].values.tolist()]
cats = [str(x) for x in list(range(1,21))]

rodeo['roll'] = pd.Categorical(series_vals, 
                                   categories= cats,
                                   ordered=False)
rodeo["basis_type"] = rodeo["basis"].astype(str) + "_" + rodeo["type"].astype(str)
rodeo.head()

Unnamed: 0,id,roll,adv,basis,type,deter,basis_type
0,3613,5,none,initiative,,1,initiative_nan
1,4254,20,none,melee,attack,1,melee_attack
2,3641,18,none,constitution,check,1,constitution_check
3,1281,8,none,perception,check,1,perception_check
4,1542,16,none,melee,attack,1,melee_attack


In [135]:
rodeo = rodeo.drop("basis", axis=1)
rodeo = rodeo.drop("type", axis=1)
rodeo = rodeo.drop("id", axis=1)
rodeo.head()

Unnamed: 0,roll,adv,deter,basis_type
0,5,none,1,initiative_nan
1,20,none,1,melee_attack
2,18,none,1,constitution_check
3,8,none,1,perception_check
4,16,none,1,melee_attack


In [136]:
# put rolls into bins
#rodeo["new_rolls"] = pd.cut(rodeo.roll, bins=[0, 12, 20], labels=["0-11", "12-20"])
# rodeo = rodeo.drop("roll", axis=1)
# rodeo.head()

In [137]:
# sample 2/3 of the data
dnd_train, dnd_test = train_test_split(rodeo, test_size=1/3, random_state = 600)

In [138]:
#dnd_train = dnd_train.set_index("deter")
# dummy code the variables to get the word list
dnd_dum = pd.get_dummies(dnd_train)

column_list = list(dnd_dum.columns)
column_list.remove("deter")
#column_list.remove("new_rolls")
dnd_dum.head()

Unnamed: 0,deter,roll_1,roll_2,roll_3,roll_4,roll_5,roll_6,roll_7,roll_8,roll_9,...,basis_type_nature_check,basis_type_perception_check,basis_type_persuasion_check,basis_type_ranged_attack,basis_type_sleight of hand_check,basis_type_stealth_check,basis_type_strength_check,basis_type_strength_save,basis_type_thieves’ tools_check,basis_type_wisdom_save
1446,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1903,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
384,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
682,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5025,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [139]:
# group by determination, sum all other columns
agg_dict = {col: "sum" for col in column_list}
groupies = pd.DataFrame(dnd_dum.groupby("deter").agg(agg_dict))

dummies = groupies.T
dummies

deter,0,1
roll_1,3.0,155.0
roll_2,1.0,170.0
roll_3,2.0,143.0
roll_4,5.0,133.0
roll_5,1.0,171.0
roll_6,1.0,159.0
roll_7,3.0,169.0
roll_8,2.0,178.0
roll_9,2.0,129.0
roll_10,2.0,171.0


In [140]:
# calculate the priors
count = dnd_train.groupby("deter").size()
print("There are", count.iloc[0], "Travis rolls and", count.iloc[1], "Roll20 generated rolls in our training set." )

nrolls = np.array([count.iloc[0], count.iloc[1]])

# take sums
total_rolls = sum(nrolls)

# find proportion of edible and poisonous mushrooms to all
priors = nrolls/total_rolls
print("\nThe prior probability of a roll being from Travis is", round(priors[0], 4), "and that of being Roll20 generated is", round(priors[1], 4))

There are 113 Travis rolls and 3333 Roll20 generated rolls in our training set.

The prior probability of a roll being from Travis is 0.0328 and that of being Roll20 generated is 0.9672


In [141]:
# find the sum of the wordlist
dum_sums = dummies.sum(0)

# number of distinct words in the data
v = len(dummies.index)
# smoothing parameter
α = 1.5

In [142]:
# find the likelihood of each trait in the list
n = len(dummies.index)
k = len(dummies.columns)
proll = np.zeros([n, k])
dummies['0_lik'] = (dummies[0] + α) / (sum(dummies[0]) + v*α)
dummies['1_lik'] = (dummies[1] + α) / (sum(dummies[1]) + v*α)
# for i in range(n):
#     for j in range(k):
#         proll[i,j] = (dummies.iloc[i,j] + α)/(dum_sums[j] + v*α)
# proll = pd.DataFrame(proll)
# proll.head(5)
dummies.head()

deter,0,1,0_lik,1_lik
roll_1,3.0,155.0,0.010989,0.015542
roll_2,1.0,170.0,0.006105,0.017032
roll_3,2.0,143.0,0.008547,0.01435
roll_4,5.0,133.0,0.015873,0.013357
roll_5,1.0,171.0,0.006105,0.017131


In [143]:
# process the test data -- take dummies
dnd2 = pd.get_dummies(dnd_test).reset_index(drop=True)

# drop column we will predict
dnd2 = dnd2.drop(columns=["deter"], inplace=False)
dnd2.head()

Unnamed: 0,roll_1,roll_2,roll_3,roll_4,roll_5,roll_6,roll_7,roll_8,roll_9,roll_10,...,basis_type_nature_check,basis_type_perception_check,basis_type_persuasion_check,basis_type_ranged_attack,basis_type_sleight of hand_check,basis_type_stealth_check,basis_type_strength_check,basis_type_strength_save,basis_type_thieves’ tools_check,basis_type_wisdom_save
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [148]:
# create dummy dataframes filled with 1s
nrows = len(dnd2.index)
ncols = len(dnd2.columns)
likelihood_travis = pd.DataFrame(index = range(nrows), columns = range(ncols))
likelihood_travis = likelihood_travis.fillna(value=1)

likelihood_rand = pd.DataFrame(index = range(nrows), columns = range(ncols))
likelihood_rand = likelihood_rand.fillna(value=1)

likelihoods = pd.DataFrame(index = range(nrows), columns= range(k))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,46
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1719,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1720,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1721,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1722,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [150]:
# replace 1 with probability of that predictor
# time consuming
for i in range(nrows):
    for j in range(ncols):
            # replace 1 with probability of that predictor
            if dnd2.iloc[i,j] == 1:
                likelihood_rand.iloc[i,j] = proll[j,0]
                likelihood_travis.iloc[i,j] = proll[j,1]
    # aggregate likelihoods by taking the product
    likelihoods.iloc[i,0] = np.prod(likelihood_rand.iloc[i,:])
    likelihoods.iloc[i,1] = np.prod(likelihood_travis.iloc[i,:])

In [186]:
# # replace 1 with probability of that predictor
# # not a quick calculation
# for i in range(nrows):
#     for j in range(ncols):
#             if dnd2.iloc[i,j] == 1:
#                 likelihood_travis.iloc[i,j] = proll.iloc[j,1]

In [187]:

# likelihoods = pd.DataFrame(index = range(nrows), columns= range(len(proll.columns)))
# for i in range(nrows):
#     likelihoods.iloc[i,0] = np.prod(likelihood_rand.iloc[i,:])
#     likelihoods.iloc[i,1] = np.prod(likelihood_travis.iloc[i,:])

In [151]:
#Find the posterior probability of each row in class 0 vs 1
postprob = likelihoods.copy()

for j in range(len(likelihoods.columns)):
    postprob.iloc[:,j] = likelihoods.iloc[:,j] * priors[j]
postprob = pd.DataFrame(postprob)
postprob.columns = ["deter_0", "deter_1"]

In [152]:
# create empty dataframe with columns full of zeroes
new_dnd = pd.DataFrame()
new_dnd["roll_generated_pred"] = np.zeros(nrows)
new_dnd["roll_Travis_pred"] = np.zeros(nrows)

# compare to impose classification
for i in range(nrows):
    if postprob["deter_1"][i] > postprob["deter_0"][i]:
        new_dnd["roll_generated_pred"][i] = 1
    else:
        new_dnd["roll_Travis_pred"][i] = 1
new_dnd.head()

Unnamed: 0,roll_generated_pred,roll_Travis_pred
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0


In [153]:
# set up comparisons
dnd_test = pd.DataFrame(dnd_test)
new_dnd = pd.DataFrame(new_dnd)
TP = 0
FP = 0
TN = 0
FN = 0
for i in range(nrows):
    if int(new_dnd["roll_generated_pred"][i]) == 0 and dnd_test["deter"].iloc[i] == 0:
        TN += 1
    elif int(new_dnd["roll_generated_pred"][i]) == 1 and dnd_test["deter"].iloc[i] == 0:
        FN += 1
    elif int(new_dnd["roll_generated_pred"][i]) == 1 and dnd_test["deter"].iloc[i] == 1:
        TP += 1
    else:
        FP += 1

In [154]:
# build the confusion matrix
confusing_dnd = pd.DataFrame(columns = ["Predicted Travis", "Predicted Roll20"], index = ["Actual Travis", "Actual Roll20"])

confusing_dnd["Predicted Travis"] = [TN, FN]
confusing_dnd["Predicted Roll20"] = [FP, TP]
confusing_dnd

Unnamed: 0,Predicted Travis,Predicted Roll20
Actual Travis,57,1667
Actual Roll20,0,0


In [155]:
# run previously built model only on Travis data
travis_data = data2

# change classification to 1 instead of 0
travis_data["deter"] = 1
# bin rolls
travis_data["new_rolls"] = pd.cut(travis_data.roll, bins=[0, 12, 20], labels=["0-11", "12-20"])
travis_data = travis_data.drop(["roll", "id", "season", "episode", "mod", "critical"], axis=1)

In [156]:
# process the test data -- take dummies
travis_dum = pd.get_dummies(travis_data).reset_index(drop=True)

# drop column we will predict
travis_dum = travis_dum.drop(columns=["deter"], inplace=False)
travis_dum.head()

Unnamed: 0,basis_arcana,basis_athletics,basis_charisma,basis_constitution,basis_deception,basis_dexterity,basis_initiative,basis_insight,basis_intimidation,basis_investigation,...,basis_thieves’ tools,basis_wisdom,type_attack,type_check,type_save,adv_advantage,adv_disadvantage,adv_none,new_rolls_0-11,new_rolls_12-20
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,1
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,1,0,1
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,1,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,1


In [157]:
# create dummy dataframes filled with 1s
nrows = len(travis_dum.index)
ncols = len(travis_dum.columns)
likeli_yes = pd.DataFrame(index = range(nrows), columns = range(ncols))
likeli_yes = likeli_yes.fillna(value=1)

likeli_no = pd.DataFrame(index = range(nrows), columns = range(ncols))
likeli_no = likeli_no.fillna(value=1)

In [159]:
# replace 1 with probability of that predictor
# time consuming
for i in range(nrows):
    for j in range(ncols):
            if travis_dum.iloc[i,j] == 1:
                likeli_yes.iloc[i,j] = proll[j,0]

In [160]:
# replace 1 with probability of that predictor
# not a quick calculation
for i in range(nrows):
    for j in range(ncols):
            if travis_dum.iloc[i,j] == 1:
                likeli_no.iloc[i,j] = proll[j,1]

In [162]:
# aggregate likelihoods by taking the product
likelihoods = pd.DataFrame(index = range(nrows), columns= range(k))
for i in range(nrows):
    likelihoods.iloc[i,0] = np.prod(likeli_yes.iloc[i,:])
    likelihoods.iloc[i,1] = np.prod(likeli_no.iloc[i,:])

In [163]:
#Find the posterior probability of each mushroom being edible vs poisonous
postprob = likelihoods.copy()

for j in range(len(likelihoods.columns)):
    postprob.iloc[:,j] = likelihoods.iloc[:,j] * priors[j]
postprob = pd.DataFrame(postprob)
postprob.columns = ["not", "correct"]

In [164]:
# create empty dataframe with columns full of zeroes
new_dnd2 = pd.DataFrame()
new_dnd2["correct_pred"] = np.zeros(nrows)
new_dnd2["not_correct_pred"] = np.zeros(nrows)

# compare to impose classification
for i in range(nrows):
    if postprob["correct"][i] > postprob["not"][i]:
        new_dnd2["correct_pred"][i] = 1
    else:
        new_dnd2["not_correct_pred"][i] = 1

In [165]:
# set up comparisons

travis_data = pd.DataFrame(travis_data)
new_dnd2 = pd.DataFrame(new_dnd2)
TP = 0
FP = 0
TN = 0
FN = 0
for i in range(nrows):
    if int(new_dnd2["correct_pred"][i]) == 0 and travis_data["deter"].iloc[i] == 1:
        TN += 1
    elif int(new_dnd2["correct_pred"][i]) == 1 and travis_data["deter"].iloc[i] == 1:
        FN += 1
    elif int(new_dnd2["correct_pred"][i]) == 1 and travis_data["deter"].iloc[i] == 0:
        TP += 1
    else:
        FP += 1

In [166]:
# build the confusion matrix
confusing_dnd = pd.DataFrame(columns = ["Predicted Travis", "Predicted Roll20"], index = ["Actual Roll20", "Actual Travis"])

confusing_dnd["Predicted Roll20"] = [TN, FN]
confusing_dnd["Predicted Travis"] = [FP, TP]
confusing_dnd

Unnamed: 0,Predicted Travis,Predicted Roll20
Actual Roll20,0,170
Actual Travis,0,0
