In [349]:
import numpy as np
import pandas as pd
import numpy.random as rnd
import seaborn as sns
from matplotlib import animation
import pymc3 as pm
import arviz as az
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [350]:
# Load the simulated (generated) rolls.
# The CSV is read by relative name, so it must sit in the working directory.
file = "2krolls_plus_data.csv"
data = pd.read_csv(filepath_or_buffer=file)

data.head()

Unnamed: 0.1,Unnamed: 0,roll,basis_type,adv,basis,type
0,0,2,strength_check,none,strength,check
1,1,17,ranged_attack,none,ranged,attack
2,2,5,strength_check,none,strength,check
3,3,1,melee_attack,none,melee,attack
4,4,7,melee_attack,none,melee,attack


In [351]:
# Load the second dataset (Travis' recorded rolls), also by relative name.
file2 = "TravisRollsDataset.csv"
data2 = pd.read_csv(filepath_or_buffer=file2)

data2.head()

Unnamed: 0,id,season,episode,roll,mod,basis,type,adv,critical
0,2,3,1,13,0,wisdom,save,none,none
1,3,3,1,17,0,investigation,check,none,none
2,4,3,1,4,2,dexterity,save,none,none
3,6,3,2,16,7,melee,attack,none,none
4,7,3,2,16,7,melee,attack,advantage,none


In [352]:
# Tag every row with its source so the two datasets can be told apart
# after they are merged: 0 = Travis roll, 1 = generated roll.
deter = [0] * data2.shape[0]
data2["deter"] = deter

deter = [1] * data.shape[0]
data["deter"] = deter

In [353]:
# Columns kept for modelling: everything in `data` except the leftover
# CSV index column and the combined basis/type column.
col_names = data.columns.tolist()
for unwanted in ("Unnamed: 0", "basis_type"):
    col_names.remove(unwanted)
col_names

['roll', 'adv', 'basis', 'type', 'deter']

In [354]:
# Restrict both datasets to the shared modelling columns.
merge1 = data.loc[:, col_names]
merge2 = data2.loc[:, col_names]

# Stack them; reset_index() keeps the old row labels in an "index" column,
# which is discarded again by the final column selection below.
total_data_rodeo = pd.concat([merge1, merge2], axis=0).reset_index()

# Shuffle three times with fixed seeds so the row order is reproducible.
rodeo = total_data_rodeo
for seed in (1234567, 57389, 98754):
    rodeo = rodeo.sample(frac=1, random_state=seed).reset_index(drop=True)
rodeo = rodeo[col_names]

In [355]:
# Replace the numeric roll with a two-level "low"/"high" category.
# NOTE(review): pd.cut intervals are right-closed by default, so these edges
# produce (0, 12] and (12, 20]; a roll of exactly 12 is therefore labelled
# "0-11". Left unchanged because the same edges/labels are used again when
# the Travis-only data is binned later in this notebook.
roll_bins = pd.cut(rodeo["roll"], bins=[0, 12, 20], labels=["0-11", "12-20"])
rodeo = rodeo.drop(columns="roll").assign(new_rolls=roll_bins)

In [356]:
# Hold out one third of the rows for testing; the remaining two thirds
# are the training set. The seed fixes the partition.
dnd_train, dnd_test = train_test_split(
    rodeo,
    random_state=2263748,
    test_size=1 / 3,
)

In [357]:
# One-hot encode the training rows; each resulting column is one "trait".
dnd_dum = pd.get_dummies(dnd_train)

# Trait names only: the class label "deter" is not a predictor.
column_list = dnd_dum.columns.drop("deter").tolist()

In [358]:
# Aggregation spec for groupby: sum every trait column.
agg_dict = {column: "sum" for column in column_list}

In [359]:
# Count how often each trait occurs within each class (deter 0 / deter 1).
groupies = dnd_dum.groupby("deter").agg(agg_dict)

# Transpose so rows are traits and columns are classes.
dummies = groupies.transpose()
dummies.head()

deter,0,1
adv_advantage,10.0,109.0
adv_disadvantage,2.0,39.0
adv_none,101.0,1185.0
basis_arcana,1.0,7.0
basis_athletics,5.0,83.0


In [360]:
# Class priors estimated from the training set.
count = dnd_train.groupby("deter").size()
n_travis, n_generated = count.iloc[0], count.iloc[1]
print("There are", n_travis, "Travis rolls and", n_generated, "Roll20 generated rolls in our training set." )

# index 0 -> Travis (deter == 0), index 1 -> generated (deter == 1)
nrolls = np.array([n_travis, n_generated])

# total number of training rolls
total_rolls = sum(nrolls)

# proportion of Travis and generated rolls among all training rolls
priors = nrolls / total_rolls
print("\nThe prior probability of a roll being from Travis is", round(priors[0], 4), "and that of being Roll20 generated is", round(priors[1], 4))


There are 113 Travis rolls and 1333 Roll20 generated rolls in our training set.

The prior probability of a roll being from Travis is 0.0781 and that of being Roll20 generated is 0.9219


In [361]:
# Total trait count per class (column sums of the trait-by-class table).
dum_sums = dummies.sum(axis=0)

# Number of distinct traits (rows of the table).
v = dummies.shape[0]
# Additive smoothing parameter.
α = 1.5

In [362]:
# Smoothed likelihood of each trait given the class:
#   P(trait | class) = (count(trait, class) + α) / (total count in class + v * α)
# `dummies` is traits x classes and `dum_sums` holds the per-class totals, so
# the division broadcasts across columns with the class labels aligned by
# pandas (no reliance on a positional index happening to equal a label).
# The result keeps the trait names as its row index and the class labels as
# its columns; positional access (`proll.iloc[row, class]`) is unchanged.
proll = (dummies + α) / (dum_sums + v * α)
proll.head(5)

Unnamed: 0,0,1
0,0.02359,0.02089
1,0.007179,0.007657
2,0.210256,0.224312
3,0.005128,0.001607
4,0.013333,0.015975


In [363]:
# One-hot encode the test rows, renumber them 0..n-1, and remove the
# class label so only predictor columns remain.
dnd2 = (
    pd.get_dummies(dnd_test)
    .reset_index(drop=True)
    .drop(columns="deter")
)
dnd2.head()

Unnamed: 0,adv_advantage,adv_disadvantage,adv_none,basis_arcana,basis_athletics,basis_charisma,basis_constitution,basis_deception,basis_dexterity,basis_initiative,...,basis_sleight of hand,basis_stealth,basis_strength,basis_thieves’ tools,basis_wisdom,type_attack,type_check,type_save,new_rolls_0-11,new_rolls_12-20
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,1
4,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1


In [364]:
# Per-row, per-trait likelihood factors for the test set, initialised to 1.0
# (the multiplicative identity) so that inactive traits do not affect the
# row product taken later.
# Built directly as float frames: the factors written in later are
# probabilities, and this avoids creating an all-NaN object frame and
# depending on fillna to fill and downcast it.
nrows = len(dnd2.index)
ncols = len(dnd2.columns)
likelihood_travis = pd.DataFrame(1.0, index=range(nrows), columns=range(ncols))

likelihood_rand = pd.DataFrame(1.0, index=range(nrows), columns=range(ncols))

In [365]:
# For every active trait (dummy == 1) use that trait's smoothed likelihood
# under class deter == 0; inactive traits keep a factor of 1.
# NOTE: column 0 of `proll` is the deter == 0 (Travis) class, so despite its
# name `likelihood_rand` holds the Travis-class factors.
#
# Traits are matched to `proll` rows by NAME. `proll` rows follow
# `dummies.index` (the training traits), while the columns of `dnd2` come from
# a separate get_dummies call and are not guaranteed to have the same order
# or the same set of categories. Traits never seen in training get a factor
# of 1, i.e. they are ignored.
trait_rows = dummies.index.get_indexer(dnd2.columns)  # -1 where unseen
seen = trait_rows >= 0
trait_probs = np.ones(len(dnd2.columns))
trait_probs[seen] = proll.iloc[trait_rows[seen], 0].to_numpy(dtype=float)

active = dnd2.to_numpy() == 1
likelihood_rand = pd.DataFrame(
    np.where(active, trait_probs, 1.0),
    index=range(nrows),
    columns=range(ncols),
)

In [366]:
# For every active trait (dummy == 1) use that trait's smoothed likelihood
# under class deter == 1; inactive traits keep a factor of 1.
# NOTE: column 1 of `proll` is the deter == 1 (generated) class, so despite
# its name `likelihood_travis` holds the generated-class factors.
#
# Traits are matched to `proll` rows by NAME. `proll` rows follow
# `dummies.index` (the training traits), while the columns of `dnd2` come from
# a separate get_dummies call and are not guaranteed to have the same order
# or the same set of categories. Traits never seen in training get a factor
# of 1, i.e. they are ignored.
trait_rows = dummies.index.get_indexer(dnd2.columns)  # -1 where unseen
seen = trait_rows >= 0
trait_probs = np.ones(len(dnd2.columns))
trait_probs[seen] = proll.iloc[trait_rows[seen], 1].to_numpy(dtype=float)

active = dnd2.to_numpy() == 1
likelihood_travis = pd.DataFrame(
    np.where(active, trait_probs, 1.0),
    index=range(nrows),
    columns=range(ncols),
)

In [367]:
# Combine the per-trait factors of each row into one likelihood per class by
# taking the row product. Column 0 comes from `likelihood_rand` (built from
# proll column 0) and column 1 from `likelihood_travis` (proll column 1).
# The factors are cast to float first so the result is a numeric frame
# rather than an object-dtype one filled cell by cell.
likelihoods = pd.DataFrame(
    {
        0: likelihood_rand.astype(float).prod(axis=1).to_numpy(),
        1: likelihood_travis.astype(float).prod(axis=1).to_numpy(),
    },
    index=range(nrows),
)

In [368]:
# Unnormalised posterior of each class for every test roll:
# the class likelihood scaled by the matching class prior
# (column j of `likelihoods` pairs with priors[j]).
postprob = likelihoods.mul(priors, axis="columns")
postprob.columns = ["deter_0", "deter_1"]

In [369]:
# Classify each test roll as the class with the larger unnormalised posterior.
# The comparison is strict, so ties (and NaN comparisons) fall to Travis.
# Built in one vectorised step instead of `frame[col][i] = 1`, which is
# chained assignment and does not write through under pandas Copy-on-Write.
generated_wins = (postprob["deter_1"] > postprob["deter_0"]).to_numpy(dtype=bool)
new_dnd = pd.DataFrame(
    {
        "roll_generated_pred": generated_wins.astype(float),
        "roll_Travis_pred": (~generated_wins).astype(float),
    }
)
new_dnd.head()

Unnamed: 0,roll_generated_pred,roll_Travis_pred
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0


In [370]:
# Tally the confusion-matrix cells, treating "generated" (deter == 1)
# as the positive class.
dnd_test = pd.DataFrame(dnd_test)
new_dnd = pd.DataFrame(new_dnd)

predicted = new_dnd["roll_generated_pred"].to_numpy()[:nrows].astype(int)
actual = dnd_test["deter"].to_numpy()[:nrows]

TN = int(np.sum((predicted == 0) & (actual == 0)))
FP = int(np.sum((predicted == 1) & (actual == 0)))
TP = int(np.sum((predicted == 1) & (actual == 1)))
# everything not counted above
FN = nrows - TN - FP - TP

In [371]:
# Confusion matrix: rows are predictions, columns are the true source.
confusing_dnd = pd.DataFrame(
    {
        "Actual Roll20": [TP, FN],
        "Actual Travis": [FP, TN],
    },
    index=["Predicted Roll20", "Predicted Travis"],
)
confusing_dnd

Unnamed: 0,Actual Roll20,Actual Travis
Predicted Roll20,667,57
Predicted Travis,0,0


In [337]:
# Evaluate the previously fitted model on Travis' rolls only.
# Work on a copy: plain assignment would alias `data2`, so the column
# additions below would silently modify the raw frame as well.
travis_data = data2.copy()

# Every row here is a Travis roll, i.e. class 0 (same coding as in training).
travis_data["deter"] = 0
# Bin rolls with the same edges/labels used for the training data.
# NOTE(review): pd.cut intervals are right-closed by default, so a roll of
# exactly 12 is labelled "0-11"; kept as-is to stay consistent with training.
travis_data["new_rolls"] = pd.cut(travis_data.roll, bins=[0, 12, 20], labels=["0-11", "12-20"])
# Drop the raw roll and the columns that were not used as predictors.
travis_data = travis_data.drop(["roll", "id", "season", "episode", "mod", "critical"], axis=1)

In [338]:
# One-hot encode the Travis rows, renumber them 0..n-1, and remove the
# class label so only predictor columns remain.
travis_dum = (
    pd.get_dummies(travis_data)
    .reset_index(drop=True)
    .drop(columns="deter")
)
travis_dum.head()

Unnamed: 0,basis_arcana,basis_athletics,basis_charisma,basis_constitution,basis_deception,basis_dexterity,basis_initiative,basis_insight,basis_intimidation,basis_investigation,...,basis_thieves’ tools,basis_wisdom,type_attack,type_check,type_save,adv_advantage,adv_disadvantage,adv_none,new_rolls_0-11,new_rolls_12-20
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,1,0,1
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,1,0,1
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,1,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,1


In [339]:
# Per-row, per-trait likelihood factors for the Travis-only data, initialised
# to 1.0 (the multiplicative identity) so that inactive traits do not affect
# the row product taken later.
# Built directly as float frames: the factors written in later are
# probabilities, and this avoids creating an all-NaN object frame and
# depending on fillna to fill and downcast it.
nrows = len(travis_dum.index)
ncols = len(travis_dum.columns)
likeli_yes = pd.DataFrame(1.0, index=range(nrows), columns=range(ncols))

likeli_no = pd.DataFrame(1.0, index=range(nrows), columns=range(ncols))

In [340]:
# For every active trait (dummy == 1) use that trait's smoothed likelihood
# under class deter == 0 (Travis, column 0 of `proll`); inactive traits keep
# a factor of 1.
#
# Traits are matched to `proll` rows by NAME. `proll` rows follow
# `dummies.index` (the training traits), whereas `travis_dum` was encoded
# from a frame with a different column order and possibly fewer categories,
# so a positional lookup would pair traits with the wrong probabilities.
# Traits never seen in training get a factor of 1, i.e. they are ignored.
trait_rows = dummies.index.get_indexer(travis_dum.columns)  # -1 where unseen
seen = trait_rows >= 0
trait_probs = np.ones(len(travis_dum.columns))
trait_probs[seen] = proll.iloc[trait_rows[seen], 0].to_numpy(dtype=float)

active = travis_dum.to_numpy() == 1
likeli_yes = pd.DataFrame(
    np.where(active, trait_probs, 1.0),
    index=range(nrows),
    columns=range(ncols),
)

In [341]:
# For every active trait (dummy == 1) use that trait's smoothed likelihood
# under class deter == 1 (generated, column 1 of `proll`); inactive traits
# keep a factor of 1.
#
# Traits are matched to `proll` rows by NAME. `proll` rows follow
# `dummies.index` (the training traits), whereas `travis_dum` was encoded
# from a frame with a different column order and possibly fewer categories,
# so a positional lookup would pair traits with the wrong probabilities.
# Traits never seen in training get a factor of 1, i.e. they are ignored.
trait_rows = dummies.index.get_indexer(travis_dum.columns)  # -1 where unseen
seen = trait_rows >= 0
trait_probs = np.ones(len(travis_dum.columns))
trait_probs[seen] = proll.iloc[trait_rows[seen], 1].to_numpy(dtype=float)

active = travis_dum.to_numpy() == 1
likeli_no = pd.DataFrame(
    np.where(active, trait_probs, 1.0),
    index=range(nrows),
    columns=range(ncols),
)

In [342]:
# Combine the per-trait factors of each row into one likelihood per class by
# taking the row product. Column 0 comes from `likeli_yes` (built from proll
# column 0) and column 1 from `likeli_no` (proll column 1).
# The factors are cast to float first so the result is a numeric frame
# rather than an object-dtype one filled cell by cell.
likelihoods = pd.DataFrame(
    {
        0: likeli_yes.astype(float).prod(axis=1).to_numpy(),
        1: likeli_no.astype(float).prod(axis=1).to_numpy(),
    },
    index=range(nrows),
)

In [343]:
# Unnormalised posterior of each class for every Travis roll:
# the class likelihood scaled by the matching class prior
# (column j of `likelihoods` pairs with priors[j]).
# Column 0 is then named "not" and column 1 "correct".
postprob = likelihoods.mul(priors, axis="columns")
postprob.columns = ["not", "correct"]

In [344]:
# Flag each roll according to which posterior column is larger.
# "correct_pred" is set when the "correct" column (class index 1) strictly
# exceeds the "not" column (class index 0); otherwise "not_correct_pred" is set.
# Built in one vectorised step instead of `frame[col][i] = 1`, which is
# chained assignment and does not write through under pandas Copy-on-Write.
correct_wins = (postprob["correct"] > postprob["not"]).to_numpy(dtype=bool)
new_dnd2 = pd.DataFrame(
    {
        "correct_pred": correct_wins.astype(float),
        "not_correct_pred": (~correct_wins).astype(float),
    }
)

In [347]:
# Tally the confusion-matrix cells for the Travis-only run, with a
# "correct_pred" flag of 1 counted as the positive prediction.
travis_data = pd.DataFrame(travis_data)
new_dnd2 = pd.DataFrame(new_dnd2)

predicted = new_dnd2["correct_pred"].to_numpy()[:nrows].astype(int)
actual = travis_data["deter"].to_numpy()[:nrows]

TN = int(np.sum((predicted == 0) & (actual == 0)))
FP = int(np.sum((predicted == 1) & (actual == 0)))
TP = int(np.sum((predicted == 1) & (actual == 1)))
# everything not counted above
FN = nrows - TN - FP - TP

In [348]:
# Confusion matrix: rows are predictions, columns are the true source.
confusing_dnd = pd.DataFrame(
    {
        "Actual Roll20": [TP, FN],
        "Actual Travis": [FP, TN],
    },
    index=["Predicted Roll20", "Predicted Travis"],
)
confusing_dnd

Unnamed: 0,Actual Roll20,Actual Travis
Predicted Roll20,0,170
Predicted Travis,0,0


In [290]:
# Run the model on the full combined dataset (`rodeo`).
# One-hot encode every row, renumber 0..n-1, and remove the class label
# so only predictor columns remain.
all_dum = (
    pd.get_dummies(rodeo)
    .reset_index(drop=True)
    .drop(columns="deter")
)
all_dum.head()

Unnamed: 0,adv_advantage,adv_disadvantage,adv_none,basis_arcana,basis_athletics,basis_charisma,basis_constitution,basis_deception,basis_dexterity,basis_initiative,...,basis_sleight of hand,basis_stealth,basis_strength,basis_thieves’ tools,basis_wisdom,type_attack,type_check,type_save,new_rolls_0-11,new_rolls_12-20
0,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
1,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0


In [291]:
# Per-row, per-trait likelihood factors for the full dataset, initialised to
# 1.0 (the multiplicative identity) so that inactive traits do not affect the
# row product taken later.
# Built directly as float frames: the factors written in later are
# probabilities, and this avoids creating an all-NaN object frame and
# depending on fillna to fill and downcast it.
nrows = len(all_dum.index)
ncols = len(all_dum.columns)
likeli_rand = pd.DataFrame(1.0, index=range(nrows), columns=range(ncols))

likeli_Trav = pd.DataFrame(1.0, index=range(nrows), columns=range(ncols))

In [292]:
# For every active trait (dummy == 1) use that trait's smoothed likelihood
# under class deter == 0; inactive traits keep a factor of 1.
# NOTE: column 0 of `proll` is the deter == 0 (Travis) class, so despite its
# name `likeli_rand` holds the Travis-class factors.
#
# Traits are matched to `proll` rows by NAME. `proll` rows follow
# `dummies.index` (the training traits), while `all_dum` is encoded from the
# full dataset and may contain categories absent from the training split,
# which would shift column positions. Traits never seen in training get a
# factor of 1, i.e. they are ignored.
trait_rows = dummies.index.get_indexer(all_dum.columns)  # -1 where unseen
seen = trait_rows >= 0
trait_probs = np.ones(len(all_dum.columns))
trait_probs[seen] = proll.iloc[trait_rows[seen], 0].to_numpy(dtype=float)

active = all_dum.to_numpy() == 1
likeli_rand = pd.DataFrame(
    np.where(active, trait_probs, 1.0),
    index=range(nrows),
    columns=range(ncols),
)

In [293]:
# For every active trait (dummy == 1) use that trait's smoothed likelihood
# under class deter == 1; inactive traits keep a factor of 1.
# NOTE: column 1 of `proll` is the deter == 1 (generated) class, so despite
# its name `likeli_Trav` holds the generated-class factors.
#
# Traits are matched to `proll` rows by NAME. `proll` rows follow
# `dummies.index` (the training traits), while `all_dum` is encoded from the
# full dataset and may contain categories absent from the training split,
# which would shift column positions. Traits never seen in training get a
# factor of 1, i.e. they are ignored.
trait_rows = dummies.index.get_indexer(all_dum.columns)  # -1 where unseen
seen = trait_rows >= 0
trait_probs = np.ones(len(all_dum.columns))
trait_probs[seen] = proll.iloc[trait_rows[seen], 1].to_numpy(dtype=float)

active = all_dum.to_numpy() == 1
likeli_Trav = pd.DataFrame(
    np.where(active, trait_probs, 1.0),
    index=range(nrows),
    columns=range(ncols),
)

In [294]:
# Combine the per-trait factors of each row into one likelihood per class by
# taking the row product. Column 0 comes from `likeli_rand` (built from proll
# column 0) and column 1 from `likeli_Trav` (proll column 1).
# The factors are cast to float first so the result is a numeric frame
# rather than an object-dtype one filled cell by cell.
likelihoods = pd.DataFrame(
    {
        0: likeli_rand.astype(float).prod(axis=1).to_numpy(),
        1: likeli_Trav.astype(float).prod(axis=1).to_numpy(),
    },
    index=range(nrows),
)

In [299]:
# Unnormalised posterior of each roll coming from Travis or being generated:
# the class likelihood scaled by the matching class prior
# (column j of `likelihoods` pairs with priors[j]).
postprob = likelihoods.mul(priors, axis="columns")
postprob.columns = ["Travis", "generated"]

In [300]:
# Classify each roll as the class with the larger unnormalised posterior.
# The comparison is strict, so ties (and NaN comparisons) fall to Travis.
# Built in one vectorised step instead of `frame[col][i] = 1`, which is
# chained assignment and does not write through under pandas Copy-on-Write.
generated_wins = (postprob["generated"] > postprob["Travis"]).to_numpy(dtype=bool)
together = pd.DataFrame(
    {
        "gen_pred": generated_wins.astype(float),
        "trav_pred": (~generated_wins).astype(float),
    }
)

In [309]:
# Tally the confusion-matrix cells for the full dataset, treating
# "generated" (deter == 1) as the positive class.
rodeo = pd.DataFrame(rodeo)
together = pd.DataFrame(together)

predicted = together["gen_pred"].to_numpy()[:nrows].astype(int)
actual = rodeo["deter"].to_numpy()[:nrows]

TN = int(np.sum((predicted == 0) & (actual == 0)))
FP = int(np.sum((predicted == 1) & (actual == 0)))
TP = int(np.sum((predicted == 1) & (actual == 1)))
# everything not counted above
FN = nrows - TN - FP - TP

In [310]:
# Confusion matrix: rows are predictions, columns are the true source.
confusing_dnd = pd.DataFrame(
    {
        "Actual Roll20": [TP, FN],
        "Actual Travis": [FP, TN],
    },
    index=["Predicted Roll20", "Predicted Travis"],
)
confusing_dnd

Unnamed: 0,Actual Roll20,Actual Travis
Predicted Roll20,2000,170
Predicted Travis,0,0
