# Step 1 import data

In [1]:
import pandas as pd 
import numpy as np
import random
import re 

In [2]:
#first file, in subfolder "data", use "\t" which does not exist in the data
#to ensure each row becomes a single cell. Name single column "raw" for raw text.
df1 = pd.read_csv(r"Data\TravisRolls_ Petals to the Metal.txt", sep="\t", names=["raw"]) 
df1.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Data\\TravisRolls_ Petals to the Metal.txt'

## Concatenate multiple text inputs to one dataset

In [None]:
df1["season"] = 3 #label the seasons too

#Extract same data from other two text files, 
#representing two more seasons of recorded data.
df1_2 = pd.read_csv(r"Data\TravisRolls_ The Crystal Kingdom.txt", sep="\t", names=["raw"])
df1_2["season"] = 4
df1_3 = pd.read_csv(r"Data\TravisRolls_ The Eleventh Hour.txt", sep="\t", names=["raw"])
df1_3["season"] = 5

#Concatenate them all together into one dataframe
df1_concat = pd.concat([df1, df1_2, df1_3]).reset_index()
print(len(df1_concat.index))
df1_concat.head(100)

# Step 2: extract episode values and fill

In [None]:
#always make a copy to ensure I can work backwards if necessary
df2 = df1_concat.copy()

#Extract where the beginning of the line is "Episode #"
df2["episode"] = df2["raw"].str.extract("^Episode (\d)")
print(df2.head(5))

#The pad method fills subsequent NaNs with the last value that preceeded it
df2["episode"] = df2["episode"].fillna(method='pad')
df2.head(15)

# Step 3 Extract content w Regex

In [None]:
df3 = df2.copy()

#Using regular expressions, designate groups of text to extract for further analysis
extraction_regex = r"^(\d+)\s+(?:\+\s+)?(\(\watural[ A-Za-z,]*?\)|\d{,2})?\s+" \
    + "([A-Za-z\'’]+(?: Tools| of Hand)?)?\s?(\wheck|\wave|\wttack)?(?: ?\(?w\/ )?(\w+dvantage)?"

#Insert regex groups into a list of columns
df3[["roll", "mod", "basis", "type", "adv"]] = df3["raw"].str.extract(extraction_regex)

#Separately extract if any row mentions "joke" since those should be removed
df3["joke"] = df3["raw"].str.extract("([jJ]oke)")

df3.head(100)

# Step 4 Extract Natural 1s or Natural 20s which have special rules

In [None]:
df3_2 = df3.dropna(subset=['mod'])

nat_test = df3_2[df3_2['mod'].str.contains("atur")]
nat_test

In [None]:
df4 = df3_2.copy()
df4['nat'] = df4['mod'].str.contains("atur")
df4['mod'] = df4.apply(lambda x: 0 if x['nat']==1 else x['mod'], axis=1)
df4.loc[list(nat_test.index)]

# Step 4: Condense to datapoints

In [None]:
df5 = df4[["season", "episode", "roll", "mod", "basis", "type", "adv","joke", "nat"]].copy()
df5 = df5[(df5['roll'].notnull()) & (pd.isna(df5['joke']))]
df5 = df5.drop(columns=["joke"])
advdict = {"dis": "disadvantage", "adv":"advantage"}
df5['adv'] = df5['adv'].replace(advdict).str.lower()
def check_adv(x):
    x = str(x)
    if "dis" in x.lower():
        return -1
    elif "adv" in x.lower():
        return 1
    else:
        return 0
df5['adv_num'] = df5['adv'].apply(check_adv)
df5['basis'] = df5['basis'].str.lower()
df5['type'] = df5['type'].str.lower()

#add base stats from D&D character sheet
stat_dict = {'arcana': 'INT', 'athletics': 'STR', 
             'charisma': 'CHA', 'constitution': 'CON', 
             'deception': 'CHA', 'dexterity': 'DEX', 
             'initiative': 'INIT', 'insight': 'WIS', 
             'intimidation': 'CHA', 'investigation': 'INT', 
             'medicine': 'WIS', 'melee': 'ATTACK', 
             'nature': 'INT', 'perception': 'WIS', 
             'persuasion': 'CHA', 'ranged': 'ATTACK', 
             'sleight of hand': 'DEX', 'stealth': 'DEX', 
             'strength': 'STR', 'wisdom': 'WIS'}

df5['stat'] = df5.basis.replace(stat_dict)

df5.head(20)

# Step 5: Export usable dataset

In [None]:
df5.to_csv("TravisRollsDataset.csv")

# Step 6: Generate Random Null Data Set 

Use the same categorical proportions as the source data for basis, type, and advantage. 

Lastly use the randomly proportional instance of advantage to compare to an additional random d20 roll. 

In [8]:
from pprint import pprint

df = pd.read_csv(r"TravisRollsDataset.csv")

# set seed for reproducability
random.seed(15678)
# generate list of sides
dice_sides = list(range(1,21))
# randomly choose sides
randsamp = list(random.choices(dice_sides, k=2000))
df2k = pd.DataFrame(randsamp, columns=["roll"])
df2k.index.name = 'id'
df2k = df2k.reset_index() 

df["basis_type"] = df["basis"].astype(str) + "_" + df["type"].astype(str)

df.head()

Unnamed: 0.1,Unnamed: 0,season,episode,roll,mod,basis,type,adv,nat,adv_num,stat,basis_type
0,2,3,1,13,0,wisdom,save,,False,0,WIS,wisdom_save
1,3,3,1,17,0,investigation,check,,False,0,INT,investigation_check
2,4,3,1,4,2,dexterity,save,,False,0,DEX,dexterity_save
3,6,3,2,16,7,melee,attack,,False,0,ATTACK,melee_attack
4,7,3,2,16,7,melee,attack,advantage,False,1,ATTACK,melee_attack


In [9]:
def randassign(x, givdict):
    r = random.random()
    for k in givdict.keys():
        if r <= k:
            return givdict[k]
    return pd.NaN

for c in ["basis_type", "adv", "stat"]:
    tmpdf = pd.DataFrame(df[c].value_counts()).reset_index()
    tmpdf['pct'] = tmpdf[c] / sum(tmpdf[c])
    cum1cumall = dict(tmpdf[["pct","index"]].values)
#     pprint(cum1cumall)
#     print("\n")
    tmpdf['cumsum'] = tmpdf.pct.cumsum()
    cumkeys = dict(tmpdf[["cumsum","index"]].values)
#     pprint(cumkeys)
#     print("\n")
    df2k[c] = df2k.roll.apply(randassign, args=(cumkeys,))

df2k[['basis','type']] = df2k.basis_type.str.split("_",expand=True) 

def adv_roll(x):
    r = random.randint(1,20)
    if x['adv'] == "advantage":
        return max(x['roll'], r)
    if x['adv'] == "disadvantage":
        return min(x['roll'], r)
    return x['roll']

df2k['roll'] = df2k.apply(adv_roll, axis=1)

df2k = df2k.drop(['basis_type'], axis=1)

#df2k

df2k.iloc[:, 1:]

Unnamed: 0,roll,adv,stat,basis,type
0,8,disadvantage,ATTACK,melee,attack
1,14,disadvantage,INT,dexterity,check
2,16,advantage,CON,ranged,attack
3,10,advantage,DEX,constitution,save
4,9,disadvantage,ATTACK,initiative,
...,...,...,...,...,...
1995,16,advantage,ATTACK,melee,attack
1996,13,disadvantage,ATTACK,thieves’ tools,check
1997,2,disadvantage,ATTACK,sleight of hand,
1998,3,disadvantage,STR,melee,attack


In [None]:
df2k.to_csv(r"2krolls_plus_data.csv", index=False)