# SubScript Database Builder
This notebook describes the pipeline for building the World of Warcraft Achievement Dataset. Each step in the pipeline was performed in batches across four machines using standalone scripts in multiple terminals per machine.

## Defaults, Dependencies,  Definitions

In [1]:
import configparser as cp
import datetime
import os

import numpy as np
import pandas as pd

import config
import custom_funcs as cf

## Get top 500 leaderboards for major achievement categories

In [2]:
# From dataforazeroth.com (copied directly from reports, no scraping)
# NOTE(review): absolute, machine-specific path. The recorded run of this cell stopped with
# FileNotFoundError on it; point it at the local copy of the reports before running.
dir_dataset = '/Users/haleyspeed/Docs/insight/datasets/dataforazeroth_datasets'
dataforazeroth_set = cf.dataforazeroth(dir_dataset)
cf.xlsx_to_csv(dir_dataset)


# From wowprogress.com (webscraped with BeautifulSoup)
locale = 'en_US'
namespace = 'static-us'
dir_save = 'wowprogress_reports'
base_url = 'https://www.wowprogress.com/export/ranks/'

# The helpers below are called without an output folder, so the working directory is
# switched to dir_save first (presumably they read/write relative to it; verify in custom_funcs).
# Create the folder on a fresh checkout instead of failing in chdir.
os.makedirs(dir_save, exist_ok=True)
os.chdir(dir_save)

cf.get_wowprogress_by_realm(locale, namespace, base_url)
wow_guilds = cf.unpack_wowprogress_guild_ranks()
wow_players = cf.get_wow_guild_rosters('wow_guild_rankings.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/Users/haleyspeed/Docs/insight/datasets/dataforazeroth_datasets'

## Build the raw achievement dataset
raw_player_api_caller.py

In [3]:
# Read the Blizzard API credentials from api/config.ini (one level above config.home_dir)
# so that no key or secret is written into the notebook itself.
f_config = os.path.join(config.home_dir, os.pardir, 'api', 'config.ini')
conf = cp.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of files it
# parsed; check it so a missing file is reported as such rather than as a NoSectionError.
if not conf.read(f_config):
    raise FileNotFoundError('Blizzard API config file not found: ' + f_config)
blizzard_key = conf.get('KEYS', 'blizzard')
blizzard_secret = conf.get('KEYS', 'blizzard_secret')
locale = 'en_US'
namespace = 'static-us'
# Token passed to every cf.get_* API helper in the following cell
access_token = cf.get_access_token(blizzard_key, blizzard_secret)

In [4]:
print('starting up...')

# Date the time-since-login column is measured against
SNAPSHOT_DATE = datetime.date(2020, 6, 5)
# A checkpoint file is written every time the running roster counter hits a multiple of this
BATCH_SIZE = 100

final_cols = ['faction', 'guild_name', 'guild_rank', 'id', 'level', 'playable_class',
              'playable_race', 'player', 'realm', 'realm_id', 'total_achievements',
              'total_achievement_points', 'mounts_collected', 'pets_collected', 'completed_quests',
              'honor_level']
achievement_list = pd.read_csv(os.path.join(config.raw_dir, 'wow_achievements.csv'))
achievement_list.columns = ['unnamed0', 'unnamed1', 'player', 'guild', 'realm', 'id']
achievement_list = achievement_list.drop(['unnamed0', 'unnamed1'], axis=1)
# One extra column per achievement id, named by the id written as an integer string
final_cols.extend(str(int(achievement_id)) for achievement_id in achievement_list.id)
# Template row: every expected column present, value None
empty_row = dict.fromkeys(final_cols)


def _save_batch(records, roster_file, group_num, row_number):
    """Write the collected player rows to a checkpoint CSV and return a fresh, empty batch.

    records     -- list of per-player dicts gathered since the previous checkpoint
    roster_file -- roster file name; the text before 'roster' becomes the output prefix
    group_num   -- roster group number, embedded in the output file name
    row_number  -- running roster counter, embedded in the output file name

    Nothing is written when the batch is empty, so no header-less CSVs end up in
    config.processed_dir.
    """
    if records:
        f_name = roster_file.split('roster')[0] + 'achievement_dates_' + str(group_num) + '_' + str(row_number) + '.csv'
        pd.DataFrame(records).to_csv(os.path.join(config.processed_dir, f_name))
        print(f_name + ' saved')
    return []


i = 100001
for group_num in np.arange(100, 1000, 100):
    print('Group Number: ' + str(group_num))
    f = 'wow_roster' + str(group_num) + '.csv'
    player_roster = pd.read_csv(os.path.join(config.home_dir, '../', 'api', f))
    # Rows are collected in a list and turned into a DataFrame once per checkpoint;
    # DataFrame.append was quadratic and no longer exists in pandas 2.x.
    records = []
    for m in player_roster.itertuples():
        if m.level == 120:
            print(i, end=' ')
            player = m.player.lower()
            realm = m.realm
            # Hand the helper its own copy of the template so that, if it fills the dict in
            # place, one player's values cannot carry over to the next player.
            row = cf.get_player_achievements(player, realm, dict(empty_row), access_token)
            # The helper's result is only used when it is not a string
            # (presumably a string signals a failed lookup; verify in custom_funcs).
            if not isinstance(row, str):
                row['player'] = player
                row['realm'] = realm
                row['level'] = m.level
                row['playable_class'] = m.playable_class
                row['faction'] = m.faction
                row['guild_name'] = m.guild_name
                # NOTE(review): guild_rank is filled from the roster's `id` column; confirm
                # that this is intended and not a roster rank column.
                row['guild_rank'] = m.id
                row['playable_race'] = m.playable_race
                row['realm_id'] = m.realm_id
                row['id'] = player + '_' + realm
                row['mounts_collected'] = cf.get_wow_mounts(player, realm, access_token)
                row['pets_collected'] = cf.get_wow_pets(player, realm, access_token)
                row['completed_quests'] = cf.get_wow_quests(player, realm, access_token)
                row['honor_level'] = cf.get_wow_honor(player, realm, access_token)
                last_login, gear_score = cf.get_validation(player, realm, access_token)
                row['gear_score'] = gear_score
                row['last_login'] = last_login
                try:
                    login_date = datetime.datetime.strptime(last_login, '%Y-%m-%d').date()
                    row['time_since_login'] = SNAPSHOT_DATE - login_date
                    row['engagement_score'] = (gear_score + row['total_achievements']) / row['time_since_login'].days
                except (TypeError, ValueError, ZeroDivisionError) as err:
                    # Best effort: missing/unparseable values or a login on the snapshot day
                    # leave the derived columns empty, but the player row is still kept.
                    print("error in " + row['id'] + ': ' + repr(err))
                records.append(dict(row))
        if i % BATCH_SIZE == 0:
            records = _save_batch(records, f, group_num, i)
        i = i + 1
    # Flush whatever was collected after the last checkpoint; previously these rows were
    # thrown away when the next roster group started with an empty frame.
    records = _save_batch(records, f, group_num, i - 1)

starting up...
Index(['player', 'guild', 'realm', 'id'], dtype='object')
sifting through players...
Here's One: Hoyochan
1 Here's One: Nightress
2 {'faction': 'Alliance', 'guild_name': 'formality', 'guild_rank': 37190514.0, 'id': 'nightress_proudmoore', 'level': 120.0, 'playable_class': 11.0, 'playable_race': 4.0, 'player': 'nightress', 'realm': 'proudmoore', 'realm_id': 5.0, 'total_achievements': 3659, 'total_achievement_points': 31155, 'mounts_collected': 586, 'pets_collected': 1328, 'completed_quests': 9700, 'honor_level': 113, '33585': None, '33545': None, '33485': None, '33475': None, '33465': None, '33445': None, '33415': None, '33410': None, '33405': None, '33365': None, '33360': None, '33350': None, '33345': None, '33335': None, '33325': None, '33305': None, '33285': None, '33280': None, '33255': None, '33240': None, '33230': None, '33205': None, '33185': None, '33150': None, '33145': None, '33130': None, '33110': None, '33095': None, '33090': None, '33085': None, '33080': None

3 Here's One: Decae
4 {'faction': 'Alliance', 'guild_name': 'formality', 'guild_rank': 54517736.0, 'id': 'decae_proudmoore', 'level': 120.0, 'playable_class': 6.0, 'playable_race': 11.0, 'player': 'decae', 'realm': 'proudmoore', 'realm_id': 5.0, 'total_achievements': 3658, 'total_achievement_points': 31145, 'mounts_collected': 586, 'pets_collected': 1328, 'completed_quests': 2780, 'honor_level': 112, '33585': None, '33545': None, '33485': None, '33475': None, '33465': None, '33445': None, '33415': None, '33410': None, '33405': None, '33365': None, '33360': None, '33350': None, '33345': None, '33335': None, '33325': None, '33305': None, '33285': None, '33280': None, '33255': None, '33240': None, '33230': None, '33205': None, '33185': None, '33150': None, '33145': None, '33130': None, '33110': None, '33095': None, '33090': None, '33085': None, '33080': None, '33055': None, '33035': None, '33025': None, '33020': None, '33015': None, '33010': None, '32985': None, '32980': None, '32970': No

5 {'faction': 'Alliance', 'guild_name': 'formality', 'guild_rank': 66889648.0, 'id': 'arcanea_proudmoore', 'level': 120.0, 'playable_class': 8.0, 'playable_race': 22.0, 'player': 'arcanea', 'realm': 'proudmoore', 'realm_id': 5.0, 'total_achievements': 3659, 'total_achievement_points': 31155, 'mounts_collected': 586, 'pets_collected': 1328, 'completed_quests': 7400, 'honor_level': 113, '33585': None, '33545': None, '33485': None, '33475': None, '33465': None, '33445': None, '33415': None, '33410': None, '33405': None, '33365': None, '33360': None, '33350': None, '33345': None, '33335': None, '33325': None, '33305': None, '33285': None, '33280': None, '33255': None, '33240': None, '33230': None, '33205': None, '33185': None, '33150': None, '33145': None, '33130': None, '33110': None, '33095': None, '33090': None, '33085': None, '33080': None, '33055': None, '33035': None, '33025': None, '33020': None, '33015': None, '33010': None, '32985': None, '32980': None, '32970': None, '32965': Non

KeyboardInterrupt: 

## Process raw player stats dataset
process_raw_player_stats.py

In [None]:
import pandas as pd
import os
import numpy as np
import config as cn
import custom_funcs as cf
import glob


# Setup IO
f_cat = os.path.join(cn.clean_dir, 'achievement_details_list.csv')
folder = cn.processed_dir

# Read in the list of categories with achievements
dfc = pd.read_csv(f_cat)

# Define output file columns
player_cols = ['faction', 'guild_name', 'guild_rank', 'id', 'playable_class',
               'playable_race', 'player', 'realm', 'realm_id', 'total_achievements',
               'total_achievement_points', 'mounts_collected', 'pets_collected', 'completed_quests',
               'honor_level', 'gear_score', 'last_login', 'time_since_login', 'engagement_score']
categories = [name.lower() for name in np.unique(dfc.category_name)]
months = np.arange(1, 13)
years = [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
# Last (year, month) that gets a column. The earlier check only skipped 2020-07, so
# 2020-08 .. 2020-12 columns were still generated.
last_timepoint = (2020, 6)

# 'YYYY-MM' column labels, grouped by month and then by year (same order as before).
# Years before 2011 only get their January column.
timepoints = []
for month in months:
    month = int(month)
    for year in years:
        if year < 2011 and month != 1:
            continue
        if (year, month) > last_timepoint:
            continue
        timepoints.append('{}-{:02d}'.format(year, month))

output_cols = player_cols + timepoints + categories
timepoint_set = set(timepoints)

# achievement id (as string) -> category name, built once instead of filtering dfc for every
# achievement of every player; the first entry wins when an id is listed more than once.
category_lookup = (
    dfc.assign(achievement_id=dfc.achievement_id.astype(str))
       .drop_duplicates(subset='achievement_id')
       .set_index('achievement_id')['category_name']
)

os.chdir(folder)
for f in glob.glob('*{}'.format('csv')):

    print(f)

    # Output path is fixed per input file. Computing it here (not inside the row loop) means an
    # input file without rows can no longer reuse, and overwrite, the previous file's output.
    f_out = os.path.join(cn.clean_dir, f.replace('raw', 'processed'))

    # Read in raw player achievement stats
    dfr = pd.read_csv(f)
    achievement_cols = [col for col in dfr.columns.values if col not in player_cols]

    # The first non-player column is skipped, as before
    # (presumably the index column written by to_csv; TODO confirm).
    tracked_cols = achievement_cols[1:]
    # Category of each tracked column; NaN when the column is not a known achievement id
    tracked_categories = pd.Series(tracked_cols, dtype=object).map(category_lookup).tolist()

    # Build the processed_player_stats.csv dataset
    records = []
    for i, (_, row) in enumerate(dfr.iterrows(), start=1):

        # One line per tracked column: achievement id, completion month ('YYYY-MM'), category
        earned = pd.DataFrame({
            'achievement': tracked_cols,
            'date': [str(d)[0:7] for d in row[tracked_cols]],
            'category': tracked_categories,
        })

        # Add player data to the output row
        record = {col: row[col] for col in player_cols}

        # Add achievements per month to the output row (only months that have a column)
        month_counts = earned.groupby('date')['achievement'].count()
        for month_label, count in month_counts.items():
            if month_label in timepoint_set:
                record[month_label] = int(count)

        # Add the list of completion months per category to the output row
        dates_by_category = earned.groupby('category')['date'].apply(list)
        for category_name, dates in dates_by_category.items():
            record[category_name.lower()] = dates

        # Every column that received no value defaults to 0
        for col in output_cols:
            record.setdefault(col, 0)

        records.append(record)
        print(i)

    # Build the frame once per file; row-by-row DataFrame.append was quadratic and is gone
    # from pandas 2.x.
    dfo = pd.DataFrame(records, columns=output_cols)
    dfo.to_csv(f_out)

## Concatenate player stats files
file_concatenator.py

In [2]:
import os
import custom_funcs as cf
import csv
import config
import numpy as np

# Input and output folders come from the project config.
# NOTE(review): the input is config.processed_dir; confirm that this is where the
# per-batch files to be merged actually live.
dir_in = os.fspath(config.processed_dir)
dir_out = config.clean_dir

# Input pattern template (not referenced again in this cell) and the merged output path
file_in = os.path.join(dir_in, '*{}')
file_out = os.path.join(dir_out, 'clean_player_stats.csv')

# Presumably merges every CSV found in dir_in into one DataFrame (see custom_funcs)
df = cf.csv_concatenator(dir_in)
df.to_csv(file_out)

## Get achievement patch number and release date
Needed to tell whether the player completed the achievement retroactively or when the content was current

In [3]:
import os
import config
import pandas as pd
path_in = os.path.join(config.clean_dir, 'achievement_details_list.csv')
df = pd.read_csv(path_in)
df['patch'] = ''
df['release_date'] = ''
df['attained_by'] = ''

# The release date is looked up from the patch alone, so each distinct patch is scraped
# once and reused (assumes cf.patch_date_scraper returns the same date for the same patch).
patch_release_dates = {}
for index, row in df.iterrows():
    # Progress indicator: achievement ids on one line
    print(row.achievement_id, end=' ')
    patch, attained_by = cf.achievement_patch_scraper(row.achievement_id)
    if patch not in patch_release_dates:
        patch_release_dates[patch] = cf.patch_date_scraper(patch)
    df.at[index, 'patch'] = patch
    df.at[index, 'attained_by'] = attained_by
    df.at[index, 'release_date'] = patch_release_dates[patch]



10 10000 10001 10010 10011 10012 10013 10015 10016 10017 10018 10019 10020 10021 10023 10024 10025 10026 10027 10030 10032 10033 

KeyboardInterrupt: 

In [None]:
# Write df next to the input file, under the same name with a '_timeseries' suffix
timeseries_path = path_in.replace('.csv', '_timeseries.csv')
df.to_csv(timeseries_path)

In [None]:
# Number of distinct release dates, missing values counted as one. nunique() does not sort,
# so it also works when the column mixes strings and NaN, and it needs no numpy import.
print(df.release_date.nunique(dropna=False))

In [None]:
import datetime

# Work on a separate, NaN-free view: assigning dropna() back onto the column re-aligns on
# the index and therefore removes nothing.
release_dates = df.release_date.dropna()

# Preview: parse the first two release dates
for i, release_date in enumerate(release_dates.iloc[:2]):
    print(i, release_date)
    # Only strings containing a '2' are parsed (presumably a quick "has a year" check);
    # anything else is just printed above.
    if isinstance(release_date, str) and '2' in release_date:
        # Parse this value; the previous code passed str(df.release_date), i.e. the text
        # dump of the whole column, which can never match the format.
        print(datetime.datetime.strptime(release_date.strip().replace(',', ''), '%B %d %Y'))

# Rows per release date.
# NOTE(review): release_date is still text here, so the order is alphabetical, not chronological.
df_count = df.groupby('release_date').count().sort_values('release_date')
#df_count