# SubScript Database Builder
This notebook describes the pipeline for building the World of Warcraft Achievement Dataset. Each transformation in the pipeline was performed in batches across four machines using standalone scripts in multiple terminals per machine due to the large sample and feature sizes. 

## Defaults, Dependencies,  Definitions

In [42]:
import configparser as cp
import datetime
import glob
import json
import os

import mysql.connector
import numpy as np
import pandas as pd
import requests
from mysql.connector import errorcode

import subscript.config as cn
import subscript.custom_funcs as cf

## Get top 500 leaderboards for major achievement categories

#### From dataforazeroth.com (copied directly from report tables, no scraping necessary)

In [2]:
# Folder (under the raw data directory) holding the tables copied from dataforazeroth.com.
dir_dataset = os.path.join(cn.raw_dir, 'dataforazeroth')
# Both calls are project helpers from subscript.custom_funcs and operate on that folder.
# NOTE(review): presumably the first loads the leaderboard tables and the second
# converts the .xlsx reports to .csv -- confirm against custom_funcs.
dataforazeroth_set = cf.dataforazeroth (dir_dataset)
cf.xlsx_to_csv (dir_dataset)

#### Get Blizzard API access token

In [3]:
# Request parameters reused by the Blizzard API calls further down.
locale = 'en_US'
namespace = 'static-us'

# The client id/secret are read from the [KEYS] section of api/config.ini,
# located one level above cn.home_dir.
f_config = os.path.join(cn.home_dir, '../', 'api', 'config.ini')
conf = cp.ConfigParser()
conf.read(f_config)
blizzard_key, blizzard_secret = (
    conf.get('KEYS', option) for option in ('blizzard', 'blizzard_secret')
)

# Exchange the credentials for the token used by every later API request.
access_token = cf.get_access_token(blizzard_key, blizzard_secret)

#### Quick method of processing json files retrieved by APIs

In [40]:
def unpack_json(txt):
    """Parse the JSON text of an API response into Python dicts and lists."""
    return json.loads(txt)

#### From wowprogress.com (webscraped with BeautifulSoup)

In [4]:
# Same locale/namespace values as in the access-token cell, repeated so this
# cell can be run on its own once access_token exists.
locale = 'en_US'
namespace = 'static-us'
# NOTE(review): dir_save is not referenced again in this cell -- confirm it is still needed.
dir_save = 'wowprogress_reports'
base_url = 'https://www.wowprogress.com/export/ranks/'
# Changes the working directory for the whole kernel: every relative path used
# after this point (including by the helpers below) resolves against this folder.
os.chdir(os.path.join(cn.raw_dir, 'wowprogress_reports'))

# Project helpers from subscript.custom_funcs. The captured output shows the first
# call downloading per-realm "<region>_<realm>_tierNN.json.gz" files.
# NOTE(review): the later two calls presumably parse those downloads into guild
# rankings and then expand them into player rosters -- confirm in custom_funcs.
cf.get_wowprogress_by_realm(locale, namespace, base_url, access_token)
wow_guilds = cf.unpack_wowprogress_guild_ranks()
wow_players = cf.get_wow_guild_rosters('wow_guild_rankings.csv')

Downloading: us_aegwynn_tier23.json.gz
Downloading: us_aegwynn_tier24.json.gz
Downloading: us_aegwynn_tier25.json.gz
Downloading: us_aegwynn_tier26.json.gz
Downloading: us_aerie-peak_tier23.json.gz


KeyboardInterrupt: 

## Get player profiles from leaderboards

#### Function to retrieve player details from the leaderboard rosters

In [49]:
def get_wow_profile (realm, player, token):
    """Fetch one character's profile summary from the Blizzard API.

    Parameters
    ----------
    realm : str
        Realm slug used in the request URL.
    player : str
        Character name used in the request URL.
    token : str
        Blizzard API access token. This argument is what gets sent with the
        request (the notebook-level ``access_token`` is no longer read here).

    Returns
    -------
    dict
        Flat record with the keys id, name, gender, faction, race,
        character_class, active_spec, realm, guild, level,
        achievement_points, last_login, average_item_level and
        equipped_item_level. ``guild`` is None when the response carries no
        guild entry.

    Raises
    ------
    KeyError
        If any other expected field is missing from the response (for
        example when the API returned an error payload).
    """
    url = 'https://us.api.blizzard.com/profile/wow/character/' + realm \
          + '/' + player + '?namespace=profile-us&locale=en_US&access_token=' + token
    r = requests.get(url)
    unpacked = unpack_json(r.text)
    # NOTE(review): the response appears to omit 'guild' for characters without
    # a guild -- treat it as optional instead of failing the whole record.
    guild = unpacked.get('guild') or {}
    row = dict(id = unpacked['id'], name = unpacked['name'], gender = unpacked['gender']['name'],
          faction = unpacked['faction']['name'], race = unpacked['race']['name'],
          character_class = unpacked['character_class']['name'],
          active_spec = unpacked['active_spec']['name'], realm = unpacked['realm']['slug'],
          guild = guild.get('name'), level = unpacked['level'],
          achievement_points = unpacked['achievement_points'],
          last_login = unpacked['last_login_timestamp'],
          average_item_level = unpacked['average_item_level'],
          equipped_item_level = unpacked['equipped_item_level'])
    return row


## Get guild rosters from leaderboards

#### Define function to get friends of leaderboards to expand roster of players 

In [47]:
def get_guild_roster (realm, guild, access_token):
    """Fetch a guild roster from the Blizzard API as a DataFrame.

    Parameters
    ----------
    realm : str
        Realm slug of the guild.
    guild : str
        Guild slug; also stored in the ``guild_name`` column of every row.
    access_token : str
        Blizzard API access token.

    Returns
    -------
    pandas.DataFrame
        One row per roster member with the columns player, id, realm,
        realm_id, level, playable_class, playable_race, guild_rank,
        guild_name and faction. The frame is empty when the request does not
        return HTTP 200 or the body cannot be parsed; if the payload is
        malformed part-way through, the members read so far are returned.
    """
    url = 'https://us.api.blizzard.com/data/wow/guild/'+ realm \
              + '/' + guild + \
            '/roster?namespace=profile-us&locale=en_US&access_token='\
            + access_token
    r = requests.get(url)
    # Check the status before parsing: error responses are not guaranteed to be JSON.
    if r.status_code != 200:
        return pd.DataFrame()

    # Collect plain records and build the frame once; appending to a DataFrame
    # row by row copies the whole frame each time and DataFrame.append no
    # longer exists in pandas 2.x.
    rows = []
    try:
        unpacked = unpack_json(r.text)
        guild_faction = unpacked['guild']['faction']['name']
        for member in unpacked['members']:
            character = member['character']
            rows.append(dict(player = character['name'],
                id = character['id'],
                realm = character['realm']['slug'],
                realm_id = character['realm']['id'],
                level = character['level'],
                playable_class = character['playable_class']['id'],
                playable_race = character['playable_race']['id'],
                guild_rank = member['rank'],
                guild_name = guild, faction = guild_faction))
    except (KeyError, TypeError, ValueError) as err:
        # Best effort, as before, but say what went wrong instead of hiding it.
        print('Incomplete roster for ' + guild + ' (' + realm + '): ' + repr(err))
    return pd.DataFrame(rows)

#### Set up IO

In [46]:
# Accumulators filled in by the roster-collection loop.
roster = pd.DataFrame()
realms, guilds = [], []

# Leaderboard players; player_id starts out blank for every row.
dfa_set = pd.read_csv(
    os.path.join(cn.raw_dir, 'dataforazeroth_complete_dataset.csv')
)
dfa_set['player_id'] = ''

#### Blizzard API calls to retrieve guild members of leaderboard players

In [48]:
DEMO_RUN = True      # Stop after the first guild request. Set to False for the full run.
ROSTER_BATCH = 100   # Number of guild rosters written to each wow_roster<N>.csv file.

# (realm, guild) pairs already requested. Slugs are compared with slugs, and
# as pairs, so the same guild name on two realms is still fetched twice.
seen_guilds = set(zip(realms, guilds))
batch_frames = []    # Rosters fetched since the last dump.
n_fetched = 0        # Number of guild rosters requested so far.

print(os.getcwd())
for row in dfa_set.itertuples():
    # Missing values are read by pandas as float NaN; skip rows that cannot be slugged.
    if not isinstance(row.realm, str) or not isinstance(row.player, str):
        continue
    realm_slug = row.realm.replace('US-', '').replace("'",'').replace(' ', '-').lower()
    player_slug = row.player.lower()
    # Write to this row's own index label, not to the guild counter.
    dfa_set.at[row.Index, 'player_id'] = player_slug + '-' + realm_slug

    if not isinstance(row.guild, str):
        continue
    guild_slug = row.guild.replace(' ','-').lower()
    if (realm_slug, guild_slug) in seen_guilds:
        continue
    seen_guilds.add((realm_slug, guild_slug))
    realms.append(realm_slug)
    guilds.append(guild_slug)

    n_fetched = n_fetched + 1
    print(n_fetched, end = ' ')
    try:
        batch_frames.append(get_guild_roster (realm_slug, guild_slug, access_token))
    except (requests.exceptions.RequestException, ValueError) as err:
        # Keep going on network/parse failures, but leave a trace of them.
        print('\nroster request failed for ' + guild_slug + ' (' + realm_slug + '): ' + repr(err))

    if n_fetched % ROSTER_BATCH == 0 and batch_frames: # Every 100 guilds, dump the data into a file
        pd.concat(batch_frames, ignore_index = True).to_csv('wow_roster' + str(n_fetched) + '.csv')
        batch_frames = []  # clear memory by starting a new batch
    if DEMO_RUN:
        break

# Whatever was fetched after the last dump stays available as `roster`; on a
# full run it is also written out so the final partial batch is not lost.
roster = pd.concat(batch_frames, ignore_index = True) if batch_frames else pd.DataFrame()
if not DEMO_RUN and not roster.empty:
    roster.to_csv('wow_roster' + str(n_fetched) + '.csv')

/Users/haleyspeed/Docs/insight/insight_data_science/data/processed/scrapes/processed/features/engaged
1 

## Get a list of all the US-based World of Warcraft Servers and IDs 

In [43]:
def get_wow_realms_list (namespace, locale):
    """Fetch the US realm index and return three parallel lists.

    ``namespace`` is accepted but not used: the request is always sent with
    ``namespace=dynamic-us``. The token comes from the notebook-level
    ``access_token`` variable.

    Returns (realm_names, realm_ids, realm_slugs), all in the order the API
    lists the realms.
    """
    endpoint = 'data/wow/realm/index'
    query = '?namespace=dynamic-us' + '&locale=' + locale \
            + '&access_token=' + access_token
    response = requests.get('https://us.api.blizzard.com/' + endpoint + query)
    realm_entries = unpack_json(response.text)['realms']

    realm_names = [entry['name'] for entry in realm_entries]
    realm_ids = [entry['id'] for entry in realm_entries]
    realm_slugs = [entry['slug'] for entry in realm_entries]
    return realm_names, realm_ids, realm_slugs

# Fetch the realm index once. The three lists are parallel: position k in each
# one describes the same realm (name, numeric id, URL slug).
realm_names,realm_ids,slugs = get_wow_realms_list (namespace, locale)

## Get achievement list from the blizzard API 

For demo purposes only. Originally run in the terminal from file **/scripts/achievement_list_maker.py**

In [11]:
# Collect the kept achievements as records and build the frame once
# (DataFrame.append was removed in pandas 2.0 and copies the frame on every call).
character_achievements = []
# `achievement_id` rather than `id`: a notebook-level loop variable named `id`
# would shadow the builtin for every later cell.
for achievement_id in cf.get_wow_achievement_ids (access_token):
    ach = cf.get_wow_achievement(achievement_id, access_token)
    # Filters out account-wide achievements and
    # retains only character achievements.
    # The explicit comparison is kept on purpose: a missing/None flag must not
    # be treated as "character achievement".
    if ach['account_wide'] == False:
        character_achievements.append(ach)
        print (ach['achievement_id'])
    break # For demo only
dfa = pd.DataFrame(character_achievements)
dfa.to_csv(os.path.join(cn.clean_dir, 'achievement_list.csv'))

# If the achievement is part of a chain, it will also have the
# keys 'criteria' and 'name'.

200
unpacked['criteria']['name'] does not exist
6


## Build the raw achievement dataset

This section is for demonstration purposes only. 
Actual processing was performed in the terminal in batches with **scripts/achievement_api_caller.py**

#### Read in wow achievement list (if already exists)

In [7]:
print('starting up...')

# Identifier columns that lead every row of the player achievement dataset.
final_cols = ['id','player', 'realm']

# Relabel all six columns of the CSV, then discard the first two of them.
achievement_list = pd.read_csv(
    os.path.join(cn.raw_dir, 'wow_achievements.csv')
)
achievement_list.columns = ['unnamed0', 'unnamed1', 'player', 'guild', 'realm', 'id']
achievement_list = achievement_list.drop(columns=['unnamed0', 'unnamed1'])
#print(achievement_list.columns)

starting up...


#### Make an empty dictionary to add rows to the new player achievement dataset

In [9]:
# One column per achievement id, labelled with the id as an integer string.
# Only labels that are not already present are added, so re-running this cell
# does not append the same columns again. The comprehension also avoids
# leaving a notebook-level variable named `id` that shadows the builtin.
existing_cols = set(final_cols)
final_cols.extend(
    label
    for label in (str(int(achievement_id)) for achievement_id in achievement_list.id)
    if label not in existing_cols
)
# Template row: every column present, every value None.
empty_row = dict.fromkeys(final_cols)

#### Calls to the Blizzard API to fill out achievement details for each player in the roster CSVs

In [None]:
# Reference date used to turn last_login into time_since_login.
SNAPSHOT_DATE = datetime.date(2020, 6, 5)

def _save_scrape_batch(records, roster_file, group_num, counter):
    """Write the collected player rows to the scrapes folder and return the file name."""
    f_name = roster_file.split('roster')[0] + '6-8_dates_' + str(group_num) \
            + '_' + str(counter) + '.csv'
    pd.DataFrame(records).to_csv(os.path.join(cn.processed_dir,'scrapes', f_name))
    print(f_name + ' saved')
    return f_name

i = 0  # Running count of roster rows visited; it is not reset between groups.
for group_num in np.arange(200, 300, 100):
    print('Group Number: ' + str(group_num))
    f = 'wow_roster' + str(group_num) + '.csv'
    player_roster = pd.read_csv(os.path.join(cn.raw_dir,
            'wow_rosters', f))
    # Rows are collected as plain dicts and turned into a frame only when a
    # batch is written (DataFrame.append is gone in pandas 2.x and is quadratic).
    rows = []
    for m in player_roster.itertuples():
        if m.level == 120:
            print(i)
            player = m.player.lower()
            realm = m.realm
            row = cf.get_player_achievements(player, realm,
                    empty_row, access_token)
            # A str return value is skipped.
            # NOTE(review): presumably that is an error message from the helper -- confirm.
            if not isinstance(row, str):
                row['player'] = player
                row['realm'] = realm
                row['id']  = player + '_' + realm
                last_login, gear_score = cf.get_validation(player,
                        realm, access_token)
                row['gear_score'] = gear_score
                row['last_login'] = last_login
                try:
                    login_date = datetime.datetime.strptime(last_login,
                            '%Y-%m-%d').date()
                    row['time_since_login'] = SNAPSHOT_DATE - login_date
                except (TypeError, ValueError):
                    # last_login missing or not in YYYY-MM-DD form.
                    print("error in " + row['id'])
                # Store a copy: the helper is handed the shared empty_row template
                # and might return the same object on every call.
                rows.append(dict(row))
        if i % 100 == 0:
            _save_scrape_batch(rows, f, group_num, i)
            rows = []
            break # For Jupyter notebook demo
        i = i + 1
    # Rows gathered after the last multiple of 100 would otherwise be dropped
    # when the roster ends (never triggers while the demo break is in place).
    if rows:
        _save_scrape_batch(rows, f, group_num, i)

## Achievement processor for time series dataset
This section is for demonstration purposes only. 
Actual processing was performed in the terminal in batches with **scripts/achievement_time_processor.py**

#### Setup IO

In [12]:
# Folder of raw scrape CSVs to process, and the achievement list used to
# recognise achievement columns in them.
folder = os.path.join(cn.processed_dir, '6-15_scrapes')
f_cat = os.path.join(cn.clean_dir, '6-13_achievement_list.csv')

#### Read in the list of categories with achievements

In [13]:
dfc = pd.read_csv(f_cat)
# Achievement ids as integer strings (e.g. '6'), the form in which they are
# compared with CSV column labels later on.
achievements = dfc['achievement_id'].values.astype(int).astype(str)

#### Define output file columns

In [14]:
# Per-player columns carried over unchanged into the time-series output.
player_cols = ['player', 'realm', 'gear_score', 'last_login', 'time_since_login']

# Calendar range covered by the time series; timepoints is filled in by the next cell.
years = list(range(2014, 2021))
months = np.arange(1, 13)
timepoints = []

#### Generate a list of dates (Year_Month)

In [15]:
# Last month to include. The previous loop only skipped '2020-07' and still
# generated 2020-08 .. 2020-12; the intent (stop once July 2020 is reached)
# is made explicit here.
LAST_TIMEPOINT = (2020, 6)

# 'YYYY-MM' labels in chronological order. The list is rebuilt from scratch,
# so re-running this cell does not append duplicates.
timepoints = [
    '{}-{:02d}'.format(year, int(month))
    for year in years
    for month in months
    if (year, int(month)) <= LAST_TIMEPOINT
]


#### Calculate achievements per month for each player

In [6]:
# Work from inside the scrape folder so that the glob pattern and read_csv can
# use bare file names. This changes the working directory for the whole kernel.
os.chdir (folder)
# The pattern expands to '*csv', i.e. every file whose name ends in "csv".
for f in glob.glob('*{}'.format('csv')):

    print(f)

    # Create the output dataframe: player columns followed by one column per month
    dfo = pd.DataFrame(columns=player_cols + timepoints)

    # Read in raw player achievement stats
    dfr = pd.read_csv(f)
    # Keep only the columns whose label is a known achievement id
    achievement_cols = [col for col in dfr.columns.values \
                if col in achievements]

    # Build the processed_player_stats.csv dataset, one output row per player row
    i = 0
    for index, row in dfr.iloc[:][:].iterrows():

        # Format output file
        # NOTE(review): f_out depends only on f but is recomputed for every row,
        # and it stays undefined when the file has no rows, in which case the
        # to_csv call after this loop would fail.
        f_out = os.path.join(cn.processed_dir, '6-15_scrapes', 
                'processed', 'time', f.replace('wow', 'time'))

        # Convert date to month: keep the first 7 characters of each value
        # (assumes values are date strings that start with YYYY-MM -- TODO confirm;
        # a missing value becomes the string 'nan')
        row[achievement_cols] = [str(d)[0:7] for d in \
                row[achievement_cols]]

        # Set up df for achievements per month.
        # `row` is a Series, so reset_index() gives a two-column frame:
        # achievement label and month string.
        t = row[achievement_cols].transpose().reset_index()
        # Drops the first row of that frame.
        # NOTE(review): that row looks like the first achievement rather than a
        # header row -- confirm that dropping it is intended.
        t = t.iloc[1:][:]
        t.columns = ['achievement', 'date']

        # Count achievements per month string, then flip the result so that
        # each month becomes a column
        t = t.iloc[:][:].groupby('date').count().reset_index() 
        t = t.transpose()
        # First row after the transpose holds the month strings: use them as column labels
        t.columns = t.iloc[0][:].sort_values()
        # Drop that label row, leaving only the 'achievement' row of counts
        t = t.iloc[1:][:]
        # Discard every month that is not one of the requested timepoints
        to_drop = [n for n in t.columns.values if n not in timepoints]
        t = t.drop(to_drop, axis = 1)


        # Create a new row to append to dfo
        tmp = dict()

        # Add player data to the output row
        for col in player_cols:
            tmp[col.lower()] = row[col.lower()]

        # Add achievements per month to the output row
        for col in t.columns.values:
            tmp[col.lower()] = t[col.lower()].achievement

        dfo = dfo.append(tmp, ignore_index=True)
        # Missing values in dfo (e.g. months without any achievement) become 0
        dfo = dfo.fillna(int(0))

        i = i + 1
        print(i)

    # One output file per input file
    dfo.to_csv(f_out)


## Calculate engagement score for time series dataset
Originally run in the terminal with the file: **scripts/engagement_adder_time.py**

#### Read in the processed time series csvs

In [18]:
# Folder holding the processed time-series CSVs. A single join builds the same
# path (wrapping one path in a second os.path.join returns it unchanged).
dir_in = os.path.join(cn.processed_dir, 'scrapes', 'processed', 'time')
file_in = os.path.join(dir_in, '*{}')

#### Calculate engagement score based on the days since last login. 

In [19]:
os.chdir (dir_in)
print(os.getcwd())

# Scored files go to an 'engaged' sub-folder; create it if it is missing.
dir_out = os.path.join(dir_in, 'engaged')
os.makedirs(dir_out, exist_ok=True)

# enumerate() drives the file counter (it used to stay at 1 because the
# increment sat behind `continue` statements inside the row loop).
for i, f in enumerate(glob.glob('*{}'.format('csv')), start=1):
    print(i,f)
    df = pd.read_csv(f,dtype='unicode')

    # Days since last login = leading integer of strings such as "12 days".
    # Missing or unparsable values become NaN instead of raising.
    days = pd.to_numeric(df['time_since_login'].str.split(' ').str[0],
            errors='coerce')

    # <= 30 days: active (0); 31-240 days: risk (1); > 240 days: lapsed (2).
    # Rows without a usable value keep engagement NaN and an empty status.
    tiers = [(days <= 30).values,
             ((days > 30) & (days <= 240)).values,
             (days > 240).values]
    df['engagement'] = np.select(tiers, [0, 1, 2], default=np.nan)
    df['status'] = np.select(tiers, ['active', 'risk', 'lapsed'], default='')

    df.to_csv(os.path.join(dir_out,
            f.replace('time','engaged_time')))


/Users/haleyspeed/Docs/insight/insight_data_science/data/processed/scrapes/processed/time
1 time_6-8_dates_9000_22300.csv


## Concatenate time series csvs into a single file
This section is for demonstration purposes only. 
Actual processing was performed in batches in the terminal with **scripts/file_concatenator_time.py**

In [22]:
dir_in = os.path.join(cn.processed_dir, 'scrapes',
        'processed','time', 'engaged')

os.chdir (dir_in)
print(os.getcwd())

# Read every file first and concatenate once; growing a frame with append()
# inside the loop re-copies all rows for each file (and DataFrame.append was
# removed in pandas 2.0).
frames = []
for i, f in enumerate(glob.glob('*{}'.format('csv')), start=1):
    print(i,f)
    frames.append(pd.read_csv(f,dtype='unicode'))

# De-duplicating once at the end keeps the same rows as de-duplicating after
# every file. An empty folder yields an empty frame rather than an error.
df = pd.concat(frames).drop_duplicates() if frames else pd.DataFrame()
df.to_csv(os.path.join(cn.clean_dir,'random_forest_time',
        'final_time_stats_demo.csv'))

/Users/haleyspeed/Docs/insight/insight_data_science/data/processed/scrapes/processed/time/engaged
1 engaged_time_6-8_dates_9000_22300.csv


## Build the achievement processor for the features/category dataset

This section is for demonstration purposes only. Actual processing was performed in batches in the terminal with the file: **scripts/achievement_processor_features.py**

#### Read in the dataset and achievement details list

In [24]:
# Achievement details list; the ids are kept as a list of integer strings so
# they can be concatenated with other column-name lists.
dfa = pd.read_csv(
    os.path.join(cn.clean_dir,'6-13_achievement_list.csv')
)
achievements = list(
    dfa['achievement_id'].values.astype(int).astype(str)
)

#### Remove unnecessary columns (i.e. those needed only for the time series). Keep only player details and achievements.

In [25]:
# Loop invariants: the columns wanted in every output file and the output folder.
player_cols = ['player', 'realm', 'gear_score', 'last_login',
               'time_since_login']
wanted_cols = player_cols + achievements
dir_out = os.path.join(cn.processed_dir,'scrapes','processed',
        'features')

os.chdir (os.path.join(cn.processed_dir,'scrapes'))
for f in glob.glob('*{}'.format('csv')):
    df = pd.read_csv(f, dtype = 'unicode')
    # Keep the wanted columns that this file actually has, in wanted order.
    # Membership is tested against the column Index (hashed) rather than by
    # scanning a column array for every name.
    keep_cols = [c for c in wanted_cols if c in df.columns]

    f_out = os.path.join(dir_out, f.replace('wow','features'))
    # to_csv returns None when given a path, so its result is not assigned
    # back to df.
    df[keep_cols].to_csv(f_out)
    print(f_out)


/Users/haleyspeed/Docs/insight/insight_data_science/data/processed/scrapes/processed/features/features_6-8_dates_9000_22300.csv
/Users/haleyspeed/Docs/insight/insight_data_science/data/processed/scrapes/processed/features/features_6-8_dates_200_0.csv
/Users/haleyspeed/Docs/insight/insight_data_science/data/processed/scrapes/processed/features/features_6-8_dates_200_100.csv


## Calculate engagement score for features/category dataset

This section is for demonstration purposes only. Actual processing was performed in batches in the terminal with the file: **scripts/engagement_adder_features.py**

#### Set up file IO

In [28]:
# Folder with the per-file feature CSVs. The working directory moves there so
# that the next cell can glob and read with bare file names.
dir_in = os.path.join(cn.processed_dir, 'scrapes', 'processed', 'features')
os.chdir (dir_in)

file_in = os.path.join(dir_in, '*{}')
df = pd.DataFrame()

#### Determine engagement score based on last login

In [30]:
print(os.getcwd())

# Scored files go to an 'engaged' sub-folder; create it if it is missing.
dir_out = os.path.join(dir_in, 'engaged')
os.makedirs(dir_out, exist_ok=True)

# The glob is relative: it relies on the working directory being dir_in.
# enumerate() drives the file counter, which previously never advanced.
for i, f in enumerate(glob.glob('*{}'.format('csv')), start=1):
    print(i,f)
    df = pd.read_csv(f,dtype='unicode')

    # Days since last login = leading integer of strings such as "12 days".
    # Missing or unparsable values become NaN instead of raising.
    days = pd.to_numeric(df['time_since_login'].str.split(' ').str[0],
            errors='coerce')

    # <= 30 days: active (0); 31-240 days: risk (1); > 240 days: lapsed (2).
    # Rows without a usable value keep engagement NaN and an empty status.
    tiers = [(days <= 30).values,
             ((days > 30) & (days <= 240)).values,
             (days > 240).values]
    df['engagement'] = np.select(tiers, [0, 1, 2], default=np.nan)
    df['status'] = np.select(tiers, ['active', 'risk', 'lapsed'], default='')

    df.to_csv(os.path.join(dir_out,
            f.replace('features', 'engaged')))


/Users/haleyspeed/Docs/insight/insight_data_science/data/processed/scrapes/processed/features
1 features_6-8_dates_200_0.csv
1 features_6-8_dates_9000_22300.csv


## Concatenate feature/category csvs into the final dataset
This section is for demonstration purposes only. Actual processing was performed in batches in the terminal with the file: **scripts/file_concatenator_features.py**

#### Set up file IO

In [32]:
# Folder with the engagement-scored feature CSVs. The working directory moves
# there so that the concatenation cell can glob with bare file names.
dir_in = os.path.join(
    cn.processed_dir, 'scrapes', 'processed', 'features', 'engaged'
)
os.chdir (dir_in)

# Start from an empty frame.
df = pd.DataFrame()

#### Sequentially open each file in the 'engaged' folder and add them to a dataframe for the final_features_dataset.csv

In [35]:
# For a large number of files the output is written in
# batches of this many input files.
FILES_PER_BATCH = 250

dir_out = os.path.join(dir_in, 'concats')
os.makedirs(dir_out, exist_ok=True)

print(os.getcwd())
frames = []   # Files read since the last batch was written.
i = 0         # Number of files read so far (stays 0 when the folder is empty).
for i, f in enumerate(glob.glob('*{}'.format('csv')), start=1):
    print(i,f)
    frames.append(pd.read_csv(f,dtype='unicode'))

    if i % FILES_PER_BATCH == 0:
        pd.concat(frames).to_csv(os.path.join(dir_out,
                'concat_feature_stats_' + str(i) + '.csv'))
        # Start a new batch. (The old code reset a misspelled variable, `pf`,
        # so every batch file also contained all earlier batches.)
        frames = []

# save last df in a batch run: the files read since the last full batch,
# labelled with the total number of files read
df = pd.concat(frames) if frames else pd.DataFrame()
if frames:
    df.to_csv(os.path.join(dir_out, 'concat_feature_stats_' \
            + str(i) + '.csv'))

# save last df of a final run
#df.to_csv(os.path.join(dir_in,'concats','final_feature_stats.csv'))

/Users/haleyspeed/Docs/insight/insight_data_science/data/processed/scrapes/processed/features/engaged
1 engaged_6-8_dates_200_0.csv
2 engaged_6-8_dates_9000_22300.csv
