# SubScript Leaderboard Explorer

1. Explore the leaderboard data
2. Use clustering to determine the core users of the service based on their activity
3. Find out which activities are most salient for each cluster

## Dependencies and Defaults

In [1]:
import config
import pandas as pd
import numpy as np
import os

In [2]:
dir_home = config.home_dir
dir_clean = config.clean_dir
dir_processed = config.processed_dir
dir_raw = config.raw_dir
f_in = 'leaderboard_stats.csv'
f_dfa = 'dataforazeroth_complete_dataset.csv'
path_in = os.path.join(dir_processed, f_in) # leaderboard_player_stats
path_in2 = os.path.join(dir_raw, f_dfa) # leaderboard_players

## Clean Leaderboard Player Data

In [3]:
df_ps = pd.read_csv(path_in) # Load in the leaderboard_player_stats
df_dfa = pd.read_csv(path_in2) # load in the leaderboard_stats
df_ps['id'] = df_ps.player + '_' + df_ps.realm
df_dfa['id'] = ''
df_dfa['id'] = df_dfa.player + '_' + df_dfa.realm 
df_ps = df_ps.fillna(0) # NA is equivalent to 0 for all columns (player has not attempted or completed)
for index,row in df_dfa.iterrows(): # Format player and realm with lowercase and no special chars
    if isinstance(row.player, str):
        df_dfa.at[index,'player'] = row.player.lower()
    if isinstance(row.realm, str):
        df_dfa.at[index,'realm'] = row.realm.replace('US-', '').replace("'", '').lower()
        df_dfa.at[index,'id'] = df_dfa.loc[index,'player'] + '_' + df_dfa.loc[index,'realm']

In [4]:
df_dfa = df_dfa.set_index('id')
df_ps['leaderboard'] = ''
for index, row in df_ps.iterrows(): # Add leaderboards to player stats
    df_ps.at[index, 'leaderboard'] = df_dfa[df_dfa.index == row.id].leaderboard.values

In [5]:
cols = [col for col in df_ps.columns if 'Unnamed' not in col] # Remove unnamed columns (former indexes)
df_ps = df_ps[cols]

In [6]:
df_ps.to_csv(os.path.join(dir_clean, 'cleaned_leaderboard_player_stats.csv'))
df_dfa.to_csv(os.path.join(dir_clean, 'cleaned_leaderboard_players.csv'))

## Describe the Leaderboard Player Stats Dataframe

In [7]:
df_ps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15983 entries, 0 to 15982
Columns: 5148 entries, 10 to leaderboard
dtypes: float64(5140), object(8)
memory usage: 627.8+ MB


In [8]:
df_ps.describe()

Unnamed: 0,10,10000,10001,10010,10011,10012,10013,10015,10016,10017,...,9998,completed_quests,guild_rank,honor_level,level,mounts_collected,pets_collected,realm_id,total_achievement_points,total_achievements
count,15983.0,15983.0,15983.0,15983.0,15983.0,15983.0,15983.0,15983.0,15983.0,15983.0,...,15983.0,15983.0,15983.0,15983.0,15983.0,15983.0,15983.0,15983.0,15983.0,15983.0
mean,0.994557,0.0,0.0,0.850716,0.848839,0.643684,0.749296,0.365764,0.416943,0.715135,...,0.0,6423.620096,131989800.0,136.49903,118.25208,488.479384,1073.709754,0.0,26060.873115,2997.514734
std,0.07358,0.0,0.0,0.356379,0.358217,0.478925,0.433432,0.481659,0.493069,0.451364,...,0.0,3617.39064,53958120.0,126.389915,10.918314,154.108984,447.911069,0.0,5920.919793,706.624994
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,18188.0,0.0,10.0,0.0,0.0,0.0,2695.0,349.0
25%,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3000.0,106871300.0,40.0,120.0,393.0,746.5,0.0,22290.0,2537.0
50%,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.0,7425.0,139048800.0,105.0,120.0,532.0,1215.0,0.0,27490.0,3152.0
75%,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,9850.0,174426400.0,192.0,120.0,605.0,1379.0,0.0,31055.0,3593.0
max,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,10938.0,212688300.0,1139.0,120.0,718.0,2018.0,0.0,33585.0,4033.0


## Transform leaderboard player stats reducing dataset to categories

#### Make a new dataframe with categories instead of individual achievements

In [9]:
import custom_funcs as cf


# base_cols contains leaderboard player stats columns without achievements
base_cols = ['leaderboard','level', 'guild_rank','player', 'id', 'realm', 'realm_id', 
        'playable_race', 'playable_class', 'faction', 'guild_name', 'completed_quests',
        'honor_level','mounts_collected','pets_collected','total_achievement_points',
        'total_achievements']

# import a list of achievement categories
df_cat = pd.read_csv(os.path.join(dir_raw, 'wow_achievement_categories.csv'))
category_cols = list(df_cat.id.values.astype(int).astype(str))
engineered_cols = base_cols + category_cols

#### Get category for each achievement (create from scratch)

In [12]:
ach_ids = df_ps.columns.difference(base_cols) # Keeps only the numeric achievement_id Columns
df_ach = pd.DataFrame() # New dataframe for the achievement list
for ach in ach_ids:
    results = cf.get_wow_achievement(ach)
df_ach.category_id = df_ach.category_id.astype(int).astype(str)
df_ach.achievement_id = df_ach.achievement_id.astype(int).astype(str)
df_ach.to_csv(os.path.join(dir_clean, 'achievement_details_list.csv'))


unpacked['criteria']['name'] does not exist


#### Load from previously created category file

In [13]:
df_ach = pd.read_csv(os.path.join(dir_clean, 'achievement_details_list.csv'))
df_ach.head()

Unnamed: 0.1,Unnamed: 0,achievement_id,achievement_name,category_id,category_name,criteria_id,criteria_name,next_id,next_name
0,0,10,Level 50,92,Character,2053.0,,11.0,Level 60
1,1,10000,Guardian of the Horde: Warlords Season 1,15270,Player vs. Player,,,,
2,2,10001,Defender of the Horde: Warlords Season 1,15270,Player vs. Player,,,,
3,3,10010,Gate of the Setting Sun,15106,Pandaria Dungeon,43685.0,,6759.0,Heroic: Gate of the Setting Sun
4,4,10011,Siege of Niuzao Temple,15106,Pandaria Dungeon,43688.0,,6763.0,Heroic: Siege of Niuzao Temple


#### Go through leaderboard player stats and tally achievements by category

In [None]:
# Make a new, empty dataframe with player info and achievement categories
df_ps_cat = pd.DataFrame(columns=engineered_cols)
i = 0
for index, row in df_ps.iterrows():
    print(i, end = ' ')
    tmp = dict.fromkeys(engineered_cols,0) # Achieve cols should be numbers, default value = 0
    for base_col in base_cols:
        tmp[base_col] = row[base_col]  # Copy basic player information from the df_ps dataset
    for col in ach_ids:  # Add category_id as columns instead of achievement_id
        if row[col] == 1:
            category_id = df_ach.loc[df_ach.achievement_id == col].category_id.values[0]
            tmp[category_id] = tmp[category_id] + 1  # Tallies the number of achievements completed in a category
    df_ps_cat = df_ps_cat.append(tmp, ignore_index = True)  
    i = i +1

In [None]:
df_ps_cat['15106']

## Create Chain Content

In [None]:
df_ach = df_ach.sort_values(by = ['criteria_id']) # Sort by criteria
df_chain = pd.DataFrame()
for index, row in df_ach.iterrows():
    if row.criteria == '': # If no criteria
        chain = []
        if row.next_id != '': # Is there a next_id?
            chain.append(row.achievement_id) # First step in the chain
            next_id = row.next_id # Next step in the chain
            end_reached = False  # Stop/Start variable for while loop
            while end_reached == False: # If yes, repeat with the next id
                chain.append(next_id) # Add the next achievement ID to the list
                next_id = df_ach[df_ach.achievement_id == next_id].next_id # Find next achievement after that
                if next_id == '': # If blank, it's the end of the chain
                    end_reached = True # exit the loop
        df_chain['chain_' + row.achievement_id] = chain # Add the achievement chain as a column in a new df

In [None]:
df_chain.to_csv(os.path.join(dir_clean, 'achievement_chains.csv'))
df_chain.head()

## DBSCAN Clustering

#### Remove non-numeric columns 

In [None]:
str_cols = ['leaderboard','level', 'guild_rank','player', 'id', 'realm', 'realm_id', 'playable_race', 'playable_class', 'faction', 'guild_name']
try:
    df_dbscan = df_ps.drop(str_cols, axis = 1)
except:
    pass

In [None]:
df_dbscan

#### Run the DBSCAN clustering

In [None]:
from sklearn.cluster import DBSCAN
from sklearn import metrics

# Compute DBSCAN
db = DBSCAN(eps=4, min_samples=10).fit(df_dbscan)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print(n_clusters_)


#### Print the results of the clustering

In [None]:
# Plot result
import matplotlib.pyplot as plt

# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    xy = df_dbscan[class_member_mask & core_samples_mask]
    plt.plot(xy.iloc[:, 0], xy.iloc[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)

    xy = df_dbscan[class_member_mask & ~core_samples_mask]
    plt.plot(xy.iloc[:, 0], xy.iloc[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

Did not work because of binary data

Categorizaing data first based on achievement categories

#### Trying again with only a few non-categorical columns

In [None]:
df_dbscan2 = df_ps[['completed_quests','honor_level','level','mounts_collected','pets_collected','total_achievement_points','total_achievements']]

#Normalizing the data
df_dbscan2.completed_quests = df_dbscan2.completed_quests.div(1000)
df_dbscan2.honor_level = df_dbscan2.honor_level.div(100)
df_dbscan2.mounts_collected = df_dbscan2.mounts_collected.div(100)
df_dbscan2.pets_collected = df_dbscan2.pets_collected.div(1000)
df_dbscan2.total_achievement_points = df_dbscan2.total_achievement_points.div(10000) 
df_dbscan2.total_achievements = df_dbscan2.total_achievements.div(1000)

In [None]:
from sklearn.cluster import DBSCAN
from sklearn import metrics

# Compute DBSCAN
db = DBSCAN(eps=5, min_samples=50).fit(df_dbscan2)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print(n_clusters_)


In [None]:
# Plot result
import matplotlib.pyplot as plt

# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    xy = df_dbscan2[class_member_mask & core_samples_mask]
    plt.plot(xy.iloc[:, 0], xy.iloc[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14, alpha = 0.02)

    xy = df_dbscan2[class_member_mask & ~core_samples_mask]
    plt.plot(xy.iloc[:, 0], xy.iloc[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6, alpha = 0.02)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

Sort of two groups but not what we're looking for