# SubScript Leaderboard Explorer

1. Explore the leaderboard data
2. Use clustering to determine the core users of the service based on their activity
3. Find out which activities are most salient for each cluster

## Dependencies and Defaults

In [166]:
import config as cn
import pandas as pd
import numpy as np
import os
from IPython.display import HTML

In [167]:
dir_home = cn.home_dir
dir_clean = cn.clean_dir
dir_processed = cn.processed_dir
dir_raw = cn.raw_dir

## Clean Player Data

#### Load general player data

In [179]:
df = pd.read_csv(os.path.join(dir_clean,  'processed_player_concat_test.csv'))
df = df.drop_duplicates()
del_cols = [c for c in df.columns.values if 'unnamed' in c.lower()]
#del_cols = del_cols + ['guild_rank', 'playable_class', 'playable_race', 'realm_id', 
                       #'faction', 'guild_name', 'player', 'realm']

df = df.drop(del_cols, axis = 1)
#df = df.set_index('id')

f_cat = os.path.join(cn.clean_dir,'achievement_details_list.csv')
dfc = pd.read_csv(f_cat)
categories = [name.lower() for name in np.unique(dfc.category_name)]


#df.head()

#### Remove duplicates and set the index to id column

In [180]:
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17186 entries, 0 to 20151
Columns: 234 entries, faction to wrath of the lich king
dtypes: float64(12), int64(125), object(97)
memory usage: 30.8+ MB


In [181]:
df.describe()

Unnamed: 0,guild_rank,playable_class,playable_race,realm_id,total_achievements,total_achievement_points,mounts_collected,pets_collected,completed_quests,honor_level,...,2013-12,2014-12,2015-12,2016-12,2017-12,2018-12,2019-12,2020-12,general,guild feats of strength
count,17186.0,17186.0,17186.0,17186.0,17186.0,17186.0,17186.0,17186.0,17186.0,17186.0,...,17186.0,17186.0,17186.0,17186.0,17186.0,17186.0,17186.0,17186.0,17186.0,17186.0
mean,147632900.0,6.451472,12.037298,798.323403,1821.582393,16077.812464,255.471605,481.879553,2293.523857,43.185442,...,27.881473,65.512277,41.155475,31.463459,36.788898,43.90789,35.005004,0.0,0.0,0.0
std,43455070.0,3.551705,11.130347,1149.234337,770.612607,6608.093107,150.93244,388.704784,1928.750909,61.453749,...,25.28408,36.374167,29.7684,25.371818,25.903381,24.898116,28.84439,0.0,0.0,0.0
min,54418.0,1.0,1.0,1.0,34.0,335.0,2.0,1.0,57.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,126958200.0,3.0,4.0,58.0,1291.0,11610.0,140.25,190.0,908.0,9.0,...,13.0,44.0,24.0,17.0,22.0,28.0,18.0,0.0,0.0,0.0
50%,157277700.0,6.0,8.0,78.0,1802.0,15920.0,231.0,357.0,1745.0,23.0,...,21.0,60.0,34.0,25.0,30.0,39.0,29.0,0.0,0.0,0.0
75%,178720900.0,10.0,22.0,1292.0,2318.0,20370.0,348.0,683.0,3050.75,51.0,...,33.75,79.0,50.0,37.0,43.0,53.0,43.0,0.0,0.0,0.0
max,213687300.0,12.0,37.0,3737.0,4033.0,33405.0,701.0,2004.0,10711.0,762.0,...,423.0,553.0,359.0,360.0,394.0,411.0,515.0,0.0,0.0,0.0


In [182]:
for cat in np.unique(categories):
    print(cat,df.loc[0,cat])
    if not isinstance(df.loc[0,cat]):
        df.loc[0,cat] = len(df.loc[0,cat])

print(df[categories].head())
    

alterac valley ['2013-10', 'none', '2010-01', '2009-05', '2019-11', '2008-10', '2019-02', '2008-10', '2012-02', '2010-06', '2008-10', 'none', '2009-10', '2009-11', '2010-10', '2012-11', 'none', '2008-10']
appearances ['2016-07', '2016-07', '2016-07', '2016-07', '2016-07', '2016-07', '2016-07', '2016-07', '2016-07', '2016-07', '2016-07', '2016-07', '2016-07', '2017-08', '2017-10', '2017-03', '2017-03', '2017-03', '2017-05', '2017-05', '2017-05', '2017-05', '2018-06', '2017-05', '2017-05', '2017-05', '2017-05', '2017-05', '2017-05', '2017-05', '2017-05', '2017-10', '2017-03', '2017-03', '2017-05', '2018-01', '2017-03', '2017-03', '2017-11', '2017-08', '2017-11', '2017-03', '2017-11', '2019-05', '2017-03', '2017-03', '2017-03', '2017-03', '2017-04', '2017-07', '2019-01', '2017-03', '2018-08', '2017-10', '2017-03', '2017-03', '2018-07', '2017-12', '2018-12', '2018-10', '2019-12', '2019-03', '2019-12', '2019-09', '2020-01', '2020-03']
arathi basin ['2009-06', '2009-05', '2012-07', '2015-05'

TypeError: object of type 'numpy.int64' has no len()

#### Normalize Data

In [72]:
#Normalizing the data
df.completed_quests = df.completed_quests.div(100)
df.mounts_collected = df.mounts_collected.div(10)
df.pets_collected = df.pets_collected.div(10)
df.total_achievement_points = df.total_achievement_points.div(1000) 
df.total_achievements = df.total_achievements.div(100)
df.engagement_score = df.engagement_score.div(100)
df.gear_score = df.gear_score.div(10)
df = df.dropna()
df.describe()

Unnamed: 0,total_achievements,total_achievement_points,mounts_collected,pets_collected,completed_quests,honor_level,gear_score,engagement_score,2007-01,2008-01,...,2013-12,2014-12,2015-12,2016-12,2017-12,2018-12,2019-12,2020-12,general,guild feats of strength
count,17186.0,17186.0,17186.0,17186.0,17186.0,17186.0,17186.0,17186.0,17186.0,17186.0,...,17186.0,17186.0,17186.0,17186.0,17186.0,17186.0,17186.0,17186.0,17186.0,17186.0
mean,18.215824,16.077812,25.54716,48.187955,22.935239,43.185442,42.41176,-1.447815,0.0,0.0,...,27.881473,65.512277,41.155475,31.463459,36.788898,43.90789,35.005004,0.0,0.0,0.0
std,7.706126,6.608093,15.093244,38.870478,19.287509,61.453749,5.109874,13.20618,0.0,0.0,...,25.28408,36.374167,29.7684,25.371818,25.903381,24.898116,28.84439,0.0,0.0,0.0
min,0.34,0.335,0.2,0.1,0.57,1.0,0.0,-43.47,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,12.91,11.61,14.025,19.0,9.08,9.0,40.4,0.171703,0.0,0.0,...,13.0,44.0,24.0,17.0,22.0,28.0,18.0,0.0,0.0,0.0
50%,18.02,15.92,23.1,35.7,17.45,23.0,43.9,0.686944,0.0,0.0,...,21.0,60.0,34.0,25.0,30.0,39.0,29.0,0.0,0.0,0.0
75%,23.18,20.37,34.8,68.3,30.5075,51.0,46.0,3.0275,0.0,0.0,...,33.75,79.0,50.0,37.0,43.0,53.0,43.0,0.0,0.0,0.0
max,40.33,33.405,70.1,200.4,107.11,762.0,48.6,43.54,0.0,0.0,...,423.0,553.0,359.0,360.0,394.0,411.0,515.0,0.0,0.0,0.0


## Random Forest for Feature Selection

In [73]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFromModel
from sklearn import preprocessing

In [84]:
df_tree = df.select_dtypes(exclude=['object'])
from IPython.display import display, HTML
display(HTML(df_tree.head().to_html()))


Unnamed: 0_level_0,total_achievements,total_achievement_points,mounts_collected,pets_collected,completed_quests,honor_level,gear_score,engagement_score,2007-01,2008-01,2009-01,2010-01,2011-01,2012-01,2013-01,2014-01,2015-01,2016-01,2017-01,2018-01,2019-01,2020-01,2011-02,2012-02,2013-02,2014-02,2015-02,2016-02,2017-02,2018-02,2019-02,2020-02,2011-03,2012-03,2013-03,2014-03,2015-03,2016-03,2017-03,2018-03,2019-03,2020-03,2011-04,2012-04,2013-04,2014-04,2015-04,2016-04,2017-04,2018-04,2019-04,2020-04,2011-05,2012-05,2013-05,2014-05,2015-05,2016-05,2017-05,2018-05,2019-05,2020-05,2011-06,2012-06,2013-06,2014-06,2015-06,2016-06,2017-06,2018-06,2019-06,2020-06,2011-07,2012-07,2013-07,2014-07,2015-07,2016-07,2017-07,2018-07,2019-07,2011-08,2012-08,2013-08,2014-08,2015-08,2016-08,2017-08,2018-08,2019-08,2020-08,2011-09,2012-09,2013-09,2014-09,2015-09,2016-09,2017-09,2018-09,2019-09,2020-09,2011-10,2012-10,2013-10,2014-10,2015-10,2016-10,2017-10,2018-10,2019-10,2020-10,2011-11,2012-11,2013-11,2014-11,2015-11,2016-11,2017-11,2018-11,2019-11,2020-11,2011-12,2012-12,2013-12,2014-12,2015-12,2016-12,2017-12,2018-12,2019-12,2020-12,general,guild feats of strength
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1
banepally_sargeras,33.13,28.52,59.6,159.4,27.51,191.0,44.5,4.6975,0,0,17,21,30,22,42,16,24,60,16,18,43,38,40,28,33,34,42,27,22,10,10,18,20,22,40,23,41,22,30,31,20,33,23,17,39,13,20,6,26,25,17,38,14,39,19,12,28,20,20,12,25,20,9,50,44,16,26,10,21,21,20,0,17,32,19,15,24,33,31,65,58,8,29,59,26,11,41,19,64,24,0,8,29,22,23,25,41,47,39,32,0,16,67,39,22,41,25,23,25,55,0,10,53,17,53,45,8,39,18,34,0,29,42,23,51,46,15,30,31,32,0,0,0
huldrych_sargeras,14.19,12.54,10.6,13.7,17.54,13.0,43.9,3.716,0,0,15,24,13,16,25,16,12,52,14,15,38,34,16,23,24,32,30,14,18,8,10,19,14,18,33,23,22,14,28,17,15,33,17,14,35,12,14,5,19,29,13,103,12,31,26,12,9,18,18,70,25,48,17,47,65,15,13,181,9,75,17,0,19,20,24,8,19,22,23,84,68,7,21,23,6,9,52,16,152,24,0,5,17,20,24,15,46,34,58,18,0,13,47,49,23,34,18,70,21,45,0,12,39,21,121,26,8,24,15,34,0,25,27,23,42,23,14,29,19,18,0,0,0
effu_sargeras,21.44,18.695,21.0,44.9,17.0,47.0,40.6,0.344595,0,0,15,15,5,10,18,19,21,49,21,13,29,44,17,27,23,35,30,14,18,39,12,19,26,25,48,111,44,7,26,21,30,41,15,14,39,32,16,5,24,22,19,48,12,22,14,26,24,31,25,8,60,24,9,36,20,83,32,62,79,23,38,0,16,15,15,15,45,45,23,60,69,13,17,34,33,9,111,15,85,26,0,8,17,22,15,11,57,30,46,12,0,11,45,22,37,57,27,27,29,45,0,14,28,28,62,38,15,17,27,37,0,19,25,66,54,26,33,22,27,13,0,0,0
pyright_sargeras,5.03,4.215,3.1,4.9,17.09,1.0,34.7,0.2125,0,0,8,38,9,13,18,16,17,46,18,21,28,44,42,16,21,34,26,13,17,29,12,19,29,12,43,41,29,6,25,19,22,38,45,10,36,24,12,5,24,20,19,68,5,17,14,14,9,13,25,6,38,24,6,26,19,24,18,44,48,23,32,0,17,29,15,13,36,74,21,61,68,10,10,32,23,5,122,27,90,25,0,22,21,19,8,11,214,51,142,12,0,26,32,29,71,47,20,26,29,44,0,5,24,19,136,26,12,17,26,37,0,20,23,25,75,21,26,22,26,12,0,0,0
rhoena_sargeras,23.68,20.625,28.7,73.4,14.88,19.0,39.1,1.061154,0,0,32,14,25,7,10,27,44,40,26,7,54,39,9,10,16,48,29,13,31,9,28,18,7,7,24,41,24,6,34,12,23,33,18,3,24,57,12,6,45,18,18,31,5,8,12,18,6,30,26,4,24,24,4,21,13,15,5,15,45,22,32,0,11,6,12,6,15,47,105,108,65,6,9,19,11,9,136,23,87,22,0,9,5,9,5,4,127,30,38,10,0,6,77,15,34,23,29,20,25,40,0,1,60,15,110,18,24,14,38,35,0,16,49,21,50,17,20,19,37,5,0,0,0


In [77]:
from sklearn.model_selection import ShuffleSplit
rs = ShuffleSplit(n_splits=10, test_size=.25, random_state=17)
for train_index, test_index in rs.split(df_tree):
    train_set = df_tree.iloc[train_index].copy()
    test_set = df_tree.iloc[test_index].copy()

y_train = train_set.engagement_score
X_train = train_set.drop('engagement_score',axis = 1)
y_test = test_set.engagement_score
X_test = test_set.drop('engagement_score',axis = 1)

encoder = preprocessing.LabelEncoder() # get a type error if not encoded
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

#### Only getting features

In [78]:
# Don't do more than 50 estimators or it will crash
selected = SelectFromModel(RandomForestClassifier(n_estimators = 50,n_jobs = -1,
                           oob_score = True,bootstrap = True,random_state = 42))
selected.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(n_estimators=50, n_jobs=-1,
                                                 oob_score=True,
                                                 random_state=42))

In [83]:
selected_features = X_train.columns[(selected.get_support())]
#important_feature_names = []
#df_cat = pd.read_csv(os.path.join(cn.raw_dir, 'wow_achievement_categories.csv'))
#for feature in selected_features:
    #print(feature)

for i, v in enumerate(selected_features):
    print(i,v, end = ' ')

0 total_achievements 1 total_achievement_points 2 mounts_collected 3 pets_collected 4 completed_quests 5 gear_score 6 2009-01 7 2010-01 8 2011-01 9 2013-01 10 2014-01 11 2015-01 12 2016-01 13 2017-01 14 2018-01 15 2019-01 16 2020-01 17 2011-02 18 2013-02 19 2014-02 20 2015-02 21 2016-02 22 2017-02 23 2018-02 24 2019-02 25 2020-02 26 2013-03 27 2014-03 28 2015-03 29 2016-03 30 2017-03 31 2018-03 32 2019-03 33 2020-03 34 2011-04 35 2013-04 36 2015-04 37 2016-04 38 2017-04 39 2018-04 40 2019-04 41 2020-04 42 2011-05 43 2013-05 44 2014-05 45 2015-05 46 2016-05 47 2017-05 48 2018-05 49 2019-05 50 2020-05 51 2013-06 52 2014-06 53 2015-06 54 2016-06 55 2017-06 56 2018-06 57 2019-06 58 2011-07 59 2013-07 60 2014-07 61 2015-07 62 2016-07 63 2017-07 64 2018-07 65 2019-07 66 2011-08 67 2012-08 68 2014-08 69 2015-08 70 2016-08 71 2017-08 72 2018-08 73 2019-08 74 2011-09 75 2012-09 76 2013-09 77 2014-09 78 2015-09 79 2016-09 80 2017-09 81 2018-09 82 2019-09 83 2012-10 84 2013-10 85 2014-10 86 2015-

#### Identifty get names of important achievements

completed_quests []
honor_level []
mounts_collected []
pets_collected []
total_achievement_points []
81 ['Feats of Strength']
92 ['Character']
95 ['Player vs. Player']
96 ['Quests']
97 ['Exploration']
155 ['World Events']
158 ["Hallow's End"]
161 ['Midsummer']
165 ['Arena']
168 ['Dungeons & Raids']
169 ['Professions']
170 ['Cooking']
171 ['Fishing']
201 ['Reputation']
14804 ['Warsong Gulch']
14805 ['The Burning Crusade']
14806 ['Lich King Dungeon']
14808 ['Classic']
14861 ['Eastern Kingdoms']
14865 ['The Burning Crusade']
14866 ['Wrath of the Lich King']
14922 ['Lich King Raid']
14941 ['Argent Tournament']
15067 ['Cataclysm Dungeon']
15068 ['Cataclysm Raid']
15070 ['Cataclysm']
15071 ['Archaeology']
15092 ['Rated Battleground']
15101 ['Darkmoon Faire']
15106 ['Pandaria Dungeon']
15107 ['Pandaria Raid']
15110 ['Pandaria']
15113 ['Pandaria']
15114 ['Pandaria']
15117 ['Pet Battles']
15118 ['Collect']
15119 ['Battle']
15120 ['Level']
15220 ['Draenor']
15228 ['Draenor Dungeon']
15231 ['Draenor Raid']
15234 ['Legacy']
15235 ['Draenor']
15248 ['Mounts']
15252 ['Legion']
15254 ['Legion Dungeon']
15255 ['Legion Raid']
15257 ['Legion']
15259 ['Appearances']
15268 ['Promotions']
15269 ['Mounts']
15270 ['Player vs. Player']
15271 ['Raids']
15272 ['Dungeons']
15274 ['Events']
15277 ['Dungeons']
15278 ['Raids']
15279 ['Player vs. Player']
15280 ['Currencies']
15283 ['World']
15284 ['Battle for Azeroth']
15285 ['Battle Dungeon']
15286 ['Battle Raid']
15298 ['Battle for Azeroth']
15303 ['Draenor Garrison']
15304 ['Legion Class Hall']
15307 ['Island Expeditions']
15308 ['War Effort']
15414 ['Ashran']
15417 ['Heart of Azeroth']
15426 ["Visions of N'Zoth"]
81.1 []
92.1 []
95.1 []
96.1 []
97.1 []
155.1 []
168.1 []
169.1 []
201.1 []
15117.1 []
15234.1 []

#### If I was actually classifying, this would be the code

In [None]:
print(selected.predict([[0, 0, 0, 0]]))

In [None]:
import matplotlib.pyplot as plt
key_features = ['completed_quests','honor_level', 'total_achievements', 'pets_collected',
               'mounts_collected', '81','92','95','97','155','168', '169','201']

key_features_names = ['Completed Quests','Honor Level', 'Total Achievements', 
                      'Pets Collected','mounts_collected', 'Feats of Strength',
                      'Character','PVP','Exploration','World Events','Dungeons and Raids',
                      'Professions','Reputation']


fig = plt.figure(figsize=(19, 15))
plt.matshow(df_all[key_features].corr(), fignum=fig.number)
plt.xticks(range(df_all[key_features].shape[1]), df_all[key_features], fontsize=14, rotation=90)
plt.yticks(range(df_all[key_features].shape[1]), df_all[key_features], fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)

#### Achievement Category Legend
- 81   Feats of Strength
- 92   Character/Leveling
- 95   PVP
- 97   Exploration
- 155  World Events
- 168  Dungeons and Raids
- 169  Professions/Crafting
- 201  Reputation

Working with only 150,000 samples took more than an hour. Will look into spark and aws

In [None]:
# Don't do more than 50 estimators or it will crash
selected = RandomForestClassifier(n_estimators = 50,n_jobs = -1,
                           oob_score = True,bootstrap = True,random_state = 42)
selected.fit(X_train, y_train)

In [None]:
pred = selected.predict(X_train)

In [None]:
print(pred)

In [None]:
print(pred))