# Forward Feature Selection for Achievements

### Import dependencies

In [1]:
import pandas as pd
import os
import subscript.config as cn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np

### Read in and prepare data

In [24]:
# Player-level bookkeeping columns that are removed before modelling.
player_cols = [
    'Unnamed: 0', 'player', 'realm', 'gear_score',
    'last_login', 'time_since_login', 'status',
]

# Load the cleaned feature table with every column read as text and drop
# the player-level columns in the same expression.
df = (
    pd.read_csv(
        os.path.join(cn.clean_dir, 'final_feature_categories.csv'),
        dtype='unicode',
    )
    .drop(columns=player_cols)
)

# Empty frame; nothing in the visible cells reads it.
dfp = pd.DataFrame()

# Keep only the rows whose engagement value, parsed as a float, is not 1.
df = df.loc[df['engagement'].astype(float) != 1]

# Snapshot of the prepared frame so a later cell can start again from it.
df_original = df.copy()
df.head()

Unnamed: 0,6,7,8,9,10,11,12,13,16,31,...,11018,11030,11043,11056,12037,12197,13643,1412,1416,id
0,Character,Character,Character,Character,Character,Character,Character,Character,none,none,...,none,none,none,none,none,none,none,none,none,airgeyser_proudmoore
3,Character,Character,Character,Character,Character,Character,Character,Character,none,Quests,...,none,none,none,none,none,none,none,none,none,tharivool_proudmoore
5,Character,Character,Character,Character,Character,Character,Character,Character,none,Quests,...,none,none,none,none,none,none,none,none,none,yareij_proudmoore
6,Character,Character,Character,Character,Character,Character,Character,Character,none,none,...,none,none,none,none,none,none,none,none,none,oretenbro_proudmoore
7,Character,Character,Character,Character,Character,Character,Character,Character,none,Quests,...,none,none,none,none,none,none,none,none,none,connorpriest_proudmoore


## Convert any completed achievement to 1 and any uncompleted ('none') achievement to 0

In [None]:
# Start from the untouched snapshot so this cell can be re-run safely.
df = df_original.copy()

# Every achievement category label that can appear as a cell value.
# (The list was previously bound to a misspelled name, so the lookup of
# `categories` below raised NameError on a fresh kernel.)
categories = [
    'Alterac Valley', 'Arathi Basin', 'Archaeology',
    'Arena', 'Argent Tournament', 'Ashran', 'Battle Dungeon',
    'Battle Raid', 'Battle for Azeroth', 'Battle for Gilneas',
    'Battlegrounds', "Brawler's Guild", 'Brewfest', 'Cataclysm',
    'Cataclysm Dungeon', 'Cataclysm Raid', 'Character',
    "Children's Week", 'Classic', 'Collections', 'Cooking',
    'Currencies', 'Darkmoon Faire', 'Deepwind Gorge', 'Draenor',
    'Draenor Dungeon', 'Draenor Garrison', 'Draenor Raid',
    'Dungeons', 'Dungeons & Raids', 'Eastern Kingdoms',
    'Events', 'Expansion Features', 'Exploration',
    'Eye of the Storm', 'Feats of Strength', 'Fishing',
    'General', 'Guild Feats of Strength', "Hallow's End",
    'Heart of Azeroth', 'Honor', 'Island Expeditions',
    'Isle of Conquest', 'Kalimdor', 'Legacy', 'Legion',
    'Legion Class Hall', 'Legion Dungeon', 'Legion Raid',
    'Lich King Dungeon', 'Lich King Raid', 'Love is in the Air',
    'Lunar Festival', 'Midsummer', 'Mounts', 'Noblegarden',
    'Northrend', 'Outland', 'Pandaria', 'Pandaria Dungeon',
    'Pandaria Raid', 'Pandaria Scenarios', 'Pet Battles',
    "Pilgrim's Bounty", 'Player vs. Player', 'Professions',
    'Promotions', 'Quests', 'Raids', 'Rated Battleground',
    'Reputation', 'Seething Shore', 'Silvershard Mines',
    'Temple of Kotmogu', 'The Burning Crusade', 'Tol Barad',
    'Twin Peaks', "Visions of N'Zoth", 'War Effort',
    'Warsong Gulch', 'Winter Veil', 'Wintergrasp', 'World',
    'World Events', 'Wrath of the Lich King',
]

# Transform every column except the target: columns whose name contains
# 'engagement' are left untouched.
xform = [c for c in df.columns.values if 'engagement' not in c]

# 'none' becomes '0' and any category label becomes '1'. The replacement
# values are strings, so the columns are not cast to a numeric dtype here.
df[xform] = df[xform].replace('none', '0')
df[xform] = df[xform].replace(categories, '1')
df.head()

In [5]:
# Stratified shuffle splitter on the engagement label: 25% of rows go to test.
split = StratifiedShuffleSplit(n_splits=10, test_size=0.25, random_state=17)

# The splitter yields 10 (train, test) index pairs; only the final pair is
# kept and used for the rest of the notebook.
train_index, test_index = list(split.split(df, df['engagement']))[-1]
strat_train = df.iloc[train_index]
strat_test = df.iloc[test_index]

# Separate the target column from the predictors for each partition.
X_train, y_train = strat_train.drop(columns='engagement'), strat_train['engagement']
X_test, y_test = strat_test.drop(columns='engagement'), strat_test['engagement']


In [6]:
# Forward sequential selection of 15 features, scored by cross-validated
# ROC AUC (4 folds) with a random forest as the wrapped estimator.
# The forest is seeded with the same value used for the split and the final
# model, so the selected feature set is reproducible between runs.
feature_selector = SequentialFeatureSelector(
    RandomForestClassifier(n_jobs=1, random_state=17),
    k_features=15,
    forward=True,
    verbose=2,
    scoring='roc_auc',
    cv=4,
)

# Missing values are filled with 0 and the frame is passed as a plain array,
# so the selector reports column positions rather than column names.
# `features` is the object later queried for `k_feature_idx_`.
features = feature_selector.fit(np.array(X_train.fillna(0)), y_train)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  86 out of  86 | elapsed:  4.7min finished

[2020-06-18 23:22:45] Features: 1/15 -- score: 0.8275319279890125[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  85 out of  85 | elapsed:  6.4min finished

[2020-06-18 23:29:11] Features: 2/15 -- score: 0.8655649323324635[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  84 out of  84 | elapsed:  7.8min finished

[2020-06-18 23:37:01] Features: 3/15 -- score: 0.8657906385351571[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  

In [7]:
# Translate the column positions chosen by the selector back into the
# corresponding column names of X_train.
filtered_features = X_train.columns[[*features.k_feature_idx_]]
print(filtered_features)

Index(['Cataclysm Raid', 'Character', 'Currencies', 'Dungeons & Raids',
       'Events', 'General', 'Guild Feats of Strength', 'Heart of Azeroth',
       'Honor', 'Legion', 'Lich King Raid', 'Northrend', 'Professions',
       'Reputation', 'Wrath of the Lich King'],
      dtype='object')


In [8]:
# Final model: a shallow, seeded random forest trained on the selected
# features only.
clf = RandomForestClassifier(n_estimators=100, random_state=17, max_depth=3)

# Fill missing values with 0, matching the imputation applied when the
# feature selector was fitted and when the test set is scored, so the model
# is trained on the same representation it is evaluated on.
clf.fit(X_train[filtered_features].fillna(0), y_train)


RandomForestClassifier(max_depth=3, random_state=17)

In [9]:
# Score the fitted model on both partitions. The metric is ROC AUC computed
# from the probabilities in column 1 of predict_proba, so the printed labels
# say "ROC AUC" rather than "Accuracy".
# Both partitions get the same fillna(0) imputation so the two scores are
# computed on identically prepared inputs.
train_pred = clf.predict_proba(X_train[filtered_features].fillna(0))
print('ROC AUC on training set: {}'.format(roc_auc_score(y_train, train_pred[:, 1])))

test_pred = clf.predict_proba(X_test[filtered_features].fillna(0))
print('ROC AUC on test set: {}'.format(roc_auc_score(y_test, test_pred[:, 1])))

Accuracy on training set: 0.8914431887695774
Accuracy on test set: 0.8910004372217928
