In [2]:
import pandas as pd
import numpy as np 

In [3]:
# load our datasets
players_df = pd.read_csv('../kaggle/players.csv')
match_df = pd.read_csv('../kaggle/match.csv')

In [4]:
# check and see if we can densify our heroset
heroes = players_df.groupby('hero_id').size().to_frame(name="count")
heroes['percentage'] = (heroes['count'] / heroes['count'].sum()) * 100
heroes = heroes.sort_values(by=['percentage'], ascending=False)
heroes

Unnamed: 0_level_0,count,percentage
hero_id,Unnamed: 1_level_1,Unnamed: 2_level_1
21,20881,4.1762
11,17007,3.4014
74,11676,2.3352
7,11323,2.2646
28,11181,2.2362
...,...,...
80,967,0.1934
78,931,0.1862
103,838,0.1676
66,579,0.1158


In [5]:
abilities = players_df.groupby('unit_order_train_ability').size().to_frame(name="count")
abilities['percentage'] = (abilities['count'] / abilities['count'].sum()) * 100
abilities = abilities.sort_values(by=['percentage'], ascending=False)
abilities

Unnamed: 0_level_0,count,percentage
unit_order_train_ability,Unnamed: 1_level_1,Unnamed: 2_level_1
25.0,43404,8.682068
16.0,41998,8.400827
17.0,41759,8.35302
18.0,41576,8.316414
19.0,39149,7.830943
15.0,36387,7.278463
20.0,36071,7.215253
14.0,33196,6.640169
21.0,31574,6.315722
22.0,27029,5.406589


In [6]:
# from the dataframe above, we see many values below 1%. Let's filter them.
less_than_1 = heroes[heroes['percentage'] > 1] # len -> 75
less_than_1['percentage'].sum()
len(less_than_1)

36

In [7]:
less_than_05 = (heroes[heroes['percentage'] < 1]) # len -> 37
less_than_05.sum()
len(less_than_05)

75

## Densifying

From the above, we see that 75/111 are used less than 1% of the time. We can map ranges of heroes that aren't frequently used to create new heroes that represent these groups.

In [9]:
# these groups can be broken up more or less depending on classification results
picked_heroes = heroes[heroes['percentage'] > 1]
unpicked_group1 = heroes[heroes['percentage'] < 0.25]
group1_percent = unpicked_group1['percentage'].sum()
unpicked_group2 = heroes[(heroes['percentage'] >= 0.25) & (heroes['percentage'] < 0.35)]
group2_percent = unpicked_group2['percentage'].sum()
unpicked_group3 = heroes[(heroes['percentage'] >= 0.35) & (heroes['percentage'] < 0.45)]
group3_percent = unpicked_group3['percentage'].sum()
unpicked_group4 = heroes[(heroes['percentage'] >= 0.45) & (heroes['percentage'] < 0.50)]
group4_percent = unpicked_group4['percentage'].sum()
unpicked_group5 = heroes[(heroes['percentage'] >= 0.50) & (heroes['percentage'] < 0.55)]
group5_percent = unpicked_group5['percentage'].sum()
unpicked_group6 = heroes[(heroes['percentage'] >= 0.55) & (heroes['percentage'] < 0.60)]
group6_percent = unpicked_group6['percentage'].sum()
unpicked_group7 = heroes[(heroes['percentage'] >= 0.6) & (heroes['percentage'] < 0.70)]
group7_percent = unpicked_group7['percentage'].sum()
unpicked_group8 = heroes[(heroes['percentage'] >= 0.7) & (heroes['percentage'] < 0.80)]
group8_percent = unpicked_group8['percentage'].sum()
unpicked_group9 = heroes[(heroes['percentage'] >= 0.8) & (heroes['percentage'] < 1.0)]
group9_percent = unpicked_group9['percentage'].sum()

In [9]:
# group by abilities
ability_groups = []
high_pick_abil = abilities[abilities['percentage'] >= 7.5]
abil_group1 = abilities[(abilities['percentage'] < 7.5) & (abilities['percentage'] >= 5)]
abil_group2 = abilities[(abilities['percentage'] < 5) & (abilities['percentage'] >= 2.5)]
abil_group3 = abilities[(abilities['percentage'] < 2.5) & (abilities['percentage'] >= 0.5)]
abil_group4 = abilities[(abilities['percentage'] < 0.5)]
ability_groups.append(high_pick_abil)
ability_groups.append(abil_group1)
ability_groups.append(abil_group2)
ability_groups.append(abil_group3)
ability_groups.append(abil_group4)

In [10]:
abilities_to_group = {}
ability_dict = {}
for i in range(len(ability_groups)):
    group = f'group-{i}'
    ability_dict[group] = [0]
    [abilities_to_group.update({ability_id: group}) for ability_id in ability_groups[i].index.to_list()]

In [11]:
abilities_to_group

{25.0: 'group-0',
 16.0: 'group-0',
 17.0: 'group-0',
 18.0: 'group-0',
 19.0: 'group-0',
 15.0: 'group-1',
 20.0: 'group-1',
 14.0: 'group-1',
 21.0: 'group-1',
 22.0: 'group-1',
 13.0: 'group-2',
 11.0: 'group-2',
 23.0: 'group-2',
 24.0: 'group-2',
 12.0: 'group-2',
 26.0: 'group-3',
 9.0: 'group-3',
 10.0: 'group-3',
 8.0: 'group-3',
 27.0: 'group-4',
 7.0: 'group-4',
 6.0: 'group-4',
 5.0: 'group-4',
 28.0: 'group-4',
 4.0: 'group-4',
 3.0: 'group-4',
 29.0: 'group-4',
 2.0: 'group-4',
 30.0: 'group-4',
 1.0: 'group-4',
 31.0: 'group-4',
 32.0: 'group-4',
 33.0: 'group-4',
 34.0: 'group-4',
 35.0: 'group-4',
 36.0: 'group-4',
 37.0: 'group-4',
 38.0: 'group-4',
 40.0: 'group-4',
 41.0: 'group-4',
 39.0: 'group-4',
 43.0: 'group-4',
 71.0: 'group-4',
 42.0: 'group-4',
 45.0: 'group-4',
 48.0: 'group-4',
 49.0: 'group-4',
 50.0: 'group-4',
 57.0: 'group-4',
 65.0: 'group-4',
 44.0: 'group-4',
 151.0: 'group-4'}

In [12]:
group1_percent, group2_percent, group3_percent, group4_percent, group5_percent, group6_percent, group7_percent, group8_percent, group9_percent

NameError: name 'group1_percent' is not defined

### Reduced Hero Count

From our dataset, we saw that there were 111 heroes and 36/111 were picked more than 1% of the time. This left 75/111 heroes under 1%. We reduced these 75 heroes into 9 groups, so now instead of 111 heroes, we have 45 heroes. This can be reduced further if we increase the ranges of percentages from the cell above. For now, we need to create a new dataframe which contains game metadata: match_id, players chosen, team, and win/lose.

In [98]:
### Map heroes to groups
unpicked_dict = {}
unpicked_dict['group1_heroes'] = unpicked_group1.index.to_list()
unpicked_dict['group2_heroes'] = unpicked_group2.index.to_list()
unpicked_dict['group3_heroes'] = unpicked_group3.index.to_list()
unpicked_dict['group4_heroes'] = unpicked_group4.index.to_list()
unpicked_dict['group5_heroes'] = unpicked_group5.index.to_list()
unpicked_dict['group6_heroes'] = unpicked_group6.index.to_list()
unpicked_dict['group7_heroes'] = unpicked_group7.index.to_list()
unpicked_dict['group8_heroes'] = unpicked_group8.index.to_list()
unpicked_dict['group9_heroes'] = unpicked_group9.index.to_list()

picked_heroes_list = (picked_heroes.index.to_list())
picked_dict = {} # key -> group, val -> heroes
heroes_to_group = {}
for i in range(0,len(picked_heroes), 4):
    picked_dict[f'group-{i}'] = picked_heroes[i: i + 9]
    for hero in picked_heroes[i: i + 4].index:
        heroes_to_group[hero] = f'group-{i}'


In [13]:
#players_df.tail()
match_df
# problem -> match_ids from players_df do not map directly to match_ids in match_outcome_df
# for now, we will just map every 10 match_ids from players to every 10 in match_outcome

Unnamed: 0,match_id,start_time,duration,tower_status_radiant,tower_status_dire,barracks_status_dire,barracks_status_radiant,first_blood_time,game_mode,radiant_win,negative_votes,positive_votes,cluster
0,0,1446750112,2375,1982,4,3,63,1,22,True,0,1,155
1,1,1446753078,2582,0,1846,63,0,221,22,False,0,2,154
2,2,1446764586,2716,256,1972,63,48,190,22,False,0,0,132
3,3,1446765723,3085,4,1924,51,3,40,22,False,0,0,191
4,4,1446796385,1887,2047,0,0,63,58,22,True,0,0,156
...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49995,1447829192,3025,1824,0,0,63,117,22,True,0,0,111
49996,49996,1447829181,1451,1982,384,48,63,147,22,True,0,0,204
49997,49997,1447829197,2672,1830,0,0,63,130,22,True,0,0,111
49998,49998,1447829189,2211,1982,4,3,63,645,22,True,0,0,151


In [14]:
# Create new dataframe for classification
classification_dataframe = match_df[['match_id']] # 1 row for dire
classification_dataframe = pd.concat([classification_dataframe,classification_dataframe], ignore_index=True)
ability_df = pd.DataFrame(data=ability_dict)
classification_dataframe = pd.concat([classification_dataframe, ability_df], axis=1)
# add columns for each hero. TODO: add unpicked groupings
#heroes = {}; [heroes.update({group: [0]}) for group in list(picked_dict.keys())]
#heroes_df = pd.DataFrame(data=heroes)
# join the heroes dataframe with the classification and add team/win columns
#classification_dataframe = pd.concat([classification_dataframe, heroes_df], axis=1)
# add first blood/duration
#fb_df = match_df[['first_blood_time']]
#fb_df = pd.concat([fb_df, fb_df], ignore_index=True)
#dur_df = match_df[['duration']]
#dur_df = pd.concat([dur_df, dur_df], ignore_index=True)
classification_dataframe['unpicked'] = 0
#classification_dataframe['dire'] = 0; classification_dataframe['win'] = 0
#classification_dataframe['fb_time'] = fb_df['first_blood_time'] 
#classification_dataframe['duration'] = dur_df['duration'] 

In [15]:
classification_dataframe

Unnamed: 0,match_id,group-0,group-1,group-2,group-3,group-4,unpicked
0,0,0.0,0.0,0.0,0.0,0.0,0
1,1,,,,,,0
2,2,,,,,,0
3,3,,,,,,0
4,4,,,,,,0
...,...,...,...,...,...,...,...
99995,49995,,,,,,0
99996,49996,,,,,,0
99997,49997,,,,,,0
99998,49998,,,,,,0


### Classification_dataframe

From the above, we now have a dataframe with a match_id, heroes, team, and win column.
When this is populated, we should have enough data for classification itself.

In [16]:
# now we populate the above dataframe with data from players_df

# Let's grab the hero picks for each game
result = [x for x in players_df['hero_id']]
result = [result[i: i + 5] for i in range(0, len(result), 5)]


In [17]:
# grab the abilities used by each player in each game 
result2 = [x for x in players_df['unit_order_train_ability']]
result2 = [result2[i: i + 5] for i in range(0, len(result2), 5)]

len(result2)

100000

In [34]:
print(result2[5:10])

[[22.0, 20.0, 21.0, 19.0, 17.0], [19.0, 24.0, 25.0, 18.0, 24.0], [25.0, 25.0, 20.0, 17.0, 25.0], [20.0, 20.0, 14.0, 13.0, 18.0], [11.0, 11.0, 13.0, 16.0, 14.0]]


In [18]:
match_index = 0
data = []
# lets format our hero picks to match our dataframe
for i in range(0, len(result2), 2):
    # get the heroes picked for each team
    radiant = result2[i]
    dire = result2[i+1]
    # map column names
    #radiant_heroes_col = []
    # dire_heroes_col = []

    radiant_heroes_col = [abilities_to_group[i] if i in  abilities_to_group else f'unpicked'  for i in radiant]
    dire_heroes_col = [abilities_to_group[i] if i in  abilities_to_group else f'unpicked'  for i in dire]
    radiant_data = [1 for column in radiant_heroes_col]
    dire_data = [1 for column in dire_heroes_col]
    # map extra columns
    radiant_heroes_col.append('dire')
    dire_heroes_col.append('dire')
    radiant_heroes_col.append('win')
    dire_heroes_col.append('win')
    # add team mapping
    radiant_data.append(0)
    dire_data.append(1)
    # add if team won
    radiant_data.append(1 if match_df.loc[match_index]['radiant_win'] else 0)
    dire_data.append(0 if match_df.loc[match_index]['radiant_win'] else 1)
    # update dataframe
    classification_dataframe.loc[match_index, radiant_heroes_col] = radiant_data
    classification_dataframe.loc[match_index + 50000, dire_heroes_col] = dire_data

    match_index += 1
    
    
    

In [19]:
classification_dataframe.fillna(0, inplace=True)

In [44]:
# verify results are true
classification_dataframe

Unnamed: 0,match_id,group-0,group-1,group-2,group-3,group-4,unpicked,dire,win
0,0,1.0,1.0,1.0,0.0,0.0,0,0.0,1.0
1,1,1.0,1.0,1.0,0.0,0.0,0,0.0,0.0
2,2,1.0,1.0,1.0,0.0,0.0,0,0.0,0.0
3,3,1.0,0.0,1.0,0.0,0.0,0,0.0,0.0
4,4,1.0,1.0,1.0,0.0,0.0,0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
99995,49995,1.0,0.0,1.0,0.0,0.0,0,1.0,0.0
99996,49996,0.0,0.0,1.0,1.0,1.0,0,1.0,0.0
99997,49997,1.0,1.0,0.0,0.0,0.0,0,1.0,0.0
99998,49998,1.0,1.0,1.0,0.0,0.0,0,1.0,0.0


In [114]:
classification_dataframe.to_csv('./classification-parsed-unpick1.csv',index=False)

In [43]:
classification_dataframe.columns

Index(['match_id', 'group-0', 'group-1', 'group-2', 'group-3', 'group-4',
       'unpicked', 'dire', 'win'],
      dtype='object')

### Classification

With our dataset, we can now classify it using scikit learn's random forest classifier.

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [21]:
X=classification_dataframe[['group-0', 'group-1', 'group-2', 'group-3', 'group-4',
       'unpicked', 'dire']]  # Features
y=classification_dataframe['win']  # Labels

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7) # 70% training and 30% test

In [22]:
classification_dataframe

Unnamed: 0,match_id,group-0,group-1,group-2,group-3,group-4,unpicked,dire,win
0,0,1.0,1.0,1.0,0.0,0.0,0,0.0,1.0
1,1,1.0,1.0,1.0,0.0,0.0,0,0.0,0.0
2,2,1.0,1.0,1.0,0.0,0.0,0,0.0,0.0
3,3,1.0,0.0,1.0,0.0,0.0,0,0.0,0.0
4,4,1.0,1.0,1.0,0.0,0.0,0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
99995,49995,1.0,0.0,1.0,0.0,0.0,0,1.0,0.0
99996,49996,0.0,0.0,1.0,1.0,1.0,0,1.0,0.0
99997,49997,1.0,1.0,0.0,0.0,0.0,0,1.0,0.0
99998,49998,1.0,1.0,1.0,0.0,0.0,0,1.0,0.0


In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)
clf_knn = KNeighborsClassifier(n_neighbors=10)
#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)
clf_knn.fit(X_train,y_train)

y_pred=clf.predict(X_test)
y_pred_knn=clf_knn.predict(X_test)

In [24]:
len(y_test)

70000

In [25]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy rf:",metrics.accuracy_score(y_test, y_pred))
print("Accuracy knn:",metrics.accuracy_score(y_test, y_pred_knn))

Accuracy rf: 0.5624285714285714
Accuracy knn: 0.5391428571428571


In [32]:
from sklearn.metrics import confusion_matrix
matrix_rf = confusion_matrix(y_test, y_pred)
matrix_knn = confusion_matrix(y_test, y_pred_knn)

In [33]:
matrix, matrix_knn

(array([[ 9526, 25457],
        [ 5173, 29844]]),
 array([[16608, 18375],
        [13885, 21132]]))

In [37]:
pd.DataFrame(matrix, columns=["Actual Negative", "Actual Positive"], index=["Predicted Negative", "Predicted Positive"])

Unnamed: 0,Actual Negative,Actual Positive
Predicted Negative,9526,25457
Predicted Positive,5173,29844


In [38]:
pd.DataFrame(matrix_knn, columns=["Actual Negative", "Actual Positive"], index=["Predicted Negative", "Predicted Positive"])

Unnamed: 0,Actual Negative,Actual Positive
Predicted Negative,16608,18375
Predicted Positive,13885,21132


In [42]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, target_names=["Lose", "Win"]))

              precision    recall  f1-score   support

        Lose       0.65      0.27      0.38     34983
         Win       0.54      0.85      0.66     35017

    accuracy                           0.56     70000
   macro avg       0.59      0.56      0.52     70000
weighted avg       0.59      0.56      0.52     70000



In [43]:
print(classification_report(y_test, y_pred_knn, target_names=["Lose", "Win"]))

              precision    recall  f1-score   support

        Lose       0.54      0.47      0.51     34983
         Win       0.53      0.60      0.57     35017

    accuracy                           0.54     70000
   macro avg       0.54      0.54      0.54     70000
weighted avg       0.54      0.54      0.54     70000

