In [1]:
# import required libraries
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import matplotlib


#import data
# matches = pd.read_csv('test_player.csv')
# champions = pd.read_csv('hero_names_category.csv')
# heroes = pd.read_csv('hero_names.csv')
matches = pd.read_csv('./dota-2-matches/test_player.csv')
champions = pd.read_csv('./dota-2-matches/hero_names_category.csv')

heroes = pd.read_csv('./dota-2-matches/hero_names.csv')

In [2]:
#remove account_id column
matches = matches.drop("account_id",axis=1)

In [3]:
#Returns a map of heroId to name
def mapId2Name():
    mapheroes = {}
    for row in heroes.itertuples():
        mapheroes[row.hero_id] = row.localized_name
    return mapheroes

#Returns a map of name to Id
def mapName2Id():
    mapId = {}
    for row in heroes.itertuples():
        mapheroes[row.localized_name] = row.hero_id
    return mapId


def searchHeroName(heroId):
    mapToReturn = mapId2Name()
    return mapToReturn[heroId]


def searchHeroId(heroName):
    mapToReturn = mapName2Id()
    return mapToReturn[heroName]

In [4]:
#understanding player slot
matches.head(n=5)

Unnamed: 0,match_id,hero_id,player_slot
0,50000,96,0
1,50000,84,1
2,50000,46,2
3,50000,85,3
4,50000,39,4


In [5]:
#collapse data into columns
#The interpretation of this table is that the row represent that particular match ID. The column names are the player
#slots. They are not the individual players but rather the slots that the players play in. 
#The value within each cell is the hero id. 
competitor_rows = pd.pivot_table(matches, values='hero_id', index=['match_id'], columns='player_slot')
competitor_rows.head(n=5)

player_slot,0,1,2,3,4,128,129,130,131,132
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
50000,96,84,46,85,39,9,75,106,74,62
50001,44,10,57,2,106,58,61,21,18,14
50002,74,7,42,99,88,69,8,25,26,79
50003,44,15,110,56,94,2,101,32,7,72
50004,98,26,73,51,46,2,106,50,65,21


In [6]:
#Current each row consists of two teams. Therefore, we split dataframe vertically to separate the two teams.
#The 2nd section of the dataframe now gets appended to the end of the 1 section.
#Each row in the resultant dataframe corresponds with only one team rather than two.
section = competitor_rows
sectionA,sectionB  = np.split(section, 2, axis=1)
sectionB.columns = sectionA.columns
sectionA.append(sectionB,ignore_index = True)
sectionA.head(n=5)

player_slot,0,1,2,3,4
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
50000,96,84,46,85,39
50001,44,10,57,2,106
50002,74,7,42,99,88
50003,44,15,110,56,94
50004,98,26,73,51,46


In [7]:
#The table above now gets transformed into a binary matrix. Each column name corresponds to the hero id. 
#Each row corresponds to a particular match id. However, it's worth mentioning that the match id values gets reset to 
#start at one but the current situation does not consider the match id for clustering so the result is not affected. 

#The interpretation of the binary matrix starts off with each row representing a match for one team
#Each column in the binary matrix corresponds to a hero Id
#The value of each cell represent whether that particular hero is chosen for that particular match 
#for that specific team.

#It's worth mentioning that the dataframe starts off at column index 0. However, hero Ids start off at 1 
#This means that every player's index is offset by one. This situation has been accounted for. 

dataset=[]
for row in sectionA.iterrows():
    index, data = row
    dataset.append(data.tolist())
#it transforms the input dataset (a Python list of lists) into a one-hot encoded NumPy boolean array
oht = TransactionEncoder()
oht_ary = oht.fit(dataset).transform(dataset)
df = pd.DataFrame(oht_ary, columns=oht.columns_)
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,102,103,104,105,106,107,109,110,111,112
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
2,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [8]:
#Converting the binary string matrix to a binary integer matrix
# 0: False
# 1: True
df = df*1
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,102,103,104,105,106,107,109,110,111,112
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# create kmeans object and call fit
#Applies Kmeans algorithm
KM = KMeans(n_clusters=4, init='k-means++', random_state=170)
KM = KM.fit(df)
print("The cluster centroids are: \n", KM.cluster_centers_)
print("Cluster_label:\n", KM.labels_)
print("Cluster_inertia:\n", KM.inertia_)

The cluster centroids are: 
 [[  2.10921516e-05   9.41342726e-02   6.49638270e-02   3.55613676e-02
    3.07945414e-02   8.30819852e-02   2.82212989e-02   1.23009428e-01
    1.18242602e-01   7.59317458e-02   1.79283289e-02  -3.11417558e-14
    4.49262829e-02   2.14929025e-02   1.06916117e-01   2.35177490e-02
    3.06047120e-02   3.57511970e-02   3.61097636e-02   7.61004830e-02
    4.54114024e-02   1.76220150e-13   5.73706524e-02   2.99719474e-02
    1.04638164e-01   7.35694248e-02   3.96321529e-02   1.08371475e-01
    2.63019131e-02   7.72183670e-02   4.89548839e-02   4.49684672e-02
    2.43192508e-02   3.61097636e-02   4.70987745e-02   6.67566598e-02
    2.07757693e-02   1.40051887e-02   1.49859737e-01   3.15960431e-02
    3.09210943e-02   7.61426673e-02   2.11976124e-02   7.53833499e-02
    1.83923562e-02   9.49990509e-02   5.05367953e-02   2.58167936e-02
    2.40661450e-02  -4.78922457e-14   4.38927675e-02   1.48488747e-02
    3.25240978e-02   2.62175445e-02   4.36185695e-02   2.6301

In [10]:
#Based off the results collected, each index corresponds with a HeroID. However, an array starts at 0 which offets
#every monster to the left. Additionally, the dataset not account for hero ID: 24 but there is still a value at index.
#This means that every monster with an original hero id greater than 23 is offset to the left by two. 
#Therefore, the below method brings back the monster to its rightful index
def correctCluster(cluster):
    resultArr = []
    for num in cluster:
        resultArr.append(num+2) if num>23 else resultArr.append(num+1)
    return resultArr

In [11]:
cluster1 = KM.cluster_centers_[0]
cluster2 = KM.cluster_centers_[1]
cluster3 = KM.cluster_centers_[2]
cluster4 = KM.cluster_centers_[3]
# cluster5 = KM.cluster_centers_[4]
# cluster6 = KM.cluster_centers_[5]
# cluster7 = KM.cluster_centers_[6]
# cluster8 = KM.cluster_centers_[7]
# cluster9 = KM.cluster_centers_[8]
# cluster10 = KM.cluster_centers_[9]

#This map's keys would be the cluster's name and each corresponding value would be a tuple of the top 5 hero names
clusterMapName = {}
#This map's keys would be the cluster's name and each corresponding value would be a tuple of the top 5 hero Ids
clusterMapId = {}

##Cluster 1

#The function below sorts the array index from lowest to highest based off the value of each elements
#This means that the first 5 index slots that has highest values will be returned
#For example: given array=[5,1,4,8,0], the result would be [4,1,2,0,3]
A = sorted(range(len(cluster1)), key=lambda i: cluster1[i])[-5:]
top5HerosId = correctCluster(A)
hero1 = searchHeroName(top5HerosId[0])
hero2 = searchHeroName(top5HerosId[1])
hero3 = searchHeroName(top5HerosId[2])
hero4 = searchHeroName(top5HerosId[3])
hero5 = searchHeroName(top5HerosId[4])
clusterMapId['Cluster1'] = (top5HerosId[0],top5HerosId[1],top5HerosId[2],top5HerosId[3],top5HerosId[4])
clusterMapName['Cluster1']=(hero1,hero2,hero3,hero4,hero5)

##Cluster 2
A = sorted(range(len(cluster2)), key=lambda i: cluster2[i])[-5:]
top5HerosId = correctCluster(A)
hero1 = searchHeroName(top5HerosId[0])
hero2 = searchHeroName(top5HerosId[1])
hero3 = searchHeroName(top5HerosId[2])
hero4 = searchHeroName(top5HerosId[3])
hero5 = searchHeroName(top5HerosId[4])
clusterMapId['Cluster2'] = (top5HerosId[0],top5HerosId[1],top5HerosId[2],top5HerosId[3],top5HerosId[4])
clusterMapName['Cluster2']=(hero1,hero2,hero3,hero4,hero5)

##Cluster 3
A = sorted(range(len(cluster3)), key=lambda i: cluster3[i])[-5:]
top5HerosId = correctCluster(A)
hero1 = searchHeroName(top5HerosId[0])
hero2 = searchHeroName(top5HerosId[1])
hero3 = searchHeroName(top5HerosId[2])
hero4 = searchHeroName(top5HerosId[3])
hero5 = searchHeroName(top5HerosId[4])
clusterMapId['Cluster3'] = (top5HerosId[0],top5HerosId[1],top5HerosId[2],top5HerosId[3],top5HerosId[4])
clusterMapName['Cluster3']=(hero1,hero2,hero3,hero4,hero5)

##Cluster 4
A = sorted(range(len(cluster4)), key=lambda i: cluster4[i])[-5:]
top5HerosId = correctCluster(A)
hero1 = searchHeroName(top5HerosId[0])
hero2 = searchHeroName(top5HerosId[1])
hero3 = searchHeroName(top5HerosId[2])
hero4 = searchHeroName(top5HerosId[3])
hero5 = searchHeroName(top5HerosId[4])
clusterMapId['Cluster4'] = (top5HerosId[0],top5HerosId[1],top5HerosId[2],top5HerosId[3],top5HerosId[4])
clusterMapName['Cluster4']=(hero1,hero2,hero3,hero4,hero5)

#Cluster 5
# A = sorted(range(len(cluster5)), key=lambda i: cluster5[i])[-5:]
# top5HerosId = correctCluster(A)
# hero1 = searchHeroName(top5HerosId[0])
# hero2 = searchHeroName(top5HerosId[1])
# hero3 = searchHeroName(top5HerosId[2])
# hero4 = searchHeroName(top5HerosId[3])
# hero5 = searchHeroName(top5HerosId[4])
# clusterMapId['Cluster5'] = (top5HerosId[0],top5HerosId[1],top5HerosId[2],top5HerosId[3],top5HerosId[4])
# clusterMapName['Cluster5']=(hero1,hero2,hero3,hero4,hero5)

# ##Cluster 6
# A = sorted(range(len(cluster6)), key=lambda i: cluster6[i])[-5:]
# top5HerosId = correctCluster(A)
# hero1 = searchHeroName(top5HerosId[0])
# hero2 = searchHeroName(top5HerosId[1])
# hero3 = searchHeroName(top5HerosId[2])
# hero4 = searchHeroName(top5HerosId[3])
# hero5 = searchHeroName(top5HerosId[4])
# clusterMapId['Cluster6'] = (top5HerosId[0],top5HerosId[1],top5HerosId[2],top5HerosId[3],top5HerosId[4])
# clusterMapName['Cluster6']=(hero1,hero2,hero3,hero4,hero5)

# ##Cluster 7
# A = sorted(range(len(cluster7)), key=lambda i: cluster7[i])[-5:]
# top5HerosId = correctCluster(A)
# hero1 = searchHeroName(top5HerosId[0])
# hero2 = searchHeroName(top5HerosId[1])
# hero3 = searchHeroName(top5HerosId[2])
# hero4 = searchHeroName(top5HerosId[3])
# hero5 = searchHeroName(top5HerosId[4])
# clusterMapId['Cluster7'] = (top5HerosId[0],top5HerosId[1],top5HerosId[2],top5HerosId[3],top5HerosId[4])
# clusterMapName['Cluster7']=(hero1,hero2,hero3,hero4,hero5)

# ##Cluster 8
# A = sorted(range(len(cluster8)), key=lambda i: cluster8[i])[-5:]
# top5HerosId = correctCluster(A)
# hero1 = searchHeroName(top5HerosId[0])
# hero2 = searchHeroName(top5HerosId[1])
# hero3 = searchHeroName(top5HerosId[2])
# hero4 = searchHeroName(top5HerosId[3])
# hero5 = searchHeroName(top5HerosId[4])
# clusterMapId['Cluster8'] = (top5HerosId[0],top5HerosId[1],top5HerosId[2],top5HerosId[3],top5HerosId[4])
# clusterMapName['Cluster8']=(hero1,hero2,hero3,hero4,hero5)

# ##Cluster 9
# A = sorted(range(len(cluster9)), key=lambda i: cluster9[i])[-5:]
# top5HerosId = correctCluster(A)
# hero1 = searchHeroName(top5HerosId[0])
# hero2 = searchHeroName(top5HerosId[1])
# hero3 = searchHeroName(top5HerosId[2])
# hero4 = searchHeroName(top5HerosId[3])
# hero5 = searchHeroName(top5HerosId[4])
# clusterMapId['Cluster9'] = (top5HerosId[0],top5HerosId[1],top5HerosId[2],top5HerosId[3],top5HerosId[4])
# clusterMapName['Cluster9']=(hero1,hero2,hero3,hero4,hero5)

# ##Cluster 10
# A = sorted(range(len(cluster10)), key=lambda i: cluster10[i])[-5:]
# top5HerosId = correctCluster(A)
# hero1 = searchHeroName(top5HerosId[0])
# hero2 = searchHeroName(top5HerosId[1])
# hero3 = searchHeroName(top5HerosId[2])
# hero4 = searchHeroName(top5HerosId[3])
# hero5 = searchHeroName(top5HerosId[4])
# clusterMapId['Cluster10'] = (top5HerosId[0],top5HerosId[1],top5HerosId[2],top5HerosId[3],top5HerosId[4])
# clusterMapName['Cluster10']=(hero1,hero2,hero3,hero4,hero5)


In [12]:
#Table of hero names for a given cluster
pd.DataFrame(clusterMapName)

Unnamed: 0,Cluster1,Cluster2,Cluster3,Cluster4
0,Skywrath Mage,Skywrath Mage,Mirana,Mirana
1,Juggernaut,Tidehunter,Tidehunter,Juggernaut
2,Earth Spirit,Mirana,Skywrath Mage,Skywrath Mage
3,Invoker,Juggernaut,Clockwerk,Tidehunter
4,Venomancer,Zeus,Silencer,Phantom Lancer


In [13]:
#Table of hero ids for a given cluster
resultIds = pd.DataFrame(clusterMapId)
resultIds

Unnamed: 0,Cluster1,Cluster2,Cluster3,Cluster4
0,101,101,9,9
1,8,29,29,8
2,107,9,101,101
3,74,8,51,29
4,40,22,75,12


In [14]:
#Given a hero id, we return the role that hero plays
def find_role(heroId):
    champ_row = champions.loc[champions["hero_id"] == heroId]
    role =  champ_row["role"]
    role = role.values
    return ''.join(role)

#In the table of hero Ids, we apply the above function into each cell
resultCategory = resultIds.applymap(lambda x: find_role(x))
#A table of what types of roles each cluster has
resultCategory

Unnamed: 0,Cluster1,Cluster2,Cluster3,Cluster4
0,nuker,nuker,escape,escape
1,carry,initiator,initiator,carry
2,nuker,escape,nuker,nuker
3,nuker,carry,initiator,initiator
4,support,nuker,disabler,carry
