In [1]:
# import required libraries
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
from yellowbrick.cluster import KElbowVisualizer

#import data
matches = pd.read_csv('test_player.csv')
champions = pd.read_csv('hero_names_category.csv')
heroes = pd.read_csv('hero_names.csv')

In [None]:
#remove account_id column
matches = matches.drop("account_id",axis=1)

In [None]:
#Returns a map of heroId to name
def mapId2Name():
    mapheroes = {}
    for row in heroes.itertuples():
        mapheroes[row.hero_id] = row.localized_name
    return mapheroes

#Returns a map of name to Id
def mapName2Id():
    mapId = {}
    for row in heroes.itertuples():
        mapheroes[row.localized_name] = row.hero_id
    return mapId


def searchHeroName(heroId):
    mapToReturn = mapId2Name()
    return mapToReturn[heroId]


def searchHeroId(heroName):
    mapToReturn = mapName2Id()
    return mapToReturn[heroName]

In [None]:
#understanding player slot
matches.head(n=5)

In [None]:
#collapse data into columns
#The interpretation of this table is that the row represent that particular match ID. The column names are the player
#slots. They are not the individual players but rather the slots that the players play in. 
#The value within each cell is the hero id. 
competitor_rows = pd.pivot_table(matches, values='hero_id', index=['match_id'], columns='player_slot')
competitor_rows.head(n=5)

In [None]:
#Current each row consists of two teams. Therefore, we split dataframe vertically to separate the two teams.
#The 2nd section of the dataframe now gets appended to the end of the 1 section.
#Each row in the resultant dataframe corresponds with only one team rather than two.
section = competitor_rows
sectionA,sectionB  = np.split(section, 2, axis=1)
sectionB.columns = sectionA.columns
sectionA.append(sectionB,ignore_index = True)
sectionA.head(n=5)

In [None]:
#The table above now gets transformed into a binary matrix. Each column name corresponds to the hero id. 
#Each row corresponds to a particular match id. However, it's worth mentioning that the match id values gets reset to 
#start at one but the current situation does not consider the match id for clustering so the result is not affected. 

#The interpretation of the binary matrix starts off with each row representing a match for one team
#Each column in the binary matrix corresponds to a hero Id
#The value of each cell represent whether that particular hero is chosen for that particular match 
#for that specific team.

#It's worth mentioning that the dataframe starts off at column index 0. However, hero Ids start off at 1 
#This means that every player's index is offset by one. This situation has been accounted for. 

dataset=[]
for row in sectionA.iterrows():
    index, data = row
    dataset.append(data.tolist())
#it transforms the input dataset (a Python list of lists) into a one-hot encoded NumPy boolean array
oht = TransactionEncoder()
oht_ary = oht.fit(dataset).transform(dataset)
df = pd.DataFrame(oht_ary, columns=oht.columns_)
df.head(n=5)

In [None]:
#Converting the binary string matrix to a binary integer matrix
# 0: False
# 1: True
df = df*1
df.head()

In [None]:
# Instantiate the clustering model and visualizer
model = KMeans()
visualizer = KElbowVisualizer(model, k=(4,10),metric='calinski_harabaz')

visualizer.fit(df)    # Fit the data to the visualizer
visualizer.poof()    # Draw/show/poof the data

In [19]:
visualizer.poof()    # Draw/show/poof the data

In [19]:
# create kmeans object and call fit
#Applies Kmeans algorithm
KM = KMeans(n_clusters=5, init='k-means++', random_state=170)
KM = KM.fit(df)
# print("The cluster centroids are: \n", KM.cluster_centers_)
# print("Cluster_label:\n", KM.labels_)

In [20]:
#Based off the results collected, each index corresponds with a HeroID. However, an array starts at 0 which offets
#every monster to the left. Additionally, the dataset not account for hero ID: 24 but there is still a value at index.
#This means that every monster with an original hero id greater than 23 is offset to the left by two. 
#Therefore, the below method brings back the monster to its rightful index
def correctCluster(cluster):
    resultArr = []
    for num in cluster:
        resultArr.append(num+2) if num>23 else resultArr.append(num+1)
    return resultArr

In [21]:
cluster1 = KM.cluster_centers_[0]
cluster2 = KM.cluster_centers_[1]
cluster3 = KM.cluster_centers_[2]
cluster4 = KM.cluster_centers_[3]
cluster5 = KM.cluster_centers_[4]

#This map's keys would be the cluster's name and each corresponding value would be a tuple of the top 5 hero names
clusterMapName = {}
#This map's keys would be the cluster's name and each corresponding value would be a tuple of the top 5 hero Ids
clusterMapId = {}

##Cluster 1

#The function below sorts the array index from lowest to highest based off the value of each elements
#This means that the first 5 index slots that has highest values will be returned
#For example: given array=[5,1,4,8,0], the result would be [4,1,2,0,3]
A = sorted(range(len(cluster1)), key=lambda i: cluster1[i])[-5:]
top5HerosId = correctCluster(A)
hero1 = searchHeroName(top5HerosId[0])
hero2 = searchHeroName(top5HerosId[1])
hero3 = searchHeroName(top5HerosId[2])
hero4 = searchHeroName(top5HerosId[3])
hero5 = searchHeroName(top5HerosId[4])
clusterMapId['Cluster1'] = (top5HerosId[0],top5HerosId[1],top5HerosId[2],top5HerosId[3],top5HerosId[4])
clusterMapName['Cluster1']=(hero1,hero2,hero3,hero4,hero5)

##Cluster 2
A = sorted(range(len(cluster2)), key=lambda i: cluster2[i])[-5:]
top5HerosId = correctCluster(A)
hero1 = searchHeroName(top5HerosId[0])
hero2 = searchHeroName(top5HerosId[1])
hero3 = searchHeroName(top5HerosId[2])
hero4 = searchHeroName(top5HerosId[3])
hero5 = searchHeroName(top5HerosId[4])
clusterMapId['Cluster2'] = (top5HerosId[0],top5HerosId[1],top5HerosId[2],top5HerosId[3],top5HerosId[4])
clusterMapName['Cluster2']=(hero1,hero2,hero3,hero4,hero5)

##Cluster 3
A = sorted(range(len(cluster3)), key=lambda i: cluster3[i])[-5:]
top5HerosId = correctCluster(A)
hero1 = searchHeroName(top5HerosId[0])
hero2 = searchHeroName(top5HerosId[1])
hero3 = searchHeroName(top5HerosId[2])
hero4 = searchHeroName(top5HerosId[3])
hero5 = searchHeroName(top5HerosId[4])
clusterMapId['Cluster3'] = (top5HerosId[0],top5HerosId[1],top5HerosId[2],top5HerosId[3],top5HerosId[4])
clusterMapName['Cluster3']=(hero1,hero2,hero3,hero4,hero5)

##Cluster 4
A = sorted(range(len(cluster4)), key=lambda i: cluster4[i])[-5:]
top5HerosId = correctCluster(A)
hero1 = searchHeroName(top5HerosId[0])
hero2 = searchHeroName(top5HerosId[1])
hero3 = searchHeroName(top5HerosId[2])
hero4 = searchHeroName(top5HerosId[3])
hero5 = searchHeroName(top5HerosId[4])
clusterMapId['Cluster4'] = (top5HerosId[0],top5HerosId[1],top5HerosId[2],top5HerosId[3],top5HerosId[4])
clusterMapName['Cluster4']=(hero1,hero2,hero3,hero4,hero5)

##Cluster 5
A = sorted(range(len(cluster5)), key=lambda i: cluster5[i])[-5:]
top5HerosId = correctCluster(A)
hero1 = searchHeroName(top5HerosId[0])
hero2 = searchHeroName(top5HerosId[1])
hero3 = searchHeroName(top5HerosId[2])
hero4 = searchHeroName(top5HerosId[3])
hero5 = searchHeroName(top5HerosId[4])
clusterMapId['Cluster5'] = (top5HerosId[0],top5HerosId[1],top5HerosId[2],top5HerosId[3],top5HerosId[4])
clusterMapName['Cluster5']=(hero1,hero2,hero3,hero4,hero5)


In [22]:
#Table of hero names for a given cluster
pd.DataFrame(clusterMapName)

Unnamed: 0,Cluster1,Cluster2,Cluster3,Cluster4,Cluster5
0,Juggernaut,Winter Wyvern,Clockwerk,Axe,Skywrath Mage
1,Skywrath Mage,Clockwerk,Juggernaut,Skywrath Mage,Tidehunter
2,Mirana,Skywrath Mage,Skywrath Mage,Juggernaut,Mirana
3,Invoker,Juggernaut,Tidehunter,Mirana,Juggernaut
4,Venomancer,Earth Spirit,Phantom Lancer,Silencer,Zeus


In [23]:
#Table of hero ids for a given cluster
resultIds = pd.DataFrame(clusterMapId)
resultIds

Unnamed: 0,Cluster1,Cluster2,Cluster3,Cluster4,Cluster5
0,8,112,51,2,101
1,101,51,8,101,29
2,9,101,101,8,9
3,74,8,29,9,8
4,40,107,12,75,22


In [24]:
#Given a hero id, we return the role that hero plays
def find_role(heroId):
    champ_row = champions.loc[champions["hero_id"] == heroId]
    role =  champ_row["role"]
    role = role.values
    return ''.join(role)

#In the table of hero Ids, we apply the above function into each cell
resultCategory = resultIds.applymap(lambda x: find_role(x))
#A table of what types of roles each cluster has
resultCategory

Unnamed: 0,Cluster1,Cluster2,Cluster3,Cluster4,Cluster5
0,carry,support,initiator,initiator,nuker
1,nuker,initiator,carry,nuker,initiator
2,escape,nuker,nuker,carry,escape
3,nuker,carry,initiator,escape,carry
4,support,nuker,carry,disabler,nuker


In [2]:
#update - using the elbow method to calculate k 
from sklearn.datasets import make_blobs

In [3]:
# Create synthetic dataset with 8 random clusters
X, y = make_blobs(centers=8, n_features=12, shuffle=True, random_state=42)

In [4]:
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

# Instantiate the clustering model and visualizer
model = KMeans()
visualizer = KElbowVisualizer(model, k=(4,12))

visualizer.fit(X)    # Fit the data to the visualizer
visualizer.poof()    # Draw/show/poof the data