# Data Mining

In [67]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import kmedoids
import scipy.spatial.distance as spsd
from sklearn.metrics import silhouette_score
from rule_miner import RuleMiner
from sklearn.cluster import DBSCAN

np.random.seed(2)
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Preparing the Dataset for Data Mining

The csv files are imported to dataframes. Specifically, the Showdown dataset will be used for data mining purposes, while the Pokemon Stats dataset is used to convert each Pokemon into an integer. That integer is the index of said Pokemon in the Pokemon Stats dataset. This is done to make one-hot-encoding easier.

The data mining also uses the cleaned up version of the Showdown dataset with no duplicate players to determine the meta based on the choices of multiple players.

In [68]:
pokemon_df = pd.read_csv('showdown.csv').loc[:, ['Pokemon 1','Pokemon 2','Pokemon 3','Pokemon 4','Pokemon 5','Pokemon 6']]
pokedex_df = pd.read_csv('dataset/bulbagarden/pokemon_gen5_stats.csv')
pokedex_df

Unnamed: 0,Pokedex number,Name,HP,Attack,Special Attack,Defense,Special Defense,Speed,Stat total,Stat average
0,1,Bulbasaur,45,49,49,65,65,45,318,53.00
1,2,Ivysaur,60,62,63,80,80,60,405,67.50
2,3,Venusaur,80,82,83,100,100,80,525,87.50
3,4,Charmander,39,52,43,60,50,65,309,51.50
4,5,Charmeleon,58,64,58,80,65,80,405,67.50
...,...,...,...,...,...,...,...,...,...,...
665,647,Keldeo,91,72,90,129,90,108,580,96.67
666,647,Keldeo-Resolute,91,72,90,129,90,108,580,96.67
667,648,Meloetta,100,77,77,128,128,90,600,100.00
668,648,Meloetta-Pirouette,100,128,90,77,77,128,600,100.00


Each Pokemon team in the Showdown dataset is converted to numerical baskets with each Pokemon being represented using their index number in the Pokedex data frame.

In [69]:
baskets = []
for index, row in pokemon_df.iterrows():
    pkmn = row
    pkmn = [int(pokedex_df[pokedex_df['Name']==x].index[0]) for x in pkmn]
    baskets.append(pkmn)

for i, basket in enumerate(baskets):
    print('Basket', i, basket)

Basket 0 [394, 285, 247, 606, 442, 592]
Basket 1 [247, 64, 380, 611, 661, 542]
Basket 2 [247, 604, 120, 64, 495, 226]
Basket 3 [247, 520, 204, 372, 477, 285]
Basket 4 [654, 185, 72, 50, 611, 384]
Basket 5 [247, 486, 611, 661, 64, 379]
Basket 6 [93, 148, 64, 459, 226, 211]
Basket 7 [459, 112, 185, 72, 226, 384]
Basket 8 [226, 427, 592, 247, 379, 495]
Basket 9 [506, 450, 37, 120, 2, 93]
Basket 10 [455, 611, 606, 380, 450, 211]
Basket 11 [450, 495, 611, 606, 134, 211]
Basket 12 [606, 211, 120, 495, 37, 661]
Basket 13 [450, 148, 285, 120, 650, 211]
Basket 14 [455, 611, 606, 380, 450, 211]
Basket 15 [185, 384, 663, 657, 231, 665]
Basket 16 [285, 64, 450, 483, 211, 665]
Basket 17 [37, 349, 473, 584, 450, 476]
Basket 18 [442, 606, 592, 285, 247, 323]
Basket 19 [64, 450, 427, 226, 247, 93]
Basket 20 [247, 64, 285, 648, 661, 211]
Basket 21 [112, 201, 90, 450, 148, 231]
Basket 22 [64, 648, 661, 611, 604, 247]
Basket 23 [380, 372, 606, 611, 236, 477]
Basket 24 [285, 380, 652, 247, 665, 211]
Baske

The baskets are then turned to a data frame of one-hot-encoded values. This is done since some algorithms require one-hot-encoding to work and is more efficient in general.

In [70]:
bool_df = pd.DataFrame([[0 for _ in range(670)] for _ in range(1790)], columns=[i for i in range(670)])

for i, basket in enumerate(baskets):
    bool_df.iloc[i, basket] = 1

bool_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,660,661,662,663,664,665,666,667,668,669
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1785,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1786,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1787,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1788,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


This is a Helper Function to revert the Pokedex Index number to Pokemon Name for analysis.

In [71]:
def revert(list, key_df):
    return [key_df.loc[x,'Name'] for x in list]

## Association Rule Mining

The rule miner is created for associate rule mining.

In [72]:
# Choose a confidence level and support level. Can opt to use percentage of total instead.
rule_miner = RuleMiner(support_t=1790 / 10,confidence_t=0.5)

All itemsets that are equal to or above the support threshold are printed.

In [73]:
all_frequent_itemsets = rule_miner.get_frequent_itemsets(bool_df)

for itemset in all_frequent_itemsets:
    print( revert(itemset, pokedex_df) )

['Tentacruel', 'Politoed']
['Politoed', 'Ferrothorn']
['Latios', 'Tyranitar']
['Landorus-Therian', 'Tyranitar']
['Ferrothorn', 'Latios']
['Latios', 'Landorus-Therian']


After determining the frequent item sets, rules are then determined basesd on the confidence level provided. The helper function is then used to revert back the rules into the actual Pokemon names.

In [74]:
rules = rule_miner.get_association_rules(bool_df)

for rule in rules:
    print(f"{revert(rule[0], pokedex_df)} -> {revert(rule[1], pokedex_df)}")


['Tentacruel'] -> ['Politoed']
['Tentacruel'] -> ['Politoed']
['Politoed'] -> ['Ferrothorn']
['Ferrothorn'] -> ['Politoed']
['Ferrothorn'] -> ['Politoed']
['Politoed'] -> ['Ferrothorn']
['Landorus-Therian'] -> ['Tyranitar']
['Landorus-Therian'] -> ['Tyranitar']


Due to the rule miner code, duplicates may appear on the rules generated. Because of this, the unique rules are extracted from the original list.

In [75]:
unique_rules = set([(tuple(x[0]), tuple(x[1])) for x in rules])

for rule in unique_rules:
    print(f"{revert(rule[0], pokedex_df)} -> {revert(rule[1], pokedex_df)}")

['Landorus-Therian'] -> ['Tyranitar']
['Politoed'] -> ['Ferrothorn']
['Tentacruel'] -> ['Politoed']
['Ferrothorn'] -> ['Politoed']


## DBSCAN Clustering with Jaccard Distance

In [76]:
# array_of_teams = bool_df.to_numpy()
# spsd.pdist(array_of_teams, metric='jaccard')

Clustering is done using one-hot-encoded version of the teams. The distance metric used is Jaccard distance, as each team is a set of Pokemon, making other distance metrics such as Euclidean and cosine unavailable. DBSCAN clustering is used as other clustering algorithms such as K-Medoids have been used but factors such as low silhouette scores have led to K-Medoids being undesirable. DBSCAN is also used in order to determine which teams are considererd noise, and what clusters can form, with the restriction that each cluster should have at least 25 teams.

In [77]:
array_of_teams = bool_df.to_numpy()
# k = 1000
distmatrix = spsd.squareform(spsd.pdist(array_of_teams, metric='jaccard'))

db = DBSCAN(eps=0.3 , min_samples=25, metric='precomputed')
labels = db.fit_predict(distmatrix)

print("Cluster labels:", labels)

for label in labels:
    print(label)


Cluster labels: [-1  0 -1 ... -1 -1 -1]
-1
0
-1
-1
-1
0
-1
-1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
2
-1
-1
-1
-1
-1
-1
-1
-1
-1
2
-1
-1
-1
-1
-1
-1
-1
-1
1
2
-1
-1
2
0
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
2
-1
-1
0
-1
-1
0
-1
-1
-1
-1
2
-1
-1
-1
-1
-1
-1
-1
2
-1
-1
-1
-1
-1
0
-1
-1
-1
-1
-1
-1
-1
-1
-1
2
0
-1
-1
3
-1
2
-1
-1
-1
-1
-1
-1
0
-1
-1
-1
-1
-1
0
-1
-1
3
-1
-1
-1
-1
2
-1
-1
-1
-1
-1
-1
-1
-1
3
2
-1
-1
-1
-1
-1
-1
2
-1
-1
-1
1
-1
-1
0
-1
-1
0
0
-1
-1
-1
-1
0
-1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
-1
1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
0
-1
-1
-1
-1
-1
-1
-1
-1
-1
0
-1
-1
3
2
-1
-1
3
-1
0
-1
-1
-1
-1
-1
-1
2
3
-1
-1
-1
-1
0
3
2
-1
-1
-1
1
-1
-1
-1
-1
2
-1
0
-1
-1
-1
-1
-1
3
-1
-1
-1
0
-1
-1
-1
-1
3
-1
-1
-1
-1
-1
-1
2
-1
-1
2
-1
-1
0
-1
-1
-1
2
-1
1
-1
3
-1
-1
-1
-1
-1
-1
0
-1
-1
0
3
2
-1
-1
-1
-1
0
-1
-1
-1
2
2
-1
-1
-1
-1
3
-1
-1
-1
0
-1
-1
-1
-1
-1
-1
-1
2
-1
1
-1
-1
-1
2
-1
-1
0
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1

In [78]:
for index, label in enumerate(labels):
    if label == 0:
        print(revert(baskets[index], pokedex_df))

['Tyranitar', 'Alakazam', 'Latios', 'Ferrothorn', 'Landorus-Therian', 'Excadrill']
['Tyranitar', 'Rotom-Wash', 'Ferrothorn', 'Landorus-Therian', 'Alakazam', 'Latias']
['Landorus-Therian', 'Jirachi', 'Latios', 'Rotom-Wash', 'Tyranitar', 'Ferrothorn']
['Tyranitar', 'Landorus-Therian', 'Garchomp', 'Latios', 'Jirachi', 'Kingdra']
['Tyranitar', 'Rotom-Wash', 'Excadrill', 'Landorus-Therian', 'Alakazam', 'Ferrothorn']
['Tyranitar', 'Rotom-Wash', 'Alakazam', 'Ferrothorn', 'Landorus-Therian', 'Latios']
['Bisharp', 'Latios', 'Landorus-Therian', 'Ferrothorn', 'Rotom-Wash', 'Tyranitar']
['Tyranitar', 'Rotom-Wash', 'Alakazam', 'Ferrothorn', 'Landorus-Therian', 'Latios']
['Tyranitar', 'Landorus-Therian', 'Garchomp', 'Rotom-Wash', 'Jirachi', 'Keldeo']
['Tyranitar', 'Breloom', 'Landorus-Therian', 'Ferrothorn', 'Latios', 'Rotom-Wash']
['Garchomp', 'Alakazam', 'Tyranitar', 'Landorus-Therian', 'Ferrothorn', 'Latios']
['Tyranitar', 'Landorus-Therian', 'Ferrothorn', 'Excadrill', 'Latios', 'Alakazam']
['Lan