In [21]:
import pandas as pd
from scipy.sparse import lil_matrix
from scipy.spatial.distance import cdist
import numpy as np

In [2]:
df = pd.read_csv('pokedex.csv')
df.head()

Unnamed: 0,id,name,height,weight,hp,attack,defense,s_attack,s_defense,speed,type,evo_set,info
0,1,bulbasaur,7,69,45,49,49,65,65,45,"{grass,poison}",1,A strange seed was planted on its back at birt...
1,2,ivysaur,10,130,60,62,63,80,80,60,"{grass,poison}",1,"When the bulb on its back grows large, it appe..."
2,3,venusaur,20,1000,80,82,83,100,100,80,"{grass,poison}",1,The plant blooms when it is absorbing solar en...
3,4,charmander,6,85,39,52,43,60,50,65,{fire},2,"Obviously prefers hot places. When it rains, s..."
4,5,charmeleon,11,190,58,64,58,80,65,80,{fire},2,"When it swings its burning tail, it elevates t..."


Calculating similarity based on all attributes:

In [33]:
feature_columns = ['height', 'weight', 'hp', 'attack', 'defense', 's_attack', 's_defense', 'speed']
feature_df = df[feature_columns]
display(feature_df.head())

# Normalize the feature values to ensure fair comparisons
feature_df_norm = feature_df.divide(feature_df.sum(axis=1), axis=0)

# Convert the normalized DataFrame into a sparse matrix
feature_matrix = lil_matrix(feature_df_norm.values)
display(feature_df_norm.head())


Unnamed: 0,height,weight,hp,attack,defense,s_attack,s_defense,speed
0,7,69,45,49,49,65,65,45
1,10,130,60,62,63,80,80,60
2,20,1000,80,82,83,100,100,80
3,6,85,39,52,43,60,50,65
4,11,190,58,64,58,80,65,80


Unnamed: 0,height,weight,hp,attack,defense,s_attack,s_defense,speed
0,0.017766,0.175127,0.114213,0.124365,0.124365,0.164975,0.164975,0.114213
1,0.018349,0.238532,0.110092,0.113761,0.115596,0.146789,0.146789,0.110092
2,0.012945,0.647249,0.05178,0.053074,0.053722,0.064725,0.064725,0.05178
3,0.015,0.2125,0.0975,0.13,0.1075,0.15,0.125,0.1625
4,0.018152,0.313531,0.09571,0.105611,0.09571,0.132013,0.107261,0.132013


In [29]:
target_pokemon_name = 'charmander'  # Change this to any Pokémon name

# Find the index of the target Pokémon
if target_pokemon_name in df['name'].values:
    target_pokemon_index = df[df['name'] == target_pokemon_name].index[0]
    target_pokemon_vector = feature_matrix[target_pokemon_index].toarray().reshape(1, -1)

    # Compute cosine distances
    distances = cdist(feature_matrix.toarray(), target_pokemon_vector, metric='cosine').flatten()

    # Rank Pokémon by similarity (excluding the target itself)
    similarity_scores = list(zip(df['name'], distances))
    similar_pokemon = sorted(similarity_scores, key=lambda x: x[1], reverse=False)[1:11]  # Exclude itself

    similar_pokemon
else:
    similar_pokemon = f"Pokémon '{target_pokemon_name}' not found in dataset."

target_pokemon_name, similar_pokemon

# Ensure the 'attack' column is numeric
# attack_df['attack'] = pd.to_numeric(attack_df['attack'], errors='coerce')

# # Normalize the 'attack' column
# df_norm = attack_df['attack'] / attack_df['attack'].sum()

# df_norm.head()


('charmander',
 [('cyndaquil', 0.0005681423436760813),
  ('venomoth', 0.003679826940539477),
  ('frogadier', 0.0038231640001679112),
  ('xatu', 0.003987896478033992),
  ('fennekin', 0.005150144199578377),
  ('galvantula', 0.005312069845177625),
  ('froakie', 0.006858722436921982),
  ('drizzile', 0.00705868324315706),
  ('sigilyph', 0.00808804130799956),
  ('pansage', 0.008326115528239031)])

Now to create a comparrison for important numerical columns

In [None]:
attack_columns = [ 'attack', 's_attack']
attack_df = df[attack_columns]


# # Normalize the 'attack' column
attack_df_norm = attack_df.divide(attack_df.sum(axis=1), axis=0)

print(attack_df_norm)

attack_matrix = lil_matrix(attack_df_norm.values)

        attack  s_attack
0     0.429825  0.570175
1     0.436620  0.563380
2     0.450549  0.549451
3     0.464286  0.535714
4     0.444444  0.555556
...        ...       ...
1020  0.347619  0.652381
1021  0.638298  0.361702
1022  0.371134  0.628866
1023  0.500000  0.500000
1024  0.500000  0.500000

[1025 rows x 2 columns]


In [61]:
target_pokemon_name = 'charmander'  # Change this to any Pokémon name

# Find the index of the target Pokémon
if target_pokemon_name in df['name'].values:
    target_pokemon_index = df[df['name'] == target_pokemon_name].index[0]
    target_pokemon_vector = attack_matrix[target_pokemon_index].toarray().reshape(1, -1)

    # Compute cosine distances
    distances = cdist(attack_matrix.toarray(), target_pokemon_vector, metric='cosine').flatten()

    # Rank Pokémon by similarity (excluding the target itself)
    similarity_scores = list(zip(df['name'], distances))
    similar_pokemon = sorted(similarity_scores, key=lambda x: x[1], reverse=False)[1:11]  # Exclude itself

    similar_pokemon
else:
    similar_pokemon = f"Pokémon '{target_pokemon_name}' not found in dataset."

target_pokemon_name, similar_pokemon

('charmander',
 [('cyndaquil', 0.0),
  ('yanma', 0.0),
  ('clamperl', 5.303116881183456e-07),
  ('vanilluxe', 1.5018183265880225e-06),
  ('whimsicott', 1.9489910073611227e-06),
  ('golduck', 2.014472047728866e-06),
  ('oshawott', 6.532088704491734e-06),
  ('electabuzz', 7.97436592270273e-06),
  ('toxtricity', 8.085922812650637e-06),
  ('tentacruel', 1.1230466008593787e-05)])