In [40]:
import pandas as pd
import numpy as np

# Load the raw Pokémon table. The file is tab-separated despite its .csv
# extension, so the separator is given explicitly.
pokemon_df = pd.read_csv(
    'data/pokemon.csv',
    sep="\t",
    engine="python",
    encoding="utf-8",
)

# Plain-text preview of the first five rows.
print(pokemon_df.head(5))

   national_number gen english_name japanese_name primary_type secondary_type  \
0                1   I    Bulbasaur   Fushigidane        grass         poison   
1                2   I      Ivysaur    Fushigisou        grass         poison   
2                3   I     Venusaur   Fushigibana        grass         poison   
3                4   I   Charmander      Hitokage         fire            NaN   
4                5   I   Charmeleon       Lizardo         fire            NaN   

   classification percent_male percent_female  height_m  ...  evochain_1  \
0    Seed Pokémon        88.14          11.86       0.7  ...      Level    
1    Seed Pokémon        88.14          11.86       1.0  ...      Level    
2    Seed Pokémon        88.14          11.86       2.0  ...      Level    
3  Lizard Pokémon        88.14          11.86       0.6  ...      Level    
4   Flame Pokémon        88.14          11.86       1.1  ...      Level    

   evochain_2  evochain_3  evochain_4  evochain_5  evoch

In [41]:
# Columns removed in this step, grouped by what they hold.
name_and_text_columns = ['english_name', 'japanese_name', 'description']
ability_columns = ['abilities_0', 'abilities_1', 'abilities_2', 'abilities_hidden']
evochain_columns = [
    'evochain_0', 'evochain_1', 'evochain_2', 'evochain_3',
    'evochain_4', 'evochain_5', 'evochain_6',
]

# Column listing before the change.
print(pokemon_df.columns)

# Use the national number as the row index, then discard the columns above.
# NOTE: this consumes 'national_number' as a column, so the cell cannot be
# re-run on the already-indexed frame without reloading the data first.
pokemon_df = (
    pokemon_df
    .set_index('national_number')
    .drop(columns=name_and_text_columns + ability_columns + evochain_columns)
)

# Column listing after the change.
print(pokemon_df.columns)

Index(['national_number', 'gen', 'english_name', 'japanese_name',
       'primary_type', 'secondary_type', 'classification', 'percent_male',
       'percent_female', 'height_m', 'weight_kg', 'capture_rate',
       'base_egg_steps', 'hp', 'attack', 'defense', 'sp_attack', 'sp_defense',
       'speed', 'abilities_0', 'abilities_1', 'abilities_2',
       'abilities_hidden', 'against_normal', 'against_fire', 'against_water',
       'against_electric', 'against_grass', 'against_ice', 'against_fighting',
       'against_poison', 'against_ground', 'against_flying',
       'against_psychict', 'against_bug', 'against_rock', 'against_ghost',
       'against_dragon', 'against_dark', 'against_steel', 'against_fairy',
       'is_sublegendary', 'is_legendary', 'is_mythical', 'evochain_0',
       'evochain_1', 'evochain_2', 'evochain_3', 'evochain_4', 'evochain_5',
       'evochain_6', 'gigantamax', 'mega_evolution', 'mega_evolution_alt',
       'description'],
      dtype='object')
Index(['gen', 'pr

In [42]:
# Distinct values of the three categorical columns. Series.unique() keeps a
# missing value as its own entry, so the secondary-type result (and its count)
# includes NaN when some rows have no secondary type.
unique_classes, unique_primary_types, unique_secondary_types = (
    pokemon_df[column].unique()
    for column in ("classification", "primary_type", "secondary_type")
)

# Report how many distinct values each column has.
for label, values in (
    ("Number of unique classes:", unique_classes),
    ("Number of unique primary types:", unique_primary_types),
    ("Number of unique secondary types:", unique_secondary_types),
):
    print(label, len(values))

# 'classification' is removed after its cardinality has been reported
# (presumably because it has too many distinct values to be useful — confirm).
pokemon_df = pokemon_df.drop(columns=['classification'])

Number of unique classes: 647
Number of unique primary types: 18
Number of unique secondary types: 19


In [43]:
# Both gender-ratio columns get the same clean-up; they still hold strings here.
for ratio_column in ('percent_male', 'percent_female'):
    # Missing ratios become the string '0'.
    pokemon_df[ratio_column] = pokemon_df[ratio_column].fillna('0')

    # Entries starting with a literal '50*' are normalised to plain '50'.
    is_starred_half = pokemon_df[ratio_column].str.match(r'50\*')
    pokemon_df.loc[is_starred_half, ratio_column] = '50'

# Sanity check: the remaining distinct values of the male ratio.
print("Unique values in percent_male:", pokemon_df['percent_male'].unique())


Unique values in percent_male: ['88.14' '50.2' '50' '0' '100' '24.9' '75.49' '88.1' '24.6' '75.4' '11.2']


In [44]:
# Source columns describing special forms; only their presence is kept.
mega_columns = ['mega_evolution', 'mega_evolution_alt']
gigantamax_column = 'gigantamax'

# 1 when at least one of the mega-evolution columns is filled in, else 0.
pokemon_df['has_mega_evolution'] = pokemon_df[mega_columns].notna().any(axis=1).astype(int)

# 1 when the gigantamax column is filled in, else 0.
pokemon_df['has_gigantamax'] = pokemon_df[gigantamax_column].notna().astype(int)

# The original columns are no longer needed once the flags exist.
pokemon_df = pokemon_df.drop(columns=mega_columns + [gigantamax_column])

In [45]:
# Weight applied to each rarity flag column when building the single score.
rarity_weights = {
    'is_sublegendary': 1,
    'is_legendary': 2,
    'is_mythical': 3,
}

# rarity = 1*is_sublegendary + 2*is_legendary + 3*is_mythical.
# NOTE(review): the score is only an unambiguous tier if at most one flag is
# set per row — confirm against the data.
pokemon_df['rarity'] = sum(
    weight * pokemon_df[flag_column]
    for flag_column, weight in rarity_weights.items()
)

# Drop the individual flags now that they are folded into 'rarity'.
pokemon_df = pokemon_df.drop(columns=list(rarity_weights))
print(pokemon_df.columns)

Index(['gen', 'primary_type', 'secondary_type', 'percent_male',
       'percent_female', 'height_m', 'weight_kg', 'capture_rate',
       'base_egg_steps', 'hp', 'attack', 'defense', 'sp_attack', 'sp_defense',
       'speed', 'against_normal', 'against_fire', 'against_water',
       'against_electric', 'against_grass', 'against_ice', 'against_fighting',
       'against_poison', 'against_ground', 'against_flying',
       'against_psychict', 'against_bug', 'against_rock', 'against_ghost',
       'against_dragon', 'against_dark', 'against_steel', 'against_fairy',
       'has_mega_evolution', 'has_gigantamax', 'rarity'],
      dtype='object')


In [46]:
# Columns holding type labels; they stay as strings and are not cast.
type_columns = ['primary_type', 'secondary_type']

# Roman-numeral generation labels -> generation numbers.
gen_map = {'I': 1, 'II': 2, 'III': 3, 'IV': 4, 'V': 5, 'VI': 6, 'VII': 7, 'VIII': 8}

# Series.map(dict) replaces every value that is not a key of the dict with NaN.
# That would silently hide an unexpected generation label and, if this cell is
# re-run after 'gen' has already been converted, blank out the whole column.
# Looking each value up with itself as the fallback keeps already-converted
# numbers intact and leaves unknown labels in place, so the float cast below
# fails loudly instead of producing NaN.
pokemon_df['gen'] = pokemon_df['gen'].map(lambda label: gen_map.get(label, label))

# Everything except the type-label columns is converted to float.
columns_to_cast = [col for col in pokemon_df.columns if col not in type_columns]

pokemon_df[columns_to_cast] = pokemon_df[columns_to_cast].astype(float)

In [47]:
# Show the type vocabularies collected earlier (before any filling).
for label, values in (
    ("Primary Types:", unique_primary_types),
    ("Secondary Types:", unique_secondary_types),
):
    print(label, values)

# Rows without a secondary type get an explicit placeholder label instead of NaN.
secondary_filled = pokemon_df['secondary_type'].fillna('No secondary type')
pokemon_df['secondary_type'] = secondary_filled

# Distinct secondary types after filling.
unique_secondary_types_clean = secondary_filled.unique()
print("Secondary Types (clean):", unique_secondary_types_clean)

Primary Types: ['grass' 'fire' 'water' 'bug' 'normal' 'poison' 'electric' 'ground'
 'fairy' 'fighting' 'psychic' 'rock' 'ghost' 'ice' 'dragon' 'dark' 'steel'
 'flying']
Secondary Types: ['poison' nan 'flying' 'dark' 'electric' 'ice' 'ground' 'fairy' 'grass'
 'fighting' 'psychic' 'steel' 'fire' 'rock' 'water' 'dragon' 'ghost' 'bug'
 'normal']
Secondary Types (clean): ['poison' 'No secondary type' 'flying' 'dark' 'electric' 'ice' 'ground'
 'fairy' 'grass' 'fighting' 'psychic' 'steel' 'fire' 'rock' 'water'
 'dragon' 'ghost' 'bug' 'normal']


In [48]:
# Dump every column's distinct values (with their count) for a final visual check.
for column_name in pokemon_df.columns:
    distinct_values = pokemon_df[column_name].unique()
    print(
        f"Column: {column_name}",
        f"Unique values ({len(distinct_values)}): {distinct_values}\n",
        sep="\n",
    )

Column: gen
Unique values (8): [1. 2. 3. 4. 5. 6. 7. 8.]

Column: primary_type
Unique values (18): ['grass' 'fire' 'water' 'bug' 'normal' 'poison' 'electric' 'ground'
 'fairy' 'fighting' 'psychic' 'rock' 'ghost' 'ice' 'dragon' 'dark' 'steel'
 'flying']

Column: secondary_type
Unique values (19): ['poison' 'No secondary type' 'flying' 'dark' 'electric' 'ice' 'ground'
 'fairy' 'grass' 'fighting' 'psychic' 'steel' 'fire' 'rock' 'water'
 'dragon' 'ghost' 'bug' 'normal']

Column: percent_male
Unique values (11): [ 88.14  50.2   50.     0.   100.    24.9   75.49  88.1   24.6   75.4
  11.2 ]

Column: percent_female
Unique values (11): [ 11.86  49.8   50.   100.     0.    75.1   24.51  11.9   75.4   24.6
  88.8 ]

Column: height_m
Unique values (53): [ 0.7  1.   2.   0.6  1.1  1.7  0.5  1.6  0.3  1.5  1.2  3.5  0.4  0.8
  1.3  0.9  1.4  0.2  1.9  1.8  8.8  2.2  6.5  2.5  2.1  4.   2.3  9.2
  5.2  3.8 14.5  2.7  6.2  4.5  7.   2.4  5.4  4.2  3.7  3.2  3.3  0.1
  2.6  2.8  2.9  3.   5.8  5.   3.