Formatting this [Pokemon dataset](https://www.kaggle.com/datasets/abcsds/pokemon)

Keeping their stats and one-hot encoding the types. 

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("pokemon_legendary.csv")
df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [2]:
df["Type 2"].fillna("NoType", inplace=True)
df["Legendary"].value_counts()

Legendary
False    735
True      65
Name: count, dtype: int64

In [3]:
df["Legendary"] = df["Legendary"].astype(int)
df["Legendary"]

0      0
1      0
2      0
3      0
4      0
      ..
795    1
796    1
797    1
798    1
799    1
Name: Legendary, Length: 800, dtype: int32

In [4]:
# one hot encode the types
types = list(set(df["Type 1"].to_list())) + ["NoType"]
type1_vectors = np.zeros((df.shape[0], len(types)))
type2_vectors = np.zeros((df.shape[0], len(types)))

for i, ptypes in enumerate(zip(df["Type 1"].to_list(), df["Type 2"].to_list())):
    ptype1, ptype2 = ptypes
    type1_vectors[i][types.index(ptype1)] = 1
    type2_vectors[i][types.index(ptype2)] = 1

type1_vectors

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [5]:
# normalize stats
stat_cols = ["HP","Attack", "Defense", "Sp. Atk", "Sp. Def", "Speed"]

for col in stat_cols:
    vals = df[col].to_numpy()
    df[col] = (vals - np.mean(vals))/np.std(vals)

In [6]:
features = df[["HP","Attack", "Defense", "Sp. Atk", "Sp. Def", "Speed"]].to_numpy()
full_features = pd.DataFrame(np.concatenate((features, type1_vectors, type2_vectors), axis=1))
full_features["name"] = df["Name"]
full_features["legend"] = df["Legendary"]
full_features.to_csv("pokemon_data.csv", index=False)
full_features


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36,37,38,39,40,41,42,43,name,legend
0,-0.950626,-0.924906,-0.797154,-0.239130,-0.248189,-0.801503,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Bulbasaur,0
1,-0.362822,-0.524130,-0.347917,0.219560,0.291156,-0.285015,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Ivysaur,0
2,0.420917,0.092448,0.293849,0.831146,1.010283,0.403635,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Venusaur,0
3,0.420917,0.647369,1.577381,1.503891,1.729409,0.403635,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,VenusaurMega Venusaur,0
4,-1.185748,-0.832419,-0.989683,-0.392027,-0.787533,-0.112853,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Charmander,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,-0.754692,0.647369,2.443765,0.831146,2.808099,-0.629341,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Diancie,1
796,-0.754692,2.497104,1.160233,2.665905,1.369846,1.436611,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,DiancieMega Diancie,1
797,0.420917,0.955658,-0.444182,2.360112,2.088973,0.059310,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,HoopaHoopa Confined,1
798,0.420917,2.497104,-0.444182,2.971699,2.088973,0.403635,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,HoopaHoopa Unbound,1
