# Data Preparation

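- Splits the data into a global training set and a test set (90% / 10%).
- Drops the price columns (`Price_1` and `Price_2`).
- Builds an extended, integer-only copy of the data (Pokemon IDs, type codes and weather codes) and splits it the same way.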

In [1]:
import sklearn
import sys
import numpy as np
import pandas as pd
import swifter
import os
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from kedro.extras.datasets.pandas import CSVDataSet

2020-05-26 10:54:25,370 - numexpr.utils - INFO - Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-05-26 10:54:25,370 - numexpr.utils - INFO - NumExpr defaulting to 8 threads.


### Loading Data

In [2]:
# NOTE(review): `catalog` is not defined anywhere in this notebook; presumably it is the
# Kedro DataCatalog injected into the kernel by Kedro's Jupyter integration -- confirm.
# The three names below are catalog entry names, not file paths.
Battle_Results = catalog.load('Battle_Results')
Weakness_Pokemon = catalog.load('Weakness_Pokemon')
AllPokemons = catalog.load('All_Pokemons')

2020-05-26 10:54:26,424 - kedro.io.data_catalog - INFO - Loading data from `Battle_Results` (CSVDataSet)...
2020-05-26 10:54:29,185 - kedro.io.data_catalog - INFO - Loading data from `Weakness_Pokemon` (CSVDataSet)...
2020-05-26 10:54:29,189 - kedro.io.data_catalog - INFO - Loading data from `All_Pokemons` (CSVDataSet)...


### Split Data & Save Test Set

In [3]:
# Remove the price columns before splitting.
# Rebinding the name (instead of inplace=True) together with errors="ignore" makes this
# cell idempotent: re-running it after the columns are already gone no longer raises KeyError.
Battle_Results = Battle_Results.drop(columns=["Price_1", "Price_2"], errors="ignore")

In [11]:
# Hold out 10% of the rows as the global test set; the fixed random_state makes the
# split reproducible across runs.
train_val_set, test_set = train_test_split(Battle_Results, test_size=0.1, random_state=42)

In [7]:
# Register the raw test split as a CSV dataset and write it to disk.
# replace=True keeps this cell re-runnable: without it, calling catalog.add() a second
# time with an already-registered name raises an error.
catalog.add(
    "Battle_Results_Test",
    CSVDataSet(filepath="../../data/02_intermediate/Battle_Results_Test.csv"),
    replace=True,
)
catalog.save("Battle_Results_Test", test_set)

2020-05-21 22:24:29,452 - kedro.io.data_catalog - INFO - Saving data to `Battle_Results_Test` (CSVDataSet)...


In [10]:
# Register the raw train/validation split as a CSV dataset and write it to disk.
# replace=True keeps this cell re-runnable: without it, calling catalog.add() a second
# time with an already-registered name raises an error.
catalog.add(
    "Battle_Results_Train",
    CSVDataSet(filepath="../../data/02_intermediate/Battle_Results_Train.csv"),
    replace=True,
)
catalog.save("Battle_Results_Train", train_val_set)

2020-05-21 22:25:14,495 - kedro.io.data_catalog - INFO - Saving data to `Battle_Results_Train` (CSVDataSet)...


### Create Extended Integer-Only Data with Pokemon IDs, Type Codes and Weather Codes

In [83]:
def cleanAllPokemons(df):
    """Return the ``Type_2`` column with its gaps filled from ``Type_1``.

    Rows whose ``Type_2`` is NaN receive the ``Type_1`` value of the same row;
    all other rows keep their ``Type_2`` value. A new Series is returned and
    ``df`` itself is left untouched.
    """
    primary_type = df["Type_1"]
    secondary_type = df["Type_2"]
    return secondary_type.fillna(primary_type)
# Overwrite the Type_2 column with its gap-filled version, then preview the first rows.
AllPokemons["Type_2"] = cleanAllPokemons(AllPokemons)
AllPokemons.head()

Unnamed: 0,ID,Name,Type_1,Type_2
0,1,Bulbasaur,Grass,Poison
1,2,Ivysaur,Grass,Poison
2,3,Venusaur,Grass,Poison
3,4,Charmander,Fire,Fire
4,5,Charmeleon,Fire,Fire


In [103]:
def transform(df, all_pokemons=None, weakness=None, weather_values=None):
    """Return an integer-encoded copy of a battle table.

    The ``Name_1``/``Name_2`` columns are replaced by the Pokemon IDs
    (``ID_1``/``ID_2``) and by four type-code columns: ``Type_1_1`` and
    ``Type_1_2`` hold the first and second type of Pokemon 1, ``Type_2_1`` and
    ``Type_2_2`` those of Pokemon 2. ``Legendary_1``/``Legendary_2`` are cast
    to ``int`` and ``WeatherAndTime`` is replaced by an integer code. All other
    columns are passed through unchanged; ``df`` itself is not modified.

    Args:
        df: Battle table with at least the columns ``Name_1``, ``Name_2``,
            ``Legendary_1``, ``Legendary_2`` and ``WeatherAndTime``.
        all_pokemons: Lookup table with ``Name``, ``ID``, ``Type_1`` and
            ``Type_2`` columns. Defaults to the notebook-level ``AllPokemons``.
            Missing ``Type_2`` values must be filled beforehand, otherwise the
            affected battles raise ``KeyError``.
        weakness: Table whose ``Types`` column lists the type names; the code
            of a type is its index label + 1. Defaults to the notebook-level
            ``Weakness_Pokemon``.
        weather_values: Iterable of weather labels; the code of a label is the
            1-based position of its first appearance. Defaults to the unique
            ``WeatherAndTime`` values of the notebook-level ``Battle_Results``
            (not of ``df``), so the codes do not depend on which subset of the
            battles is passed in.

    Returns:
        A new DataFrame with the original columns minus ``Name_1``/``Name_2``,
        followed by ``ID_1``, ``ID_2``, ``Type_1_1``, ``Type_1_2``,
        ``Type_2_1`` and ``Type_2_2``.

    Raises:
        KeyError: If ``df`` contains a Pokemon name, type or weather label that
            is missing from the corresponding lookup table.
    """
    # Fall back to the notebook-level frames so existing `transform(df)` calls keep working.
    if all_pokemons is None:
        all_pokemons = AllPokemons
    if weakness is None:
        weakness = Weakness_Pokemon
    if weather_values is None:
        weather_values = Battle_Results.WeatherAndTime.unique()

    def encode(values, lookup, what):
        """Map a Series through ``lookup``; return an int array or raise on unmapped entries."""
        codes = values.map(lookup)
        unmapped = codes.isna()
        if unmapped.any():
            unknown = values[unmapped].unique().tolist()
            raise KeyError("Unknown {}: {}".format(what, unknown))
        # .values -> plain array, so the later column assignment is positional.
        return codes.astype(int).values

    # Lookup tables, each built in a single pass; on duplicates the first occurrence wins.
    type_to_int = {}
    for index_label, type_name in zip(weakness.index, weakness["Types"]):
        type_to_int.setdefault(type_name, index_label + 1)

    weather_to_int = {}
    for weather_label in weather_values:
        weather_to_int.setdefault(weather_label, len(weather_to_int) + 1)

    # One row per Pokemon name. IDs and types are both looked up by name instead of
    # assuming "row position == ID - 1", so gaps or a different ordering of the ID
    # column cannot silently select the wrong Pokemon's types.
    pokedex = all_pokemons.drop_duplicates(subset="Name").set_index("Name")

    # Dropping first and appending afterwards yields the column order documented above.
    newdf = df.drop(columns=["Name_1", "Name_2"])

    newdf["ID_1"] = encode(df["Name_1"], pokedex["ID"], "Pokemon name")
    newdf["ID_2"] = encode(df["Name_2"], pokedex["ID"], "Pokemon name")

    for side in ("1", "2"):
        names = df["Name_" + side]
        for slot in ("1", "2"):
            type_names = names.map(pokedex["Type_" + slot])
            newdf["Type_{}_{}".format(side, slot)] = encode(type_names, type_to_int, "Pokemon type")

    # Legendary to Integer
    newdf["Legendary_1"] = df["Legendary_1"].values.astype(int)
    newdf["Legendary_2"] = df["Legendary_2"].values.astype(int)

    # Weather to Integer
    newdf["WeatherAndTime"] = encode(df["WeatherAndTime"], weather_to_int, "weather")

    return newdf

In [104]:
# Encode the complete battle table in one go; the result is split into train/test
# and saved further down.
transformed = transform(Battle_Results)

In [105]:
# Preview the encoded frame: the name columns are gone, ID and type-code columns are appended.
transformed.head()

Unnamed: 0,Level_1,HP_1,Attack_1,Defense_1,Sp_Atk_1,Sp_Def_1,Speed_1,Legendary_1,Level_2,HP_2,...,Speed_2,Legendary_2,WeatherAndTime,BattleResult,ID_1,ID_2,Type_1_1,Type_1_2,Type_2_1,Type_2_2
0,30,150,36,103,47,47,58,0,47,262,...,253,0,1,-262,11,127,12,12,12,12
1,36,196,304,237,129,164,197,0,86,547,...,200,0,2,-424,127,11,12,12,12,12
2,15,92,26,73,33,33,40,0,89,821,...,781,0,1,-821,11,127,12,12,12,12
3,40,228,346,270,146,187,224,0,44,202,...,75,0,2,214,127,11,12,12,12,12
4,12,91,157,124,70,86,106,0,24,126,...,51,0,1,41,127,11,12,12,12,12


In [111]:
# Save to Catalog
# Same test_size / random_state as the split of the raw Battle_Results earlier in the notebook.
# NOTE(review): with an equal number of rows this should select the same rows for the
# extended files as for the raw ones -- confirm. The names train_val_set / test_set are
# rebound here to the extended splits.
train_val_set, test_set = train_test_split(transformed, test_size=0.1, random_state=42)

# replace=True keeps this cell re-runnable: without it, calling catalog.add() a second
# time with an already-registered name raises an error.
catalog.add(
    "Battle_Results_Train_Extended",
    CSVDataSet(filepath="../../data/02_intermediate/Battle_Results_Train_Extended.csv"),
    replace=True,
)
catalog.save("Battle_Results_Train_Extended", train_val_set)

catalog.add(
    "Battle_Results_Test_Extended",
    CSVDataSet(filepath="../../data/02_intermediate/Battle_Results_Test_Extended.csv"),
    replace=True,
)
catalog.save("Battle_Results_Test_Extended", test_set)

2020-05-26 12:49:35,789 - kedro.io.data_catalog - INFO - Saving data to `Battle_Results_Train_Extended` (CSVDataSet)...
2020-05-26 12:49:44,693 - kedro.io.data_catalog - INFO - Saving data to `Battle_Results_Test_Extended` (CSVDataSet)...


### Check Correctness

In [106]:
# Spot check: pick a single encoded row by position (slicing keeps it a one-row DataFrame).
transformed.iloc[500000:500001]

Unnamed: 0,Level_1,HP_1,Attack_1,Defense_1,Sp_Atk_1,Sp_Def_1,Speed_1,Legendary_1,Level_2,HP_2,...,Speed_2,Legendary_2,WeatherAndTime,BattleResult,ID_1,ID_2,Type_1_1,Type_1_2,Type_2_1,Type_2_2
500000,1,70,80,50,35,35,35,0,72,534,...,302,0,4,-534,66,103,7,7,5,11


In [107]:
# The same row in the un-encoded frame, to compare names, weather and Legendary flags by eye.
Battle_Results.iloc[500000:500001]

Unnamed: 0,Name_1,Level_1,HP_1,Attack_1,Defense_1,Sp_Atk_1,Sp_Def_1,Speed_1,Legendary_1,Name_2,Level_2,HP_2,Attack_2,Defense_2,Sp_Atk_2,Sp_Def_2,Speed_2,Legendary_2,WeatherAndTime,BattleResult
500000,Machop,1,70,80,50,35,35,35,False,Exeggutor,72,534,528,473,689,362,302,False,Rain,-534


In [108]:
# Reference entry for Pokemon 1 of the spot-checked battle: verify its ID and types.
AllPokemons[AllPokemons["Name"].eq("Machop")]

Unnamed: 0,ID,Name,Type_1,Type_2
65,66,Machop,Fighting,Fighting


In [109]:
# Reference entry for Pokemon 2 of the spot-checked battle: verify its ID and types.
AllPokemons[AllPokemons["Name"].eq("Exeggutor")]

Unnamed: 0,ID,Name,Type_1,Type_2
102,103,Exeggutor,Grass,Psychic


In [110]:
# Type codes produced by transform() are this table's row index + 1
# (e.g. Fighting sits at index 6, so its code is 7).
Weakness_Pokemon

Unnamed: 0,Types,Normal,Fire,Water,Electric,Grass,Ice,Fighting,Poison,Ground,Flying,Psychic,Bug,Rock,Ghost,Dragon,Dark,Steel,Fairy
0,Normal,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.6,0.6,1.0,1.0,0.6,1.0
1,Fire,1.0,0.6,0.6,1.0,50.0,6.0,1.0,1.0,1.0,1.0,1.0,6.0,0.6,1.0,0.6,1.0,6.0,1.0
2,Water,1.0,20.0,0.6,1.0,0.6,1.0,1.0,1.0,6.0,1.0,1.0,1.0,6.0,1.0,0.6,1.0,1.0,1.0
3,Electric,1.0,1.0,6.0,0.6,0.6,1.0,1.0,1.0,0.6,6.0,1.0,1.0,1.0,1.0,0.6,1.0,1.0,1.0
4,Grass,1.0,0.6,6.0,1.0,0.6,1.0,1.0,0.6,6.0,0.6,1.0,0.6,6.0,1.0,0.6,1.0,0.6,1.0
5,Ice,1.0,0.6,0.6,1.0,6.0,0.6,1.0,1.0,6.0,6.0,1.0,1.0,1.0,1.0,6.0,1.0,0.6,1.0
6,Fighting,6.0,1.0,1.0,1.0,1.0,6.0,1.0,0.6,1.0,0.6,0.6,0.6,6.0,0.6,1.0,6.0,6.0,0.6
7,Poison,1.0,1.0,1.0,1.0,6.0,1.0,1.0,0.6,0.6,1.0,1.0,1.0,0.6,0.6,1.0,1.0,0.6,6.0
8,Ground,1.0,6.0,1.0,6.0,0.6,1.0,1.0,6.0,1.0,0.6,1.0,0.6,6.0,1.0,1.0,1.0,6.0,1.0
9,Flying,1.0,1.0,1.0,0.6,6.0,1.0,6.0,1.0,1.0,1.0,1.0,6.0,0.6,1.0,1.0,1.0,0.6,1.0
