# Pokemòn

## Imports

In [85]:
import pandas as pd
from tkinter import Grid
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier, StackingClassifier
from matplotlib import pyplot as plt 
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score
import missingno as msno
from pandas_profiling import ProfileReport
import pickle
from sklearn.linear_model import Lasso, Ridge
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
import time

## Import Data

In [5]:
pokemon = pd.read_csv('pokemon.csv')
combats = pd.read_csv('combats.csv')

## Basic EDA

### Pokemon EDA
Reference table for all Pokemon

In [6]:
pokemon.head()

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False
3,4,Mega Venusaur,Grass,Poison,80,100,123,122,120,80,1,False
4,5,Charmander,Fire,,39,52,43,60,50,65,1,False


In [7]:
pokemon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   #           800 non-null    int64 
 1   Name        800 non-null    object
 2   Type 1      800 non-null    object
 3   Type 2      414 non-null    object
 4   HP          800 non-null    int64 
 5   Attack      800 non-null    int64 
 6   Defense     800 non-null    int64 
 7   Sp. Atk     800 non-null    int64 
 8   Sp. Def     800 non-null    int64 
 9   Speed       800 non-null    int64 
 10  Generation  800 non-null    int64 
 11  Legendary   800 non-null    bool  
dtypes: bool(1), int64(8), object(3)
memory usage: 69.7+ KB


In [None]:
profile = ProfileReport(pokemon, title='Pokemon')
profile.to_notebook_iframe()

### Combats

In [None]:
profile_com = ProfileReport(combats, title='Pokemon Combats')
profile_com.to_notebook_iframe()

In [8]:
combats['First_pokemon'] = combats['First_pokemon'].astype('int16')
combats['Second_pokemon'] = combats['Second_pokemon'].astype('int16')
combats['first_wins'] = combats['First_pokemon'] == combats['Winner']
# combats['first_wins'] = combats['first_wins'].map({True:1, False:0})
# combats['first_wins'] = combats['first_wins'].astype(np.float32)
combats['first_wins'] = combats['first_wins'].astype('category')
combats = combats.drop('Winner', axis=1)

In [9]:
combats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   First_pokemon   50000 non-null  int16   
 1   Second_pokemon  50000 non-null  int16   
 2   first_wins      50000 non-null  category
dtypes: category(1), int16(2)
memory usage: 244.4 KB


## Impute and Prepare Data

In [10]:
def prepare_pokemon(poke, encode=True):
    poke = poke.drop(['Legendary', 'Name', 'Type 2', 'Generation'], axis=1)\
                .rename(columns={
                            'Type 1':'Type',
                             'Sp. Atk':'sp_atk',
                             'Sp. Def':'sp_def',
                             '#':'id'}
                       )

    poke['HP'] = poke['HP'].astype('int16')
    poke['Attack'] = poke['Attack'].astype('int16')
    poke['Defense'] = poke['Defense'].astype('int16')
    poke['sp_atk'] = poke['sp_atk'].astype('int16')
    poke['sp_def'] = poke['sp_def'].astype('int16')
    poke['Speed'] = poke['Speed'].astype('int16')
    poke['Type'] = poke['Type'].astype('category')
    poke['id'] = poke['id'].astype('int16')

    if encode:
        poke = pd.get_dummies(poke, drop_first=False)
    return poke

In [11]:
pokemon1 = prepare_pokemon(pokemon)
pokemon1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 25 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   id             800 non-null    int16
 1   HP             800 non-null    int16
 2   Attack         800 non-null    int16
 3   Defense        800 non-null    int16
 4   sp_atk         800 non-null    int16
 5   sp_def         800 non-null    int16
 6   Speed          800 non-null    int16
 7   Type_Bug       800 non-null    uint8
 8   Type_Dark      800 non-null    uint8
 9   Type_Dragon    800 non-null    uint8
 10  Type_Electric  800 non-null    uint8
 11  Type_Fairy     800 non-null    uint8
 12  Type_Fighting  800 non-null    uint8
 13  Type_Fire      800 non-null    uint8
 14  Type_Flying    800 non-null    uint8
 15  Type_Ghost     800 non-null    uint8
 16  Type_Grass     800 non-null    uint8
 17  Type_Ground    800 non-null    uint8
 18  Type_Ice       800 non-null    uint8
 19  Type_Nor

## Join data frames together

In [12]:
# Join the combat df to pokemon reference table
# Renmed columns afterwards with underscore #
df = combats.merge(pokemon1, left_on='First_pokemon', right_on='id')
df = df.rename(columns={col:col+'_1' for col in pokemon1.columns})

df = df.merge(pokemon1, left_on='Second_pokemon', right_on='id')
df = df.rename(columns={col:col+'_2' for col in pokemon1.columns})

In [13]:
# Drop extra columns from join
df = df.drop(['First_pokemon', 'Second_pokemon', 'id_1', 'id_2'], axis=1)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 0 to 49999
Data columns (total 49 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   first_wins       50000 non-null  category
 1   HP_1             50000 non-null  int16   
 2   Attack_1         50000 non-null  int16   
 3   Defense_1        50000 non-null  int16   
 4   sp_atk_1         50000 non-null  int16   
 5   sp_def_1         50000 non-null  int16   
 6   Speed_1          50000 non-null  int16   
 7   Type_Bug_1       50000 non-null  uint8   
 8   Type_Dark_1      50000 non-null  uint8   
 9   Type_Dragon_1    50000 non-null  uint8   
 10  Type_Electric_1  50000 non-null  uint8   
 11  Type_Fairy_1     50000 non-null  uint8   
 12  Type_Fighting_1  50000 non-null  uint8   
 13  Type_Fire_1      50000 non-null  uint8   
 14  Type_Flying_1    50000 non-null  uint8   
 15  Type_Ghost_1     50000 non-null  uint8   
 16  Type_Grass_1     50000 non-null  uint8  

## Split Data


In [15]:
X = df.drop(['first_wins'], axis=1)
y = df['first_wins']

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

In [17]:
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

## Setup Models

In [150]:
# Setup ML Pipelines
pipelines = {
    'rf' : make_pipeline(StandardScaler(), RandomForestClassifier(random_state=1234)), 
    'gb' : make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=1234)),
    'ad' : make_pipeline(StandardScaler(), AdaBoostClassifier(random_state=1234)),
    'et' : make_pipeline(StandardScaler(), ExtraTreesClassifier(random_state=1234)),
    'bg' : make_pipeline(StandardScaler(), BaggingClassifier(random_state=1234)),
    'xg' : make_pipeline(StandardScaler(), xgb.XGBClassifier(random_state=1234)),
    'lx' : make_pipeline(StandardScaler(), lgb.LGBMClassifier(random_state=1234)),
    'ca' : make_pipeline(StandardScaler(), CatBoostClassifier(random_state=1234, silent=True))
}

In [151]:
# See params
BaggingClassifier().get_params().keys()

dict_keys(['base_estimator', 'bootstrap', 'bootstrap_features', 'max_features', 'max_samples', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

In [152]:
grid = {
    'rf': {
        'randomforestclassifier__n_estimators': [100,200,300]
    },
    'gb': {
        'gradientboostingclassifier__n_estimators': [100, 200, 300]
    },
    'ad': {
        'adaboostclassifier__n_estimators': [100, 200, 300]
    },
    'et': {
        'extratreesclassifier__n_estimators': [100, 200, 300]
    },
    'bg': {
        'baggingclassifier__n_estimators': [100, 200, 300]
    },
    'xg': {
        'xgbclassifier__n_estimators': [100, 200, 300], 'verbose': [0]
    },
    'lx': {
        'lgbmclassifier__n_estimators': [100, 200, 300]
    },
    'ca': {
        'catboostclassifier__n_estimators': [100, 200, 300]
    }
}

## Train Models

In [153]:
# Do cross validation and tuning
fit_models = {}
for algo, pipeline in pipelines.items():
    start = time.time()
    print(f'Training the {algo} model')
    model = GridSearchCV(pipeline, grid[algo], n_jobs=-1, cv=10)
    model.fit(X_train, Y_train)
    fit_models[algo] = model
    end = time.time()
    total = end - start
    print(f'{algo} took {total:.3} seconds to train')

Training the rf model
rf took 41.50921630859375 seconds to train
Training the gb model
gb took 45.3526873588562 seconds to train
Training the ad model
ad took 32.75372791290283 seconds to train
Training the et model
et took 50.17235803604126 seconds to train
Training the bg model
bg took 211.05249118804932 seconds to train
Training the xg model




xg took 39.75634503364563 seconds to train
Training the lx model
lx took 3.511533498764038 seconds to train
Training the ca model
ca took 47.36176133155823 seconds to train


In [154]:
# Evaluate performance of models
for algo, model in fit_models.items():
    start = time.time()
    yhat = model.predict(X_test)
    acc = accuracy_score(Y_test, yhat)
    recall = recall_score(Y_test, yhat)
    prec = precision_score(Y_test, yhat)
    end = time.time()
    total = end - start
    print(f'Metrics for {algo}: Accuracy={acc:.5f}, Recall={recall:.5f}, Precision={prec:.5f} and predicted in {total:.3} sec')

Metrics for rf: Accuracy=0.94060, Recall=0.94073, Precision=0.93345 and predicted in 0.601686954498291 sec
Metrics for gb: Accuracy=0.94133, Recall=0.93661, Precision=0.93848 and predicted in 0.05900907516479492 sec
Metrics for ad: Accuracy=0.85913, Recall=0.84359, Precision=0.85486 and predicted in 0.38013219833374023 sec
Metrics for et: Accuracy=0.92787, Recall=0.92201, Precision=0.92437 and predicted in 0.492215633392334 sec
Metrics for bg: Accuracy=0.95960, Recall=0.96058, Precision=0.95382 and predicted in 0.6316523551940918 sec
Metrics for xg: Accuracy=0.96293, Recall=0.96058, Precision=0.96058 and predicted in 0.02541518211364746 sec
Metrics for lx: Accuracy=0.96227, Recall=0.96157, Precision=0.95831 and predicted in 0.03523755073547363 sec
Metrics for ca: Accuracy=0.95967, Recall=0.95916, Precision=0.95523 and predicted in 0.01785731315612793 sec


In [155]:
draw_tree(fit_models['rf'], X_train, size=10, leaves_parallel=True, precision=2)

NameError: name 'draw_tree' is not defined

## Save Best Model

In [252]:
with open('model.pkl', 'wb') as f:
    pickle.dump(fit_models['gb'], f)

# Use Best Model to predict

In [253]:
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

In [287]:
test_df = pd.read_csv('tests.csv')

# Prepare test data

In [288]:
test_df.head()

Unnamed: 0,First_pokemon,Second_pokemon
0,129,117
1,660,211
2,706,115
3,195,618
4,27,656


In [262]:
tests = test_df.merge(pokemon1, left_on='First_pokemon', right_on='id')
tests = tests.rename(columns={col:col+'_1' for col in pokemon1.columns})
tests = tests.merge(pokemon1, left_on='Second_pokemon', right_on='id')
tests = tests.rename(columns={col:col+'_2' for col in pokemon1.columns})

In [265]:
# Drop extra columns from join
tests = tests.drop(['First_pokemon', 'Second_pokemon', 'id_1', 'id_2'], axis=1)

In [267]:
preds = model.predict(tests)

In [291]:
submission = test_df.copy()

In [292]:
submission['first_wins'] = preds

In [293]:
submission

Unnamed: 0,First_pokemon,Second_pokemon,first_wins
0,129,117,True
1,660,211,True
2,706,115,True
3,195,618,True
4,27,656,True
...,...,...,...
9995,216,498,False
9996,113,404,True
9997,493,104,False
9998,643,259,False


## Try Voting Classifier

In [26]:
clf1 = RandomForestClassifier(random_state=1234)
clf2 = GradientBoostingClassifier(random_state=1234)
clf3 = AdaBoostClassifier(random_state=1234)
clf4 = ExtraTreesClassifier(random_state=1234)
clf5 = BaggingClassifier(random_state=1234)

In [27]:
eclf1 = VotingClassifier(estimators=[('rf', clf1), ('gb', clf2), ('ad', clf3), ('et', clf4), ('bg', clf5)], voting='hard')

In [28]:
eclf1 = eclf1.fit(X_train, Y_train)

In [29]:
# Evaluate performance of models
yhat = eclf1.predict(X_test)
acc = accuracy_score(Y_test, yhat)
recall = recall_score(Y_test, yhat)
prec = precision_score(Y_test, yhat)
print(f'Metrics for Voting Classifier: Accuracy={acc}, Recall={recall}, Precision={prec}')

Metrics for Voting Classifier: Accuracy=0.9393333333333334, Recall=0.9381735677821894, Precision=0.9331452750352609
