In [79]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [9]:
# Load the Pokemon attribute table (indexed by its 'pid' column) and the combat records
pokemon_df = pd.read_csv('pokemon.csv', index_col='pid')
combats_df = pd.read_csv('combats.csv')

### Data Preparation

##### Standardization, Normalization & Dummies

In [2]:
def standardize(x):
    """Center `x` on its own mean and scale it by its own standard deviation.

    `x` is anything exposing `.mean()` and `.std()` and supporting arithmetic
    (e.g. a pandas Series); the result has the same shape as `x`.
    """
    centered = x - x.mean()
    spread = x.std()
    return centered / spread

def normalize(x):
    """Rescale `x` linearly so its minimum maps to 0 and its maximum to 1.

    `x` is anything exposing `.min()` and `.max()` and supporting arithmetic
    (e.g. a pandas Series); the result has the same shape as `x`.
    """
    lowest = x.min()
    value_range = x.max() - lowest
    return (x - lowest) / value_range

In [32]:
# --- Vectorize every Pokemon ---------------------------------------------
# One-hot encode both type columns and add them, so each Pokemon gets a
# non-zero entry in the column of each of its types. `dtype=int` keeps the sum
# numeric, and `fill_value=0` prevents an all-NaN column for a type that occurs
# in only one of the two class columns.
types_dummies_df = pd.get_dummies(pokemon_df['Class 1'], dtype=int).add(
    pd.get_dummies(pokemon_df['Class 2'], dtype=int), fill_value=0)
stats_df = pokemon_df[['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']]
legendary_dummies_df = pokemon_df['Legendary'].astype(int)

# Standardize every feature column (types, stats and the legendary flag alike)
vectorized_pokemon_df = pd.concat([types_dummies_df, stats_df, legendary_dummies_df], axis=1)
vectorized_pokemon_df = vectorized_pokemon_df.apply(standardize)

# --- Augment the combats ---------------------------------------------------
# Create a copy of combats with the two contestants swapped ('Winner' is kept)
combats_reversed_df = combats_df.copy()
combats_reversed_df['First_pokemon'] = combats_df['Second_pokemon']
combats_reversed_df['Second_pokemon'] = combats_df['First_pokemon']
# Shift the index so the reversed rows do not collide with the original ones
combats_reversed_df.index += combats_df.shape[0]

# Create an augmented training dataset using both combats in normal and reversed forms.
# NOTE(review): every combat now appears twice (once per order), so a random
# row-wise split can place a combat in train and its mirror image in test.
augmented_combats_df = pd.concat([combats_df, combats_reversed_df])

# Change labels into [0, 1]: 0 when the first Pokemon won, 1 otherwise
augmented_combats_df['w'] = np.where(augmented_combats_df['Winner'] == augmented_combats_df['First_pokemon'], 0, 1)

# --- Vectorize combats -------------------------------------------------------
# Suffix the feature columns so both contestants' features can sit side by side,
# then restore the join key name on each side.
left_vectorized_pokemon_df = vectorized_pokemon_df.reset_index().rename(columns=lambda c: c + '_left').rename(columns={'pid_left': 'First_pokemon'})
right_vectorized_pokemon_df = vectorized_pokemon_df.reset_index().rename(columns=lambda c: c + '_right').rename(columns={'pid_right': 'Second_pokemon'})
vectorized_combats_df = augmented_combats_df.merge(left_vectorized_pokemon_df, how='left', on='First_pokemon')
vectorized_combats_df = vectorized_combats_df.merge(right_vectorized_pokemon_df, how='left', on='Second_pokemon')
vectorized_combats_df = vectorized_combats_df.drop(columns=['Winner', 'First_pokemon', 'Second_pokemon'])

# Create x and y as numpy arrays, so positional indexing downstream cannot be
# confused with pandas label-based lookups
x = vectorized_combats_df.drop(columns=['w']).values
y = vectorized_combats_df['w'].values

##### Test / Train Split

In [36]:
def test_train_split(x, y, train_ratio):
    
    # Generate split index at correct ratio
    n = y.shape[0]
    split = int(n * train_ratio)
    
    # Random permutation of all indices
    random_indices = np.random.permutation(n)
    
    # Separate indices according to split
    i_tr = random_indices[:split]
    i_te = random_indices[split:]
    
    x_tr = x[i_tr]
    x_te = x[i_te]
    y_tr = y[i_tr]
    y_te = y[i_te]
    
    # Return split
    return x_tr, x_te, y_tr, y_te

In [37]:
# Keep 90% of the rows for training and hold out the rest for testing
x_tr, x_te, y_tr, y_te = test_train_split(x, y, train_ratio=0.9)

In [42]:
tuple(part.shape for part in (x_tr, x_te, y_tr, y_te))

((90000, 50), (10000, 50), (90000,), (10000,))

##### Cross Validation Split

In [54]:
def build_k_indices(y, k_fold, seed):
    '''
    Shuffle the row positions 0 .. len(y) - 1 and cut them into `k_fold`
    disjoint folds of equal size for cross-validation.

    Each fold holds int(len(y) / k_fold) positions; positions left over
    after the last fold are not used. NumPy's global random generator is
    seeded with `seed` as a side effect.

    Returns a 2-D array with one row of positions per fold.
    '''

    n_rows = y.shape[0]
    fold_size = int(n_rows / k_fold)
    np.random.seed(seed)

    # Shuffle every row position once
    shuffled = np.random.permutation(n_rows)

    # Consecutive, non-overlapping windows of the shuffled positions
    bounds = [(fold * fold_size, (fold + 1) * fold_size) for fold in range(k_fold)]
    folds = [shuffled[start:stop] for start, stop in bounds]

    return np.array(folds)

def cross_data(x, y, k_indices, k):
    '''
    Given a feature set, label set, and k disjoint lists of indices,
    return the k-th partition (training + test for both features and labels).

    Fold `k` of `k_indices` becomes the test part; all remaining folds are
    concatenated into the training part. `k_indices` holds row positions, so
    pandas inputs are indexed positionally rather than by index label.

    Returns x_tr, x_te, y_tr, y_te.
    '''

    def take_rows(data, positions):
        # pandas objects must be indexed by position; plain `data[positions]`
        # would be a label lookup and fail on a non-default index.
        return data.iloc[positions] if hasattr(data, 'iloc') else data[positions]

    k_indices = np.asarray(k_indices)

    # k'th fold goes to test, every other fold goes to train (computed once
    # and shared by features and labels)
    test_positions = k_indices[k]
    train_positions = k_indices[np.arange(len(k_indices)) != k].flatten()

    x_tr = take_rows(x, train_positions)
    x_te = take_rows(x, test_positions)
    y_tr = take_rows(y, train_positions)
    y_te = take_rows(y, test_positions)

    return x_tr, x_te, y_tr, y_te

def cross_evaluate(x, y, k, evaluate, seed=0):
    '''
    Calculate a given metric using cross-validation over k folds.

    x, y     : features and labels, indexable by arrays of row positions.
    k        : number of folds.
    evaluate : callable `evaluate(x_tr, x_te, y_tr, y_te)` returning a
               numeric score for one train/test split.
    seed     : seed forwarded to `build_k_indices` for the fold shuffle
               (that helper seeds NumPy's global random generator).

    Returns the mean of the k per-fold scores.
    '''

    # Build the fold indices once; `cross_data` expects this array, not the
    # fold count itself.
    k_indices = build_k_indices(y, k, seed)

    results = 0

    for i in range(0, k):

        # Fold i is the test part, all other folds form the training part
        x_tr, x_te, y_tr, y_te = cross_data(x, y, k_indices, i)

        # Accumulate this fold's contribution to the mean score
        results += evaluate(x_tr, x_te, y_tr, y_te) / k

    return results

In [55]:
# Shuffle the rows into 5 cross-validation folds (fixed seed)
k_indices = build_k_indices(y, k_fold=5, seed=0)

# Fold 0 is the test part and the other four folds the training part (80/20 split)
x_tr, x_te, y_tr, y_te = cross_data(x, y, k_indices, k=0)

In [56]:
tuple(part.shape for part in (x_tr, x_te, y_tr, y_te))

((80000, 50), (20000, 50), (80000,), (20000,))

### Classifiers

##### Random Forest

In [67]:
# Fit a 100-tree forest (depth capped at 10) on the training part ...
rfc = RandomForestClassifier(max_depth=10, n_estimators=100).fit(x_tr, y_tr)

# ... and report its mean accuracy on the test part
rfc.score(x_te, y_te)

0.91025

##### Logistic Regression

In [65]:
# Fit a logistic regression (L-BFGS solver) on the training part ...
cls = LogisticRegression(solver='lbfgs').fit(x_tr, y_tr)

# ... and report its mean accuracy on the test part
cls.score(x_te, y_te)

0.8858

### Feature Selection

##### Greedy Backward Selection

In [6]:
def greedy_backward_selection(x, y, evaluate):
    """Greedily drop one feature per step, keeping the best-scoring subset.

    Step 0 scores the full feature set. Every later step starts from the
    subset kept by the previous step, tries removing each remaining feature
    in turn, and keeps the removal whose remaining subset scores highest
    (on ties the first candidate tried wins). Progress is printed per step.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table, one column per feature.
    y : array-like
        Labels; passed to `evaluate` unchanged.
    evaluate : callable
        `evaluate(x_subset, y)` returning a score where higher is better.

    Returns
    -------
    scores : pandas.DataFrame
        One row per step with a single float column 'score'.
    feature_masks : pandas.DataFrame
        One row per step and one boolean column per feature of `x`;
        True marks the features kept at that step.
    """

    n = x.shape[1]
    columns = x.columns

    # Plain arrays are filled during the search and wrapped in DataFrames at
    # the end; this avoids chained `.loc[...][...] = ...` assignments, which
    # pandas does not guarantee to write back to the frame.
    masks = np.zeros((n, n), dtype=bool)
    step_scores = np.zeros(n, dtype=float)

    for step in range(0, n):

        if step == 0:

            # Initial step: every feature is selected
            best_mask = np.ones(n, dtype=bool)
            best_score = evaluate(x, y)

        else:

            previous_mask = masks[step - 1]
            best_mask, best_score = None, None

            for position in np.flatnonzero(previous_mask):

                # All previously selected features except the candidate one
                candidate_mask = previous_mask.copy()
                candidate_mask[position] = False

                # Compute score with the reduced set of features
                score = evaluate(x.loc[:, candidate_mask], y)

                # Always accept the first candidate so that zero or negative
                # scores still yield a selection; afterwards only a strictly
                # better score replaces the current best.
                if best_mask is None or score > best_score:
                    best_mask, best_score = candidate_mask, score

        masks[step] = best_mask
        step_scores[step] = best_score

        print('step: {} - score: {} - features: {}'.format(step, best_score, list(columns[best_mask])))

    scores = pd.DataFrame({'score': step_scores})
    feature_masks = pd.DataFrame(masks, columns=columns)

    return scores, feature_masks

In [7]:
def evaluate_rfc(x_tr, x_te, y_tr, y_te):
    """Fit a small random forest (10 trees, depth 2) on the training part
    and return its `score` on the test part."""
    forest = RandomForestClassifier(max_depth=2, n_estimators=10)
    forest.fit(x_tr, y_tr)
    return forest.score(x_te, y_te)

# `x` is a plain NumPy array and the pokemon id columns were already dropped
# when the combats were vectorized, so feature selection runs on the named
# feature columns of `vectorized_combats_df` instead.
features_df = vectorized_combats_df.drop(columns=['w'])

# Score each candidate subset by 5-fold cross-validation; the subset is handed
# on as an array because the cross-validation helpers index rows positionally.
scores, feature_masks = greedy_backward_selection(
    features_df,
    y,
    lambda x_subset, y_all: cross_evaluate(x_subset.values, y_all, 5, evaluate_rfc),
)

In [None]:
# Score kept at each backward-selection step
scores.plot(kind='line')

##### Feature Importance

In [None]:
# `x` is a plain NumPy array without column labels, so take the feature names
# from the frame it was built from (same columns, same order).
feature_names = vectorized_combats_df.drop(columns=['w']).columns
pd.DataFrame(rfc.feature_importances_, index=feature_names, columns=['feature_score'])

### Analysis of Results

- $\displaystyle\text{Precision} = \frac{\text{TP}}{\text{TP}\;+\;\text{FP}}$


- $\displaystyle\text{Recall} = \frac{\text{TP}}{\text{TP} + \text{FN}}$


- $\displaystyle\text{Accuracy} = \frac{\text{TP} + \text{TN}}{\text{TP} + \text{TN} + \text{FP} + \text{FN}}$


- $\displaystyle\text{F1-Score} = 2 \cdot \frac{\text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}$

In [73]:
def cm_item(y_te, y_pred, actual, pred):
    """Count the samples whose true label equals `actual` and whose predicted
    label equals `pred`."""
    is_actual = y_te == actual
    is_predicted = y_pred == pred
    return int((is_actual & is_predicted).sum())

def confusion_matrix(y_te, y_pred):
    """Build the 2x2 confusion matrix for labels 0/1 as nested lists.

    Rows are indexed by the predicted label and columns by the actual label,
    both in the order (1, 0), giving [[TP, FP], [FN, TN]].
    """
    labels = (1, 0)
    return [
        [cm_item(y_te, y_pred, actual=actual, pred=pred) for actual in labels]
        for pred in labels
    ]

# Accessors for the cells of a confusion matrix laid out as [[TP, FP], [FN, TN]]
def tp(m):
    """True positives: predicted 1, actual 1."""
    return m[0][0]


def fp(m):
    """False positives: predicted 1, actual 0."""
    return m[0][1]


def fn(m):
    """False negatives: predicted 0, actual 1."""
    return m[1][0]


def tn(m):
    """True negatives: predicted 0, actual 0."""
    return m[1][1]

In [74]:
def precision(m, pos=True):
    """Precision computed from a confusion matrix `m` = [[TP, FP], [FN, TN]].

    With `pos=True` this is TP / (TP + FP). With `pos=False` the roles of the
    two classes are swapped: TN / (TN + FN).

    Returns 0.0 when the class was never predicted (zero denominator) instead
    of raising ZeroDivisionError.
    """
    if pos:
        correct, predicted = tp(m), tp(m) + fp(m)
    else:
        correct, predicted = tn(m), tn(m) + fn(m)

    return correct / predicted if predicted else 0.0

def recall(m, pos=True):
    """Recall computed from a confusion matrix `m` = [[TP, FP], [FN, TN]].

    With `pos=True` this is TP / (TP + FN). With `pos=False` the roles of the
    two classes are swapped: TN / (TN + FP).

    Returns 0.0 when the class never occurs in the true labels (zero
    denominator) instead of raising ZeroDivisionError.
    """
    if pos:
        correct, occurring = tp(m), tp(m) + fn(m)
    else:
        correct, occurring = tn(m), tn(m) + fp(m)

    return correct / occurring if occurring else 0.0

def accuracy(m, pos=True):
    """Accuracy computed from a confusion matrix `m` = [[TP, FP], [FN, TN]]:
    (TP + TN) divided by the sum of all four cells.

    `pos` is accepted only so the signature matches the other metrics; it is
    not used. Returns 0.0 for an all-zero matrix instead of dividing by zero.
    """
    total = np.sum(m)
    return (tp(m) + tn(m)) / total if total else 0.0

def f1_score(m, pos=True):
    """F1 score (harmonic mean of precision and recall) computed from a
    confusion matrix `m`; `pos` is forwarded to `precision` and `recall`.

    Returns 0.0 when precision and recall are both zero instead of raising
    ZeroDivisionError.
    """
    # Evaluate each metric once rather than twice
    p = precision(m, pos)
    r = recall(m, pos)

    return 2 * (p * r) / (p + r) if (p + r) else 0.0

In [82]:
# Refit the logistic regression on the training part and predict the test part
y_pred = cls.fit(x_tr, y_tr).predict(x_te)

# Build the confusion matrix once and reuse it for every metric
cm = confusion_matrix(y_te, y_pred)

precision(cm)
recall(cm)
accuracy(cm)
f1_score(cm)

0.8858397365532382

0.8861934710991315

0.8858

0.8860165685198125