# Game of Thrones - Battle Prediction
Game of Thrones is a popular fantasy TV show based on a series of books written by George RR Martin.

This notebook showcases the analysis and predictions of the battles in the series.

Load packages

In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
import seaborn as sns
from pylab import rcParams
from collections import Counter
from time import time
from pandas_profiling import ProfileReport
from IPython.display import display
# Import supplementary visualization code visuals.py
import visuals as vs
rcParams['figure.figsize'] = 5, 6
plt.style.use('ggplot')

In [None]:
# @hidden_cell
import warnings
warnings.filterwarnings("ignore")

Load dataset

In [None]:
# Read the raw battles table and preview the first rows.
raw_battles_path = '../../data/battles.csv'
battles_df = pd.read_csv(raw_battles_path)
battles_df.head()


In reviewing the other kernels to see what has been done, [one particular kernel](https://www.kaggle.com/chrisbuetti/predicting-battle-outcomes-start-rmd) on Kaggle pointed out the data entry mistake on the Battle of Castle Black. Having watched the TV series, I know for a fact that Mance Rayder has 100K wildlings and Stannis Baratheon has 1,240 troops. I flipped the names in the dataset. This should be a major callout to anyone using this dataset.

# Data Cleaning

Change `attacker_outcome` to boolean and fill nan with zeros for `major_death`,	`major_capture`, `summer`.

In [None]:
# Encode the outcome as 1 (win) / 0 (loss); values outside the mapping become NaN here.
outcome_codes = {'win': 1, 'loss': 0}
battles_df['attacker_outcome_flag'] = battles_df['attacker_outcome'].map(outcome_codes)

# Missing flags are replaced by 0 (note: a missing outcome is therefore counted as a loss).
for flag_col in ['attacker_outcome_flag', 'major_death', 'major_capture', 'summer']:
    battles_df[flag_col] = battles_df[flag_col].fillna(0)

battles_df[['attacker_outcome','major_death','major_capture','summer']]


### Run pandas profiler for fast EDA

In [None]:
# Build the profiling report; the bare name on the last line renders it in the notebook.
profile = ProfileReport(
    battles_df,
    title='Game of Thrones Battles - Pandas Profiling Report',
    style={'full_width': True},
)
profile

In [None]:
# Save the report built in the previous cell as an HTML file.
profile.to_file(output_file="got_battles_data_profile.html")

The columns `attacker_2`,`attacker_3`,`attacker_4`,`defender_2`,`defender_3`,`defender_4`, `attacker_commander`, `defender_commander` can be used to count the number of houses involved in the battle. 

Five columns will be created:   

* Number of attacking houses
* Number of defending houses 
* Number of attacker_commander	
* Number of defender_commander
* Battle Size


In [None]:
# A house slot that is not null counts as one participating house.
attacker_house_cols = ['attacker_1','attacker_2','attacker_3','attacker_4']
defender_house_cols = ['defender_1','defender_2','defender_3','defender_4']

battles_df['attack_houses'] = battles_df[attacker_house_cols].notnull().sum(axis=1)
battles_df['attack_houses'] = pd.to_numeric(battles_df['attack_houses'])

battles_df['defender_houses'] = battles_df[defender_house_cols].notnull().sum(axis=1)
battles_df['defender_houses'] = pd.to_numeric(battles_df['defender_houses'])

# Check data: battles with the most houses on either side come first.
check_cols = attacker_house_cols + ['attack_houses'] + defender_house_cols + ['defender_houses']
battles_df[check_cols].sort_values(by=['attack_houses','defender_houses'], ascending=[False,False])

Count occurrences of `attacker_commander` and `defender_commander`

In [None]:
# Preview: one column per comma-separated attacker commander name.
battles_df['attacker_commander'].str.split(',', expand=True)

In [None]:
# Split the comma-separated names into columns, then count the non-null cells per row.
attacker_commander_parts = battles_df['attacker_commander'].str.split(',', expand=True)
battles_df['attacker_commander_count'] = attacker_commander_parts.notnull().sum(axis=1)
battles_df[['attacker_commander','attacker_commander_count']]

In [None]:
# Preview: one column per comma-separated defender commander name.
battles_df['defender_commander'].str.split(',', expand=True)

In [None]:
# Split the comma-separated names into columns, then count the non-null cells per row.
defender_commander_parts = battles_df['defender_commander'].str.split(',', expand=True)
battles_df['defender_commander_count'] = defender_commander_parts.notnull().sum(axis=1)
battles_df[['defender_commander','defender_commander_count']]

Drop columns with missing data

In [None]:
# The secondary house slots have been summarised into counts above; drop them
# together with the row id and the free-text note.
cols_to_drop = [
    'battle_number',
    'attacker_2', 'attacker_3', 'attacker_4',
    'defender_2', 'defender_3', 'defender_4',
    'note',
]
battles_df = battles_df.drop(columns=cols_to_drop)
battles_df.head()

Create `battle_size` for the total number of people involved in a battle.

In [None]:
# Total head count; stays NaN when either side's size is missing.
battles_df['battle_size'] = battles_df['attacker_size'].add(battles_df['defender_size'])
battles_df[['attacker_size','defender_size','battle_size']]

Plot correlation

In [None]:
# Pearson correlations between the numeric/boolean columns only: text columns
# (names, kings, regions, ...) cannot be correlated and make DataFrame.corr
# raise on recent pandas releases.
numeric_battles = battles_df.select_dtypes(include=['number', 'bool'])
corr_plot = (
    numeric_battles.corr(method='pearson')
    .style
    .set_caption('Correlation for Game of Thrones Battles')
    .background_gradient(cmap='coolwarm')
    # Styler.set_precision no longer exists in current pandas; an explicit
    # format string renders four decimals on old and new versions alike.
    .format('{:.4f}')
)
corr_plot

Run profiler again

In [None]:
# Rebuild the report now that the engineered columns exist, save it, and end
# the cell with the bare name so the report is actually rendered (an
# expression in the middle of a cell is never displayed).
profile = ProfileReport(battles_df, title='Game of Thrones Battles - Pandas Profiling Report', style={'full_width':True})
profile.to_file(output_file="got_battles_data_profile.html")
profile


# Write function to clean battles dataset

In [None]:
def clean_battle_data(df):
    """Return a cleaned, feature-enriched copy of the raw battles table.

    The input frame is left untouched, so the function can be called
    repeatedly on the same raw data.

    Steps:
      * encode ``attacker_outcome`` as ``attacker_outcome_flag``
        (win -> 1, loss -> 0, anything else including missing -> 0);
      * replace missing values with 0 in ``major_death``, ``major_capture``,
        ``summer``, ``attacker_size`` and ``defender_size``;
      * count the non-null ``attacker_1..4`` / ``defender_1..4`` houses;
      * count the comma-separated commanders on each side;
      * drop ``battle_number``, the secondary house columns and ``note``;
      * add ``battle_size`` = ``attacker_size`` + ``defender_size``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw battles data containing the columns referenced above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns added and the dropped
        columns removed.
    """
    # Work on a copy: the column assignments below would otherwise add
    # columns to the caller's frame as a side effect.
    df = df.copy()

    df['attacker_outcome_flag'] = df['attacker_outcome'].map({'win': 1, 'loss': 0})

    # Fill NaN with zero
    zero_fill_cols = ['attacker_outcome_flag', 'major_death', 'major_capture',
                      'summer', 'attacker_size', 'defender_size']
    for col in zero_fill_cols:
        df[col] = df[col].fillna(0)

    # Number of houses per side = number of non-null house slots.
    # (notnull().sum() already yields integers, so no extra conversion is needed.)
    df['attack_houses'] = df[['attacker_1','attacker_2','attacker_3','attacker_4']].notnull().sum(axis=1)
    df['defender_houses'] = df[['defender_1','defender_2','defender_3','defender_4']].notnull().sum(axis=1)

    # Number of commanders per side = number of comma-separated names
    # (0 when the commander field is missing).
    for side in ('attacker', 'defender'):
        commander_parts = df[side + '_commander'].str.split(',', expand=True)
        df[side + '_commander_count'] = commander_parts.notnull().sum(axis=1)

    # Drop columns with missing data
    df = df.drop(columns = ['battle_number','attacker_2','attacker_3','attacker_4','defender_2','defender_3','defender_4','note'])

    # Create battle_size column; both sizes were zero-filled above, so the
    # sum cannot be NaN.
    df['battle_size'] = df['attacker_size'] + df['defender_size']

    return df


Test function

In [None]:
# Re-read the raw file and pass it straight through the cleaning function.
battles_df = clean_battle_data(pd.read_csv('../../data/battles.csv'))
battles_df.head()

Export clean data

In [None]:
# Write the cleaned frame to CSV without the index column.
battles_df.to_csv('../../data/battles_clean.csv', index = False)

# Exploratory Data Analysis

In [None]:
# Re-display the profiling report; relies on `profile` created in an earlier cell.
profile

### Using the Fast EDA (one dimension):

**`attacker_outcome`**  
32 battles out of 38 battles were won (84.2%).

**`attacker_king`**  
On the offense, Joffrey/Tommen Baratheon were the attacking kings 36.8% of the time (14 battles) while Mance Rayder was only the attacking king once (5.3%).

**`defender_king`**  
On the defense, Robb Stark has been attacked 36.8% of the times (14 battles) while Joffrey/Tommen Baratheon were second (34.2% or 13 battles). Renly Baratheon defended once (2.6%).

**`battle_type`**  
We can see that the most common `battle_type` is pitched battle, appearing 36.8% (14 times) while razing was only 5.3% (twice).

**`region`**    
Most of the battles were fought in The Riverlands (44.7% or 17 battles) while the second most battles fought were in The North (26.3% or 10 battles). There was only one battle Beyond The Wall (2.6%).

**`summer`**  
Most of the battles were fought in the summer (26 or 68.4%) while the remaining were fought in the winter (12 battles or 31.6%).

**`year`**  
Majority of the battles were fought in the year 299 (52.6% or 20 battles) and the second in the year 300 (28.9% or 11 battles). The remainder year 298 had only 7 battles (18.4%).










## Multiple dimensional view

We will examine using multiple variables to see how they play together.

* attacker_king vs attacker_outcome  
* defender_king vs attacker_outcome  
* battle_type vs attacker_outcome  
* summer vs battle_type vs attacker_outcome  
* battle_type vs attacker_king vs attacker_outcome  
* battle_type vs defender_king vs attacker_outcome  






In [None]:
# Per attacking king: number of battles, wins and mean army sizes, busiest king first.
df_grouped = (
    battles_df
    .groupby(by=['attacker_king'])
    .agg(
        attacker_outcome_flag_count=('attacker_outcome_flag', 'count'),
        attacker_outcome_wins=('attacker_outcome_flag', 'sum'),
        attacker_size_mean=('attacker_size', 'mean'),
        defender_size_mean=('defender_size', 'mean'),
    )
    .reset_index()
    .sort_values(by='attacker_outcome_flag_count', ascending=False)
)

# Losses and win/loss shares derived from the counts above.
n_battles = df_grouped['attacker_outcome_flag_count']
n_wins = df_grouped['attacker_outcome_wins']
df_grouped['attacker_outcome_loss'] = n_battles - n_wins
df_grouped['attacker_outcome_wins_pct'] = (n_wins / n_battles) * 100
df_grouped['attacker_outcome_loss_pct'] = 100 - df_grouped['attacker_outcome_wins_pct']
df_grouped


In [None]:
# Win vs. loss share for each attacking king.
df_grouped.plot.bar(x='attacker_king', y=['attacker_outcome_wins_pct','attacker_outcome_loss_pct'])

In [None]:
# Mean attacker and defender army size for each attacking king.
df_grouped.plot.bar(x='attacker_king', y=['attacker_size_mean','defender_size_mean'])

In [None]:
# Remove Mance Rayder so the remaining bars share a readable scale.
not_mance = df_grouped.attacker_king != 'Mance Rayder'
df_grouped.loc[not_mance, ['attacker_king','attacker_size_mean','defender_size_mean']].plot.bar(x='attacker_king')

## Battle year

In [None]:
# Battles, major deaths and major captures per year, in chronological order.
df_year = (
    battles_df
    .groupby(by=['year'])
    .agg(
        battles=('name', 'count'),
        major_death=('major_death', 'sum'),
        major_capture=('major_capture', 'sum'),
    )
    .reset_index()
    .sort_values(by='year', ascending=True)
)

df_year.plot.bar(x='year')
display(df_year)

## Battle vs Region

In [None]:
# Battles, major deaths and major captures per region, busiest region first.
df_region = (
    battles_df
    .groupby(by=['region'])
    .agg(
        battles_count=('name', 'count'),
        major_death=('major_death', 'sum'),
        major_capture=('major_capture', 'sum'),
    )
    .reset_index()
    .sort_values(by='battles_count', ascending=False)
)

df_region.plot.bar(x='region')
display(df_region)

## Battle_Type vs Kings

In [None]:
# Battles, major deaths and major captures per battle type, most common type first.
df_battle_type = (
    battles_df
    .groupby(by=['battle_type'])
    .agg(
        battles_count=('name', 'count'),
        major_death=('major_death', 'sum'),
        major_capture=('major_capture', 'sum'),
    )
    .reset_index()
    .sort_values(by='battles_count', ascending=False)
)

df_battle_type.plot.bar(x='battle_type')
display(df_battle_type)

In [None]:
# Number of battles per region.
battles_df['region'].value_counts().plot.bar()

In [None]:
# Number of battles per battle type.
battles_df['battle_type'].value_counts().plot.bar()

In [None]:
# Number of battles per primary attacking house.
battles_df['attacker_1'].value_counts().plot.bar()

In [None]:
# Number of battles per primary defending house.
battles_df['defender_1'].value_counts().plot.bar()


In [None]:
# Number of battles per value of the summer flag.
battles_df['summer'].value_counts().plot.bar()

In [None]:
# Distribution of attacker army sizes (20 bins).
battles_df['attacker_size'].hist(bins=20)

In [None]:
# Distribution of defender army sizes (20 bins).
battles_df['defender_size'].hist(bins=20)

In [None]:
# Distribution of the number of attacking houses per battle.
battles_df['attack_houses'].hist(bins=10)

In [None]:
# Distribution of the number of defending houses per battle.
battles_df['defender_houses'].hist(bins=10)

In [None]:
# Distribution of the number of attacker commanders per battle.
battles_df['attacker_commander_count'].hist(bins=10)

In [None]:
# Distribution of the number of defender commanders per battle.
battles_df['defender_commander_count'].hist(bins=10)

## battle_type vs attacker_outcome

In [None]:
# Outcome statistics per battle type, most common type first.
battle_type_stats = (
    battles_df
    .groupby(by=['battle_type'])
    .agg(
        battles_count=('name', 'count'),
        attacker_outcome_flag_count=('attacker_outcome_flag', 'count'),
        attacker_outcome_wins=('attacker_outcome_flag', 'sum'),
        attacker_size_mean=('attacker_size', 'mean'),
        defender_size_mean=('defender_size', 'mean'),
    )
    .reset_index()
    .sort_values(by='battles_count', ascending=False)
)

# Derived columns are evaluated in order, so the loss share can reuse the win share.
df_battle_type = battle_type_stats.assign(
    attacker_outcome_loss=lambda t: t['attacker_outcome_flag_count'] - t['attacker_outcome_wins'],
    attacker_outcome_wins_pct=lambda t: (t['attacker_outcome_wins'] / t['attacker_outcome_flag_count']) * 100,
    attacker_outcome_loss_pct=lambda t: 100 - t['attacker_outcome_wins_pct'],
)
df_battle_type

In [None]:
# Attacker win rate per battle type.
# catplot is the current name of factorplot (the old alias is gone from recent
# seaborn releases) and is what the rest of this notebook already uses.
sns.catplot(x="battle_type", y="attacker_outcome_wins_pct",
            aspect=0.8,
            kind="bar", data=df_battle_type)

## Summer vs battle_type vs attacker_outcome

In [None]:
# Outcome statistics per (summer, battle_type) combination.
# The outcome itself must NOT be a grouping key: grouping by attacker_outcome
# puts wins and losses into separate rows, so every win percentage would be
# exactly 0 or 100 and the plot would average those instead of showing a rate.
df_battle_type_summer = battles_df.groupby(by=['summer','battle_type']).agg(
    battles_count = ('name','count'),
    attacker_outcome_flag_count = ('attacker_outcome_flag','count'),
    attacker_outcome_wins = ('attacker_outcome_flag','sum'),
    attacker_size_mean = ('attacker_size', 'mean'),
    defender_size_mean = ('defender_size','mean')).reset_index().sort_values(by = 'battles_count', ascending = False)

df_battle_type_summer['attacker_outcome_loss'] = df_battle_type_summer['attacker_outcome_flag_count'] - df_battle_type_summer['attacker_outcome_wins']
df_battle_type_summer['attacker_outcome_wins_pct'] = (df_battle_type_summer['attacker_outcome_wins']/df_battle_type_summer['attacker_outcome_flag_count']) * 100
df_battle_type_summer['attacker_outcome_loss_pct'] = 100 - df_battle_type_summer['attacker_outcome_wins_pct']
df_battle_type_summer

In [None]:
# Attacker win rate per battle type, one panel per value of the summer flag.
# catplot is the current name of factorplot (the old alias is gone from recent
# seaborn releases) and is what the rest of this notebook already uses.
sns.catplot(x="battle_type", y="attacker_outcome_wins_pct",
            col="summer", aspect=1,
            kind="bar", data=df_battle_type_summer)

## battle_type vs. attacker_king

In [None]:
# Outcome statistics per (attacker_king, battle_type) pair, most frequent pair first.
df_battle_attacker_king = (
    battles_df
    .groupby(by=['attacker_king', 'battle_type'])
    .agg(
        battles_count=('name', 'count'),
        attacker_outcome_flag_count=('attacker_outcome_flag', 'count'),
        attacker_outcome_wins=('attacker_outcome_flag', 'sum'),
        attacker_size_mean=('attacker_size', 'mean'),
        defender_size_mean=('defender_size', 'mean'),
    )
    .reset_index()
    .sort_values(by='battles_count', ascending=False)
)

attacker_king_total = df_battle_attacker_king['attacker_outcome_flag_count']
attacker_king_wins = df_battle_attacker_king['attacker_outcome_wins']
df_battle_attacker_king['attacker_outcome_loss'] = attacker_king_total - attacker_king_wins
df_battle_attacker_king['attacker_outcome_wins_pct'] = (attacker_king_wins / attacker_king_total) * 100
df_battle_attacker_king['attacker_outcome_loss_pct'] = 100 - df_battle_attacker_king['attacker_outcome_wins_pct']
df_battle_attacker_king

In [None]:
# Attacker win rate per attacking king, one panel per battle type.
chart = sns.catplot(
    data=df_battle_attacker_king,
    kind="bar",
    x="attacker_king",
    y="attacker_outcome_wins_pct",
    col="battle_type",
    aspect=1,
)
chart.set_xticklabels(rotation=45, horizontalalignment='right')

## battle_type vs. defender_king

In [None]:
# Outcome statistics per (defender_king, battle_type) pair, most frequent pair first.
df_battle_defender_king = (
    battles_df
    .groupby(by=['defender_king', 'battle_type'])
    .agg(
        battles_count=('name', 'count'),
        attacker_outcome_flag_count=('attacker_outcome_flag', 'count'),
        attacker_outcome_wins=('attacker_outcome_flag', 'sum'),
        attacker_size_mean=('attacker_size', 'mean'),
        defender_size_mean=('defender_size', 'mean'),
    )
    .reset_index()
    .sort_values(by='battles_count', ascending=False)
)

defender_king_total = df_battle_defender_king['attacker_outcome_flag_count']
defender_king_wins = df_battle_defender_king['attacker_outcome_wins']
df_battle_defender_king['attacker_outcome_loss'] = defender_king_total - defender_king_wins
df_battle_defender_king['attacker_outcome_wins_pct'] = (defender_king_wins / defender_king_total) * 100
df_battle_defender_king['attacker_outcome_loss_pct'] = 100 - df_battle_defender_king['attacker_outcome_wins_pct']
df_battle_defender_king

In [None]:
# Attacker loss rate (i.e. successful defences) per defending king, one panel per battle type.
chart = sns.catplot(
    data=df_battle_defender_king,
    kind="bar",
    x="defender_king",
    y="attacker_outcome_loss_pct",
    col="battle_type",
    aspect=1,
)
chart.set_xticklabels(rotation=45, horizontalalignment='right')

## attacker_king vs defender_king

In [None]:
# Outcome statistics per (attacker_king, defender_king) match-up, most frequent first.
df_attack_defend = (
    battles_df
    .groupby(by=['attacker_king', 'defender_king'])
    .agg(
        battles_count=('name', 'count'),
        attacker_outcome_flag_count=('attacker_outcome_flag', 'count'),
        attacker_outcome_wins=('attacker_outcome_flag', 'sum'),
        attacker_size_mean=('attacker_size', 'mean'),
        defender_size_mean=('defender_size', 'mean'),
    )
    .reset_index()
    .sort_values(by='battles_count', ascending=False)
)

matchup_total = df_attack_defend['attacker_outcome_flag_count']
matchup_wins = df_attack_defend['attacker_outcome_wins']
df_attack_defend['attacker_outcome_loss'] = matchup_total - matchup_wins
df_attack_defend['attacker_outcome_wins_pct'] = (matchup_wins / matchup_total) * 100
df_attack_defend['attacker_outcome_loss_pct'] = 100 - df_attack_defend['attacker_outcome_wins_pct']
df_attack_defend

In [None]:
# Number of battles per attacking king, one panel per defending king.
chart = sns.catplot(
    data=df_attack_defend,
    kind="bar",
    x="attacker_king",
    y="battles_count",
    col="defender_king",
    aspect=1,
)
chart.set_xticklabels(rotation=45, horizontalalignment='right')

In [None]:
# Attacker loss rate per attacking king, one panel per defending king.
chart = sns.catplot(
    data=df_attack_defend,
    kind="bar",
    x="attacker_king",
    y="attacker_outcome_loss_pct",
    col="defender_king",
    aspect=1,
)
chart.set_xticklabels(rotation=45, horizontalalignment='right')


In [None]:
# Attacker win rate per attacking king, one panel per defending king.
chart = sns.catplot(
    data=df_attack_defend,
    kind="bar",
    x="attacker_king",
    y="attacker_outcome_wins_pct",
    col="defender_king",
    aspect=1,
)
chart.set_xticklabels(rotation=45, horizontalalignment='right')

## Battle Size 

In [None]:
# Attacker army size per attacking king, leaving out Mance Rayder.
battles_without_mance = battles_df[battles_df.attacker_king != 'Mance Rayder']
chart = sns.boxplot(x="attacker_king", y="attacker_size", data=battles_without_mance, palette="Set1")
chart.set_xticklabels(chart.get_xticklabels(), rotation=30)






In [None]:
# Same box plot restricted to battles the attacker won.
is_not_mance = battles_df.attacker_king != 'Mance Rayder'
is_win = battles_df.attacker_outcome == 'win'
chart = sns.boxplot(x="attacker_king", y="attacker_size", data=battles_df[is_not_mance & is_win], palette="Set1")
chart.set_xticklabels(chart.get_xticklabels(), rotation=30)

In [None]:
# Same box plot restricted to battles the attacker lost.
is_not_mance = battles_df.attacker_king != 'Mance Rayder'
is_loss = battles_df.attacker_outcome == 'loss'
chart = sns.boxplot(x="attacker_king", y="attacker_size", data=battles_df[is_not_mance & is_loss], palette="Set1")
chart.set_xticklabels(chart.get_xticklabels(), rotation=30)

# Start Modeling Here

I will build two kinds of machine learning models:

1) Regression Model for `attacker_size` vs `defender_size` to determine the army size a house will be fighting against

2) Classification Model - What factors determine the `attacker_outcome`?


## Regression Model

In [None]:
# Scatter plot with a fitted line for the two army sizes, plus their correlation table.
army_sizes = battles_df[['attacker_size','defender_size']]
sns.regplot(x='attacker_size', y='defender_size', data=battles_df)
display(army_sizes.corr())

In [None]:


# Keep battles where both sizes are positive and Mance Rayder is not the attacker.
is_not_mance = battles_df.attacker_king != 'Mance Rayder'
has_both_sizes = (battles_df.attacker_size > 0) & (battles_df.defender_size > 0)
battles_df1 = battles_df[is_not_mance & has_both_sizes]

size_pairs = battles_df1[['attacker_size','defender_size']]
sns.lmplot(x='attacker_size', y='defender_size', data=size_pairs,
           robust=True, ci=None, scatter_kws={"s": 95})
display(size_pairs.corr())


In [None]:
# As above, additionally trimming the largest armies (attacker < 14000, defender < 20000).
is_not_mance = battles_df.attacker_king != 'Mance Rayder'
has_both_sizes = (battles_df.attacker_size > 0) & (battles_df.defender_size > 0)
below_size_caps = (battles_df.attacker_size < 14000) & (battles_df.defender_size < 20000)
battles_df1 = battles_df[is_not_mance & has_both_sizes & below_size_caps]

size_pairs = battles_df1[['attacker_size','defender_size']]
sns.lmplot(x='attacker_size', y='defender_size', data=size_pairs,
           robust=True, ci=None, scatter_kws={"s": 80})
display(size_pairs.corr())

In [None]:
# Same trimmed subset, restricted to battles the attacker won.
is_not_mance = battles_df.attacker_king != 'Mance Rayder'
attacker_won = battles_df.attacker_outcome_flag == 1
has_both_sizes = (battles_df.attacker_size > 0) & (battles_df.defender_size > 0)
below_size_caps = (battles_df.attacker_size < 14000) & (battles_df.defender_size < 20000)
battles_df1 = battles_df[is_not_mance & attacker_won & has_both_sizes & below_size_caps]

size_pairs = battles_df1[['attacker_size','defender_size']]
sns.lmplot(x='attacker_size', y='defender_size', data=size_pairs,
           robust=True, ci=None, scatter_kws={"s": 80})
display(size_pairs.corr())

## Classification Modeling

In [None]:
 # Separate features from the target: drop the free-text columns, both
 # encodings of the outcome, and the derived battle_size column.
non_feature_cols = [
    'name', 'location',
    'attacker_commander', 'defender_commander',
    'attacker_outcome', 'attacker_outcome_flag',
    'battle_size',
]
model_df = battles_df.drop(columns=non_feature_cols)
# Target vector: 1 where the attacker won, 0 otherwise.
attacker_outcome = battles_df['attacker_outcome_flag']

In [None]:
# Inspect column dtypes to see which features are still text (object).
model_df.dtypes

In [None]:
# Preview the first ten feature columns.
model_df[model_df.columns[0:10]]

In [None]:
# Preview the remaining feature columns. Start at position 10 so the column
# right after the first ten is not skipped.
model_df[model_df.columns[10:]]

In [None]:
# Replace '/' and spaces with underscores in the category values.
slash_cols = ['attacker_king','defender_king','attacker_1','defender_1']
space_cols = ['attacker_king','defender_king','region','battle_type','attacker_1','defender_1']

model_df[slash_cols] = model_df[slash_cols].replace('/','_',regex=True)
model_df[space_cols] = model_df[space_cols].replace(' ','_',regex=True)
model_df[model_df.columns[:10]]

In [None]:
# Preview the feature columns from position 10 onwards.
model_df[model_df.columns[10:24]]

In [None]:
# Columns still stored as object dtype are the categorical features to encode.
categorical_feature_mask = (model_df.dtypes == object)
categorical_cols = list(model_df.columns[categorical_feature_mask])
categorical_cols

In [None]:
# One-hot encode every categorical column, prefixing dummies with the source column name.
model_df1 = pd.get_dummies(
    data=model_df,
    columns=categorical_cols,
    prefix=categorical_cols,
)
model_df1.head()

In [None]:
# List the column names after one-hot encoding.
model_df1.columns

## Log transform `defender_size` and `attacker_size`  


In [None]:
# Log-transform the skewed features.
# Work on an explicit copy: pd.DataFrame(data=model_df1) may share its data
# with model_df1, in which case the assignment below would overwrite the raw
# sizes and re-running this cell would apply the log a second time.
skewed = ['attacker_size', 'defender_size']
features_log_transformed = model_df1.copy()
# log(x + 1) keeps the zero-filled sizes finite.
features_log_transformed[skewed] = model_df1[skewed].apply(lambda x: np.log(x + 1))

In [None]:
# Distribution of attacker_size after the log transform.
features_log_transformed['attacker_size'].hist(bins=20)

In [None]:
# Distribution of defender_size after the log transform.
features_log_transformed['defender_size'].hist(bins=20)

## Normalizing Numerical Features

In [None]:

# Rescale the count columns and the (log-transformed) size columns with min-max scaling.
from sklearn.preprocessing import MinMaxScaler

numerical = ['attack_houses','defender_houses','attacker_commander_count','defender_commander_count','attacker_size','defender_size']
scaler = MinMaxScaler()  # default feature_range=(0, 1)

# Fit the scaler on the numerical columns and store the scaled values.
features_log_minmax_transform = pd.DataFrame(data=features_log_transformed)
scaled_values = scaler.fit_transform(features_log_transformed[numerical])
features_log_minmax_transform[numerical] = scaled_values

# Show an example of a record with scaling applied
display(features_log_minmax_transform.head(n=5))

In [None]:
# Use the min-max scaled frame as the final feature set. Referring to
# features_log_transformed here only worked when both frames happened to share
# their data; otherwise the scaling step above was silently discarded.
features_final = features_log_minmax_transform
features_final.to_csv('../../data/battles_data_model.csv',index = False)

## Shuffle and split data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the battles for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features_final,
    attacker_outcome,
    test_size=0.25,
    random_state=0,
)

# Show the results of the split
print("Training set has {} samples.".format(len(X_train)))
print("Testing set has {} samples.".format(len(X_test)))

## Model Selection

* Logistic Regression  

* Random Forest

* XG Boost




## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

# Fit logistic regression on the training set.
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

# Predict the held-out set and tabulate the predictions against the truth.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# 20-fold cross-validation on the training set: mean score and its spread.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=20)
display(accuracies.mean())
display(accuracies.std())

The accuracy using logistic regression is 91.6% with a standard deviation of 0.2327 and we have a good baseline classification model already which is good news.

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

# Fit a 20-tree, entropy-based random forest on the training set.
classifier = RandomForestClassifier(n_estimators=20, criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

# Predict the held-out set and tabulate the predictions against the truth.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# 20-fold cross-validation on the training set: mean score and its spread.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=20)
display(accuracies.mean())
display(accuracies.std())

In [None]:
# Read the per-feature importances from the most recently fitted `classifier`.
importances = classifier.feature_importances_

# Plot them with the helper from the local visuals module.
vs.feature_plot(importances, X_train, y_train)

The accuracy using random forest is 95.0% with a standard deviation of 0.119. The accuracy improved by approximately 3.6% and the standard deviation decreased by almost half.

## XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

# Fit XGBoost with its default hyper-parameters on the training set.
classifier = XGBClassifier(random_state=0)
classifier.fit(X_train, y_train)

# Predict the held-out set and tabulate the predictions against the truth.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# 20-fold cross-validation on the training set: mean score and its spread.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=20)
display(accuracies.mean())
display(accuracies.std())

Surprisingly, XGBoost has the worst accuracy compared to the Logistic Regression and Random Forest models (90%) but it is still a good model.

Random Forest has the best accuracy of 95%. The most important features are `attacker_size`, `attacker_commander_count`, `attack_houses`, `defender_size`, and `defender_houses`.

## Implementation - Extracting Feature Importance

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

# Refit the random forest so `classifier` holds it again (it was last bound to the XGBoost model).
classifier = RandomForestClassifier(n_estimators=20, criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

# Predict the held-out set and tabulate the predictions against the truth.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# 20-fold cross-validation on the training set: mean score and its spread.
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=20)
display(accuracies.mean())
display(accuracies.std())

In [None]:
# Read the per-feature importances from the refitted random forest and plot them.
importances = classifier.feature_importances_
vs.feature_plot(importances, X_train, y_train)

In [None]:
# Import functionality for cloning a model
from sklearn.base import clone
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

# Reduce the feature space to the five features with the highest importance.
top_features = X_train.columns.values[(np.argsort(importances)[::-1])[:5]]
X_train_reduced = X_train[top_features]
X_test_reduced = X_test[top_features]

# Train an unfitted copy of the random forest (same hyper-parameters) on the
# reduced features, so `classifier` keeps its full-feature fit instead of
# being overwritten in place.
clf = clone(classifier).fit(X_train_reduced, y_train)

# Make new predictions
reduced_predictions = clf.predict(X_test_reduced)

# Confusion matrix of the reduced model (built from its own predictions, not
# the stale y_pred of the full model).
cm = confusion_matrix(y_test, reduced_predictions)

# Applying k-Fold Cross Validation
accuracies = cross_val_score(estimator = clf, X = X_train_reduced, y = y_train, cv = 20)
display(accuracies.mean())
display(accuracies.std())

In [None]:
# Importances of the reduced model; note this rebinds `importances` to one value per reduced feature.
importances = clf.feature_importances_
vs.feature_plot(importances, X_train_reduced, y_train)

After we reduced the features down to the 5 most significant predictors, the accuracy improves slightly to 96.6% and the standard deviation decreases slightly to 0.1. This is okay as we now have a model that can predict battle outcomes for Game of Thrones!

The most important factors are:

`attacker_size` - The size of the attacking house matters (unless you are Balon/Euron Greyjoy who wins with smaller armies)  
`attacker_commander_count` - The count of the attacker commander matters as well.  
`attack_houses`  - The number of attacking houses that are in the battle.  
`defender_size`  - The size of the defender house matters (unless you are Balon/Euron Greyjoy who wins with smaller armies)  
`defender_houses` - The number of defending houses that are in the battle.  

