In [1]:
 # Import the required modules
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold

# Metrics
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, RocCurveDisplay

# Pre-Processing
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Location of the raw battle data (CSV)
filepath = "Resources/fictional_character_battles_complex.csv"

# Load the file into a DataFrame
battle_info = pd.read_csv(filepath_or_buffer=filepath)

# Report (rows, columns), then preview the first 15 records
print(battle_info.shape)
battle_info.head(n=15)

(2351, 8)


Unnamed: 0,Character,Universe,Strength,Speed,Intelligence,SpecialAbilities,Weaknesses,BattleOutcome
0,Wonder Woman,Marvel,7,8,3,Telekinesis,Kryptonite,0
1,Iron Man,Marvel,4,7,9,Telekinesis,Kryptonite,0
2,Iron Man,DC Comics,8,7,5,Telekinesis,Magic,0
3,Spider-Man,DC Comics,5,6,10,Telekinesis,Kryptonite,0
4,Flash,Marvel,7,6,2,Invisibility,Magic,0
5,Spider-Man,DC Comics,10,9,7,Invisibility,Wooden Stake,1
6,Wonder Woman,Marvel,3,6,2,Super Strength,Silver,1
7,Thor,DC Comics,7,2,4,Invisibility,Magic,1
8,Batman,DC Comics,8,2,7,Flight,Silver,0
9,Iron Man,DC Comics,5,5,4,Flight,Wooden Stake,0


In [4]:
# Show each column's non-null count and dtype
battle_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2351 entries, 0 to 2350
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Character         2351 non-null   object
 1   Universe          2351 non-null   object
 2   Strength          2351 non-null   int64 
 3   Speed             2351 non-null   int64 
 4   Intelligence      2351 non-null   int64 
 5   SpecialAbilities  2351 non-null   object
 6   Weaknesses        2351 non-null   object
 7   BattleOutcome     2351 non-null   int64 
dtypes: int64(4), object(4)
memory usage: 147.1+ KB


In [6]:
# Count the distinct values in each column
battle_info.nunique()

Character            8
Universe             2
Strength            10
Speed               10
Intelligence        10
SpecialAbilities     4
Weaknesses           4
BattleOutcome        2
dtype: int64

In [7]:
# The raw Universe values do not line up with the characters (e.g. Wonder Woman
# is labelled Marvel), so a corrected Universe column is built in the cells below.
# Work on a copy so the frame loaded from the CSV stays untouched.
battle_info2 = battle_info.copy()

In [8]:
# Remove the original (incorrect) Universe column. Deriving the result from
# battle_info rather than from battle_info2 itself keeps this cell safe to
# re-run: drop() returns a new frame, so battle_info is never modified and a
# second run cannot fail with a KeyError on the already-dropped column.
battle_info2 = battle_info.drop(columns=['Universe'])
battle_info2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2351 entries, 0 to 2350
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Character         2351 non-null   object
 1   Strength          2351 non-null   int64 
 2   Speed             2351 non-null   int64 
 3   Intelligence      2351 non-null   int64 
 4   SpecialAbilities  2351 non-null   object
 5   Weaknesses        2351 non-null   object
 6   BattleOutcome     2351 non-null   int64 
dtypes: int64(4), object(3)
memory usage: 128.7+ KB


In [9]:
# List every distinct character name so that none is missed when assigning universes
battle_info2['Character'].unique()

array(['Wonder Woman', 'Iron Man', 'Spider-Man', 'Flash', 'Thor',
       'Batman', 'Superman', 'Captain America'], dtype=object)

In [10]:
# Character names grouped by the universe they belong to
marvel_characters = [
    'Iron Man',
    'Spider-Man',
    'Thor',
    'Captain America',
]
dc_characters = [
    'Wonder Woman',
    'Flash',
    'Batman',
    'Superman',
]

In [12]:
# Build the corrected 'Universe' column from the character name, using an
# explicit character -> universe lookup assembled from BOTH rosters.
universe_by_character = {
    **dict.fromkeys(marvel_characters, 'Marvel'),
    **dict.fromkeys(dc_characters, 'DC Comics'),
}
universe = battle_info2['Character'].map(universe_by_character)

# Fail loudly on any name missing from both rosters instead of silently
# labelling it 'DC Comics' (map() leaves unmatched entries as NaN).
unmapped = battle_info2.loc[universe.isna(), 'Character'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Characters with no known universe: {list(unmapped)}")

battle_info2['Universe'] = universe
battle_info2.head(25)

Unnamed: 0,Character,Strength,Speed,Intelligence,SpecialAbilities,Weaknesses,BattleOutcome,Universe
0,Wonder Woman,7,8,3,Telekinesis,Kryptonite,0,DC Comics
1,Iron Man,4,7,9,Telekinesis,Kryptonite,0,Marvel
2,Iron Man,8,7,5,Telekinesis,Magic,0,Marvel
3,Spider-Man,5,6,10,Telekinesis,Kryptonite,0,Marvel
4,Flash,7,6,2,Invisibility,Magic,0,DC Comics
5,Spider-Man,10,9,7,Invisibility,Wooden Stake,1,Marvel
6,Wonder Woman,3,6,2,Super Strength,Silver,1,DC Comics
7,Thor,7,2,4,Invisibility,Magic,1,Marvel
8,Batman,8,2,7,Flight,Silver,0,DC Comics
9,Iron Man,5,5,4,Flight,Wooden Stake,0,Marvel


In [13]:
# No further cleaning is done here; encoding and feature engineering are
# deferred to the ML notebook, which is meant to load the CSV written below.
battle_info2.to_csv(path_or_buf='clean_battle_info.csv', index=False)