# Women in Comic Books
Predicting the Prominence of Female Heroes in the future

## Load Libraries 

In [391]:
#Import Python Libraries
import pandas as pd
import matplotlib.pyplot as mpl 
from datetime import datetime
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn import preprocessing

## Import Data Sets 

In [392]:
#Read csv file
marvel = pd.read_csv('../data/marvel-wikia-data.csv')

In [393]:
#Read csv file
dc = pd.read_csv("../data/dc-wikia-data.csv")

In [394]:
# Reset marvel year column to all caps YEAR so to keep the column names cohesive
marvel['YEAR'] = marvel['Year']

## Setting new Data frames

In [395]:
# Marvel DataFrame with name, ID, Alignment, eye color, hair color, sex, alive status, appearances, and year
marvel = pd.DataFrame(marvel, columns = ['name' , 'ID', 'ALIGN' , 'EYE', 'HAIR', 'SEX', 'YEAR'])

# Create a new column named Universe and set all the column to DC
marvel['UNIVERSE'] = 'Marvel'

marvel.head()

Unnamed: 0,name,ID,ALIGN,EYE,HAIR,SEX,YEAR,UNIVERSE
0,Spider-Man (Peter Parker),Secret Identity,Good Characters,Hazel Eyes,Brown Hair,Male Characters,1962.0,Marvel
1,Captain America (Steven Rogers),Public Identity,Good Characters,Blue Eyes,White Hair,Male Characters,1941.0,Marvel
2,"Wolverine (James \""Logan\"" Howlett)",Public Identity,Neutral Characters,Blue Eyes,Black Hair,Male Characters,1974.0,Marvel
3,"Iron Man (Anthony \""Tony\"" Stark)",Public Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,1963.0,Marvel
4,Thor (Thor Odinson),No Dual Identity,Good Characters,Blue Eyes,Blond Hair,Male Characters,1950.0,Marvel


In [396]:
# DC DataFrame with name, ID, Alignment, eye color, hair color, sex, alive status, appearances, and year
dc = pd.DataFrame(dc, columns = ['name' , 'ID', 'ALIGN' , 'EYE', 'HAIR', 'SEX', 'YEAR'])

# Create a new column named Universe and set all the column to DC
dc['UNIVERSE'] = 'DC'

dc.head()

Unnamed: 0,name,ID,ALIGN,EYE,HAIR,SEX,YEAR,UNIVERSE
0,Batman (Bruce Wayne),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,1939.0,DC
1,Superman (Clark Kent),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,1938.0,DC
2,Green Lantern (Hal Jordan),Secret Identity,Good Characters,Brown Eyes,Brown Hair,Male Characters,1940.0,DC
3,James Gordon (New Earth),Public Identity,Good Characters,Brown Eyes,White Hair,Male Characters,1939.0,DC
4,Richard Grayson (New Earth),Secret Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,1940.0,DC


## Making one large Data frame

In [397]:
# ignore_index means it will reassign new indexes
comicCharacters = pd.concat([marvel, dc], ignore_index = True, sort = False)
comicCharacters.head()

Unnamed: 0,name,ID,ALIGN,EYE,HAIR,SEX,YEAR,UNIVERSE
0,Spider-Man (Peter Parker),Secret Identity,Good Characters,Hazel Eyes,Brown Hair,Male Characters,1962.0,Marvel
1,Captain America (Steven Rogers),Public Identity,Good Characters,Blue Eyes,White Hair,Male Characters,1941.0,Marvel
2,"Wolverine (James \""Logan\"" Howlett)",Public Identity,Neutral Characters,Blue Eyes,Black Hair,Male Characters,1974.0,Marvel
3,"Iron Man (Anthony \""Tony\"" Stark)",Public Identity,Good Characters,Blue Eyes,Black Hair,Male Characters,1963.0,Marvel
4,Thor (Thor Odinson),No Dual Identity,Good Characters,Blue Eyes,Blond Hair,Male Characters,1950.0,Marvel


## Cleaning up the Data Frame

In [398]:
# Get a dataframe with only female heroes indexes  
femaleComicCharacters = comicCharacters[comicCharacters['SEX'] ==  'Female Characters']

# Drop all characters with NAN as their eye color, hair color, and ID
femaleComicCharacters.dropna(subset = ['EYE'], inplace=True) 
femaleComicCharacters.dropna(subset = ['HAIR'], inplace=True) 
femaleComicCharacters.dropna(subset = ['ID'], inplace=True) 
femaleComicCharacters.dropna(subset = ['ALIGN'], inplace=True) 


# Sorting the data by Year
femaleComicCharacters = femaleComicCharacters.sort_values(by ='YEAR' )

femaleComicCharacters.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,name,ID,ALIGN,EYE,HAIR,SEX,YEAR,UNIVERSE
18605,Rose Psychic (New Earth),Public Identity,Good Characters,Black Eyes,Black Hair,Female Characters,1937.0,DC
16390,Lois Lane (New Earth),Public Identity,Good Characters,Blue Eyes,Black Hair,Female Characters,1938.0,DC
16760,Abigail Hunkel (New Earth),Secret Identity,Good Characters,Black Eyes,Brown Hair,Female Characters,1939.0,DC
16874,Martha Roberts (New Earth),Secret Identity,Good Characters,Brown Eyes,Brown Hair,Female Characters,1939.0,DC
366,Dorma (Earth-616),No Dual Identity,Neutral Characters,Blue Eyes,Auburn Hair,Female Characters,1939.0,Marvel


#### In the chunk below the data in each of the stated columns will be asigned a numaric value. This was done so the Categorical data columns could be used in the dicision tree analysis. 

In [399]:
# Encodes categorical data into numbers
le = preprocessing.LabelEncoder()
# Assigning numerical values and storing in another column
femaleComicCharacters['eye color'] = le.fit_transform(femaleComicCharacters['EYE'])
femaleComicCharacters['hair color'] = le.fit_transform(femaleComicCharacters['HAIR'])
femaleComicCharacters['id'] = le.fit_transform(femaleComicCharacters['ID'])
femaleComicCharacters['align'] = le.fit_transform(femaleComicCharacters['ALIGN'])
femaleComicCharacters

Unnamed: 0,name,ID,ALIGN,EYE,HAIR,SEX,YEAR,UNIVERSE,eye color,hair color,id,align
18605,Rose Psychic (New Earth),Public Identity,Good Characters,Black Eyes,Black Hair,Female Characters,1937.0,DC,2,2,2,1
16390,Lois Lane (New Earth),Public Identity,Good Characters,Blue Eyes,Black Hair,Female Characters,1938.0,DC,3,2,2,1
16760,Abigail Hunkel (New Earth),Secret Identity,Good Characters,Black Eyes,Brown Hair,Female Characters,1939.0,DC,2,5,3,1
16874,Martha Roberts (New Earth),Secret Identity,Good Characters,Brown Eyes,Brown Hair,Female Characters,1939.0,DC,4,5,3,1
366,Dorma (Earth-616),No Dual Identity,Neutral Characters,Blue Eyes,Auburn Hair,Female Characters,1939.0,Marvel,3,0,1,2
18310,Dala (New Earth),Secret Identity,Bad Characters,Red Eyes,Black Hair,Female Characters,1939.0,DC,15,2,3,0
17843,Julie Madison (New Earth),Public Identity,Good Characters,Blue Eyes,Black Hair,Female Characters,1939.0,DC,3,2,2,1
16989,Irene Miller (New Earth),Public Identity,Good Characters,Grey Eyes,Black Hair,Female Characters,1940.0,DC,7,2,2,1
16633,Dian Belmont (New Earth),Public Identity,Good Characters,Hazel Eyes,Brown Hair,Female Characters,1940.0,DC,8,5,2,1
16610,Inza Cramer (New Earth),Secret Identity,Good Characters,Green Eyes,Strawberry Blond Hair,Female Characters,1940.0,DC,6,18,3,1


In [400]:
# this is to check that the numaric values match the correct characteristic
femaleComicCharacters.loc[18310]
# the characteristics and their values match correctly

name           Dala (New Earth)
ID              Secret Identity
ALIGN            Bad Characters
EYE                    Red Eyes
HAIR                 Black Hair
SEX           Female Characters
YEAR                       1939
UNIVERSE                     DC
eye color                    15
hair color                    2
id                            3
align                         0
Name: 18310, dtype: object

### The Characteristics and their Numerical Variable 
Align: 0 = Bad, 1 = Good, 2 = Neutral

## Looking at the Data 

In [401]:
# Counts the total number characters by align, eye color, hair color, and id
# this was done so that I could clearly see the total variablees for each characteristic
femaleAlignmentCount = femaleComicCharacters['ALIGN'].value_counts()
femaleEyeCount = femaleComicCharacters['EYE'].value_counts()
femaleHairCount = femaleComicCharacters['HAIR'].value_counts()
femaleIdCount = femaleComicCharacters['ID'].value_counts()

print(femaleAlignmentCount)
print(femaleEyeCount)
print(femaleHairCount)
print(femaleIdCount)

Good Characters       1242
Bad Characters         598
Neutral Characters     428
Name: ALIGN, dtype: int64
Blue Eyes          821
Brown Eyes         633
Green Eyes         326
Black Eyes         164
White Eyes          76
Red Eyes            73
Yellow Eyes         52
Grey Eyes           27
Hazel Eyes          22
Purple Eyes         19
Violet Eyes         16
Orange Eyes          6
Variable Eyes        6
Pink Eyes            6
Gold Eyes            5
Silver Eyes          4
Amber Eyes           4
Magenta Eyes         2
Yellow Eyeballs      2
Black Eyeballs       2
No Eyes              1
Multiple Eyes        1
Name: EYE, dtype: int64
Black Hair               768
Blond Hair               472
Brown Hair               382
Red Hair                 260
White Hair                94
No Hair                   50
Green Hair                39
Grey Hair                 39
Purple Hair               30
Strawberry Blond Hair     26
Auburn Hair               24
Bald                      21
Pink Hair      

In [402]:
# To know the object type for each column
femaleComicCharacters.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2268 entries, 18605 to 21903
Data columns (total 12 columns):
name          2268 non-null object
ID            2268 non-null object
ALIGN         2268 non-null object
EYE           2268 non-null object
HAIR          2268 non-null object
SEX           2268 non-null object
YEAR          2168 non-null float64
UNIVERSE      2268 non-null object
eye color     2268 non-null int64
hair color    2268 non-null int64
id            2268 non-null int64
align         2268 non-null int64
dtypes: float64(1), int64(4), object(7)
memory usage: 310.3+ KB


## Gathering Data for Classification

In [403]:
# data that will be used to determine if a character is bad, good or neutral
characteristics = femaleComicCharacters[['eye color', 'hair color', 'id']].values
alliance = femaleComicCharacters[['align']].values

In [404]:
# 70% training and 30% test
trainCharacteristics, testCharacteristics, trainAlliance, testAlliance = train_test_split(characteristics, 
                                                                                          alliance, test_size = 0.3, 
                                                                                          random_state = 1)

## Building the Decision Tree
If given a set x (Eye color, Hair color, ID), we will try to predit the character's Alliance (Hero, Villain, Neutral)

In [405]:
# Creating the Tree classifer
tree = DecisionTreeClassifier()

# Training our model on training data
model = tree.fit(trainCharacteristics, trainAlliance)

# Predict each phase of the testing data
predictedAlliance = model.predict(testCharacteristics)

In [406]:
# Make a data frame of the actual Alliance and the predicted Alliance
results = pd.DataFrame({'Actual': testAlliance.flatten(), 'Predicted': predictedAlliance.flatten()})
results.head()

Unnamed: 0,Actual,Predicted
0,0,1
1,1,1
2,1,1
3,1,1
4,0,1


### Accuracy

In [407]:
# Model Accurracy
print('Accuracy:', metrics.accuracy_score(testAlliance, predictedAlliance))

Accuracy: 0.5139500734214391


### The above value is how well my model does with predicting a character's alliance when given set of characteristics (eye color, hair color, and ID). The model has a 51.4% accuracy, which is fairly accurrate. The module's accuracy may improved by using a different set of characteristics.

## Visualization

In [384]:
# Installing a package in Jupyter notebooks
!pip install pydotplus

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [385]:
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

In [365]:
# Prepare labels for the tree
predictors = ['eye color', 'hair color', 'id']
classes = ['0', '1', '2']

# Generate the graphics
dot_data = StringIO()
export_graphviz(model, out_file = dot_data,
                filled = True, rounded = True,
                special_characters=True,
                feature_names= predictors,
                class_names = classes)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

# Write the file to file
graph.write_png('Female_Comic_Characters_DecisionTree.png')
Image(graph.create_png)

TypeError: a bytes-like object is required, not 'function'

TypeError: a bytes-like object is required, not 'function'

<IPython.core.display.Image object>