In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
plt.style.use('ggplot')
import seaborn as sns

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file in input_files:
    print(input_file)

# Any results you write to the current directory are saved as output.

In [None]:
def read_csv_without_unnamed(path, **read_csv_kwargs):
    """Read a CSV file and drop every column whose label starts with ``Unnamed``.

    pandas labels header-less columns (typically an index that was written
    out by ``to_csv``) as ``Unnamed: 0``, ``Unnamed: 1``, ...; those columns
    are removed from the result.

    Parameters
    ----------
    path : str, path object or file-like object
        Anything accepted by ``pd.read_csv`` as its first argument.
    **read_csv_kwargs
        Extra keyword arguments forwarded unchanged to ``pd.read_csv``
        (for example ``sep=';'`` or ``nrows=100``).

    Returns
    -------
    pd.DataFrame
        The parsed data without the ``Unnamed...`` columns.
    """
    df = pd.read_csv(path, **read_csv_kwargs)
    # Cast labels to str so the filter also works when the labels are not
    # strings (e.g. integer labels produced by ``header=None``).
    is_unnamed = df.columns.astype(str).str.startswith('Unnamed')
    return df.loc[:, ~is_unnamed]

In [None]:
# Load both tables of the superhero data set; any column labelled 'Unnamed...' is dropped by the helper.
heroes_information_df = read_csv_without_unnamed('../input/superhero-set/heroes_information.csv')
super_hero_powers_df = read_csv_without_unnamed('../input/superhero-set/super_hero_powers.csv')

## Data Overview

In [None]:
# (number of rows, number of columns) of the heroes table
heroes_information_df.shape

In [None]:
# Column labels of the heroes table
heroes_information_df.columns

In [None]:
# dtype pandas inferred for each column
heroes_information_df.dtypes

In [None]:
# Convert the values '-' and -99 to NaN so pandas counts them as missing.
# NOTE(review): presumably these are the data set's missing-value placeholders -- confirm against the data source.
heroes_information_df = heroes_information_df.replace(
    to_replace=['-', -99],
    value=np.nan,
)

In [None]:
# Number of missing (NaN) values per column
heroes_information_df.isna().sum()

In [None]:
# Summary statistics as produced by DataFrame.describe()
heroes_information_df.describe()

In [None]:
# Preview the first rows of the heroes table
heroes_information_df.head()

In [None]:
# (number of rows, number of columns) of the powers table
super_hero_powers_df.shape

In [None]:
# Column labels of the powers table
super_hero_powers_df.columns

In [None]:
# dtype pandas inferred for each column of the powers table
super_hero_powers_df.dtypes

In [None]:
# Number of missing (NaN) values per column of the powers table
super_hero_powers_df.isna().sum()

In [None]:
# Summary statistics as produced by DataFrame.describe()
super_hero_powers_df.describe()

In [None]:
# Preview the first rows of the powers table
super_hero_powers_df.head()

## EDA

There are 734 listed superheroes (667 with powers indicated) and 167 unique superpowers in the dataset.

In [None]:
# Distinct hero names in each table, kept as sets so they can be compared with set operations.
listed_heroes = {hero for hero in heroes_information_df['name']}
heroes_with_powers = {hero for hero in super_hero_powers_df['hero_names']}

Some superheroes listed in `super_hero_powers` are missing from `heroes_information`, and vice versa. The cell below counts the heroes that appear in both tables.

In [None]:
# Number of hero names present in both tables
len(listed_heroes & heroes_with_powers)

The dataset consists mainly of Marvel and DC superheroes but, interestingly, also includes characters from the Harry Potter and South Park franchises.

In [None]:
# Number of heroes per publisher (value_counts skips NaN by default)
heroes_information_df['Publisher'].value_counts()

Popular characters not coming from comic books are also interestingly listed as superheroes.

In [None]:
# Names of the heroes whose Publisher is missing (NaN)
heroes_information_df.loc[heroes_information_df['Publisher'].isna(), 'name']

In [None]:
# Row(s) of the powers table belonging to Chuck Norris
super_hero_powers_df[super_hero_powers_df['hero_names'] == 'Chuck Norris']

In [None]:
def get_powers(hero_name, powers_df=None):
    """Return the labels of the powers flagged ``True`` for a hero.

    Parameters
    ----------
    hero_name : str
        Value to look up in the ``hero_names`` column.
    powers_df : pd.DataFrame, optional
        Table with a ``hero_names`` column plus one column per power.
        Defaults to the notebook-level ``super_hero_powers_df``.

    Returns
    -------
    pd.Index
        Labels of the power columns that are ``True`` for the hero. The index
        is empty when no row matches ``hero_name``; when several rows match,
        a power is included if any of those rows has it.
    """
    if powers_df is None:
        powers_df = super_hero_powers_df
    hero_rows = powers_df.loc[powers_df['hero_names'] == hero_name]
    # Reduce over the matching rows instead of transposing and squeezing:
    # squeezing only yields a Series for exactly one match, so an unknown
    # name used to return every column label rather than no powers.
    has_power = hero_rows.drop(columns='hero_names').eq(True).any(axis=0)
    return has_power[has_power].index

Here are the superpowers of some interesting superheroes I found:

In [None]:
# Powers flagged True for Chuck Norris
get_powers('Chuck Norris')

In [None]:
# Powers flagged True for Kool-Aid Man
get_powers('Kool-Aid Man')

Most common and unique superpowers

In [None]:
# Drop the name column so only the power columns remain, then sum each column to get
# the number of heroes per power (assumes True/False flags), most common first.
superpowers_df = super_hero_powers_df.drop(columns='hero_names')
superpower_count = superpowers_df.sum(axis=0).sort_values(ascending=False)

In [None]:
# Bar chart of the ten most frequent powers. Uses the explicit fig/ax interface
# (as the later plotting cells do) instead of the implicit pyplot "current axes".
fig, ax = plt.subplots(figsize=(8, 5))
superpower_count.head(10).plot.bar(ax=ax)
ax.set_title('Most common superpowers', fontsize=20)
ax.set_xlabel('Superpower')
ax.set_ylabel('Frequency')
# The trailing semicolon keeps the list of tick-label objects out of the cell output.
plt.setp(ax.get_xticklabels(), rotation=50, horizontalalignment='right');

Even though these are the most common superpowers, the dataset does not indicate how strong or how fast a superhero is. It cannot tell you, for example, that Superman is stronger than Hulk.

In [None]:
# Labels of the powers that exactly one hero has
unique_powers = superpower_count.loc[superpower_count.eq(1)].index

In [None]:
# For every single-owner power, take the first hero whose flag for that power is True.
matching_hero = [
    super_hero_powers_df.loc[super_hero_powers_df[power] == True, 'hero_names'].iloc[0]
    for power in unique_powers
]

Unique powers and the matching superhero

In [None]:
# (hero, power) pairs; both sequences were built in the same order, so positions line up
list(zip(matching_hero, unique_powers))

The superhero with the most listed superpowers is Spectre from DC Comics, with 49 superpowers.

In [None]:
# Summing across the power columns of each row gives the number of powers per hero;
# the largest of those counts is displayed as the cell output.
superhero_power_count = superpowers_df.sum(axis='columns')
max_number_of_powers = superhero_power_count.max()
max_number_of_powers

In [None]:
# Name(s) of the hero(es) whose power count equals the maximum. The count series
# shares its index with the powers table, so it can be used directly as a row mask.
super_hero_powers_df.loc[superhero_power_count == max_number_of_powers, 'hero_names']

In [None]:
# Powers flagged True for Spectre
get_powers('Spectre')

In [None]:
# Row(s) of the heroes table whose name is Spectre
heroes_information_df[heroes_information_df['name'] == 'Spectre']

### Marvel and DC 

In [None]:
# One subset per publisher, then both stacked (Marvel rows first) for side-by-side comparisons.
marvel_heroes_df = heroes_information_df.loc[heroes_information_df['Publisher'].eq('Marvel Comics')]
dc_heroes_df = heroes_information_df.loc[heroes_information_df['Publisher'].eq('DC Comics')]
combined_heroes_df = pd.concat([marvel_heroes_df, dc_heroes_df], axis=0)

Interestingly, there are many more good characters than bad ones, even though some superheroes, like Batman, deal with multiple villains (Joker, Penguin, etc.).

In [None]:
# Count heroes per alignment, split by publisher. Missing values get their own
# 'Not Indicated' bar; only the two plotted columns are filled, so the placeholder
# string is not written into the other columns of the frame.
fig, ax = plt.subplots(figsize=(8, 5))
alignment_plot_df = combined_heroes_df[['Alignment', 'Publisher']].fillna('Not Indicated')
sns.countplot(ax=ax, x='Alignment', hue='Publisher', data=alignment_plot_df)
ax.set_title('Alignment of Marvel and DC characters')
ax.set_xlabel('Alignment')
# The trailing semicolon suppresses the text repr of the last call in the cell output.
ax.set_ylabel('Number of characters');

The superheroes are mostly male.

In [None]:
# Count heroes per gender, split by publisher. Missing values get their own
# 'Not Indicated' bar; only the two plotted columns are filled, so the placeholder
# string is not written into the other columns of the frame.
fig, ax = plt.subplots(figsize=(8, 5))
gender_plot_df = combined_heroes_df[['Gender', 'Publisher']].fillna('Not Indicated')
sns.countplot(ax=ax, x='Gender', hue='Publisher', data=gender_plot_df)
ax.set_title('Gender of Marvel and DC characters')
ax.set_xlabel('Gender')
# The trailing semicolon suppresses the text repr of the last call in the cell output.
ax.set_ylabel('Number of characters');