# Applying Advanced Transformations (Core)

In [1]:
#Full Name: Hannah Ploutz

## Imports

In [2]:
## Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
## Importing the OS and JSON Modules
import os,json
import warnings
warnings.filterwarnings('ignore')

## Tasks

1. Clean the files and combine them into one final DataFrame.
- This dataframe should have the following columns:
> - [X] Hero (Just the name of the Hero)
> - [X] Publisher
> - [X] Gender
> - [X] Eye color
> - [X] Race
> - [X] Hair color
> - [X] Height (numeric)
> - [X] Skin color
> - [X] Alignment
> - [X] Weight (numeric)
> - [X] Plus, one-hot-encoded columns for every power that appears in the dataset. E.g.:
>> - Agility
>> - Flight
>> - Superspeed
>> - etc.

2. Use your combined DataFrame to answer the following questions.
> 1. Compare the average weight of superheroes who have Super Speed to those who do not.
> 2. What is the average height of heroes for each publisher?

### Load the Data

In [3]:
# Read the superhero info CSV into a DataFrame and preview the first rows
info_csv_path = 'Data/superhero_info - superhero_info.csv'
info = pd.read_csv(info_csv_path)
info.head()

Unnamed: 0,Hero|Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements
0,A-Bomb|Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
1,Abe Sapien|Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}"
2,Abin Sur|DC Comics,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}"
3,Abomination|Marvel Comics,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}"
4,Absorbing Man|Marvel Comics,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}"


In [4]:
# Read the superhero powers CSV into a DataFrame and preview the first rows
powers_csv_path = 'Data/superhero_powers - superhero_powers.csv'
powers = pd.read_csv(powers_csv_path)
powers.head()

Unnamed: 0,hero_names,Powers
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed"
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super..."
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du..."
3,Abin Sur,Lantern Power Ring
4,Abomination,"Accelerated Healing,Intelligence,Super Strengt..."


## Cleaning the Data

### Info Dataframe

In [5]:
# Split the combined "Hero|Publisher" column into separate Hero and Publisher columns.
# n=1 limits the split to the first "|" so the result always has exactly two
# columns; an unbounded split would produce extra columns (and make the
# two-column assignment fail) if any value contained a second pipe.
hero_publisher = info['Hero|Publisher'].str.split('|', n=1, expand=True)
info[['Hero','Publisher']] = hero_publisher

In [6]:
# Preview the first rows to confirm the new Hero and Publisher columns were added
info.head()

Unnamed: 0,Hero|Publisher,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,Hero,Publisher
0,A-Bomb|Marvel Comics,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics
1,Abe Sapien|Dark Horse Comics,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics
2,Abin Sur|DC Comics,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}",Abin Sur,DC Comics
3,Abomination|Marvel Comics,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",Abomination,Marvel Comics
4,Absorbing Man|Marvel Comics,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}",Absorbing Man,Marvel Comics


#### Adjust Hero|Publisher Column

In [7]:
# The combined "Hero|Publisher" column is redundant now that it has been split, so remove it
info = info.drop('Hero|Publisher', axis='columns')
info.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,Hero,Publisher
0,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics
1,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics
2,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}",Abin Sur,DC Comics
3,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",Abomination,Marvel Comics
4,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}",Absorbing Man,Marvel Comics


In [8]:
# Summarize the info DataFrame: column names, non-null counts and dtypes
info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463 entries, 0 to 462
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Gender        463 non-null    object
 1   Race          463 non-null    object
 2   Alignment     463 non-null    object
 3   Hair color    463 non-null    object
 4   Eye color     463 non-null    object
 5   Skin color    463 non-null    object
 6   Measurements  463 non-null    object
 7   Hero          463 non-null    object
 8   Publisher     463 non-null    object
dtypes: object(9)
memory usage: 32.7+ KB


In [9]:
# Look up the dtype of every column, then keep only the entries whose dtype is object
data_types = info.dtypes
is_object_column = data_types == "object"
object_data_types_filter = data_types[is_object_column]
object_data_types_filter

Gender          object
Race            object
Alignment       object
Hair color      object
Eye color       object
Skin color      object
Measurements    object
Hero            object
Publisher       object
dtype: object

In [10]:
# Column names of the object-typed columns, used to loop over them in the next step
object_data_types_filter.index

Index(['Gender', 'Race', 'Alignment', 'Hair color', 'Eye color', 'Skin color',
       'Measurements', 'Hero', 'Publisher'],
      dtype='object')

In [11]:
# For every object-typed column, print its name followed by its unique values,
# leaving blank lines between columns so the listings are easy to scan
for column in object_data_types_filter.index:
    print(column, info[column].unique(), '\n', sep='\n')

Gender
['Male' 'Female' 'Unknown']


Race
['Human' 'Icthyo Sapien' 'Ungaran' 'Human / Radiation' 'Unknown' 'Cyborg'
 'Xenomorph XX121' 'Android' 'Mutant' 'Symbiote' 'Atlantean' 'Alien'
 'God / Eternal' 'Neyaphem' 'New God' 'Bizarro' 'Inhuman' 'Metahuman'
 'Demon' 'Vampire' 'Human-Kree' 'Amazon' 'Human / Altered'
 'Kakarantharaian' 'Cosmic Entity' 'Zen-Whoberian' 'Strontian' 'Saiyan'
 'Gorilla' 'Flora Colossus' 'Human-Vuldarian' 'Asgardian' 'Demi-God'
 'Eternal' 'Bolovaxian' 'Kryptonian' 'Czarnian' 'Martian' 'Luphomoid'
 'Human / Cosmic' 'Yautja' 'Animal' 'Clone' 'Talokite' 'Korugaran'
 'Zombie' 'Human-Vulcan' 'Human-Spartoi' 'Tamaranean' 'Frost Giant'
 'Mutant / Clone' "Yoda's species"]


Alignment
['good' 'bad' 'Unknown' 'neutral']


Hair color
['No Hair' 'Blond' 'Brown' 'Unknown' 'White' 'Black' 'Purple' 'Orange'
 'Pink' 'Red' 'Auburn' 'Strawberry Blond' 'black' 'Blue' 'Green' 'Magenta'
 'Brown / White' 'blond' 'Silver' 'Red / Grey' 'Grey' 'Orange / White'
 'Yellow' 'Brownn' 'Gold' '

#### Adjust Measurements Column 

Working on the Height/Weight Column Next

In [12]:
# The Measurements strings use single quotes, which json.loads does not accept:
# swap every single quote for a double quote, then parse each string into a dict
info['Measurements'] = (
    info['Measurements']
    .str.replace("'", '"')
    .apply(json.loads)
)
info['Measurements'].head()

0    {'Height': '203.0 cm', 'Weight': '441.0 kg'}
1     {'Height': '191.0 cm', 'Weight': '65.0 kg'}
2     {'Height': '185.0 cm', 'Weight': '90.0 kg'}
3    {'Height': '203.0 cm', 'Weight': '441.0 kg'}
4    {'Height': '193.0 cm', 'Weight': '122.0 kg'}
Name: Measurements, dtype: object

In [13]:
# Spot-check the first row: the parsed Measurements value should now be a dict
test_measure = info.at[0, 'Measurements']
print(type(test_measure))
test_measure

<class 'dict'>


{'Height': '203.0 cm', 'Weight': '441.0 kg'}

In [14]:
# Expand the Measurements dictionaries into separate "Height" and "Weight" columns.
# Building the frame from the list of dicts in a single call avoids constructing
# one pd.Series per row (which is what .apply(pd.Series) does); index=info.index
# keeps the rows aligned with info for the column-wise concat that follows.
height_weight = pd.DataFrame(info['Measurements'].tolist(), index=info.index)
height_weight

Unnamed: 0,Height,Weight
0,203.0 cm,441.0 kg
1,191.0 cm,65.0 kg
2,185.0 cm,90.0 kg
3,203.0 cm,441.0 kg
4,193.0 cm,122.0 kg
...,...,...
458,183.0 cm,83.0 kg
459,165.0 cm,52.0 kg
460,66.0 cm,17.0 kg
461,170.0 cm,57.0 kg


In [15]:
# Attach the Height and Weight columns to the right-hand side of the info DataFrame
info = pd.concat([info, height_weight], axis='columns')
info.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Measurements,Hero,Publisher,Height,Weight
0,Male,Human,good,No Hair,yellow,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",A-Bomb,Marvel Comics,203.0 cm,441.0 kg
1,Male,Icthyo Sapien,good,No Hair,blue,blue,"{'Height': '191.0 cm', 'Weight': '65.0 kg'}",Abe Sapien,Dark Horse Comics,191.0 cm,65.0 kg
2,Male,Ungaran,good,No Hair,blue,red,"{'Height': '185.0 cm', 'Weight': '90.0 kg'}",Abin Sur,DC Comics,185.0 cm,90.0 kg
3,Male,Human / Radiation,bad,No Hair,green,Unknown,"{'Height': '203.0 cm', 'Weight': '441.0 kg'}",Abomination,Marvel Comics,203.0 cm,441.0 kg
4,Male,Human,bad,No Hair,blue,Unknown,"{'Height': '193.0 cm', 'Weight': '122.0 kg'}",Absorbing Man,Marvel Comics,193.0 cm,122.0 kg


In [16]:
# Measurements has been expanded into Height and Weight, so the original column can go
info = info.drop('Measurements', axis='columns')

In [17]:
# Preview the frame to confirm Measurements is gone and Height/Weight remain
info.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero,Publisher,Height,Weight
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0 cm,441.0 kg
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0 cm,65.0 kg
2,Male,Ungaran,good,No Hair,blue,red,Abin Sur,DC Comics,185.0 cm,90.0 kg
3,Male,Human / Radiation,bad,No Hair,green,Unknown,Abomination,Marvel Comics,203.0 cm,441.0 kg
4,Male,Human,bad,No Hair,blue,Unknown,Absorbing Man,Marvel Comics,193.0 cm,122.0 kg


In [18]:
# Check dtypes: Height and Weight are still strings (object) at this point
info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463 entries, 0 to 462
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Gender      463 non-null    object
 1   Race        463 non-null    object
 2   Alignment   463 non-null    object
 3   Hair color  463 non-null    object
 4   Eye color   463 non-null    object
 5   Skin color  463 non-null    object
 6   Hero        463 non-null    object
 7   Publisher   463 non-null    object
 8   Height      463 non-null    object
 9   Weight      463 non-null    object
dtypes: object(10)
memory usage: 36.3+ KB


In [19]:
# Strip the " cm" and " kg" unit suffixes so Height and Weight can be converted to numbers.
# regex=False makes the literal (non-regex) match explicit, so the result does not
# depend on the pandas version's default for the regex argument.
info['Height'] = info['Height'].str.replace(" cm", '', regex=False)
info['Weight'] = info['Weight'].str.replace(" kg", '', regex=False)

In [20]:
# Cast the cleaned Height and Weight strings to floats in a single astype call
info = info.astype({'Height': float, 'Weight': float})

In [21]:
# Preview Height and Weight after the unit suffixes were removed and the values converted to float
info.head()

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero,Publisher,Height,Weight
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0,65.0
2,Male,Ungaran,good,No Hair,blue,red,Abin Sur,DC Comics,185.0,90.0
3,Male,Human / Radiation,bad,No Hair,green,Unknown,Abomination,Marvel Comics,203.0,441.0
4,Male,Human,bad,No Hair,blue,Unknown,Absorbing Man,Marvel Comics,193.0,122.0


In [22]:
# Confirm that Height and Weight are now float64 columns
info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 463 entries, 0 to 462
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Gender      463 non-null    object 
 1   Race        463 non-null    object 
 2   Alignment   463 non-null    object 
 3   Hair color  463 non-null    object 
 4   Eye color   463 non-null    object 
 5   Skin color  463 non-null    object 
 6   Hero        463 non-null    object 
 7   Publisher   463 non-null    object 
 8   Height      463 non-null    float64
 9   Weight      463 non-null    float64
dtypes: float64(2), object(8)
memory usage: 36.3+ KB


### Powers DataFrame

#### One Hot Encode Powers Column

In [23]:
# Inspect the powers DataFrame: column names, non-null counts and dtypes
powers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 667 entries, 0 to 666
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   hero_names  667 non-null    object
 1   Powers      667 non-null    object
dtypes: object(2)
memory usage: 10.5+ KB


In [24]:
# Preview the raw powers data (one comma-separated string of powers per hero)
powers.head()

Unnamed: 0,hero_names,Powers
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed"
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super..."
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du..."
3,Abin Sur,Lantern Power Ring
4,Abomination,"Accelerated Healing,Intelligence,Super Strengt..."


In [25]:
# Spot-check a single Powers value to see which Python type it is stored as
power_test = powers.at[2, 'Powers']
print(type(power_test))

<class 'str'>


In [26]:
# Turn each comma-separated Powers string into a list, stored in a new powers_split column
powers = powers.assign(powers_split=powers['Powers'].str.split(','))

In [27]:
# View the first rows of powers_split to confirm each entry is now a list of power names
powers['powers_split'].head()

0      [Agility, Super Strength, Stamina, Super Speed]
1    [Accelerated Healing, Durability, Longevity, S...
2    [Agility, Accelerated Healing, Cold Resistance...
3                                 [Lantern Power Ring]
4    [Accelerated Healing, Intelligence, Super Stre...
Name: powers_split, dtype: object

In [28]:
# Count how often each exact combination of powers (each split list) occurs across heroes
# NOTE(review): the values counted here are Python lists — confirm value_counts()
# accepts list values on the pandas version in use
powers['powers_split'].value_counts()

[Intelligence]                                                                                                                                                                                                                                                                          8
[Durability, Super Strength]                                                                                                                                                                                                                                                            5
[Agility, Stealth, Marksmanship, Weapons Master, Stamina]                                                                                                                                                                                                                               4
[Marksmanship]                                                                                                                                            

In [29]:
## give every power in each powers_split list its own row (the hero's index label repeats)
exploded = powers.explode(column='powers_split')
exploded.loc[:, ['hero_names','Powers','powers_split']].head()

Unnamed: 0,hero_names,Powers,powers_split
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Agility
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Super Strength
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Stamina
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed",Super Speed
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super...",Accelerated Healing


In [30]:
## collect each distinct power name once; these become the one-hot column names
exploded_powers = exploded['powers_split'].dropna()
cols_to_make = exploded_powers.unique()
cols_to_make

array(['Agility', 'Super Strength', 'Stamina', 'Super Speed',
       'Accelerated Healing', 'Durability', 'Longevity', 'Camouflage',
       'Self-Sustenance', 'Cold Resistance', 'Underwater breathing',
       'Marksmanship', 'Weapons Master', 'Intelligence', 'Telepathy',
       'Immortality', 'Reflexes', 'Enhanced Sight', 'Sub-Mariner',
       'Lantern Power Ring', 'Invulnerability', 'Animation',
       'Super Breath', 'Dimensional Awareness', 'Flight', 'Size Changing',
       'Teleportation', 'Magic', 'Dimensional Travel',
       'Molecular Manipulation', 'Energy Manipulation', 'Power Cosmic',
       'Energy Absorption', 'Elemental Transmogrification',
       'Fire Resistance', 'Natural Armor', 'Heat Resistance',
       'Matter Absorption', 'Regeneration', 'Stealth', 'Power Suit',
       'Energy Blasts', 'Energy Beams', 'Heat Generation', 'Danger Sense',
       'Phasing', 'Force Fields', 'Hypnokinesis', 'Invisibility',
       'Enhanced Senses', 'Jump', 'Shapeshifting', 'Elasticity',
 

In [31]:
# One-hot encode the powers: one True/False column per power name.
# Membership is tested against each hero's split list of powers instead of using
# str.contains on the raw string. str.contains does regex/substring matching, so a
# power whose name appears inside a longer power name would be flagged True for
# heroes who only have the longer one, and any regex metacharacter in a power
# name would be interpreted as a pattern rather than literal text.
for col in cols_to_make:
    powers[col] = powers['powers_split'].apply(
        # the isinstance guard yields False (instead of raising) for a missing powers list
        lambda hero_powers: isinstance(hero_powers, list) and col in hero_powers
    )
powers.head()

Unnamed: 0,hero_names,Powers,powers_split,Agility,Super Strength,Stamina,Super Speed,Accelerated Healing,Durability,Longevity,...,Weather Control,Omnipresent,Omniscient,Hair Manipulation,Nova Force,Odin Force,Phoenix Force,Intuitive aptitude,Melting,Changing Armor
0,3-D Man,"Agility,Super Strength,Stamina,Super Speed","[Agility, Super Strength, Stamina, Super Speed]",True,True,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,A-Bomb,"Accelerated Healing,Durability,Longevity,Super...","[Accelerated Healing, Durability, Longevity, S...",False,True,True,False,True,True,True,...,False,False,False,False,False,False,False,False,False,False
2,Abe Sapien,"Agility,Accelerated Healing,Cold Resistance,Du...","[Agility, Accelerated Healing, Cold Resistance...",True,True,True,False,True,True,True,...,False,False,False,False,False,False,False,False,False,False
3,Abin Sur,Lantern Power Ring,[Lantern Power Ring],False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Abomination,"Accelerated Healing,Intelligence,Super Strengt...","[Accelerated Healing, Intelligence, Super Stre...",False,True,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False


In [32]:
# Powers and powers_split are now represented by the one-hot columns, so remove both
powers = powers.drop(['Powers','powers_split'], axis='columns')
powers.head()

Unnamed: 0,hero_names,Agility,Super Strength,Stamina,Super Speed,Accelerated Healing,Durability,Longevity,Camouflage,Self-Sustenance,...,Weather Control,Omnipresent,Omniscient,Hair Manipulation,Nova Force,Odin Force,Phoenix Force,Intuitive aptitude,Melting,Changing Armor
0,3-D Man,True,True,True,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,A-Bomb,False,True,True,False,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
2,Abe Sapien,True,True,True,False,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Abin Sur,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Abomination,False,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Merging Powers and Info Dataframes

In [33]:
# Rename "hero_names" to "Hero" so both DataFrames share the key column used for the merge
powers = powers.rename(columns={'hero_names': 'Hero'})
powers.head()

Unnamed: 0,Hero,Agility,Super Strength,Stamina,Super Speed,Accelerated Healing,Durability,Longevity,Camouflage,Self-Sustenance,...,Weather Control,Omnipresent,Omniscient,Hair Manipulation,Nova Force,Odin Force,Phoenix Force,Intuitive aptitude,Melting,Changing Armor
0,3-D Man,True,True,True,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,A-Bomb,False,True,True,False,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
2,Abe Sapien,True,True,True,False,True,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Abin Sur,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Abomination,False,True,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [34]:
# Inner-join the hero info with the one-hot encoded powers on the shared "Hero" column
merged_df = info.merge(powers, how='inner', on='Hero')
merged_df.head(2)

Unnamed: 0,Gender,Race,Alignment,Hair color,Eye color,Skin color,Hero,Publisher,Height,Weight,...,Weather Control,Omnipresent,Omniscient,Hair Manipulation,Nova Force,Odin Force,Phoenix Force,Intuitive aptitude,Melting,Changing Armor
0,Male,Human,good,No Hair,yellow,Unknown,A-Bomb,Marvel Comics,203.0,441.0,...,False,False,False,False,False,False,False,False,False,False
1,Male,Icthyo Sapien,good,No Hair,blue,blue,Abe Sapien,Dark Horse Comics,191.0,65.0,...,False,False,False,False,False,False,False,False,False,False


#### Compare the average weight of superheroes who have Super Speed to those who do not.

In [35]:
# Build one boolean mask per group from the Super Speed column
has_speed_mask = merged_df['Super Speed'].eq(True)
no_speed_mask = merged_df['Super Speed'].eq(False)

# Select the rows of each group
has_super_speed = merged_df.loc[has_speed_mask]
no_super_speed = merged_df.loc[no_speed_mask]

# Mean weight within each group
average_weight_has_speed = has_super_speed['Weight'].mean()
average_weight_no_speed = no_super_speed['Weight'].mean()

# Report both averages side by side
print("Average weight of superpowers with Super Speed:", average_weight_has_speed)
print("Average weight of superpowers without Super Speed:", average_weight_no_speed)


Average weight of superpowers with Super Speed: 129.40404040404042
Average weight of superpowers without Super Speed: 101.77358490566037


- Superheroes who have Super Speed tend to have a higher average weight than those who do not.

#### What is the average height of heroes for each publisher?

In [36]:
# Mean Height per Publisher: group the Height column by the Publisher column
heights = merged_df['Height']
average_height_by_publisher = heights.groupby(merged_df['Publisher']).mean()

# Print the resulting Series (one average height per publisher)
print(average_height_by_publisher)

Publisher
DC Comics            181.923913
Dark Horse Comics    176.909091
George Lucas         159.600000
Image Comics         211.000000
Marvel Comics        191.546128
Shueisha             171.500000
Star Trek            181.500000
Team Epic TV         180.750000
Unknown              178.000000
Name: Height, dtype: float64


- Listed above is the average height for each of the Publishers in the merged dataframe