In [1]:
# You may have noticed that the vanilla CSV reader has some limitations,
# so we're going to use a powerful Python library called pandas
import pandas as pd  # in the Python community, pd is the agreed shorthand for pandas
# this will help us make graphs
import matplotlib.pyplot as plt
# NOTE: %pylab is deprecated (IPython prints a warning when this cell runs). Besides turning on
# inline plots, it populates the notebook namespace with names from numpy and matplotlib.
# `%matplotlib inline` plus explicit imports is the recommended replacement.
%pylab inline

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


In [2]:
# load the CSV file into a DataFrame
# Fun Fact: the plural form of Pokemon is Pokemon
path = 'pokemon_data.csv'
pokemon = pd.read_csv(filepath_or_buffer=path)

In [3]:
# peek at the first five rows (5 is also what head() returns when no number is given)
pokemon.head(5)

Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,percentage_male,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
0,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,1,65,65,45,grass,poison,6.9,1,0
1,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,2,80,80,60,grass,poison,13.0,1,0
2,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,3,122,120,80,grass,poison,100.0,1,0
3,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,4,60,50,65,fire,,8.5,1,0
4,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,5,80,65,80,fire,,19.0,1,0


In [None]:
# DataFrames can be sliced by position, just like the strings and lists we've been using
# to see rows 12 through 18, slice from 12 up to (but not including) 19 --
# the ending index is exclusive, so asking for row 18 means stopping at 19
pokemon.iloc[12:19]

In [6]:
# the .str accessor applies a string method to every value in the column;
# here it capitalizes each primary type (the DataFrame itself is not modified)
pokemon['type1'].str.capitalize()

0        Grass
1        Grass
2        Grass
3         Fire
4         Fire
        ...   
796      Steel
797      Grass
798       Dark
799    Psychic
800      Steel
Name: type1, Length: 801, dtype: object

In [None]:
# we can also look at just the headers (the column names), without any of the row data
pokemon.columns

In [None]:
# Let's look at the whole DataFrame
# display() asks the notebook to render the object as a table
# (for a frame this large the rendered table may be abbreviated -- check your pandas display options)
display(pokemon)

<h4>Quick Pandas terminology overview</h4>
<b>DataFrame</b> or df is a 2D structure (ie rows and columns), in this code, pokemon is a DataFrame
<br/><b>Series</b> is a single column in a DataFrame


In [None]:
# Last class we needed the index to get a certain column; now we can call a column by name.
# Bracket notation with the column name gives us back that single column.
pokemon['name']

In [None]:
# note we can use bracket notation (see above) OR, if the column name has no spaces, we can use dot notation
# when building datasets, you'll see the benefits of not having spaces in the column name!
# (dot notation also requires that the column name doesn't clash with an existing DataFrame
# attribute or method; bracket notation always works)
pokemon.name

In [None]:
# the loc property can locate rows by a certain value
# first build a True/False mask marking the row(s) whose name is 'Pikachu',
# then hand that mask to loc to get the matching row(s) back
is_pikachu = pokemon.name == 'Pikachu'
pokemon.loc[is_pikachu]

In [None]:
# loc also accepts a second argument, so we can ask for just the Japanese name of 'Pikachu':
# the first argument is the search query (which rows), the second is the column whose value we want back
pokemon.loc[pokemon['name'] == 'Pikachu', 'japanese_name']

In [None]:
# Analyze data with Pandas
# For example, find the highest and the lowest HP values, then print them in that order
highest_hp = pokemon.hp.max()
lowest_hp = pokemon.hp.min()
print(highest_hp)
print(lowest_hp)

In [None]:
# let's find out who these pokemon are!
# we're going to use the loc property to locate any pokemon where HP equals 255 or 1
# (the highest and lowest HP values printed by the previous cell)
# .item() pulls the single matching name out of the result; it raises a ValueError
# unless exactly one row matches, so it only suits lookups with a unique answer
strong_pokemon = pokemon.loc[pokemon.hp == 255, 'name'].item()
weak_pokemon = pokemon.loc[pokemon.hp == 1, 'name'].item()
print(strong_pokemon)
print(weak_pokemon)

<h4>Let's break down the above example</h4>
<code>our_variable = our_data_frame.loc[our_data_frame.column_name == matching_value, 'other_column_name'].item()</code>

So the **loc** property is finding the pokemon where the hp is equal to 255 (or 1). <br/>
We are also telling the **loc** property that we want just the name of this matching pokemon, like we did above
<br/><code>pokemon.loc[pokemon.hp == 255, 'name']</code> returns a Series, which carries some metadata that we don't care about (for now), such as the dtype, so we use **item()** to give us just the value. Note that **item()** only works when exactly one row matches.

In [None]:
# let's make a DataFrame of the strongest pokemon (those with HP above 125)
# pokemon.hp > 125 produces a True/False Series with one value per row; indexing the
# DataFrame with that boolean mask keeps only the rows marked True and returns a new DataFrame.
# (aside: double brackets, e.g. pokemon[['name', 'hp']], do something different -- the interior
# brackets are a list of column names and the outside brackets are the indexing operator, so
# they select two or more columns; that form also returns a DataFrame)

strongest_pokemon = pokemon[pokemon.hp > 125]
strongest_pokemon


In [None]:
# pull out just the name column of the strongest pokemon
strongest_pokemon['name']

In [None]:
# let's see which unique primary types appear among these pokemon
unique_types = strongest_pokemon.type1.unique()
print(unique_types)

In [None]:
# plot how HP is distributed across all pokemon
plt.hist(pokemon.hp, bins=25)
plt.xlabel('HP')
plt.ylabel('Number of Pokemon')
plt.title('Distribution of HP amongst Pokemon')
# draw where the average (mean) is, as a dashed yellow line
plt.axvline(pokemon.hp.mean(), color='yellow', linestyle='dashed', linewidth=2)
# draw where the median is, as a dashed red line
# (called through plt like every other plotting call in this cell, instead of a bare
# axvline name that only exists when %pylab has populated the namespace;
# the trailing semicolon hides the text repr of the returned line object)
plt.axvline(pokemon.hp.median(), color='red', linestyle='dashed', linewidth=2);

In [None]:
# let's group our pokemon by type and see the differences in HP
# groupby() splits the pokemon DataFrame into one group per 'type1' value;
# picking the hp column from the groups and calling median() gives the median HP for each type
type_groupings = pokemon.groupby(by='type1')
type_groupings['hp'].median()

In [None]:
# describe() gives a deeper analysis of the HP numbers, computed separately for each type
type_groupings['hp'].describe()

In [None]:
# let's graph water vs fire

# make a dataframe of just water pokemon
water_df = type_groupings.get_group('water')
# make a dataframe of just fire pokemon
fire_df = type_groupings.get_group('fire')
# both histograms are drawn on the same axes, so make them semi-transparent (alpha);
# otherwise the fire bars, drawn second, would cover any water bars behind them
plt.hist(water_df.hp, bins=20, alpha=0.5, label='water')
plt.hist(fire_df.hp, bins=20, alpha=0.5, label='fire')
plt.legend()
plt.title("Distribution of HP by type")
plt.xlabel("HP")
# the trailing semicolon hides the text repr of the returned label object
plt.ylabel("Number of Pokemon");