# Working with Dataframes - Data Manipulation, Merge, Apply, and Groupby

In [1]:
import os
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

import sklearn.linear_model
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline

## Load the Hogwarts Grade data we produced earlier.

In [2]:
# Set the path to our data location
datapath = os.path.join(".", "")
datapath

'./'

In [3]:
hogwarts_datafile = 'hogwarts_grades.csv'

In [4]:
grades = pd.read_csv(datapath + hogwarts_datafile, index_col='Name')

In [5]:
grades

Unnamed: 0_level_0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0
Fred,75.0,93.0,,91.0,,58.0,,
George,75.0,93.0,,91.0,,58.0,,
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0


## Combining DataFrames ##

Dumbledore has asked how each of the houses are doing overall in classes.  We will do a little investigation.

First, add the average grade to each student row.

In [6]:
# remember the active classes before we add columns to the table
active_classes = grades.columns
active_classes

Index(['Potions', 'Transfiguration', 'Runes', 'Defense', 'Divination',
       'Data Science', 'Charms', 'Herbology'],
      dtype='object')

In [7]:
grades['Average Points'] = grades.mean(axis='columns')
grades

Unnamed: 0_level_0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571
Fred,75.0,93.0,,91.0,,58.0,,,79.25
George,75.0,93.0,,91.0,,58.0,,,79.25
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0


In [8]:
grades.sort_values(by='Average Points', ascending=False)

Unnamed: 0_level_0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0
Cedric,,98.0,,,,,,,98.0
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571
Neville,,84.0,,71.0,78.0,,,100.0,83.25
Fred,75.0,93.0,,91.0,,58.0,,,79.25
George,75.0,93.0,,91.0,,58.0,,,79.25


Make a dataframe that maps students to their house.

In [9]:
students = ['Harry', 'Hermione', 'Ron', 'Luna', 'Draco', 'Crabbe', 'Cho', 'Cedric', 'Fred', 'George', 'Neville', 'Goyle', 'Pansy', 'Lavender']

In [10]:
houses = ['Gryffindor', 
          'Gryffindor', 
          'Gryffindor', 
          'Ravenclaw', 
          'Slytherin', 
          'Slytherin', 
          'Ravenclaw',
          'Hufflepuff', 
          'Gryffindor', 
          'Gryffindor', 
          'Gryffindor',
          'Slytherin',
          'Slytherin',
          'Gryffindor']

In [11]:
student_house = { k: v for (k,v) in zip(students, houses)}
student_house

{'Harry': 'Gryffindor',
 'Hermione': 'Gryffindor',
 'Ron': 'Gryffindor',
 'Luna': 'Ravenclaw',
 'Draco': 'Slytherin',
 'Crabbe': 'Slytherin',
 'Cho': 'Ravenclaw',
 'Cedric': 'Hufflepuff',
 'Fred': 'Gryffindor',
 'George': 'Gryffindor',
 'Neville': 'Gryffindor',
 'Goyle': 'Slytherin',
 'Pansy': 'Slytherin',
 'Lavender': 'Gryffindor'}

In [13]:
student_df = DataFrame({'Student': students, 'House': houses})
student_df

Unnamed: 0,Student,House
0,Harry,Gryffindor
1,Hermione,Gryffindor
2,Ron,Gryffindor
3,Luna,Ravenclaw
4,Draco,Slytherin
5,Crabbe,Slytherin
6,Cho,Ravenclaw
7,Cedric,Hufflepuff
8,Fred,Gryffindor
9,George,Gryffindor


Combine the two dataframes into a single one.  The `merge` method does the job.  There are many options on `merge` that guide how two dataframes are combined.  In this case, we set the index of `student_df` to the same set of values as the index in `grades`, then the `merge` combines the dataframes by matching up the index values to create the new rows.

In [15]:
grades_ext = grades.merge(student_df.set_index('Student'), left_index=True, right_index=True)
grades_ext

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw


Notice how Pansy and Lavender were *not* added to the dataframe.  By default, `merge` does an *inner* join, which uses the intersection of the keys in the two dataframes.  

The `merge` method has options supporting SQL-style inner, outer, left, right, and cross joins. 

## More on conditional selection from dataframes ##

We are going to evaluate several conditions on the entire dataframe.  As we do that, we will delve a bit more deeply into how that works for Pandas.

First, consider the use of boolean conditions and boolean variables in base Python.  

Python variables can hold the results of boolean conditions.

In [17]:
score = 85

In [18]:
b_student = ((score >= 80) and (score < 90))

In [19]:
b_student

True

In [20]:
type(b_student)

bool

The value of the Boolean variable can be used anywhere a conditional statement is valid.  Perhaps stating the obvious, but the contents of the variable can be used repeatedly once set.

In [23]:
if b_student:
    print('Good')
else:
    print('Perhaps excellent, perhaps less...')
    
if b_student:
    print('To B?')
else:
    print('Or not to B?')

Good
To B?


With primitive data types, the value of the evaluated Boolean condition is a scalar (i.e., it has a single value).

Pandas supports boolean results, but the data object used to hold the results cannot be a scalar since Pandas datatypes (i.e., Series and DataFrames) are multi-dimensional.  As a result, the data objects returned from evaluating boolean expressions against a Pandas datatype must hold multiple values and are multi-dimensional, just like the base Pandas types.

Recall that `active_classes` variable identifies a subset of the full `grades_ext` dataframe.

In [24]:
active_classes

Index(['Potions', 'Transfiguration', 'Runes', 'Defense', 'Divination',
       'Data Science', 'Charms', 'Herbology'],
      dtype='object')

In [25]:
grades_ext

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw


Using the list of active classes, we can easily look at just the class columns.

In [28]:
grades_ext[active_classes]

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0
Fred,75.0,93.0,,91.0,,58.0,,
George,75.0,93.0,,91.0,,58.0,,
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0


What students are struggling in Potions?

In [29]:
grades_ext[grades_ext['Potions'] < 60]

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin


Let's take a closer look at how this works.

Below is the boolean expression from the statement above.  

The implementation of the Pandas data structures is smart enough to do the compare on each row for the specified column.  The returned value is a data structure holding the truth value for each row in the column.

We identifed the column of interest, but Pandas took care of iterating through each row.  If we had to do this explicitly, it would mean writing a `for` loop to process each row in the column.  The Pandas module is specifically written to allow processing of rows and/or columns without explicit looping control structures.  This enables the Pandas implementation to optimize the execution of the operations.

In [30]:
grades_ext['Potions'] < 60

Harry       False
Hermione    False
Ron         False
Draco       False
Crabbe       True
Fred        False
George      False
Goyle        True
Luna        False
Cho         False
Cedric      False
Neville     False
Name: Potions, dtype: bool

We can save the results of the evaluated boolean expression in a variable.

In [31]:
potions_struggle = grades_ext['Potions'] < 60

Since the boolean result is for a column in the dataframe, the result returned is a Series of booleans with the dataframe row index as the Series index.

In [32]:
type(potions_struggle)

pandas.core.series.Series

In [33]:
potions_struggle

Harry       False
Hermione    False
Ron         False
Draco       False
Crabbe       True
Fred        False
George      False
Goyle        True
Luna        False
Cho         False
Cedric      False
Neville     False
Name: Potions, dtype: bool

As with scalar boolean variables, we can use the saved results in later statements.

First, use the boolean Series to select from the dataframe.

In [34]:
grades_ext[potions_struggle]

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin


This shows a bit of how Pandas is evaluating the selection expressions from dataframes.  

First, the condition is evaluated to produce a boolean data object, then that boolean data object is used to identify the rows from the dataframe to return.  All rows where the associated Series value is `True` are returned.

Note that the expression above is really a simplification of the `loc` request with `:` implied for the columns.

In [35]:
grades_ext.loc[potions_struggle, :]

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin


Now we will find all the stuggling students regardless of the class.

Note the difference in the expression below compared with our earlier example.  

In this case, rather than referencing a single column, a list of columns is identified within the boolean expression.  The Pandas implementation recognizes that to evaluate the condition, it must iterate both across some of the columns and within each column (i.e., 2 nested `for` loops).  Again, Pandas does this without explicit loops from the programmer.

In [36]:
struggling = (grades_ext[active_classes] < 60)

The result is now a DataFrame rather than a Series because we must hold boolean values for both rows and columns.

In [37]:
type(struggling)

pandas.core.frame.DataFrame

In [38]:
struggling

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Harry,False,False,False,False,False,False,False,False
Hermione,False,False,False,False,False,False,False,False
Ron,False,False,False,False,False,False,False,False
Draco,False,False,False,False,False,False,False,False
Crabbe,True,True,False,True,True,True,False,True
Fred,False,False,False,False,False,True,False,False
George,False,False,False,False,False,True,False,False
Goyle,True,True,False,True,True,True,True,True
Luna,False,False,False,False,False,False,False,False
Cho,False,False,False,False,False,False,False,False


What is returned if we select from the base dataframe using the boolean dataframe as the criteria?

In [1192]:
grades_ext[struggling]

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,,,,,,,,,,
Hermione,,,,,,,,,,
Ron,,,,,,,,,,
Draco,,,,,,,,,,
Crabbe,31.0,15.0,,29.0,6.0,3.0,,52.0,,
Fred,,,,,,58.0,,,,
George,,,,,,58.0,,,,
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,,
Luna,,,,,,,,,,
Cho,,,,,,,,,,


Earlier, when the boolean selector was a Series, we got only the rows that had a true value.

Now, the result is in two-dimensions (rows and columns), so the resulting dataframe returns all values in the dataframe that satisfied the expression (i.e, were `True`) and returns `NaN` for all other values.

We can make this a little easier to see if we format a bit.

In [39]:
def highlight_values(val):
    if np.isnan(val):
        color = 'black'
    else:
        color = 'green'
    return 'color: %s' % color

In [40]:
grades_ext[struggling].style.applymap(highlight_values)

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,,,,,,,,,,
Hermione,,,,,,,,,,
Ron,,,,,,,,,,
Draco,,,,,,,,,,
Crabbe,31.0,15.0,,29.0,6.0,3.0,,52.0,,
Fred,,,,,,58.0,,,,
George,,,,,,58.0,,,,
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,,
Luna,,,,,,,,,,
Cho,,,,,,,,,,


We can reuse our boolean results to ask only for students that struggling in some class.

In [41]:
struggling

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Harry,False,False,False,False,False,False,False,False
Hermione,False,False,False,False,False,False,False,False
Ron,False,False,False,False,False,False,False,False
Draco,False,False,False,False,False,False,False,False
Crabbe,True,True,False,True,True,True,False,True
Fred,False,False,False,False,False,True,False,False
George,False,False,False,False,False,True,False,False
Goyle,True,True,False,True,True,True,True,True
Luna,False,False,False,False,False,False,False,False
Cho,False,False,False,False,False,False,False,False


We can also do a little formating to help read the boolean tables.

In [42]:
def true_green(val):
    if val:
        color = 'green'
    else:
        color = 'red'
    return 'color: %s' % color       

In [43]:
struggling.style.applymap(true_green)

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Harry,False,False,False,False,False,False,False,False
Hermione,False,False,False,False,False,False,False,False
Ron,False,False,False,False,False,False,False,False
Draco,False,False,False,False,False,False,False,False
Crabbe,True,True,False,True,True,True,False,True
Fred,False,False,False,False,False,True,False,False
George,False,False,False,False,False,True,False,False
Goyle,True,True,False,True,True,True,True,True
Luna,False,False,False,False,False,False,False,False
Cho,False,False,False,False,False,False,False,False


Which of the classes has someone struggling?

The `any` method here looks at truth values in each column and returns `True` if any of the cells in the column are `True`.

In [44]:
struggling.any()

Potions             True
Transfiguration     True
Runes              False
Defense             True
Divination          True
Data Science        True
Charms              True
Herbology           True
dtype: bool

This is equivalent to specifying the 'rows' axis (i.e., 'rows' is the default axis).

In [45]:
struggling.any(axis='rows')

Potions             True
Transfiguration     True
Runes              False
Defense             True
Divination          True
Data Science        True
Charms              True
Herbology           True
dtype: bool

This result above operated on each column (i.e., looked down the column).  

Now, examine each row (i.e., look across the row in each column) to see which students have at least one class where they are struggling.

In [46]:
struggling.any(axis='columns')

Harry       False
Hermione    False
Ron         False
Draco       False
Crabbe       True
Fred         True
George       True
Goyle        True
Luna        False
Cho         False
Cedric      False
Neville     False
dtype: bool

Find the struggling students.

Since the `any` function reduces the all boolean values across a row to a single value, Pandas can return the entire row matching the condition. 

In [47]:
grades_ext[struggling.any(axis='columns')]

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin


It is a bit trickier to return only the target students and classes.

In [48]:
struggling_classes = [c for c, v in struggling.any().items() if v]

In [1203]:
struggling_classes

['Potions',
 'Transfiguration',
 'Defense',
 'Divination',
 'Data Science',
 'Charms',
 'Herbology']

In [49]:
grades_ext.loc[struggling.any(axis='columns'), struggling_classes]

Unnamed: 0,Potions,Transfiguration,Defense,Divination,Data Science,Charms,Herbology
Crabbe,31.0,15.0,29.0,6.0,3.0,70.0,52.0
Fred,75.0,93.0,91.0,,58.0,,
George,75.0,93.0,91.0,,58.0,,
Goyle,23.0,43.0,32.0,11.0,21.0,41.0,49.0


Looking on the brighter side...

In [50]:
passing = (grades_ext[active_classes] >= 60)
passing

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Harry,True,True,False,True,True,True,True,True
Hermione,True,True,True,True,False,True,True,True
Ron,True,True,False,True,True,True,True,True
Draco,True,True,False,True,True,True,True,True
Crabbe,False,False,False,False,False,False,True,False
Fred,True,True,False,True,False,False,False,False
George,True,True,False,True,False,False,False,False
Goyle,False,False,False,False,False,False,False,False
Luna,True,True,True,True,True,True,True,True
Cho,False,True,False,True,True,True,True,True


In [51]:
passing.style.applymap(true_green)

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Harry,True,True,False,True,True,True,True,True
Hermione,True,True,True,True,False,True,True,True
Ron,True,True,False,True,True,True,True,True
Draco,True,True,False,True,True,True,True,True
Crabbe,False,False,False,False,False,False,True,False
Fred,True,True,False,True,False,False,False,False
George,True,True,False,True,False,False,False,False
Goyle,False,False,False,False,False,False,False,False
Luna,True,True,True,True,True,True,True,True
Cho,False,True,False,True,True,True,True,True


Who is passing all their classes?

In [52]:
passing.all(axis='columns')

Harry       False
Hermione    False
Ron         False
Draco       False
Crabbe      False
Fred        False
George      False
Goyle       False
Luna         True
Cho         False
Cedric      False
Neville     False
dtype: bool

***What?!?!?!?!?!***  

It seems everyone except Luna is failing at least one class!

I can sort of believe that except...

***Hermione*** is not passing all her classes?!?!?!?  I don't think so...

What is going on here?

Taking a closer look at the `passing` data object values above, we find that Hermione's Divination boolean is `False`.  

What is up?  Let's take another look at our data.

In [53]:
grades_ext

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw


Ahhhh...Hermione has `NaN` in Divination because she is not taking Divination!

Unlike many of the built-in Pandas mathematical operations, Pandas does not ignore `NaN` in boolean conditions.

We will need to account for this situation in our conditional expression.

Recall that the boolean operators for Pandas expressions are ***not*** the same as native Python.

Also, note that there are two subconditions in the statement below.  Each subcondition refers to a set of columns.  The Pandas implementation takes care of evaluating each cell with the appropriate corresponding cells throughout the conditional expression.

In [54]:
passing = ((grades_ext[active_classes] >= 60) | (grades_ext[active_classes].isnull()))
passing

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Harry,True,True,True,True,True,True,True,True
Hermione,True,True,True,True,True,True,True,True
Ron,True,True,True,True,True,True,True,True
Draco,True,True,True,True,True,True,True,True
Crabbe,False,False,True,False,False,False,True,False
Fred,True,True,True,True,True,False,True,True
George,True,True,True,True,True,False,True,True
Goyle,False,False,True,False,False,False,False,False
Luna,True,True,True,True,True,True,True,True
Cho,True,True,True,True,True,True,True,True


In [55]:
passing.all(axis='columns')

Harry        True
Hermione     True
Ron          True
Draco        True
Crabbe      False
Fred        False
George      False
Goyle       False
Luna         True
Cho          True
Cedric       True
Neville      True
dtype: bool

In [56]:
grades_ext[passing.all(axis='columns')]

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw
Cedric,,98.0,,,,,,,98.0,Hufflepuff
Neville,,84.0,,71.0,78.0,,,100.0,83.25,Gryffindor


We have seen boolean conditions that refer to a specific column or a set of columns.  Now, we will construct a condition on all the columns.  We do this by using only the dataframe name in the expression.

In the expression below, using `grades_ext` rather than a specific column (or row) specification causes Pandas to iterate the evaluation over all the rows and columns.  Also, note that `grades_ext.max()` is appropriately interpreted as the maximum in the relevant column for the comparison.

Finally, note that the max condition was assessed for both numeric columns and strings (the 'House' column).

In [1212]:
(grades_ext == grades_ext.max())

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,False,False,False,True,False,False,False,False,False,False
Hermione,True,True,True,True,False,True,True,True,True,False
Ron,False,False,False,False,False,False,False,False,False,False
Draco,True,False,False,False,False,False,False,False,False,True
Crabbe,False,False,False,False,False,False,False,False,False,True
Fred,False,False,False,False,False,False,False,False,False,False
George,False,False,False,False,False,False,False,False,False,False
Goyle,False,False,False,False,False,False,False,False,False,True
Luna,False,False,True,False,True,True,False,False,False,False
Cho,False,False,False,False,False,False,False,False,False,False


Now we can look at who has the highest grade in each class.

In [57]:
grades_ext[(grades_ext == grades_ext.max())]

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,,,,100.0,,,,,,
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,
Ron,,,,,,,,,,
Draco,100.0,,,,,,,,,Slytherin
Crabbe,,,,,,,,,,Slytherin
Fred,,,,,,,,,,
George,,,,,,,,,,
Goyle,,,,,,,,,,Slytherin
Luna,,,100.0,,98.0,100.0,,,,
Cho,,,,,,,,,,


Which students have the highest grade in at least one class?

In [58]:
(grades_ext == grades_ext.max()).any(axis='columns')

Harry        True
Hermione     True
Ron         False
Draco        True
Crabbe       True
Fred        False
George      False
Goyle        True
Luna         True
Cho         False
Cedric      False
Neville      True
dtype: bool

Crabbe and Goyle have the highest grade in some class?  That seems...errr...unlikely.  What happened?

We have a bug in our condition.  We forgot to exclude the 'House' column from the condition and
'Slytherin' is the highest (string) value in the column.  We will reform our query.

In [59]:
(grades_ext[active_classes] == grades_ext[active_classes].max()).any(axis='columns')

Harry        True
Hermione     True
Ron         False
Draco        True
Crabbe      False
Fred        False
George      False
Goyle       False
Luna         True
Cho         False
Cedric      False
Neville      True
dtype: bool

That's better...

Which students have perfect scores?

First, check for a perfect score in each class.

In [60]:
perfect = (grades_ext[active_classes] == 100) | (grades_ext[active_classes].isnull())
perfect

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Harry,False,False,True,True,False,False,False,False
Hermione,True,True,True,True,True,True,True,True
Ron,False,False,True,False,False,False,False,False
Draco,True,False,True,False,False,False,False,False
Crabbe,False,False,True,False,False,False,False,False
Fred,False,False,True,False,True,False,True,True
George,False,False,True,False,True,False,True,True
Goyle,False,False,True,False,False,False,False,False
Luna,False,False,True,False,False,True,False,False
Cho,True,False,True,False,False,False,False,False


In [61]:
grades_ext[perfect]

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,,,,100.0,,,,,,
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,,
Ron,,,,,,,,,,
Draco,100.0,,,,,,,,,
Crabbe,,,,,,,,,,
Fred,,,,,,,,,,
George,,,,,,,,,,
Goyle,,,,,,,,,,
Luna,,,100.0,,,100.0,,,,
Cho,,,,,,,,,,


Now, check who has a perfect score in all their classes.

In [62]:
perfect.all(axis='columns')

Harry       False
Hermione     True
Ron         False
Draco       False
Crabbe      False
Fred        False
George      False
Goyle       False
Luna        False
Cho         False
Cedric      False
Neville     False
dtype: bool

In [63]:
grades_ext[perfect.all(axis='columns')]

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor


I'm stunned.

Just out of curiousity, is there a class where all the students have perfect scores?

In [64]:
perfect.all()

Potions            False
Transfiguration    False
Runes               True
Defense            False
Divination         False
Data Science       False
Charms             False
Herbology          False
dtype: bool

In [65]:
grades_ext['Runes']

Harry         NaN
Hermione    100.0
Ron           NaN
Draco         NaN
Crabbe        NaN
Fred          NaN
George        NaN
Goyle         NaN
Luna        100.0
Cho           NaN
Cedric        NaN
Neville       NaN
Name: Runes, dtype: float64

## Data Manipulation - Direct operations on DataFrame values. ##

Recall our data...

In [66]:
grades_ext

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw


***Aggregations*** take a set of values (e.g., a Series) and return a single value (aggregation), such as summing all the values.

We have already seen several of this operations.

In [67]:
grades_ext[active_classes].mean()

Potions             72.000000
Transfiguration     81.500000
Runes              100.000000
Defense             78.727273
Divination          63.125000
Data Science        70.000000
Charms              85.500000
Herbology           84.222222
dtype: float64

In [68]:
grades_ext[active_classes].median()

Potions             75.0
Transfiguration     92.0
Runes              100.0
Defense             91.0
Divination          74.0
Data Science        82.0
Charms              94.0
Herbology           92.0
dtype: float64

Aggregations can operate down a column.

In [69]:
grades_ext['Potions'].sum()

648.0

In [70]:
grades_ext[active_classes].sum()

Potions            648.0
Transfiguration    978.0
Runes              200.0
Defense            866.0
Divination         505.0
Data Science       700.0
Charms             684.0
Herbology          758.0
dtype: float64

Aggregations can also act across rows.

In [71]:
grades_ext.loc['Harry', active_classes].sum()

611.0

In [72]:
grades_ext[active_classes].sum(axis='columns')

Harry       611.0
Hermione    700.0
Ron         598.0
Draco       591.0
Crabbe      206.0
Fred        317.0
George      317.0
Goyle       220.0
Luna        778.0
Cho         570.0
Cedric       98.0
Neville     333.0
dtype: float64

***Transformations*** operate on each value independently and does not reduce to a single value.  

Many transformations can be done directly with expressions.  Let's add a "curve" value to one of our classes.

The Data Science professor gave a 5 point bonus to everyone on the class.  

The scalar value of 5 is broadcast across all elements in the column (Series).

In [73]:
grades_ext['Data Science'] + 5

Harry        97.0
Hermione    105.0
Ron         103.0
Draco        77.0
Crabbe        8.0
Fred         63.0
George       63.0
Goyle        26.0
Luna        105.0
Cho         103.0
Cedric        NaN
Neville       NaN
Name: Data Science, dtype: float64

The original column was not changed.

In [74]:
grades_ext['Data Science']

Harry        92.0
Hermione    100.0
Ron          98.0
Draco        72.0
Crabbe        3.0
Fred         58.0
George       58.0
Goyle        21.0
Luna        100.0
Cho          98.0
Cedric        NaN
Neville       NaN
Name: Data Science, dtype: float64

Dumbledore awarded a 10 point bonus across all classes (omitting the non-class columns).

In this case, the set of active classes is two-dimensional (rows and columns) and the scalar bonus is broadcast across all cells.

In [76]:
grades_ext[active_classes] + 10

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Harry,90.0,102.0,,110.0,81.0,102.0,103.0,93.0
Hermione,110.0,110.0,110.0,110.0,,110.0,110.0,110.0
Ron,80.0,93.0,,102.0,83.0,108.0,105.0,97.0
Draco,110.0,98.0,,82.0,85.0,82.0,102.0,102.0
Crabbe,41.0,25.0,,39.0,16.0,13.0,80.0,62.0
Fred,85.0,103.0,,101.0,,68.0,,
George,85.0,103.0,,101.0,,68.0,,
Goyle,33.0,53.0,,42.0,21.0,31.0,51.0,59.0
Luna,104.0,107.0,110.0,103.0,108.0,110.0,108.0,108.0
Cho,,102.0,,105.0,103.0,108.0,105.0,107.0


Now, we use the `transform` function to perform a similar action to add a bonus.

In [77]:
def add_curve(val, curve=5):
    return (val + curve)

First, the Data Science curve.

In [78]:
grades_ext['Data Science'].transform(add_curve)

Harry        97.0
Hermione    105.0
Ron         103.0
Draco        77.0
Crabbe        8.0
Fred         63.0
George       63.0
Goyle        26.0
Luna        105.0
Cho         103.0
Cedric        NaN
Neville       NaN
Name: Data Science, dtype: float64

Now, Dumbledore's universal curve.

In [79]:
grades_ext[active_classes].transform(add_curve, curve=10)

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Harry,90.0,102.0,,110.0,81.0,102.0,103.0,93.0
Hermione,110.0,110.0,110.0,110.0,,110.0,110.0,110.0
Ron,80.0,93.0,,102.0,83.0,108.0,105.0,97.0
Draco,110.0,98.0,,82.0,85.0,82.0,102.0,102.0
Crabbe,41.0,25.0,,39.0,16.0,13.0,80.0,62.0
Fred,85.0,103.0,,101.0,,68.0,,
George,85.0,103.0,,101.0,,68.0,,
Goyle,33.0,53.0,,42.0,21.0,31.0,51.0,59.0
Luna,104.0,107.0,110.0,103.0,108.0,110.0,108.0,108.0
Cho,,102.0,,105.0,103.0,108.0,105.0,107.0


The combined scores in Potions and Transfiguration is often called "The Murders Pair" by Hogwarts students because of the difficulty of these classes.

Since the two columns are the same shape, the addition operation is performed pairwise with corresponding elements from each Series.

In [80]:
grades_ext['Potions'] + grades_ext['Transfiguration']

Harry       172.0
Hermione    200.0
Ron         153.0
Draco       188.0
Crabbe       46.0
Fred        168.0
George      168.0
Goyle        66.0
Luna        191.0
Cho           NaN
Cedric        NaN
Neville       NaN
dtype: float64

In [81]:
grades_ext[['Potions', 'Transfiguration']]

Unnamed: 0,Potions,Transfiguration
Harry,80.0,92.0
Hermione,100.0,100.0
Ron,70.0,83.0
Draco,100.0,88.0
Crabbe,31.0,15.0
Fred,75.0,93.0
George,75.0,93.0
Goyle,23.0,43.0
Luna,94.0,97.0
Cho,,92.0


In [82]:
grades_ext

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw


The Data Science professor gave a 5 point bonus to everyone on the class.  

The scalar value of 5 is broadcast across all elements in the column (Series).

In [83]:
grades_ext['Data Science'] + 5

Harry        97.0
Hermione    105.0
Ron         103.0
Draco        77.0
Crabbe        8.0
Fred         63.0
George       63.0
Goyle        26.0
Luna        105.0
Cho         103.0
Cedric        NaN
Neville       NaN
Name: Data Science, dtype: float64

The original column was not changed.

In [84]:
grades_ext['Data Science']

Harry        92.0
Hermione    100.0
Ron          98.0
Draco        72.0
Crabbe        3.0
Fred         58.0
George       58.0
Goyle        21.0
Luna        100.0
Cho          98.0
Cedric        NaN
Neville       NaN
Name: Data Science, dtype: float64

Dumbledore awarded a 10 point bonus across all classes (omitting the non-class columns).

In this case, the set of active classes is two-dimensional (rows and columns) and the scalar bonus is broadcast across all cells.

In [85]:
grades_ext[active_classes] + 10

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Harry,90.0,102.0,,110.0,81.0,102.0,103.0,93.0
Hermione,110.0,110.0,110.0,110.0,,110.0,110.0,110.0
Ron,80.0,93.0,,102.0,83.0,108.0,105.0,97.0
Draco,110.0,98.0,,82.0,85.0,82.0,102.0,102.0
Crabbe,41.0,25.0,,39.0,16.0,13.0,80.0,62.0
Fred,85.0,103.0,,101.0,,68.0,,
George,85.0,103.0,,101.0,,68.0,,
Goyle,33.0,53.0,,42.0,21.0,31.0,51.0,59.0
Luna,104.0,107.0,110.0,103.0,108.0,110.0,108.0,108.0
Cho,,102.0,,105.0,103.0,108.0,105.0,107.0


Now, we use the `transform` function to perform a similar action to add a bonus.

In [86]:
def add_curve(val, curve=5):
    return (val + curve)

First, the Data Science curve.

In [1242]:
grades_ext['Data Science'].transform(add_curve)

Harry        97.0
Hermione    105.0
Ron         103.0
Draco        77.0
Crabbe        8.0
Fred         63.0
George       63.0
Goyle        26.0
Luna        105.0
Cho         103.0
Cedric        NaN
Neville       NaN
Name: Data Science, dtype: float64

Now, Dumbledore's universal curve.

In [1243]:
grades_ext[active_classes].transform(add_curve, curve=10)

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Harry,90.0,102.0,,110.0,81.0,102.0,103.0,93.0
Hermione,110.0,110.0,110.0,110.0,,110.0,110.0,110.0
Ron,80.0,93.0,,102.0,83.0,108.0,105.0,97.0
Draco,110.0,98.0,,82.0,85.0,82.0,102.0,102.0
Crabbe,41.0,25.0,,39.0,16.0,13.0,80.0,62.0
Fred,85.0,103.0,,101.0,,68.0,,
George,85.0,103.0,,101.0,,68.0,,
Goyle,33.0,53.0,,42.0,21.0,31.0,51.0,59.0
Luna,104.0,107.0,110.0,103.0,108.0,110.0,108.0,108.0
Cho,,102.0,,105.0,103.0,108.0,105.0,107.0


The combined scores in Potions and Transfiguration is often called "The Murders Pair" by Hogwarts students because of the difficulty of these classes.

Since the two columns are the same shape, the addition operation is performed pairwise with corresponding elements from each Series.

In [1244]:
grades_ext['Potions'] + grades_ext['Transfiguration']

Harry       172.0
Hermione    200.0
Ron         153.0
Draco       188.0
Crabbe       46.0
Fred        168.0
George      168.0
Goyle        66.0
Luna        191.0
Cho           NaN
Cedric        NaN
Neville       NaN
dtype: float64

In [1245]:
grades_ext[['Potions', 'Transfiguration']]

Unnamed: 0,Potions,Transfiguration
Harry,80.0,92.0
Hermione,100.0,100.0
Ron,70.0,83.0
Draco,100.0,88.0
Crabbe,31.0,15.0
Fred,75.0,93.0
George,75.0,93.0
Goyle,23.0,43.0
Luna,94.0,97.0
Cho,,92.0


## Data Manipulation - Using `apply` and `applymap` to manipulate Series and DataFrame values. ##

Sometimes we need to perform aggregations or transformatiojns that are not provided as built-in functions.  The methods `apply` and `applymap` gives mechanisms for dealing with these situations.

`apply()` applies a function along a dimension (row or column) of a dataframe.

`applymap()` applies a function element-wise to a dataframe.

In [88]:
def passing(points, failing=60):
    if points < failing:
        return False
    else:
        return True

For a Series, `apply` passes the value of each Series element to the function.  A new Series is returned with the return value of the applied function for each Series element.

In [89]:
grades_ext['Potions'].apply(passing)

Harry        True
Hermione     True
Ron          True
Draco        True
Crabbe      False
Fred         True
George       True
Goyle       False
Luna         True
Cho          True
Cedric       True
Neville      True
Name: Potions, dtype: bool

By using the optional named parameter `failing` to `passing`, we can set the bar a bit higher (or lower).

In [90]:
grades_ext['Potions'].apply(passing, failing=80)

Harry        True
Hermione     True
Ron         False
Draco        True
Crabbe      False
Fred        False
George      False
Goyle       False
Luna         True
Cho          True
Cedric       True
Neville      True
Name: Potions, dtype: bool

When used on Series, each value of the Series is passed in turn. 

In [91]:
grades_ext['Potions'].apply(print)

80.0
100.0
70.0
100.0
31.0
75.0
75.0
23.0
94.0
nan
nan
nan


Harry       None
Hermione    None
Ron         None
Draco       None
Crabbe      None
Fred        None
George      None
Goyle       None
Luna        None
Cho         None
Cedric      None
Neville     None
Name: Potions, dtype: object

For a DataFrame, each Series is passed as the argument to the function invoked by `apply`.  If your function is not prepared for the Series, the results will not be as you expect.

In [92]:
grades_ext[active_classes]

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0
Fred,75.0,93.0,,91.0,,58.0,,
George,75.0,93.0,,91.0,,58.0,,
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0


In [93]:
grades_ext[active_classes].apply(print)

Harry        80.0
Hermione    100.0
Ron          70.0
Draco       100.0
Crabbe       31.0
Fred         75.0
George       75.0
Goyle        23.0
Luna         94.0
Cho           NaN
Cedric        NaN
Neville       NaN
Name: Potions, dtype: float64
Harry        92.0
Hermione    100.0
Ron          83.0
Draco        88.0
Crabbe       15.0
Fred         93.0
George       93.0
Goyle        43.0
Luna         97.0
Cho          92.0
Cedric       98.0
Neville      84.0
Name: Transfiguration, dtype: float64
Harry         NaN
Hermione    100.0
Ron           NaN
Draco         NaN
Crabbe        NaN
Fred          NaN
George        NaN
Goyle         NaN
Luna        100.0
Cho           NaN
Cedric        NaN
Neville       NaN
Name: Runes, dtype: float64
Harry       100.0
Hermione    100.0
Ron          92.0
Draco        72.0
Crabbe       29.0
Fred         91.0
George       91.0
Goyle        32.0
Luna         93.0
Cho          95.0
Cedric        NaN
Neville      71.0
Name: Defense, dtype: float64
Harry     

Potions            None
Transfiguration    None
Runes              None
Defense            None
Divination         None
Data Science       None
Charms             None
Herbology          None
dtype: object

Using `apply` with a function that expects a single value doesn't work when applied to a dataframe.

In [94]:
grades_ext[active_classes].apply(passing)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

We can vectorize a function that expects a single value using the `vectorize` method.  This creates a new function to use with `apply`.

In [95]:
grades_ext

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw


In [96]:
passing_vectorized = np.vectorize(passing)

In [97]:
grades_ext[active_classes].apply(passing_vectorized)

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Harry,True,True,True,True,True,True,True,True
Hermione,True,True,True,True,True,True,True,True
Ron,True,True,True,True,True,True,True,True
Draco,True,True,True,True,True,True,True,True
Crabbe,False,False,True,False,False,False,True,False
Fred,True,True,True,True,True,False,True,True
George,True,True,True,True,True,False,True,True
Goyle,False,False,True,False,False,False,False,False
Luna,True,True,True,True,True,True,True,True
Cho,True,True,True,True,True,True,True,True


We can still use the parameters from the original function with the vectorized version.

In [98]:
grades_ext[active_classes].apply(passing_vectorized, failing=80)

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Harry,True,True,True,True,False,True,True,True
Hermione,True,True,True,True,True,True,True,True
Ron,False,True,True,True,False,True,True,True
Draco,True,True,True,False,False,False,True,True
Crabbe,False,False,True,False,False,False,False,False
Fred,False,True,True,True,True,False,True,True
George,False,True,True,True,True,False,True,True
Goyle,False,False,True,False,False,False,False,False
Luna,True,True,True,True,True,True,True,True
Cho,True,True,True,True,True,True,True,True


The vectorized version of the function also works on a Series.

In [99]:
grades_ext['Potions'].apply(passing_vectorized)

Harry        True
Hermione     True
Ron          True
Draco        True
Crabbe      False
Fred         True
George       True
Goyle       False
Luna         True
Cho          True
Cedric       True
Neville      True
Name: Potions, dtype: bool

We could directly write the function as being vector aware.  The returned value computes a Series for the return value.

In [100]:
def passing_vec(vec):
    return ((vec < 60))

This works against a Series...

In [101]:
grades_ext['Potions'].apply(passing_vec)

Harry       False
Hermione    False
Ron         False
Draco       False
Crabbe       True
Fred        False
George      False
Goyle        True
Luna        False
Cho         False
Cedric      False
Neville     False
Name: Potions, dtype: bool

And also against a DataFrame...

In [102]:
grades_ext[active_classes].apply(passing_vec)

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Harry,False,False,False,False,False,False,False,False
Hermione,False,False,False,False,False,False,False,False
Ron,False,False,False,False,False,False,False,False
Draco,False,False,False,False,False,False,False,False
Crabbe,True,True,False,True,True,True,False,True
Fred,False,False,False,False,False,True,False,False
George,False,False,False,False,False,True,False,False
Goyle,True,True,False,True,True,True,True,True
Luna,False,False,False,False,False,False,False,False
Cho,False,False,False,False,False,False,False,False


The `applymap` method applies the function to each cell of a DataFrame.  In this case, only the single value of the cell is passed to the invoked function.  

In [103]:
grades_ext[active_classes].applymap(passing)

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Harry,True,True,True,True,True,True,True,True
Hermione,True,True,True,True,True,True,True,True
Ron,True,True,True,True,True,True,True,True
Draco,True,True,True,True,True,True,True,True
Crabbe,False,False,True,False,False,False,True,False
Fred,True,True,True,True,True,False,True,True
George,True,True,True,True,True,False,True,True
Goyle,False,False,True,False,False,False,False,False
Luna,True,True,True,True,True,True,True,True
Cho,True,True,True,True,True,True,True,True


Note that `applymap` is not available as a method for a Series, so it won't work against a single column (Series).

In [104]:
grades_ext['Potions'].applymap(passing)

AttributeError: 'Series' object has no attribute 'applymap'

Recall our formatting tricks earlier to highlight dataframes.  Now, consider how we set the formats using `applymap` to each cell of interest.

First, we define our formatting function.  

In [106]:
def passing_green(val):
    if val > 60:
        color = 'green'
    elif val > 0:
        color = 'red'
    else:
        color = 'black'
    return 'color: %s' % color

Now, we are going to apply our function to every cell in the dataframe.  

In [107]:
grades[active_classes].style.applymap(passing_green)

Unnamed: 0_level_0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0
Fred,75.0,93.0,,91.0,,58.0,,
George,75.0,93.0,,91.0,,58.0,,
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0


Here are some examples of dataframe formatting:  https://www.geeksforgeeks.org/set-pandas-dataframe-background-color-and-font-color-in-python/

### More Data Manipulation ###

Recall how earlier we added a 'curve' to grades by directly manipulating the dataframe.  We can also do that with `apply` and/or `applymap`.

In [108]:
grades_ext

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw


Now, we use the `applymap` function to add the curve.

In [1265]:
def add_curve(val, curve=5):
    return (val + curve)

First, the Data Science curve.

In [109]:
grades_ext['Data Science'].apply(add_curve)

Harry        97.0
Hermione    105.0
Ron         103.0
Draco        77.0
Crabbe        8.0
Fred         63.0
George       63.0
Goyle        26.0
Luna        105.0
Cho         103.0
Cedric        NaN
Neville       NaN
Name: Data Science, dtype: float64

Now, Dumbledore's universal curve.

In [110]:
grades_ext[active_classes].applymap(add_curve, curve=10)

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Harry,90.0,102.0,,110.0,81.0,102.0,103.0,93.0
Hermione,110.0,110.0,110.0,110.0,,110.0,110.0,110.0
Ron,80.0,93.0,,102.0,83.0,108.0,105.0,97.0
Draco,110.0,98.0,,82.0,85.0,82.0,102.0,102.0
Crabbe,41.0,25.0,,39.0,16.0,13.0,80.0,62.0
Fred,85.0,103.0,,101.0,,68.0,,
George,85.0,103.0,,101.0,,68.0,,
Goyle,33.0,53.0,,42.0,21.0,31.0,51.0,59.0
Luna,104.0,107.0,110.0,103.0,108.0,110.0,108.0,108.0
Cho,,102.0,,105.0,103.0,108.0,105.0,107.0


We are going to convert points to American-style letter grades.

In [111]:
def letter_grade(points):
    if points >= 90:
        grade = 'A'
    elif points >= 80:
        grade = 'B'
    elif points >= 70:
        grade = 'C'
    elif points >=60:
        grade = 'D'
    elif points >= 0:
        grade = 'F'
    else:
        grade = ' '
    return grade

In [112]:
letter_grade_vec = np.vectorize(letter_grade)

In [113]:
grades_ext

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw


The `letter_grade` function will work on a Series, either on a column or a row.

In [114]:
grades_ext['Potions'].apply(letter_grade)

Harry       B
Hermione    A
Ron         C
Draco       A
Crabbe      F
Fred        C
George      C
Goyle       F
Luna        A
Cho          
Cedric       
Neville      
Name: Potions, dtype: object

In [115]:
grades_ext.loc['Crabbe', active_classes].apply(letter_grade)

Potions            F
Transfiguration    F
Runes               
Defense            F
Divination         F
Data Science       F
Charms             C
Herbology          F
Name: Crabbe, dtype: object

The original dataframe is not affected by the apply.

In [116]:
grades_ext

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw


We need to use the vectorized function `letter_grade_vec` on a DataFrame.

In [117]:
grades_ext[active_classes].apply(letter_grade_vec)

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Harry,B,A,,A,C,A,A,B
Hermione,A,A,A,A,,A,A,A
Ron,C,B,,A,C,A,A,B
Draco,A,B,,C,C,C,A,A
Crabbe,F,F,,F,F,F,C,F
Fred,C,A,,A,,F,,
George,C,A,,A,,F,,
Goyle,F,F,,F,F,F,F,F
Luna,A,A,A,A,A,A,A,A
Cho,,A,,A,A,A,A,A


In [118]:
grades_ext.loc['Crabbe', active_classes].apply(letter_grade_vec)

Potions            F
Transfiguration    F
Runes               
Defense            F
Divination         F
Data Science       F
Charms             C
Herbology          F
Name: Crabbe, dtype: object

`applymap` can use either version of the function.

In [119]:
grades_ext[active_classes].applymap(letter_grade)

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Harry,B,A,,A,C,A,A,B
Hermione,A,A,A,A,,A,A,A
Ron,C,B,,A,C,A,A,B
Draco,A,B,,C,C,C,A,A
Crabbe,F,F,,F,F,F,C,F
Fred,C,A,,A,,F,,
George,C,A,,A,,F,,
Goyle,F,F,,F,F,F,F,F
Luna,A,A,A,A,A,A,A,A
Cho,,A,,A,A,A,A,A


In [1277]:
grades_ext[active_classes].applymap(letter_grade_vec)

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Harry,B,A,,A,C,A,A,B
Hermione,A,A,A,A,,A,A,A
Ron,C,B,,A,C,A,A,B
Draco,A,B,,C,C,C,A,A
Crabbe,F,F,,F,F,F,C,F
Fred,C,A,,A,,F,,
George,C,A,,A,,F,,
Goyle,F,F,,F,F,F,F,F
Luna,A,A,A,A,A,A,A,A
Cho,,A,,A,A,A,A,A


We want to make a copy of our main dataframe so we do not destroy the original values when we manipulate the data.

A simple assigment of a dataframe does **not** make a copy, but instead provides a reference.

In [120]:
grades_ext2 = grades_ext

In [121]:
grades_ext2.loc['Harry', 'Potions'] = 10

In [122]:
grades_ext2

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,10.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw


In [123]:
grades_ext

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,10.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw


In [124]:
grades_ext.loc['Harry', 'Potions'] = 80

In [125]:
grades_ext.loc['Harry', 'Potions']

80.0

In [126]:
grades_ext2.loc['Harry', 'Potions']

80.0

Make an explicit copy of the dataframe using the `copy` method.

In [127]:
grades_ext2 = grades_ext.copy()

In [128]:
grades_ext2.loc['Harry', 'Potions'] = 10

In [129]:
grades_ext2.loc['Harry', 'Potions']

10.0

In [130]:
grades_ext.loc['Harry', 'Potions']

80.0

In [131]:
grades_ext2.loc['Harry', 'Potions'] = 80

We want to modify the grades in our new dataframe to be letter grades.  The `apply` and `applymap` methods don't have an `inplace` option.  We must explicitly assign values to the dataframe cells.

In [132]:
grades_ext2

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw


Change a single column.

In [133]:
grades_ext2['Average Points'] = grades_ext2['Average Points'].apply(letter_grade) 
grades_ext2

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,B,Gryffindor
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,A,Gryffindor
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,B,Gryffindor
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,B,Slytherin
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,F,Slytherin
Fred,75.0,93.0,,91.0,,58.0,,,C,Gryffindor
George,75.0,93.0,,91.0,,58.0,,,C,Gryffindor
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,F,Slytherin
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,A,Ravenclaw
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,A,Ravenclaw


Change a subset of the columns...

In [134]:
grades_ext2[active_classes] = grades_ext2[active_classes].apply(letter_grade_vec)
grades_ext2

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,B,A,,A,C,A,A,B,B,Gryffindor
Hermione,A,A,A,A,,A,A,A,A,Gryffindor
Ron,C,B,,A,C,A,A,B,B,Gryffindor
Draco,A,B,,C,C,C,A,A,B,Slytherin
Crabbe,F,F,,F,F,F,C,F,F,Slytherin
Fred,C,A,,A,,F,,,C,Gryffindor
George,C,A,,A,,F,,,C,Gryffindor
Goyle,F,F,,F,F,F,F,F,F,Slytherin
Luna,A,A,A,A,A,A,A,A,A,Ravenclaw
Cho,,A,,A,A,A,A,A,A,Ravenclaw


Note that our function fails if a string value is encountered.

In [135]:
grades_ext2['House'].apply(letter_grade)

TypeError: '>=' not supported between instances of 'str' and 'int'

We can make a slightly more resilient function to apply...

In [136]:
def safer_letter_grade(points):
    if type(points) == int or type(points) == float:
        return(letter_grade(points))
    else:
        return(points)

In [137]:
grades_ext2['House'].apply(safer_letter_grade)

Harry       Gryffindor
Hermione    Gryffindor
Ron         Gryffindor
Draco        Slytherin
Crabbe       Slytherin
Fred        Gryffindor
George      Gryffindor
Goyle        Slytherin
Luna         Ravenclaw
Cho          Ravenclaw
Cedric      Hufflepuff
Neville     Gryffindor
Name: House, dtype: object

In [138]:
grades_ext2 = grades_ext.copy()
grades_ext2

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw


Now, we could apply the safer version to every cell.

In [139]:
grades_ext2.applymap(safer_letter_grade)

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,B,A,,A,C,A,A,B,B,Gryffindor
Hermione,A,A,A,A,,A,A,A,A,Gryffindor
Ron,C,B,,A,C,A,A,B,B,Gryffindor
Draco,A,B,,C,C,C,A,A,B,Slytherin
Crabbe,F,F,,F,F,F,C,F,F,Slytherin
Fred,C,A,,A,,F,,,C,Gryffindor
George,C,A,,A,,F,,,C,Gryffindor
Goyle,F,F,,F,F,F,F,F,F,Slytherin
Luna,A,A,A,A,A,A,A,A,A,Ravenclaw
Cho,,A,,A,A,A,A,A,A,Ravenclaw


Of course, nothing was changed in the original dataframe.

In [140]:
grades_ext2

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw


Now, we will change all the cells in the dataframe with one call...

In [141]:
grades_ext2 = grades_ext2.applymap(safer_letter_grade)

In [142]:
grades_ext2

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,B,A,,A,C,A,A,B,B,Gryffindor
Hermione,A,A,A,A,,A,A,A,A,Gryffindor
Ron,C,B,,A,C,A,A,B,B,Gryffindor
Draco,A,B,,C,C,C,A,A,B,Slytherin
Crabbe,F,F,,F,F,F,C,F,F,Slytherin
Fred,C,A,,A,,F,,,C,Gryffindor
George,C,A,,A,,F,,,C,Gryffindor
Goyle,F,F,,F,F,F,F,F,F,Slytherin
Luna,A,A,A,A,A,A,A,A,A,Ravenclaw
Cho,,A,,A,A,A,A,A,A,Ravenclaw


## Using Groupby ##

We sometimes wish to perform operations, such as aggregations, across distinct subsets of the data.  The `groupby` method allows to specify the groupings.

Recall our full set of grades data.

In [143]:
grades_ext

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw


We will investigate how each of the houses are doing in their classwork.  

The `groupby` method first groups the rows by value of the specified column, then performs later operations on each of the groups independently.  In this case, we will perform the `mean` aggregation on each group.

In [144]:
grades_ext.groupby('House').mean()

Unnamed: 0_level_0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points
House,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Gryffindor,80.0,90.833333,100.0,90.833333,74.0,81.2,96.0,92.5,85.744048
Hufflepuff,,98.0,,,,,,,98.0
Ravenclaw,94.0,94.5,100.0,94.0,95.5,99.0,96.5,97.5,96.125
Slytherin,51.333333,48.666667,,44.333333,30.666667,32.0,67.666667,64.333333,48.428571


In [1302]:
grades_ext.groupby('House')['Average Points'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
House,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Gryffindor,6.0,85.744048,7.697714,79.25,80.25,84.339286,86.821429,100.0
Hufflepuff,1.0,98.0,,98.0,98.0,98.0,98.0,98.0
Ravenclaw,2.0,96.125,1.59099,95.0,95.5625,96.125,96.6875,97.25
Slytherin,3.0,48.428571,31.192948,29.428571,30.428571,31.428571,57.928571,84.428571


In [147]:
grades_ext.groupby('House')['Average Points'].describe().sort_values(by='mean', ascending=False)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
House,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Hufflepuff,1.0,98.0,,98.0,98.0,98.0,98.0,98.0
Ravenclaw,2.0,96.125,1.59099,95.0,95.5625,96.125,96.6875,97.25
Gryffindor,6.0,85.744048,7.697714,79.25,80.25,84.339286,86.821429,100.0
Slytherin,3.0,48.428571,31.192948,29.428571,30.428571,31.428571,57.928571,84.428571


We can construct a `groupby` object, then use that object for multiple operations.

In [152]:
grouped_grades = grades_ext.groupby('House')

In [153]:
grouped_grades['Potions'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
House,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Gryffindor,5.0,80.0,11.726039,70.0,75.0,75.0,80.0,100.0
Hufflepuff,0.0,,,,,,,
Ravenclaw,1.0,94.0,,94.0,94.0,94.0,94.0,94.0
Slytherin,3.0,51.333333,42.335958,23.0,27.0,31.0,65.5,100.0


In [154]:
grouped_grades['Potions'].min()

House
Gryffindor    70.0
Hufflepuff     NaN
Ravenclaw     94.0
Slytherin     23.0
Name: Potions, dtype: float64

In [155]:
grouped_grades[active_classes].min()

Unnamed: 0_level_0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
House,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Gryffindor,70.0,83.0,100.0,71.0,71.0,58.0,93.0,83.0
Hufflepuff,,98.0,,,,,,
Ravenclaw,94.0,92.0,100.0,93.0,93.0,98.0,95.0,97.0
Slytherin,23.0,15.0,,29.0,6.0,3.0,41.0,49.0


How do the mean grades across house look in each class?

In [156]:
grouped_grades[active_classes].mean()

Unnamed: 0_level_0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
House,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Gryffindor,80.0,90.833333,100.0,90.833333,74.0,81.2,96.0,92.5
Hufflepuff,,98.0,,,,,,
Ravenclaw,94.0,94.5,100.0,94.0,95.5,99.0,96.5,97.5
Slytherin,51.333333,48.666667,,44.333333,30.666667,32.0,67.666667,64.333333


While many aggregations are built-in, we may sometimes need additional power in applying aggregations.  That is where the `agg` method comes in.

The `agg` method can perform aggregation functions across a base dataframe or across a grouped object resulting from a `groupby`.

In [157]:
grouped_grades[active_classes].agg(np.median)

Unnamed: 0_level_0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
House,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Gryffindor,75.0,92.5,100.0,91.5,73.0,92.0,95.0,93.5
Hufflepuff,,98.0,,,,,,
Ravenclaw,94.0,94.5,100.0,94.0,95.5,99.0,96.5,97.5
Slytherin,31.0,43.0,,32.0,11.0,21.0,70.0,52.0


In [158]:
grouped_grades[active_classes].median()

Unnamed: 0_level_0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
House,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Gryffindor,75.0,92.5,100.0,91.5,73.0,92.0,95.0,93.5
Hufflepuff,,98.0,,,,,,
Ravenclaw,94.0,94.5,100.0,94.0,95.5,99.0,96.5,97.5
Slytherin,31.0,43.0,,32.0,11.0,21.0,70.0,52.0


This is similar to directly calling an aggregation function (e.g., `mean`), but `agg` allows a bit more flexibility.

For example, what if we want to find the mode of each value in the groups.  The groupby object doesn't have a `mode` method.

In [159]:
grouped_grades[active_classes].mode()

AttributeError: 'DataFrameGroupBy' object has no attribute 'mode'

We can use `agg` to execute a list of aggregations.

In [160]:
grouped_grades[active_classes].agg([np.mean, Series.mode, Series.median])

Unnamed: 0_level_0,Potions,Potions,Potions,Transfiguration,Transfiguration,Transfiguration,Runes,Runes,Runes,Data Science,Data Science,Data Science,Herbology,Herbology,Herbology
Unnamed: 0_level_1,mean,mode,median,mean,mode,median,mean,mode,median,mean,mode,median,mean,mode,median
House,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Gryffindor,80.0,75.0,75.0,90.833333,93.0,92.5,100.0,100.0,100.0,81.2,58.0,92.0,92.5,100.0,93.5
Hufflepuff,,[],,98.0,98.0,98.0,,[],,,[],,,[],
Ravenclaw,94.0,94.0,94.0,94.5,"[92.0, 97.0]",94.5,100.0,100.0,100.0,99.0,"[98.0, 100.0]",99.0,97.5,"[97.0, 98.0]",97.5
Slytherin,51.333333,"[23.0, 31.0, 100.0]",31.0,48.666667,"[15.0, 43.0, 88.0]",43.0,,[],,32.0,"[3.0, 21.0, 72.0]",21.0,64.333333,"[49.0, 52.0, 92.0]",52.0


Here is a little fancy selective application of aggregation functions.

In [161]:
agg_dict = {'Potions': [np.max, np.min], 'Data Science': [np.mean, Series.mode, Series.median], 'Herbology': [np.mean, np.std]}

In [163]:
grouped_grades.agg(agg_dict)

Unnamed: 0_level_0,Potions,Potions,Data Science,Data Science,Data Science,Herbology,Herbology
Unnamed: 0_level_1,amax,amin,mean,mode,median,mean,std
House,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Gryffindor,100.0,70.0,81.2,58.0,92.0,92.5,8.812869
Hufflepuff,,,,[],,,
Ravenclaw,94.0,94.0,99.0,"[98.0, 100.0]",99.0,97.5,0.707107
Slytherin,100.0,23.0,32.0,"[3.0, 21.0, 72.0]",21.0,64.333333,24.006943


We can use our own functions with `agg`.

In [165]:
def B_sum(values):
    sum = 0
    for value in values:
        if value >= 80 and value < 90:
            sum = sum + value
    return(sum)

In [166]:
grouped_grades[active_classes].agg([np.sum, B_sum])

Unnamed: 0_level_0,Potions,Potions,Transfiguration,Transfiguration,Runes,Runes,Defense,Defense,Divination,Divination,Data Science,Data Science,Charms,Charms,Herbology,Herbology
Unnamed: 0_level_1,sum,B_sum,sum,B_sum,sum,B_sum,sum,B_sum,sum,B_sum,sum,B_sum,sum,B_sum,sum,B_sum
House,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Gryffindor,400.0,80.0,545.0,167.0,100.0,0,545.0,0,222.0,0,406.0,0,288.0,0,370.0,170.0
Hufflepuff,0.0,0.0,98.0,0.0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0.0
Ravenclaw,94.0,0.0,189.0,0.0,100.0,0,188.0,0,191.0,0,198.0,0,193.0,0,195.0,0.0
Slytherin,154.0,0.0,146.0,88.0,0.0,0,133.0,0,92.0,0,96.0,0,203.0,0,193.0,0.0


### Examining Groups ###

We can take a look at the groups and their contents.

In [167]:
grouped_grades.groups

{'Gryffindor': ['Harry', 'Hermione', 'Ron', 'Fred', 'George', 'Neville'], 'Hufflepuff': ['Cedric'], 'Ravenclaw': ['Luna', 'Cho'], 'Slytherin': ['Draco', 'Crabbe', 'Goyle']}

We can also retrieve a particular group.

In [169]:
grouped_grades.get_group('Slytherin')

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin


There has been some speculation that some professors tend to favor or penalize students from particular houses.  Let's investigate a bit.

It might be a useful to know the professor for each class.

Add some information on instructors.

In [170]:
class_instructor = {'Potions': 'Snape', 
                    'Divination': 'Trelawney', 
                    'Defense': 'Lockhart', 
                    'Data Science': 'Talley', 
                    'Transfiguration': 'McGonagall',
                    'Charms': 'Flitwick',
                    'Choir': 'Flitwick',
                    'Herbology': 'Sprout'}

In [171]:
grades_instructor = grades_ext.rename(columns = class_instructor)
grades_instructor

Unnamed: 0,Snape,McGonagall,Runes,Lockhart,Trelawney,Talley,Flitwick,Sprout,Average Points,House
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw


In [172]:
grouped_grades_instructor = grades_instructor.groupby('House')

In [173]:
grouped_grades_instructor.mean()

Unnamed: 0_level_0,Snape,McGonagall,Runes,Lockhart,Trelawney,Talley,Flitwick,Sprout,Average Points
House,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Gryffindor,80.0,90.833333,100.0,90.833333,74.0,81.2,96.0,92.5,85.744048
Hufflepuff,,98.0,,,,,,,98.0
Ravenclaw,94.0,94.5,100.0,94.0,95.5,99.0,96.5,97.5,96.125
Slytherin,51.333333,48.666667,,44.333333,30.666667,32.0,67.666667,64.333333,48.428571


In the spirit of Fred and George, let's make this a bit more fun. 

Let's change a few grade changes through magic (actually, Pandas).

<span style="color:red">
WARNING:   It is unethical to arbitrarily change grades, even through magic (or Pandas).
<p>Do not attempt.  This is only an example for pedalogical purposes.
</span>

In [174]:
grades_instructor.loc['Harry', 'Snape'] = 10.0

In [175]:
grades_instructor.loc['Ron', 'Snape'] = 10.0

In [176]:
grades_instructor

Unnamed: 0,Snape,McGonagall,Runes,Lockhart,Trelawney,Talley,Flitwick,Sprout,Average Points,House
Harry,10.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor
Ron,10.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw


In [177]:
grouped_grades_instructor.mean()

Unnamed: 0_level_0,Snape,McGonagall,Runes,Lockhart,Trelawney,Talley,Flitwick,Sprout,Average Points
House,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Gryffindor,80.0,90.833333,100.0,90.833333,74.0,81.2,96.0,92.5,85.744048
Hufflepuff,,98.0,,,,,,,98.0
Ravenclaw,94.0,94.5,100.0,94.0,95.5,99.0,96.5,97.5,96.125
Slytherin,51.333333,48.666667,,44.333333,30.666667,32.0,67.666667,64.333333,48.428571


Note that the computed mean for the groups did ***not*** change after we changed grades.

We need to reconstruct the groupby object to get accurate answers when values in the dataframe change.

In [178]:
grouped_grades_instructor = grades_instructor.groupby('House')

In [179]:
grouped_grades_instructor.mean()

Unnamed: 0_level_0,Snape,McGonagall,Runes,Lockhart,Trelawney,Talley,Flitwick,Sprout,Average Points
House,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Gryffindor,54.0,90.833333,100.0,90.833333,74.0,81.2,96.0,92.5,85.744048
Hufflepuff,,98.0,,,,,,,98.0
Ravenclaw,94.0,94.5,100.0,94.0,95.5,99.0,96.5,97.5,96.125
Slytherin,51.333333,48.666667,,44.333333,30.666667,32.0,67.666667,64.333333,48.428571


One of the professors has a pretty low mean for Slytherin students (hint: see Data Science), but there is no 'smoking wand' to confirm the bias.

## Multi-level groups ##

Let's go back to our original data and augment it with some additional data.

In [180]:
grades_ext

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw


In [181]:
quidditch_players = ['Harry', 'Ron', 'Fred', 'George', 'Cedric', 'Cho', 'Draco']
quidditch_players

['Harry', 'Ron', 'Fred', 'George', 'Cedric', 'Cho', 'Draco']

In [182]:
grades_ext.index.to_series()

Harry          Harry
Hermione    Hermione
Ron              Ron
Draco          Draco
Crabbe        Crabbe
Fred            Fred
George        George
Goyle          Goyle
Luna            Luna
Cho              Cho
Cedric        Cedric
Neville      Neville
dtype: object

In [183]:
def plays_quidditch(x, players):
    return(x in players)

In [184]:
plays_quidditch('Harry', players=quidditch_players)

True

In [185]:
players = grades_ext.index.to_series().apply(plays_quidditch, players=quidditch_players)
players

Harry        True
Hermione    False
Ron          True
Draco        True
Crabbe      False
Fred         True
George       True
Goyle       False
Luna        False
Cho          True
Cedric       True
Neville     False
dtype: bool

In [186]:
grades_ext['Quidditch'] = players

In [187]:
grades_ext

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House,Quidditch
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor,True
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor,False
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor,True
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin,True
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin,False
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor,True
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor,True
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin,False
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw,False
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw,True


In [188]:
grouped_grades = grades_ext.groupby(['House', 'Quidditch'])

In [189]:
grouped_grades.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points
House,Quidditch,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Gryffindor,False,100.0,92.0,100.0,85.5,78.0,100.0,100.0,100.0,91.625
Gryffindor,True,75.0,90.25,,93.5,72.0,76.5,94.0,85.0,82.803571
Hufflepuff,True,,98.0,,,,,,,98.0
Ravenclaw,False,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25
Ravenclaw,True,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0
Slytherin,False,27.0,29.0,,30.5,8.5,12.0,55.5,50.5,30.428571
Slytherin,True,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571


Remove the extra column we created

In [190]:
grades_ext.drop('Quidditch', axis='columns', inplace=True)

In [191]:
grades_ext

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw


## Bins ###

It is sometimes useful to group observations into "buckets" or "bins" where a range of values are labeled the same.  

We can use the letter grades example to demonstrate binning.

In [192]:
grades_ext2 = grades_ext.copy()
grades_ext2

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw


In [193]:
bins = [0, 60, 70, 80, 90, 101]
labels = ['F', 'D', 'C', 'B', 'A']

In [194]:
pd.cut(grades_ext2['Potions'], bins=bins, labels=labels, right=False, include_lowest=True)

Harry         B
Hermione      A
Ron           C
Draco         A
Crabbe        F
Fred          C
George        C
Goyle         F
Luna          A
Cho         NaN
Cedric      NaN
Neville     NaN
Name: Potions, dtype: category
Categories (5, object): ['F' < 'D' < 'C' < 'B' < 'A']

In [195]:
grades_ext2[active_classes].apply(pd.cut, bins=bins, labels=labels, right=False, include_lowest=True)

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology
Harry,B,A,,A,C,A,A,B
Hermione,A,A,A,A,,A,A,A
Ron,C,B,,A,C,A,A,B
Draco,A,B,,C,C,C,A,A
Crabbe,F,F,,F,F,F,C,F
Fred,C,A,,A,,F,,
George,C,A,,A,,F,,
Goyle,F,F,,F,F,F,F,F
Luna,A,A,A,A,A,A,A,A
Cho,,A,,A,A,A,A,A


In [196]:
grades_ext2[active_classes + '_grade'] = grades_ext2[active_classes].apply(pd.cut, bins=bins, labels=labels, right=False, include_lowest=True)

In [197]:
grades_ext2

Unnamed: 0,Potions,Transfiguration,Runes,Defense,Divination,Data Science,Charms,Herbology,Average Points,House,Potions_grade,Transfiguration_grade,Runes_grade,Defense_grade,Divination_grade,Data Science_grade,Charms_grade,Herbology_grade
Harry,80.0,92.0,,100.0,71.0,92.0,93.0,83.0,87.285714,Gryffindor,B,A,,A,C,A,A,B
Hermione,100.0,100.0,100.0,100.0,,100.0,100.0,100.0,100.0,Gryffindor,A,A,A,A,,A,A,A
Ron,70.0,83.0,,92.0,73.0,98.0,95.0,87.0,85.428571,Gryffindor,C,B,,A,C,A,A,B
Draco,100.0,88.0,,72.0,75.0,72.0,92.0,92.0,84.428571,Slytherin,A,B,,C,C,C,A,A
Crabbe,31.0,15.0,,29.0,6.0,3.0,70.0,52.0,29.428571,Slytherin,F,F,,F,F,F,C,F
Fred,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor,C,A,,A,,F,,
George,75.0,93.0,,91.0,,58.0,,,79.25,Gryffindor,C,A,,A,,F,,
Goyle,23.0,43.0,,32.0,11.0,21.0,41.0,49.0,31.428571,Slytherin,F,F,,F,F,F,F,F
Luna,94.0,97.0,100.0,93.0,98.0,100.0,98.0,98.0,97.25,Ravenclaw,A,A,A,A,A,A,A,A
Cho,,92.0,,95.0,93.0,98.0,95.0,97.0,95.0,Ravenclaw,,A,,A,A,A,A,A
