In [1]:
import pandas as pd
import numpy as np

# Multiindex

If you set an index to more than one columnn you are creating multi index or Hieararchical index. This makes asking questions based on indexes a lot more easier, and also opens the possibility of working with multidimensional data. 

We'll use the example sourced from [here](https://chrisalbon.com/python/pandas_hierarchical_data.html). 

In [2]:
# Create dataframe
raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'], 
        'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'], 
        'name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze', 'Jacon', 'Ryaner', 'Sone', 'Sloan', 'Piger', 'Riani', 'Ali'], 
        'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
        'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70]}
df = pd.DataFrame(raw_data, columns = ['regiment', 'company', 'name', 'preTestScore', 'postTestScore'])
df

Unnamed: 0,regiment,company,name,preTestScore,postTestScore
0,Nighthawks,1st,Miller,4,25
1,Nighthawks,1st,Jacobson,24,94
2,Nighthawks,2nd,Ali,31,57
3,Nighthawks,2nd,Milner,2,62
4,Dragoons,1st,Cooze,3,70
5,Dragoons,1st,Jacon,4,25
6,Dragoons,2nd,Ryaner,24,94
7,Dragoons,2nd,Sone,31,57
8,Scouts,1st,Sloan,2,62
9,Scouts,1st,Piger,3,70


In [3]:
df_1_ind = df.set_index('regiment')
df_1_ind

Unnamed: 0_level_0,company,name,preTestScore,postTestScore
regiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Nighthawks,1st,Miller,4,25
Nighthawks,1st,Jacobson,24,94
Nighthawks,2nd,Ali,31,57
Nighthawks,2nd,Milner,2,62
Dragoons,1st,Cooze,3,70
Dragoons,1st,Jacon,4,25
Dragoons,2nd,Ryaner,24,94
Dragoons,2nd,Sone,31,57
Scouts,1st,Sloan,2,62
Scouts,1st,Piger,3,70


In [4]:
# How do we get the average scores, based on the regiment? 
df_1_ind.mean(level = 'regiment')

Unnamed: 0_level_0,preTestScore,postTestScore
regiment,Unnamed: 1_level_1,Unnamed: 2_level_1
Nighthawks,15.25,59.5
Dragoons,15.5,61.5
Scouts,2.5,66.0


In [5]:
# How about you want to get the mean scores, based on the company but not the regiment? 

# Set the hierarchical index to be by regiment, and then by company
df_2_ind = df.set_index(['regiment', 'company'])
df_2_ind

Unnamed: 0_level_0,Unnamed: 1_level_0,name,preTestScore,postTestScore
regiment,company,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Nighthawks,1st,Miller,4,25
Nighthawks,1st,Jacobson,24,94
Nighthawks,2nd,Ali,31,57
Nighthawks,2nd,Milner,2,62
Dragoons,1st,Cooze,3,70
Dragoons,1st,Jacon,4,25
Dragoons,2nd,Ryaner,24,94
Dragoons,2nd,Sone,31,57
Scouts,1st,Sloan,2,62
Scouts,1st,Piger,3,70


<div class="alert alert-block alert-info">
<p>
Having multiple indexes will give you an easy way to model more than two dimensional data with DataFrames, which are by default a two dimensional data structures. 
</p>
<p>
For the above example, you can imagine each regiment is a two-dimensional array giving details about the company, names and the scores, and they are stacked one below the other. 
</p>
</div>

In [8]:
df_2_ind.mean(level='company')
df_2_ind.mean(level='regiment')
df_2_ind.mean(level=['regiment','company'])

Unnamed: 0_level_0,Unnamed: 1_level_0,preTestScore,postTestScore
regiment,company,Unnamed: 2_level_1,Unnamed: 3_level_1
Nighthawks,1st,14.0,59.5
Nighthawks,2nd,16.5,59.5
Dragoons,1st,3.5,47.5
Dragoons,2nd,27.5,75.5
Scouts,1st,2.5,66.0
Scouts,2nd,2.5,66.0


# Pandas Aggregation


In [9]:
# We'll be using our college scorecard dataset in this tutorial.
college_scorecard = pd.read_csv(
    './data/college-scorecard-data-scrubbed.csv', 
    encoding='latin-1')

### The `describe()` method
The `describe()` method is available on both **`Series`** and **`DataFrame`** objects and outputs a variety of aggregations that are very useful in getting the general "sense" of a dataset.


In [11]:
# You can specify **`include='all'`** to force Pandas
# to evaluate all columns.  It will inject NaN where
# a calculation cannot be done.
college_scorecard.describe()
college_scorecard.describe(include='all')

Unnamed: 0,UNITID,OPEID,OPEID6,institution_name,city,state,url,predominant_degree_code,predominant_degree_desc,institutional_owner_code,...,pell_grant_receipents,full_time_retention_rate_4_year,full_time_retention_rate_less_than_4_year,part_time_rentention_rate_4_year,part_time_rentention_rate_less_than_4_year,students_with_federal_loans,median_student_earnings,median_student_debt,less_than_4_year_school_completion_rate,4_year_school_completion_rate
count,7282.0,7282.0,7282.0,7282,7282,7282,7225,7282.0,7282,7282.0,...,6966.0,2293.0,3843.0,1412.0,2208.0,6966.0,6201,7251,3972,2497
unique,,,,7164,2493,59,5992,,5,,...,,,,,,,598,2059,3742,2377
top,,,,Stevens-Henager College,New York,CA,www.itt-tech.edu,,Certificate,,...,,,,,,,PrivacySuppressed,PrivacySuppressed,PrivacySuppressed,PrivacySuppressed
freq,,,,7,87,708,143,,3343,,...,,,,,,,816,1519,166,116
mean,283704.0883,1911246.0,16393.400439,,,,,1.903735,,2.196924,...,0.532093,0.707081,0.686155,0.455639,0.564679,0.523092,,,,
std,133558.728309,3459461.0,13945.231754,,,,,0.954501,,0.838866,...,0.225941,0.195645,0.180121,0.293325,0.26354,0.284088,,,,
min,100654.0,100200.0,1002.0,,,,,0.0,,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,
25%,170749.5,345950.0,3459.5,,,,,1.0,,1.0,...,0.35885,0.6182,0.5679,0.25,0.382925,0.3333,,,,
50%,222372.5,1063250.0,10490.0,,,,,2.0,,2.0,...,0.5233,0.7414,0.6906,0.45,0.50325,0.5849,,,,
75%,442070.75,3010606.0,26089.75,,,,,3.0,,3.0,...,0.7143,0.8333,0.81575,0.6364,0.7895,0.747325,,,,


## Airline Data

We will using a sample dataset of the flight schedules data that is available on Kaggle [here](https://www.kaggle.com/usdot/flight-delays)

This is only a sample of the original data. You will use the original data in your Group Project!

In [13]:
flights = pd.read_csv('./data/flight_sample.csv')
flights.sample(10)
flights.describe(include="all")

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAXI_IN,TAXI_OUT,DISTANCE
count,10000.0,10000.0,10000.0,10000.0,10000,10000.0,9829.0,9834.0,10000.0
unique,,,,,14,,,,
top,,,,,WN,,,,
freq,,,,,2188,,,,
mean,2015.0,6.4977,15.7309,3.9349,,2196.9175,7.450911,15.923429,824.9117
std,0.0,3.42207,8.743668,1.980468,,1778.808251,5.39903,8.640529,604.715934
min,2015.0,1.0,1.0,1.0,,1.0,1.0,1.0,31.0
25%,2015.0,4.0,8.0,2.0,,718.75,4.0,11.0,375.75
50%,2015.0,6.0,16.0,4.0,,1696.5,6.0,14.0,651.0
75%,2015.0,9.0,23.0,6.0,,3273.0,9.0,18.0,1065.0


## Activity


### Selection Without using GroupBy

**NOTE**: The following three questions does not involve any `groupby`

1. Returning to the `flights` dataframe, extract only the flight details of the American Airlines (AA) using a mask. 
2. What is the median DISTANCE, TAXI_IN times and TAXI_OUT times? 
3. How about median DISTANCE, TAXI_IN and TAXI_OUT times for United Airlines (UA)? 


In [19]:
# Question 1
aa_flights = flights[ flights['AIRLINE'] == 'AA' ]
aa_flights[['DISTANCE', 'TAXI_IN', 'TAXI_OUT']][:5]


Unnamed: 0,DISTANCE,TAXI_IN,TAXI_OUT
2,761,13.0,25.0
19,432,9.0,17.0
43,2585,8.0,20.0
55,507,14.0,10.0
59,1192,4.0,12.0


In [21]:
# Question 2
aa_flights[['DISTANCE', 'TAXI_IN', 'TAXI_OUT']].median()

DISTANCE    985.0
TAXI_IN       7.0
TAXI_OUT     15.0
dtype: float64

In [23]:
# Question 3
ua_flights = flights[ flights['AIRLINE'] == 'UA' ]
ua_flights[['DISTANCE', 'TAXI_IN', 'TAXI_OUT']][:5]
ua_flights[['DISTANCE', 'TAXI_IN', 'TAXI_OUT']].median()

DISTANCE    1023.0
TAXI_IN        7.0
TAXI_OUT      15.0
dtype: float64

# Pandas Grouping

## The `groupby()` Method

So far, all the calculations that we've done on **`DataFrame`** objects have looked at the values of columns as a whole.

The `groupby()` method allows you to move into deeper forms analysis by splitting up the rows of a dataset into groups by the values in specified row(s). You can think of this in some ways as putting rows into buckets for evaluation.

### Specifying how to Split your Dataset into Groups
Of course, before we can perform evaluations on groups, we have to create them from an existing dataframe. 

Let's explore how **`groupby()`** provides a variety of ways to split up your datasets. We'll explore some of these here, starting with the most simple.

#### Single Column Grouping

In [29]:
flights_by_airline = flights.groupby(['AIRLINE'])
# print(flights_by_airline.groups)
flights_by_airline.head( )

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAXI_IN,TAXI_OUT,DISTANCE
0,2015,8,19,3,EV,3260,7.0,20.0,1091
1,2015,9,23,3,WN,3050,4.0,9.0,837
2,2015,10,16,5,AA,1382,13.0,25.0,761
3,2015,1,19,1,WN,4274,5.0,23.0,1547
4,2015,4,22,3,WN,2237,5.0,18.0,872
...,...,...,...,...,...,...,...,...,...
241,2015,10,23,5,NK,305,6.0,14.0,2173
270,2015,11,23,1,VX,108,7.0,8.0,2288
299,2015,3,25,3,NK,501,6.0,10.0,1139
329,2015,6,18,4,NK,180,8.0,12.0,1076


The **`groupby()`** method returns an type called **`DataFrameGroupBy`**. We will explore it in more depth shortly, but for now just know that it has an attribute called **`groups`** which provides a *`dict`* object with the **labels** of each group and the **corresponding index values** in the original dataframe that belong to that group.

If you look above, you can see there is a group labelled 'AA' will index values [2,   19,   43,   55,   59,   64,   71,   74,   82,   92, ...].

You can think of this as a record of all the groups that we will perform calculations on later.

#### Multi Column Grouping

You can specify multiple columns if you wish to split your data up in multiple levels:

In [30]:
flights_by_airline_month = flights.groupby(['AIRLINE', 'MONTH'])
flights_by_airline_month.groups

{('AA',
  1): Int64Index([ 182,  476,  573,  641,  655,  722,  848,  914,  971, 1027, 1266,
             1836, 1889, 1892, 2024, 2060, 2062, 2188, 2207, 2240, 2409, 2454,
             2512, 2652, 2737, 2895, 2933, 2958, 2978, 3039, 3542, 3562, 3635,
             3808, 4031, 4130, 4193, 4245, 4318, 4435, 4540, 4623, 4631, 4800,
             4914, 4955, 5199, 5239, 5402, 5417, 5453, 5773, 5853, 5870, 5893,
             5963, 6028, 6149, 6345, 6395, 6736, 6800, 6997, 7051, 7229, 7239,
             7380, 7434, 7717, 7791, 7862, 7875, 7879, 8015, 8205, 8217, 8233,
             8243, 8280, 8329, 8497, 8742, 8779, 8872, 9152, 9236, 9294, 9571],
            dtype='int64'),
 ('AA',
  2): Int64Index([ 512,  571,  616,  727,  860,  929,  953,  956, 1086, 1118, 1159,
             1231, 1291, 1456, 1512, 1734, 1796, 1910, 1940, 1941, 1959, 2090,
             2430, 2589, 2962, 3257, 3358, 3369, 3629, 3760, 3851, 4019, 4060,
             4078, 4155, 4319, 4508, 4520, 4575, 4829, 5759, 5924, 6170, 621

### Aggregations after GroupBy

For example, let us say you want to find out the average distance traveled by each airline, you can do that using the following aggregeate function

In [31]:
flights.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAXI_IN,TAXI_OUT,DISTANCE
0,2015,8,19,3,EV,3260,7.0,20.0,1091
1,2015,9,23,3,WN,3050,4.0,9.0,837
2,2015,10,16,5,AA,1382,13.0,25.0,761
3,2015,1,19,1,WN,4274,5.0,23.0,1547
4,2015,4,22,3,WN,2237,5.0,18.0,872


In [32]:
flights_by_airline = flights.groupby(['AIRLINE'])

In [33]:
flights_by_airline.mean()

Unnamed: 0_level_0,YEAR,MONTH,DAY,DAY_OF_WEEK,FLIGHT_NUMBER,TAXI_IN,TAXI_OUT,DISTANCE
AIRLINE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AA,2015.0,7.097862,16.024671,4.055921,1266.090461,9.092593,17.785354,1053.736842
AS,2015.0,6.565916,15.356913,3.996785,386.385852,6.44373,15.073955,1202.405145
B6,2015.0,6.653333,15.771111,3.893333,905.42,5.966216,18.231982,1064.124444
DL,2015.0,6.551383,15.486825,3.887352,1631.198287,7.279392,17.189564,862.416996
EV,2015.0,6.408591,15.355644,3.851149,4742.821179,7.700409,16.813456,466.038961
F9,2015.0,7.181818,15.762238,3.79021,775.594406,10.188811,16.335664,1034.223776
HA,2015.0,6.239669,16.140496,3.809917,209.578512,7.214876,11.115702,789.768595
MQ,2015.0,6.247059,14.858824,3.917647,3304.494118,8.512397,16.628099,433.701961
NK,2015.0,6.706161,15.938389,3.976303,534.971564,8.908213,14.318841,993.298578
OO,2015.0,6.347614,16.03408,3.938656,5193.527751,6.725919,17.864945,516.424537


In [34]:
avg_by_airline = flights_by_airline[['DISTANCE', 'TAXI_OUT', 'TAXI_IN']].mean()

**NOTE**: The double [[ ]] for computing the summary stististics. The first pair [] is used to look into the `DataFrameGroupyBy` object the second pair [] is used to list all the columns you want to produce the summary statistics. 

In [35]:
avg_by_airline

Unnamed: 0_level_0,DISTANCE,TAXI_OUT,TAXI_IN
AIRLINE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AA,1053.736842,17.785354,9.092593
AS,1202.405145,15.073955,6.44373
B6,1064.124444,18.231982,5.966216
DL,862.416996,17.189564,7.279392
EV,466.038961,16.813456,7.700409
F9,1034.223776,16.335664,10.188811
HA,789.768595,11.115702,7.214876
MQ,433.701961,16.628099,8.512397
NK,993.298578,14.318841,8.908213
OO,516.424537,17.864945,6.725919


## Using `loc[]` with group by aggregate data

A dataframe is returned with the results of your aggregate data. Therefore, everything we've learned about Pandas still apply to accessing and working with the results.

This includes the use of `loc[]` to access rows and dictionary keys to access columns

In [45]:
print(avg_by_airline.loc['AA'])
print(avg_by_airline.loc['UA'])
print(avg_by_airline.loc['WN'])

DISTANCE    1053.736842
TAXI_OUT      17.785354
TAXI_IN        9.092593
Name: AA, dtype: float64
DISTANCE    1243.796421
TAXI_OUT      16.754545
TAXI_IN        8.638225
Name: UA, dtype: float64
DISTANCE    722.505027
TAXI_OUT     11.931787
TAXI_IN       6.247911
Name: WN, dtype: float64


In [50]:
flights_by_airline_month.mean()

flights_by_airline_month.mean().loc['AA',3]

YEAR             2015.000000
DAY                17.494382
DAY_OF_WEEK         4.067416
FLIGHT_NUMBER    1137.595506
TAXI_IN             9.800000
TAXI_OUT           17.188235
DISTANCE         1087.707865
Name: (AA, 3), dtype: float64

## Activity
### Gerneralizing using GroupBy

4. Instead of doing this for each airline separately, can you do this for all airlines at a the same time using `groupby`?
5. Extract the median DISTANCE for SouthWest airlines (WN) and assign it a variable `median_distance_WN`. 
6. What is the median DISTANCE, TAXI_IN times and TAXI_OUT times per airline per month? Hint: Notice that we want to group by two different columns.
7. Extract the median TAXI_OUT for SouthWest airlines (WN) in December (12) and assign it a variable `median_distance_WN_12`. 

In [68]:
# Question 4
# get median 'DISTANCE', 'TAXI_IN', 'TAXI_OUT' for all airlines
median_by_airline = flights[ 
    ['AIRLINE', 'DISTANCE', 'TAXI_IN', 'TAXI_OUT'] 
].groupby('AIRLINE').median()

median_by_airline[:3]

Unnamed: 0_level_0,DISTANCE,TAXI_IN,TAXI_OUT
AIRLINE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AA,985.0,7.0,15.0
AS,954.0,5.0,14.0
B6,997.0,5.0,15.0


In [72]:
# Question 5
# Extract the median DISTANCE for SouthWest airlines (WN) using loc[]
# Assign it a variable median_distance_WN
median_by_airline.loc['WN']['DISTANCE']

611.0

In [64]:
# Question 6
# What is the median DISTANCE, TAXI_IN times and TAXI_OUT times per airline per month? 
median_by_airline_and_month = flights[ ['AIRLINE', 'MONTH', 'DISTANCE', 'TAXI_IN', 'TAXI_OUT'] ].groupby(['AIRLINE', 'MONTH']).median()
median_by_airline_and_month[:3]

Unnamed: 0_level_0,Unnamed: 1_level_0,DISTANCE,TAXI_IN,TAXI_OUT
AIRLINE,MONTH,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AA,1,1061.0,7.0,15.0
AA,2,932.5,8.0,14.0
AA,3,1089.0,8.0,16.0


In [67]:
# Question 7: 
# Extract the median TAXI_OUT for SouthWest airlines (WN) in December (12) 
# Assign it a variable median_distance_WN_12
median_by_airline_and_month.loc['WN', 12]['TAXI_OUT']

10.0

### Understanding the Aggregation After GroupBy: Method Dispatching

Let us now look at how the Aggregations on the DataFrameGroupBy objects work. In the **`DataFrameGroupBy`** objects, any method not found on the object itself is forwarded ("**dispatched**") to all the groups that it contains.

That is why we were able to ask for the *`median`* of a **`flights_by_airline`** object above and get something back: it is (1) "dispatching" the *`median`* method call to each group (that is each airline), (2) collecting the results and (3) presenting them to us.

In [73]:
flights_by_airline = flights.groupby(['AIRLINE'])

In [74]:
flights_by_airline.median()

Unnamed: 0_level_0,YEAR,MONTH,DAY,DAY_OF_WEEK,FLIGHT_NUMBER,TAXI_IN,TAXI_OUT,DISTANCE
AIRLINE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AA,2015.0,8.0,16.0,4.0,1292.0,7.0,15.0,985.0
AS,2015.0,7.0,15.0,4.0,384.0,5.0,14.0,954.0
B6,2015.0,7.0,16.0,4.0,749.0,5.0,15.0,997.0
DL,2015.0,7.0,15.0,4.0,1654.5,6.0,15.0,666.0
EV,2015.0,6.0,15.0,4.0,4891.0,7.0,15.0,429.0
F9,2015.0,7.0,14.0,4.0,720.0,8.0,13.0,927.0
HA,2015.0,6.0,16.0,4.0,214.0,6.0,11.0,163.0
MQ,2015.0,6.0,14.0,4.0,3301.5,6.0,14.0,408.0
NK,2015.0,7.0,16.0,4.0,511.0,7.0,12.0,977.0
OO,2015.0,6.0,16.0,4.0,5265.0,5.0,16.0,451.0


In [76]:
# Compute the median for the entire DataFrameGroupBy object and then select 'DISTANCE' column 
flights_by_airline.median()[['DISTANCE']][:2]

Unnamed: 0_level_0,DISTANCE
AIRLINE,Unnamed: 1_level_1
AA,985.0
AS,954.0


In [77]:
# Select the 'DISTANCE' Column as a dataframe and then compute the median
flights_by_airline[ ['DISTANCE'] ].median()[:2]

Unnamed: 0_level_0,DISTANCE
AIRLINE,Unnamed: 1_level_1
AA,985.0
AS,954.0


In [78]:
# Select the 'DISTANCE' Column as a numpy array and then compute the median
flights_by_airline['DISTANCE'].median()[:2]

AIRLINE
AA    985.0
AS    954.0
Name: DISTANCE, dtype: float64

**Question**: Which of the above methods should be preferred? 

### Methods of `DataFrameGroupBy` Objects
Now we will understand the various operations built into the `DataFrameGroupBy` object type.

#### The `aggregate()` Method
At first, the `aggregate()` method appears to be quite similiar to what we just covered when we talked about method dispatching. It performs aggregations on the groups in a **`DataFrameGroupBy`** object.

In [79]:
flights_by_airline.aggregate('mean')

Unnamed: 0_level_0,YEAR,MONTH,DAY,DAY_OF_WEEK,FLIGHT_NUMBER,TAXI_IN,TAXI_OUT,DISTANCE
AIRLINE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AA,2015.0,7.097862,16.024671,4.055921,1266.090461,9.092593,17.785354,1053.736842
AS,2015.0,6.565916,15.356913,3.996785,386.385852,6.44373,15.073955,1202.405145
B6,2015.0,6.653333,15.771111,3.893333,905.42,5.966216,18.231982,1064.124444
DL,2015.0,6.551383,15.486825,3.887352,1631.198287,7.279392,17.189564,862.416996
EV,2015.0,6.408591,15.355644,3.851149,4742.821179,7.700409,16.813456,466.038961
F9,2015.0,7.181818,15.762238,3.79021,775.594406,10.188811,16.335664,1034.223776
HA,2015.0,6.239669,16.140496,3.809917,209.578512,7.214876,11.115702,789.768595
MQ,2015.0,6.247059,14.858824,3.917647,3304.494118,8.512397,16.628099,433.701961
NK,2015.0,6.706161,15.938389,3.976303,534.971564,8.908213,14.318841,993.298578
OO,2015.0,6.347614,16.03408,3.938656,5193.527751,6.725919,17.864945,516.424537


The difference is that the **`aggregate()`** method gives you some additional options that are not available if you rely on method dispatching as shown above.

In [80]:
# You can pass multiple aggregates as a list.
# Here will we get various aggregates for each
# column of our flights_by_airline object.
flights_by_airline.aggregate([np.mean, 'min', 'max'])[:5]

Unnamed: 0_level_0,YEAR,YEAR,YEAR,MONTH,MONTH,MONTH,DAY,DAY,DAY,DAY_OF_WEEK,...,FLIGHT_NUMBER,TAXI_IN,TAXI_IN,TAXI_IN,TAXI_OUT,TAXI_OUT,TAXI_OUT,DISTANCE,DISTANCE,DISTANCE
Unnamed: 0_level_1,mean,min,max,mean,min,max,mean,min,max,mean,...,max,mean,min,max,mean,min,max,mean,min,max
AIRLINE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AA,2015,2015,2015,7.097862,1,12,16.024671,1,31,4.055921,...,2580,9.092593,2.0,60.0,17.785354,7.0,110.0,1053.736842,130,3784
AS,2015,2015,2015,6.565916,1,12,15.356913,1,31,3.996785,...,895,6.44373,2.0,24.0,15.073955,3.0,88.0,1202.405145,31,2846
B6,2015,2015,2015,6.653333,1,12,15.771111,1,31,3.893333,...,2784,5.966216,2.0,38.0,18.231982,7.0,81.0,1064.124444,68,2704
DL,2015,2015,2015,6.551383,1,12,15.486825,1,31,3.887352,...,2853,7.279392,1.0,68.0,17.189564,7.0,105.0,862.416996,74,4502
EV,2015,2015,2015,6.408591,1,12,15.355644,1,31,3.851149,...,6189,7.700409,2.0,47.0,16.813456,3.0,144.0,466.038961,69,1330


<div class="alert alert-block alert-warning">
<p>
It is important to notice that you are able to pass both strings and functions to the `aggregate()` method. It is probably best to choose one approach and stick with it rather than mixing and matching like I've done here.
</p>
</div>

In [81]:
flights_by_airline.aggregate([np.mean, np.min, np.max]).head(5)

Unnamed: 0_level_0,YEAR,YEAR,YEAR,MONTH,MONTH,MONTH,DAY,DAY,DAY,DAY_OF_WEEK,...,FLIGHT_NUMBER,TAXI_IN,TAXI_IN,TAXI_IN,TAXI_OUT,TAXI_OUT,TAXI_OUT,DISTANCE,DISTANCE,DISTANCE
Unnamed: 0_level_1,mean,amin,amax,mean,amin,amax,mean,amin,amax,mean,...,amax,mean,amin,amax,mean,amin,amax,mean,amin,amax
AIRLINE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AA,2015,2015,2015,7.097862,1,12,16.024671,1,31,4.055921,...,2580,9.092593,2.0,60.0,17.785354,7.0,110.0,1053.736842,130,3784
AS,2015,2015,2015,6.565916,1,12,15.356913,1,31,3.996785,...,895,6.44373,2.0,24.0,15.073955,3.0,88.0,1202.405145,31,2846
B6,2015,2015,2015,6.653333,1,12,15.771111,1,31,3.893333,...,2784,5.966216,2.0,38.0,18.231982,7.0,81.0,1064.124444,68,2704
DL,2015,2015,2015,6.551383,1,12,15.486825,1,31,3.887352,...,2853,7.279392,1.0,68.0,17.189564,7.0,105.0,862.416996,74,4502
EV,2015,2015,2015,6.408591,1,12,15.355644,1,31,3.851149,...,6189,7.700409,2.0,47.0,16.813456,3.0,144.0,466.038961,69,1330


Your textbook also talks about using a dict to apply labels to the aggregation columns so that they can have user friendly names like 'Longest Distance' rather than just 'max'.

This sort of functionality is, however, deprecated in Pandas, which means that it will be removed in future versions.

To accomplish the same thing, we should instead append a `rename()` method after our `aggregate()` method like so:

In [82]:
# Using `rename()` to apply friendly labels to output columns
flights_by_airline[['DISTANCE','TAXI_OUT']].aggregate(
    [np.mean, np.min, np.max]).rename(
        columns={'mean': 'Avg. Distance', 
                 'amin': 'Shortest Distance', 
                 'amax': 'Longest Distance'})

Unnamed: 0_level_0,DISTANCE,DISTANCE,DISTANCE,TAXI_OUT,TAXI_OUT,TAXI_OUT
Unnamed: 0_level_1,Avg. Distance,Shortest Distance,Longest Distance,Avg. Distance,Shortest Distance,Longest Distance
AIRLINE,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
AA,1053.736842,130,3784,17.785354,7.0,110.0
AS,1202.405145,31,2846,15.073955,3.0,88.0
B6,1064.124444,68,2704,18.231982,7.0,81.0
DL,862.416996,74,4502,17.189564,7.0,105.0
EV,466.038961,69,1330,16.813456,3.0,144.0
F9,1034.223776,373,2218,16.335664,7.0,53.0
HA,789.768595,84,2917,11.115702,5.0,26.0
MQ,433.701961,89,1236,16.628099,4.0,167.0
NK,993.298578,177,2381,14.318841,7.0,63.0
OO,516.424537,67,1735,17.864945,4.0,78.0


<div class="alert alert-block alert-danger">
<p>
Note, there are three main things happening in the above statement. 
<ul>
<li> flights_by_airline['DISTANCE'] selects the distance column for analysis</li>
<li> flights_by_airline['DISTANCE'].aggregate([np.mean, np.min, np.max]) computes the average, min and max of the distance column selected</li>
<li> Finally .rename() function is appropriately renaming the columns according the dictionary we have given  </li>
</ul>
</p>
</div>

The recommended way of using a **`dict`** with the **`aggregate()`** method is actually to specify which aggregation(s) to perform on what columns. You can use it to specify different aggregation(s) on a per-column basis.

Here I'll use it to get the high/low values for DISTANCE and the mean for TAXI_IN on our *`flights_by_airline_month`* object.

In [83]:
flights_by_airline_month = flights.groupby(['AIRLINE', 'MONTH'])

# Notice how using this style automatically filters
# out all columns you don't specify.
flights_by_airline_month.aggregate(
        {'DISTANCE': [np.min, np.max], 
         'TAXI_IN': np.mean}
).tail(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,DISTANCE,DISTANCE,TAXI_IN
Unnamed: 0_level_1,Unnamed: 1_level_1,amin,amax,mean
AIRLINE,MONTH,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
VX,5,414,2704,8.0
VX,6,337,2586,8.0
VX,7,189,2454,7.285714
VX,8,337,2704,7.0
VX,9,236,2475,6.0
VX,10,236,2475,7.294118
VX,11,236,2565,7.1875
VX,12,337,2586,7.333333
WN,1,148,2447,6.519337
WN,2,148,2039,5.715278


## Activity: 

We will work again on the `college-loan-default-rates.csv` and `college-scorecard-data-scrubbed.csv` datasets. 

Use `aggregate()` method to produce

1. The average, minimum and maximum `full_time_retention_rate_4_year` per state using `college-scorecard-data-scrubbed.csv` dataset. 
    * After producing the above summary statistics, make sure you rename your columns for average, minimum and maximum as `Avg. Retention`, `Low Retention`, and `High Retention` respectively. 
2. Produce per state and city, minimum and maximum for the `sat_average` column and average for the `full_time_retention_rate_4_year` column. 

3. Which state has the highest average four year retention rate (`full_time_retention_rate_4_year`)? Which has the lowest average? 


In [2]:
import numpy as np
import pandas as pd

In [3]:
# For this tutorial, we will need both of our datasets.
college_loan_defaults = pd.read_csv(
    './data/college-loan-default-rates.csv')

college_scorecard = pd.read_csv(
    './data/college-scorecard-data-scrubbed.csv', 
    encoding='latin-1')

college_scorecard.head()

Unnamed: 0,UNITID,OPEID,OPEID6,institution_name,city,state,url,predominant_degree_code,predominant_degree_desc,institutional_owner_code,...,pell_grant_receipents,full_time_retention_rate_4_year,full_time_retention_rate_less_than_4_year,part_time_rentention_rate_4_year,part_time_rentention_rate_less_than_4_year,students_with_federal_loans,median_student_earnings,median_student_debt,less_than_4_year_school_completion_rate,4_year_school_completion_rate
0,102580,884300,8843,Alaska Bible College,Palmer,AK,www.akbible.edu/,3,Bachelors,2,...,0.3571,0.3333,,,,0.2857,,PrivacySuppressed,,
1,103501,2541000,25410,Alaska Career College,Anchorage,AK,www.alaskacareercollege.edu,1,Certificate,3,...,0.7078,,0.7941,,,0.786,28700.0,8994,0.707589494,
2,442523,4138600,41386,Alaska Christian College,Soldotna,AK,www.alaskacc.edu,1,Certificate,2,...,0.8868,,0.4737,,1.0,0.6792,,PrivacySuppressed,0.0,
3,102669,106100,1061,Alaska Pacific University,Anchorage,AK,www.alaskapacific.edu,3,Bachelors,2,...,0.3152,0.7742,,1.0,,0.5297,47000.0,23250,,0.514833663
4,102711,3160300,31603,AVTEC-Alaska's Institute of Technology,Seward,AK,www.avtec.edu/,1,Certificate,1,...,0.0737,,1.0,,1.0,0.0664,33500.0,PrivacySuppressed,0.846055789,


In [20]:
# Question 1
# The average, minimum and maximum full_time_retention_rate_4_year 
# per state using college-scorecard-data-scrubbed.csv dataset.
# After producing the above summary statistics, 
# make sure you rename your columns for 
# average, minimum and maximum as Avg. Retention, Low Retention, and High Retention respectively.

scorecard_by_state = college_scorecard.groupby(['state'])
summary_full_time = scorecard_by_state['full_time_retention_rate_4_year'].aggregate(
    [np.mean, np.min, np.max]
).rename( columns=
         {
            'mean': 'Average', 
            'amin': 'Minimum', 
            'amax': 'Maximum'
         }
        )
summary_full_time[:3]

Unnamed: 0_level_0,Average,Minimum,Maximum
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,0.66324,0.3333,0.7756
AL,0.615436,0.0,1.0
AR,0.650996,0.2564,0.8667


In [15]:
# Question 2
# Produce per state and city, minimum and maximum for the sat_average column 
# and average for the full_time_retention_rate_4_year column
scorecard_by_state_city = college_scorecard.groupby(['state', 'city'])
scorecard_by_state_city.aggregate(
    {
        'sat_average': [np.min, np.max],
        'full_time_retention_rate_4_year': [np.mean]
    }
).rename( columns=
         {
            'mean': 'Average', 
            'amin': 'Minimum', 
            'amax': 'Maximum'
         }
        )

Unnamed: 0_level_0,Unnamed: 1_level_0,sat_average,sat_average,full_time_retention_rate_4_year
Unnamed: 0_level_1,Unnamed: 1_level_1,Minimum,Maximum,Average
state,city,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
AK,Anchorage,1054.0,1054.0,0.7453
AK,Barrow,,,
AK,Fairbanks,,,0.7756
AK,Juneau,,,0.7167
AK,Palmer,,,0.3333
...,...,...,...,...
WY,Powell,,,
WY,Riverton,,,
WY,Rock Springs,,,
WY,Sheridan,,,


In [26]:
# Question 3
# Return to the summary of state scores from first step...
# Which state has the highest average four year retention rate (full_time_retention_rate_4_year)? 
# Which has the lowest average?
summary_full_time.head()
summary_full_time['Average'].idxmin()

'CO'

<div class="alert alert-block alert-warning">
<h3> Important Notes</h3>
<p> </p> 
When producing any of the summary statistics using group by, you can assign your intermediate operations to the variables. In the entire section above, I have been mostly trying to produce the results to show them to you. However, you can assign the results to a variable for using it in the future. **See the example below.** 
</div>

In [None]:
flights_by_airline_month = flights.groupby(['AIRLINE', 'MONTH'])
aggregation = {
                'DISTANCE': [np.min, np.max], 
                 'TAXI_IN': np.mean
              }
column_names = { 'amin': "Minimum", 'amax': 'Maximum', 'mean': 'Average' }

summary_distance_taxi_in = flights_by_airline_month.aggregate(
    aggregation
    ).rename( columns=column_names )

In [None]:
summary_distance_taxi_in.head()

In [None]:
# Remember from the last class that we can do aggregations at multiple levels using Hierarchical index. 
summary_distance_taxi_in.mean(level='AIRLINE')

In [None]:
summary_distance_taxi_in.mean(level='MONTH')