##### **Aggregating DataFrames** </br> 

In [3]:
import pandas as pd
import numpy as np

##### **You can aggregate a DataFrames column by using aggregation methods** </br> 

In [4]:
path_retail = 'Pandas Course Resources/retail/retail_2016_2017.csv'
retail_df = pd.read_csv(path_retail)

retail_df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,1945944,2016-01-01,1,AUTOMOTIVE,0.0,0
1,1945945,2016-01-01,1,BABY CARE,0.0,0
2,1945946,2016-01-01,1,BEAUTY,0.0,0
3,1945947,2016-01-01,1,BEVERAGES,0.0,0
4,1945948,2016-01-01,1,BOOKS,0.0,0


In [5]:
# aggregate columns like Series using .sum() for 100 samples(100)
retail_df.loc[:, ['sales', 'onpromotion']].sample(100).sum().round(2)

sales          88705.3
onpromotion      901.0
dtype: float64

In [6]:
# aggregate columns like Series using .mean() for 100 samples(100)
retail_df.loc[:, ['sales', 'onpromotion']].sample(100).mean().round(2)

sales          374.80
onpromotion      6.25
dtype: float64

##### **Can call Aggregate functions on entire DataFrame </br> But this is not ideal** </br> Summary Statistics using `.describe()` method is better for DataFrame as a whole

In [7]:
# Create random sample of 100 for aggregation examples
sample_df = retail_df.sample(100, random_state=616)
sample_df

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
399033,2344977,2016-08-11,54,PRODUCE,487.239,1
579626,2525570,2016-11-21,22,HARDWARE,0.000,0
546385,2492329,2016-11-02,4,BOOKS,3.000,0
534555,2480499,2016-10-26,8,LINGERIE,7.000,0
96159,2042103,2016-02-23,7,PRODUCE,5212.624,0
...,...,...,...,...,...,...
402640,2348584,2016-08-13,7,CLEANING,702.000,6
424226,2370170,2016-08-26,12,FROZEN FOODS,34.000,0
650850,2596794,2017-01-01,20,MEATS,0.000,0
384193,2330137,2016-08-03,39,CLEANING,1031.000,21


In [8]:
# Call aggregate function .sum() on entire DataFrame
sample_df.sum()
# not ideal because of columns that are objects

id                                                     247919675
date           2016-08-112016-11-212016-11-022016-10-262016-0...
store_nbr                                                   2946
family         PRODUCEHARDWAREBOOKSLINGERIEPRODUCEBOOKSBREAD/...
sales                                                  85086.114
onpromotion                                                  905
dtype: object

In [9]:
# Call aggregate function .mean() on entire DataFrame
# sample_df.mean()
##############################
# TypeError: Could not convert
##############################

In [10]:
# call sum aggregation on specific columns of DataFrame (.sum())
sample_df.loc[:, ['sales', 'onpromotion']].sum()

sales          85086.114
onpromotion      905.000
dtype: float64

In [11]:
# call Standard Deviation aggregation on specific columns of DataFrame (.std())
sample_df.loc[:, ['sales', 'onpromotion']].std()

sales          2177.512241
onpromotion      19.364330
dtype: float64

In [12]:
# .describe() is the better method to get summary statistics for an entire DataFrame
sample_df.describe().round(2)

Unnamed: 0,id,store_nbr,sales,onpromotion
count,100.0,100.0,100.0,100.0
mean,2479196.75,29.46,850.86,9.05
std,291897.71,15.95,2177.51,19.36
min,1949080.0,1.0,0.0,0.0
25%,2248124.5,16.75,3.75,0.0
50%,2480757.5,31.0,47.0,0.0
75%,2759322.5,44.0,521.81,6.5
max,2991624.0,54.0,11596.0,95.0


##### **Grouping DataFrames**</br> Grouping a Dataframe allows aggregation of data at a different level </br> - Can transform daily data into monthly </br> - Can transform transaction level data by store </br> `.groupby(column_to_groupby)[[list_of_columns_to_aggregate]].aggregation_method()` </br> This Method must specify a column to group by and then column(s) list in nested [[]] to return DataFrame and that will be aggregated. The `column_to_groupby` column becomes the index by default

In [13]:
# call groupby method on just 'family' column
retail_df.groupby('family')
# this returns a groupby object
# <pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000022F9A2DAE90>

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000230264AC090>

In [14]:
# aggregate 'sales' groupby by 'family
retail_df.groupby('family')[['sales']].sum().round().sort_values('sales', ascending=False)
# returns the sum of 'sales' grouped by the 'family' values in a DataFrame

Unnamed: 0_level_0,sales
family,Unnamed: 1_level_1
GROCERY I,143227476.0
BEVERAGES,105700279.0
PRODUCE,73523507.0
CLEANING,38127743.0
DAIRY,28422893.0
BREAD/BAKERY,17092978.0
POULTRY,12375952.0
MEATS,11551426.0
PERSONAL CARE,10193693.0
DELI,9617777.0


##### **Grouping Multiple Columns in DataFrames**</br> Grouping multiple columns allows aggregation of data at a different level using a multi-index </br> - Can transform daily data into monthly </br> - Can transform transaction level data by store </br> `.groupby([columns_to_groupby])[[list_of_columns_to_aggregate]].aggregation_method()` </br> This Method must specify columns to group by and then column(s) list in nested [[]] to return multi-indexed DataFrame that was aggregated. </br> Can also be be displayed with integer index using argument `as_index=False` in groupby method</br> If sorting results aggregated DataFrame: use this syntax `.sort_values(by=[list_of_columns_to_Sort], ascending=['list_of_True_False_for_sort_order])` </br> can also chain .query() method

In [15]:
# Create aggregate 'sales' groupby by 'family' and 'store_nbr' which creates multi-index DataFrame
retail_df.groupby(['family', 'store_nbr'])[['sales']].sum().round()
# displayed as multi-index DataFrame

Unnamed: 0_level_0,Unnamed: 1_level_0,sales
family,store_nbr,Unnamed: 2_level_1
AUTOMOTIVE,1,2524.0
AUTOMOTIVE,2,3918.0
AUTOMOTIVE,3,6790.0
AUTOMOTIVE,4,2565.0
AUTOMOTIVE,5,3667.0
...,...,...
SEAFOOD,50,12774.0
SEAFOOD,51,34251.0
SEAFOOD,52,1219.0
SEAFOOD,53,3745.0


In [16]:
# Create aggregate 'sales' groupby by 'family' and 'store_nbr' which creates multi-index DataFrame
# to display as a groupby DataFrame with integer index use as_index=False instead of multi-index DataFrame
retail_df.groupby(['family', 'store_nbr'], as_index=False)[['sales']].sum().round()
# displayed DataFrame with integer index and Aggregations

Unnamed: 0,family,store_nbr,sales
0,AUTOMOTIVE,1,2524.0
1,AUTOMOTIVE,2,3918.0
2,AUTOMOTIVE,3,6790.0
3,AUTOMOTIVE,4,2565.0
4,AUTOMOTIVE,5,3667.0
...,...,...,...
1777,SEAFOOD,50,12774.0
1778,SEAFOOD,51,34251.0
1779,SEAFOOD,52,1219.0
1780,SEAFOOD,53,3745.0


##### **Multi-Index DataFrames**</br> Created through aggregation operations </br> They are stored as a list of tuples with an item for each layer of the index

In [17]:
# Create DataFrame for aggregate 'sales' groupby by 'family' and 'store_nbr' which creates multi-index DataFrame
sales_sums = retail_df.groupby(['family', 'store_nbr'])[['sales']].sum().round()
sales_sums

Unnamed: 0_level_0,Unnamed: 1_level_0,sales
family,store_nbr,Unnamed: 2_level_1
AUTOMOTIVE,1,2524.0
AUTOMOTIVE,2,3918.0
AUTOMOTIVE,3,6790.0
AUTOMOTIVE,4,2565.0
AUTOMOTIVE,5,3667.0
...,...,...
SEAFOOD,50,12774.0
SEAFOOD,51,34251.0
SEAFOOD,52,1219.0
SEAFOOD,53,3745.0


In [18]:
# display indices fro sales_sums
sales_sums.index
# This is the list of tuples

MultiIndex([('AUTOMOTIVE',  1),
            ('AUTOMOTIVE',  2),
            ('AUTOMOTIVE',  3),
            ('AUTOMOTIVE',  4),
            ('AUTOMOTIVE',  5),
            ('AUTOMOTIVE',  6),
            ('AUTOMOTIVE',  7),
            ('AUTOMOTIVE',  8),
            ('AUTOMOTIVE',  9),
            ('AUTOMOTIVE', 10),
            ...
            (   'SEAFOOD', 45),
            (   'SEAFOOD', 46),
            (   'SEAFOOD', 47),
            (   'SEAFOOD', 48),
            (   'SEAFOOD', 49),
            (   'SEAFOOD', 50),
            (   'SEAFOOD', 51),
            (   'SEAFOOD', 52),
            (   'SEAFOOD', 53),
            (   'SEAFOOD', 54)],
           names=['family', 'store_nbr'], length=1782)

##### **Accessing Multi-Index DataFrames**</br> use .loc[]  accessor in different ways
|Way|Description|
|---|-----------|
|&nbsp;&nbsp;1&nbsp;|Access rows via the outer index only|
|&nbsp;&nbsp;2&nbsp;|Access rows via the outer & inner  `as a tuple` [('outer_index', 'inner_index')]|

In [19]:
# Access rows via the outer index only
sales_sums.loc['AUTOMOTIVE'].head(15)
# this displays all rows for 'AUTOMOTIVE' and the 'family' column is dropped

Unnamed: 0_level_0,sales
store_nbr,Unnamed: 1_level_1
1,2524.0
2,3918.0
3,6790.0
4,2565.0
5,3667.0
6,3442.0
7,3031.0
8,3225.0
9,7695.0
10,1772.0


In [20]:
# can also slice the outer layer
sales_sums.loc['AUTOMOTIVE':'BEAUTY']
# this displays all rows for 'AUTOMOTIVE' to 'BEAUTY' and the 'family' column is not dropped

Unnamed: 0_level_0,Unnamed: 1_level_0,sales
family,store_nbr,Unnamed: 2_level_1
AUTOMOTIVE,1,2524.0
AUTOMOTIVE,2,3918.0
AUTOMOTIVE,3,6790.0
AUTOMOTIVE,4,2565.0
AUTOMOTIVE,5,3667.0
...,...,...
BEAUTY,50,6353.0
BEAUTY,51,3566.0
BEAUTY,52,972.0
BEAUTY,53,3812.0


In [21]:
# Access rows via the outer and inner indices
# must be done as a tuple (make sure the data types match if string have in quotes, if integer has without quotes)
sales_sums.loc[('AUTOMOTIVE', 5), :]
# this displays as series and removes 'family' and 'store_nbr' of index

sales    3667.0
Name: (AUTOMOTIVE, 5), dtype: float64

In [33]:
# Access rows via the outer and inner indices
# must be done as a tuple (make sure the data types match if string have in quotes, if integer has without quotes)
# Wrapping tuple in [] will display as DataFrame 
sales_sums.loc[[('AUTOMOTIVE', 5)], :]
# this displays as DataFrame with all columns

Unnamed: 0_level_0,Unnamed: 1_level_0,sales
family,store_nbr,Unnamed: 2_level_1
AUTOMOTIVE,5,3667.0


In [27]:
# Can Slice multi-index DataFrames but will still have multi-index, adding reset_index() will put columns labels on same level
sales_sums.loc[('AUTOMOTIVE', 5):('BEAUTY', 5),:]

Unnamed: 0_level_0,Unnamed: 1_level_0,sales
family,store_nbr,Unnamed: 2_level_1
AUTOMOTIVE,5,3667.0
AUTOMOTIVE,6,3442.0
AUTOMOTIVE,7,3031.0
AUTOMOTIVE,8,3225.0
AUTOMOTIVE,9,7695.0
...,...,...
BEAUTY,1,1776.0
BEAUTY,2,3824.0
BEAUTY,3,8150.0
BEAUTY,4,3063.0


In [None]:
# when Slicing multi-index DataFrames use .reset_index()
# to create new idex which turns result DataFrame into integer index with all columns and aggregations
sales_sums.loc[('AUTOMOTIVE', 5):('BEAUTY', 5),:].reset_index()

Unnamed: 0,family,store_nbr,sales
0,AUTOMOTIVE,5,3667.0
1,AUTOMOTIVE,6,3442.0
2,AUTOMOTIVE,7,3031.0
3,AUTOMOTIVE,8,3225.0
4,AUTOMOTIVE,9,7695.0
...,...,...,...
104,BEAUTY,1,1776.0
105,BEAUTY,2,3824.0
106,BEAUTY,3,8150.0
107,BEAUTY,4,3063.0


##### **Accessing Multi-Index DataFrames**</br> Several ways to modify multi-index DataFrames </br> ---- Best to Reset The Index ----
|Way|Description|
|---|-----------|
|`.reset_index()`|Most common operation to return DataFrame to integer based index with aggregations|
|`.swaplevel()`|Changes the hierarchy of index levels|
|`.droplevel()`|Drops an index level from the DataFrame entirely - Will permanently lose data|

In [None]:
# reset_index() to keep aggregation and and revert back to integer index
sales_sums.reset_index()

Unnamed: 0,family,store_nbr,sales
0,AUTOMOTIVE,1,2524.0
1,AUTOMOTIVE,2,3918.0
2,AUTOMOTIVE,3,6790.0
3,AUTOMOTIVE,4,2565.0
4,AUTOMOTIVE,5,3667.0
...,...,...,...
1777,SEAFOOD,50,12774.0
1778,SEAFOOD,51,34251.0
1779,SEAFOOD,52,1219.0
1780,SEAFOOD,53,3745.0


In [None]:
# swap levels so 'family' is not the first part of multi-index -- good if second groupby is not multi-level also - great for individual grabbing individual index
sales_sums.swaplevel()

Unnamed: 0_level_0,Unnamed: 1_level_0,sales
store_nbr,family,Unnamed: 2_level_1
1,AUTOMOTIVE,2524.0
2,AUTOMOTIVE,3918.0
3,AUTOMOTIVE,6790.0
4,AUTOMOTIVE,2565.0
5,AUTOMOTIVE,3667.0
...,...,...
50,SEAFOOD,12774.0
51,SEAFOOD,34251.0
52,SEAFOOD,1219.0
53,SEAFOOD,3745.0


In [None]:
# drop level so 'family' is permanently removed
sales_sums.droplevel('family')

Unnamed: 0_level_0,sales
store_nbr,Unnamed: 1_level_1
1,2524.0
2,3918.0
3,6790.0
4,2565.0
5,3667.0
...,...
50,12774.0
51,34251.0
52,1219.0
53,3745.0


In [38]:
# remove 'date' and 'id' from retail_df for .agg() method functions (issue with object dtypes)
small_retail = retail_df.drop(columns=['date','id'])
small_retail

Unnamed: 0,store_nbr,family,sales,onpromotion
0,1,AUTOMOTIVE,0.000,0
1,1,BABY CARE,0.000,0
2,1,BEAUTY,0.000,0
3,1,BEVERAGES,0.000,0
4,1,BOOKS,0.000,0
...,...,...,...,...
1054939,9,POULTRY,438.133,0
1054940,9,PREPARED FOODS,154.553,1
1054941,9,PRODUCE,2419.729,148
1054942,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


##### **AGG Method**</br> Enables multiple aggregations on a `groupby` object</br> `.agg('operation')` method is better for performing aggregation calculations

In [39]:
# using .agg('operation) will perform the operation on all applicable columns
small_retail.groupby(['store_nbr', 'family']).agg('sum').round()
# 'sum' is applied to sales and onpromtion columns

Unnamed: 0_level_0,Unnamed: 1_level_0,sales,onpromotion
store_nbr,family,Unnamed: 2_level_1,Unnamed: 3_level_1
1,AUTOMOTIVE,2524.0,14
1,BABY CARE,0.0,0
1,BEAUTY,1776.0,190
1,BEVERAGES,1238601.0,13793
1,BOOKS,211.0,0
...,...,...,...
54,POULTRY,35537.0,909
54,PREPARED FOODS,42792.0,577
54,PRODUCE,378612.0,6734
54,SCHOOL AND OFFICE SUPPLIES,997.0,277


##### **Multiple Aggregations using .agg() method**</br> Can perform `multiple` aggregations by passing list of aggregation functions </br>`pd.groupby(['list_of_groupby_columns']).agg(['list_of_agg_functions'])` </br> Can perform `specfic` aggregations by column by passing a dictionary with column_names as keys and aggregation_functions as values</br>`pd.groupby(['list_of_groupby_columns']).agg({'column_name':'aggregation_function'])` 

In [40]:
# using .agg('operation) will perform the operation on all applicable columns
small_retail.groupby(['store_nbr', 'family']).agg(['sum','mean']).round()
# 'sum' and 'mean' applied to sales and onpromtion columns and creates multilevel column index

Unnamed: 0_level_0,Unnamed: 1_level_0,sales,sales,onpromotion,onpromotion
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,sum,mean
store_nbr,family,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1,AUTOMOTIVE,2524.0,4.0,14,0.0
1,BABY CARE,0.0,0.0,0,0.0
1,BEAUTY,1776.0,3.0,190,0.0
1,BEVERAGES,1238601.0,2092.0,13793,23.0
1,BOOKS,211.0,0.0,0,0.0
...,...,...,...,...,...
54,POULTRY,35537.0,60.0,909,2.0
54,PREPARED FOODS,42792.0,72.0,577,1.0
54,PRODUCE,378612.0,640.0,6734,11.0
54,SCHOOL AND OFFICE SUPPLIES,997.0,2.0,277,0.0


In [44]:
# Multiple Aggregations 'sales' to have 'sum' and 'mean', 'onpromotion' to have 'min' and 'max'
small_retail.groupby(
    ['family', 'store_nbr']).agg(
        {'sales':['sum','mean'],
        'onpromotion':['min','max']}).round()


Unnamed: 0_level_0,Unnamed: 1_level_0,sales,sales,onpromotion,onpromotion
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,min,max
family,store_nbr,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AUTOMOTIVE,1,2524.0,4.0,0,1
AUTOMOTIVE,2,3918.0,7.0,0,1
AUTOMOTIVE,3,6790.0,11.0,0,1
AUTOMOTIVE,4,2565.0,4.0,0,1
AUTOMOTIVE,5,3667.0,6.0,0,2
...,...,...,...,...,...
SEAFOOD,50,12774.0,22.0,0,7
SEAFOOD,51,34251.0,58.0,0,7
SEAFOOD,52,1219.0,2.0,0,5
SEAFOOD,53,3745.0,6.0,0,5


##### **Named Aggregations using .agg() Method**</br> Can name aggregated columns on creation to avoid multi-index columns </br>`pd.groupby(column_name=('column_to_be_agg()',"agg()_function'))` </br> Multiple columns can be created by using commas after each column_name=() </br> provides easier to understand column_labels

In [47]:
# use as_index=False to remove multi-index rows, and then create columns with .agg() function to prevent multi-index columns
(small_retail.groupby(
    ['family', 'store_nbr'],as_index=False).agg(
        sales_sum = ('sales','sum'),
        sales_avg = ('sales', 'mean'),
        onpromotion_max = ('onpromotion', 'max')
    )
)

Unnamed: 0,family,store_nbr,sales_sum,sales_avg,onpromotion_max
0,AUTOMOTIVE,1,2524.000000,4.263514,1
1,AUTOMOTIVE,2,3918.000000,6.618243,1
2,AUTOMOTIVE,3,6790.000000,11.469595,1
3,AUTOMOTIVE,4,2565.000000,4.332770,1
4,AUTOMOTIVE,5,3667.000000,6.194257,2
...,...,...,...,...,...
1777,SEAFOOD,50,12773.966999,21.577647,7
1778,SEAFOOD,51,34250.948976,57.856333,7
1779,SEAFOOD,52,1219.475999,2.059926,5
1780,SEAFOOD,53,3745.180001,6.326318,5
