##### **Aggregating DataFrames** <br>

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Relative path to the retail CSV used throughout the aggregation examples
path_retail = 'Pandas Course Resources/retail/retail_2016_2017.csv'
retail_df = pd.read_csv(filepath_or_buffer=path_retail)

# Preview the first five rows to confirm the load worked
retail_df.head(n=5)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,1945944,2016-01-01,1,AUTOMOTIVE,0.0,0
1,1945945,2016-01-01,1,BABY CARE,0.0,0
2,1945946,2016-01-01,1,BEAUTY,0.0,0
3,1945947,2016-01-01,1,BEVERAGES,0.0,0
4,1945948,2016-01-01,1,BOOKS,0.0,0


In [4]:
# Draw a reproducible 100-row random sample (fixed random_state) for the aggregation examples
sample_df = retail_df.sample(n=100, random_state=616)
sample_df

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
399033,2344977,2016-08-11,54,PRODUCE,487.239,1
579626,2525570,2016-11-21,22,HARDWARE,0.000,0
546385,2492329,2016-11-02,4,BOOKS,3.000,0
534555,2480499,2016-10-26,8,LINGERIE,7.000,0
96159,2042103,2016-02-23,7,PRODUCE,5212.624,0
...,...,...,...,...,...,...
402640,2348584,2016-08-13,7,CLEANING,702.000,6
424226,2370170,2016-08-26,12,FROZEN FOODS,34.000,0
650850,2596794,2017-01-01,20,MEATS,0.000,0
384193,2330137,2016-08-03,39,CLEANING,1031.000,21


##### **AGG Method**<br> Enables multiple aggregations on a `groupby` object<br> The `.agg('operation')` method is the preferred way to perform aggregation calculations on grouped data

In [5]:
# Passing a single operation name to .agg() applies it to every non-grouping column
(
    sample_df
    .groupby(['store_nbr', 'family'])
    .agg('sum')
    .round()
)
# NOTE: 'sum' is applied to id and date as well as sales and onpromotion (see the output);
# select the columns of interest before .agg() if only some of them should be aggregated

Unnamed: 0_level_0,Unnamed: 1_level_0,id,date,sales,onpromotion
store_nbr,family,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,BEAUTY,2117018,2016-04-06,7.0,0
1,LADIESWEAR,2959921,2017-07-24,13.0,0
1,PLAYERS AND ELECTRONICS,2542941,2016-12-01,9.0,0
3,PET SUPPLIES,2082128,2016-03-17,11.0,0
4,BOOKS,2492329,2016-11-02,3.0,0
...,...,...,...,...,...
53,DAIRY,2056238,2016-03-02,483.0,4
53,HOME AND KITCHEN II,2086540,2016-03-19,30.0,2
54,CLEANING,2034886,2016-02-19,1019.0,14
54,"LIQUOR,WINE,BEER",2464363,2016-10-17,23.0,0


##### **Multiple Aggregations using .agg() method**<br> Can perform `multiple` aggregations by passing a list of aggregation functions <br>`df.groupby(['list_of_groupby_columns']).agg(['list_of_agg_functions'])` <br> Can perform `specific` aggregations by column by passing a dictionary with column names as keys and aggregation functions as values<br>`df.groupby(['list_of_groupby_columns']).agg({'column_name': 'aggregation_function'})`

In [6]:
# Passing a list of operations to .agg() applies each operation to every non-grouping column
# sample_df.groupby(['store_nbr', 'family']).agg(['mean']).round()
# Left commented out on purpose: 'mean' raises an error because it cannot be computed
# for some of the columns (presumably the non-numeric ones such as date -- TODO confirm)

In [7]:
# Column-specific aggregations via a dict:
# 'sales' gets 'sum' and 'mean', 'onpromotion' gets 'min' and 'max'
(
    sample_df
    .groupby(['family', 'store_nbr'])
    .agg({
        'sales': ['sum', 'mean'],
        'onpromotion': ['min', 'max'],
    })
    .round()
)


Unnamed: 0_level_0,Unnamed: 1_level_0,sales,sales,onpromotion,onpromotion
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,min,max
family,store_nbr,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AUTOMOTIVE,6,4.0,4.0,0,0
AUTOMOTIVE,13,3.0,3.0,0,0
BABY CARE,10,0.0,0.0,0,0
BABY CARE,30,0.0,0.0,0,0
BABY CARE,35,0.0,0.0,0,0
...,...,...,...,...,...
PRODUCE,54,487.0,487.0,1,1
SCHOOL AND OFFICE SUPPLIES,19,0.0,0.0,0,0
SEAFOOD,4,35.0,35.0,0,0
SEAFOOD,40,5.0,5.0,1,1


##### **Named Aggregations using .agg() Method**<br> Can name aggregated columns on creation to avoid multi-index columns <br>`df.groupby(['list_of_groupby_columns']).agg(new_column_name=('column_to_aggregate', 'agg_function'))` <br> Multiple columns can be created by separating each `new_column_name=(...)` with a comma <br> Provides easier-to-understand column labels

In [8]:
# as_index=False keeps the grouping keys as ordinary columns (no multi-index rows);
# naming each aggregation gives flat column labels (no multi-index columns)
(
    sample_df
    .groupby(['family', 'store_nbr'], as_index=False)
    .agg(
        sales_sum=('sales', 'sum'),
        sales_avg=('sales', 'mean'),
        onpromotion_max=('onpromotion', 'max'),
    )
)

Unnamed: 0,family,store_nbr,sales_sum,sales_avg,onpromotion_max
0,AUTOMOTIVE,6,4.000,4.000,0
1,AUTOMOTIVE,13,3.000,3.000,0
2,BABY CARE,10,0.000,0.000,0
3,BABY CARE,30,0.000,0.000,0
4,BABY CARE,35,0.000,0.000,0
...,...,...,...,...,...
92,PRODUCE,54,487.239,487.239,1
93,SCHOOL AND OFFICE SUPPLIES,19,0.000,0.000,0
94,SEAFOOD,4,34.689,34.689,0
95,SEAFOOD,40,5.000,5.000,1


##### **Transform .transform() Method**<br> Can be used to perform aggregations without reshaping <br> Useful for calculating group-level statistics to perform row-level analysis <br> Typically used for operations that need to return a result of the same size as the input; it returns a Series or DataFrame with the same index as the original. <br> `df.groupby(['column(s)_to_group'])['column(s)_to_transform'].transform('function')`

In [9]:
# Attach each store's total sales to every one of its rows:
# transform('sum') returns one value per input row, so the frame is not collapsed
sample_df.assign(
    store_sales=lambda df: df.groupby('store_nbr')['sales'].transform('sum')
)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,store_sales
399033,2344977,2016-08-11,54,PRODUCE,487.239,1,1529.239
579626,2525570,2016-11-21,22,HARDWARE,0.000,0,42.000
546385,2492329,2016-11-02,4,BOOKS,3.000,0,310.689
534555,2480499,2016-10-26,8,LINGERIE,7.000,0,212.390
96159,2042103,2016-02-23,7,PRODUCE,5212.624,0,5927.624
...,...,...,...,...,...,...,...
402640,2348584,2016-08-13,7,CLEANING,702.000,6,5927.624
424226,2370170,2016-08-26,12,FROZEN FOODS,34.000,0,405.772
650850,2596794,2017-01-01,20,MEATS,0.000,0,1025.000
384193,2330137,2016-08-03,39,CLEANING,1031.000,21,1049.000


In [10]:
# Second demo dataset: soccer match results loaded from an Excel workbook
premier_league = pd.read_excel(io='Pandas Course Resources/retail/premier_league_games_full.xlsx')
premier_league.head(n=5)

Unnamed: 0,id,league_name,season,HomeTeam,AwayTeam,HomeGoals,AwayGoals
0,1729,England Premier League,2008/2009,Manchester United,Newcastle United,1,1
1,1730,England Premier League,2008/2009,Arsenal,West Bromwich Albion,1,0
2,1731,England Premier League,2008/2009,Sunderland,Liverpool,0,1
3,1732,England Premier League,2008/2009,West Ham United,Wigan Athletic,2,1
4,1733,England Premier League,2008/2009,Aston Villa,Manchester City,4,2


In [13]:
# Compare each game's HomeGoals with that HomeTeam's average, without collapsing rows
premier_league.assign(
    # transform('mean') places each HomeTeam's mean HomeGoals on every one of its rows
    avg_team_goals=lambda games: games.groupby('HomeTeam')['HomeGoals'].transform('mean'),
    # a callable is required here because avg_team_goals is created inside this same assign call
    difference=lambda games: games['HomeGoals'] - games['avg_team_goals'],
)
# the result is still a row-level DataFrame (one row per game)

Unnamed: 0,id,league_name,season,HomeTeam,AwayTeam,HomeGoals,AwayGoals,avg_team_goals,difference
0,1729,England Premier League,2008/2009,Manchester United,Newcastle United,1,1,2.223684,-1.223684
1,1730,England Premier League,2008/2009,Arsenal,West Bromwich Albion,1,0,2.013158,-1.013158
2,1731,England Premier League,2008/2009,Sunderland,Liverpool,0,1,1.210526,-1.210526
3,1732,England Premier League,2008/2009,West Ham United,Wigan Athletic,2,1,1.466165,0.533835
4,1733,England Premier League,2008/2009,Aston Villa,Manchester City,4,2,1.177632,2.822368
...,...,...,...,...,...,...,...,...,...
3035,4764,England Premier League,2015/2016,Southampton,Leicester City,2,2,1.763158,0.236842
3036,4765,England Premier League,2015/2016,Swansea City,Stoke City,0,1,1.421053,-1.421053
3037,4766,England Premier League,2015/2016,Tottenham Hotspur,Liverpool,0,0,1.677632,-1.677632
3038,4767,England Premier League,2015/2016,Watford,Arsenal,0,3,1.052632,-1.052632


In [16]:
# Same per-game calculation as the cell above, stored in `pm` so later cells can reuse it
pm = premier_league.assign(
    # mean HomeGoals per HomeTeam, repeated on each of that team's rows
    avg_team_goals=lambda games: games.groupby('HomeTeam')['HomeGoals'].transform('mean'),
    # the callable lets this column read avg_team_goals, which is created just above
    difference=lambda games: games['HomeGoals'] - games['avg_team_goals'],
)
# pm keeps the original row-level shape (no rows are collapsed)

In [22]:
# Mean of the difference column for every HomeTeam / AwayTeam pairing, sorted ascending
(
    pm
    .groupby(['HomeTeam', 'AwayTeam'])
    .agg(difference=('difference', 'mean'))
    .sort_values(by='difference')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,difference
HomeTeam,AwayTeam,Unnamed: 2_level_1
Chelsea,Bournemouth,-2.190789
Southampton,Wigan Athletic,-1.763158
Southampton,Cardiff City,-1.763158
Leicester City,Hull City,-1.657895
Leicester City,Manchester City,-1.657895
...,...,...
Wolverhampton Wanderers,Blackpool,2.912281
Fulham,Queens Park Rangers,2.982456
Everton,Blackpool,3.302632
Leicester City,Queens Park Rangers,3.342105


In [23]:
# Filter rows with .query(): column names are written bare inside the expression,
# while string values are wrapped in single quotes
pm.query(
    "HomeTeam == 'Arsenal' and AwayTeam == 'Blackpool'"
)

Unnamed: 0,id,league_name,season,HomeTeam,AwayTeam,HomeGoals,AwayGoals,avg_team_goals,difference
870,2599,England Premier League,2010/2011,Arsenal,Blackpool,6,0,2.013158,3.986842


In [25]:
# Preview the games in which Blackpool was the AwayTeam (first five rows only)
(
    pm
    .query("AwayTeam == 'Blackpool'")
    .head()
)

Unnamed: 0,id,league_name,season,HomeTeam,AwayTeam,HomeGoals,AwayGoals,avg_team_goals,difference
769,2498,England Premier League,2010/2011,Wigan Athletic,Blackpool,0,4,1.115789,-1.115789
795,2524,England Premier League,2010/2011,Aston Villa,Blackpool,3,2,1.177632,1.822368
801,2530,England Premier League,2010/2011,West Ham United,Blackpool,0,0,1.466165,-1.466165
828,2557,England Premier League,2010/2011,Bolton Wanderers,Blackpool,2,2,1.368421,0.631579
849,2578,England Premier League,2010/2011,Stoke City,Blackpool,0,1,1.342105,-1.342105
