# AGGREGATING DATAFRAMES
## 1.0 Summary Statistics

Summary statistics condense many numbers into a single value.
Examples of summary statistics:

.mean()
.median()
.mode()
.min()
.max()
.var()
.std()
.quantile()

In [112]:
#import libraries
import numpy as np
import pandas as pd


In [113]:
# load DataFrame
sales = pd.read_csv("sales_subset.csv", index_col = 0)

In [114]:
# inspect the dataframe info
sales.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10774 entries, 0 to 10773
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   store                 10774 non-null  int64  
 1   type                  10774 non-null  object 
 2   department            10774 non-null  int64  
 3   date                  10774 non-null  object 
 4   weekly_sales          10774 non-null  float64
 5   is_holiday            10774 non-null  bool   
 6   temperature_c         10774 non-null  float64
 7   fuel_price_usd_per_l  10774 non-null  float64
 8   unemployment          10774 non-null  float64
dtypes: bool(1), float64(4), int64(2), object(2)
memory usage: 768.1+ KB


In [115]:
# inspect the dataframe's head
sales.head()

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106
1,1,A,1,2010-03-05,21827.9,False,8.055556,0.693452,8.106
2,1,A,1,2010-04-02,57258.43,False,16.816667,0.718284,7.808
3,1,A,1,2010-05-07,17413.94,False,22.527778,0.748928,7.808
4,1,A,1,2010-06-04,17558.09,False,27.05,0.714586,7.808


### 1.1 Mean and Median

In [116]:
# Mean
# Find the mean of weekly_sales
sales["weekly_sales"].mean()

23843.950148505668

In [117]:
# Median
# Find the median of weekly_sales
sales["weekly_sales"].median()

12049.064999999999

### 1.2 Summarizing Dates

In [118]:
# print the maximum of the date column (dates are stored as ISO-format strings here, so the string maximum is also the latest date)
sales["date"].max()

'2012-10-26'

In [119]:
# print the minimum of the date column (dates are stored as ISO-format strings here, so the string minimum is also the earliest date)
sales["date"].min()

'2010-02-05'

### 1.3 Efficient summaries. The .agg() method
The .agg() method allows you to apply your own custom functions to a DataFrame, 

as well as apply functions to more than one column of a DataFrame at once, making your aggregations super-efficient.

df['column'].agg(function)


In [120]:
# define a custom inter quartile range function
def iqr(column):
    """Return the interquartile range of ``column``.

    The result is the value at the 0.75 quantile minus the value at the
    0.25 quantile, each obtained from ``column.quantile``.
    """
    upper_quartile = column.quantile(0.75)
    lower_quartile = column.quantile(0.25)
    return upper_quartile - lower_quartile

# print the IQR of the temperature
sales["temperature_c"].agg(iqr)

16.583333333333336

In [121]:
# define a custom inter quartile range function
def iqr(column):
    """Compute the spread between the first and third quartiles of ``column``."""
    q1, q3 = (column.quantile(q) for q in (0.25, 0.75))
    return q3 - q1

# update the column selection to use the custom iqr function with .agg(). print the iqr of temperature_c, fuel_price_usd_per_l,
# and unemployment in that order. 
sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg(iqr)


temperature_c           16.583333
fuel_price_usd_per_l     0.073176
unemployment             0.565000
dtype: float64

In [122]:
# Update the aggregation functions called by .agg(): include iqr and the median, in that order.
# The median is requested by its string name instead of np.median: pandas 2.1+
# emits a FutureWarning when NumPy reducers are passed to .agg(), and "median"
# produces the same row label and the same values.
sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg([iqr, "median"])

Unnamed: 0,temperature_c,fuel_price_usd_per_l,unemployment
iqr,16.583333,0.073176,0.565
median,16.966667,0.743381,8.099


### 1.4 Cumulative statistics

In [123]:
# Narrow the data in two steps: keep store 1 first, then department 1 within it
is_store_1 = sales["store"] == 1
sales_1 = sales[is_store_1]
is_dept_1 = sales_1["department"] == 1
sales_1_1 = sales_1[is_dept_1]
sales_1_1

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106
1,1,A,1,2010-03-05,21827.9,False,8.055556,0.693452,8.106
2,1,A,1,2010-04-02,57258.43,False,16.816667,0.718284,7.808
3,1,A,1,2010-05-07,17413.94,False,22.527778,0.748928,7.808
4,1,A,1,2010-06-04,17558.09,False,27.05,0.714586,7.808
5,1,A,1,2010-07-02,16333.14,False,27.172222,0.705076,7.787
6,1,A,1,2010-08-06,17508.41,False,30.644444,0.69398,7.787
7,1,A,1,2010-09-03,16241.78,False,27.338889,0.680772,7.787
8,1,A,1,2010-10-01,20094.19,False,22.161111,0.68764,7.838
9,1,A,1,2010-11-05,34238.88,False,14.855556,0.710359,7.838


In [124]:
# Order the store 1 / department 1 rows chronologically
sales_1_1 = sales_1_1.sort_values(by="date")

# Running total and running maximum of weekly_sales, stored as new columns
weekly = sales_1_1["weekly_sales"]
sales_1_1["cum_weekly_sales"] = weekly.cumsum()
sales_1_1["cum_max_sales"] = weekly.cummax()

# Show the date and sales next to the two derived columns
report_cols = ["date", "weekly_sales", "cum_weekly_sales", "cum_max_sales"]
print(sales_1_1[report_cols])

          date  weekly_sales  cum_weekly_sales  cum_max_sales
0   2010-02-05      24924.50          24924.50       24924.50
1   2010-03-05      21827.90          46752.40       24924.50
2   2010-04-02      57258.43         104010.83       57258.43
3   2010-05-07      17413.94         121424.77       57258.43
4   2010-06-04      17558.09         138982.86       57258.43
5   2010-07-02      16333.14         155316.00       57258.43
6   2010-08-06      17508.41         172824.41       57258.43
7   2010-09-03      16241.78         189066.19       57258.43
8   2010-10-01      20094.19         209160.38       57258.43
9   2010-11-05      34238.88         243399.26       57258.43
10  2010-12-03      22517.56         265916.82       57258.43
11  2011-01-07      15984.24         281901.06       57258.43


## 2.0 Counting 

### 2.1 dropping duplicates
Removing duplicates is an essential skill to get accurate counts because often, you don't want to count the same thing multiple times.

df.drop_duplicates(subset = ["col1", "col2"])

In [125]:
# One row per distinct (store, type) pair
store_types = sales.drop_duplicates(subset=["store", "type"])

# One row per distinct (store, department) pair
store_depts = sales.drop_duplicates(subset=["store", "department"])

# Keep only the holiday weeks, then one row per distinct date
holiday_weeks = sales[sales["is_holiday"]]
holiday_dates = holiday_weeks.drop_duplicates(subset="date")

# Display the date column of holiday_dates
holiday_dates["date"]

498     2010-09-10
691     2011-11-25
2315    2010-02-12
6735    2012-09-07
6810    2010-12-31
6815    2012-02-10
6820    2011-09-09
Name: date, dtype: object

### 2.2 Counting categorical variables
Counting is a great way to get an overview of your data and to spot curiosities that you might not notice otherwise.

df["col_name"].value_counts()

In [126]:
# Count the number of stores of each store type in store_types.
store_types = sales.drop_duplicates(subset = ["store", "type"])
store_counts = store_types["type"].value_counts()
store_counts

A    11
B     1
Name: type, dtype: int64

In [127]:
# Count the proportion of stores of each store type in store_types.
store_props = store_types["type"].value_counts(normalize = True)
store_props

A    0.916667
B    0.083333
Name: type, dtype: float64

In [128]:
# Count the number of different departments in store_depts, sorting the counts in descending order.
store_depts = sales.drop_duplicates(subset = ["store", "department"])
dept_counts_sorted = store_depts["department"].value_counts(sort = True)
dept_counts_sorted

1     12
55    12
72    12
71    12
67    12
      ..
37    10
48     8
50     6
39     4
43     2
Name: department, Length: 80, dtype: int64

In [129]:
# Count the proportion of different departments in store_depts, sorting the proportions in descending order.
dept_prop_sorted = store_depts["department"].value_counts(sort = True, normalize = True)
dept_prop_sorted

1     0.012917
55    0.012917
72    0.012917
71    0.012917
67    0.012917
        ...   
37    0.010764
48    0.008611
50    0.006459
39    0.004306
43    0.002153
Name: department, Length: 80, dtype: float64

## 3.0 Grouped summary statistics
### 3.1 Calculation without groupby
While .groupby() is useful, you can calculate grouped summary statistics without it.

Walmart distinguishes three types of stores: 

"supercenters," 

"discount stores," and

"neighborhood markets," 

encoded in this dataset as type "A," "B," and "C," respectively.

Calculate the total sales made at each store type, without using .groupby(). You can then use these numbers to see what proportion of Walmart's total sales were made at each type.

In [130]:
# Total weekly sales across every store type
sales_all = sales["weekly_sales"].sum()

# Total weekly sales for type A stores (rows selected with a boolean mask on "type")
sales_A = sales[sales["type"] == "A"]["weekly_sales"].sum()

# Total weekly sales for type B stores
sales_B = sales[sales["type"] == "B"]["weekly_sales"].sum()

# Total weekly sales for type C stores
sales_C = sales[sales["type"] == "C"]["weekly_sales"].sum()

# Share of the overall total contributed by each type, in the order A, B, C.
# The totals are wrapped in an explicit NumPy array so the division is
# element-wise by construction; dividing a bare list only works when sales_all
# happens to be a NumPy scalar (list / float raises TypeError).
sales_propn_by_type = np.array([sales_A, sales_B, sales_C]) / sales_all
sales_propn_by_type

array([0.9097747, 0.0902253, 0.       ])

### 3.2 Calculation with .groupby()


In [131]:
# Group sales by "type", take the sum of "weekly_sales", and store as sales_by_type.
sales_by_type = sales.groupby("type")["weekly_sales"].sum()

# Divide each type's total by the grand total to get its share of sales.
# Series.sum() replaces the built-in sum(): it is the vectorised pandas
# reduction and skips NaN by default instead of propagating it.
sales_propn_by_type = sales_by_type / sales_by_type.sum()
sales_propn_by_type

type
A    0.909775
B    0.090225
Name: weekly_sales, dtype: float64

In [132]:
# Group sales by "type" and "is_holiday", take the sum of weekly_sales, and store as sales_by_type_is_holiday.
sales_by_type_is_holiday = sales.groupby(["type", "is_holiday"])["weekly_sales"].sum()
sales_by_type_is_holiday

type  is_holiday
A     False         2.336927e+08
      True          2.360181e+04
B     False         2.317678e+07
      True          1.621410e+03
Name: weekly_sales, dtype: float64

### 3.3 Multiple grouped summaries
The .agg() method is useful to compute multiple statistics on multiple variables. 

It also works with grouped data. NumPy, which is imported as np, has many different summary statistics functions, including:

 np.min, 
 
 np.max, 
 
 np.mean, and 
 
 np.median

In [133]:
# Per store type, summarise weekly_sales with its min, max, mean, and median.
# NOTE(review): pandas 2.1+ reportedly warns when NumPy reducers are passed to
# .agg(); the string names ("min", "max", "mean", "median") avoid that but would
# label the columns min/max rather than amin/amax - confirm before switching.
summary_funcs = [np.min, np.max, np.mean, np.median]
weekly_sales_by_type = sales.groupby("type")["weekly_sales"]
sales_stats = weekly_sales_by_type.agg(summary_funcs)
sales_stats

Unnamed: 0_level_0,amin,amax,mean,median
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,-1098.0,293966.05,23674.667242,11943.92
B,-798.0,232558.51,25696.67837,13336.08


In [134]:
# Per store type, summarise unemployment and fuel_price_usd_per_l with min, max, mean, and median.
# NOTE(review): pandas 2.1+ reportedly warns when NumPy reducers are passed to
# .agg(); the string names ("min", "max", "mean", "median") avoid that but would
# label the columns min/max rather than amin/amax - confirm before switching.
stat_funcs = [np.min, np.max, np.mean, np.median]
unemp_fuel_by_type = sales.groupby("type")[["unemployment", "fuel_price_usd_per_l"]]
unemp_fuel_stats = unemp_fuel_by_type.agg(stat_funcs)
unemp_fuel_stats

Unnamed: 0_level_0,unemployment,unemployment,unemployment,unemployment,fuel_price_usd_per_l,fuel_price_usd_per_l,fuel_price_usd_per_l,fuel_price_usd_per_l
Unnamed: 0_level_1,amin,amax,mean,median,amin,amax,mean,median
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
A,3.879,8.992,7.972611,8.067,0.664129,1.10741,0.744619,0.735455
B,7.17,9.765,9.279323,9.199,0.760023,1.107674,0.805858,0.803348


## 4.0 Pivot tables
Pivot tables are the standard way of aggregating data in spreadsheets. 

In pandas, pivot tables are essentially just another way of performing grouped calculations. 

That is, the .pivot_table() method is just an alternative to .groupby()

df.pivot_table(values = "col1", index = "col2")

The values argument names the column you want to summarize, while the index argument names the column you want to group by.

By default, pivot tables give the mean of each group. You can use the aggfunc argument to choose a different summary statistic.

fill_value = 0 replaces NAs with 0
margins = True gives the means of the grouped columns or rows at the margins

### 4.1 Pivoting on one variable

In [135]:
# pivoting on one variable
# Pivot for mean weekly_sales for each store type
mean_sales_by_type = sales.pivot_table(values = "weekly_sales", index = "type")
mean_sales_by_type

Unnamed: 0_level_0,weekly_sales
type,Unnamed: 1_level_1
A,23674.667242
B,25696.67837


In [136]:
# Pivot for mean and median weekly_sales for each store type.
# The statistics are named as strings: pandas 2.1+ warns when NumPy reducers
# such as np.mean are passed as aggfunc, and "mean"/"median" give the same
# column labels and values.
mean_med_sales_by_type = sales.pivot_table(values="weekly_sales", index="type", aggfunc=["mean", "median"])
mean_med_sales_by_type

Unnamed: 0_level_0,mean,median
Unnamed: 0_level_1,weekly_sales,weekly_sales
type,Unnamed: 1_level_2,Unnamed: 2_level_2
A,23674.667242,11943.92
B,25696.67837,13336.08


In [137]:
# Pivot for mean weekly_sales by store type and holiday 
mean_sales_by_type_holiday = sales.pivot_table(values = "weekly_sales", index = "type", columns = "is_holiday")
mean_sales_by_type_holiday


is_holiday,False,True
type,Unnamed: 1_level_1,Unnamed: 2_level_1
A,23768.583523,590.04525
B,25751.980533,810.705


### 4.2 Fill in missing values and sum values with pivot tables

The .pivot_table() method has several useful arguments, including fill_value and margins.

fill_value replaces missing values with a real value (known as imputation).

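margins is a shortcut for when you pivoted by two variables, but also wanted to pivot by each of those variables separately: it adds an "All" row and an "All" column holding the summary statistic (the mean by default, or whatever aggfunc specifies) computed across each row and column of the pivot table, rather than their totals.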

In [138]:
# Print mean weekly_sales by department and type; fill missing values with 0
print(sales.pivot_table(values = "weekly_sales", index = "department", columns = "type", fill_value = 0))


type                    A              B
department                              
1            30961.725379   44050.626667
2            67600.158788  112958.526667
3            17160.002955   30580.655000
4            44285.399091   51219.654167
5            34821.011364   63236.875000
...                   ...            ...
95          123933.787121   77082.102500
96           21367.042857    9528.538333
97           28471.266970    5828.873333
98           12875.423182     217.428333
99             379.123659       0.000000

[80 rows x 2 columns]


In [139]:
# Print the mean weekly_sales by department and type; fill missing values with 0s; add an "All" row and column holding the overall means
print(sales.pivot_table(values="weekly_sales", index="department", columns="type", fill_value = 0, margins = True))

type                   A              B           All
department                                           
1           30961.725379   44050.626667  32052.467153
2           67600.158788  112958.526667  71380.022778
3           17160.002955   30580.655000  18278.390625
4           44285.399091   51219.654167  44863.253681
5           34821.011364   63236.875000  37189.000000
...                  ...            ...           ...
96          21367.042857    9528.538333  20337.607681
97          28471.266970    5828.873333  26584.400833
98          12875.423182     217.428333  11820.590278
99            379.123659       0.000000    379.123659
All         23674.667242   25696.678370  23843.950149

[81 rows x 3 columns]
