# Mean and Median

In [1]:
import pandas as pd

In [4]:
sales = pd.read_csv('./data/sales_subset.csv', index_col=0)

In [5]:
sales.head()

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106
1,1,A,1,2010-03-05,21827.9,False,8.055556,0.693452,8.106
2,1,A,1,2010-04-02,57258.43,False,16.816667,0.718284,7.808
3,1,A,1,2010-05-07,17413.94,False,22.527778,0.748928,7.808
4,1,A,1,2010-06-04,17558.09,False,27.05,0.714586,7.808


In [6]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10774 entries, 0 to 10773
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   store                 10774 non-null  int64  
 1   type                  10774 non-null  object 
 2   department            10774 non-null  int64  
 3   date                  10774 non-null  object 
 4   weekly_sales          10774 non-null  float64
 5   is_holiday            10774 non-null  bool   
 6   temperature_c         10774 non-null  float64
 7   fuel_price_usd_per_l  10774 non-null  float64
 8   unemployment          10774 non-null  float64
dtypes: bool(1), float64(4), int64(2), object(2)
memory usage: 768.1+ KB


In [9]:
sales['weekly_sales'].mean()

23843.95014850566

In [10]:
sales['weekly_sales'].median()

12049.064999999999

In [11]:
sales.head()

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106
1,1,A,1,2010-03-05,21827.9,False,8.055556,0.693452,8.106
2,1,A,1,2010-04-02,57258.43,False,16.816667,0.718284,7.808
3,1,A,1,2010-05-07,17413.94,False,22.527778,0.748928,7.808
4,1,A,1,2010-06-04,17558.09,False,27.05,0.714586,7.808


In [12]:
max(sales['date'])

'2012-10-26'

In [13]:
min(sales['date'])

'2010-02-05'

# Efficient summaries

In [14]:
def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)

In [15]:
sales['temperature_c'].agg(iqr)

16.583333333333336

In [16]:
sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg(iqr)

temperature_c           16.583333
fuel_price_usd_per_l     0.073176
unemployment             0.565000
dtype: float64

In [18]:
import numpy as np
def iqr(column):
    return column.quantile(0.75) - column.quantile(0.25)


sales[["temperature_c", "fuel_price_usd_per_l", "unemployment"]].agg([iqr, np.median])

Unnamed: 0,temperature_c,fuel_price_usd_per_l,unemployment
iqr,16.583333,0.073176,0.565
median,16.966667,0.743381,8.099


# Cumulative statistics

In [20]:
sales_1_1 = pd.read_csv('./data/sales_subset.csv', index_col=0)

In [21]:
sales_1_1.head()

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106
1,1,A,1,2010-03-05,21827.9,False,8.055556,0.693452,8.106
2,1,A,1,2010-04-02,57258.43,False,16.816667,0.718284,7.808
3,1,A,1,2010-05-07,17413.94,False,22.527778,0.748928,7.808
4,1,A,1,2010-06-04,17558.09,False,27.05,0.714586,7.808


In [23]:
sales_1_1 = sales_1_1.sort_values('date')

In [24]:
sales_1_1['cum_weekly_sales'] = sales['weekly_sales'].cumsum()

sales_1_1['cum_max_sales'] = sales['weekly_sales'].cummax()

In [27]:
sales_1_1.head()

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment,cum_weekly_sales,cum_max_sales
0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106,24924.5,24924.5
6437,19,A,13,2010-02-05,38597.52,False,-6.133333,0.780365,8.35,162961000.0,293966.05
1249,2,A,31,2010-02-05,3840.21,False,4.55,0.679451,8.324,26685390.0,140504.41
6449,19,A,14,2010-02-05,17590.59,False,-6.133333,0.780365,8.35,163387900.0,293966.05
6461,19,A,16,2010-02-05,4929.87,False,-6.133333,0.780365,8.35,163595400.0,293966.05


# Dropping duplicates

In [28]:
sales.head()

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106
1,1,A,1,2010-03-05,21827.9,False,8.055556,0.693452,8.106
2,1,A,1,2010-04-02,57258.43,False,16.816667,0.718284,7.808
3,1,A,1,2010-05-07,17413.94,False,22.527778,0.748928,7.808
4,1,A,1,2010-06-04,17558.09,False,27.05,0.714586,7.808


In [31]:
store_types = sales.drop_duplicates(["store", "type"])
store_types.head()

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106
901,2,A,1,2010-02-05,35034.06,False,4.55,0.679451,8.324
1798,4,A,1,2010-02-05,38724.42,False,6.533333,0.686319,8.623
2699,6,A,1,2010-02-05,25619.0,False,4.683333,0.679451,7.259
3593,10,B,1,2010-02-05,40212.84,False,12.411111,0.782478,9.765


In [35]:
store_depts = sales.drop_duplicates(["store", "department"])
store_depts.head()

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106
12,1,A,2,2010-02-05,50605.27,False,5.727778,0.679451,8.106
24,1,A,3,2010-02-05,13740.12,False,5.727778,0.679451,8.106
36,1,A,4,2010-02-05,39954.04,False,5.727778,0.679451,8.106
48,1,A,5,2010-02-05,32229.38,False,5.727778,0.679451,8.106


In [36]:
holiday_dates = sales[sales["is_holiday"]].drop_duplicates(["date"])
holiday_dates.head()

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
498,1,A,45,2010-09-10,11.47,True,25.938889,0.677602,7.787
691,1,A,77,2011-11-25,1431.0,True,15.633333,0.854861,7.866
2315,4,A,47,2010-02-12,498.0,True,-1.755556,0.679715,8.623
6735,19,A,39,2012-09-07,13.41,True,22.333333,1.076766,8.193
6810,19,A,47,2010-12-31,-449.0,True,-1.861111,0.881278,8.067


# Counting categorical variables

In [39]:
store_counts = store_types['type'].value_counts()
store_counts

A    11
B     1
Name: type, dtype: int64

In [50]:
store_props = store_types['type'].value_counts(normalize=True)
store_props

A    0.916667
B    0.083333
Name: type, dtype: float64

In [43]:
dept_counts_sorted = store_depts['department'].value_counts(sort=True)
dept_counts_sorted.head()

1     12
55    12
72    12
71    12
67    12
Name: department, dtype: int64

In [47]:
dept_props_sorted = store_depts['department'].value_counts(sort=True, normalize=False)
dept_counts_sorted.head()

1     12
55    12
72    12
71    12
67    12
Name: department, dtype: int64

# Grouped summary statistics

In [51]:
sales = pd.read_csv('./data/sales_subset.csv', index_col=0)
sales.head()

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106
1,1,A,1,2010-03-05,21827.9,False,8.055556,0.693452,8.106
2,1,A,1,2010-04-02,57258.43,False,16.816667,0.718284,7.808
3,1,A,1,2010-05-07,17413.94,False,22.527778,0.748928,7.808
4,1,A,1,2010-06-04,17558.09,False,27.05,0.714586,7.808


In [52]:
sales_all = sales["weekly_sales"].sum()

256894718.89999998

In [56]:
sales_A = sales[sales["type"] == "A"]["weekly_sales"].sum()

sales_B = sales[sales["type"] == "B"]["weekly_sales"].sum()

sales_C = sales[sales["type"] == "C"]["weekly_sales"].sum()

In [57]:
sales_propn_by_type = [sales_A, sales_B, sales_C] / sales_all

sales_propn_by_type

array([0.9097747, 0.0902253, 0.       ])

# Calculations with .groupby()

In [58]:
sales = pd.read_csv('./data/sales_subset.csv', index_col=0)
sales.head()

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106
1,1,A,1,2010-03-05,21827.9,False,8.055556,0.693452,8.106
2,1,A,1,2010-04-02,57258.43,False,16.816667,0.718284,7.808
3,1,A,1,2010-05-07,17413.94,False,22.527778,0.748928,7.808
4,1,A,1,2010-06-04,17558.09,False,27.05,0.714586,7.808


In [60]:
sales_by_type = sales.groupby("type")["weekly_sales"].sum()

sales_by_type

type
A    2.337163e+08
B    2.317840e+07
Name: weekly_sales, dtype: float64

In [62]:
sales_propn_by_type = sales_by_type / sum(sales_by_type)

print(sales_propn_by_type)

type
A    0.909775
B    0.090225
Name: weekly_sales, dtype: float64


In [63]:
sales_by_type_is_holiday = sales.groupby(["type", "is_holiday"])["weekly_sales"].sum()
sales_by_type_is_holiday

type  is_holiday
A     False         2.336927e+08
      True          2.360181e+04
B     False         2.317678e+07
      True          1.621410e+03
Name: weekly_sales, dtype: float64

# Multiple grouped summaries

In [64]:
sales = pd.read_csv('./data/sales_subset.csv', index_col=0)
sales.head()

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106
1,1,A,1,2010-03-05,21827.9,False,8.055556,0.693452,8.106
2,1,A,1,2010-04-02,57258.43,False,16.816667,0.718284,7.808
3,1,A,1,2010-05-07,17413.94,False,22.527778,0.748928,7.808
4,1,A,1,2010-06-04,17558.09,False,27.05,0.714586,7.808


In [65]:
import numpy as np 

In [69]:
sales_stats = sales.groupby("type")["weekly_sales"].agg([np.min, np.max, np.mean, np.median])
sales_stats

Unnamed: 0_level_0,amin,amax,mean,median
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,-1098.0,293966.05,23674.667242,11943.92
B,-798.0,232558.51,25696.67837,13336.08


In [70]:
unemp_fuel_stats = sales.groupby("type")[["unemployment", "fuel_price_usd_per_l"]].agg([np.min, np.max, np.mean, np.median])
unemp_fuel_stats

Unnamed: 0_level_0,unemployment,unemployment,unemployment,unemployment,fuel_price_usd_per_l,fuel_price_usd_per_l,fuel_price_usd_per_l,fuel_price_usd_per_l
Unnamed: 0_level_1,amin,amax,mean,median,amin,amax,mean,median
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
A,3.879,8.992,7.972611,8.067,0.664129,1.10741,0.744619,0.735455
B,7.17,9.765,9.279323,9.199,0.760023,1.107674,0.805858,0.803348


# Pivot Tables

In [72]:
sales = pd.read_csv('./data/sales_subset.csv', index_col=0)
sales.head()

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.727778,0.679451,8.106
1,1,A,1,2010-03-05,21827.9,False,8.055556,0.693452,8.106
2,1,A,1,2010-04-02,57258.43,False,16.816667,0.718284,7.808
3,1,A,1,2010-05-07,17413.94,False,22.527778,0.748928,7.808
4,1,A,1,2010-06-04,17558.09,False,27.05,0.714586,7.808


In [73]:
mean_sales_by_type = sales.pivot_table(index="type", values="weekly_sales")

mean_sales_by_type

Unnamed: 0_level_0,weekly_sales
type,Unnamed: 1_level_1
A,23674.667242
B,25696.67837


In [74]:
mean_med_sales_by_type = sales.pivot_table(index="type", values="weekly_sales", aggfunc=[np.mean, np.median])

mean_med_sales_by_type

Unnamed: 0_level_0,mean,median
Unnamed: 0_level_1,weekly_sales,weekly_sales
type,Unnamed: 1_level_2,Unnamed: 2_level_2
A,23674.667242,11943.92
B,25696.67837,13336.08


In [75]:
mean_sales_by_type_holiday = sales.pivot_table(index="type", columns="is_holiday", values="weekly_sales")

mean_sales_by_type_holiday

is_holiday,False,True
type,Unnamed: 1_level_1,Unnamed: 2_level_1
A,23768.583523,590.04525
B,25751.980533,810.705


In [77]:
sales.pivot_table(index="department", columns="type", values="weekly_sales", fill_value=0)

type,A,B
department,Unnamed: 1_level_1,Unnamed: 2_level_1
1,30961.725379,44050.626667
2,67600.158788,112958.526667
3,17160.002955,30580.655000
4,44285.399091,51219.654167
5,34821.011364,63236.875000
...,...,...
95,123933.787121,77082.102500
96,21367.042857,9528.538333
97,28471.266970,5828.873333
98,12875.423182,217.428333


In [78]:
sales.pivot_table(index="department", columns="type", values="weekly_sales", fill_value=0, margins=True)

type,A,B,All
department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,30961.725379,44050.626667,32052.467153
2,67600.158788,112958.526667,71380.022778
3,17160.002955,30580.655000,18278.390625
4,44285.399091,51219.654167,44863.253681
5,34821.011364,63236.875000,37189.000000
...,...,...,...
96,21367.042857,9528.538333,20337.607681
97,28471.266970,5828.873333,26584.400833
98,12875.423182,217.428333,11820.590278
99,379.123659,0.000000,379.123659
