# Assignment 1: Groupby

Can you return a table containing the top 10 stores by total transactions in the data?

Make sure they’re sorted from highest to lowest.

Thanks!

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read in transactions data -- parse dates specified here for help with later problem

transactions = pd.read_csv("../retail/transactions.csv", parse_dates=["date"])

transactions.head()

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [3]:
# sum transactions for each store, display 10 highest
transactions.groupby('store_nbr')[['transactions']].sum().sort_values('transactions', ascending=False).head(10)

Unnamed: 0_level_0,transactions
store_nbr,Unnamed: 1_level_1
44,7273093
47,6535810
45,6201115
46,5990113
3,5366350
48,5107785
8,4637971
49,4574103
50,4384444
11,3972488


# Assignment 2: Groupby Multiple Columns

Can you get me the total transactions by store and month? 

Sort the table from first month to last, then by highest transactions to lowest within each month. 


In [4]:
# helper code to extract month date part from date column

transactions["month"] = transactions["date"].dt.month

transactions.head()

Unnamed: 0,date,store_nbr,transactions,month
0,2013-01-01,25,770,1
1,2013-01-02,1,2111,1
2,2013-01-02,2,2358,1
3,2013-01-02,3,3487,1
4,2013-01-02,4,1922,1


In [5]:
# sum transactions for each store, each month. display highest monthly transactions to lowest monthly transactions
transactions.groupby(['store_nbr','month'])[['transactions']].sum().sort_values(by=['month', 'transactions'], ascending=[True,False])

Unnamed: 0_level_0,Unnamed: 1_level_0,transactions
store_nbr,month,Unnamed: 2_level_1
44,1,628438
47,1,568824
45,1,538370
46,1,522763
3,1,463260
...,...,...
32,12,86167
21,12,84128
42,12,76741
29,12,76627


# Assignment 3: Multi-Index DataFrames


Can you help me access rows and columns with multiple indices? I’ve been struggling with multi-index DataFrames.

Access:
* Grab Store 3, Month 1
* Then, select the column storing the mean of transactions

Fix:
* Drop the outer layer of the column Index
* Reset the row index so it is the default integer index

In [6]:
# Ross' grouped DataFrame code, run this first

grouped = (
    transactions.groupby(["store_nbr", "month"])
    .agg({"transactions": ["sum", "mean"]})
    .sort_values(by=["month", ("transactions", "sum")], ascending=[True, False])
)

In [7]:
# df grouped by store and month with two aggregation columns
grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,transactions,transactions
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean
store_nbr,month,Unnamed: 2_level_2,Unnamed: 3_level_2
44,1,628438,4246.202703
47,1,568824,3843.405405
45,1,538370,3637.635135
46,1,522763,3532.182432
3,1,463260,3151.428571


In [8]:
# grab store 3, month 1
grouped.loc[[(3,1)]]

Unnamed: 0_level_0,Unnamed: 1_level_0,transactions,transactions
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean
store_nbr,month,Unnamed: 2_level_2,Unnamed: 3_level_2
3,1,463260,3151.428571


In [9]:
# grab store 3, month 1 and mean --> Wrap both tuples like this [[()],[()]] to have result as DataFrame
grouped.loc[[(3,1)],[('transactions','mean')]]

Unnamed: 0_level_0,Unnamed: 1_level_0,transactions
Unnamed: 0_level_1,Unnamed: 1_level_1,mean
store_nbr,month,Unnamed: 2_level_2
3,1,3151.428571


In [10]:
# or just grab the mean column for all rows
grouped.loc[:,[('transactions','mean')]]

Unnamed: 0_level_0,Unnamed: 1_level_0,transactions
Unnamed: 0_level_1,Unnamed: 1_level_1,mean
store_nbr,month,Unnamed: 2_level_2
44,1,4246.202703
47,1,3843.405405
45,1,3637.635135
46,1,3532.182432
3,1,3151.428571
...,...,...
32,12,718.058333
21,12,1402.133333
42,12,1279.016667
29,12,1277.116667


In [11]:
grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,transactions,transactions
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean
store_nbr,month,Unnamed: 2_level_2,Unnamed: 3_level_2
44,1,628438,4246.202703
47,1,568824,3843.405405
45,1,538370,3637.635135
46,1,522763,3532.182432
3,1,463260,3151.428571


In [12]:
# drop the outer layer of the column index (remove 'transaction' label)
# then reset_index() row index to be integer
grouped.droplevel(0, axis = 1).reset_index()

Unnamed: 0,store_nbr,month,sum,mean
0,44,1,628438,4246.202703
1,47,1,568824,3843.405405
2,45,1,538370,3637.635135
3,46,1,522763,3532.182432
4,3,1,463260,3151.428571
...,...,...,...,...
636,32,12,86167,718.058333
637,21,12,84128,1402.133333
638,42,12,76741,1279.016667
639,29,12,76627,1277.116667


# Assignment 4: The Agg Method

Calculate the mean of target met by store, and the sum of bonuses to be paid to each store.

Sort them by highest to lowest bonus payout.

Then, do the same for day of week and month.

In [13]:
# Recreate table from section 3

transactions = transactions.assign(
    target_pct=transactions["transactions"] / 2500,
    met_target=(transactions["transactions"] / 2500) >= 1,
    bonus_payable=((transactions["transactions"] / 2500) >= 1) * 100,
    month=transactions.date.dt.month,
    day_of_week=transactions.date.dt.dayofweek,
)

transactions.head()

Unnamed: 0,date,store_nbr,transactions,month,target_pct,met_target,bonus_payable,day_of_week
0,2013-01-01,25,770,1,0.308,False,0,1
1,2013-01-02,1,2111,1,0.8444,False,0,2
2,2013-01-02,2,2358,1,0.9432,False,0,2
3,2013-01-02,3,3487,1,1.3948,True,100,2
4,2013-01-02,4,1922,1,0.7688,False,0,2


In [23]:
# calculate mean of target met by store and sum bonuses paid to each store
transactions.groupby('store_nbr').agg(
    # average of the True values by day (True = 1)
    met_target=('met_target', 'mean'),
    # sum the bonus_payable 
    bonus_payable = ('bonus_payable', 'sum')
    # sort DataFrame descending for Bonus amount
    ).sort_values(by='bonus_payable', ascending=False).head()

Unnamed: 0_level_0,met_target,bonus_payable
store_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1
47,0.999404,167600
44,0.998807,167500
45,0.997615,167300
3,0.99821,167300
46,0.989267,165900


In [29]:
# could be done without named columns using dictionary
transactions.groupby(['store_nbr']).agg(
    {'met_target':'mean','bonus_payable':'sum'}
    ).sort_values(by='bonus_payable', ascending=False).head()

Unnamed: 0_level_0,met_target,bonus_payable
store_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1
47,0.999404,167600
44,0.998807,167500
45,0.997615,167300
3,0.99821,167300
46,0.989267,165900


In [25]:
# calculate mean of target met by month and sum bonuses paid by month
transactions.groupby('month').agg(
    # average of the True values by day (True = 1)
    met_target=('met_target', 'mean'),
    # sum the bonus_payable 
    bonus_payable = ('bonus_payable', 'sum')
    # sort DataFrame descending for Bonus amount
    ).sort_values(by='bonus_payable', ascending=False).head()

Unnamed: 0_level_0,met_target,bonus_payable
month,Unnamed: 1_level_1,Unnamed: 2_level_1
12,0.25564,154100
5,0.170792,131800
3,0.169461,130400
4,0.174469,129700
7,0.162486,126300


In [30]:
# could be done without named columns using dictionary
transactions.groupby(['month']).agg(
    {'met_target':'mean','bonus_payable':'sum'}
    ).sort_values(by='bonus_payable', ascending=False).head()

Unnamed: 0_level_0,met_target,bonus_payable
month,Unnamed: 1_level_1,Unnamed: 2_level_1
12,0.25564,154100
5,0.170792,131800
3,0.169461,130400
4,0.174469,129700
7,0.162486,126300


In [26]:
# calculate mean of target met by day and sum bonuses paid by day
transactions.groupby('day_of_week').agg(
    # average of the True values by day (True = 1)
    met_target=('met_target', 'mean'),
    # sum the bonus_payable 
    bonus_payable = ('bonus_payable', 'sum')
    # sort DataFrame descending for Bonus amount
    ).sort_values(by='bonus_payable', ascending=False).head()

Unnamed: 0_level_0,met_target,bonus_payable
day_of_week,Unnamed: 1_level_1,Unnamed: 2_level_1
5,0.222204,266400
6,0.204001,241700
4,0.179007,213000
0,0.160214,191600
2,0.160572,191000


In [31]:
# could be done without named columns using dictionary
transactions.groupby(['day_of_week']).agg(
    {'met_target':'mean','bonus_payable':'sum'}
    ).sort_values(by='bonus_payable', ascending=False).head()

Unnamed: 0_level_0,met_target,bonus_payable
day_of_week,Unnamed: 1_level_1,Unnamed: 2_level_1
5,0.222204,266400
6,0.204001,241700
4,0.179007,213000
0,0.160214,191600
2,0.160572,191000


# Assignment 4: Transform

Calculate the mean of transactions by store number and day of week while keeping row numbers. 

Then compare the performance of each row to its day of week average. (difference between transactions and daily avg)

In [53]:
# calc avg transactions by store_nbr and day_of_week using .transform() to keep row numbers with .assign() to create column
transactions.assign(
    avg_store_trans = transactions.groupby(['store_nbr','day_of_week'])['transactions'].transform('mean')
    ).head()


Unnamed: 0,date,store_nbr,transactions,month,target_pct,met_target,bonus_payable,day_of_week,avg_store_trans
0,2013-01-01,25,770,1,0.308,False,0,1,740.24569
1,2013-01-02,1,2111,1,0.8444,False,0,2,1870.782427
2,2013-01-02,2,2358,1,0.9432,False,0,2,1952.65272
3,2013-01-02,3,3487,1,1.3948,True,100,2,3142.682008
4,2013-01-02,4,1922,1,0.7688,False,0,2,1499.569038


In [52]:
#  create additional column for differnce between transactions and (calc avg transactions by store_nbr and day_of_week using .transform() to keep row numbers with .assign() to create column) at row level
transactions.assign(
    avg_store_trans = transactions.groupby(['store_nbr','day_of_week'])['transactions'].transform('mean'),
    trans_difference = lambda x: x['transactions'] - x['avg_store_trans']
    ).head()

Unnamed: 0,date,store_nbr,transactions,month,target_pct,met_target,bonus_payable,day_of_week,avg_store_trans,trans_difference
0,2013-01-01,25,770,1,0.308,False,0,1,740.24569,29.75431
1,2013-01-02,1,2111,1,0.8444,False,0,2,1870.782427,240.217573
2,2013-01-02,2,2358,1,0.9432,False,0,2,1952.65272,405.34728
3,2013-01-02,3,3487,1,1.3948,True,100,2,3142.682008,344.317992
4,2013-01-02,4,1922,1,0.7688,False,0,2,1499.569038,422.430962


# Assignment 5: Pivot

Pivot transactions with store number as index, columns day of week, with the sum of bonus payable as cells.

Filter to stores that had a non-zero bonus payable and create a heatmap.

Then unpivot (melt) the table so we have one row for each store and day of the week with the corresponding total owed. 


In [15]:
# Use transactions table (ok if includes columns from assignment 4 or not)

transactions.head()

Unnamed: 0,date,store_nbr,transactions,month,target_pct,met_target,bonus_payable,day_of_week
0,2013-01-01,25,770,1,0.308,False,0,1
1,2013-01-02,1,2111,1,0.8444,False,0,2
2,2013-01-02,2,2358,1,0.9432,False,0,2
3,2013-01-02,3,3487,1,1.3948,True,100,2
4,2013-01-02,4,1922,1,0.7688,False,0,2


In [97]:
# create pivot_table that has store_nbr as index, day_of_week as columns, and values of the bonus is summed in the columns
day_bonus = transactions.pivot_table(
    index='store_nbr',
    columns='day_of_week',
    values='bonus_payable',
    aggfunc='sum'
    ).head(11)
day_bonus

day_of_week,0,1,2,3,4,5,6
store_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,200,300,300,200,100,0,0
2,300,600,500,400,400,500,200
3,24000,23900,23900,23900,23900,24000,23700
4,200,300,300,200,100,200,0
5,200,300,300,100,100,100,0
6,400,500,500,300,200,900,300
7,200,300,300,200,100,100,0
8,22000,18800,23800,18000,22900,23400,20000
9,1200,800,800,700,400,7900,5100
10,0,0,0,0,0,0,0


In [96]:
# create heatmap for pivot_table that has store_nbr as index, day_of_week as columns, and values of the bonus is summed in the columns
heatmap = transactions.pivot_table(
    index='store_nbr',
    columns='day_of_week',
    values='bonus_payable',
    aggfunc='sum'
    ).head(11).style.background_gradient(cmap='RdYlGn', axis = 1)
heatmap

day_of_week,0,1,2,3,4,5,6
store_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,200,300,300,200,100,0,0
2,300,600,500,400,400,500,200
3,24000,23900,23900,23900,23900,24000,23700
4,200,300,300,200,100,200,0
5,200,300,300,100,100,100,0
6,400,500,500,300,200,900,300
7,200,300,300,200,100,100,0
8,22000,18800,23800,18000,22900,23400,20000
9,1200,800,800,700,400,7900,5100
10,0,0,0,0,0,0,0


In [102]:
# melt the DataFrame to have one row for each store and day of the week with total owed, column_index must be reset because of id_vars argument
day_bonus.reset_index().melt(
    id_vars='store_nbr',
    value_name='bonus_payable'
)

Unnamed: 0,store_nbr,day_of_week,bonus_payable
0,1,0,200
1,2,0,300
2,3,0,24000
3,4,0,200
4,5,0,200
...,...,...,...
72,7,6,0
73,8,6,20000
74,9,6,5100
75,10,6,0
