#### Pandas Group By

In [31]:
import pandas as pd
import numpy as np
from pathlib import Path

In [32]:

data = {
    "customer_id": [187, 127, 117, 199, 197, 187, 197, 141, 192, 131, 113],
    "state": ["New York", "New York", "Connecticut", "New Jersey", "Connecticut", 
              "New York", "New York", "New Jersey", "New York", "Connecticut", "New York"],
    "sex": ["Male", "Female", "Female", "Male", "Female", 
            "Male", "Male", "Female", "Male", "Female", "Male"],
    "month": ["August", "August", "July", "July", "August", 
              "June", "June", "June", "August", "July", "August"],
    "year": [2020, 2020, 2020, 2020, 2020, 
             2020, 2020, 2020, 2020, 2020, 2020],
    "purchase_amount": [1000, 5000, 8000, 9000, 2000, 
                        1000, 1000, 3000, 4000, 2000, 10000]
}

df = pd.DataFrame(data)

path = Path('.', 'data', 'SuperStore.xlsx')
df2 = pd.read_excel(path, sheet_name='Orders')

df

Unnamed: 0,customer_id,state,sex,month,year,purchase_amount
0,187,New York,Male,August,2020,1000
1,127,New York,Female,August,2020,5000
2,117,Connecticut,Female,July,2020,8000
3,199,New Jersey,Male,July,2020,9000
4,197,Connecticut,Female,August,2020,2000
5,187,New York,Male,June,2020,1000
6,197,New York,Male,June,2020,1000
7,141,New Jersey,Female,June,2020,3000
8,192,New York,Male,August,2020,4000
9,131,Connecticut,Female,July,2020,2000


In [33]:
#Sum of purchase amount 
df.groupby('month').sum()

Unnamed: 0_level_0,customer_id,state,sex,year,purchase_amount
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
August,816,New YorkNew YorkConnecticutNew YorkNew York,MaleFemaleFemaleMaleMale,10100,22000
July,447,ConnecticutNew JerseyConnecticut,FemaleMaleFemale,6060,19000
June,525,New YorkNew YorkNew Jersey,MaleMaleFemale,6060,5000


In [34]:
df.groupby('month').groups

{'August': [0, 1, 4, 8, 10], 'July': [2, 3, 9], 'June': [5, 6, 7]}

In [35]:
#Inspect an individual group using the "get group" method
df.groupby('month').get_group('August')

Unnamed: 0,customer_id,state,sex,month,year,purchase_amount
0,187,New York,Male,August,2020,1000
1,127,New York,Female,August,2020,5000
4,197,Connecticut,Female,August,2020,2000
8,192,New York,Male,August,2020,4000
10,113,New York,Male,August,2020,10000


In [36]:
#Show the sum of purchase amount per month as pd.DataFrame
df.groupby('month').agg("sum")[['purchase_amount']]

Unnamed: 0_level_0,purchase_amount
month,Unnamed: 1_level_1
August,22000
July,19000
June,5000


In [42]:
df2.head()

df2.dtypes


Order ID                 object
Order Date       datetime64[ns]
Customer Name            object
Segment                  object
City                     object
State                    object
Region                   object
Category                 object
Sub-Category             object
Product Name             object
Sales                   float64
Quantity                  int64
Profit                  float64
dtype: object

In [43]:
# Calculate the profit per quarters, over the years
df2.groupby([ df2['Order Date'].dt.year, df2['Order Date'].dt.quarter ])[['Profit']].agg(sum)

  df2.groupby([ df2['Order Date'].dt.year, df2['Order Date'].dt.quarter ])[['Profit']].agg(sum)


Unnamed: 0_level_0,Unnamed: 1_level_0,Profit
Order Date,Order Date,Unnamed: 2_level_1
2014,1,3811.229
2014,2,11204.0692
2014,3,12804.7218
2014,4,21723.9541
2015,1,9264.9416
2015,2,12190.9224
2015,3,16853.6194
2015,4,23309.1203
2016,1,11441.3708
2016,2,16390.3394


In [46]:
# Where does the sale of Bookcases bring more profit, in the south of the central part of USA?

central = df2[  (df2['Region']  == 'Central') & (df2['Sub-Category'] ==  'Bookcases')]['Profit'].sum()
south = df2[  (df2['Region']  == 'South') & (df2['Sub-Category'] ==  'Bookcases') ]['Profit'].sum()

central, south

(-1997.9043000000004, 1339.4917999999998)

#### Advanced Group By concepts

In [13]:
df.groupby(["month", "state"]).agg(
    Min=('purchase_amount', 'min'),
    Mean=('purchase_amount', 'mean'),
    Max=('purchase_amount', 'max'),
    Sum=('purchase_amount', 'sum')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Min,Mean,Max,Sum
month,state,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
August,Connecticut,2000,2000.0,2000,2000
August,New York,1000,5000.0,10000,20000
July,Connecticut,2000,5000.0,8000,10000
July,New Jersey,9000,9000.0,9000,9000
June,New Jersey,3000,3000.0,3000,3000
June,New York,1000,1000.0,1000,2000


In [62]:
# Profit change per Q1 and Q2 in 2017: Verify if the company had over 20 % grow in 2017 Q2 compared to 2017 Q1

profit_change =  df2.groupby([ df2['Order Date'].dt.year, df2['Order Date'].dt.quarter ])[['Profit']].sum()

profit_change

Unnamed: 0_level_0,Unnamed: 1_level_0,Profit
Order Date,Order Date,Unnamed: 2_level_1
2014,1,3811.229
2014,2,11204.0692
2014,3,12804.7218
2014,4,21723.9541
2015,1,9264.9416
2015,2,12190.9224
2015,3,16853.6194
2015,4,23309.1203
2016,1,11441.3708
2016,2,16390.3394


In [63]:
#Multi Index which needs to be breaked
profit_change.index 

profit_change.index.names = ['Year', 'Quarter']

profit_change

Unnamed: 0_level_0,Unnamed: 1_level_0,Profit
Year,Quarter,Unnamed: 2_level_1
2014,1,3811.229
2014,2,11204.0692
2014,3,12804.7218
2014,4,21723.9541
2015,1,9264.9416
2015,2,12190.9224
2015,3,16853.6194
2015,4,23309.1203
2016,1,11441.3708
2016,2,16390.3394


In [64]:
profit_change.reset_index(inplace=True)

profit_change

Unnamed: 0,Year,Quarter,Profit
0,2014,1,3811.229
1,2014,2,11204.0692
2,2014,3,12804.7218
3,2014,4,21723.9541
4,2015,1,9264.9416
5,2015,2,12190.9224
6,2015,3,16853.6194
7,2015,4,23309.1203
8,2016,1,11441.3708
9,2016,2,16390.3394


#### Pandas DataFrame Shift

In [65]:
profit_change['Prev. Q Profit'] = profit_change['Profit'].shift()

profit_change

Unnamed: 0,Year,Quarter,Profit,Prev. Q Profit
0,2014,1,3811.229,
1,2014,2,11204.0692,3811.229
2,2014,3,12804.7218,11204.0692
3,2014,4,21723.9541,12804.7218
4,2015,1,9264.9416,21723.9541
5,2015,2,12190.9224,9264.9416
6,2015,3,16853.6194,12190.9224
7,2015,4,23309.1203,16853.6194
8,2016,1,11441.3708,23309.1203
9,2016,2,16390.3394,11441.3708


In [67]:
profit_change['Change in %'] = profit_change['Profit'] / profit_change['Prev. Q Profit']

profit_change

Unnamed: 0,Year,Quarter,Profit,Prev. Q Profit,Change in %
0,2014,1,3811.229,,
1,2014,2,11204.0692,3811.229,2.939752
2,2014,3,12804.7218,11204.0692,1.142864
3,2014,4,21723.9541,12804.7218,1.696558
4,2015,1,9264.9416,21723.9541,0.426485
5,2015,2,12190.9224,9264.9416,1.315812
6,2015,3,16853.6194,12190.9224,1.382473
7,2015,4,23309.1203,16853.6194,1.383034
8,2016,1,11441.3708,23309.1203,0.490854
9,2016,2,16390.3394,11441.3708,1.43255


In [68]:
# The quarters that did have an increase od 20 % in profit, compared to the previous quarter are:
profit_change[  profit_change['Change in %'] > 1.2  ]

Unnamed: 0,Year,Quarter,Profit,Prev. Q Profit,Change in %
1,2014,2,11204.0692,3811.229,2.939752
3,2014,4,21723.9541,12804.7218,1.696558
5,2015,2,12190.9224,9264.9416,1.315812
6,2015,3,16853.6194,12190.9224,1.382473
7,2015,4,23309.1203,16853.6194,1.383034
9,2016,2,16390.3394,11441.3708,1.43255
11,2016,4,38139.8593,15823.6048,2.410314
14,2017,3,26985.1325,15499.2085,1.741065
