# Grouping & Visualizing Data In Pandas & Matplotlib

## Part A: Aggregating and Grouping with Pandas 

In [1]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv('datasets/retail_data.csv')

df.head()

Unnamed: 0,region,product_category,sales_amount,quantity_sold,month
0,East,Clothing,105.33,1,May
1,West,Home,59.94,9,Feb
2,North,Electronics,111.41,1,Mar
3,East,Electronics,118.32,3,Mar
4,East,Electronics,116.79,8,Apr


In [3]:
# Group by region and sum sales 
total_sales_by_region = df.groupby('region')['sales_amount'].sum() 
print(total_sales_by_region) 


region
East     7827.29
North    7862.64
South    6916.63
West     8583.18
Name: sales_amount, dtype: float64


Aggregation Functions
 
Different questions require different aggregation functions: 


In [4]:
# Multiple aggregations on the same group 
region_summary = df.groupby('region')['sales_amount'].agg([ 
'sum', # Total sales 
'mean', # Average sales 
'count', # Number of transactions 
'min', # Lowest sale 
'max' # Highest sale 
]) 
print(region_summary)


            sum        mean  count    min     max
region                                           
East    7827.29  101.653117     77  34.04  175.81
North   7862.64  106.251892     74  49.94  173.66
South   6916.63  101.715147     68  43.31  168.97
West    8583.18  105.965185     81  14.54  164.90


Advanced Grouping Techniques
 Multiple Column 

In [17]:
# Group by both region and category 
detailed_summary = df.groupby(['region', 'product_category'])['sales_amount'].agg(['sum', 'mean'])
print(detailed_summary) 


                             sum        mean
region product_category                     
East   Books              963.21   87.564545
       Clothing          2040.18   92.735455
       Electronics       2617.60  109.066667
       Home              2206.30  110.315000
North  Books             1121.29  101.935455
       Clothing          2273.95   98.867391
       Electronics       1976.14  109.785556
       Home              2491.26  113.239091
South  Books             1187.80   91.369231
       Clothing          1688.35  105.521875
       Electronics       2074.23   98.772857
       Home              1966.25  109.236111
West   Books             2022.89  106.467895
       Clothing          2368.17  112.770000
       Electronics       2549.34  106.222500
       Home              1642.78   96.634118



#Custom Aggregations

In [21]:
# Create custom aggregation functions 
def sales_range(x): 
    return x.max() - x.min() 


In [24]:
custom_agg = df.groupby('region')['sales_amount'].agg({ 
'total': 'sum', 
'average': 'mean', 
'range': sales_range 
}) 
print(custom_agg)

SpecificationError: nested renamer is not supported

 Pivot Tables: Your Data's Best Friend 
Pivot tables are perfect for cross-tabular analysis:


In [26]:
# Create a pivot table: region vs category 

pivot_sales = df.pivot_table( 
values='sales_amount', 
index='region', 
columns='category', 
aggfunc='sum', 
fill_value=0 
) 
print(pivot_sales)


KeyError: 'category'