## Setup

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

import utils_09 as utils

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 01 Loading Data

In [5]:
f = utils.Fortune1000()
f.fortune.head()

Unnamed: 0_level_0,Company,Sector,Industry,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Walmart,Retailing,General Merchandisers,482130,14694,2300000
2,Exxon Mobil,Energy,Petroleum Refining,246204,16150,75600
3,Apple,Technology,"Computers, Office Equipment",233715,53394,110000
4,Berkshire Hathaway,Financials,Insurance: Property and Casualty (Stock),210821,24083,331000
5,McKesson,Health Care,Wholesalers: Health Care,181241,1476,70400


## 02 The groupby Method

In [6]:
# Grouping by a sector
sectors = f.fortune.groupby('Sector')

In [9]:
# Size of each group
sectors.size().head()

Sector
Aerospace & Defense     20
Apparel                 15
Business Services       51
Chemicals               30
Energy                 122
dtype: int64

In [15]:
# Count the number of sectors
f.fortune['Sector'].nunique(), len(list(sectors))

(21, 21)

In [16]:
# Explore the first group
sector_0, df_0 = list(sectors)[0]
sector_0, df_0.shape

('Aerospace & Defense', (20, 6))

In [17]:
# Filter 'Aerospace & Defense' sector manually  
f.fortune[f.fortune['Sector'] == 'Aerospace & Defense'].shape

(20, 6)

In [18]:
f.fortune[f.fortune['Sector'] == 'Aerospace & Defense'].head()

Unnamed: 0_level_0,Company,Sector,Industry,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
24,Boeing,Aerospace & Defense,Aerospace and Defense,96114,5176,161400
45,United Technologies,Aerospace & Defense,Aerospace and Defense,61047,7608,197200
60,Lockheed Martin,Aerospace & Defense,Aerospace and Defense,46132,3605,126000
88,General Dynamics,Aerospace & Defense,Aerospace and Defense,31469,2965,99900
118,Northrop Grumman,Aerospace & Defense,Aerospace and Defense,23526,1990,65000


In [19]:
sectors.get_group('Aerospace & Defense').head()

Unnamed: 0_level_0,Company,Sector,Industry,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
24,Boeing,Aerospace & Defense,Aerospace and Defense,96114,5176,161400
45,United Technologies,Aerospace & Defense,Aerospace and Defense,61047,7608,197200
60,Lockheed Martin,Aerospace & Defense,Aerospace and Defense,46132,3605,126000
88,General Dynamics,Aerospace & Defense,Aerospace and Defense,31469,2965,99900
118,Northrop Grumman,Aerospace & Defense,Aerospace and Defense,23526,1990,65000


## 03 Methods on the GroupBy Object

In [20]:
# Compute total revenue by sector
sectors[f.REVENUE].sum().head()

Sector
Aerospace & Defense     357940
Apparel                  95968
Business Services       272195
Chemicals               243897
Energy                 1517809
Name: Revenue, dtype: int64

In [21]:
# Compute revenue for Aerospace & Defense sector manually
mask_aero = f.fortune[f.SECTOR] == 'Aerospace & Defense'
f.fortune.loc[mask_aero, f.REVENUE].sum()

np.int64(357940)

## 04 The agg Method

In [22]:
# Get the total revenue by sector using `agg` method
sectors.agg({f.REVENUE: 'sum'}).head()

Unnamed: 0_level_0,Revenue
Sector,Unnamed: 1_level_1
Aerospace & Defense,357940
Apparel,95968
Business Services,272195
Chemicals,243897
Energy,1517809


## 05 Coding challenge

### 01 Load the data

In [23]:
c = utils.Cereals()
c.cereals.head()

Unnamed: 0,Name,Manufacturer,Type,Calories,Fiber,Sugars
0,100% Bran,Nabisco,Cold,70,10.0,6
1,100% Natural Bran,Quaker Oats,Cold,120,2.0,8
2,All-Bran,Kellogg's,Cold,70,9.0,5
3,All-Bran with Extra Fiber,Kellogg's,Cold,50,14.0,0
4,Almond Delight,Ralston Purina,Cold,110,1.0,8


In [27]:
c.cereals.shape

(77, 6)

In [33]:
c.cereals[c.MANUFACTURER].nunique()

7

### 02 Challenges

In [34]:
# 1 Group the cereals, using the Manufacturer column’s values.
manufacturers = c.cereals.groupby(c.MANUFACTURER)

In [32]:
# 2 Determine the total number of groups, and the number of cereals per group.
manufacturers.ngroups

7

In [56]:
manufacturers.groups

{'American Home Food Products': [43], 'General Mills': [5, 7, 11, 12, 13, 14, 18, 22, 31, 36, 40, 42, 47, 51, 59, 69, 70, 71, 72, 73, 75, 76], 'Kellogg's': [2, 3, 6, 16, 17, 19, 21, 24, 25, 26, 28, 38, 39, 46, 48, 49, 50, 53, 58, 60, 62, 66, 67], 'Nabisco': [0, 20, 63, 64, 65, 68], 'Post': [9, 27, 29, 30, 32, 33, 34, 37, 52], 'Quaker Oats': [1, 10, 35, 41, 54, 55, 56, 57], 'Ralston Purina': [4, 8, 15, 23, 44, 45, 61, 74]}

In [31]:
manufacturers.size().sort_values(ascending=False)

Manufacturer
Kellogg's                      23
General Mills                  22
Post                            9
Quaker Oats                     8
Ralston Purina                  8
Nabisco                         6
American Home Food Products     1
dtype: int64

In [35]:
# 3 Extract the cereals that belong to the manufacturer/group "Nabisco".
manufacturers.get_group("Nabisco")

Unnamed: 0,Name,Manufacturer,Type,Calories,Fiber,Sugars
0,100% Bran,Nabisco,Cold,70,10.0,6
20,Cream of Wheat (Quick),Nabisco,Hot,100,1.0,0
63,Shredded Wheat,Nabisco,Cold,80,3.0,0
64,Shredded Wheat 'n'Bran,Nabisco,Cold,90,4.0,0
65,Shredded Wheat spoon size,Nabisco,Cold,90,3.0,0
68,Strawberry Fruit Wheats,Nabisco,Cold,90,3.0,5


In [45]:
# Bonus: convert the GroupBy object to a dictionary
md = dict(list(manufacturers))
len(md), md.keys(), type(list(md.values())[0])

(7,
 dict_keys(['American Home Food Products', 'General Mills', "Kellogg's", 'Nabisco', 'Post', 'Quaker Oats', 'Ralston Purina']),
 pandas.core.frame.DataFrame)

In [47]:
# 4 Calculate the average of values in the Calories, Fiber, and Sugars columns for
# each manufacturer.
columns_of_interest = [c.CALORIES, c.FIBER, c.SUGARS]
manufacturers[columns_of_interest].mean().round(2).head()

Unnamed: 0_level_0,Calories,Fiber,Sugars
Manufacturer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
American Home Food Products,100.0,0.0,3.0
General Mills,111.36,1.27,7.95
Kellogg's,108.7,2.74,7.57
Nabisco,86.67,4.0,1.83
Post,108.89,2.78,8.78


In [48]:
# The same using `agg` method
agg_dict = {col: 'mean' for col in columns_of_interest}
manufacturers.agg(agg_dict).round(2).head()

Unnamed: 0_level_0,Calories,Fiber,Sugars
Manufacturer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
American Home Food Products,100.0,0.0,3.0
General Mills,111.36,1.27,7.95
Kellogg's,108.7,2.74,7.57
Nabisco,86.67,4.0,1.83
Post,108.89,2.78,8.78


In [50]:
# 5 Find the maximum value in the Sugars column for each manufacturer.
manufacturers[c.SUGARS].max().sort_values(ascending=False)

Manufacturer
Kellogg's                      15
Post                           15
General Mills                  14
Quaker Oats                    12
Ralston Purina                 11
Nabisco                         6
American Home Food Products     3
Name: Sugars, dtype: int64

In [52]:
# 6 Find the minimum value in the Fiber column for each manufacturer.
manufacturers[c.FIBER].min().sort_values(ascending=True)

Manufacturer
American Home Food Products    0.0
General Mills                  0.0
Kellogg's                      0.0
Post                           0.0
Quaker Oats                    0.0
Ralston Purina                 0.0
Nabisco                        1.0
Name: Fiber, dtype: float64

In [62]:
# 7 Extract the cereal with the lowest amount of grams of sugar per manufacturer
# in a new DataFrame.

In [None]:
# Approach 1: Using `apply` method
# Extract the cereal with the lowest amount of grams of sugar per manufacturer
min_sugar_cereals = c.cereals.groupby(c.MANUFACTURER).apply(
    lambda group: group.loc[group[c.SUGARS].idxmin(), c.NAME],
    include_groups=False  
)

min_sugar_cereals

Manufacturer
American Home Food Products                        Maypo
General Mills                                   Cheerios
Kellogg's                      All-Bran with Extra Fiber
Nabisco                           Cream of Wheat (Quick)
Post                                          Grape-Nuts
Quaker Oats                               Quaker Oatmeal
Ralston Purina                                 Rice Chex
dtype: object

In [85]:
# Approach 2: Using `idxmin` method
# Step 1: Get the row indices of min sugars per manufacturer
indices = manufacturers[c.SUGARS].idxmin().values

# Step 2: Select the rows and columns
min_sugar_cereals2 = c.cereals.loc[indices, [c.MANUFACTURER, c.NAME]]
min_sugar_cereals2

Unnamed: 0,Manufacturer,Name
43,American Home Food Products,Maypo
11,General Mills,Cheerios
3,Kellogg's,All-Bran with Extra Fiber
20,Nabisco,Cream of Wheat (Quick)
33,Post,Grape-Nuts
57,Quaker Oats,Quaker Oatmeal
61,Ralston Purina,Rice Chex


In [65]:
#### DEBUGGING ####

In [61]:
manufacturers[c.SUGARS].min().sort_values(ascending=True)

Manufacturer
Quaker Oats                   -1
Kellogg's                      0
Nabisco                        0
General Mills                  1
Ralston Purina                 2
American Home Food Products    3
Post                           3
Name: Sugars, dtype: int64

In [63]:
manufacturers.get_group('Quaker Oats')

Unnamed: 0,Name,Manufacturer,Type,Calories,Fiber,Sugars
1,100% Natural Bran,Quaker Oats,Cold,120,2.0,8
10,Cap'n'Crunch,Quaker Oats,Cold,120,0.0,12
35,Honey Graham Ohs,Quaker Oats,Cold,120,1.0,11
41,Life,Quaker Oats,Cold,100,2.0,6
54,Puffed Rice,Quaker Oats,Cold,50,0.0,0
55,Puffed Wheat,Quaker Oats,Cold,50,1.0,0
56,Quaker Oat Squares,Quaker Oats,Cold,100,2.0,6
57,Quaker Oatmeal,Quaker Oats,Hot,100,2.7,-1


In [64]:
manufacturers.get_group('Quaker Oats')[c.SUGARS].idxmin()

np.int64(57)

In [74]:
for name, group in manufacturers:
    min_idx = group[c.SUGARS].idxmin()
    print(f"{name:<30}: {group.loc[min_idx][c.NAME]}")

American Home Food Products   : Maypo
General Mills                 : Cheerios
Kellogg's                     : All-Bran with Extra Fiber
Nabisco                       : Cream of Wheat (Quick)
Post                          : Grape-Nuts
Quaker Oats                   : Quaker Oatmeal
Ralston Purina                : Rice Chex


In [83]:
min_sugar_cereals2 = c.cereals.loc[
    c.cereals.groupby(c.MANUFACTURER)[c.SUGARS].idxmin(), [c.MANUFACTURER, c.NAME]
]

min_sugar_cereals2

Unnamed: 0,Manufacturer,Name
43,American Home Food Products,Maypo
11,General Mills,Cheerios
3,Kellogg's,All-Bran with Extra Fiber
20,Nabisco,Cream of Wheat (Quick)
33,Post,Grape-Nuts
57,Quaker Oats,Quaker Oatmeal
61,Ralston Purina,Rice Chex


In [82]:
c.cereals.groupby(c.MANUFACTURER)[c.SUGARS].idxmin()

Manufacturer
American Home Food Products    43
General Mills                  11
Kellogg's                       3
Nabisco                        20
Post                           33
Quaker Oats                    57
Ralston Purina                 61
Name: Sugars, dtype: int64