## Setup

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Set the format for displaying floats
pd.options.display.float_format = '${:,.0f}'.format

## 01 Intro to grooping

In [2]:
file_name = 'data/fortune1000.csv'

fortune = pd.read_csv(file_name)

In [3]:
fortune.head()

Unnamed: 0,Company,Revenues,Profits,Employees,Sector,Industry
0,Walmart,500343.0,9862.0,2300000,Retailing,General Merchandisers
1,Exxon Mobil,244363.0,19710.0,71200,Energy,Petroleum Refining
2,Berkshire Hathaway,242137.0,44940.0,377000,Financials,Insurance: Property and Casualty (Stock)
3,Apple,229234.0,48351.0,123000,Technology,"Computers, Office Equipment"
4,UnitedHealth Group,201159.0,10558.0,260000,Health Care,Health Care: Insurance and Managed Care


In [4]:
fortune.groupby('Sector')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1380bae40>

In [5]:
len(fortune.groupby('Sector'))

21

In [6]:
fortune.groupby('Sector').size()

Sector
Aerospace & Defense               25
Apparel                           14
Business Services                 53
Chemicals                         33
Energy                           107
Engineering & Construction        27
Financials                       155
Food &  Drug Stores               12
Food, Beverages & Tobacco         37
Health Care                       71
Hotels, Restaurants & Leisure     26
Household Products                28
Industrials                       49
Materials                         45
Media                             25
Motor Vehicles & Parts            19
Retailing                         77
Technology                       103
Telecommunications                10
Transportation                    40
Wholesalers                       44
dtype: int64

In [7]:
fortune.groupby('Sector').first()

Unnamed: 0_level_0,Company,Revenues,Profits,Employees,Industry
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Aerospace & Defense,Boeing,93392.0,8197.0,140800,Aerospace and Defense
Apparel,Nike,34350.0,4240.0,74400,Apparel
Business Services,ManpowerGroup,21034.0,545.4,29000,Temporary Help
Chemicals,DowDuPont,62683.0,1460.0,98000,Chemicals
Energy,Exxon Mobil,244363.0,19710.0,71200,Petroleum Refining
Engineering & Construction,Fluor,19521.0,191.4,56706,"Engineering, Construction"
Financials,Berkshire Hathaway,242137.0,44940.0,377000,Insurance: Property and Casualty (Stock)
Food & Drug Stores,Kroger,122662.0,1907.0,449000,Food and Drug Stores
"Food, Beverages & Tobacco",PepsiCo,63525.0,4857.0,263000,Food Consumer Products
Health Care,UnitedHealth Group,201159.0,10558.0,260000,Health Care: Insurance and Managed Care


In [8]:
fortune.groupby('Sector').get_group('Energy')

Unnamed: 0,Company,Revenues,Profits,Employees,Sector,Industry
1,Exxon Mobil,244363.0,19710.0,71200,Energy,Petroleum Refining
12,Chevron,134533.0,9195.0,51900,Energy,Petroleum Refining
27,Phillips 66,91568.0,5106.0,14600,Energy,Petroleum Refining
30,Valero Energy,88407.0,4065.0,10015,Energy,Petroleum Refining
40,Marathon Petroleum,67610.0,3432.0,43800,Energy,Petroleum Refining
...,...,...,...,...,...,...
953,California Resources,2006.0,-266.0,1450,Energy,"Mining, Crude-Oil Production"
976,Ferrellgas Partners,1930.0,-54.2,3891,Energy,Energy
979,Oceaneering International,1922.0,166.4,8200,Energy,"Oil and Gas Equipment, Services"
980,Cimarex Energy,1918.0,494.3,910,Energy,"Mining, Crude-Oil Production"


## 02 Methods on `GroupBy` object

In [9]:
sectors = fortune.groupby('Sector')

In [15]:
sectors['Revenues'].sum()

Sector
Aerospace & Defense               $383,835
Apparel                           $101,157
Business Services                 $316,090
Chemicals                         $251,151
Energy                          $1,543,507
Engineering & Construction        $172,782
Financials                      $2,442,480
Food &  Drug Stores               $405,468
Food, Beverages & Tobacco         $510,232
Health Care                     $1,507,991
Hotels, Restaurants & Leisure     $179,825
Household Products                $231,780
Industrials                       $520,140
Materials                         $278,298
Media                             $230,487
Motor Vehicles & Parts            $433,535
Retailing                       $1,684,353
Technology                      $1,374,822
Telecommunications                $466,959
Transportation                    $455,160
Wholesalers                       $888,149
Name: Revenues, dtype: float64

In [14]:
sectors[['Revenues', 'Profits']].mean()

Unnamed: 0_level_0,Revenues,Profits
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1
Aerospace & Defense,"$15,353","$1,069"
Apparel,"$7,226",$454
Business Services,"$5,964",$701
Chemicals,"$7,611",$620
Energy,"$14,425",$805
Engineering & Construction,"$6,399",$264
Financials,"$15,758","$1,705"
Food & Drug Stores,"$33,789",$703
"Food, Beverages & Tobacco","$13,790","$1,484"
Health Care,"$21,239","$1,307"


## 03 Grouping by Multiple Columns

In [16]:
fortune.head()

Unnamed: 0,Company,Revenues,Profits,Employees,Sector,Industry
0,Walmart,"$500,343","$9,862",2300000,Retailing,General Merchandisers
1,Exxon Mobil,"$244,363","$19,710",71200,Energy,Petroleum Refining
2,Berkshire Hathaway,"$242,137","$44,940",377000,Financials,Insurance: Property and Casualty (Stock)
3,Apple,"$229,234","$48,351",123000,Technology,"Computers, Office Equipment"
4,UnitedHealth Group,"$201,159","$10,558",260000,Health Care,Health Care: Insurance and Managed Care


In [17]:
# Group by Sector and Industry into `sectors2` variable
sectors2 = fortune.groupby(['Sector', 'Industry'])

In [18]:
sectors2.size()

Sector               Industry                                     
Aerospace & Defense  Aerospace and Defense                            25
Apparel              Apparel                                          14
Business Services    Advertising, marketing                            2
                     Diversified Outsourcing Services                 14
                     Education                                         2
                                                                      ..
Transportation       Trucking, Truck Leasing                          11
Wholesalers          Wholesalers: Diversified                         24
                     Wholesalers: Electronics and Office Equipment     8
                     Wholesalers: Food and Grocery                     6
                     Wholesalers: Health Care                          6
Length: 82, dtype: int64

In [None]:
len(sectors2)

82

In [29]:
type(sectors2.groups), len(sectors2.groups), \
len(sectors2.groups.keys()), len(sectors2.groups.values()), len(sectors2.groups.items())

(pandas.io.formats.printing.PrettyDict, 82, 82, 82, 82)

In [31]:
sectors2.groups[('Apparel', 'Apparel')], len(sectors2.groups[('Apparel', 'Apparel')])

(Index([88, 241, 331, 420, 432, 526, 529, 554, 587, 678, 766, 774, 835, 861], dtype='int64'),
 14)

In [32]:
sectors2.get_group(('Apparel', 'Apparel'))

Unnamed: 0,Company,Revenues,Profits,Employees,Sector,Industry
88,Nike,"$34,350","$4,240",74400,Apparel,Apparel
241,VF,"$12,400",$615,69000,Apparel,Apparel
331,PVH,"$8,915",$538,28050,Apparel,Apparel
420,Ralph Lauren,"$6,653",$-99,18250,Apparel,Apparel
432,Hanesbrands,"$6,478",$62,67200,Apparel,Apparel
526,Under Armour,"$4,977",$-48,11350,Apparel,Apparel
529,Levi Strauss,"$4,904",$281,13800,Apparel,Apparel
554,Tapestry,"$4,488",$591,12450,Apparel,Apparel
587,Skechers U.S.A.,"$4,181",$179,8150,Apparel,Apparel
678,Carters,"$3,400",$303,20900,Apparel,Apparel


In [36]:
sectors2.get_group(('Apparel', 'Apparel'))['Revenues'].mean()

np.float64(7225.521428571429)

In [37]:
sectors2['Revenues'].mean()

Sector               Industry                                     
Aerospace & Defense  Aerospace and Defense                           $15,353
Apparel              Apparel                                          $7,226
Business Services    Advertising, marketing                          $11,578
                     Diversified Outsourcing Services                 $5,298
                     Education                                        $3,485
                                                                       ...  
Transportation       Trucking, Truck Leasing                          $3,971
Wholesalers          Wholesalers: Diversified                         $5,458
                     Wholesalers: Electronics and Office Equipment   $15,279
                     Wholesalers: Food and Grocery                   $20,985
                     Wholesalers: Health Care                        $84,838
Name: Revenues, Length: 82, dtype: float64

## 04 The `agg` method

In [42]:
fortune = pd.read_csv(file_name)

In [43]:
columns_new = ['Sector', 'Industry', 'Company', 'Revenues', 'Profits', 'Employees']
fortune = fortune[columns_new]

In [44]:
fortune.head()

Unnamed: 0,Sector,Industry,Company,Revenues,Profits,Employees
0,Retailing,General Merchandisers,Walmart,"$500,343","$9,862",2300000
1,Energy,Petroleum Refining,Exxon Mobil,"$244,363","$19,710",71200
2,Financials,Insurance: Property and Casualty (Stock),Berkshire Hathaway,"$242,137","$44,940",377000
3,Technology,"Computers, Office Equipment",Apple,"$229,234","$48,351",123000
4,Health Care,Health Care: Insurance and Managed Care,UnitedHealth Group,"$201,159","$10,558",260000


In [45]:
sectors = fortune.groupby('Sector')

In [46]:
# Compute the mean profit and sum of employees for each sector
sectors.agg({'Profits': 'mean', 'Employees': 'sum'})

Unnamed: 0_level_0,Profits,Employees
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1
Aerospace & Defense,"$1,069",1010124
Apparel,$454,355699
Business Services,$701,1593999
Chemicals,$620,474020
Energy,$805,981207
Engineering & Construction,$264,420745
Financials,"$1,705",3500119
Food & Drug Stores,$703,1398074
"Food, Beverages & Tobacco","$1,484",1079316
Health Care,"$1,307",2971189


## 05 Iterating through Groups

In [47]:
fortune.head()

Unnamed: 0,Sector,Industry,Company,Revenues,Profits,Employees
0,Retailing,General Merchandisers,Walmart,"$500,343","$9,862",2300000
1,Energy,Petroleum Refining,Exxon Mobil,"$244,363","$19,710",71200
2,Financials,Insurance: Property and Casualty (Stock),Berkshire Hathaway,"$242,137","$44,940",377000
3,Technology,"Computers, Office Equipment",Apple,"$229,234","$48,351",123000
4,Health Care,Health Care: Insurance and Managed Care,UnitedHealth Group,"$201,159","$10,558",260000


In [48]:
sectors = fortune.groupby('Sector')

In [49]:
# Find 2 companies with the highest employees in the full dataset
fortune.nlargest(2, 'Employees')

Unnamed: 0,Sector,Industry,Company,Revenues,Profits,Employees
0,Retailing,General Merchandisers,Walmart,"$500,343","$9,862",2300000
7,Retailing,Internet Services and Retailing,Amazon.com,"$177,866","$3,033",566000


In [50]:
# Find 2 companies with the highest employees in each sector
sectors.apply(lambda x: x.nlargest(2, 'Employees'))

  sectors.apply(lambda x: x.nlargest(2, 'Employees'))


Unnamed: 0_level_0,Unnamed: 1_level_0,Sector,Industry,Company,Revenues,Profits,Employees
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Aerospace & Defense,50,Aerospace & Defense,Aerospace and Defense,United Technologies,"$59,837","$4,552",204700
Aerospace & Defense,26,Aerospace & Defense,Aerospace and Defense,Boeing,"$93,392","$8,197",140800
Apparel,88,Apparel,Apparel,Nike,"$34,350","$4,240",74400
Apparel,241,Apparel,Apparel,VF,"$12,400",$615,69000
Business Services,199,Business Services,Diversified Outsourcing Services,Aramark,"$14,604",$374,215000
Business Services,497,Business Services,Diversified Outsourcing Services,ABM Industries,"$5,454",$4,140000
Chemicals,46,Chemicals,Chemicals,DowDuPont,"$62,683","$1,460",98000
Chemicals,189,Chemicals,Chemicals,Sherwin-Williams,"$14,984","$1,772",52695
Energy,1,Energy,Petroleum Refining,Exxon Mobil,"$244,363","$19,710",71200
Energy,145,Energy,"Oil and Gas Equipment, Services",Halliburton,"$20,620",$-463,55000


In [51]:
pd.__version__

'2.2.3'

In [57]:
fortune.groupby('Sector').groups.keys()

dict_keys(['Aerospace & Defense', 'Apparel', 'Business Services', 'Chemicals', 'Energy', 'Engineering & Construction', 'Financials', 'Food &  Drug Stores', 'Food, Beverages & Tobacco', 'Health Care', 'Hotels, Restaurants & Leisure', 'Household Products', 'Industrials', 'Materials', 'Media', 'Motor Vehicles & Parts', 'Retailing', 'Technology', 'Telecommunications', 'Transportation', 'Wholesalers'])

In [56]:
fortune.groupby('Sector', group_keys=False).groups.keys()

dict_keys(['Aerospace & Defense', 'Apparel', 'Business Services', 'Chemicals', 'Energy', 'Engineering & Construction', 'Financials', 'Food &  Drug Stores', 'Food, Beverages & Tobacco', 'Health Care', 'Hotels, Restaurants & Leisure', 'Household Products', 'Industrials', 'Materials', 'Media', 'Motor Vehicles & Parts', 'Retailing', 'Technology', 'Telecommunications', 'Transportation', 'Wholesalers'])

In [59]:
fortune.groupby('Sector').apply(
    lambda x: x.nlargest(2, 'Employees'), 
    include_groups=False
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Industry,Company,Revenues,Profits,Employees
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aerospace & Defense,50,Aerospace and Defense,United Technologies,"$59,837","$4,552",204700
Aerospace & Defense,26,Aerospace and Defense,Boeing,"$93,392","$8,197",140800
Apparel,88,Apparel,Nike,"$34,350","$4,240",74400
Apparel,241,Apparel,VF,"$12,400",$615,69000
Business Services,199,Diversified Outsourcing Services,Aramark,"$14,604",$374,215000
Business Services,497,Diversified Outsourcing Services,ABM Industries,"$5,454",$4,140000
Chemicals,46,Chemicals,DowDuPont,"$62,683","$1,460",98000
Chemicals,189,Chemicals,Sherwin-Williams,"$14,984","$1,772",52695
Energy,1,Petroleum Refining,Exxon Mobil,"$244,363","$19,710",71200
Energy,145,"Oil and Gas Equipment, Services",Halliburton,"$20,620",$-463,55000


## 06 Coding Challenge

In [5]:
file_name = 'data/cereals.csv'

cereals = pd.read_csv(file_name)

In [6]:
cereals.head()

Unnamed: 0,Name,Manufacturer,Type,Calories,Fiber,Sugars
0,100% Bran,Nabisco,Cold,70,$10,6
1,100% Natural Bran,Quaker Oats,Cold,120,$2,8
2,All-Bran,Kellogg's,Cold,70,$9,5
3,All-Bran with Extra Fiber,Kellogg's,Cold,50,$14,0
4,Almond Delight,Ralston Purina,Cold,110,$1,8


In [7]:
# 1 Group the cereals, using the Manufacturer column’s value
manufacturers = cereals.groupby('Manufacturer')

In [15]:
# 2 Determine the total number of groups, and the number of cereals per group

In [17]:
manufacturers.groups

{'American Home Food Products': [43], 'General Mills': [5, 7, 11, 12, 13, 14, 18, 22, 31, 36, 40, 42, 47, 51, 59, 69, 70, 71, 72, 73, 75, 76], 'Kellogg's': [2, 3, 6, 16, 17, 19, 21, 24, 25, 26, 28, 38, 39, 46, 48, 49, 50, 53, 58, 60, 62, 66, 67], 'Nabisco': [0, 20, 63, 64, 65, 68], 'Post': [9, 27, 29, 30, 32, 33, 34, 37, 52], 'Quaker Oats': [1, 10, 35, 41, 54, 55, 56, 57], 'Ralston Purina': [4, 8, 15, 23, 44, 45, 61, 74]}

In [12]:
len(manufacturers.groups), len(manufacturers)

(7, 7)

In [16]:
manufacturers.size()

Manufacturer
American Home Food Products     1
General Mills                  22
Kellogg's                      23
Nabisco                         6
Post                            9
Quaker Oats                     8
Ralston Purina                  8
dtype: int64

In [23]:
for k, v in manufacturers.groups.items():
    print(f'{k:<30} {len(v):>2}')

American Home Food Products     1
General Mills                  22
Kellogg's                      23
Nabisco                         6
Post                            9
Quaker Oats                     8
Ralston Purina                  8


In [25]:
# 3 Extract the cereals that belong to the manufacturer/group "Nabisco"
nabisco_cereals = manufacturers.get_group('Nabisco')
nabisco_cereals

Unnamed: 0,Name,Manufacturer,Type,Calories,Fiber,Sugars
0,100% Bran,Nabisco,Cold,70,$10,6
20,Cream of Wheat (Quick),Nabisco,Hot,100,$1,0
63,Shredded Wheat,Nabisco,Cold,80,$3,0
64,Shredded Wheat 'n'Bran,Nabisco,Cold,90,$4,0
65,Shredded Wheat spoon size,Nabisco,Cold,90,$3,0
68,Strawberry Fruit Wheats,Nabisco,Cold,90,$3,5


In [26]:
# 4 Calculate the average of values in the Calories, Fiber, and Sugars columns for each manufacturer
columns_to_average = ['Calories', 'Fiber', 'Sugars']
manufacturers[columns_to_average].mean()

Unnamed: 0_level_0,Calories,Fiber,Sugars
Manufacturer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
American Home Food Products,$100,$0,$3
General Mills,$111,$1,$8
Kellogg's,$109,$3,$8
Nabisco,$87,$4,$2
Post,$109,$3,$9
Quaker Oats,$95,$1,$5
Ralston Purina,$115,$2,$6


In [27]:
# 5 Find the maximum value in the Sugars column for each manufacturer
manufacturers['Sugars'].max()

Manufacturer
American Home Food Products     3
General Mills                  14
Kellogg's                      15
Nabisco                         6
Post                           15
Quaker Oats                    12
Ralston Purina                 11
Name: Sugars, dtype: int64

In [28]:
# 6 Find the minimum value in the Fiber column for each manufacturer
manufacturers['Fiber'].min()

Manufacturer
American Home Food Products   $0
General Mills                 $0
Kellogg's                     $0
Nabisco                       $1
Post                          $0
Quaker Oats                   $0
Ralston Purina                $0
Name: Fiber, dtype: float64

In [29]:
# 7 Extract the cereal with the lowest amount of grams of sugar per manufacturer in a new DataFrame

# Define a function to get the cereal with the lowest sugar content from a dataframe
def get_lowest_sugar_cereal(df):
    return df.nsmallest(1, 'Sugars')

# Apply the function to each group 
manufacturers.apply(get_lowest_sugar_cereal, include_groups=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Type,Calories,Fiber,Sugars
Manufacturer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
American Home Food Products,43,Maypo,Hot,100,$0,3
General Mills,11,Cheerios,Cold,110,$2,1
Kellogg's,3,All-Bran with Extra Fiber,Cold,50,$14,0
Nabisco,20,Cream of Wheat (Quick),Hot,100,$1,0
Post,33,Grape-Nuts,Cold,110,$3,3
Quaker Oats,57,Quaker Oatmeal,Hot,100,$3,-1
Ralston Purina,61,Rice Chex,Cold,110,$0,2


In [30]:
# The same result but using `idmin` in place of `nsmallest`
manufacturers.apply(lambda x: x.loc[x['Sugars'].idxmin()], include_groups=False)

Unnamed: 0_level_0,Name,Type,Calories,Fiber,Sugars
Manufacturer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
American Home Food Products,Maypo,Hot,100,$0,3
General Mills,Cheerios,Cold,110,$2,1
Kellogg's,All-Bran with Extra Fiber,Cold,50,$14,0
Nabisco,Cream of Wheat (Quick),Hot,100,$1,0
Post,Grape-Nuts,Cold,110,$3,3
Quaker Oats,Quaker Oatmeal,Hot,100,$3,-1
Ralston Purina,Rice Chex,Cold,110,$0,2
