<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/pandas_groupby.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pandas groupby

In [1]:
# libraries needed
import numpy as np
import pandas as pd

In [2]:
# Load the Fortune 1000 dataset directly from the GitHub raw URL
# (needs network access), then preview the first rows.
fortune_1k = pd.read_csv(
    filepath_or_buffer = 'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/fortune1000.csv',
)

fortune_1k.head()

Unnamed: 0,Rank,Company,Sector,Industry,Location,Revenue,Profits,Employees
0,1,Walmart,Retailing,General Merchandisers,"Bentonville, AR",482130,14694,2300000
1,2,Exxon Mobil,Energy,Petroleum Refining,"Irving, TX",246204,16150,75600
2,3,Apple,Technology,"Computers, Office Equipment","Cupertino, CA",233715,53394,110000
3,4,Berkshire Hathaway,Financials,Insurance: Property and Casualty (Stock),"Omaha, NE",210821,24083,331000
4,5,McKesson,Health Care,Wholesalers: Health Care,"San Francisco, CA",181241,1476,70400


In [3]:
# metadata: column names, non-null counts, dtypes, and memory usage
fortune_1k.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Rank       1000 non-null   int64 
 1   Company    1000 non-null   object
 2   Sector     1000 non-null   object
 3   Industry   1000 non-null   object
 4   Location   1000 non-null   object
 5   Revenue    1000 non-null   int64 
 6   Profits    1000 non-null   int64 
 7   Employees  1000 non-null   int64 
dtypes: int64(4), object(4)
memory usage: 62.6+ KB


In [4]:
# Make 'Rank' the index.
# The guard keeps this cell safe to re-run: once 'Rank' has been moved into the
# index it is no longer a column, so an unguarded second set_index('Rank')
# would raise a KeyError.
if 'Rank' in fortune_1k.columns:
    fortune_1k = fortune_1k.set_index('Rank')

In [5]:
# Preview the first five rows of the re-indexed frame
fortune_1k.head(n = 5)

Unnamed: 0_level_0,Company,Sector,Industry,Location,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Walmart,Retailing,General Merchandisers,"Bentonville, AR",482130,14694,2300000
2,Exxon Mobil,Energy,Petroleum Refining,"Irving, TX",246204,16150,75600
3,Apple,Technology,"Computers, Office Equipment","Cupertino, CA",233715,53394,110000
4,Berkshire Hathaway,Financials,Insurance: Property and Casualty (Stock),"Omaha, NE",210821,24083,331000
5,McKesson,Health Care,Wholesalers: Health Care,"San Francisco, CA",181241,1476,70400


In [6]:
# Number of distinct values in the 'Sector' column
n_unique_sectors = fortune_1k['Sector'].nunique()
print(n_unique_sectors)

# Grouping by 'Sector' creates one group per distinct sector,
# so the length of the groupby object matches the count above
n_sector_groups = len(fortune_1k.groupby('Sector'))
print(n_sector_groups)

21
21


In [7]:
# Row count for each Sector group, largest groups first
rows_per_sector = fortune_1k.groupby('Sector').size()
rows_per_sector.sort_values(ascending = False)

Sector
Financials                      139
Energy                          122
Technology                      102
Retailing                        80
Health Care                      75
Business Services                51
Industrials                      46
Food, Beverages & Tobacco        43
Materials                        43
Wholesalers                      40
Transportation                   36
Chemicals                        30
Household Products               28
Engineering & Construction       26
Hotels, Resturants & Leisure     25
Media                            25
Motor Vehicles & Parts           24
Aerospace & Defense              20
Apparel                          15
Telecommunications               15
Food and Drug Stores             15
dtype: int64

In [8]:
# Same counts via value_counts(), which already returns them in descending order.
# Note: sectors with tied counts may be listed in a different order than in the
# groupby/size/sort_values version.
fortune_1k['Sector'].value_counts()

Financials                      139
Energy                          122
Technology                      102
Retailing                        80
Health Care                      75
Business Services                51
Industrials                      46
Materials                        43
Food, Beverages & Tobacco        43
Wholesalers                      40
Transportation                   36
Chemicals                        30
Household Products               28
Engineering & Construction       26
Media                            25
Hotels, Resturants & Leisure     25
Motor Vehicles & Parts           24
Aerospace & Defense              20
Telecommunications               15
Apparel                          15
Food and Drug Stores             15
Name: Sector, dtype: int64

In [9]:
# For each Sector group, take the first non-null value of every column.
# No column in this dataset has nulls, so this is simply the first row of each sector.
# The complementary .last() method does the same from the end of each group.
by_sector = fortune_1k.groupby('Sector')
by_sector.first()

Unnamed: 0_level_0,Company,Industry,Location,Revenue,Profits,Employees
Sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aerospace & Defense,Boeing,Aerospace and Defense,"Chicago, IL",96114,5176,161400
Apparel,Nike,Apparel,"Beaverton, OR",30601,3273,62600
Business Services,ManpowerGroup,Temporary Help,"Milwaukee, WI",19330,419,27000
Chemicals,Dow Chemical,Chemicals,"Midland, MI",48778,7685,49495
Energy,Exxon Mobil,Petroleum Refining,"Irving, TX",246204,16150,75600
Engineering & Construction,Fluor,"Engineering, Construction","Irving, TX",18114,413,38758
Financials,Berkshire Hathaway,Insurance: Property and Casualty (Stock),"Omaha, NE",210821,24083,331000
Food and Drug Stores,CVS Health,Food and Drug Stores,"Woonsocket, RI",153290,5237,199000
"Food, Beverages & Tobacco",Archer Daniels Midland,Food Production,"Chicago, IL",67702,1849,32300
Health Care,McKesson,Wholesalers: Health Care,"San Francisco, CA",181241,1476,70400


In [10]:
# .groups attribute:
# a large dictionary-like mapping where each key is a sector and each value holds
# the index labels (here, the Rank values) of the fortune_1k rows in that sector
fortune_1k.groupby('Sector').groups

Output = None

In [11]:
# Total profit by sector, most profitable sectors first
profit_by_sector = (
    fortune_1k
      .groupby('Sector')['Profits']
      .sum()
      .reset_index()          # Series -> DataFrame with 'Sector' and 'Profits' columns
)

profit_by_sector.sort_values(by = 'Profits', ascending = False)

Unnamed: 0,Sector,Profits
6,Financials,260209
17,Technology,180473
9,Health Care,106114
8,"Food, Beverages & Tobacco",51417
18,Telecommunications,48637
16,Retailing,47830
19,Transportation,44169
0,Aerospace & Defense,28742
2,Business Services,28227
15,Motor Vehicles & Parts,25898


## .get_group() method

In [12]:
# Technology companies located in California, via .get_group() + .query()
tech_companies = fortune_1k.groupby('Sector').get_group('Technology')

# engine = 'python' is required to use .str accessor methods inside .query()
tech_companies.query(
    "Location.str.lower().str.strip().str.contains(', ca')",
    engine = 'python',
)

Unnamed: 0_level_0,Company,Sector,Industry,Location,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,Apple,Technology,"Computers, Office Equipment","Cupertino, CA",233715,53394,110000
20,HP,Technology,"Computers, Office Equipment","Palo Alto, CA",103355,4554,287000
36,Alphabet,Technology,Internet Services and Retailing,"Mountain View, CA",74989,16348,61814
51,Intel,Technology,Semiconductors and Other Electronic Components,"Santa Clara, CA",55355,11420,107300
54,Cisco Systems,Technology,Network and Other Communications Equipment,"San Jose, CA",49161,8981,71833
77,Oracle,Technology,Computer Software,"Redwood City, CA",38226,9938,132000
110,Qualcomm,Technology,Network and Other Communications Equipment,"San Diego, CA",25281,5271,33000
157,Facebook,Technology,Internet Services and Retailing,"Menlo Park, CA",17928,3688,12691
194,Western Digital,Technology,Computer Peripherals,"Irvine, CA",14572,1465,76449
295,Applied Materials,Technology,Semiconductors and Other Electronic Components,"Santa Clara, CA",9659,1377,15050


In [13]:
# Technology companies in California, this time with plain boolean masks
is_technology = fortune_1k['Sector'] == 'Technology'
is_in_california = fortune_1k['Location'].str.strip().str.lower().str.contains(', ca')

fortune_1k[is_technology & is_in_california]

Unnamed: 0_level_0,Company,Sector,Industry,Location,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,Apple,Technology,"Computers, Office Equipment","Cupertino, CA",233715,53394,110000
20,HP,Technology,"Computers, Office Equipment","Palo Alto, CA",103355,4554,287000
36,Alphabet,Technology,Internet Services and Retailing,"Mountain View, CA",74989,16348,61814
51,Intel,Technology,Semiconductors and Other Electronic Components,"Santa Clara, CA",55355,11420,107300
54,Cisco Systems,Technology,Network and Other Communications Equipment,"San Jose, CA",49161,8981,71833
77,Oracle,Technology,Computer Software,"Redwood City, CA",38226,9938,132000
110,Qualcomm,Technology,Network and Other Communications Equipment,"San Diego, CA",25281,5271,33000
157,Facebook,Technology,Internet Services and Retailing,"Menlo Park, CA",17928,3688,12691
194,Western Digital,Technology,Computer Peripherals,"Irvine, CA",14572,1465,76449
295,Applied Materials,Technology,Semiconductors and Other Electronic Components,"Santa Clara, CA",9659,1377,15050


In [14]:
# All rows belonging to the 'Apparel' sector group
sector_groups = fortune_1k.groupby('Sector')
sector_groups.get_group('Apparel')

Unnamed: 0_level_0,Company,Sector,Industry,Location,Revenue,Profits,Employees
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
91,Nike,Apparel,Apparel,"Beaverton, OR",30601,3273,62600
231,VF,Apparel,Apparel,"Greensboro, NC",12377,1232,64000
340,PVH,Apparel,Apparel,"New York, NY",8020,572,26200
354,Ralph Lauren,Apparel,Apparel,"New York, NY",7620,702,20000
448,Hanesbrands,Apparel,Apparel,"Winston-Salem, NC",5732,429,65300
547,Levi Strauss,Apparel,Apparel,"San Francisco, CA",4495,209,12500
575,Coach,Apparel,Apparel,"New York, NY",4192,402,12950
597,Under Armour,Apparel,Apparel,"Baltimore, MD",3963,233,9600
683,Fossil Group,Apparel,Apparel,"Richardson, TX",3229,221,15100
695,Skechers U.S.A.,Apparel,Apparel,"Manhattan Beach, CA",3159,232,6400


## Methods on groupby object and DataFrame columns

In [15]:
# Sum of Revenue within each sector
revenue_by_sector = fortune_1k.groupby('Sector')['Revenue']
revenue_by_sector.sum()

Sector
Aerospace & Defense              357940
Apparel                           95968
Business Services                272195
Chemicals                        243897
Energy                          1517809
Engineering & Construction       153983
Financials                      2217159
Food and Drug Stores             483769
Food, Beverages & Tobacco        555967
Health Care                     1614707
Hotels, Resturants & Leisure     169546
Household Products               234737
Industrials                      497581
Materials                        259145
Media                            220764
Motor Vehicles & Parts           482540
Retailing                       1465076
Technology                      1377600
Telecommunications               461834
Transportation                   408508
Wholesalers                      444800
Name: Revenue, dtype: int64

In [17]:
# Mean of Profits within each sector
profits_by_sector = fortune_1k.groupby('Sector')['Profits']
profits_by_sector.mean()

Sector
Aerospace & Defense             1437.100000
Apparel                          549.066667
Business Services                553.470588
Chemicals                        754.266667
Energy                          -602.024590
Engineering & Construction       204.000000
Financials                      1872.007194
Food and Drug Stores            1117.266667
Food, Beverages & Tobacco       1195.744186
Health Care                     1414.853333
Hotels, Resturants & Leisure     827.880000
Household Products               515.285714
Industrials                      451.391304
Materials                        102.976744
Media                            973.880000
Motor Vehicles & Parts          1079.083333
Retailing                        597.875000
Technology                      1769.343137
Telecommunications              3242.466667
Transportation                  1226.916667
Wholesalers                      205.825000
Name: Profits, dtype: float64

In [18]:
# Smallest Employees value found in each sector
employees_by_sector = fortune_1k.groupby('Sector')['Employees']
employees_by_sector.min()

Sector
Aerospace & Defense             6955
Apparel                         5978
Business Services               2400
Chemicals                       1979
Energy                           480
Engineering & Construction      1036
Financials                       187
Food and Drug Stores            1616
Food, Beverages & Tobacco       1857
Health Care                     2924
Hotels, Resturants & Leisure    8500
Household Products              4406
Industrials                     4104
Materials                         52
Media                           2341
Motor Vehicles & Parts          2700
Retailing                       2155
Technology                      2285
Telecommunications              2700
Transportation                  1223
Wholesalers                      500
Name: Employees, dtype: int64

In [21]:
# Number of (non-null) Company entries in each sector
companies_by_sector = fortune_1k.groupby('Sector')['Company']
companies_by_sector.count()

Sector
Aerospace & Defense              20
Apparel                          15
Business Services                51
Chemicals                        30
Energy                          122
Engineering & Construction       26
Financials                      139
Food and Drug Stores             15
Food, Beverages & Tobacco        43
Health Care                      75
Hotels, Resturants & Leisure     25
Household Products               28
Industrials                      46
Materials                        43
Media                            25
Motor Vehicles & Parts           24
Retailing                        80
Technology                      102
Telecommunications               15
Transportation                   36
Wholesalers                      40
Name: Company, dtype: int64

In [22]:
# A pivot table is another way to get the average revenue per sector:
# rows are sectors, the single value column is the mean of Revenue
fortune_1k.pivot_table(
    index = 'Sector',
    values = 'Revenue',
    aggfunc = 'mean',
)

Unnamed: 0_level_0,Revenue
Sector,Unnamed: 1_level_1
Aerospace & Defense,17897.0
Apparel,6397.866667
Business Services,5337.156863
Chemicals,8129.9
Energy,12441.057377
Engineering & Construction,5922.423077
Financials,15950.784173
Food and Drug Stores,32251.266667
"Food, Beverages & Tobacco",12929.465116
Health Care,21529.426667


In [39]:
# Which company in each sector has the highest revenue?

# Step 1: for every sector, find the index label (Rank) of the row with the largest Revenue
max_revenue_idx = fortune_1k.groupby(['Sector'])['Revenue'].idxmax()

# Step 2: pull out those rows, keep only the columns of interest, and order by sector.
# The result has one row per sector: the company with the highest revenue in that sector.
(
    fortune_1k
      .loc[max_revenue_idx, ['Sector', 'Company', 'Revenue']]
      .sort_values('Sector')
)

Unnamed: 0_level_0,Sector,Company,Revenue
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
24,Aerospace & Defense,Boeing,96114
91,Apparel,Nike,30601
144,Business Services,ManpowerGroup,19330
56,Chemicals,Dow Chemical,48778
2,Energy,Exxon Mobil,246204
155,Engineering & Construction,Fluor,18114
4,Financials,Berkshire Hathaway,210821
7,Food and Drug Stores,CVS Health,153290
41,"Food, Beverages & Tobacco",Archer Daniels Midland,67702
5,Health Care,McKesson,181241


## .groupby() multiple columns

In [49]:
# Number of companies per (sector, industry) pair,
# shown here for the 'Business Services' sector only
(
    fortune_1k
      .groupby(['Sector', 'Industry'])['Company']
      .count()
      .reset_index()                                                        # Series -> DataFrame; count stays in 'Company'
      .sort_values(by = ['Sector', 'Company'], ascending = [True, False])   # sector A-Z, largest industries first
      .query('Sector == "Business Services"')
)

Unnamed: 0,Sector,Industry,Company
5,Business Services,Financial Data Services,19
3,Business Services,Diversified Outsourcing Services,14
7,Business Services,Temporary Help,5
8,Business Services,Waste Management,5
4,Business Services,Education,3
6,Business Services,Miscellaneous,3
2,Business Services,"Advertising, marketing",2


## .agg() method