# Groupby

The `groupby` method allows you to group rows of data together and call aggregate functions on each group.

In [29]:
import numpy as np
import pandas as pd
# Sample records as a plain dict of equal-length lists, one key per column.
# This cell only builds and echoes the dict.
data = dict(
    Company=['Google', 'Google', 'Microsoft', 'Microsoft', 'Facebook', 'Facebook'],
    Person=['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
    Sales=[200, 120, 340, 124, 243, 350],
)

data

{'Company': ['Google',
  'Google',
  'Microsoft',
  'Microsoft',
  'Facebook',
  'Facebook'],
 'Person': ['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
 'Sales': [200, 120, 340, 124, 243, 350]}

In [4]:
# Extended sample: the six original records plus four extra people whose
# Sales value is missing (three np.nan entries and one None).
mydata = dict(
    Company=['Google', 'Google', 'Microsoft', 'Microsoft', 'Facebook', 'Facebook',
             'Google', 'Microsoft', 'Facebook', 'Google'],
    Person=['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah',
            'G one', 'M one', 'F One', 'G two'],
    Sales=[200, 120, 340, 124, 243, 350, np.nan, np.nan, np.nan, None],
)

mydata

{'Company': ['Google',
  'Google',
  'Microsoft',
  'Microsoft',
  'Facebook',
  'Facebook',
  'Google',
  'Microsoft',
  'Facebook',
  'Google'],
 'Person': ['Sam',
  'Charlie',
  'Amy',
  'Vanessa',
  'Carl',
  'Sarah',
  'G one',
  'M one',
  'F One',
  'G two'],
 'Sales': [200, 120, 340, 124, 243, 350, nan, nan, nan, None]}

In [30]:
# Turn each dict into a DataFrame; every dict key becomes a column.
df = pd.DataFrame(data=data)      # six complete rows
mydf = pd.DataFrame(data=mydata)  # ten rows, four of them without Sales

In [7]:
# Display the six-row frame (columns: Company, Person, Sales).
df

Unnamed: 0,Company,Person,Sales
0,Google,Sam,200
1,Google,Charlie,120
2,Microsoft,Amy,340
3,Microsoft,Vanessa,124
4,Facebook,Carl,243
5,Facebook,Sarah,350


In [8]:
# Display the ten-row frame; Sales is shown as float (200.0, ...) and is
# empty for the four extra rows.
mydf

Unnamed: 0,Company,Person,Sales
0,Google,Sam,200.0
1,Google,Charlie,120.0
2,Microsoft,Amy,340.0
3,Microsoft,Vanessa,124.0
4,Facebook,Carl,243.0
5,Facebook,Sarah,350.0
6,Google,G one,
7,Microsoft,M one,
8,Facebook,F One,
9,Google,G two,


**Now you can use the `.groupby()` method to group rows together based on a column name. For instance, let's group by Company. This will create a DataFrameGroupBy object:**

In [31]:
# The result is a DataFrameGroupBy object (see the repr below), not a table;
# an aggregate method must be called on it to get values.
df.groupby('Company')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f759be202b0>

You can save this object as a new variable:

In [32]:
# Keep the grouped object so several aggregations can reuse it.
by_comp = df.groupby(by='Company')

In [33]:
# Echoing the variable shows a DataFrameGroupBy repr, not the grouped rows.
by_comp

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f759be20860>

And then call aggregate methods on the object:

In [12]:
# Mean Sales per company. Only the numeric column is selected because
# 'Person' holds strings: pandas >= 2.0 raises TypeError when asked to
# average a non-numeric column instead of silently dropping it.
# Double brackets keep the result a one-column DataFrame.
by_comp[['Sales']].mean()

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
Facebook,296.5
Google,160.0
Microsoft,232.0


In [34]:
# Group and aggregate in one chained expression. 'Sales' is selected
# explicitly because the string column 'Person' cannot be averaged
# (pandas >= 2.0 raises TypeError rather than dropping it).
df.groupby('Company')[['Sales']].mean()

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
Facebook,296.5
Google,160.0
Microsoft,232.0


More examples of aggregate methods:

In [14]:
# Standard deviation of Sales per company. As with mean(), the string column
# 'Person' must be excluded; selecting the column works on every pandas
# version, whereas std(numeric_only=True) only exists from pandas 1.5.
by_comp[['Sales']].std()

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
Facebook,75.660426
Google,56.568542
Microsoft,152.735065


In [35]:
# Per-company minimum of every column. Each column is reduced on its own,
# so a result row is not an original row: Microsoft shows Amy (first name
# alphabetically) next to 124, which is Vanessa's sale.
by_comp.min()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
Facebook,Carl,243
Google,Charlie,120
Microsoft,Amy,124


In [16]:
# Per-company maximum of every column, again column by column:
# Microsoft shows Vanessa next to 340, which is Amy's sale.
by_comp.max()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
Facebook,Sarah,350
Google,Sam,200
Microsoft,Vanessa,340


In [17]:
# Tally the values in each column per company (two per group here).
by_comp.count()

Unnamed: 0_level_0,Person,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
Facebook,2,2
Google,2,2
Microsoft,2,2


In [18]:
# Summary statistics of Sales for each company: count, mean, std, min,
# quartiles and max, laid out under a two-level column header.
by_comp.describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Facebook,2.0,296.5,75.660426,243.0,269.75,296.5,323.25,350.0
Google,2.0,160.0,56.568542,120.0,140.0,160.0,180.0,200.0
Microsoft,2.0,232.0,152.735065,124.0,178.0,232.0,286.0,340.0


In [19]:
# describe() hands back an ordinary DataFrame, so DataFrame methods apply to it.
type(by_comp.describe())

pandas.core.frame.DataFrame

In [20]:
# Flip the summary so companies become columns and statistics become rows.
by_comp.describe().T

Unnamed: 0,Company,Facebook,Google,Microsoft
Sales,count,2.0,2.0,2.0
Sales,mean,296.5,160.0,232.0
Sales,std,75.660426,56.568542,152.735065
Sales,min,243.0,120.0,124.0
Sales,25%,269.75,140.0,178.0
Sales,50%,296.5,160.0,232.0
Sales,75%,323.25,180.0,286.0
Sales,max,350.0,200.0,340.0


In [21]:
# After transposing, each company is a column; pick Google's statistics.
by_comp.describe().T['Google']

Sales  count      2.000000
       mean     160.000000
       std       56.568542
       min      120.000000
       25%      140.000000
       50%      160.000000
       75%      180.000000
       max      200.000000
Name: Google, dtype: float64

In [22]:
# A single column of the transposed summary is a Series.
type(by_comp.describe().T['Google'])

pandas.core.series.Series

In [38]:
# Keep Google's statistics as a Series, then drop the outer 'Sales' level
# to view the statistics indexed by name alone.
google_series = by_comp.describe().T['Google']
google_series['Sales']

count      2.000000
mean     160.000000
std       56.568542
min      120.000000
25%      140.000000
50%      160.000000
75%      180.000000
max      200.000000
Name: Google, dtype: float64

In [36]:
# The Series is indexed by a two-level MultiIndex of (column, statistic) pairs.
google_series.index

MultiIndex([('Sales', 'count'),
            ('Sales',  'mean'),
            ('Sales',   'std'),
            ('Sales',   'min'),
            ('Sales',   '25%'),
            ('Sales',   '50%'),
            ('Sales',   '75%'),
            ('Sales',   'max')],
           )

In [37]:
# The underlying numbers as a NumPy array, in the same order as the index.
google_series.values

array([  2.        , 160.        ,  56.56854249, 120.        ,
       140.        , 160.        , 180.        , 200.        ])

In [39]:
# Two-step lookup: outer level 'Sales' first, then the statistic 'count'.
google_series['Sales']['count']

2.0

In [40]:
# Same value, fetched by label with .loc on the inner Series.
google_series['Sales'].loc['count']

2.0

In [41]:
# Same value, fetched by position: 'count' is the first entry.
google_series['Sales'].iloc[0]

2.0