In [3]:
import pandas as pd
import numpy as np

In [2]:
# The idea of groupby() is to take a dataframe, split it
# into chunks based on some given key valuesm the apply
# computation on those chunks and combine them together
# into another dataframe

# This is called the split-apply-combine pattern

In [6]:
df = pd.read_csv('datasets/census.csv')

# Let's exclude state-level summarizations, which have
# sum level value of 40
df = df[df['SUMLEV'] == 50]

df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
5,50,3,6,1,9,Alabama,Blount County,57322,57322,57373,...,1.807375,-1.177622,-1.748766,-2.062535,-1.36997,1.859511,-0.84858,-1.402476,-1.577232,-0.884411


In [12]:
# Let's get a list of unique states, then iterate over
# all of them and for each of them reduce the dataframe 
# and calculate the average
for state in df['STNAME'].unique():

    # Only interested in data from 2010
    avg = np.average(df['CENSUS2010POP'].where(df['STNAME'] == state).dropna())

    print('Counties in the state of ' + state + ' have and average population of ' + 
    str(avg))

Counties in the state of Alabama have and average population of 71339.34328358209
Counties in the state of Alaska have and average population of 24490.724137931036
Counties in the state of Arizona have and average population of 426134.4666666667
Counties in the state of Arkansas have and average population of 38878.90666666667
Counties in the state of California have and average population of 642309.5862068966
Counties in the state of Colorado have and average population of 78581.1875
Counties in the state of Connecticut have and average population of 446762.125
Counties in the state of Delaware have and average population of 299311.3333333333
Counties in the state of District of Columbia have and average population of 601723.0
Counties in the state of Florida have and average population of 280616.5671641791
Counties in the state of Georgia have and average population of 60928.63522012578
Counties in the state of Hawaii have and average population of 272060.2
Counties in the state of I

In [13]:
# As you can see, this takes a bit of time to finish
# Let's do the same with groupby()

### **Splitting**

In [16]:
# We start by telling Pandas we're interested in grouping
# by state name, this is the 'Splitting'

# The are two values set here, as groupby returns a tuple:
# the first value is the key (in our case the state name)
# we're trying to group by, and the second is a projected 
# dataframe that was found for that group
for group, frame in df.groupby('STNAME'):

    # This the 'Apply' step
    avg = np.average(frame['CENSUS2010POP'])

    print(f'Counties in the state of {group} have an averge population of {str(avg)}')

# We don't have to do the combine step this time, since our 
# data transformation is printing out our results 

Counties in the state of Alabama have an averge population of 71339.34328358209
Counties in the state of Alaska have an averge population of 24490.724137931036
Counties in the state of Arizona have an averge population of 426134.4666666667
Counties in the state of Arkansas have an averge population of 38878.90666666667
Counties in the state of California have an averge population of 642309.5862068966
Counties in the state of Colorado have an averge population of 78581.1875
Counties in the state of Connecticut have an averge population of 446762.125
Counties in the state of Delaware have an averge population of 299311.3333333333
Counties in the state of District of Columbia have an averge population of 601723.0
Counties in the state of Florida have an averge population of 280616.5671641791
Counties in the state of Georgia have an averge population of 60928.63522012578
Counties in the state of Hawaii have an averge population of 272060.2
Counties in the state of Idaho have an averge popu

In [18]:
# That was an improve in velocity by roughly two factors!

In [19]:
# Let's say we only want to work with a third or so of the
# states at a given time

# We can create a function that returns a numbers in the [0-2]
# range based on the first character of the state name 

# Then tell groupby() to use this function to split out dataframe

# To do this, we need to set the index of the dataframe to be the
# column that we want to group by first|

In [43]:
# This function will return 0 if the first character is M, 1 if it's
# Q, and 2 if it is any other capital letter.
#df = df.set_index('STNAME')

def set_batch_number(item):

    if item[0] < 'M':
        return 0
    elif item[0] < 'Q':
        return 1
    return 2

for group, frame in df.groupby(set_batch_number):
    
    print(f'There are {str(len(frame))} records in group {str(group)} for processing')

There are 1177 records in group 0 for processing
There are 1134 records in group 1 for processing
There are 831 records in group 2 for processing


In [40]:
# Since we set the index of the dataframe to be STNAME, and no
# column identifier was passed, groupby() will automatically use
# the index

True

In [47]:
# Let's do another example wih housing data from airbnb and group
# together the columns cancellation_policy and review_scores_value
df = pd.read_csv('datasets/listings.csv')
df.head(3)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,12147973,https://www.airbnb.com/rooms/12147973,20160906204935,2016-09-07,Sunny Bungalow in the City,"Cozy, sunny, family home. Master bedroom high...",The house has an open and cozy feel at the sam...,"Cozy, sunny, family home. Master bedroom high...",none,"Roslindale is quiet, convenient and friendly. ...",...,,f,,,f,moderate,f,f,1,
1,3075044,https://www.airbnb.com/rooms/3075044,20160906204935,2016-09-07,Charming room in pet friendly apt,Charming and quiet room in a second floor 1910...,Small but cozy and quite room with a full size...,Charming and quiet room in a second floor 1910...,none,"The room is in Roslindale, a diverse and prima...",...,9.0,f,,,t,moderate,f,f,1,1.3
2,6976,https://www.airbnb.com/rooms/6976,20160906204935,2016-09-07,Mexican Folk Art Haven in Boston,"Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...",none,The LOCATION: Roslindale is a safe and diverse...,...,10.0,f,,,f,moderate,t,f,1,0.47


In [50]:
# A first approach to grouping them might be to promote them
# to multiindex and call groupby()
df = df.set_index(['cancellation_policy','review_scores_value'])

# When we have multiindexing we need to pass in the levels we
# are interested in grouping by
for group, frame in df.groupby(level = (0,1)):
    print(group)

('flexible', 2.0)
('flexible', 4.0)
('flexible', 5.0)
('flexible', 6.0)
('flexible', 7.0)
('flexible', 8.0)
('flexible', 9.0)
('flexible', 10.0)
('moderate', 2.0)
('moderate', 4.0)
('moderate', 6.0)
('moderate', 7.0)
('moderate', 8.0)
('moderate', 9.0)
('moderate', 10.0)
('strict', 2.0)
('strict', 3.0)
('strict', 4.0)
('strict', 5.0)
('strict', 6.0)
('strict', 7.0)
('strict', 8.0)
('strict', 9.0)
('strict', 10.0)
('super_strict_30', 6.0)
('super_strict_30', 7.0)
('super_strict_30', 8.0)
('super_strict_30', 9.0)
('super_strict_30', 10.0)


In [51]:
# What if we wanted to group by cancellation and review scores
# but separate out all the 10's from those under 10?
# We can use a function tpo manage groupings
def grouping_fun(item):

    # since the score are the second item in the cancellation-score
    # tuple we indicate location with 1
    if item[1] == 10:
        return (item[0], '10.00')
    else:
        return (item[0], 'not 10.00')

for group, frame in df.groupby(grouping_fun):
    print(group)    

('flexible', '10.00')
('flexible', 'not 10.00')
('moderate', '10.00')
('moderate', 'not 10.00')
('strict', '10.00')
('strict', 'not 10.00')
('super_strict_30', '10.00')
('super_strict_30', 'not 10.00')


### **Aggregation**

In [53]:
# So far we've just been printing out statements to show how
# splitting work, but there are actually tree broad categories
# of data processing during the apply stage: Aggregation, tranformation
# of group data, and filtration of group data

In [54]:
# Aggregation uses the agg() method on the groupby() object
# With agg() we can pass a dict of the columns we are interested in
# aggregating along with a function to apply on agg
df = df.reset_index()

In [55]:
# Let's group by cancellation policy and find their avg review scores
df.groupby('cancellation_policy').agg({'review_scores_value': np.average})

Unnamed: 0_level_0,review_scores_value
cancellation_policy,Unnamed: 1_level_1
flexible,
moderate,
strict,
super_strict_30,


In [61]:
# The problem here is that np.avg doesn't ignore NaN's, we can just 
# use np.nanmean() and this will be fixed

# Can also be done like this:
# df.groupby('cancellation_policy')['review_scores_value'].agg([lambda x: np.mean(x)])
df.groupby('cancellation_policy').agg({'review_scores_value': np.nanmean})

Unnamed: 0_level_0,review_scores_value
cancellation_policy,Unnamed: 1_level_1
flexible,9.237421
moderate,9.307398
strict,9.081441
super_strict_30,8.537313


In [64]:
# We can use this to aggregate by multiple columns and functions
df.groupby('cancellation_policy').agg({'review_scores_value': (np.nanmean, np.nanstd),
'reviews_per_month': np.nanmean})

Unnamed: 0_level_0,review_scores_value,review_scores_value,reviews_per_month
Unnamed: 0_level_1,nanmean,nanstd,nanmean
cancellation_policy,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
flexible,9.237421,1.096271,1.82921
moderate,9.307398,0.859859,2.391922
strict,9.081441,1.040531,1.873467
super_strict_30,8.537313,0.840785,0.340143


### ***Transformation***

In [65]:
# Whereas agg() returns a single value per column (i.e one row
# per group), transform() returns an object as big as the group
# This is useful for combinig data later

In [66]:
# Suppose we want to get average ratings groupped by cancellation
# policy, but preserve the og dataframe shape to get the difference
# between individual obersvations and then sum

# define columns
cols = ['cancellation_policy', 'review_scores_value']

# tranform and store in a different dataframe
transform_df = df[cols].groupby('cancellation_policy').transform(np.nanmean)
transform_df.head()

Unnamed: 0,review_scores_value
0,9.307398
1,9.307398
2,9.307398
3,9.307398
4,9.237421


In [67]:
# The index is the same as the og dataframe, so we can merge it to it

# Rename column
transform_df.rename(columns = {'review_scores_value':'mean_review_scores'},inplace = True)
df = df.merge(transform_df, left_index = True, right_index = True)

df.head()

Unnamed: 0,cancellation_policy,review_scores_value,id,listing_url,scrape_id,last_scraped,name,summary,space,description,...,review_scores_location,requires_license,license,jurisdiction_names,instant_bookable,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month,mean_review_scores
0,moderate,,12147973,https://www.airbnb.com/rooms/12147973,20160906204935,2016-09-07,Sunny Bungalow in the City,"Cozy, sunny, family home. Master bedroom high...",The house has an open and cozy feel at the sam...,"Cozy, sunny, family home. Master bedroom high...",...,,f,,,f,f,f,1,,9.307398
1,moderate,9.0,3075044,https://www.airbnb.com/rooms/3075044,20160906204935,2016-09-07,Charming room in pet friendly apt,Charming and quiet room in a second floor 1910...,Small but cozy and quite room with a full size...,Charming and quiet room in a second floor 1910...,...,9.0,f,,,t,f,f,1,1.3,9.307398
2,moderate,10.0,6976,https://www.airbnb.com/rooms/6976,20160906204935,2016-09-07,Mexican Folk Art Haven in Boston,"Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...",...,9.0,f,,,f,t,f,1,0.47,9.307398
3,moderate,10.0,1436513,https://www.airbnb.com/rooms/1436513,20160906204935,2016-09-07,Spacious Sunny Bedroom Suite in Historic Home,Come experience the comforts of home away from...,Most places you find in Boston are small howev...,Come experience the comforts of home away from...,...,10.0,f,,,f,f,f,1,1.0,9.307398
4,flexible,10.0,7651065,https://www.airbnb.com/rooms/7651065,20160906204935,2016-09-07,Come Home to Boston,"My comfy, clean and relaxing home is one block...","Clean, attractive, private room, one block fro...","My comfy, clean and relaxing home is one block...",...,9.0,f,,,f,f,f,1,2.25,9.237421


In [71]:
# Now we can get the difference between a given row and it's
# group (cancellation-policy) mean
df['mean_diff'] = np.absolute(df['review_scores_value'] - df['mean_review_scores'])
df['mean_diff'].head()

0         NaN
1    0.307398
2    0.692602
3    0.692602
4    0.762579
Name: mean_diff, dtype: float64

### ***Filtering***

In [72]:
# The filter() function takes in a function and applies it to
# every group dataframe and returns a True or False, according
# to wether or not in should be included in the results 

In [73]:
# Suposse we only only want those groups with a mean rating > 9
df.groupby('cancellation_policy').filter(lambda x: np.nanmean(x['review_scores_value']) > 9.2)

Unnamed: 0,cancellation_policy,review_scores_value,id,listing_url,scrape_id,last_scraped,name,summary,space,description,...,requires_license,license,jurisdiction_names,instant_bookable,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month,mean_review_scores,mean_diff
0,moderate,,12147973,https://www.airbnb.com/rooms/12147973,20160906204935,2016-09-07,Sunny Bungalow in the City,"Cozy, sunny, family home. Master bedroom high...",The house has an open and cozy feel at the sam...,"Cozy, sunny, family home. Master bedroom high...",...,f,,,f,f,f,1,,9.307398,
1,moderate,9.0,3075044,https://www.airbnb.com/rooms/3075044,20160906204935,2016-09-07,Charming room in pet friendly apt,Charming and quiet room in a second floor 1910...,Small but cozy and quite room with a full size...,Charming and quiet room in a second floor 1910...,...,f,,,t,f,f,1,1.30,9.307398,0.307398
2,moderate,10.0,6976,https://www.airbnb.com/rooms/6976,20160906204935,2016-09-07,Mexican Folk Art Haven in Boston,"Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...",...,f,,,f,t,f,1,0.47,9.307398,0.692602
3,moderate,10.0,1436513,https://www.airbnb.com/rooms/1436513,20160906204935,2016-09-07,Spacious Sunny Bedroom Suite in Historic Home,Come experience the comforts of home away from...,Most places you find in Boston are small howev...,Come experience the comforts of home away from...,...,f,,,f,f,f,1,1.00,9.307398,0.692602
4,flexible,10.0,7651065,https://www.airbnb.com/rooms/7651065,20160906204935,2016-09-07,Come Home to Boston,"My comfy, clean and relaxing home is one block...","Clean, attractive, private room, one block fro...","My comfy, clean and relaxing home is one block...",...,f,,,f,f,f,1,2.25,9.237421,0.762579
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3576,flexible,,14689681,https://www.airbnb.com/rooms/14689681,20160906204935,2016-09-07,Beautiful loft style bedroom with large bathroom,You'd be living on the top floor of a four sto...,,You'd be living on the top floor of a four sto...,...,f,,,f,f,f,1,,9.237421,
3577,flexible,,13750763,https://www.airbnb.com/rooms/13750763,20160906204935,2016-09-07,Comfortable Space in the Heart of Brookline,"Our place is close to Coolidge Corner, Allston...",This space consists of 2 Rooms and a private b...,"Our place is close to Coolidge Corner, Allston...",...,f,,,f,f,f,1,,9.237421,
3579,flexible,,14852179,https://www.airbnb.com/rooms/14852179,20160906204935,2016-09-07,Spacious Queen Bed Room Close to Boston Univer...,- Grocery: A full-size Star market is 2 minute...,,- Grocery: A full-size Star market is 2 minute...,...,f,,,f,f,f,1,,9.237421,
3582,flexible,,14585486,https://www.airbnb.com/rooms/14585486,20160906204935,2016-09-07,Gorgeous funky apartment,Funky little apartment close to public transpo...,Modern and relaxed space with many facilities ...,Funky little apartment close to public transpo...,...,f,,,f,f,f,1,,9.237421,


### ***Applying***

In [74]:
# The apply() functions allows us to apply an arbitrary function to
# each group, stitch the results back for each appply onto a single 
# dataframe with the index preserved

In [75]:
# Get a clean copy of the data
df = pd.read_csv('datasets/listings.csv')

# isolate relevant columns
df = df[['cancellation_policy', 'review_scores_value']]
df.head()

Unnamed: 0,cancellation_policy,review_scores_value
0,moderate,
1,moderate,9.0
2,moderate,10.0
3,moderate,10.0
4,flexible,10.0


In [79]:
# With apply() we can get the average reviews score of a listing and
# its deviation from the group mean like we did with transform(), but
# with less code
def calc_mean_reviews_scores(group):

    # group is just the dataframe of whatever we grouped by (e,g cancellation_policy)
    # se we can treat it as the complete dataframe 
    avg = np.nanmean(group['review_scores_value'])

    # brodcast and create a new group
    group['review_scores_mean'] = np.abs(avg-group['review_scores_value'])

    return group

# apply this function to all the3 groups
df.groupby('cancellation_policy').apply(calc_mean_reviews_scores).head()

Unnamed: 0,cancellation_policy,review_scores_value,review_scores_mean
0,moderate,,
1,moderate,9.0,0.307398
2,moderate,10.0,0.692602
3,moderate,10.0,0.692602
4,flexible,10.0,0.762579
