In [2]:
import pandas as pd

### Sales data

In [3]:
# Toy sales table: one row per (weekday, city) pair with units of bread and
# butter sold. Column order follows the keyword order below.
sales = pd.DataFrame(dict(
    weekday=['Sun', 'Sun', 'Mon', 'Mon'],
    city=['Austin', 'Dallas', 'Austin', 'Dallas'],
    bread=[139, 236, 326, 456],
    butter=[20, 45, 70, 98],
))

In [4]:
sales

Unnamed: 0,bread,butter,city,weekday
0,139,20,Austin,Sun
1,236,45,Dallas,Sun
2,326,70,Austin,Mon
3,456,98,Dallas,Mon


### Boolean filter and count

In [5]:
sales.loc[sales['weekday'] == 'Sun'].count() 

bread      2
butter     2
city       2
weekday    2
dtype: int64

### Groupby and count

In [6]:
sales.groupby('weekday').count()

Unnamed: 0_level_0,bread,butter,city
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mon,2,2,2
Sun,2,2,2


### Split-apply-combine

sales.groupby('weekday').count()
- split by 'weekday'
- apply count() function on each group
- combine counts per group

### Aggregation/Reduction

Some reducing functions
- mean()
- std()
- sum()
- first(), last()
- min(), max()

### Groupby and sum

In [7]:
sales.groupby('weekday')['bread'].sum()

weekday
Mon    782
Sun    375
Name: bread, dtype: int64

In [8]:
# Sum every numeric column per weekday. numeric_only=True keeps the text
# column 'city' out of the result: from pandas 2.0 on, non-numeric columns
# are no longer dropped by default, so 'city' would otherwise be aggregated
# as well and show up next to bread/butter.
sales.groupby('weekday').sum(numeric_only=True)

Unnamed: 0_level_0,bread,butter
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,782,168
Sun,375,65


### Groupby and sum: multiple columns

In [9]:
sales.groupby('weekday')[['bread','butter']].sum()

Unnamed: 0_level_0,bread,butter
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,782,168
Sun,375,65


### Groupby and mean: multi-level index

In [10]:
sales.groupby(['city', 'weekday']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,bread,butter
city,weekday,Unnamed: 2_level_1,Unnamed: 3_level_1
Austin,Mon,326,70
Austin,Sun,139,20
Dallas,Mon,456,98
Dallas,Sun,236,45


### Customers

In [11]:
customers = pd.Series(['Dave', 'Alice', 'Bob', 'Alice'])

In [12]:
customers

0     Dave
1    Alice
2      Bob
3    Alice
dtype: object

### Groupby and sum: by series

In [13]:
sales.groupby(customers)['bread'].sum()

Alice    692
Bob      326
Dave     139
Name: bread, dtype: int64

### Categorical data

In [14]:
sales['weekday'].unique()

array(['Sun', 'Mon'], dtype=object)

In [15]:
# Replace the 'weekday' column with a categorical version of itself.
# This modifies `sales` in place, so every later cell that uses
# sales['weekday'] (including the groupby calls) sees the categorical column.
sales['weekday'] = sales['weekday'].astype('category')

In [16]:
sales['weekday']

0    Sun
1    Sun
2    Mon
3    Mon
Name: weekday, dtype: category
Categories (2, object): [Mon, Sun]

### Categorical data
Advantages
- Uses less memory
- Speeds up operations like groupby()

# Groupby and aggregation

### Sales data

In [17]:
sales

Unnamed: 0,bread,butter,city,weekday
0,139,20,Austin,Sun
1,236,45,Dallas,Sun
2,326,70,Austin,Mon
3,456,98,Dallas,Mon


### Review: groupby

In [18]:
sales.groupby('city')[['bread', 'butter']].max()

Unnamed: 0_level_0,bread,butter
city,Unnamed: 1_level_1,Unnamed: 2_level_1
Austin,326,70
Dallas,456,98


### Multiple aggregations

In [19]:
sales.groupby('city')[['bread', 'butter']].agg(['max','sum'])

Unnamed: 0_level_0,bread,bread,butter,butter
Unnamed: 0_level_1,max,sum,max,sum
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Austin,326,465,70,90
Dallas,456,692,98,143


### Aggregation functions
String names accepted by `agg()`:
- 'sum'
- 'mean'
- 'count'

### Custom aggregation

In [20]:
def data_range(series):
    """Return the spread of ``series``: its largest value minus its smallest.

    ``series`` only needs to provide ``max()`` and ``min()``; when used with
    ``groupby(...).agg(data_range)`` it is called once per group and column.
    """
    largest = series.max()
    smallest = series.min()
    return largest - smallest

In [21]:
sales.groupby('weekday')[['bread', 'butter']].agg(data_range)

Unnamed: 0_level_0,bread,butter
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Mon,130,28
Sun,97,25


### Custom aggregation: dictionaries

In [22]:
sales.groupby(customers)[['bread', 'butter']].agg({'bread':'sum', 'butter':data_range})

Unnamed: 0,bread,butter
Alice,692,53
Bob,326,0
Dave,139,0


# Groupby and transformation

### The z-score

In [23]:
def zscore(series):
    """Standardize ``series``: subtract its mean, then divide by its std.

    The scale is whatever ``series.std()`` returns; for a pandas Series that
    is the sample standard deviation (ddof=1) by default. The result keeps
    the index of the input.
    """
    center = series.mean()
    spread = series.std()
    deviations = series - center
    return deviations / spread

### The automobile dataset

In [24]:
# Raw column data for five cars, keyed by column name; each list holds one
# value per car, in the same car order across all keys.
auto = dict(
    mpg=[18.0, 15.0, 18.0, 16.0, 17.0],
    cyl=[8, 8, 8, 8, 8],
    displ=[307.0, 350.0, 318.0, 304.0, 302.0],
    hp=[130, 165, 150, 150, 140],
    weight=[3504, 3693, 3436, 3433, 3449],
    accel=[12.0, 11.5, 11.0, 12.0, 10.5],
    yr=[70, 70, 70, 70, 70],
    origin=['US', 'US', 'US', 'US', 'US'],
    name=[
        'chevrolet chevelle malibu',
        'buick skylark 320',
        'plymouth satellite',
        'amc rebel sst',
        'ford torino',
    ],
)

In [25]:
# Rebind `auto` from the dict of column lists to a DataFrame built from it;
# from here on `auto` refers to the DataFrame, not the dict.
auto = pd.DataFrame(auto)

In [26]:
auto.head()

Unnamed: 0,accel,cyl,displ,hp,mpg,name,origin,weight,yr
0,12.0,8,307.0,130,18.0,chevrolet chevelle malibu,US,3504,70
1,11.5,8,350.0,165,15.0,buick skylark 320,US,3693,70
2,11.0,8,318.0,150,18.0,plymouth satellite,US,3436,70
3,12.0,8,304.0,150,16.0,amc rebel sst,US,3433,70
4,10.5,8,302.0,140,17.0,ford torino,US,3449,70


### MPG z-score

In [27]:
zscore(auto['mpg']).head()

0    0.920358
1   -1.380537
2    0.920358
3   -0.613572
4    0.153393
Name: mpg, dtype: float64

### MPG z-score by year

In [28]:
auto.groupby('yr')['mpg'].transform(zscore).head()

0    0.920358
1   -1.380537
2    0.920358
3   -0.613572
4    0.153393
Name: mpg, dtype: float64

### Apply transformation and aggregation

In [30]:
def zscore_with_year_and_name(group):
    """Build a frame pairing a group's standardized mpg with its year and name.

    ``group`` must provide 'mpg', 'yr' and 'name' columns. The result has the
    columns 'mpg' (z-score computed within ``group`` via ``zscore``), 'year'
    (copied from 'yr') and 'name'.
    """
    columns = {}
    columns['mpg'] = zscore(group['mpg'])
    columns['year'] = group['yr']
    columns['name'] = group['name']
    return pd.DataFrame(columns)

In [31]:
# group_keys=False keeps the original row index on the combined result.
# From pandas 2.0 on, the default (group_keys=True) is honoured even when the
# applied function returns rows indexed like its input, which would prepend
# the 'yr' group key as an extra index level.
auto.groupby('yr', group_keys=False).apply(zscore_with_year_and_name).head()

Unnamed: 0,mpg,name,year
0,0.920358,chevrolet chevelle malibu,70
1,-1.380537,buick skylark 320,70
2,0.920358,plymouth satellite,70
3,-0.613572,amc rebel sst,70
4,0.153393,ford torino,70


# Groupby and filtering

### The automobile dataset

In [32]:
auto.head()

Unnamed: 0,accel,cyl,displ,hp,mpg,name,origin,weight,yr
0,12.0,8,307.0,130,18.0,chevrolet chevelle malibu,US,3504,70
1,11.5,8,350.0,165,15.0,buick skylark 320,US,3693,70
2,11.0,8,318.0,150,18.0,plymouth satellite,US,3436,70
3,12.0,8,304.0,150,16.0,amc rebel sst,US,3433,70
4,10.5,8,302.0,140,17.0,ford torino,US,3449,70


In [33]:
auto.groupby('yr')['mpg'].mean()

yr
70    16.8
Name: mpg, dtype: float64

### groupby object

In [36]:
splitting = auto.groupby('yr')

In [37]:
type(splitting)

pandas.core.groupby.DataFrameGroupBy

In [38]:
type(splitting.groups)

dict

In [39]:
print(splitting.groups.keys())

dict_keys([70])


### groupby object: iteration

In [41]:
# Iterating over the groupby object yields one (group key, sub-DataFrame)
# pair per group.
for group_name, group in splitting:
    avg = group['mpg'].mean()  # mean mpg over the rows of this group
    print(group_name, avg)

70 16.8


### groupby object: iteration and filtering

In [42]:
# Same iteration over the groups, but average mpg only over the rows whose
# name contains 'chevrolet'.
for group_name, group in splitting:
    # The boolean mask picks the rows; 'mpg' picks the column.
    avg = group.loc[group['name'].str.contains('chevrolet'), 'mpg'].mean()
    print(group_name, avg)

70 18.0


### groupby object: comprehension

In [43]:
# Per group of `splitting`: mean mpg of the rows whose name contains
# 'chevrolet', collected in a dict keyed by the group label.
chevy_means = {
    yr: cars.loc[cars['name'].str.contains('chevrolet'), 'mpg'].mean()
    for yr, cars in splitting
}

In [44]:
pd.Series(chevy_means)

70    18.0
dtype: float64

### Boolean groupby

In [45]:
chevy = auto['name'].str.contains('chevrolet')

In [46]:
# Group by the 'yr' column together with the boolean Series `chevy`, giving
# one mean per (year, is-chevrolet) pair. The second index level is labelled
# 'name' in the output because `chevy` was derived from auto['name'].
auto.groupby(['yr', chevy])['mpg'].mean()

yr  name 
70  False    16.5
    True     18.0
Name: mpg, dtype: float64