# Working with Groups in pandas

In [1]:
import pandas as pd

In [2]:
# Read the tips CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='tips.csv')

In [3]:
# Print the column dtypes / non-null counts, then display the leading rows.
df.info()
df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


## Exploring With Aggregations

In [4]:
# Summary statistics for the numeric columns (quartiles spelled out explicitly).
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [5]:
# Mean of the `tip` column over every row.
df.loc[:, 'tip'].mean()

2.99827868852459

In [6]:
# Largest value in the `tip` column.
df.loc[:, 'tip'].max()

10.0

In [7]:
# Column-wise mean of `total_bill` and `tip`; the result is a Series indexed by column name.
df.loc[:, ['total_bill', 'tip']].mean(axis=0)

total_bill    19.785943
tip            2.998279
dtype: float64

## Basic Filtering

In [8]:
# Keep only the rows whose `sex` value is 'Female' (boolean-mask row selection).
df.loc[df['sex'].eq('Female')]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
11,35.26,5.00,Female,No,Sun,Dinner,4
14,14.83,3.02,Female,No,Sun,Dinner,2
16,10.33,1.67,Female,No,Sun,Dinner,3
...,...,...,...,...,...,...,...
226,10.09,2.00,Female,Yes,Fri,Lunch,2
229,22.12,2.88,Female,Yes,Sat,Dinner,2
238,35.83,4.67,Female,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2


In [9]:
# Keep only the rows whose `sex` value is 'Male'.
df.loc[df['sex'].eq('Male')]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.00,Male,No,Sun,Dinner,2
...,...,...,...,...,...,...,...
236,12.60,1.00,Male,Yes,Sat,Dinner,2
237,32.83,1.17,Male,Yes,Sat,Dinner,2
239,29.03,5.92,Male,No,Sat,Dinner,3
241,22.67,2.00,Male,Yes,Sat,Dinner,2


In [10]:
# Keep only the rows whose `smoker` value is 'Yes'.
df.loc[df['smoker'].eq('Yes')]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
56,38.01,3.00,Male,Yes,Sat,Dinner,4
58,11.24,1.76,Male,Yes,Sat,Dinner,2
60,20.29,3.21,Male,Yes,Sat,Dinner,2
61,13.81,2.00,Male,Yes,Sat,Dinner,2
62,11.02,1.98,Male,Yes,Sat,Dinner,2
...,...,...,...,...,...,...,...
234,15.53,3.00,Male,Yes,Sat,Dinner,2
236,12.60,1.00,Male,Yes,Sat,Dinner,2
237,32.83,1.17,Male,Yes,Sat,Dinner,2
240,27.18,2.00,Female,Yes,Sat,Dinner,2


In [11]:
# Keep only the rows whose `size` value equals 4.
# Bracket access is required here: `df.size` is the DataFrame attribute, not the column.
df.loc[df['size'].eq(4)]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
7,26.88,3.12,Male,No,Sun,Dinner,4
11,35.26,5.0,Female,No,Sun,Dinner,4
13,18.43,3.0,Male,No,Sun,Dinner,4
23,39.42,7.58,Male,No,Sat,Dinner,4
25,17.81,2.34,Male,No,Sat,Dinner,4
31,18.35,2.5,Male,No,Sat,Dinner,4
33,20.69,2.45,Female,No,Sat,Dinner,4
44,30.4,5.6,Male,No,Sun,Dinner,4


In [12]:
# `tip` values for the rows where `sex` is 'Female'.
# A single .loc call selects rows and column together, instead of first
# materialising the filtered frame and then indexing the column out of it.
df.loc[df['sex'] == 'Female', 'tip']

0      1.01
4      3.61
11     5.00
14     3.02
16     1.67
       ... 
226    2.00
229    2.88
238    4.67
240    2.00
243    3.00
Name: tip, Length: 87, dtype: float64

In [14]:
# Filter once and reuse the result: the original evaluated the same mask and
# chained selection separately for each statistic.
female_tips = df.loc[df['sex'] == 'Female', 'tip']
print('mean', female_tips.mean())
print('max', female_tips.max())

mean 2.8334482758620685
max 6.5


In [15]:
# Boolean Series: True on the rows whose `sex` value is 'Male'.
male_mask = df['sex'].eq('Male')

In [17]:
# Apply the precomputed `male_mask` once, selecting rows and the `tip` column
# in one .loc call, then reuse the resulting Series for both statistics.
male_tips = df.loc[male_mask, 'tip']
print('mean', male_tips.mean())
print('max', male_tips.max())

mean 3.0896178343949043
max 10.0


## More Efficient: `groupby`

In [18]:
# Grouping by `sex` without an aggregation displays only the DataFrameGroupBy object's repr.
df.groupby('sex')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000016F4BAFF910>

In [19]:
# Selecting `tip` from the grouped frame gives a SeriesGroupBy; still no aggregation applied.
df.groupby(by='sex')['tip']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000016F4BB3D890>