In [3]:
import pandas as pd

In [4]:
## import dataset from a raw github csv file
## data description: https://rdrr.io/cran/reshape2/man/tips.html

df = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/tips.csv')

In [5]:
## print first ten rows of the df
df.head(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.0,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


In [6]:
## showing data types and null values (info)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


#**Exploring with Aggregations**


In [7]:
## using 'describe' to show some statistics about the dataset (numeric features only)
df.describe()

## we can observe that the total_bill is a bit more disperse than the tip (being at 1.38 std)
## but it is consistent with the mean on 'tips' is aproximately 10% of the mean of 'total_bill'
## the mean size of the party is 2.569 so mostly close to 3 people per party but the median shows it's 2 people per party.

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [8]:
## checking the mmean of the tip feature
df['tip'].mean()

np.float64(2.99827868852459)

In [12]:
## checking the max value on the tip feature
df['tip'].max()

10.0

In [15]:
## checking the means of both the 'total_bill' and 'tip' features
df[['total_bill','tip']].mean()

Unnamed: 0,0
total_bill,19.785943
tip,2.998279


# **Basic filtering**

Now imagine we want to see how the male and female groups differ?
*Let's check!*

In [18]:
## we can use a basic mask! let's say:
df[df['sex'] == 'Female']
## and with this, we show the df but only for female sex feature

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
11,35.26,5.00,Female,No,Sun,Dinner,4
14,14.83,3.02,Female,No,Sun,Dinner,2
16,10.33,1.67,Female,No,Sun,Dinner,3
...,...,...,...,...,...,...,...
226,10.09,2.00,Female,Yes,Fri,Lunch,2
229,22.12,2.88,Female,Yes,Sat,Dinner,2
238,35.83,4.67,Female,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2


In [20]:
## now we use the mask again and show only the 'tip' feature
## this means we can only see on the output the tips given by the women bill payers!
df[df['sex'] == 'Female']['tip']

Unnamed: 0,tip
0,1.01
4,3.61
11,5.00
14,3.02
16,1.67
...,...
226,2.00
229,2.88
238,4.67
240,2.00


In [27]:
## if we put this together, we can use aggregation on that full filtered for the women bill payers
print('Mean tip of women bill payers:', df[df['sex'] == 'Female']['tip'].mean().round(2))
print('Max tip of women bill payers:', df[df['sex'] ==  'Female']['tip'].max())

Mean tip of women bill payers: 2.83
Max tip of women bill payers: 6.5


In [33]:
## now we can similarly do the same thing for the men bill payers
## we create a mask:

male_mask = df['sex'] == 'Male'

In [32]:
print('Mean tip of men bill payers:', df[male_mask]['tip'].mean().round(2))
print('Max tip of men bill payers:', df[male_mask]['tip'].max())

Mean tip of men bill payers: 3.09
Max tip of men bill payers: 10.0


#**More efficient: groupby in Pandas**
### We did it manually because we only had 2 groups: men and women.
### Imagine if we had more groups, then we can use the **'groupby'** function.

In [36]:
df.groupby(by='sex')

## this line just splits the data forming a grouby object!

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7beda274e450>

In [9]:
df.groupby(by='sex')['tip']

### this create a 'series groupby' object!

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7e7d41ab5040>

In [43]:
## we dont necesarily need to use the 'by = ' inside the method since the first parameter we use is the 'by'
## and we check for the mean of the groups using the aggregation method 'mean()'
df.groupby('sex')['tip'].mean()

## so, as discussed before, we note that the mean tip given by women bill payers is $2.833
## and the mean tip given by men bill payers is $3.089

Unnamed: 0_level_0,tip
sex,Unnamed: 1_level_1
Female,2.833448
Male,3.089618


In [10]:
## now we put the split by sex data on a variable:
group_by_sex = df.groupby('sex')

In [47]:
## so we now use the split data:
group_by_sex['tip'].mean()

Unnamed: 0_level_0,tip
sex,Unnamed: 1_level_1
Female,2.833448
Male,3.089618


In [53]:
group_by_sex['tip'].max()

Unnamed: 0_level_0,tip
sex,Unnamed: 1_level_1
Female,6.5
Male,10.0


In [56]:
## check how many values we have from each group?
group_by_sex.count()

## we have 87 women and 157 men

Unnamed: 0_level_0,total_bill,tip,smoker,day,time,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,87,87,87,87,87,87
Male,157,157,157,157,157,157


In [55]:
## for multiple features or columns.
## we now check the total_bill and tip features' means for both groups:
group_by_sex[['total_bill','tip']].mean().round(2)

Unnamed: 0_level_0,total_bill,tip
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,18.06,2.83
Male,20.74,3.09


#**Multiple aggregations**

In [17]:
## using the groupby object and selecting a feature, in this case 'tip', and use the .agg method with a list of aggregation functions:
group_by_sex['tip'].agg(['mean', min, max, 'count'])

  group_by_sex['tip'].agg(['mean', min, max, 'count'])
  group_by_sex['tip'].agg(['mean', min, max, 'count'])


Unnamed: 0_level_0,mean,min,max,count
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,2.833448,1.0,6.5,87
Male,3.089618,1.0,10.0,157


In [18]:
## now if we'd want to do it with multiple features:

group_by_sex[['total_bill', 'tip']].agg(['mean',max,'count'])

  group_by_sex[['total_bill', 'tip']].agg(['mean',max,'count'])


Unnamed: 0_level_0,total_bill,total_bill,total_bill,tip,tip,tip
Unnamed: 0_level_1,mean,max,count,mean,max,count
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Female,18.056897,44.3,87,2.833448,6.5,87
Male,20.744076,50.81,157,3.089618,10.0,157


In [19]:
## so it looks crowdy visually, it not looks good to explore
## we can use the T function to transpose it
## which looks way better and more readable

group_by_sex[['total_bill','tip']].agg(['mean', max, 'count']).T

  group_by_sex[['total_bill','tip']].agg(['mean', max, 'count']).T


Unnamed: 0,sex,Female,Male
total_bill,mean,18.056897,20.744076
total_bill,max,44.3,50.81
total_bill,count,87.0,157.0
tip,mean,2.833448,3.089618
tip,max,6.5,10.0
tip,count,87.0,157.0


In [21]:
### and if we want to check the whole basic statistics summary
### we can use the describe method

group_by_sex[['total_bill','tip']].describe()

## then we could transpose it to make it look better
group_by_sex[['total_bill','tip']].describe().T

Unnamed: 0,sex,Female,Male
total_bill,count,87.0,157.0
total_bill,mean,18.056897,20.744076
total_bill,std,8.009209,9.246469
total_bill,min,3.07,7.25
total_bill,25%,12.75,14.0
total_bill,50%,16.4,18.35
total_bill,75%,21.52,24.71
total_bill,max,44.3,50.81
tip,count,87.0,157.0
tip,mean,2.833448,3.089618
