In [7]:
import pandas as pd
import seaborn as sns

# Load the titanic example dataset through seaborn, then print the
# per-column non-null counts and dtypes.
df = sns.load_dataset(name='titanic')
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [5]:
# Compare titanic survival between sexes: mean of `survived` per `sex` group.
(
    df
    .groupby('sex')['survived']
    .mean()
)

sex
female    0.742038
male      0.188908
Name: survived, dtype: float64

In [6]:
# Compare titanic survival by sex and passenger class.
# `class` is a categorical column, so `observed` is passed explicitly:
# relying on the groupby default for categorical keys is deprecated in
# recent pandas and emits a FutureWarning. observed=False keeps the
# current behavior (every category combination is listed).
df.groupby(['sex', 'class'], observed=False)['survived'].mean()

  df.groupby(['sex','class'])['survived'].mean()


sex     class 
female  First     0.968085
        Second    0.921053
        Third     0.500000
male    First     0.368852
        Second    0.157407
        Third     0.135447
Name: survived, dtype: float64

In [8]:
# Use agg to apply several aggregations at once: the mean and the count of
# `survived` are returned side by side for each (sex, class) group.
# `class` is categorical, so observed=False is stated explicitly instead of
# relying on the deprecated groupby default (avoids the FutureWarning).
df.groupby(['sex', 'class'], observed=False)['survived'].agg(['mean', 'count'])

  df.groupby(['sex','class'])['survived'].agg(['mean','count']) # mean과 count를 동시에 보여줌


Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count
sex,class,Unnamed: 2_level_1,Unnamed: 3_level_1
female,First,0.968085,94
female,Second,0.921053,76
female,Third,0.5,144
male,First,0.368852,122
male,Second,0.157407,108
male,Third,0.135447,347


In [9]:
# Use agg with a dict to apply a different aggregation per column:
# `survived` is averaged while `age` is reduced with max.
# `class` is categorical, so observed=False is stated explicitly instead of
# relying on the deprecated groupby default (avoids the FutureWarning).
df.groupby(['sex', 'class'], observed=False)[['survived', 'age']].agg(
    {'survived': 'mean', 'age': 'max'}
)

  df.groupby(['sex','class'])[['survived','age']].agg({'survived':'mean', 'age':'max'})


Unnamed: 0_level_0,Unnamed: 1_level_0,survived,age
sex,class,Unnamed: 2_level_1,Unnamed: 3_level_1
female,First,0.968085,63.0
female,Second,0.921053,57.0
female,Third,0.5,63.0
male,First,0.368852,80.0
male,Second,0.157407,70.0
male,Third,0.135447,74.0


In [11]:
import numpy as np
# User-defined function meant to be passed to groupby(...).apply.
def get_IQR(data, scale=1.5):
    """Return the interquartile range of ``data`` multiplied by ``scale``.

    Despite the name, the default result is ``1.5 * IQR`` rather than the
    bare IQR; pass ``scale=1`` to get the IQR itself.

    Parameters
    ----------
    data
        Any object exposing ``quantile(q)``, e.g. a pandas Series.
    scale : float, default 1.5
        Factor applied to the interquartile range. The default reproduces
        the previously hard-coded multiplier.

    Returns
    -------
    ``(Q3 - Q1) * scale`` where Q3 and Q1 are the 0.75 and 0.25 quantiles.
    """
    q3 = data.quantile(.75)   # third quartile
    q1 = data.quantile(.25)   # first quartile
    # The 0.75 quantile is never below the 0.25 quantile, so the difference
    # is already non-negative and needs no absolute value.
    return (q3 - q1) * scale

# Apply the user-defined get_IQR to the ages of each (sex, class) group.
# `class` is categorical, so observed=False is stated explicitly instead of
# relying on the deprecated groupby default (avoids the FutureWarning).
df.groupby(['sex', 'class'], observed=False)['age'].apply(get_IQR)

  df.groupby(['sex','class'])['age'].apply(get_IQR)


sex     class 
female  First     31.5000
        Second    20.6250
        Third     23.4375
male    First     31.5000
        Second    20.6250
        Third     19.5000
Name: age, dtype: float64

In [12]:
# Load the penguins dataset as an additional example (rebinds `df`).
df = sns.load_dataset(name='penguins')

# Count the missing values in each column.
df.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [13]:
# Compare the per-species mean of each body measurement in the penguins data.
df.groupby('species')[
    ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
].mean()

Unnamed: 0_level_0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Adelie,38.791391,18.346358,189.953642,3700.662252
Chinstrap,48.833824,18.420588,195.823529,3733.088235
Gentoo,47.504878,14.982114,217.186992,5076.01626


In [14]:
# Replace missing measurements with the mean of the same species, using a
# lambda passed to apply. The result is only displayed; `df` itself is not
# modified because nothing is assigned back.
df.groupby('species')[
    ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
].apply(lambda grp: grp.fillna(grp.mean()))

Unnamed: 0_level_0,Unnamed: 1_level_0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Adelie,0,39.100000,18.700000,181.000000,3750.000000
Adelie,1,39.500000,17.400000,186.000000,3800.000000
Adelie,2,40.300000,18.000000,195.000000,3250.000000
Adelie,3,38.791391,18.346358,189.953642,3700.662252
Adelie,4,36.700000,19.300000,193.000000,3450.000000
...,...,...,...,...,...
Gentoo,339,47.504878,14.982114,217.186992,5076.016260
Gentoo,340,46.800000,14.300000,215.000000,4850.000000
Gentoo,341,50.400000,15.700000,222.000000,5750.000000
Gentoo,342,45.200000,14.800000,212.000000,5200.000000


In [15]:
# Small toy frame used below to show the kinds of objects groupby accepts
# as its grouping argument (rebinds `df`).
df = pd.DataFrame({
    'group': list('AAABB'),
    'value': [1] * 3 + [10] * 2,
})
df

Unnamed: 0,group,value
0,A,1
1,A,1
2,A,1
3,B,10
4,B,10


In [20]:
# Grouping argument as a plain list: one label per row, so rows sharing a
# label are summed together.
(
    df
    .groupby([0, 0, 1, 1, 2])['value']
    .sum()
)

0     2
1    11
2    10
Name: value, dtype: int64

In [19]:
# Grouping argument as a pandas Series: the boolean values act as the
# group labels for the rows of `df`.
s = pd.Series([False] * 3 + [True] * 2)
(
    df
    .groupby(s)['value']
    .sum()
)

False     3
True     20
Name: value, dtype: int64