In [1]:
import pandas as pd
import numpy as np
import seaborn as sns  # dataset load 용도

In [2]:
# Fetch the Titanic passenger table through seaborn's sample-dataset loader.
titanic = sns.load_dataset("titanic")

In [4]:
# Keep only the five columns that the groupby exercises work with.
selected_columns = ["age", "sex", "class", "fare", "survived"]
df = titanic.loc[:, selected_columns]

In [5]:
# Show how many passengers were loaded, followed by a plain-text preview
# of the first rows.
preview = df.head()
print(f"승객수 : {len(df)}")
print(preview)

승객수 : 891
    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
1  38.0  female  First  71.2833         1
2  26.0  female  Third   7.9250         1
3  35.0  female  First  53.1000         1
4  35.0    male  Third   8.0500         0


# groupby 실습 

In [8]:
# Build a GroupBy object keyed on the `class` column.
# The key is passed as a plain string rather than a one-element list: with
# ["class"], pandas >= 2.0 yields 1-tuples such as ("First",) when iterating
# and newer releases expect a tuple in get_group(), which breaks the
# scalar-key usage ("Third") elsewhere in this notebook.
# observed=False spells out the long-standing default for categorical keys and
# avoids the FutureWarning about that default changing.
# NOTE(review): `class` is presumably a categorical column here - confirm with
# df.dtypes; the argument is ignored for non-categorical keys.
grouped = df.groupby("class", observed=False)
grouped.head()  # first 5 rows of every group

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
1,38.0,female,First,71.2833,1
2,26.0,female,Third,7.925,1
3,35.0,female,First,53.1,1
4,35.0,male,Third,8.05,0
5,,male,Third,8.4583,0
6,54.0,male,First,51.8625,0
7,2.0,male,Third,21.075,0
9,14.0,female,Second,30.0708,1
11,58.0,female,First,26.55,1


In [9]:
# Per-class totals of the numeric columns (age, fare, survived).
# numeric_only=True is needed on pandas >= 2.0, where the default no longer
# drops non-numeric columns and the string column `sex` would otherwise be
# pulled into the aggregation.
grouped.sum(numeric_only=True)

Unnamed: 0_level_0,age,fare,survived
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
First,7111.42,18177.4125,136
Second,5168.83,3801.8417,87
Third,8924.92,6714.6951,119


## 그룹 객체를 iteration 돌면서 출력

In [11]:
# Iterating a GroupBy yields (key, sub-DataFrame) pairs; for this grouping
# the keys are First, Second and Third.
for key, group in grouped:
    member_count = len(group)
    group_preview = group.head()
    print("* key : ", key)
    print("* number : ", member_count)
    print(group_preview)
    print()

* key :  First
* number :  216
     age     sex  class     fare  survived
1   38.0  female  First  71.2833         1
3   35.0  female  First  53.1000         1
6   54.0    male  First  51.8625         0
11  58.0  female  First  26.5500         1
23  28.0    male  First  35.5000         1

* key :  Second
* number :  184
     age     sex   class     fare  survived
9   14.0  female  Second  30.0708         1
15  55.0  female  Second  16.0000         1
17   NaN    male  Second  13.0000         1
20  35.0    male  Second  26.0000         0
21  34.0    male  Second  13.0000         1

* key :  Third
* number :  491
    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
2  26.0  female  Third   7.9250         1
4  35.0    male  Third   8.0500         0
5   NaN    male  Third   8.4583         0
7   2.0    male  Third  21.0750         0



# 연산 메서드 적용
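- 연산 메서드 사용시 연산이 가능한 열에 대해서만 선택적으로 연산을 수행 (단, pandas 2.0 이상에서는 문자열 열이 자동으로 제외되지 않으므로 `numeric_only=True`를 명시해야 하며, 그렇지 않으면 `mean()` 등에서 `TypeError`가 발생함)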

In [12]:
# Mean of the numeric columns for each class; the string column `sex` is
# excluded and `class` is the grouping key.
# numeric_only=True makes that exclusion explicit: pandas >= 2.0 no longer
# drops non-numeric columns silently and raises TypeError without it.
average = grouped.mean(numeric_only=True)
average  # result: First class has the highest mean age and the highest survival rate (~63%)

Unnamed: 0_level_0,age,fare,survived
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
First,38.233441,84.154687,0.62963
Second,29.87763,20.662183,0.472826
Third,25.14062,13.67555,0.242363


In [13]:
# Pull out a single group by its key.
third_class_key = "Third"
group3 = grouped.get_group(third_class_key)
group3.head()

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
2,26.0,female,Third,7.925,1
4,35.0,male,Third,8.05,0
5,,male,Third,8.4583,0
7,2.0,male,Third,21.075,0


In [15]:
# Group on two keys at once; every (class, sex) pair becomes its own group.
# observed=False states the long-standing default explicitly so that newer
# pandas does not emit a FutureWarning about the default changing when a
# grouping key is categorical.
# NOTE(review): `class` is presumably categorical here - confirm with
# df.dtypes; the argument is ignored for non-categorical keys.
grouped_two = df.groupby(["class", "sex"], observed=False)
grouped_two.head()  # first 5 rows of every (class, sex) group

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
1,38.0,female,First,71.2833,1
2,26.0,female,Third,7.925,1
3,35.0,female,First,53.1,1
4,35.0,male,Third,8.05,0
5,,male,Third,8.4583,0
6,54.0,male,First,51.8625,0
7,2.0,male,Third,21.075,0
8,27.0,female,Third,11.1333,1
9,14.0,female,Second,30.0708,1


In [16]:
# With two grouping keys, each key is a (class, sex) tuple.
for key, group in grouped_two:
    member_count = len(group)
    group_preview = group.head()
    print("* key : ", key)
    print("* number : ", member_count)
    print(group_preview)
    print()

* key :  ('First', 'female')
* number :  94
     age     sex  class      fare  survived
1   38.0  female  First   71.2833         1
3   35.0  female  First   53.1000         1
11  58.0  female  First   26.5500         1
31   NaN  female  First  146.5208         1
52  49.0  female  First   76.7292         1

* key :  ('First', 'male')
* number :  122
     age   sex  class      fare  survived
6   54.0  male  First   51.8625         0
23  28.0  male  First   35.5000         1
27  19.0  male  First  263.0000         0
30  40.0  male  First   27.7208         0
34  28.0  male  First   82.1708         0

* key :  ('Second', 'female')
* number :  76
     age     sex   class     fare  survived
9   14.0  female  Second  30.0708         1
15  55.0  female  Second  16.0000         1
41  27.0  female  Second  21.0000         0
43   3.0  female  Second  41.5792         1
53  29.0  female  Second  26.0000         1

* key :  ('Second', 'male')
* number :  108
     age   sex   class  fare  survived
17

In [17]:
# Aggregate grouped_two with mean(); the result is indexed by a
# (class, sex) MultiIndex.
class_sex_means = grouped_two.mean()
class_sex_means

Unnamed: 0_level_0,Unnamed: 1_level_0,age,fare,survived
class,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
First,female,34.611765,106.125798,0.968085
First,male,41.281386,67.226127,0.368852
Second,female,28.722973,21.970121,0.921053
Second,male,30.740707,19.741782,0.157407
Third,female,21.75,16.11881,0.5
Third,male,26.507589,12.661633,0.135447


## 멀티 인덱스 형태로 되어있는 그룹에서 개별 그룹을 가지고 오고 싶을 때
### => `get_group()`에 그룹 키를 튜플 형태로 지정하면 됨 (예: `("Third", "female")`)

In [20]:
# Fetch the third-class female passengers: the group key is the tuple
# ("Third", "female").
third_female_key = ("Third", "female")
group3_female = grouped_two.get_group(third_female_key)
group3_female.head()

Unnamed: 0,age,sex,class,fare,survived
2,26.0,female,Third,7.925,1
8,27.0,female,Third,11.1333,1
10,4.0,female,Third,16.7,1
14,14.0,female,Third,7.8542,0
18,31.0,female,Third,18.0,0


# Filtering
ex) 나이 평균이 30보다 작은 그룹만 필터링해서 df로 반환

In [21]:
# Recompute the per-class means of the numeric columns for reference.
# numeric_only=True skips the string column `sex`; pandas >= 2.0 no longer
# drops non-numeric columns silently and raises TypeError without it.
average = grouped.mean(numeric_only=True)
average

Unnamed: 0_level_0,age,fare,survived
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
First,38.233441,84.154687,0.62963
Second,29.87763,20.662183,0.472826
Third,25.14062,13.67555,0.242363


In [22]:
# Keep only the rows of groups (classes) whose mean age is below 30;
# here those are the Second and Third classes.
age_filter = grouped.filter(lambda rows: rows["age"].mean() < 30)
age_filter

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.2500,0
2,26.0,female,Third,7.9250,1
4,35.0,male,Third,8.0500,0
5,,male,Third,8.4583,0
7,2.0,male,Third,21.0750,0
...,...,...,...,...,...
884,25.0,male,Third,7.0500,0
885,39.0,female,Third,29.1250,0
886,27.0,male,Second,13.0000,0
888,,female,Third,23.4500,0
