# 그룹 연산
- 복잡한 데이터를 어떤 기준에 따라 여러 그룹으로 나눠서 관찰하는 것도 좋은 방법이다.
- 특정 기준을 적용하여 몇개의 그룹으로 분할하여 처리하는 과정이 그룹연산이다.
- 1단계 : 분할 -> 데이터를 특정 조건에 의해 분할
- 2단계 : 적용 -> 데이터를 집계, 변환, 필터링
- 3단계 : 결합 -> 2단계 처리결과를 하나로 결합

### 그룹 객체 만들기(분할단계)
#### 1개열을 기준으로 그룹화

In [1]:
import pandas as pd
import seaborn as sns

In [2]:
# Load seaborn's 'titanic' dataset and keep only the columns used below.
titanic = sns.load_dataset('titanic')
selected_columns = ['age', 'sex', 'class', 'fare', 'survived']
df = titanic.loc[:, selected_columns]
df.head()

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
1,38.0,female,First,71.2833,1
2,26.0,female,Third,7.925,1
3,35.0,female,First,53.1,1
4,35.0,male,Third,8.05,0


In [3]:
# Split the rows into groups keyed by the 'class' column.
# A scalar key (rather than a one-element list) keeps every group key a plain
# label such as 'First'; with a list, recent pandas versions yield 1-tuples.
# observed=False states explicitly how unobserved categories are handled in
# case 'class' is a categorical column (the pandas default is changing).
grouped = df.groupby('class', observed=False)
list(grouped)

[('First',
        age     sex  class     fare  survived
  1    38.0  female  First  71.2833         1
  3    35.0  female  First  53.1000         1
  6    54.0    male  First  51.8625         0
  11   58.0  female  First  26.5500         1
  23   28.0    male  First  35.5000         1
  ..    ...     ...    ...      ...       ...
  871  47.0  female  First  52.5542         1
  872  33.0    male  First   5.0000         0
  879  56.0  female  First  83.1583         1
  887  19.0  female  First  30.0000         1
  889  26.0    male  First  30.0000         1
  
  [216 rows x 5 columns]),
 ('Second',
        age     sex   class     fare  survived
  9    14.0  female  Second  30.0708         1
  15   55.0  female  Second  16.0000         1
  17    NaN    male  Second  13.0000         1
  20   35.0    male  Second  26.0000         0
  21   34.0    male  Second  13.0000         1
  ..    ...     ...     ...      ...       ...
  866  27.0  female  Second  13.8583         1
  874  28.0  female

In [4]:
# Iterate over the group object: each step yields the group key and the
# sub-DataFrame holding that group's rows.
for key, group in grouped:
    print('** key : ', key)
    print('** number : ', group.shape[0])  # row count of this group
    print(group.head())

** key :  First
** number :  216
     age     sex  class     fare  survived
1   38.0  female  First  71.2833         1
3   35.0  female  First  53.1000         1
6   54.0    male  First  51.8625         0
11  58.0  female  First  26.5500         1
23  28.0    male  First  35.5000         1
** key :  Second
** number :  184
     age     sex   class     fare  survived
9   14.0  female  Second  30.0708         1
15  55.0  female  Second  16.0000         1
17   NaN    male  Second  13.0000         1
20  35.0    male  Second  26.0000         0
21  34.0    male  Second  13.0000         1
** key :  Third
** number :  491
    age     sex  class     fare  survived
0  22.0    male  Third   7.2500         0
2  26.0  female  Third   7.9250         1
4  35.0    male  Third   8.0500         0
5   NaN    male  Third   8.4583         0
7   2.0    male  Third  21.0750         0


In [5]:
# Apply an aggregation method to the group object.
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises a TypeError on the string column 'sex'.
average = grouped.mean(numeric_only=True)
average

Unnamed: 0_level_0,age,fare,survived
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
First,38.233441,84.154687,0.62963
Second,29.87763,20.662183,0.472826
Third,25.14062,13.67555,0.242363


In [6]:
# Select a single group by its key.
group3 = grouped.get_group('Third')
group3.head(n=5)

Unnamed: 0,age,sex,class,fare,survived
0,22.0,male,Third,7.25,0
2,26.0,female,Third,7.925,1
4,35.0,male,Third,8.05,0
5,,male,Third,8.4583,0
7,2.0,male,Third,21.075,0


In [7]:
# Split the rows by the combination of the 'class' and 'sex' columns;
# each group key is a (class, sex) tuple.
# observed=False states explicitly how unobserved categories are handled in
# case a key column is categorical (the pandas default is changing).
grouped_two = df.groupby(['class', 'sex'], observed=False)
list(grouped_two)

[(('First', 'female'),
        age     sex  class      fare  survived
  1    38.0  female  First   71.2833         1
  3    35.0  female  First   53.1000         1
  11   58.0  female  First   26.5500         1
  31    NaN  female  First  146.5208         1
  52   49.0  female  First   76.7292         1
  ..    ...     ...    ...       ...       ...
  856  45.0  female  First  164.8667         1
  862  48.0  female  First   25.9292         1
  871  47.0  female  First   52.5542         1
  879  56.0  female  First   83.1583         1
  887  19.0  female  First   30.0000         1
  
  [94 rows x 5 columns]),
 (('First', 'male'),
        age   sex  class      fare  survived
  6    54.0  male  First   51.8625         0
  23   28.0  male  First   35.5000         1
  27   19.0  male  First  263.0000         0
  30   40.0  male  First   27.7208         0
  34   28.0  male  First   82.1708         0
  ..    ...   ...    ...       ...       ...
  839   NaN  male  First   29.7000         1
  8

In [8]:
# Iterate over grouped_two: each step yields the (class, sex) key and the
# sub-DataFrame holding that group's rows.
for key, group in grouped_two:
    print('** key : ', key)
    print('** number : ', group.shape[0])  # row count of this group
    print(group.head())

** key :  ('First', 'female')
** number :  94
     age     sex  class      fare  survived
1   38.0  female  First   71.2833         1
3   35.0  female  First   53.1000         1
11  58.0  female  First   26.5500         1
31   NaN  female  First  146.5208         1
52  49.0  female  First   76.7292         1
** key :  ('First', 'male')
** number :  122
     age   sex  class      fare  survived
6   54.0  male  First   51.8625         0
23  28.0  male  First   35.5000         1
27  19.0  male  First  263.0000         0
30  40.0  male  First   27.7208         0
34  28.0  male  First   82.1708         0
** key :  ('Second', 'female')
** number :  76
     age     sex   class     fare  survived
9   14.0  female  Second  30.0708         1
15  55.0  female  Second  16.0000         1
41  27.0  female  Second  21.0000         0
43   3.0  female  Second  41.5792         1
53  29.0  female  Second  26.0000         1
** key :  ('Second', 'male')
** number :  108
     age   sex   class  fare  surviv

In [9]:
# Apply an aggregation method to the grouped_two group object.
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises a TypeError on non-numeric columns.
average_two = grouped_two.mean(numeric_only=True)
average_two

Unnamed: 0_level_0,Unnamed: 1_level_0,age,fare,survived
class,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
First,female,34.611765,106.125798,0.968085
First,male,41.281386,67.226127,0.368852
Second,female,28.722973,21.970121,0.921053
Second,male,30.740707,19.741782,0.157407
Third,female,21.75,16.11881,0.5
Third,male,26.507589,12.661633,0.135447


In [10]:
# Select a single group from grouped_two; the key is a tuple with one value
# per grouping column.
third_female_key = ('Third', 'female')
group3f = grouped_two.get_group(third_female_key)
group3f.head(n=5)

Unnamed: 0,age,sex,class,fare,survived
2,26.0,female,Third,7.925,1
8,27.0,female,Third,11.1333,1
10,4.0,female,Third,16.7,1
14,14.0,female,Third,7.8542,0
18,31.0,female,Third,18.0,0


---
## 그룹 연산 메소드 ( 적용-결합 단계)
### 데이터 집계
: mean(), min(), max(), sum(), count(), size(), var(), std(), describe(), first(), last() 등

In [12]:
# Materialize the group object as a list of (key, DataFrame) pairs for display.
[*grouped]

[('First',
        age     sex  class     fare  survived
  1    38.0  female  First  71.2833         1
  3    35.0  female  First  53.1000         1
  6    54.0    male  First  51.8625         0
  11   58.0  female  First  26.5500         1
  23   28.0    male  First  35.5000         1
  ..    ...     ...    ...      ...       ...
  871  47.0  female  First  52.5542         1
  872  33.0    male  First   5.0000         0
  879  56.0  female  First  83.1583         1
  887  19.0  female  First  30.0000         1
  889  26.0    male  First  30.0000         1
  
  [216 rows x 5 columns]),
 ('Second',
        age     sex   class     fare  survived
  9    14.0  female  Second  30.0708         1
  15   55.0  female  Second  16.0000         1
  17    NaN    male  Second  13.0000         1
  20   35.0    male  Second  26.0000         0
  21   34.0    male  Second  13.0000         1
  ..    ...     ...     ...      ...       ...
  866  27.0  female  Second  13.8583         1
  874  28.0  female

In [13]:
# Compute the per-group standard deviation of every numeric column; the
# result is a DataFrame.
# numeric_only=True is required on pandas >= 2.0, where the string column
# 'sex' would otherwise raise a TypeError.
std_all = grouped.std(numeric_only=True)
std_all

Unnamed: 0_level_0,age,fare,survived
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
First,14.802856,78.380373,0.484026
Second,14.001077,13.417399,0.500623
Third,12.495398,11.778142,0.428949


In [14]:
# Compute the per-group standard deviation of the 'fare' column only; the
# result is a Series.
fare_by_group = grouped['fare']
std_fare = fare_by_group.std()
std_fare

class
First     78.380373
Second    13.417399
Third     11.778142
Name: fare, dtype: float64

In [15]:
# Custom aggregation function to be passed to agg() on a group object.
def min_max(x):
    """Return the difference between the maximum and minimum of x."""
    highest = x.max()
    lowest = x.min()
    return highest - lowest

In [16]:
# Apply the custom min_max function to each group.
# Only the numeric columns are passed: min_max subtracts values, which is
# undefined for the string column 'sex' and raises on pandas >= 2.0.
agg_minmax = grouped[['age', 'fare', 'survived']].agg(min_max)
agg_minmax

Unnamed: 0_level_0,age,fare,survived
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
First,79.08,512.3292,1
Second,69.33,73.5,1
Third,73.58,69.55,1


In [17]:
# Apply several aggregation functions to every column at once; the result
# has two-level columns of (original column, function name).
summary_functions = ['min', 'max']
agg_all = grouped.aggregate(summary_functions)
agg_all.head()

Unnamed: 0_level_0,age,age,sex,sex,fare,fare,survived,survived
Unnamed: 0_level_1,min,max,min,max,min,max,min,max
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
First,0.92,80.0,female,male,0.0,512.3292,0,1
Second,0.67,70.0,female,male,0.0,73.5,0,1
Third,0.42,74.0,female,male,0.0,69.55,0,1
