## 데이터프레임 - 그룹분석

In [11]:
import numpy as np
import pandas as pd
import seaborn as sns

In [31]:
# Load two of seaborn's example datasets by name; they are the working
# DataFrames for every cell that follows.
iris = sns.load_dataset('iris')
tips = sns.load_dataset('tips')

#### iris 데이터 사례

In [32]:
# Preview the last rows of the iris table to check its columns and values.
iris.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


- 각 품종별 feature의 평균

In [33]:
# Mean of every feature column within each species group.
iris.groupby('species').mean()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.006,3.428,1.462,0.246
versicolor,5.936,2.77,4.26,1.326
virginica,6.588,2.974,5.552,2.026


In [34]:
# First entry of each feature column within each species group.
iris.groupby('species').first()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.1,3.5,1.4,0.2
versicolor,7.0,3.2,4.7,1.4
virginica,6.3,3.3,6.0,2.5


- 각 품종별 sepal_length의 표준편차

In [35]:
# std() is computed for every feature column first; indexing the result
# with a single label then returns sepal_length as a Series.
iris.groupby('species').std()['sepal_length']

species
setosa        0.352490
versicolor    0.516171
virginica     0.635880
Name: sepal_length, dtype: float64

In [36]:
# std() is computed for every feature column first; indexing with a list
# of labels then keeps sepal_length as a one-column DataFrame.
iris.groupby('species').std()[['sepal_length']]

Unnamed: 0_level_0,sepal_length
species,Unnamed: 1_level_1
setosa,0.35249
versicolor,0.516171
virginica,0.63588


In [37]:
# Select the column before aggregating, so std() is computed only for
# sepal_length; the list selector keeps the result a DataFrame.
iris.groupby('species')[['sepal_length']].std()

Unnamed: 0_level_0,sepal_length
species,Unnamed: 1_level_1
setosa,0.35249
versicolor,0.516171
virginica,0.63588


- 그룹 연산을 여러 가지 하는 경우
    - 각 품종별 sepal_length의 평균, 표준편차, 최소, 최대

In [38]:
# Several aggregations at once: select the column, then hand agg() a list
# of statistic names; each name becomes one column of the result.
(
    iris
    .groupby('species')['sepal_length']
    .agg(['mean', 'std', 'min', 'max'])
)

Unnamed: 0_level_0,mean,std,min,max
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.006,0.35249,4.3,5.8
versicolor,5.936,0.516171,4.9,7.0
virginica,6.588,0.63588,4.9,7.9


#### tips 데이터 사례

In [39]:
# Preview the first rows of the tips table.
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [42]:
# Tip as a percentage of the total bill, rounded to two decimals.
# The new column's name contains a dot, so it can only be reached with
# bracket access (tips['tip.pct']), never as an attribute.
tips['tip.pct'] = (
    tips['tip']
    .div(tips['total_bill'])
    .mul(100)
    .round(2)
)
tips.tail()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip.pct
239,29.03,5.92,Male,No,Sat,Dinner,3,20.39
240,27.18,2.0,Female,Yes,Sat,Dinner,2,7.36
241,22.67,2.0,Male,Yes,Sat,Dinner,2,8.82
242,17.82,1.75,Male,No,Sat,Dinner,2,9.82
243,18.78,3.0,Female,No,Thur,Dinner,2,15.97


In [43]:
# Summary statistics (count, mean, std, min, quartiles, max) of the
# numeric columns, including the derived tip.pct column.
tips.describe()

Unnamed: 0,total_bill,tip,size,tip.pct
count,244.0,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672,16.079754
std,8.902412,1.383638,0.9511,6.10702
min,3.07,1.0,1.0,3.56
25%,13.3475,2.0,2.0,12.91
50%,17.795,2.9,2.0,15.475
75%,24.1275,3.5625,3.0,19.1475
max,50.81,10.0,6.0,71.03


- 성별 데이터 개수

In [44]:
# Per-column value counts within each sex group.
# 'sex' is a categorical column (note the Male/Female, non-alphabetical
# order of the result), so observed=False is passed explicitly: it keeps
# the established behaviour and avoids the FutureWarning that newer pandas
# emits when the argument is omitted for categorical keys.
tips.groupby('sex', observed=False).count()

Unnamed: 0_level_0,total_bill,tip,smoker,day,time,size,tip.pct
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Male,157,157,157,157,157,157,157
Female,87,87,87,87,87,87,87


In [45]:
# Number of rows in each sex group, returned as a single Series.
# observed=False is explicit because 'sex' is categorical; omitting it
# triggers a FutureWarning on newer pandas.
tips.groupby('sex', observed=False).size()

sex
Male      157
Female     87
dtype: int64

- 성별, 흡연 유무별 데이터 개수

In [46]:
# Number of rows for every (sex, smoker) combination; the result is a
# Series with a two-level index.
# observed=False is explicit because both keys are categorical; omitting
# it triggers a FutureWarning on newer pandas.
tips.groupby(['sex', 'smoker'], observed=False).size()

sex     smoker
Male    Yes       60
        No        97
Female  Yes       33
        No        54
dtype: int64

In [47]:
# Number of male smokers: index the two-level result with a
# (sex, smoker) tuple to pick a single group.
# observed=False is explicit because both keys are categorical; omitting
# it triggers a FutureWarning on newer pandas.
tips.groupby(['sex', 'smoker'], observed=False).size()[('Male', 'Yes')]

60

In [48]:
# Number of male smokers, looked up one index level at a time:
# ['Male'] narrows to the smoker-indexed sub-Series, ['Yes'] picks the value.
# observed=False is explicit because both keys are categorical; omitting
# it triggers a FutureWarning on newer pandas.
tips.groupby(['sex', 'smoker'], observed=False).size()['Male']['Yes']

60

- 성별 팁 비율의 평균, 최소, 최대

In [50]:
# Mean, minimum and maximum tip percentage within each sex group.
# observed=False is explicit because 'sex' is categorical; omitting it
# triggers a FutureWarning on newer pandas.
tips.groupby('sex', observed=False)['tip.pct'].agg(['mean', 'min', 'max'])

Unnamed: 0_level_0,mean,min,max
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,15.764713,3.56,71.03
Female,16.648276,5.64,41.67


- 성별, 흡연유무별 팁 비율의 평균

In [52]:
# Mean tip percentage for every (sex, smoker) combination.
# observed=False is explicit because both keys are categorical; omitting
# it triggers a FutureWarning on newer pandas.
tips.groupby(['sex', 'smoker'], observed=False)['tip.pct'].mean()

sex     smoker
Male    Yes       15.276667
        No        16.066598
Female  Yes       18.214545
        No        15.691111
Name: tip.pct, dtype: float64