In [39]:
import pandas as pd 
import seaborn as sns 

# Load the titanic dataset bundled with seaborn
titanic = sns.load_dataset('titanic')
# DataFrame.info() prints the summary itself and returns None, so it is not
# wrapped in print() (that would emit an extra "None" line after the summary).
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
survived       891 non-null int64
pclass         891 non-null int64
sex            891 non-null object
age            714 non-null float64
sibsp          891 non-null int64
parch          891 non-null int64
fare           891 non-null float64
embarked       889 non-null object
class          891 non-null category
who            891 non-null object
adult_male     891 non-null bool
deck           203 non-null category
embark_town    889 non-null object
alive          891 non-null object
alone          891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB
None


In [40]:
# Group the rows by the 'class' column
grouped=titanic.groupby('class')
print(grouped) 
# Output: only the repr of the groupby object is printed, not the grouped data.

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x12abfcf90>


In [41]:
# Access the grouped data: each iteration yields a group key and that group's rows
for key, data in grouped : 
    print(key,'\n')
    print(data.head(2),'\n')
# Output: one section per group - First, Second, Third.


First 

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
1         1       1  female  38.0      1      0  71.2833        C  First   
3         1       1  female  35.0      1      0  53.1000        S  First   

     who  adult_male deck  embark_town alive  alone  
1  woman       False    C    Cherbourg   yes  False  
3  woman       False    C  Southampton   yes  False   

Second 

    survived  pclass     sex   age  sibsp  parch     fare embarked   class  \
9          1       2  female  14.0      1      0  30.0708        C  Second   
15         1       2  female  55.0      0      0  16.0000        S  Second   

      who  adult_male deck  embark_town alive  alone  
9   child       False  NaN    Cherbourg   yes  False  
15  woman       False  NaN  Southampton   yes   True   

Third 

   survived  pclass     sex   age  sibsp  parch   fare embarked  class    who  \
0         0       3    male  22.0      1      0  7.250        S  Third    man   
2         1       

In [42]:
# Fetch only the rows that belong to the 'Third' group
third = grouped.get_group('Third')
print(third['class'])

0      Third
2      Third
4      Third
5      Third
7      Third
       ...  
882    Third
884    Third
885    Third
888    Third
890    Third
Name: class, Length: 491, dtype: category
Categories (3, object): [First, Second, Third]


In [43]:
# Apply an aggregation function to every group.
# numeric_only=True limits the mean to numeric/boolean columns; pandas >= 2.0
# raises a TypeError on text columns such as 'sex' instead of silently
# dropping them.
print(grouped.mean(numeric_only=True))

survived  pclass        age     sibsp     parch       fare  \
class                                                                
First   0.629630     1.0  38.233441  0.416667  0.356481  84.154687   
Second  0.472826     2.0  29.877630  0.402174  0.380435  20.662183   
Third   0.242363     3.0  25.140620  0.615071  0.393075  13.675550   

        adult_male     alone  
class                         
First     0.550926  0.504630  
Second    0.538043  0.565217  
Third     0.649695  0.659878  


In [44]:
# Group by two columns and aggregate.
# Any column can be a group key (numeric ones such as 'pclass' work as well);
# low-cardinality categorical/text columns are simply the usual choice.
grouped = titanic.groupby(['class', 'sex'])
# numeric_only=True limits the mean to numeric/boolean columns; pandas >= 2.0
# raises a TypeError on text columns instead of silently dropping them.
print(grouped.mean(numeric_only=True))

survived  pclass        age     sibsp     parch        fare  \
class  sex                                                                   
First  female  0.968085     1.0  34.611765  0.553191  0.457447  106.125798   
       male    0.368852     1.0  41.281386  0.311475  0.278689   67.226127   
Second female  0.921053     2.0  28.722973  0.486842  0.605263   21.970121   
       male    0.157407     2.0  30.740707  0.342593  0.222222   19.741782   
Third  female  0.500000     3.0  21.750000  0.895833  0.798611   16.118810   
       male    0.135447     3.0  26.507589  0.498559  0.224784   12.661633   

               adult_male     alone  
class  sex                           
First  female    0.000000  0.361702  
       male      0.975410  0.614754  
Second female    0.000000  0.421053  
       male      0.916667  0.666667  
Third  female    0.000000  0.416667  
       male      0.919308  0.760807  


In [45]:
# Apply a custom function to each group (maximum value minus minimum value)
def func(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    highest = x.max()
    lowest = x.min()
    return highest - lowest

# Group by a plain column name rather than a one-element list: newer pandas
# yields tuple keys such as ('First',) when iterating a list-keyed groupby,
# which would change the keys printed by the later per-group loops.
grouped = titanic.groupby('class')
# func subtracts values, which is undefined for text and boolean columns, so
# aggregate the numeric columns explicitly instead of relying on pandas
# silently dropping the columns that fail.
numeric_cols = titanic.select_dtypes(include='number').columns.tolist()
print(grouped[numeric_cols].agg(func))

survived  pclass    age  sibsp  parch      fare
class                                                  
First          1       0  79.08      3      4  512.3292
Second         1       0  69.33      3      3   73.5000
Third          1       0  73.58      8      6   69.5500


In [46]:
# Apply several aggregation functions at once; each column gets one
# sub-column per function (here 'max' and 'min').
# Useful when building a per-group analysis report.
# NOTE(review): the recorded output has 22 columns, so not every original
# column appears - confirm how the pandas version in use treats columns that
# cannot be aggregated.
print(grouped.agg(['max', 'min']))

survived     pclass       sex           age       sibsp      ...  \
            max min    max min   max     min   max   min   max min  ...   
class                                                               ...   
First         1   0      1   1  male  female  80.0  0.92     3   0  ...   
Second        1   0      2   2  male  female  70.0  0.67     3   0  ...   
Third         1   0      3   3  male  female  74.0  0.42     8   0  ...   

            fare         who        adult_male        alive     alone         
             max  min    max    min        max    min   max min   max    min  
class                                                                         
First   512.3292  0.0  woman  child       True  False   yes  no  True  False  
Second   73.5000  0.0  woman  child       True  False   yes  no  True  False  
Third    69.5500  0.0  woman  child       True  False   yes  no  True  False  

[3 rows x 22 columns]


In [47]:
# Check how many rows each group contains
for key, data in grouped : 
    print(key, len(data))

# Result: First 216, Second 184, Third 491

First 216
Second 184
Third 491


In [48]:
# Drop the groups that have fewer than 200 rows.
# Predicate that tells whether a group has at least 200 rows.
def over200(x):
    """Return True when ``x`` contains 200 or more items."""
    size = len(x)
    return size >= 200

# Keep only the rows of the groups for which over200 returns True.
# The assignment must run here: with it commented out, grouped_filter is
# undefined on a fresh kernel (or stale from another cell).
grouped_filter = grouped.filter(over200)
print(grouped_filter['class'])
# Result: the Second class (184 rows) is excluded.

0      Third
1      First
2      Third
3      First
4      Third
       ...  
885    Third
887    First
888    Third
889    First
890    Third
Name: class, Length: 707, dtype: category
Categories (3, object): [First, Second, Third]


In [49]:
# Same filter as over200, rewritten as a lambda.
# A Python lambda is an anonymous, single-expression function;
# lambdas are commonly passed to filtering or mapping (apply) methods.
# Use >= so the threshold matches over200 exactly (a 200-row group is kept).
grouped_filter = grouped.filter(lambda x: len(x) >= 200)
print(grouped_filter['class'])

0      Third
1      First
2      Third
3      First
4      Third
       ...  
885    Third
887    First
888    Third
889    First
890    Third
Name: class, Length: 707, dtype: category
Categories (3, object): [First, Second, Third]


In [50]:
# Goal: drop the groups whose mean 'age' is below 30.
# First print each group's mean age.
for key, data in grouped : 
    print(key, data['age'].mean())

First 38.233440860215055
Second 29.87763005780347
Third 25.14061971830986


In [51]:
# Keep only the rows of the groups whose mean 'age' is at least 30
grouped_filter = grouped.filter(lambda x: x['age'].mean()>=30)
print(grouped_filter['class'])

1      First
3      First
6      First
11     First
23     First
       ...  
871    First
872    First
879    First
887    First
889    First
Name: class, Length: 216, dtype: category
Categories (3, object): [First, Second, Third]


In [54]:
# Grouping by two or more column names produces a MultiIndex result
grouped = titanic.groupby(['class', 'sex'])
# numeric_only=True limits the mean to numeric/boolean columns; pandas >= 2.0
# raises a TypeError on text columns instead of silently dropping them.
gdf = grouped.mean(numeric_only=True)
print(gdf)

survived  pclass        age     sibsp     parch        fare  \
class  sex                                                                   
First  female  0.968085     1.0  34.611765  0.553191  0.457447  106.125798   
       male    0.368852     1.0  41.281386  0.311475  0.278689   67.226127   
Second female  0.921053     2.0  28.722973  0.486842  0.605263   21.970121   
       male    0.157407     2.0  30.740707  0.342593  0.222222   19.741782   
Third  female  0.500000     3.0  21.750000  0.895833  0.798611   16.118810   
       male    0.135447     3.0  26.507589  0.498559  0.224784   12.661633   

               adult_male     alone  
class  sex                           
First  female    0.000000  0.361702  
       male      0.975410  0.614754  
Second female    0.000000  0.421053  
       male      0.916667  0.666667  
Third  female    0.000000  0.416667  
       male      0.919308  0.760807  


In [55]:
# Select specific values from the grouped result:
# row-wise access with .loc on the outer ('class') index level
print(gdf.loc['Third'])
# Result: the rows under the 'Third' index label are printed.

survived  pclass        age     sibsp     parch       fare  \
sex                                                                  
female  0.500000     3.0  21.750000  0.895833  0.798611  16.118810   
male    0.135447     3.0  26.507589  0.498559  0.224784  12.661633   

        adult_male     alone  
sex                           
female    0.000000  0.416667  
male      0.919308  0.760807  


In [56]:
# Select only the 'male' row within 'Third' by passing a (class, sex) tuple
print(gdf.loc[('Third', 'male')])

survived       0.135447
pclass         3.000000
age           26.507589
sibsp          0.498559
parch          0.224784
fare          12.661633
adult_male     0.919308
alone          0.760807
Name: (Third, male), dtype: float64


In [57]:
# Use xs to take a cross-section: the rows whose 'sex' index level is 'male'
print(gdf.xs('male', level='sex'))

survived  pclass        age     sibsp     parch       fare  \
class                                                                
First   0.368852     1.0  41.281386  0.311475  0.278689  67.226127   
Second  0.157407     2.0  30.740707  0.342593  0.222222  19.741782   
Third   0.135447     3.0  26.507589  0.498559  0.224784  12.661633   

        adult_male     alone  
class                         
First     0.975410  0.614754  
Second    0.916667  0.666667  
Third     0.919308  0.760807  
