In [2]:
import html

import numpy as np
import pandas as pd

# Helper class that shows several DataFrame expressions side by side
class display(object):
    """Show the results of several expressions next to each other.

    Each argument is a string holding a Python expression (for example
    ``'df'`` or ``'df.groupby("key").sum()'``). The expression text is used
    as the caption and its evaluated result is rendered underneath.

    NOTE: every argument is passed to ``eval``. Only use this helper with
    expression strings written in the notebook itself, never with
    untrusted input.
    """
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        # Expression strings; they are evaluated each time a repr is requested.
        self.args = args

    def _repr_html_(self):
        """Return one floated HTML panel per expression, joined by newlines."""
        # Escape the caption so characters such as `<` or `&` inside the
        # expression text are shown literally instead of being parsed as
        # markup. Quotes are left alone because the caption is element text.
        return '\n'.join(
            self.template.format(html.escape(a, quote=False),
                                 eval(a)._repr_html_())
            for a in self.args)

    def __repr__(self):
        """Return the plain-text form: each expression followed by its repr."""
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)

In [3]:
# Download the planets dataset
import seaborn as sns
planets = sns.load_dataset('planets')
# Show the shape first, then the first rows, each on its own line
print(planets.shape, planets.head(), sep='\n')

(1035, 6)
            method  number  orbital_period   mass  distance  year
0  Radial Velocity       1         269.300   7.10     77.40  2006
1  Radial Velocity       1         874.774   2.21     56.95  2008
2  Radial Velocity       1         763.000   2.60     19.84  2011
3  Radial Velocity       1         326.030  19.40    110.62  2007
4  Radial Velocity       1         516.220  10.50    119.47  2009


In [4]:
# Aggregations on a Series
rng = np.random.RandomState(42)
ser = pd.Series(rng.rand(5))
# Print the Series, then its sum, then its mean (one item per line)
print(ser, ser.sum(), ser.mean(), sep='\n')

0    0.374540
1    0.950714
2    0.731994
3    0.598658
4    0.156019
dtype: float64
2.811925491708157
0.5623850983416314


In [5]:
# Aggregations on a DataFrame
df = pd.DataFrame({name: rng.rand(5) for name in ('A', 'B')})

# Print the frame, then the mean of each column
print(df, df.mean(), sep='\n')

# The aggregation can instead be done within each row (across the columns)
df.mean(axis='columns')

          A         B
0  0.155995  0.020584
1  0.058084  0.969910
2  0.866176  0.832443
3  0.601115  0.212339
4  0.708073  0.181825
A    0.477888
B    0.443420
dtype: float64


0    0.088290
1    0.513997
2    0.849309
3    0.406727
4    0.444949
dtype: float64

In [6]:
# Compare the summary statistics without dropna() and with dropna()
print(planets.describe(), planets.dropna().describe(), sep='\n')

            number  orbital_period        mass     distance         year
count  1035.000000      992.000000  513.000000   808.000000  1035.000000
mean      1.785507     2002.917596    2.638161   264.069282  2009.070531
std       1.240976    26014.728304    3.818617   733.116493     3.972567
min       1.000000        0.090706    0.003600     1.350000  1989.000000
25%       1.000000        5.442540    0.229000    32.560000  2007.000000
50%       1.000000       39.979500    1.260000    55.250000  2010.000000
75%       2.000000      526.005000    3.040000   178.500000  2012.000000
max       7.000000   730000.000000   25.000000  8500.000000  2014.000000
          number  orbital_period        mass    distance         year
count  498.00000      498.000000  498.000000  498.000000   498.000000
mean     1.73494      835.778671    2.509320   52.068213  2007.377510
std      1.17572     1469.128259    3.636274   46.596041     4.167284
min      1.00000        1.328300    0.003600    1.350000  1989.

In [7]:
# Small frame with a repeating key column and a running counter
df = pd.DataFrame(
    {'key': list('ABCABC'), 'data': range(6)},
    columns=['key', 'data'],
)
print(df)

# Usage: first call groupby() with the key column to group on;
# no computation has been carried out yet at this point
print(df.groupby('key'))

# Only once an aggregation function is specified do the later
# apply/combine steps actually run
print(
    df
    .groupby('key')
    .sum()
)

  key  data
0   A     0
1   B     1
2   C     2
3   A     3
4   B     4
5   C     5
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001EE24B6A3C0>
     data
key      
A       3
B       5
C       7


In [8]:
# Until an aggregation function is specified, these are still groupby objects
print(
    planets.groupby('method'),
    planets.groupby('method')['orbital_period'],
    sep='\n',
)

# Find the median orbital period for each discovery method
print(
    planets
    .groupby('method')['orbital_period']
    .median()
)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001EE24B74690>
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001EE24B6A3C0>
method
Astrometry                         631.180000
Eclipse Timing Variations         4343.500000
Imaging                          27500.000000
Microlensing                      3300.000000
Orbital Brightness Modulation        0.342887
Pulsar Timing                       66.541900
Pulsation Timing Variations       1170.000000
Radial Velocity                    360.200000
Transit                              5.714932
Transit Timing Variations           57.011000
Name: orbital_period, dtype: float64


In [9]:
# Iterating over the groups is helpful for debugging.
# {0:30s} formats the first argument (method) as a string (s) that takes up
# 30 characters, padded with spaces when it is shorter.
for method, group in planets.groupby('method'):
    print("{0:30s} shape={1}".format(method,
                                     group.shape))

Astrometry                     shape=(2, 6)
Eclipse Timing Variations      shape=(9, 6)
Imaging                        shape=(38, 6)
Microlensing                   shape=(23, 6)
Orbital Brightness Modulation  shape=(3, 6)
Pulsar Timing                  shape=(5, 6)
Pulsation Timing Variations    shape=(1, 6)
Radial Velocity                shape=(553, 6)
Transit                        shape=(397, 6)
Transit Timing Variations      shape=(4, 6)


In [10]:
# describe() can be called on a groupby object
(
    planets
    .groupby('method')['year']
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Astrometry,2.0,2011.5,2.12132,2010.0,2010.75,2011.5,2012.25,2013.0
Eclipse Timing Variations,9.0,2010.0,1.414214,2008.0,2009.0,2010.0,2011.0,2012.0
Imaging,38.0,2009.131579,2.781901,2004.0,2008.0,2009.0,2011.0,2013.0
Microlensing,23.0,2009.782609,2.859697,2004.0,2008.0,2010.0,2012.0,2013.0
Orbital Brightness Modulation,3.0,2011.666667,1.154701,2011.0,2011.0,2011.0,2012.0,2013.0
Pulsar Timing,5.0,1998.4,8.38451,1992.0,1992.0,1994.0,2003.0,2011.0
Pulsation Timing Variations,1.0,2007.0,,2007.0,2007.0,2007.0,2007.0,2007.0
Radial Velocity,553.0,2007.518987,4.249052,1989.0,2005.0,2009.0,2011.0,2014.0
Transit,397.0,2011.236776,2.077867,2002.0,2010.0,2012.0,2013.0,2014.0
Transit Timing Variations,4.0,2012.5,1.290994,2011.0,2011.75,2012.5,2013.25,2014.0


In [11]:
# Define a new DataFrame
rng = np.random.RandomState(0)
df = pd.DataFrame(
    {
        'key': list('ABCABC'),
        'data1': range(6),
        'data2': rng.randint(0, 10, 6),
    },
    columns=['key', 'data1', 'data2'],
)
print(df)

  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9


In [14]:
# aggregate() accepts the wanted aggregations as strings, passed in a list
print(
    df
    .groupby('key')
    .aggregate(['min', 'median', 'max'])
)

# Passing a dict: the keys become the column names
print(
    df
    .groupby('key')
    .aggregate({
        'data1': 'min',
        'data2': 'max',
    })
)

    data1            data2           
      min median max   min median max
key                                  
A       0    1.5   3     3    4.0   5
B       1    2.5   4     0    3.5   7
C       2    3.5   5     3    6.0   9
     data1  data2
key              
A        0      5
B        1      7
C        2      9


In [16]:
# Used with filter(): keep the data whose standard deviation is greater than 4
def filter_func(x):
    """Return whether the standard deviation of ``x['data2']`` exceeds 4."""
    spread = x['data2'].std()
    return spread > 4

# Side by side: the frame, the per-key standard deviations, and the rows
# kept by filter(filter_func)
display(
    'df',
    'df.groupby("key").std()',
    "df.groupby('key').filter(filter_func)",
)

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2.12132,1.414214
B,2.12132,4.949747
C,2.12132,4.242641

Unnamed: 0,key,data1,data2
1,B,1,0
2,C,2,3
4,B,4,7
5,C,5,9


In [18]:
# transform example: center the data
def center(x):
    """Return ``x`` with its own mean subtracted."""
    mean_value = x.mean()
    return x - mean_value
# Apply center within each key group and print the result
print(
    df
    .groupby('key')
    .transform(center)
)

   data1  data2
0   -1.5    1.0
1   -1.5   -3.5
2   -1.5   -3.0
3    1.5   -1.0
4    1.5    3.5
5    1.5    3.0


In [25]:
# apply example: normalize the first data column by the sum of the second
def norm_by_data2(x):
    """Return a copy of ``x`` with ``data1`` divided by the sum of ``data2``.

    Parameters
    ----------
    x : DataFrame
        Frame with ``data1`` and ``data2`` columns (one group when used
        with ``groupby(...).apply``).

    Returns
    -------
    DataFrame
        A new frame; ``x`` itself is left unchanged.
    """
    # Build a new frame instead of assigning into ``x``: the object that
    # groupby.apply hands to the function should not be mutated.
    return x.assign(data1=x['data1'] / x['data2'].sum())

# Run norm_by_data2 on each key group; the grouping column is not passed in
print(
    df
    .groupby('key')
    .apply(norm_by_data2, include_groups=False)
)

          data1  data2
key                   
A   0  0.000000      5
    3  0.375000      3
B   1  0.142857      0
    4  0.571429      7
C   2  0.166667      3
    5  0.416667      9


In [27]:
# Show the current frame for reference
print(df)

# Use a list as the grouping key
L = [0, 1, 0, 1, 2, 0]
print(
    df
    .groupby(L)
    .sum()
)

  key  data1  data2
0   A      0      5
1   B      1      0
2   C      2      3
3   A      3      3
4   B      4      7
5   C      5      9
   key  data1  data2
0  ACC      7     17
1   BA      4      3
2    B      4      7


In [28]:
# Use a dict that maps index values to labels as the grouping key
df2 = df.set_index('key')
mapping = dict(A='vowel', B='consonant', C='consonant')
display(
    'df2',
    'df2.groupby(mapping).sum()',
)

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,0
C,2,3
A,3,3
B,4,7
C,5,9

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
consonant,12,19
vowel,3,8


In [29]:
# Use the string lower-casing function as the grouping key
print(
    df2
    .groupby(str.lower)
    .mean()
)

     data1  data2
key              
a      1.5    4.0
b      2.5    3.5
c      3.5    6.0


In [30]:
# Combine several valid keys in one list
print(
    df2
    .groupby([str.lower, mapping])
    .mean()
)

               data1  data2
key key                    
a   vowel        1.5    4.0
b   consonant    2.5    3.5
c   consonant    3.5    6.0


In [40]:
# Count the discovered planets by discovery method and by decade
# Floor each year to its decade, turn it into a label such as '2000s',
# and name the resulting Series 'decade'
decade = ((10 * (planets['year'] // 10)).astype(str) + 's').rename('decade')
print(
    planets
    .groupby(['method', decade])['number']
    .sum()
    .unstack()
    .fillna(0)
)

decade                         1980s  1990s  2000s  2010s
method                                                   
Astrometry                       0.0    0.0    0.0    2.0
Eclipse Timing Variations        0.0    0.0    5.0   10.0
Imaging                          0.0    0.0   29.0   21.0
Microlensing                     0.0    0.0   12.0   15.0
Orbital Brightness Modulation    0.0    0.0    0.0    5.0
Pulsar Timing                    0.0    9.0    1.0    1.0
Pulsation Timing Variations      0.0    0.0    1.0    0.0
Radial Velocity                  1.0   52.0  475.0  424.0
Transit                          0.0    0.0   64.0  712.0
Transit Timing Variations        0.0    0.0    0.0    9.0
