In [2]:
import pandas as pd
import numpy as np

## 9.1 進行簡單的分組及聚合運算

In [2]:
# 載入資料集
flights = pd.read_csv('../../data/flights.csv')
print(flights.head())

   MONTH  DAY  WEEKDAY AIRLINE ORG_AIR DEST_AIR  SCHED_DEP  DEP_DELAY  \
0      1    1        4      WN     LAX      SLC       1625       58.0   
1      1    1        4      UA     DEN      IAD        823        7.0   
2      1    1        4      MQ     DFW      VPS       1305       36.0   
3      1    1        4      AA     DFW      DCA       1555        7.0   
4      1    1        4      WN     LAX      MCI       1720       48.0   

   AIR_TIME  DIST  SCHED_ARR  ARR_DELAY  DIVERTED  CANCELLED  
0      94.0   590       1905       65.0         0          0  
1     154.0  1452       1333      -13.0         0          0  
2      85.0   641       1453       35.0         0          0  
3     126.0  1192       1935       -7.0         0          0  
4     166.0  1363       2225       39.0         0          0  


In [3]:
# 查詢每家航空的平均延誤時間
print(flights.groupby('AIRLINE')
             .agg({'ARR_DELAY':'mean'}))

         ARR_DELAY
AIRLINE           
AA        5.542661
AS       -0.833333
B6        8.692593
DL        0.339691
EV        7.034580
F9       13.630651
HA        4.972973
MQ        6.860591
NK       18.436070
OO        7.593463
UA        7.765755
US        1.681105
VX        5.348884
WN        6.397353


In [4]:
# 也可以用指定欄位的方式處理
print(flights.groupby('AIRLINE')['ARR_DELAY']
             .agg('mean'))

AIRLINE
AA     5.542661
AS    -0.833333
B6     8.692593
DL     0.339691
EV     7.034580
F9    13.630651
HA     4.972973
MQ     6.860591
NK    18.436070
OO     7.593463
UA     7.765755
US     1.681105
VX     5.348884
WN     6.397353
Name: ARR_DELAY, dtype: float64


In [5]:
# 除了給字串 也可以給numpy 的聚合函式
print(flights.groupby('AIRLINE')['ARR_DELAY']
             .agg(np.mean))

AIRLINE
AA     5.542661
AS    -0.833333
B6     8.692593
DL     0.339691
EV     7.034580
F9    13.630651
HA     4.972973
MQ     6.860591
NK    18.436070
OO     7.593463
UA     7.765755
US     1.681105
VX     5.348884
WN     6.397353
Name: ARR_DELAY, dtype: float64


  .agg(np.mean))


In [6]:
# 不用agg 也可以直接用聚合函式
print(flights.groupby('AIRLINE')['ARR_DELAY']
             .mean())

AIRLINE
AA     5.542661
AS    -0.833333
B6     8.692593
DL     0.339691
EV     7.034580
F9    13.630651
HA     4.972973
MQ     6.860591
NK    18.436070
OO     7.593463
UA     7.765755
US     1.681105
VX     5.348884
WN     6.397353
Name: ARR_DELAY, dtype: float64


## 9.2 對多個欄位執行分組及聚合運算

In [8]:
# 取得各航空各星期航班取消數目
print(flights.groupby(['AIRLINE', 'WEEKDAY'])
             ['CANCELLED']
             .agg('sum'))

AIRLINE  WEEKDAY
AA       1          41
         2           9
         3          16
         4          20
         5          18
                    ..
WN       3          18
         4          10
         5           7
         6          10
         7           7
Name: CANCELLED, Length: 98, dtype: int64


In [14]:
# 取得各航空各星期航班取消與更改的數目與百分比
print(flights.groupby(['AIRLINE', 'WEEKDAY'])
             [['CANCELLED', 'DIVERTED']]
             .agg(['sum', 'mean']))

                CANCELLED           DIVERTED          
                      sum      mean      sum      mean
AIRLINE WEEKDAY                                       
AA      1              41  0.032106        6  0.004699
        2               9  0.007341        2  0.001631
        3              16  0.011949        2  0.001494
        4              20  0.015004        5  0.003751
        5              18  0.014151        1  0.000786
...                   ...       ...      ...       ...
WN      3              18  0.014118        2  0.001569
        4              10  0.007911        4  0.003165
        5               7  0.005828        0  0.000000
        6              10  0.010132        3  0.003040
        7               7  0.006066        3  0.002600

[98 rows x 4 columns]


In [15]:
# 根據不同出發地與目的地 找出航班總數 取消的航班數和百分比 以及飛行時間的平均數與變異數
print(flights.groupby(['ORG_AIR', 'DEST_AIR'])
             .agg({'CANCELLED':['sum', 'mean', 'size'],
                   'AIR_TIME':['mean', 'var']}))

                 CANCELLED                   AIR_TIME            
                       sum      mean size        mean         var
ORG_AIR DEST_AIR                                                 
ATL     ABE              0  0.000000   31   96.387097   45.778495
        ABQ              0  0.000000   16  170.500000   87.866667
        ABY              0  0.000000   19   28.578947    6.590643
        ACY              0  0.000000    6   91.333333   11.466667
        AEX              0  0.000000   40   78.725000   47.332692
...                    ...       ...  ...         ...         ...
SFO     SNA              4  0.032787  122   64.059322   11.338331
        STL              0  0.000000   20  198.900000  101.042105
        SUN              0  0.000000   10   78.000000   25.777778
        TUS              0  0.000000   20  100.200000   35.221053
        XNA              0  0.000000    2  173.500000    0.500000

[1130 rows x 5 columns]


In [4]:
# named aggregation 物件
print(flights.groupby(['ORG_AIR', 'DEST_AIR'])
             .agg(sum_cancelled= pd.NamedAgg(column='CANCELLED', aggfunc='sum'),
                  mean_cancelled= pd.NamedAgg(column='CANCELLED', aggfunc='mean'),
                  size_cancelled= pd.NamedAgg(column='CANCELLED', aggfunc='size'),
                  mean_air= pd.NamedAgg(column='AIR_TIME', aggfunc='sum'),
                  var_time_air= pd.NamedAgg(column='AIR_TIME', aggfunc='var'),
                  ))

                  sum_cancelled  mean_cancelled  size_cancelled  mean_air  \
ORG_AIR DEST_AIR                                                            
ATL     ABE                   0        0.000000              31    2988.0   
        ABQ                   0        0.000000              16    2728.0   
        ABY                   0        0.000000              19     543.0   
        ACY                   0        0.000000               6     548.0   
        AEX                   0        0.000000              40    3149.0   
...                         ...             ...             ...       ...   
SFO     SNA                   4        0.032787             122    7559.0   
        STL                   0        0.000000              20    3978.0   
        SUN                   0        0.000000              10     780.0   
        TUS                   0        0.000000              20    2004.0   
        XNA                   0        0.000000               2     347.0   

In [5]:
# 取出index
res = (flights.groupby(['ORG_AIR', 'DEST_AIR'])
             .agg({'CANCELLED':['sum', 'mean', 'size'],
                   'AIR_TIME':['mean', 'var']}))

print(res.columns)

MultiIndex([('CANCELLED',  'sum'),
            ('CANCELLED', 'mean'),
            ('CANCELLED', 'size'),
            ( 'AIR_TIME', 'mean'),
            ( 'AIR_TIME',  'var')],
           )


In [6]:
# 壓平多重索引
res_flat_column = res.columns.to_flat_index()
print(res_flat_column)

Index([ ('CANCELLED', 'sum'), ('CANCELLED', 'mean'), ('CANCELLED', 'size'),
        ('AIR_TIME', 'mean'),   ('AIR_TIME', 'var')],
      dtype='object')


In [7]:
# 重設index
res.columns = ['_'.join(x) for x in res_flat_column]
print(res)

                  CANCELLED_sum  CANCELLED_mean  CANCELLED_size  \
ORG_AIR DEST_AIR                                                  
ATL     ABE                   0        0.000000              31   
        ABQ                   0        0.000000              16   
        ABY                   0        0.000000              19   
        ACY                   0        0.000000               6   
        AEX                   0        0.000000              40   
...                         ...             ...             ...   
SFO     SNA                   4        0.032787             122   
        STL                   0        0.000000              20   
        SUN                   0        0.000000              10   
        TUS                   0        0.000000              20   
        XNA                   0        0.000000               2   

                  AIR_TIME_mean  AIR_TIME_var  
ORG_AIR DEST_AIR                               
ATL     ABE           96.387097 

In [8]:
# 整合的寫法
def flatten_cols(df):
    df.columns = ['_'.join(x) for x in df.columns.to_flat_index()]
    return df

res = (flights.groupby(['ORG_AIR', 'DEST_AIR'])
             .agg({'CANCELLED':['sum', 'mean', 'size'],
                   'AIR_TIME':['mean', 'var']})
        .pipe(flatten_cols))

print(res)

                  CANCELLED_sum  CANCELLED_mean  CANCELLED_size  \
ORG_AIR DEST_AIR                                                  
ATL     ABE                   0        0.000000              31   
        ABQ                   0        0.000000              16   
        ABY                   0        0.000000              19   
        ACY                   0        0.000000               6   
        AEX                   0        0.000000              40   
...                         ...             ...             ...   
SFO     SNA                   4        0.032787             122   
        STL                   0        0.000000              20   
        SUN                   0        0.000000              10   
        TUS                   0        0.000000              20   
        XNA                   0        0.000000               2   

                  AIR_TIME_mean  AIR_TIME_var  
ORG_AIR DEST_AIR                               
ATL     ABE           96.387097 

In [10]:
# category 型別 會有笛卡爾積
print(flights.assign(ORG_AIR=flights.ORG_AIR.astype('category'))
             .groupby(['ORG_AIR', 'DEST_AIR'])
             .agg({'CANCELLED':['sum', 'mean', 'size'],
                   'AIR_TIME':['mean', 'var']}))
print(res)

                 CANCELLED              AIR_TIME           
                       sum mean size        mean        var
ORG_AIR DEST_AIR                                           
ATL     ABE              0  0.0   31   96.387097  45.778495
        ABI              0  NaN    0         NaN        NaN
        ABQ              0  0.0   16  170.500000  87.866667
        ABR              0  NaN    0         NaN        NaN
        ABY              0  0.0   19   28.578947   6.590643
...                    ...  ...  ...         ...        ...
SFO     TYS              0  NaN    0         NaN        NaN
        VLD              0  NaN    0         NaN        NaN
        VPS              0  NaN    0         NaN        NaN
        XNA              0  0.0    2  173.500000   0.500000
        YUM              0  NaN    0         NaN        NaN

[2710 rows x 5 columns]
                  CANCELLED_sum  CANCELLED_mean  CANCELLED_size  \
ORG_AIR DEST_AIR                                                  
A

  .groupby(['ORG_AIR', 'DEST_AIR'])


In [11]:
# category 型別 會有笛卡爾積 observed 改為True
print(flights.assign(ORG_AIR=flights.ORG_AIR.astype('category'))
             .groupby(['ORG_AIR', 'DEST_AIR'], observed=True)
             .agg({'CANCELLED':['sum', 'mean', 'size'],
                   'AIR_TIME':['mean', 'var']}))
print(res)

                 CANCELLED                   AIR_TIME            
                       sum      mean size        mean         var
ORG_AIR DEST_AIR                                                 
ATL     ABE              0  0.000000   31   96.387097   45.778495
        ABQ              0  0.000000   16  170.500000   87.866667
        ABY              0  0.000000   19   28.578947    6.590643
        ACY              0  0.000000    6   91.333333   11.466667
        AEX              0  0.000000   40   78.725000   47.332692
...                    ...       ...  ...         ...         ...
SFO     SNA              4  0.032787  122   64.059322   11.338331
        STL              0  0.000000   20  198.900000  101.042105
        SUN              0  0.000000   10   78.000000   25.777778
        TUS              0  0.000000   20  100.200000   35.221053
        XNA              0  0.000000    2  173.500000    0.500000

[1130 rows x 5 columns]
                  CANCELLED_sum  CANCELLED_mean  CA

## 9.3 分組後刪除MultiIndex

In [12]:
# 載入資料集
flights = pd.read_csv('../../data/flights.csv')

In [13]:
# 找出各星期每個航空公司的總里程數與平均里程數 以及最大與最小延誤時間
airline_info = (flights.groupby(['AIRLINE', 'WEEKDAY'])
                       .agg({'DIST':['sum', 'mean'],
                             'ARR_DELAY':['min', 'max']})
                        .astype(int))
print(airline_info)

                    DIST       ARR_DELAY     
                     sum  mean       min  max
AIRLINE WEEKDAY                              
AA      1        1455386  1139       -60  551
        2        1358256  1107       -52  725
        3        1496665  1117       -45  473
        4        1452394  1089       -46  349
        5        1427749  1122       -41  732
...                  ...   ...       ...  ...
WN      3         997213   782       -38  262
        4        1024854   810       -52  284
        5         981036   816       -44  244
        6         823946   834       -41  290
        7         945679   819       -45  261

[98 rows x 4 columns]


In [14]:
# 查看最外層名稱
print(airline_info.columns.get_level_values(0))

# 查看第二層名稱
print(airline_info.columns.get_level_values(1))

Index(['DIST', 'DIST', 'ARR_DELAY', 'ARR_DELAY'], dtype='object')
Index(['sum', 'mean', 'min', 'max'], dtype='object')


In [15]:
# 攤平欄位名稱
airline_info.columns = ['_'.join(x) for x in airline_info.columns.to_flat_index()]

print(airline_info)

                 DIST_sum  DIST_mean  ARR_DELAY_min  ARR_DELAY_max
AIRLINE WEEKDAY                                                   
AA      1         1455386       1139            -60            551
        2         1358256       1107            -52            725
        3         1496665       1117            -45            473
        4         1452394       1089            -46            349
        5         1427749       1122            -41            732
...                   ...        ...            ...            ...
WN      3          997213        782            -38            262
        4         1024854        810            -52            284
        5          981036        816            -44            244
        6          823946        834            -41            290
        7          945679        819            -45            261

[98 rows x 4 columns]


In [16]:
# 使用reset_index()
print(airline_info.reset_index())

   AIRLINE  WEEKDAY  DIST_sum  DIST_mean  ARR_DELAY_min  ARR_DELAY_max
0       AA        1   1455386       1139            -60            551
1       AA        2   1358256       1107            -52            725
2       AA        3   1496665       1117            -45            473
3       AA        4   1452394       1089            -46            349
4       AA        5   1427749       1122            -41            732
..     ...      ...       ...        ...            ...            ...
93      WN        3    997213        782            -38            262
94      WN        4   1024854        810            -52            284
95      WN        5    981036        816            -44            244
96      WN        6    823946        834            -41            290
97      WN        7    945679        819            -45            261

[98 rows x 6 columns]


In [19]:
# 使用NamedAgg() 扁平化的欄位
print(flights.groupby(['AIRLINE', 'WEEKDAY'])
             .agg(dist_sum= pd.NamedAgg(column='DIST', aggfunc='sum'),
                  dist_mean= pd.NamedAgg(column='DIST', aggfunc='mean'),
                  arr_delay_min= pd.NamedAgg(column='ARR_DELAY', aggfunc='min'),
                  arr_delay_max= pd.NamedAgg(column='ARR_DELAY', aggfunc='max'))
                  .astype(int)
                  .reset_index())

   AIRLINE  WEEKDAY  dist_sum  dist_mean  arr_delay_min  arr_delay_max
0       AA        1   1455386       1139            -60            551
1       AA        2   1358256       1107            -52            725
2       AA        3   1496665       1117            -45            473
3       AA        4   1452394       1089            -46            349
4       AA        5   1427749       1122            -41            732
..     ...      ...       ...        ...            ...            ...
93      WN        3    997213        782            -38            262
94      WN        4   1024854        810            -52            284
95      WN        5    981036        816            -44            244
96      WN        6    823946        834            -41            290
97      WN        7    945679        819            -45            261

[98 rows x 6 columns]


In [20]:
# 避免多重標籤的參數 as_index
print(flights.groupby(['AIRLINE'], as_index=False)['DIST']
             .agg('mean')
             .round(0))

   AIRLINE    DIST
0       AA  1114.0
1       AS  1066.0
2       B6  1772.0
3       DL   866.0
4       EV   460.0
5       F9   970.0
6       HA  2615.0
7       MQ   404.0
8       NK  1047.0
9       OO   511.0
10      UA  1231.0
11      US  1181.0
12      VX  1240.0
13      WN   810.0


## 9.4 使用自訂的聚合函式來分組

In [21]:
# 載入資料集
college = pd.read_csv('../../data/college.csv')

In [22]:
# 找出各州大學生人數的平均數和標準差
print(college.groupby('STABBR')['UGDS']
             .agg(['mean', 'std'])
             .round(0))

          mean      std
STABBR                 
AK      2493.0   4052.0
AL      2790.0   4658.0
AR      1644.0   3143.0
AS      1276.0      NaN
AZ      4130.0  14894.0
CA      3518.0   6709.0
CO      2325.0   4670.0
CT      1874.0   2871.0
DC      2645.0   3225.0
DE      2491.0   4503.0
FL      2493.0   7033.0
FM      2344.0      NaN
GA      2643.0   4422.0
GU      1894.0   1774.0
HI      2361.0   2999.0
IA      2294.0   5815.0
ID      2096.0   4865.0
IL      2189.0   4080.0
IN      2654.0   8278.0
KS      1861.0   3673.0
KY      1991.0   3783.0
LA      1660.0   3485.0
MA      2023.0   3267.0
MD      3003.0   5837.0
ME      1437.0   1808.0
MH      1078.0      NaN
MI      2643.0   5356.0
MN      2086.0   3580.0
MO      1557.0   3569.0
MP      1120.0      NaN
MS      2499.0   3848.0
MT      1471.0   3007.0
NC      2447.0   4212.0
ND      1460.0   2919.0
NE      1932.0   3579.0
NH      2060.0   4877.0
NJ      2353.0   4480.0
NM      2254.0   4566.0
NV      2274.0   6144.0
NY      2349.0  

In [23]:
# 找出各州中離均值最遠的值
# 自訂z-score
def max_deviation(s):
    std_score = (s - s.mean()) / s.std()
    return std_score.abs().max()

# 使用自定義聚合函式
print(college.groupby('STABBR')['UGDS']
             .agg(max_deviation)
             .round(1))

STABBR
AK     2.6
AL     5.8
AR     6.3
AS     NaN
AZ     9.9
CA     6.1
CO     5.0
CT     5.6
DC     2.4
DE     3.5
FL     8.4
FM     NaN
GA     5.4
GU     1.0
HI     3.8
IA     6.5
ID     4.5
IL     7.3
IN     9.1
KS     4.9
KY     5.2
LA     6.5
MA     6.1
MD     5.3
ME     4.0
MH     NaN
MI     6.7
MN     7.8
MO     7.2
MP     NaN
MS     4.0
MT     3.9
NC     4.9
ND     3.5
NE     5.0
NH     5.3
NJ     7.1
NM     4.5
NV     4.7
NY     8.2
OH    10.3
OK     5.9
OR     5.3
PA    10.1
PR     6.0
PW     NaN
RI     2.9
SC     6.0
SD     4.2
TN     6.0
TX     7.7
UT     5.1
VA     7.0
VI     NaN
VT     3.8
WA     6.6
WI     5.8
WV     7.2
WY     2.8
Name: UGDS, dtype: float64


In [24]:
# 將自定義函式用於多個欄位
print(college.groupby('STABBR')
             [['UGDS', 'SATVRMID', 'SATMTMID']]
             .agg(max_deviation)
             .round(1))

        UGDS  SATVRMID  SATMTMID
STABBR                          
AK       2.6       NaN       NaN
AL       5.8       1.6       1.8
AR       6.3       2.2       2.3
AS       NaN       NaN       NaN
AZ       9.9       1.9       1.4
CA       6.1       2.7       2.5
CO       5.0       2.1       2.3
CT       5.6       3.0       2.7
DC       2.4       1.7       1.6
DE       3.5       1.2       1.1
FL       8.4       2.2       2.4
FM       NaN       NaN       NaN
GA       5.4       2.5       3.0
GU       1.0       NaN       NaN
HI       3.8       0.9       1.0
IA       6.5       2.8       2.8
ID       4.5       2.2       1.7
IL       7.3       3.3       2.5
IN       9.1       3.7       3.3
KS       4.9       2.5       1.6
KY       5.2       2.1       2.1
LA       6.5       2.4       2.2
MA       6.1       2.0       2.0
MD       5.3       2.0       2.2
ME       4.0       2.4       2.4
MH       NaN       NaN       NaN
MI       6.7       2.5       2.5
MN       7.8       2.2       2.2
MO       7

In [25]:
# 自訂的聚合函式 可以搭配內建聚合函式
print(college.groupby('STABBR')['UGDS']
             .agg([max_deviation, 'mean', 'std'])
             .round(1))

        max_deviation    mean      std
STABBR                                
AK                2.6  2493.2   4051.7
AL                5.8  2789.9   4657.9
AR                6.3  1644.1   3142.8
AS                NaN  1276.0      NaN
AZ                9.9  4130.5  14893.6
CA                6.1  3518.3   6708.6
CO                5.0  2324.9   4670.3
CT                5.6  1873.6   2871.3
DC                2.4  2645.3   3224.8
DE                3.5  2491.1   4503.4
FL                8.4  2492.9   7033.4
FM                NaN  2344.0      NaN
GA                5.4  2642.6   4422.1
GU                1.0  1894.0   1773.8
HI                3.8  2360.7   2999.3
IA                6.5  2293.9   5815.2
ID                4.5  2096.2   4864.7
IL                7.3  2189.1   4079.6
IN                9.1  2653.6   8278.0
KS                4.9  1861.3   3672.8
KY                5.2  1991.0   3783.4
LA                6.5  1660.2   3484.8
MA                6.1  2023.1   3267.4
MD                5.3  30

In [26]:
# 自訂欄位顯示名稱
max_deviation.__name__ = 'Max Deviation'

print(college.groupby('STABBR')['UGDS']
             .agg([max_deviation, 'mean', 'std'])
             .round(1))

        Max Deviation    mean      std
STABBR                                
AK                2.6  2493.2   4051.7
AL                5.8  2789.9   4657.9
AR                6.3  1644.1   3142.8
AS                NaN  1276.0      NaN
AZ                9.9  4130.5  14893.6
CA                6.1  3518.3   6708.6
CO                5.0  2324.9   4670.3
CT                5.6  1873.6   2871.3
DC                2.4  2645.3   3224.8
DE                3.5  2491.1   4503.4
FL                8.4  2492.9   7033.4
FM                NaN  2344.0      NaN
GA                5.4  2642.6   4422.1
GU                1.0  1894.0   1773.8
HI                3.8  2360.7   2999.3
IA                6.5  2293.9   5815.2
ID                4.5  2096.2   4864.7
IL                7.3  2189.1   4079.6
IN                9.1  2653.6   8278.0
KS                4.9  1861.3   3672.8
KY                5.2  1991.0   3783.4
LA                6.5  1660.2   3484.8
MA                6.1  2023.1   3267.4
MD                5.3  30

## 9.5 可接收多個參數的自訂聚合函式

In [27]:
# 定義一個可傳回學生人數在1000-3000 之間的學校百分比 只接受Series
def pct_between1_3k(s):
    return (s.between(1000, 3000)
             .mean()* 100)

In [29]:
# 用州與宗教欄位分組
print(college.groupby(['STABBR', 'RELAFFIL'])['UGDS']
             .agg(pct_between1_3k)
             .round(1))

STABBR  RELAFFIL
AK      0           14.3
        1            0.0
AL      0           23.6
        1           33.3
AR      0           27.9
                    ... 
WI      0           13.8
        1           36.0
WV      0           24.6
        1           37.5
WY      0           54.5
Name: UGDS, Length: 112, dtype: float64


In [30]:
# 修改成可以自定義上下限的函式
def pct_between(s, low, high):
    return (s.between(low, high)
             .mean()* 100)

In [32]:
# 自己填入上下限 參數改為1000 10000
print(college.groupby(['STABBR', 'RELAFFIL'])['UGDS']
             .agg(pct_between, 1000, 10000)
             .round(1))

STABBR  RELAFFIL
AK      0           42.9
        1            0.0
AL      0           45.8
        1           37.5
AR      0           39.7
                    ... 
WI      0           31.0
        1           44.0
WV      0           29.2
        1           37.5
WY      0           72.7
Name: UGDS, Length: 112, dtype: float64


In [None]:
# 自己填入上下限 關鍵字
print(college.groupby(['STABBR', 'RELAFFIL'])['UGDS']
             .agg(pct_between, low=1000, high=10000)
             .round(1))

STABBR  RELAFFIL
AK      0           42.9
        1            0.0
AL      0           45.8
        1           37.5
AR      0           39.7
                    ... 
WI      0           31.0
        1           44.0
WV      0           29.2
        1           37.5
WY      0           72.7
Name: UGDS, Length: 112, dtype: float64


In [33]:
# 閉包寫法
def between_n_m(n, m):
    def wrapper(ser):
        return pct_between(ser, n, m)
    wrapper.__name__ = f'between_{n}_{m}'
    return wrapper

In [34]:
# 使用閉包函式就可以使用多個聚合函式
print(college.groupby(['STABBR', 'RELAFFIL'])['UGDS']
             .agg([between_n_m(1000, 10000), 'max', 'mean'])
             .round(1))

                 between_1000_10000      max    mean
STABBR RELAFFIL                                     
AK     0                       42.9  12865.0  3508.9
       1                        0.0    275.0   123.3
AL     0                       45.8  29851.0  3248.8
       1                       37.5   3033.0   979.7
AR     0                       39.7  21405.0  1793.7
...                             ...      ...     ...
WI     0                       31.0  29302.0  2879.1
       1                       44.0   8212.0  1716.2
WV     0                       29.2  44924.0  1873.9
       1                       37.5   1375.0   716.4
WY     0                       72.7   9910.0  2244.4

[112 rows x 3 columns]


## 9.6 深入了解groupby 物件

In [3]:
# 載入資料集
college = pd.read_csv('../../data/college.csv')

In [4]:
# groupby 兩個欄位
grouped = college.groupby(['STABBR', 'RELAFFIL'])
type(grouped)

pandas.core.groupby.generic.DataFrameGroupBy

In [5]:
# 使用dir() 查看該物件有甚麼屬性
print([attr for attr in dir(grouped) if not attr.startswith('_')])

['CITY', 'CURROPER', 'DISTANCEONLY', 'GRAD_DEBT_MDN_SUPP', 'HBCU', 'INSTNM', 'MD_EARN_WNE_P10', 'MENONLY', 'PCTFLOAN', 'PCTPELL', 'PPTUG_EF', 'RELAFFIL', 'SATMTMID', 'SATVRMID', 'STABBR', 'UG25ABV', 'UGDS', 'UGDS_2MOR', 'UGDS_AIAN', 'UGDS_ASIAN', 'UGDS_BLACK', 'UGDS_HISP', 'UGDS_NHPI', 'UGDS_NRA', 'UGDS_UNKN', 'UGDS_WHITE', 'WOMENONLY', 'agg', 'aggregate', 'all', 'any', 'apply', 'bfill', 'boxplot', 'corr', 'corrwith', 'count', 'cov', 'cumcount', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'dtypes', 'ewm', 'expanding', 'ffill', 'fillna', 'filter', 'first', 'get_group', 'groups', 'head', 'hist', 'idxmax', 'idxmin', 'indices', 'last', 'max', 'mean', 'median', 'min', 'ndim', 'ngroup', 'ngroups', 'nth', 'nunique', 'ohlc', 'pct_change', 'pipe', 'plot', 'prod', 'quantile', 'rank', 'resample', 'rolling', 'sample', 'sem', 'shift', 'size', 'skew', 'std', 'sum', 'tail', 'take', 'transform', 'value_counts', 'var']


In [6]:
# 查看組別數量
grouped.ngroups

112

In [7]:
# 取得前六組的標籤
groups = list(grouped.groups)
groups[:6]

[('AK', 0), ('AK', 1), ('AL', 0), ('AL', 1), ('AR', 0), ('AR', 1)]

In [8]:
# 取得特定組別
print(grouped.get_group(('FL', 1)))

                                         INSTNM             CITY STABBR  HBCU  \
712              The Baptist College of Florida       Graceville     FL   0.0   
713                            Barry University            Miami     FL   0.0   
714       Gooding Institute of Nurse Anesthesia      Panama City     FL   0.0   
715                  Bethune-Cookman University    Daytona Beach     FL   1.0   
724                  Johnson University Florida        Kissimmee     FL   0.0   
...                                         ...              ...    ...   ...   
7486    Strayer University-Coral Springs Campus    Coral Springs     FL   NaN   
7487  Strayer University-Fort Lauderdale Campus  Fort Lauderdale     FL   NaN   
7488          Strayer University-Miramar Campus          Miramar     FL   NaN   
7489                   Strayer University-Doral            Miami     FL   NaN   
7490                Strayer University-Brickell            Miami     FL   NaN   

      MENONLY  WOMENONLY  R

In [11]:
# 迭代顯示每個物件的資料
from IPython.display import display
for name, group in grouped:
    print(name)
    display(group.head(3))
    break

('AK', np.int64(0))


Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
60,University of Alaska Anchorage,Anchorage,AK,0.0,0.0,0.0,0,,,0.0,...,0.098,0.0181,0.0457,0.4539,1,0.2385,0.2647,0.4386,42500,19449.5
62,University of Alaska Fairbanks,Fairbanks,AK,0.0,0.0,0.0,0,,,0.0,...,0.0401,0.011,0.306,0.3887,1,0.2263,0.255,0.4519,36200,19355.0
63,University of Alaska Southeast,Juneau,AK,0.0,0.0,0.0,0,,,0.0,...,0.0686,0.0049,0.2241,0.5112,1,0.1769,0.1996,0.555,37400,16875.0


In [12]:
# 只印出每一組的第一列
print(grouped.head(1))

                                                INSTNM        CITY STABBR  \
0                             Alabama A & M University      Normal     AL   
2                                   Amridge University  Montgomery     AL   
43                          Prince Institute-Southeast    Elmhurst     IL   
60                      University of Alaska Anchorage   Anchorage     AK   
61                                Alaska Bible College      Palmer     AK   
...                                                ...         ...    ...   
4561                   College of the Marshall Islands      Majuro     MH   
5289                        Pacific Islands University    Mangilao     GU   
6439                           Touro University Nevada   Henderson     NV   
7404  University of the Virgin Islands-Albert A. Sheen   St. Croix     VI   
7419                 Computer Career Center-Las Cruces  Las Cruces     NM   

      HBCU  MENONLY  WOMENONLY  RELAFFIL  SATVRMID  SATMTMID  DISTANCEONLY 

In [13]:
# 選出首列與最後一列
print(grouped.nth([1, -1]))

                                                 INSTNM              CITY  \
1                   University of Alabama at Birmingham        Birmingham   
10                          Birmingham Southern College        Birmingham   
62                       University of Alaska Fairbanks         Fairbanks   
64                            Alaska Pacific University         Anchorage   
70                 Empire Beauty School-Paradise Valley           Phoenix   
...                                                 ...               ...   
7519               Strayer University-Charleston Campus  North Charleston   
7531                  Rasmussen College - Overland Park     Overland Park   
7532  National Personal Training Institute of Cleveland  Highland Heights   
7533  Bay Area Medical Academy - San Jose Satellite ...          San Jose   
7534            Excel Learning Center-San Antonio South       San Antonio   

     STABBR  HBCU  MENONLY  WOMENONLY  RELAFFIL  SATVRMID  SATMTMID  \
1   

## 9.7 過濾特定的組別

In [2]:
# 載入資料集
college = pd.read_csv('../../data/college.csv', index_col='INSTNM')
grouped = college.groupby('STABBR')
grouped.ngroups

59

In [3]:
# 找各州內的所有大學中的非白人百分比是否超過特定閥值
def check_minority(df, threshold):
    minority_pct = 1 - df['UGDS_WHITE']
    total_minority = (df['UGDS'] * minority_pct).sum()

    total_ugds = df['UGDS'].sum()
    total_minority_pct = total_minority / total_ugds

    return total_minority_pct > threshold

In [4]:
# 找超過50% 的州
college_filtered = grouped.filter(check_minority, threshold=.5)
print(college_filtered)

                                                           CITY STABBR  HBCU  \
INSTNM                                                                         
Everest College-Phoenix                                 Phoenix     AZ   0.0   
Collins College                                         Phoenix     AZ   0.0   
Empire Beauty School-Paradise Valley                    Phoenix     AZ   0.0   
Empire Beauty School-Tucson                              Tucson     AZ   0.0   
Thunderbird School of Global Management                Glendale     AZ   0.0   
...                                                         ...    ...   ...   
WestMed College - Merced                                 Merced     CA   NaN   
Vantage College                                         El Paso     TX   NaN   
SAE Institute of Technology  San Francisco           Emeryville     CA   NaN   
Bay Area Medical Academy - San Jose Satellite L...     San Jose     CA   NaN   
Excel Learning Center-San Antonio South 

In [5]:
# 比較原始數據與篩選後的數據
print(college.shape)

print(college_filtered.shape)

print(college_filtered['STABBR'].nunique())

(7535, 26)
(3028, 26)
20


## 9.8 分組轉換特定欄位的資料

In [6]:
# 載入資料集
weight_loss = pd.read_csv('../../data/weight_loss.csv')
print(weight_loss.query('Month == "Jan"'))

  Name Month    Week  Weight
0  Bob   Jan  Week 1     291
1  Amy   Jan  Week 1     197
2  Bob   Jan  Week 2     288
3  Amy   Jan  Week 2     189
4  Bob   Jan  Week 3     283
5  Amy   Jan  Week 3     189
6  Bob   Jan  Week 4     283
7  Amy   Jan  Week 4     190


In [7]:
# 算出每月第一周與最後一周的變化的函式
def percent_loss(s):
    return((s - s.iloc[0]) / s.iloc[0]) * 100

In [8]:
# 算出Bob 一月的體重變化
print(weight_loss
        .query('Name=="Bob" and Month=="Jan"')
        ['Weight']
        .pipe(percent_loss))

0    0.000000
2   -1.030928
4   -2.749141
6   -2.749141
Name: Weight, dtype: float64


In [12]:
# 傳入兩人的資料
print(weight_loss
        .groupby(['Name', 'Month'])
        ['Weight']
        .transform(percent_loss))

0     0.000000
1     0.000000
2    -1.030928
3    -4.060914
4    -2.749141
5    -4.060914
6    -2.749141
7    -3.553299
8     0.000000
9     0.000000
10   -2.826855
11   -3.157895
12   -5.300353
13   -6.842105
14   -5.300353
15   -8.947368
16    0.000000
17    0.000000
18    1.119403
19    0.000000
20   -1.119403
21   -1.734104
22   -2.611940
23   -1.734104
24    0.000000
25    0.000000
26   -1.149425
27   -3.529412
28   -3.065134
29   -3.529412
30   -4.214559
31   -5.294118
Name: Weight, dtype: float64


In [13]:
# 把算出來的資料放回原本的df中
print(weight_loss
        .assign(percent_loss=(weight_loss
            .groupby(['Name', 'Month'])
            ['Weight']
            .transform(percent_loss)
            .round(1)))
        .query('Name=="Bob" and Month in ["Jan", "Feb"]')
)
        

   Name Month    Week  Weight  percent_loss
0   Bob   Jan  Week 1     291           0.0
2   Bob   Jan  Week 2     288          -1.0
4   Bob   Jan  Week 3     283          -2.7
6   Bob   Jan  Week 4     283          -2.7
8   Bob   Feb  Week 1     283           0.0
10  Bob   Feb  Week 2     275          -2.8
12  Bob   Feb  Week 3     268          -5.3
14  Bob   Feb  Week 4     268          -5.3


In [14]:
# 看最後一周 就可以決定贏家
print(weight_loss
        .assign(percent_loss=(weight_loss
            .groupby(['Name', 'Month'])
            ['Weight']
            .transform(percent_loss)
            .round(1)))
        .query('Week == "Week 4"')
)

   Name Month    Week  Weight  percent_loss
6   Bob   Jan  Week 4     283          -2.7
7   Amy   Jan  Week 4     190          -3.6
14  Bob   Feb  Week 4     268          -5.3
15  Amy   Feb  Week 4     173          -8.9
22  Bob   Mar  Week 4     261          -2.6
23  Amy   Mar  Week 4     170          -1.7
30  Bob   Apr  Week 4     250          -4.2
31  Amy   Apr  Week 4     161          -5.3


In [16]:
# pivot 一下
print(weight_loss
        .assign(percent_loss=(weight_loss
            .groupby(['Name', 'Month'])
            ['Weight']
            .transform(percent_loss)
            .round(1)))
        .query('Week == "Week 4"')
        .pivot(index='Month', columns='Name',
               values='percent_loss')
)

Name   Amy  Bob
Month          
Apr   -5.3 -4.2
Feb   -8.9 -5.3
Jan   -3.6 -2.7
Mar   -1.7 -2.6


In [17]:
# 用 np.where() 放入獲勝者
print(weight_loss
        .assign(percent_loss=(weight_loss
            .groupby(['Name', 'Month'])
            ['Weight']
            .transform(percent_loss)
            .round(1)))
        .query('Week == "Week 4"')
        .pivot(index='Month', columns='Name',
               values='percent_loss')
        .assign(winner=lambda df_:
                np.where(df_.Amy < df_.Bob, 'Amy', 'Bob'))
)

Name   Amy  Bob winner
Month                 
Apr   -5.3 -4.2    Amy
Feb   -8.9 -5.3    Amy
Jan   -3.6 -2.7    Amy
Mar   -1.7 -2.6    Bob


In [19]:
# 算出獲勝次數
print(weight_loss
        .assign(percent_loss=(weight_loss
            .groupby(['Name', 'Month'])
            ['Weight']
            .transform(percent_loss)
            .round(1)))
        .query('Week == "Week 4"')
        .pivot(index='Month', columns='Name',
               values='percent_loss')
        .assign(winner=lambda df_:
                np.where(df_.Amy < df_.Bob, 'Amy', 'Bob'))
        .winner
        .value_counts()
)

winner
Amy    3
Bob    1
Name: count, dtype: int64


In [21]:
# 轉換月份排序
print(weight_loss
        .assign(percent_loss=(weight_loss
            .groupby(['Name', 'Month'])
            ['Weight']
            .transform(percent_loss)
            .round(1)),
                Month=pd.Categorical(weight_loss.Month,
                      categories=['Jan', 'Feb', 'Mar', 'Apr'],
                      ordered=True))
        .query('Week == "Week 4"')
        .pivot(index='Month', columns='Name',
               values='percent_loss')
)

Name   Amy  Bob
Month          
Jan   -3.6 -2.7
Feb   -8.9 -5.3
Mar   -1.7 -2.6
Apr   -5.3 -4.2


## 9.9 使用apply() 計算加權平均數

In [22]:
# 載入資料集
college = pd.read_csv('../../data/college.csv')
# 刪除這三個欄位的缺失值
subset = ['UGDS', 'SATMTMID', 'SATVRMID']
college2 = college.dropna(subset=subset)
print(college.shape)
print(college2.shape)


(7535, 27)
(1184, 27)


In [23]:
# 定義一個計算SAT 數學成績的加權平均
def weighted_math_average(df):
    weighted_math = df['UGDS'] * df['SATMTMID']
    return int(weighted_math.sum() / df['UGDS'].sum())

In [24]:
# 用在groupby 各州後的資料 可以回傳純量
print(college2.groupby('STABBR').apply(weighted_math_average))

STABBR
AK    503
AL    536
AR    529
AZ    569
CA    564
CO    553
CT    545
DC    621
DE    569
FL    565
GA    540
HI    534
IA    577
ID    509
IL    594
IN    546
KS    491
KY    525
LA    549
MA    597
MD    572
ME    524
MI    586
MN    598
MO    576
MS    527
MT    551
NC    552
ND    546
NE    567
NH    561
NJ    554
NM    529
NV    516
NY    578
OH    569
OK    557
OR    540
PA    553
PR    571
RI    567
SC    549
SD    528
TN    544
TX    548
UT    577
VA    550
VI    390
VT    566
WA    555
WI    593
WV    500
WY    540
dtype: int64


  print(college2.groupby('STABBR').apply(weighted_math_average))


In [26]:
# 算出每個科目的加權平均和算術平均 大學數量
def weighted_average(df):
    weight_m = df['UGDS'] * df['SATMTMID']
    weight_v = df['UGDS'] * df['SATVRMID']
    wm_avg = weight_m.sum() / df['UGDS'].sum()
    wv_avg = weight_v.sum() / df['UGDS'].sum()
    data = {'w_math_avg': wm_avg,
            'w_verbal_avg': wv_avg,
            'math_avg': df['SATMTMID'].mean(),
            'verbal_avg': df['SATVRMID'].mean(),
            'count': len(df)
            }
    return pd.Series(data)

In [None]:
# 用全部算
print(weighted_average(college2))

w_math_avg       559.408812
w_verbal_avg     542.989462
math_avg         530.958615
verbal_avg       522.775338
count           1184.000000
dtype: float64


In [29]:
# 用各州分組
print(college2
        .groupby('STABBR')
        .apply(weighted_average, include_groups=False)
        .astype(int))

        w_math_avg  w_verbal_avg  math_avg  verbal_avg  count
STABBR                                                       
AK             503           555       503         555      1
AL             536           533       504         508     21
AR             529           504       515         491     16
AZ             569           557       536         538      6
CA             564           539       562         549     72
CO             553           547       540         537     14
CT             545           533       522         517     14
DC             621           623       588         589      6
DE             569           553       495         486      3
FL             565           565       521         529     38
GA             540           541       503         509     42
HI             534           513       513         497      4
IA             577           533       534         511     23
ID             509           510       504         510      7
IL      

In [32]:
# 回傳df
from scipy.stats import gmean, hmean
def calculate_means(df):
    df_means = pd.DataFrame(index=['Arithmetic', 'Weighted',
                                   'Geometric', 'Harmonic'])
    cols = ['SATMTMID', 'SATVRMID']
    for col in cols:
        arithmetic = df[col].mean()
        weighted = np.average(df[col], weights=df['UGDS'])
        geometric = gmean(df[col])
        harmonic = hmean(df[col])
        df_means[col] = [arithmetic, weighted, geometric, harmonic]
    df_means['count'] = len(df)
    return df_means.astype(int)
print(college2
        .groupby('STABBR')
        .apply(calculate_means, include_groups = False))

                   SATMTMID  SATVRMID  count
STABBR                                      
AK     Arithmetic       503       555      1
       Weighted         503       555      1
       Geometric        503       555      1
       Harmonic         503       555      1
AL     Arithmetic       504       508     21
...                     ...       ...    ...
WV     Harmonic         480       472     17
WY     Arithmetic       540       535      1
       Weighted         540       535      1
       Geometric        540       534      1
       Harmonic         540       535      1

[212 rows x 3 columns]


## 9.10 以連續變化的數值進行分組

In [3]:
# 載入資料集
flights = pd.read_csv('../../data/flights.csv')

In [4]:
# 將飛行距離分成5個區間
bins = [-np.inf, 200, 500, 1000, 2000, np.inf]
cuts = pd.cut(flights['DIST'], bins=bins)
print(cuts)

0         (500.0, 1000.0]
1        (1000.0, 2000.0]
2         (500.0, 1000.0]
3        (1000.0, 2000.0]
4        (1000.0, 2000.0]
               ...       
58487    (1000.0, 2000.0]
58488      (200.0, 500.0]
58489      (200.0, 500.0]
58490     (500.0, 1000.0]
58491     (500.0, 1000.0]
Name: DIST, Length: 58492, dtype: category
Categories (5, interval[float64, right]): [(-inf, 200.0] < (200.0, 500.0] < (500.0, 1000.0] < (1000.0, 2000.0] < (2000.0, inf]]


In [5]:
# 查看分組結果
print(cuts.value_counts())

DIST
(500.0, 1000.0]     20659
(200.0, 500.0]      15874
(1000.0, 2000.0]    14186
(2000.0, inf]        4054
(-inf, 200.0]        3719
Name: count, dtype: int64


In [11]:
# 拿來分組 並算出各航空的相對頻率
print(flights
        .groupby(cuts, observed=False)
        ['AIRLINE']
        .value_counts(normalize=True)
        .round(3))

DIST           AIRLINE
(-inf, 200.0]  OO         0.326
               EV         0.289
               MQ         0.211
               DL         0.086
               AA         0.052
                          ...  
(2000.0, inf]  AS         0.012
               F9         0.004
               EV         0.000
               MQ         0.000
               OO         0.000
Name: proportion, Length: 70, dtype: float64


In [8]:
# 也可以拿來算飛行時間的四分位數
print(flights
        .groupby(cuts, observed=False)
        ['AIR_TIME']
        .quantile(q=[.25, .5, .75])
        .div(60)
        .round(2))

DIST                  
(-inf, 200.0]     0.25    0.43
                  0.50    0.50
                  0.75    0.57
(200.0, 500.0]    0.25    0.77
                  0.50    0.92
                  0.75    1.05
(500.0, 1000.0]   0.25    1.43
                  0.50    1.65
                  0.75    1.92
(1000.0, 2000.0]  0.25    2.50
                  0.50    2.93
                  0.75    3.40
(2000.0, inf]     0.25    4.30
                  0.50    4.70
                  0.75    5.03
Name: AIR_TIME, dtype: float64


## 9.11 案例演練: 計算城市之間的航班總數

In [12]:
# 載入資料集
flights = pd.read_csv('../../data/flights.csv')

In [13]:
# 計算起點與終點有多少種組合
flights_ct = flights.groupby(['ORG_AIR', 'DEST_AIR']).size()
print(flights_ct)

ORG_AIR  DEST_AIR
ATL      ABE          31
         ABQ          16
         ABY          19
         ACY           6
         AEX          40
                    ... 
SFO      SNA         122
         STL          20
         SUN          10
         TUS          20
         XNA           2
Length: 1130, dtype: int64


In [15]:
# 找出IAH ATL 的雙線航班
print(flights_ct.loc[[('ATL', 'IAH'), ('IAH', 'ATL')]])

ORG_AIR  DEST_AIR
ATL      IAH         121
IAH      ATL         148
dtype: int64


In [16]:
# 做出新的標籤
f_part3 = (flights
           [['ORG_AIR', 'DEST_AIR']]
           .apply(lambda ser:
                  ser.sort_values().reset_index(drop=True),
                  axis='columns'))
print(f_part3)

         0    1
0      LAX  SLC
1      DEN  IAD
2      DFW  VPS
3      DCA  DFW
4      LAX  MCI
...    ...  ...
58487  DFW  SFO
58488  LAS  SFO
58489  SBA  SFO
58490  ATL  MSP
58491  BOI  SFO

[58492 rows x 2 columns]


In [None]:
# 重新命名欄位名稱
rename_dict = {0:'AIR1', 1:'AIR2'}
print(flights
           [['ORG_AIR', 'DEST_AIR']]
           .apply(lambda ser:
                  ser.sort_values().reset_index(drop=True),
                  axis='columns')
            .rename(columns=rename_dict)
            .groupby(['AIR1', 'AIR2'])
            .size())

AIR1  AIR2
ABE   ATL      31
      ORD      24
ABI   DFW      74
ABQ   ATL      16
      DEN      46
             ... 
SFO   SNA     122
      STL      20
      SUN      10
      TUS      20
      XNA       2
Length: 1085, dtype: int64


In [18]:
# 計算ATL 與IAH 之間航班數
rename_dict = {0:'AIR1', 1:'AIR2'}
print(flights
           [['ORG_AIR', 'DEST_AIR']]
           .apply(lambda ser:
                  ser.sort_values().reset_index(drop=True),
                  axis='columns')
            .rename(columns=rename_dict)
            .groupby(['AIR1', 'AIR2'])
            .size()
            .loc[('ATL', 'IAH')])

269


In [19]:
# 改用np.sort()
data_sorted = np.sort(flights[['ORG_AIR', 'DEST_AIR']])

# 改為df 並重新命名欄位名稱
flights_sort2 = pd.DataFrame(data_sorted, columns=['AIR1', 'AIR2'])

# 比較結果
flights_sort2.equals(f_part3.rename(columns={0:'AIR1', 1:'AIR2'}))

True

## 9.12 案例演練: 尋找航班的連續準時紀錄

In [20]:
# 先用一個簡單的Series 練習找出連續紀錄
s = pd.Series([0, 1, 1, 0, 1, 1, 1, 0])
print(s)

0    0
1    1
2    1
3    0
4    1
5    1
6    1
7    0
dtype: int64


In [21]:
# 使用cumsum()
s1 = s.cumsum()
print(s1)

0    0
1    1
2    2
3    2
4    3
5    4
6    5
7    5
dtype: int64


In [23]:
# 將s1 乘上 s
print(s.mul(s1))

0    0
1    1
2    2
3    0
4    3
5    4
6    5
7    0
dtype: int64


In [25]:
# 使用diff()
print(s.mul(s1).diff())

0    NaN
1    1.0
2    1.0
3   -2.0
4    3.0
5    1.0
6    1.0
7   -5.0
dtype: float64


In [27]:
# 只保留負數值
print(s.mul(s.cumsum())
  .diff()
  .where(lambda x: x < 0))

0    NaN
1    NaN
2    NaN
3   -2.0
4    NaN
5    NaN
6    NaN
7   -5.0
dtype: float64


In [30]:
# 填補缺失值
print(s.mul(s.cumsum())
  .diff()
  .where(lambda x: x < 0)
  .ffill())

0    NaN
1    NaN
2    NaN
3   -2.0
4   -2.0
5   -2.0
6   -2.0
7   -5.0
dtype: float64


In [None]:
# 與cumsum() 結果相加
print(s.mul(s.cumsum())
  .diff()
  .where(lambda x: x < 0)
  .ffill()
  .add(s.cumsum(), fill_value=0)
)

0    0.0
1    1.0
2    2.0
3    0.0
4    1.0
5    2.0
6    3.0
7    0.0
dtype: float64


In [32]:
# 載入資料集
flights = pd.read_csv('../../data/flights.csv')

In [33]:
# 把前面的計數器改寫為函式
def max_streak(s):
    s1 = s.cumsum()
    return (s.mul(s1)
            .diff()
            .where(lambda x: x < 0)
            .ffill()
            .add(s.cumsum(), fill_value=0)
            .max())

In [None]:
# 套用函式 找出每家航空的最大連續準時紀錄
print(flights
        .assign(ON_TIME=flights['ARR_DELAY'].lt(15).astype(int))
        .sort_values(['MONTH', 'DAY', 'SCHED_DEP'])
        .groupby(['AIRLINE', 'ORG_AIR'])
        ['ON_TIME']
        .agg(['mean', 'size', max_streak])
        .round(2))

                 mean  size  max_streak
AIRLINE ORG_AIR                        
AA      ATL      0.82   233        15.0
        DEN      0.74   219        17.0
        DFW      0.78  4006        64.0
        IAH      0.80   196        24.0
        LAS      0.79   374        29.0
...               ...   ...         ...
WN      LAS      0.77  2031        39.0
        LAX      0.70  1135        23.0
        MSP      0.84   237        32.0
        PHX      0.77  1724        33.0
        SFO      0.76   445        17.0

[114 rows x 3 columns]
