In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy 0/1 indicator used to develop the streak logic: three hits followed by
# a miss, repeated twice.
s = pd.Series([1, 1, 1, 0] * 2)
s

0    1
1    1
2    1
3    0
4    1
5    1
6    1
7    0
dtype: int64

In [3]:
# Running total of the flags. It never resets at a 0, so on its own it is not
# the streak length yet; the following cells subtract the total accumulated
# before the most recent 0 to turn it into one.
s1 = s.cumsum()
s1

0    1
1    2
2    3
3    3
4    4
5    5
6    6
7    6
dtype: int64

In [4]:
# Multiplying by s zeroes the running total wherever s is 0. After diff(), a 0
# that follows a 1 shows up as a negative jump equal to minus the running
# total reached so far (-3 and -6 in this example).
(s * s1).diff()

0    NaN
1    1.0
2    1.0
3   -3.0
4    4.0
5    1.0
6    1.0
7   -6.0
dtype: float64

In [5]:
# Keep only the negative jumps (the reset points); every other row becomes NaN.
(
    (s * s1)
    .diff()
    .where(lambda jump: jump < 0)
)

0    NaN
1    NaN
2    NaN
3   -3.0
4    NaN
5    NaN
6    NaN
7   -6.0
dtype: float64

In [6]:
# Carry each reset value forward and add it back to the running total: the
# result counts consecutive 1s and drops to 0 at every 0. Rows before the first
# reset are NaN after ffill(), so fill_value=0 leaves the plain running total.
(
    (s * s1)
    .diff()
    .where(lambda jump: jump < 0)
    .ffill()
    .add(s1, fill_value=0)
)

0    1.0
1    2.0
2    3.0
3    0.0
4    1.0
5    2.0
6    3.0
7    0.0
dtype: float64

In [7]:
# Load the flights table and add ON_TIME: 1 when ARR_DELAY is below 15, else 0
# (presumably minutes -- confirm against the data dictionary).
# Note: lt() evaluates to False for a missing ARR_DELAY, so any such row gets
# ON_TIME = 0.
flights = pd.read_csv('data/flights.csv').assign(
    ON_TIME=lambda frame: frame['ARR_DELAY'].lt(15).astype(int)
)
flights[['AIRLINE', 'ORG_AIR', 'ON_TIME']].head(10)

Unnamed: 0,AIRLINE,ORG_AIR,ON_TIME
0,WN,LAX,0
1,UA,DEN,1
2,MQ,DFW,0
3,AA,DFW,1
4,WN,LAX,0
5,UA,IAH,1
6,AA,DFW,0
7,F9,SFO,1
8,AA,ORD,1
9,UA,IAH,1


In [8]:
def max_streak(s):
    """Return the length of the longest run of consecutive ones in ``s``.

    Parameters
    ----------
    s : pd.Series
        Indicator values; the arithmetic below assumes they are 0/1 flags.
        Row order defines what "consecutive" means.

    Returns
    -------
    float
        Largest running streak length. The value is a float because the
        intermediate ``diff()`` introduces NaN.
    """
    running_total = s.cumsum()
    # The running total zeroed at every 0; a 0 right after a 1 then produces a
    # negative step equal to minus the total accumulated so far.
    jumps = (s * running_total).diff()
    # Keep only those negative steps and carry each one forward until the next.
    resets = jumps.where(jumps < 0).ffill()
    # Running total minus the total at the last reset == current streak length.
    streak_lengths = resets.add(running_total, fill_value=0)
    return streak_lengths.max()

In [9]:
# Per (airline, origin airport): share of on-time flights, number of flights,
# and the longest on-time streak. Rows are put in schedule order first because
# max_streak treats row order as the sequence of flights.
(
    flights
    .sort_values(['MONTH', 'DAY', 'SCHED_DEP'])
    .groupby(['AIRLINE', 'ORG_AIR'])['ON_TIME']
    .agg(['mean', 'size', max_streak])
    .round(2)
    .head()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,size,max_streak
AIRLINE,ORG_AIR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AA,ATL,0.82,233,15
AA,DEN,0.74,219,17
AA,DFW,0.78,4006,64
AA,IAH,0.8,196,24
AA,LAS,0.79,374,29


In [10]:
def max_delay_streak(df):
    """Locate the longest run of consecutive delayed flights in ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``ON_TIME`` (0/1 flag, 1 = on time),
        ``MONTH`` and ``DAY``. Row order defines the flight sequence, so the
        caller is expected to have sorted the frame chronologically.

    Returns
    -------
    pd.DataFrame
        Two rows indexed ``'first'`` and ``'last'`` (index name
        ``'streak_row'``) holding the ``MONTH`` and ``DAY`` of the first and
        last flight of the longest delay streak, plus an integer ``streak``
        column with its length. If several streaks tie, the earliest one is
        reported. If ``df`` contains no delayed flight (or is empty) an empty
        frame with the same columns is returned, because there is no streak
        to describe.
    """
    # Positional 0..n-1 labels so that "label minus streak length" arithmetic
    # below addresses the right rows.
    df = df.reset_index(drop=True)
    s = 1 - df['ON_TIME']          # 1 = delayed, 0 = on time
    s1 = s.cumsum()
    # Running count of consecutive delays that resets to 0 at every on-time
    # flight (same construction as max_streak).
    streak = s.mul(s1).diff().where(lambda x: x < 0) \
              .ffill().add(s1, fill_value=0)

    longest = streak.max()
    # `not longest > 0` is also True for NaN, which is what max() yields for
    # an empty group. Without this guard a delay-free group would report the
    # unrelated rows 1 and 0 (or fail when row 1 does not exist).
    if not longest > 0:
        df_return = (df.iloc[:0][['MONTH', 'DAY']]
                       .assign(streak=0)
                       .astype({'streak': 'int64'}))
        df_return.index = pd.Index([], dtype=object, name='streak_row')
        return df_return

    # streak is float (diff() introduces NaN); convert before using the
    # values as row labels so .loc is not handed float keys.
    longest = int(longest)
    last_idx = int(streak.idxmax())
    first_idx = last_idx - longest + 1
    df_return = df.loc[[first_idx, last_idx], ['MONTH', 'DAY']] \
                  .assign(streak=longest)
    df_return.index = pd.Index(['first', 'last'], name='streak_row')
    return df_return

In [11]:
# Longest delay streak for every (airline, origin airport) pair. Only the
# columns max_delay_streak reads are selected, so the grouping columns are not
# passed into the applied function (deprecated in pandas 2.2+).
streaks = (
    flights
    .sort_values(['MONTH', 'DAY', 'SCHED_DEP'])
    .groupby(['AIRLINE', 'ORG_AIR'])[['MONTH', 'DAY', 'ON_TIME']]
    .apply(max_delay_streak)
)

# Rank whole streaks rather than individual rows. Sorting row-by-row on
# MONTH/DAY can interleave the 'first'/'last' rows of two groups that tie on
# streak length, so every row is given its group's *start* date as the
# tie-breaker, followed by the group keys and finally streak_row
# ('first' < 'last') to keep each pair adjacent and in order.
streak_start = (
    streaks
    .groupby(level=['AIRLINE', 'ORG_AIR'])[['MONTH', 'DAY']]
    .transform('first')
)

# Ten rows = the five longest streaks (two rows per streak).
(
    streaks
    .assign(START_MONTH=streak_start['MONTH'],
            START_DAY=streak_start['DAY'])
    .sort_values(
        ['streak', 'START_MONTH', 'START_DAY',
         'AIRLINE', 'ORG_AIR', 'streak_row'],
        ascending=[False, True, True, True, True, True],
    )
    .drop(columns=['START_MONTH', 'START_DAY'])
    .head(10)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,MONTH,DAY,streak
AIRLINE,ORG_AIR,streak_row,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AA,DFW,first,2.0,26.0,38.0
AA,DFW,last,3.0,1.0,38.0
MQ,ORD,first,1.0,6.0,28.0
MQ,ORD,last,1.0,12.0,28.0
MQ,DFW,first,2.0,21.0,25.0
MQ,DFW,last,2.0,26.0,25.0
NK,ORD,first,6.0,7.0,15.0
NK,ORD,last,6.0,18.0,15.0
DL,ATL,first,12.0,23.0,14.0
DL,ATL,last,12.0,24.0,14.0
