In [2]:
import os
import pandas as pd
import numpy as np

# Switch the working directory to the cookbook data folder so that the
# relative 'data/...' paths passed to pd.read_csv below resolve.
# NOTE: the target is relative to the *current* working directory, so running
# this cell a second time without restarting the kernel starts from the new
# location and will most likely fail.
os.chdir("../../coding_data/cookbook_data/")
os.getcwd()  # last expression: displayed to confirm the new location

"C:\\Users\\k2035549\\OneDrive - King's College London\\Desktop\\Coding\\coding_data\\cookbook_data"

Read data

In [3]:
# Load the two datasets used throughout these notes; both paths are relative
# to the working directory set with os.chdir in the first cell.
college = pd.read_csv('data/college.csv')
flights = pd.read_csv('data/flights.csv')

# Groupby notes

With any kind of grouping operation, it helps to identify the three components:
* grouping columns
* aggregating columns
* aggregating functions

groupby object has four methods that accept a function to perform a calculation on each group.
* .agg (must return scalar)(applies to all columns)
* .filter (return a boolean)
* .transform (return series or df with the same length as the passed group)
* .apply (return scalar, series or df)
    * can create multiple columns but must return Series if so

# Groupby parameters

In [None]:
# as_index=False keeps the grouping column as a regular column, so the
# result has a default integer index instead of AIRLINE as the index
(flights
    .groupby(['AIRLINE'], as_index=False)
    ['DIST']
    .agg('mean')
    .round(0)
)

# No explosion in rows when a groupby column is type categorical:
# observed=True keeps only the category combinations that occur in the data
res_no_explosion = (flights
    .assign(ORG_AIR=flights.ORG_AIR.astype('category'))
    .groupby(['ORG_AIR', 'DEST_AIR'], observed=True)  # <- the key argument
    .agg({'CANCELLED':['sum', 'mean', 'size'],
          'AIR_TIME':['mean', 'var']})
)

# Examples

# Example 1: 
# Grouping and aggregating with multiple columns and functions
# Flattening column names

In [6]:
# define function
def flatten_cols(df):
    """Flatten MultiIndex column labels into single '_'-joined strings.

    Each tuple label such as ``('CANCELLED', 'sum')`` becomes
    ``'CANCELLED_sum'``. Level values that are not strings (e.g. integers)
    are converted with ``str`` before joining. Labels that are not tuples
    (the frame already has flat columns) are left untouched.

    The column labels are replaced on ``df`` itself and the same object is
    returned, so the function can be used with ``.pipe``.
    """
    df.columns = [
        # only tuples come from a MultiIndex; joining a plain string label
        # would split it into characters ('abc' -> 'a_b_c')
        '_'.join(str(level) for level in label) if isinstance(label, tuple) else label
        for label in df.columns.to_flat_index()
    ]
    return df

# group by multiple columns and functions and flatten names
res = (flights
    .groupby(['ORG_AIR', 'DEST_AIR'])
    .agg({'CANCELLED':['sum', 'mean', 'size'],
          'AIR_TIME':['mean', 'var']})
    .pipe(flatten_cols)
)

res.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED_sum,CANCELLED_mean,CANCELLED_size,AIR_TIME_mean,AIR_TIME_var
ORG_AIR,DEST_AIR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ATL,ABE,0,0.0,31,96.387097,45.778495
ATL,ABQ,0,0.0,16,170.5,87.866667
ATL,ABY,0,0.0,19,28.578947,6.590643


# Example 2: Avoiding combinatoric explosion

In [10]:
# with explosion: observed=False produces a row for every ORG_AIR category
# combined with every DEST_AIR value, including pairs that never occur.
# It is passed explicitly because relying on the default is deprecated in
# recent pandas versions, where the default is slated to change.
res_explosion = (flights
    .assign(ORG_AIR=flights.ORG_AIR.astype('category'))
    .groupby(['ORG_AIR', 'DEST_AIR'], observed=False)
    .agg({'CANCELLED':['sum', 'mean', 'size'],
          'AIR_TIME':['mean', 'var']})
)
# without explosion: observed=True keeps only the pairs present in the data
res_no_explosion = (flights
    .assign(ORG_AIR=flights.ORG_AIR.astype('category'))
    .groupby(['ORG_AIR', 'DEST_AIR'], observed=True)
    .agg({'CANCELLED':['sum', 'mean', 'size'],
          'AIR_TIME':['mean', 'var']})
)

print(f'Difference in rows when using observed argument in groupby method: {len(res_explosion) - len(res_no_explosion)}')

Difference in rows when using observed argument in groupby method: 1580


# Example 3: Custom functions for aggregation

For each state, find the largest number of standard deviations by which any single institution's undergraduate population differs from the state mean

In [20]:
# function as there isn't a pandas or numpy one that does what we want above
def max_deviation(s):
    """Return the largest absolute z-score found in the Series ``s``.

    Every value is standardised with the mean and standard deviation of
    ``s``; the maximum of the absolute standardised values is returned.
    """
    centred = s - s.mean()
    z_scores = centred / s.std()
    return z_scores.abs().max()

# NB: the result can be NaN. The std of a group with a single value is NaN,
# so every standardised score in that group is NaN too (no exception is raised).
# Alternative that returns a Series instead of a DataFrame:
# (college
#     .groupby('STABBR')
#     ['UGDS']
#     .agg(max_deviation)
#     .round(1)
# )

# to output as df: mixes built-in aggregation names with the custom function
(college
    .groupby('STABBR')
    .agg({'UGDS':['mean', 'std', max_deviation]})
    .round(1)
).head(3)

Unnamed: 0_level_0,UGDS,UGDS,UGDS
Unnamed: 0_level_1,mean,std,max_deviation
STABBR,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
AK,2493.2,4051.7,2.6
AL,2789.9,4657.9,5.8
AR,1644.1,3142.8,6.3


In [24]:
# NB: the column label shown for a custom aggregation function comes from its
# __name__, so the label can be changed by reassigning that attribute.
# This mutates the function object for the rest of the session: any cell run
# afterwards that aggregates with max_deviation shows the new label.

max_deviation.__name__ = 'Max Deviation'

(college
    .groupby(['STABBR', 'RELAFFIL'])
    .agg({'UGDS':[max_deviation, 'mean', 'std'],
         'SATVRMID':[max_deviation, 'mean', 'std'],
         'SATMTMID':[max_deviation, 'mean', 'std']})
    .round(1)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,UGDS,UGDS,UGDS,SATVRMID,SATVRMID,SATVRMID,SATMTMID,SATMTMID,SATMTMID
Unnamed: 0_level_1,Unnamed: 1_level_1,Max Deviation,mean,std,Max Deviation,mean,std,Max Deviation,mean,std
STABBR,RELAFFIL,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
AK,0,2.1,3508.9,4539.5,,,,,,
AK,1,1.1,123.3,132.9,,555.0,,,503.0,
AL,0,5.2,3248.8,5102.4,1.6,514.9,56.5,1.7,515.8,56.7
AL,1,2.4,979.7,870.8,1.5,498.0,53.0,1.4,485.6,61.4
AR,0,5.8,1793.7,3401.6,1.9,481.1,37.9,2.0,503.6,39.0
...,...,...,...,...,...,...,...,...,...,...
WI,0,5.3,2879.1,5031.5,1.3,558.8,47.5,1.3,591.2,85.7
WI,1,3.4,1716.2,1934.6,2.1,500.1,66.0,1.8,526.6,42.5
WV,0,6.9,1873.9,6271.7,1.6,466.7,27.9,1.8,480.0,27.7
WV,1,1.3,716.4,503.6,1.9,485.7,14.6,1.7,484.8,17.7


# Example 3: Multiple aggregation functions where some of them need parameters

Find the percentage of schools by state and religious affiliation that have an undergrad population between two values

In [4]:
# percentage between 
def pct_between(s, low, high):
    """Return the percentage (0-100) of values in ``s`` lying between ``low`` and ``high``."""
    in_range = s.between(low, high)  # boolean mask, one flag per value
    share = in_range.mean()          # fraction of True flags
    return share * 100

# wrapper function
def between_n_m(n, m):
    """Build a one-argument aggregation function with the bounds baked in.

    The returned function takes a Series and returns ``pct_between`` of it for
    the bounds ``n`` and ``m``. Its ``__name__`` is set to ``between_<n>_<m>``
    so it can be passed in a list of aggregations to ``.agg`` and yields a
    descriptive column label.
    """
    label = f'between_{n}_{m}'

    def wrapper(ser):
        # the bounds are captured from the enclosing call
        return pct_between(ser, low=n, high=m)

    wrapper.__name__ = label
    return wrapper

In [5]:
# answering Q above
(college
    .groupby(['STABBR', 'RELAFFIL'])
    ['UGDS'] 
    .agg(pct_between, 1_000, 10_000)
    .round(1)
)

STABBR  RELAFFIL
AK      0           42.9
        1            0.0
AL      0           45.8
        1           37.5
AR      0           39.7
                    ... 
WI      0           31.0
        1           44.0
WV      0           29.2
        1           37.5
WY      0           72.7
Name: UGDS, Length: 112, dtype: float64

In [None]:
# answering Q above with other agg functions
(college
    .groupby(['STABBR', 'RELAFFIL'])
    ['UGDS'] 
    .agg([between_n_m(1_000, 10_000), 'max', 'mean'])
    .round(1)
)

# Example 4: Looking into groupby object

Define our groupby object

In [6]:
grouped = college.groupby(['STABBR', 'RELAFFIL'])

Code to look at groups within groupby object (first object)

In [7]:
for name, group in grouped:
    print(name)
    print(group)
    break

('AK', 0)
                                      INSTNM       CITY STABBR  HBCU  MENONLY  \
60            University of Alaska Anchorage  Anchorage     AK   0.0      0.0   
62            University of Alaska Fairbanks  Fairbanks     AK   0.0      0.0   
63            University of Alaska Southeast     Juneau     AK   0.0      0.0   
65    AVTEC-Alaska's Institute of Technology     Seward     AK   0.0      0.0   
66                 Charter College-Anchorage  Anchorage     AK   0.0      0.0   
67                     Alaska Career College  Anchorage     AK   0.0      0.0   
5171                       Ilisagvik College     Barrow     AK   0.0      0.0   

      WOMENONLY  RELAFFIL  SATVRMID  SATMTMID  DISTANCEONLY  ...  UGDS_2MOR  \
60          0.0         0       NaN       NaN           0.0  ...     0.0980   
62          0.0         0       NaN       NaN           0.0  ...     0.0401   
63          0.0         0       NaN       NaN           0.0  ...     0.0686   
65          0.0         0

Get first rows of each groupby 

In [None]:
grouped.head(2)

Grabbing rows from groupby object by position (second and last row of each group; positions are zero-based)

In [None]:
grouped.nth([1, -1])

# Filter groupby example

NB: the .filter method of a groupby object behaves differently from DataFrame.filter: you pass a function that returns True or False for each group, and only the groups for which it returns True are kept

Filtering for states with a minority majority

In [9]:
# define function
def check_minority(df, threshold):
    """Tell whether the enrollment-weighted non-white share of ``df`` exceeds ``threshold``.

    ``df`` must provide the columns 'UGDS' and 'UGDS_WHITE'. Each row's
    'UGDS' is multiplied by ``1 - UGDS_WHITE``; the sum of those products
    divided by the sum of 'UGDS' is compared with ``threshold``.
    """
    enrollment = df['UGDS']
    non_white_share = 1 - df['UGDS_WHITE']
    minority_students = (enrollment * non_white_share).sum()
    return minority_students / enrollment.sum() > threshold

# groupby object
# NOTE: this re-reads the file with INSTNM as the index and rebinds both
# `college` and `grouped`, so cells run after this one see a differently
# indexed frame and a different grouping than the cells above.
college = pd.read_csv('data/college.csv', index_col='INSTNM')
grouped = college.groupby('STABBR')

# filter for minority majority: check_minority is called once per state and
# all rows of a state are kept when it returns True
college_filtered = grouped.filter(check_minority, threshold=.5)
college_filtered

Unnamed: 0_level_0,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,UGDS,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Everest College-Phoenix,Phoenix,AZ,0.0,0.0,0.0,1,,,0.0,4102.0,...,0.0373,0.0,0.1026,0.4749,0,0.8291,0.7151,0.6700,28600,9500
Collins College,Phoenix,AZ,0.0,0.0,0.0,0,,,0.0,83.0,...,0.0241,0.0,0.3855,0.3373,0,0.7205,0.8228,0.4764,25700,47000
Empire Beauty School-Paradise Valley,Phoenix,AZ,0.0,0.0,0.0,1,,,0.0,25.0,...,0.0400,0.0,0.0000,0.1600,0,0.6349,0.5873,0.4651,17800,9588
Empire Beauty School-Tucson,Tucson,AZ,0.0,0.0,0.0,0,,,0.0,126.0,...,0.0000,0.0,0.0079,0.2222,1,0.7962,0.6615,0.4229,18200,9833
Thunderbird School of Global Management,Glendale,AZ,0.0,0.0,0.0,0,,,0.0,1.0,...,0.0000,0.0,0.0000,1.0000,0,0.0000,0.0000,0.0000,118900,PrivacySuppressed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WestMed College - Merced,Merced,CA,,,,1,,,,,...,,,,,1,,,,,15623.5
Vantage College,El Paso,TX,,,,1,,,,,...,,,,,1,,,,,9500
SAE Institute of Technology San Francisco,Emeryville,CA,,,,1,,,,,...,,,,,1,,,,,9500
Bay Area Medical Academy - San Jose Satellite Location,San Jose,CA,,,,1,,,,,...,,,,,1,,,,,PrivacySuppressed


In [15]:
print(f'Dataframe shape before filtering: {college.shape}')
print(f'Dataframe shape after filtering: {college_filtered.shape}')
print(f"Number of states remaining: {college_filtered['STABBR'].nunique()}")

Dataframe shape before filtering: (7535, 26)
Dataframe shape after filtering: (3028, 26)
Number of states remaining: 20


# .apply with groupby

Calculating weighted mean SAT scores per state with apply

In [17]:
# filter data set
subset = ['UGDS', 'SATMTMID', 'SATVRMID'] # undergrad enrollment, SAT math median, SAT verbal median
college2 = college.dropna(subset=subset)

# function to summarise the SAT medians of one group, weighted and unweighted
def weighted_average(df):
    """Return weighted and plain SAT averages for the rows of ``df`` as a Series.

    The weights are the 'UGDS' values. The Series has the entries
    'w_math_avg', 'w_verbal_avg', 'math_avg', 'verbal_avg' and 'count'
    (number of rows), which become columns when used with groupby.apply.
    """
    enrollment = df['UGDS']
    total_enrollment = enrollment.sum()
    math_scores = df['SATMTMID']
    verbal_scores = df['SATVRMID']
    return pd.Series({
        'w_math_avg': (enrollment * math_scores).sum() / total_enrollment,
        'w_verbal_avg': (enrollment * verbal_scores).sum() / total_enrollment,
        'math_avg': math_scores.mean(),
        'verbal_avg': verbal_scores.mean(),
        'count': len(df),
    })

# apply above function
(college2
    .groupby('STABBR')
    .apply(weighted_average)
    .astype(int)
).head(5)

Unnamed: 0_level_0,w_math_avg,w_verbal_avg,math_avg,verbal_avg,count
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AK,503,555,503,555,1
AL,536,533,504,508,21
AR,529,504,515,491,16
AZ,569,557,536,538,6
CA,564,539,562,549,72


Above we returned a single row as a series for each group.

Now we will return any number of rows and columns for each group by returning a df

In [18]:
from scipy.stats import gmean, hmean

# function to use
def calculate_means(df):
    """Return a small DataFrame with four kinds of mean for both SAT columns.

    Rows are 'Arithmetic', 'Weighted' (weights taken from 'UGDS'),
    'Geometric' and 'Harmonic'; columns are 'SATMTMID', 'SATVRMID' and
    'count' (number of rows in ``df``). All values are cast to int.
    """
    mean_kinds = ['Arithmetic', 'Weighted', 'Geometric', 'Harmonic']
    per_column = {
        col: [
            df[col].mean(),
            np.average(df[col], weights=df['UGDS']),
            gmean(df[col]),
            hmean(df[col]),
        ]
        for col in ('SATMTMID', 'SATVRMID')
    }
    summary = pd.DataFrame(per_column, index=mean_kinds)
    summary['count'] = len(df)  # scalar is broadcast to every row
    return summary.astype(int)

# chain operation
(college2
    .groupby('STABBR')
    .apply(calculate_means)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,SATMTMID,SATVRMID,count
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AK,Arithmetic,503,555,1
AK,Weighted,503,555,1
AK,Geometric,503,555,1
AK,Harmonic,503,555,1
AL,Arithmetic,504,508,21
...,...,...,...,...
WV,Harmonic,480,472,17
WY,Arithmetic,540,535,1
WY,Weighted,540,535,1
WY,Geometric,540,534,1


In [19]:
# demonstrate above functionality
df_means = pd.DataFrame(index=['Arithmetic', 'Weighted',
                                   'Geometric', 'Harmonic'])

In [20]:
df_means['SATMTMID'] = [1,2,3,4]

# Example: Grouping continuous variables

Looking to discover the distribution of airlines for different travel distances. This will allow us to find, for example, the airline that makes the most flights between 500 and 1,000 miles.

Going to use 'cut' function to discretize the distance of each flight flown.


Place flights into bins

In [23]:
bins = [-np.inf, 200, 500, 1000, 2000, np.inf]
cuts = pd.cut(flights['DIST'], bins=bins)
cuts

0         (500.0, 1000.0]
1        (1000.0, 2000.0]
2         (500.0, 1000.0]
3        (1000.0, 2000.0]
4        (1000.0, 2000.0]
               ...       
58487    (1000.0, 2000.0]
58488      (200.0, 500.0]
58489      (200.0, 500.0]
58490     (500.0, 1000.0]
58491     (500.0, 1000.0]
Name: DIST, Length: 58492, dtype: category
Categories (5, interval[float64]): [(-inf, 200.0] < (200.0, 500.0] < (500.0, 1000.0] < (1000.0, 2000.0] < (2000.0, inf]]

look at the distribution of flights in those respective bins

In [24]:
cuts.value_counts()

(500.0, 1000.0]     20659
(200.0, 500.0]      15874
(1000.0, 2000.0]    14186
(2000.0, inf]        4054
(-inf, 200.0]        3719
Name: DIST, dtype: int64

groupby

In [25]:
(flights
    .groupby(cuts)
    ['AIRLINE']
    .value_counts(normalize=True) 
    .round(3)
)

DIST              AIRLINE
(-inf, 200.0]     OO         0.326
                  EV         0.289
                  MQ         0.211
                  DL         0.086
                  AA         0.052
                  UA         0.027
                  WN         0.009
(200.0, 500.0]    WN         0.194
                  DL         0.189
                  OO         0.159
                  EV         0.156
                  MQ         0.100
                  AA         0.071
                  UA         0.062
                  VX         0.028
                  US         0.016
                  NK         0.012
                  B6         0.007
                  F9         0.005
                  AS         0.001
(500.0, 1000.0]   DL         0.206
                  AA         0.144
                  WN         0.138
                  UA         0.131
                  OO         0.106
                  EV         0.101
                  MQ         0.051
                  F9         

groupby via percentiles (for air time)

In [27]:
(flights
  .groupby(cuts)
  ['AIR_TIME']
  .quantile(q=[.25, .5, .75]) # 25th, 50th and 75th percentile airtime for each distance grouping
  .div(60)
  .round(2)
)

DIST                  
(-inf, 200.0]     0.25    0.43
                  0.50    0.50
                  0.75    0.57
(200.0, 500.0]    0.25    0.77
                  0.50    0.92
                  0.75    1.05
(500.0, 1000.0]   0.25    1.43
                  0.50    1.65
                  0.75    1.92
(1000.0, 2000.0]  0.25    2.50
                  0.50    2.93
                  0.75    3.40
(2000.0, inf]     0.25    4.30
                  0.50    4.70
                  0.75    5.03
Name: AIR_TIME, dtype: float64

adding strings to bins

In [32]:
labels=['Under 200 miles', '200-500 miles', '500-1k miles',
        '1-2k miles', '2k+ miles']
cuts2 = pd.cut(flights['DIST'], bins=bins, labels=labels)
(flights
   .groupby(cuts2)
   ['AIRLINE']
   .value_counts(normalize=True) 
   .round(3) 
   .unstack() 
)

AIRLINE,AA,AS,B6,DL,EV,F9,HA,MQ,NK,OO,UA,US,VX,WN
DIST,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Under 200 miles,0.052,,,0.086,0.289,,,0.211,,0.326,0.027,,,0.009
200-500 miles,0.071,0.001,0.007,0.189,0.156,0.005,,0.1,0.012,0.159,0.062,0.016,0.028,0.194
500-1k miles,0.144,0.023,0.003,0.206,0.101,0.038,,0.051,0.03,0.106,0.131,0.025,0.004,0.138
1-2k miles,0.264,0.016,0.003,0.165,0.016,0.031,,0.003,0.045,0.046,0.199,0.04,0.012,0.16
2k+ miles,0.212,0.012,0.08,0.171,,0.004,0.028,,0.019,,0.289,0.065,0.074,0.046


# Examples: Finding streaks 

Finding the longest streak of on-time arrivals (flights)


Add column to flights data that indicates if that flight was ontime

In [43]:
(flights
    .assign(ON_TIME=flights['ARR_DELAY'].lt(15).astype(int))
    [['AIRLINE', 'ORG_AIR', 'ON_TIME']]
)

Unnamed: 0,AIRLINE,ORG_AIR,ON_TIME
0,WN,LAX,0
1,UA,DEN,1
2,MQ,DFW,0
3,AA,DFW,1
4,WN,LAX,0
...,...,...,...
58487,AA,SFO,1
58488,F9,LAS,1
58489,OO,SFO,1
58490,WN,MSP,0


Define function to find the longest streak

In [44]:
def max_streak(s):
    """Return the length of the longest run of consecutive 1s in a 0/1 Series."""
    running_total = s.cumsum()
    # zero out the running total wherever the value is 0, then take row-to-row changes
    step = s.mul(running_total).diff()
    # a negative step marks the end of a run; carry that offset forward
    offset = step.where(step < 0).ffill()
    # running total plus the (negative) offset = position inside the current run
    run_position = offset.add(running_total, fill_value=0)
    return run_position.max()

# see bottom of nb as to how I worked out streaks

Apply the above function to find the longest on-time streak per airline per origin airport

In [45]:
(flights
    .assign(ON_TIME=flights['ARR_DELAY'].lt(15).astype(int))
    .sort_values(['MONTH', 'DAY', 'SCHED_DEP']) 
    .groupby(['AIRLINE', 'ORG_AIR'])
    ['ON_TIME'] 
    .agg(['mean', 'size', max_streak])
    .round(2)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,size,max_streak
AIRLINE,ORG_AIR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AA,ATL,0.82,233,15
AA,DEN,0.74,219,17
AA,DFW,0.78,4006,64
AA,IAH,0.80,196,24
AA,LAS,0.79,374,29
...,...,...,...,...
WN,LAS,0.77,2031,39
WN,LAX,0.70,1135,23
WN,MSP,0.84,237,32
WN,PHX,0.77,1724,33


# Examples

### Total number of flights between cities (an origin/destination pair is counted regardless of direction)

Using NumPy to sort each origin/destination pair within its row; `.apply` with `axis=1` would also work but takes longer.

In [34]:
# sorting each pair of origin and destination: np.sort works along the last
# axis, i.e. within each row, and returns a plain NumPy array
data_sorted = np.sort(flights[['ORG_AIR', 'DEST_AIR']])
data_sorted[:10]  # not the last expression of the cell, so it is not displayed

# df: wrap the sorted pairs back into a DataFrame with direction-neutral names
flights_sort2 = pd.DataFrame(data_sorted, columns=['AIR1', 'AIR2'])
flights_sort2

Unnamed: 0,AIR1,AIR2
0,LAX,SLC
1,DEN,IAD
2,DFW,VPS
3,DCA,DFW
4,LAX,MCI
...,...,...
58487,DFW,SFO
58488,LAS,SFO
58489,SBA,SFO
58490,ATL,MSP


In [40]:
# see number of flights between Atlanta and Houston (both directions combined)
(
    flights_sort2
    .groupby(['AIR1', 'AIR2'])
    .size()
    .loc[('ATL', 'IAH')] # pairs are stored in sorted order, so ('IAH', 'ATL') is not in the index and raises an error
)

269

In [None]:
# time the two approaches

# faster, np.sort
%%timeit
data_sorted = np.sort(flights[['ORG_AIR', 'DEST_AIR']])
flights_sort2 = pd.DataFrame(data_sorted,
    columns=['AIR1', 'AIR2'])



In [None]:
%%timeit
# slower (.apply, axis=1)
flights_sort = (flights   # doctest: +SKIP
    [['ORG_AIR', 'DEST_AIR']] 
   .apply(lambda ser:
         ser.sort_values().reset_index(drop=True),
         axis='columns')
)

# Streaks

Finding streak for following series: [0, 1, 1, 0, 1, 1, 1, 0]

In [46]:
# define series of 0's and 1's
s = pd.Series([0, 1, 1, 0, 1, 1, 1, 0])
s

0    0
1    1
2    1
3    0
4    1
5    1
6    1
7    0
dtype: int64

In [47]:
# cumulate the series
s1 = s.cumsum()
s1

0    0
1    1
2    2
3    2
4    3
5    4
6    5
7    5
dtype: int64

In [48]:
# multiply the original series by its running total: positions holding a 0
# become 0, positions holding a 1 keep the running count of ones
s.mul(s1)

0    0
1    1
2    2
3    0
4    3
5    4
6    5
7    0
dtype: int64

In [49]:
# row-to-row difference: a negative value marks the row where a streak ends,
# and its size is the running total reached just before that row
s.mul(s1).diff()

0    NaN
1    1.0
2    1.0
3   -2.0
4    3.0
5    1.0
6    1.0
7   -5.0
dtype: float64

In [50]:
(s
    .mul(s.cumsum()) # s1
    .diff()
    .where(lambda x: x < 0) # if not less than zero ie streak ends then nan
)

0    NaN
1    NaN
2    NaN
3   -2.0
4    NaN
5    NaN
6    NaN
7   -5.0
dtype: float64

In [51]:
(s
    .mul(s.cumsum())
    .diff()
    .where(lambda x: x < 0)
    .ffill() # fill downwards
)

0    NaN
1    NaN
2    NaN
3   -2.0
4   -2.0
5   -2.0
6   -2.0
7   -5.0
dtype: float64

In [52]:
(s
    .mul(s.cumsum())
    .diff()
    .where(lambda x: x < 0)
    .ffill()
    .add(s.cumsum(), fill_value=0) # s1 # fill nan's with zero
)

0    0.0
1    1.0
2    2.0
3    0.0
4    1.0
5    2.0
6    3.0
7    0.0
dtype: float64