In [2]:
import numpy as np
import pandas as pd

# Small demo frame: each integer in `values` plus two boolean flags derived
# from it, with rows labelled 'a' through 'g'.
values = np.array([1, 3, 2, 4, 1, 6, 4])
example_df = pd.DataFrame(
    data={
        'value': values,
        'even': (values % 2) == 0,
        'above_three': values > 3,
    },
    index=list('abcdefg'),
)

example_df

Unnamed: 0,above_three,even,value
a,False,False,1
b,False,False,3
c,False,True,2
d,True,True,4
e,False,False,1
f,True,True,6
g,True,True,4


In [6]:
# Standardize each group

def standardize(xs):
    """Return ``xs`` shifted by its mean and scaled by its standard deviation.

    Both statistics are taken from ``xs`` itself via ``xs.mean()`` and
    ``xs.std()``, so the result follows whatever conventions those methods
    use for the object passed in.
    """
    centered = xs - xs.mean()
    spread = xs.std()
    return centered / spread


# Split the rows by the 'even' flag, then list the (group key, sub-series)
# pairs of the 'value' column so each group's members can be inspected.
grouped_data = example_df.groupby('even')
list(grouped_data['value'])

[(False, a    1
  b    3
  e    1
  Name: value, dtype: int64), (True, c    2
  d    4
  f    6
  g    4
  Name: value, dtype: int64)]

In [9]:
# Applying to the whole grouped frame runs standardize on every column it is
# given, boolean flags included -- hence the NaNs and odd values that show up
# outside the 'value' column.
grouped_data.apply(standardize)

Unnamed: 0,above_three,even,value
a,,,-0.57735
b,,,1.154701
c,-1.5,,-1.224745
d,0.5,,0.0
e,,,-0.57735
f,0.5,,1.224745
g,0.5,,0.0


In [11]:
# Restrict the operation to the 'value' column. transform() is the groupby
# method for functions that return one result per input row: the output keeps
# example_df's original index (a..g) on every pandas version, whereas apply()
# prepends the group key as an extra index level on pandas >= 2.0.
grouped_data['value'].transform(standardize)

a   -0.577350
b    1.154701
c   -1.224745
d    0.000000
e   -0.577350
f    1.224745
g    0.000000
Name: value, dtype: float64

In [12]:
# Find second largest value in each group

def second_largest(xs):
    """Return the second-largest value in ``xs``.

    ``xs`` is sorted in descending order and the element at position 1 is
    returned, so duplicates count separately (the second-largest of
    ``[4, 6, 6]`` is ``6``).

    Raises:
        IndexError: if ``xs`` has fewer than two elements.
    """
    # Series.sort() has been removed from pandas; sort_values() is its
    # replacement and returns a new sorted Series, leaving ``xs`` untouched.
    sorted_xs = xs.sort_values(ascending=False)
    return sorted_xs.iloc[1]


# Regroup by the 'even' flag; .groups maps each group key to the index
# labels of the rows that fell into it.
grouped_data = example_df.groupby('even')
grouped_data.groups

{False: ['a', 'b', 'e'], True: ['c', 'd', 'f', 'g']}

In [13]:
# second_largest returns a single value per group, so the result has one
# entry per 'even' key rather than one per original row.
grouped_data['value'].apply(second_largest)



even
False    1
True     4
Name: value, dtype: int64

In [9]:
# Toy ridership log used below: readings alternate between units R051 and
# R079, two hours apart, and ENTRIESn / EXITSn hold cumulative counts.
ridership_df = pd.DataFrame({
    'UNIT': ['R051', 'R079', 'R051', 'R079', 'R051', 'R079', 'R051', 'R079', 'R051'],
    'TIMEn': ['00:00:00', '02:00:00', '04:00:00', '06:00:00', '08:00:00', '10:00:00', '12:00:00', '14:00:00',
              '16:00:00'],
    'ENTRIESn': [3144312, 8936644, 3144335, 8936658, 3144353, 8936687, 3144424, 8936819, 3144594],
    'EXITSn': [1088151, 13755385, 1088159, 13755393, 1088177, 13755598, 1088231, 13756191, 1088275]
})

ridership_df

Unnamed: 0,ENTRIESn,EXITSn,TIMEn,UNIT
0,3144312,1088151,00:00:00,R051
1,8936644,13755385,02:00:00,R079
2,3144335,1088159,04:00:00,R051
3,8936658,13755393,06:00:00,R079
4,3144353,1088177,08:00:00,R051
5,8936687,13755598,10:00:00,R079
6,3144424,1088231,12:00:00,R051
7,8936819,13756191,14:00:00,R079
8,3144594,1088275,16:00:00,R051


In [20]:
def get_hourly_entries_and_exits(entries_and_exits):
    '''
    Convert cumulative entry/exit counts into per-reading increments.

    Takes a DataFrame with cumulative 'ENTRIESn' and 'EXITSn' columns and a
    'UNIT' column identifying the station. Returns a DataFrame with just the
    'ENTRIESn' and 'EXITSn' columns, where each row holds the difference
    between that reading and the previous reading of the same unit. The
    result keeps the input's index and row order; the first reading of each
    unit has no predecessor and is NaN.
    '''
    # groupby().diff() is the built-in form of `x - x.shift()` per group.
    # Unlike groupby().apply(), it never adds the group key to the result
    # index, so the output lines up row-for-row with the input on every
    # pandas version.
    return entries_and_exits.groupby('UNIT')[['ENTRIESn', 'EXITSn']].diff()

# Rows 0 and 1 are the first readings for their units, so they have no
# previous value to subtract and come back as NaN.
get_hourly_entries_and_exits(ridership_df)

Unnamed: 0,ENTRIESn,EXITSn
0,,
1,,
2,23.0,8.0
3,14.0,8.0
4,18.0,18.0
5,29.0,205.0
6,71.0,54.0
7,132.0,593.0
8,170.0,44.0
