In [1]:
import pandas
import matplotlib.pyplot as pyplot
import numpy
import arrow
import datetime

from sentenai import *

%matplotlib inline

In [2]:
_API_KEY = 'YOUR_API_KEY_HERE'

In [3]:
sentenai = Sentenai(auth_key=_API_KEY)

# Generate Data

In [4]:
letters = "ABCABCAABB"
numbers = [1, 1, 1, 2, 2, 2, 1, 1, 1, 2]
timestamps = pandas.date_range(
    start=arrow.get('2017-09-20 09:00:00').datetime, 
    periods=len(numbers),
    freq='15min'
)

In [212]:
stream_abc = stream('abc')

In [213]:
eids = []
for ts, letter, number in zip(timestamps, letters, numbers):
    evt = {'letter': letter, 'number': number}
    eid = sentenai.put(stream_abc, evt, timestamp=ts)
    eids.append(eid)

In [162]:
# Add another batch of events if we want
eids = []
for ts, letter, number in zip(timestamps, letters, numbers):
    evt = {'letter': letter, 'number': number}
    eid = sentenai.put(stream_abc, evt, timestamp=ts + datetime.timedelta(minutes=120+15))
    eids.append(eid)

In [211]:
# Delete a stream if we want to start over.
sentenai.destroy(stream_abc)

In [214]:
# Query the whole stream
result = sentenai.query(select(), returning={stream_abc: True})
df = result.dataframe(stream_abc)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,letter,number
.ts,.stream,.span,.delta,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-09-20 09:00:00+00:00,abc,0,00:00:00,A,1
2017-09-20 09:15:00+00:00,abc,0,00:15:00,B,1
2017-09-20 09:30:00+00:00,abc,0,00:30:00,C,1
2017-09-20 09:45:00+00:00,abc,0,00:45:00,A,2
2017-09-20 10:00:00+00:00,abc,0,01:00:00,B,2
2017-09-20 10:15:00+00:00,abc,0,01:15:00,C,2
2017-09-20 10:30:00+00:00,abc,0,01:30:00,A,1
2017-09-20 10:45:00+00:00,abc,0,01:45:00,A,1
2017-09-20 11:00:00+00:00,abc,0,02:00:00,B,1
2017-09-20 11:15:00+00:00,abc,0,02:15:00,B,2


# Spans

In [177]:
# Get all letter A
result = sentenai.query(
    select().span(stream_abc.letter == 'A'),
    returning={stream_abc: True}
)

# Notice how spans include all events that continuously
# satisfy the condition. There are two As in a row
# that get put in the same span.
df = result.dataframe(stream_abc)
df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,letter,number
.ts,.stream,.span,.delta,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-09-20 09:00:00+00:00,abc-1,0,00:00:00,A,1
2017-09-20 09:45:00+00:00,abc-1,1,00:00:00,A,2
2017-09-20 10:30:00+00:00,abc-1,2,00:00:00,A,1
2017-09-20 10:45:00+00:00,abc-1,2,00:15:00,A,1


In [178]:
# Get all number 1.
result = sentenai.query(
    select().span(stream_abc.number == 1),
    returning={stream_abc: True}
)

# Again notice the spans include
# all back to back instances of the number 
# 1 in the same span.
df = result.dataframe(stream_abc)
df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,letter,number
.ts,.stream,.span,.delta,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-09-20 09:00:00+00:00,abc-1,0,00:00:00,A,1
2017-09-20 09:15:00+00:00,abc-1,0,00:15:00,B,1
2017-09-20 09:30:00+00:00,abc-1,0,00:30:00,C,1
2017-09-20 10:30:00+00:00,abc-1,1,00:00:00,A,1
2017-09-20 10:45:00+00:00,abc-1,1,00:15:00,A,1
2017-09-20 11:00:00+00:00,abc-1,1,00:30:00,B,1


In [241]:
# Get all number 1. between 9 and 10am
result = sentenai.query(
    select(start=arrow.get('2017-09-20 09:00:00').datetime,
           end=arrow.get('2017-09-20 10:00:00').datetime
    ).span(stream_abc.number == 1),
    returning={stream_abc: True}
)

# Again notice the spans include
# all back to back instances of the number 
# 1 in the same span.
df = result.dataframe(stream_abc)
df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,letter,number
.ts,.stream,.span,.delta,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-09-20 09:00:00+00:00,abc,0,00:00:00,A,1
2017-09-20 09:15:00+00:00,abc,0,00:15:00,B,1
2017-09-20 09:30:00+00:00,abc,0,00:30:00,C,1


In [179]:
# Get all number 1.
result = sentenai.query(
    select().span(stream_abc.number == 1, min=delta(minutes=46)),
    returning={stream_abc: True}
)

# The condition now is that the number variables
# needs to be 1 for a minimum of 46 minutes to be a span.
# It is only 1 for 45 minutes, so no results are returned.
df = result.dataframe(stream_abc)
df.head(10)

In [187]:
# Get all number 2.
stream_abc = stream('abc-1')
result = sentenai.query(
    select().span(stream_abc.number == 2, max=delta(minutes=20)),
    returning={stream_abc: True}
)

# Now we are only returning spans where the stream
# number variable is 2 for a maximum 20 minutes.
# The last event in the stream has no duration because
# the stream ends, so it is not included in a span.
df = result.dataframe(stream_abc)
df.head(10)

In [188]:
# Lets find all instances of the letter A 
# followed by the letter C

result = sentenai.query(
    select(
    ).span(stream_abc.letter == 'A'
    ).then(stream_abc.letter == 'C'),
    returning={stream_abc: True}
)

# Notice how this returns nothing. That is
# because by default `then` statements set
# the `within` duration to 0, meaning that
# subsequent events must occur immediately
# after the events from the first span.
df = result.dataframe(stream_abc)
df.head(10)

In [56]:
# Lets find all instances of the letter A 
# followed by the letter C
result = sentenai.query(
    select(
    ).span(stream_abc.letter == 'A'
    ).then(stream_abc.letter == 'C', within=delta(days=1)),
    returning={stream_abc: True}
)

# In this case, the letter C occurs multiple times
# after the letter A within at least a day.
# When this occurs all events between the A and C
# are captured as part of the span.
df = result.dataframe(stream_abc)
df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,letter,number
.ts,.stream,.span,.delta,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-09-20 09:00:00+00:00,abc-1,0,00:00:00,A,1
2017-09-20 09:15:00+00:00,abc-1,0,00:15:00,B,1
2017-09-20 09:30:00+00:00,abc-1,0,00:30:00,C,1
2017-09-20 09:45:00+00:00,abc-1,1,00:00:00,A,2
2017-09-20 10:00:00+00:00,abc-1,1,00:15:00,B,2
2017-09-20 10:15:00+00:00,abc-1,1,00:30:00,C,2


In [263]:
# Lets find all instances of the letter A 
# followed by the letter C
result = sentenai.query(
    select(
    ).span(stream_abc.letter == 'A'
    ).then(stream_abc.letter == 'C', within=delta(days=1), after=delta(minutes=60)),
    returning={stream_abc: True}
)

# This query returns the same set of events, but
# note that everything is considered part of the same
# span. Because we require that the letter 'C' that closes
# a span come within 1 day of the 'A' but after 60 minutes
# have passed, the first occurance of 'C' at 9:30am does
# not trigger the end of the span.
df = result.dataframe(stream_abc)
df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,letter,number
.ts,.stream,.span,.delta,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-09-20 09:00:00+00:00,abc,0,00:00:00,A,1
2017-09-20 09:15:00+00:00,abc,0,00:15:00,B,1
2017-09-20 09:30:00+00:00,abc,0,00:30:00,C,1
2017-09-20 09:45:00+00:00,abc,0,00:45:00,A,2
2017-09-20 10:00:00+00:00,abc,0,01:00:00,B,2
2017-09-20 10:15:00+00:00,abc,0,01:15:00,C,2


In [189]:
# Lets find all instances of the letter A 
# followed by the letter C
result = sentenai.query(
    select(
    ).span(stream_abc.letter == 'A', min=delta(minutes=20)
    ).then(stream_abc.letter == 'C', within=delta(days=1), after=delta(minutes=60)),
    returning={stream_abc: True}
)

# Now we get no results, because we've added a condition
# that the first 'A' in a span must last for at least
# 20 minutes. With feed forward, the stream state is 'A'
# for only 15 minutes, thus no events are returned.
df = result.dataframe(stream_abc)
df.head(10)

In [190]:
# Lets find all instances of the letter A 
# followed by the letter C
result = sentenai.query(
    select(
    ).span(stream_abc.letter == 'A' and stream_abc.number == 1
    ).then(stream_abc.letter == 'C' and stream_abc.number == 1, within=delta(days=1)),
    returning={stream_abc: True}
)

# The result of this query may be surprising
# It returns events after the 'C, 1' (letter, number) event
# That is because the span continues until the conditions
# are met to constitute the start of a new span. Contrast this to
# the query two cells above which did not have condition on the value of
# `number`. That pattern repeated at the first 'A' after 'C'
# so subsequent events are not returned.
df = result.dataframe(stream_abc)
df.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,letter,number
.ts,.stream,.span,.delta,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-09-20 09:00:00+00:00,abc-1,0,00:00:00,A,1
2017-09-20 09:15:00+00:00,abc-1,0,00:15:00,B,1
2017-09-20 09:30:00+00:00,abc-1,0,00:30:00,C,1
2017-09-20 09:45:00+00:00,abc-1,0,00:45:00,A,2
2017-09-20 10:00:00+00:00,abc-1,0,01:00:00,B,2
2017-09-20 10:15:00+00:00,abc-1,0,01:15:00,C,2
2017-09-20 10:30:00+00:00,abc-1,0,01:30:00,A,1
2017-09-20 10:45:00+00:00,abc-1,0,01:45:00,A,1
2017-09-20 11:00:00+00:00,abc-1,0,02:00:00,B,1


In [195]:
# Lets find all instances of the letter A 
# followed by the letter C
result = sentenai.query(
    select(
    ).span(stream_abc.letter == 'A'
    ).then(stream_abc.letter == 'B'
    ).then(stream_abc.letter == 'C'),
    returning={stream_abc: True}
)

# We can chain multiple then statements
# together to find more complicated patterns
df = result.dataframe(stream_abc)
df.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,letter,number
.ts,.stream,.span,.delta,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-09-20 09:00:00+00:00,abc-1,0,00:00:00,A,1
2017-09-20 09:15:00+00:00,abc-1,0,00:15:00,B,1
2017-09-20 09:30:00+00:00,abc-1,0,00:30:00,C,1
2017-09-20 09:45:00+00:00,abc-1,1,00:00:00,A,2
2017-09-20 10:00:00+00:00,abc-1,1,00:15:00,B,2
2017-09-20 10:15:00+00:00,abc-1,1,00:30:00,C,2


# Switches

In [204]:
# What if we didn't want to chain all of those
# 'then' statements together? Lets use a switch statement.
# Lets find all instances of the letter A 
# followed by the letter C

a_to_b = event(V.letter == 'A') >> event(V.letter == 'B')
b_to_c = event(V.letter == 'B') >> event(V.letter == 'C')

result = sentenai.query(
    select(
    ).span(stream_abc(a_to_b)
    ).then(stream_abc(b_to_c), within=delta(days=2)),
    returning={stream_abc: True}
)

# Switches define spans by identifying
# transitions between states. Here the span begins
# when the stream transitions from 'A' to 'B' and
# ends when it transitions from 'B' to 'C'. These are
# not inclusive to the 'A' and 'C' events are not
# captured, leaving only the 'B' in the middle.
df = result.dataframe(stream_abc)
df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,letter,number
.ts,.stream,.span,.delta,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-09-20 09:15:00+00:00,abc-1,0,0 days,B,1
2017-09-20 10:00:00+00:00,abc-1,1,0 days,B,2


In [209]:
# The same query can be written slightly more
# susinctly by chaining switches together.

a_to_b_to_c = (
    event(V.letter == 'A') >> 
    event(V.letter == 'B') >> 
    event(V.letter == 'C')
)

result = sentenai.query(
    select(
    ).span(stream_abc(a_to_b_to_c)),
    returning={stream_abc: True}
)

# Again, the span begins after the first
# transition and ends before the last thus
# the A and C events are not included in the span.
df = result.dataframe(stream_abc)
df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,letter,number
.ts,.stream,.span,.delta,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-09-20 09:15:00+00:00,abc-1,0,0 days,B,1
2017-09-20 10:00:00+00:00,abc-1,1,0 days,B,2


In [206]:
# We can find find the entire 'ABC' chain
# with the following.
a_to_b = event(V.letter == 'C' and V.number == 1) >> event(V.letter == 'A' and V.number == 2)
b_to_c = event(V.letter == 'C' and V.number == 2) >> event(V.letter == 'A' and V.number == 1)

result = sentenai.query(
    select(
    ).span(stream_abc(a_to_b)
    ).then(stream_abc(b_to_c), within=delta(days=2)),
    returning={stream_abc: True}
)

# This query uses switch statements to define the
# beginning and end of spans. The span here begins
# when (C, 1) transitions to (A, 2) and ends after
# (C, 2) transitions to (A, 1). All events between
# are captured.
df = result.dataframe(stream_abc)
df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,letter,number
.ts,.stream,.span,.delta,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-09-20 09:45:00+00:00,abc-1,0,00:00:00,A,2
2017-09-20 10:00:00+00:00,abc-1,0,00:15:00,B,2
2017-09-20 10:15:00+00:00,abc-1,0,00:30:00,C,2


# Joining streams.

In [215]:
# Lets make a new stream
stream_xyz = stream('xyz')

In [216]:
# Add some events that occur slightly offset from
# the 'abc' stream.
letters = "XXYYZZXYZX"
numbers = [1, 1, 1, 1, 2, 2, 2, 2, 1]

offset = datetime.timedelta(minutes=5)
timestamps = pandas.date_range(
    start=arrow.get('2017-09-20 09:00:00').datetime + offset, 
    periods=len(numbers),
    freq='15min'
)

In [217]:
eids = []
for ts, letter, number in zip(timestamps, letters, numbers):
    evt = {'letter': letter, 'number': number}
    eid = sentenai.put(stream_xyz, evt, timestamp=ts)
    eids.append(eid)

In [None]:
# Delete the entire stream
# sentenai.destroy(stream_xyz)

In [218]:
# Query the whole stream
result = sentenai.query(select(), returning={stream_xyz: True})
df = result.dataframe(stream_xyz)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,letter,number
.ts,.stream,.span,.delta,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-09-20 09:05:00+00:00,xyz,0,00:00:00,X,1
2017-09-20 09:20:00+00:00,xyz,0,00:15:00,X,1
2017-09-20 09:35:00+00:00,xyz,0,00:30:00,Y,1
2017-09-20 09:50:00+00:00,xyz,0,00:45:00,Y,1
2017-09-20 10:05:00+00:00,xyz,0,01:00:00,Z,2
2017-09-20 10:20:00+00:00,xyz,0,01:15:00,Z,2
2017-09-20 10:35:00+00:00,xyz,0,01:30:00,X,2
2017-09-20 10:50:00+00:00,xyz,0,01:45:00,Y,2
2017-09-20 11:05:00+00:00,xyz,0,02:00:00,Z,1


In [231]:
# Now lets look at both streams
result = sentenai.query(
    select(),
    returning={stream_abc: True, stream_xyz: True}
)
# Sentenai returns a dictionary of dataframes.
# Keys are the names of the stream. Lets concat them
# and compare. Note the index has the stream name.
# Note i've sorted events by their absolute time.
df = pandas.concat(result.dataframe().values()).sort_index(level=0)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,letter,number
.ts,.stream,.span,.delta,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-09-20 09:00:00+00:00,abc,0,00:00:00,A,1
2017-09-20 09:05:00+00:00,xyz,0,00:00:00,X,1
2017-09-20 09:15:00+00:00,abc,0,00:15:00,B,1
2017-09-20 09:20:00+00:00,xyz,0,00:15:00,X,1
2017-09-20 09:30:00+00:00,abc,0,00:30:00,C,1
2017-09-20 09:35:00+00:00,xyz,0,00:30:00,Y,1
2017-09-20 09:45:00+00:00,abc,0,00:45:00,A,2
2017-09-20 09:50:00+00:00,xyz,0,00:45:00,Y,1
2017-09-20 10:00:00+00:00,abc,0,01:00:00,B,2
2017-09-20 10:05:00+00:00,xyz,0,01:00:00,Z,2


In [248]:
# Find events in 'abc' where 'xyz' is the letter 'X'
result = sentenai.query(
    select(
    ).span(stream_xyz.letter == 'X'),
    returning={stream_abc: True, stream_xyz: True}
)
# This query returns all events in stream 'xyz' where letter is 'X'.
# During the first span, 'abc' records a 'B' event which is 
# returned here. During the second span, 'xyz' is in state 'X' for
# 15 minutes during which an 'A' event is recorded. However, this is
# not returned in the query because the stream stays in that state
# after 'xyz' switches. It is not fully encompased within the span.
df = pandas.concat(result.dataframe().values()).sort_index(level=0)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,letter,number
.ts,.stream,.span,.delta,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-09-20 09:05:00+00:00,xyz,0,00:00:00,X,1
2017-09-20 09:15:00+00:00,abc,0,00:00:00,B,1
2017-09-20 09:20:00+00:00,xyz,0,00:15:00,X,1
2017-09-20 10:35:00+00:00,xyz,1,00:00:00,X,2


In [253]:
# Conditions across both streams
result = sentenai.query(
    select(
    ).span(stream_abc.letter == 'A' and stream_xyz.number == 1),
    returning={stream_abc: True, stream_xyz: True}
)
# Here we are looking for spans in time where 'abc' has an 'A'
# event AND 'xyz' has a 1 event. Note that this doesn't mean
# that the 'A' and 1 need to occur at the same time.
df = pandas.concat(result.dataframe().values()).sort_index(level=0)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,letter,number
.ts,.stream,.span,.delta,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-09-20 09:05:00+00:00,xyz,0,00:00:00,X,1
2017-09-20 09:15:00+00:00,abc,0,00:00:00,B,1
2017-09-20 09:20:00+00:00,xyz,0,00:15:00,X,1
2017-09-20 09:30:00+00:00,abc,0,00:15:00,C,1
2017-09-20 09:35:00+00:00,xyz,0,00:30:00,Y,1
2017-09-20 09:45:00+00:00,abc,0,00:30:00,A,2
2017-09-20 09:50:00+00:00,xyz,0,00:45:00,Y,1
2017-09-20 11:05:00+00:00,xyz,1,00:00:00,Z,1


In [260]:
# Conditions across both streams
a_to_b = event(V.letter == 'A') >> event(V.letter == 'B')
c_to_a = event(V.letter == 'C') >> event(V.letter == 'A')
result = sentenai.query(
    select(
    ).span(stream_abc(one_to_two)
    ).then(stream_abc(two_to_one), within=delta(days=1)),
    returning={stream_abc: True, stream_xyz: True}
)
# These results illustrate how the switch statement defines
# the start and end points of a span. One (X, 1) event falls
# within those bounds from the 'xyz' stream.
df = pandas.concat(result.dataframe().values()).sort_index(level=0)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,letter,number
.ts,.stream,.span,.delta,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-09-20 09:15:00+00:00,abc,0,00:00:00,B,1
2017-09-20 09:20:00+00:00,xyz,0,00:00:00,X,1
2017-09-20 09:30:00+00:00,abc,0,00:15:00,C,1
