In [1]:
import numpy as np

# Motivations
In addition to the similarity measures we have been exploring (co-occurrences and likelihood ratios), it may be worth looking at longer-term dependencies in conversation traces. These longer-term dependencies can also be described by their frequency and compared, but determining in a robust manner whether a longer sequence is valid is challenging.

We first consider simple logical statements that pattern-match longer sequences, and eventually look at logics such as LTL (linear temporal logic) and its variants in order to specify these longer-sequence concepts, which we call "sub-conversations."

In [78]:
# Toy conversation trace: each character stands for one event in the trace.
sequence = "abbabbabbabbaccbabacbabcbcbcbabacbcbcbcbababcdcbcbabaacbcbcbacbacbabcabcbbcbcababcabcabcbacbabcababcbacbcbacbabcbcbacbbcccabcabcbacbcbabcbcabcbcabccacdbabbcabcbacba"
# Split the string into a list of single-character events.
a = list(sequence)
len(a)

164

In [79]:
# For this 164-element sequence, find the longer pattern "abba" the same way we would find a 4-gram.
def match_pattern(trace, pat):
    
    assert type(pat) == list
    assert type(trace) == list
    
    loc = {}
    freq = {}
    
    for item in pat:
        for i in range(len(trace)-len(item)):
            if "".join(trace[i:i+len(item)]) == item:
                try:
                    loc["{0}".format(item)][i] = 1
                    freq["{0}".format(item)] += 1
                except:
                    loc["{0}".format(item)] = np.zeros((len(trace),))
                    freq["{0}".format(item)] = 1
    
    return loc, freq

In [80]:
# Search the trace for the patterns "abba" and "ac";
# l1 receives the per-pattern location arrays and f1 the per-pattern counts.
l1, f1 = match_pattern(a, ["abba","ac"])

In [81]:
# Display the location arrays: one float array of length len(a) per matched pattern.
l1

{'abba': array([0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'ac': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 

In [82]:
# Display the number of matches counted for each pattern.
f1

{'abba': 4, 'ac': 13}

In [95]:
# Find all instances of "x until y": pair each occurrence of x with the next y that follows it.
def until(trace, x, y):
    """Pair every occurrence of ``x`` with the next ``y`` that follows it.

    Parameters
    ----------
    trace : list
        The sequence to scan.
    x : str
        Symbol that opens an interval.
    y : str
        Symbol that closes an interval.

    Returns
    -------
    list of (int, int)
        One ``(start, end)`` tuple per occurrence of ``x`` that has a later
        ``y``: ``start`` is the index of that ``x`` and ``end`` the index of
        the first ``y`` strictly after it. Tuples are ordered by ``start``.
        Several ``x`` occurrences before the same ``y`` share that ``end``.
        An ``x`` with no later ``y`` yields nothing.

    Notes
    -----
    The elements between ``start`` and ``end`` are not inspected, so this
    does not check that ``x`` holds continuously up to ``y``.
    """
    assert isinstance(trace, list)
    assert isinstance(x, str)
    assert isinstance(y, str)

    sol = []
    pending = []  # indices of x still waiting for a later y

    # Single pass: remember open x positions and close them all at the next y.
    for idx, symbol in enumerate(trace):
        # Close pending starts before registering a new one, so that the
        # closing y is strictly after its x (this matters when x == y).
        if symbol == y:
            sol.extend((start, idx) for start in pending)
            pending.clear()
        if symbol == x:
            pending.append(idx)

    return sol
            
                


In [96]:
# List the (start, end) index pairs in which an "a" is followed later by a "d".
until(a, "a","d")

[(0, 45),
 (3, 45),
 (6, 45),
 (9, 45),
 (12, 45),
 (16, 45),
 (18, 45),
 (21, 45),
 (29, 45),
 (31, 45),
 (40, 45),
 (42, 45),
 (50, 150),
 (52, 150),
 (53, 150),
 (60, 150),
 (63, 150),
 (66, 150),
 (69, 150),
 (77, 150),
 (79, 150),
 (82, 150),
 (85, 150),
 (89, 150),
 (92, 150),
 (95, 150),
 (97, 150),
 (101, 150),
 (106, 150),
 (109, 150),
 (115, 150),
 (122, 150),
 (125, 150),
 (129, 150),
 (134, 150),
 (139, 150),
 (144, 150),
 (148, 150)]