In [1]:
import numpy as np
import pandas as pd

Edit-distance operations are defined as 
* (a,a) denotes a match of symbols at the given position
* (a,-) denotes deletion of symbol 'a' at some position
* (-,b) denotes insertion of symbol 'b' at some position
* (a,b) denotes replacement of 'a' with 'b' at some position, and a != b
We assign a separate cost for each of these operations according to our needs

In [2]:
#for simplicity and completeness sake, we define a simple levenshtein distance function first

def basic_distance (trace1, trace2):

    M = len(trace1)
    N = len(trace2)
    edit_table = np.zeros((M,N)) #establish table

    #fill table
    for i in range(M):
        for j in range(N):

            if i == 0:
                edit_table[i][j] = j

            elif j ==0:
                edit_table[i][j] = i

            elif trace1[i-1] == trace2[j-1]:
                edit_table[i][j] = edit_table[i-1][j-1]

            else:
                edit_table[i][j] = 1 + min(edit_table[i-1][j], edit_table[i][j-1], edit_table[i-1][j-1]) #scoring done here

    return edit_table[i][j]

In [3]:
#sample test
dat1 = pd.read_csv('./graham.norton.s22.e08_data.csv')
dat2 = pd.read_csv('./graham.norton.s22.e12_data.csv')
dat3 = pd.read_csv('./blackpink_data.csv')
test1 = list(dat1.L)
test2 = list(dat2.L)
test3 = list(dat3.L)

In [4]:
basic_distance(test1,test2)

665.0

In [5]:
a = ["a","a","a","a","a"]
c = ["a","a","a","a","a"]
b = ["b","b","b","b","b"]
d = ["a","a","a","a","a","a","a","a"]
basic_distance(a,d)

3.0

In [6]:
a = ["a","a","a","a","b","a","c","c"]
b = ["b","b","b","b","a","b","c","c","c","c"]
basic_distance(a,b)

6.0

In [7]:
basic_distance(test1, test3)

984.0

In [8]:
#baseline
import nltk
nltk.edit_distance(test1,test2)

665

In [9]:
t1 = ["a","b"]
t2 = ["a","b","c","d"]
nltk.edit_distance(t1,t2)

2

# Now we have to alter the function to accomodate for different scoring metrics according to the paper
* substitution of uncorrelated activities should be discouraged
* substitution of contrasting activities should be penalized
* insertion of activities out of context should be discouraged
* substitution of correlated activities should be encouraged in proportion to the degree of similarity

In [10]:
#STEP 1: Define the symbols in the list of traces
def define_symbols (traces):
    assert type(traces) == list
    symbols = []
    for item in traces:
        symbols.append(set(item))
    x = symbols[0]
    for i in range(len(symbols)):
        x = x.union(symbols[i])
    
    return list(x)

In [11]:
# A != B
a = ["a","b","c","d","e"]
b = ["a","g","q","q","e","f"]
c = [a, b]
print(define_symbols(c))

# A == B
a = ["a", "b", "c", "d"]
b = ["a", "a", "c", "d", "b", "b"]
c = [a,b]
print(define_symbols(c))

# on the dataset
c = [test1,test2]
print(define_symbols(c))

['f', 'a', 'b', 'q', 'e', 'g', 'd', 'c']
['a', 'b', 'd', 'c']
['give.statement', 'respond.deny', 'use.social.convention', 'recall', 'open.question', 'deflection', 'relax.atmosphere', 'closed.question', 'misc', 'respond.agree', 'give.opinion']


In [12]:
#STEP 2: Define the set of all 3-grams in the logs and their frequencies
def three_grams (traces):
    assert type(traces) == list
    g3 = []
    g3_freq = {}
    for trace in traces:
        for i in range(len(trace)-2):
            g3.append(", ".join(list(trace[i:i+3])))
            try:
                g3_freq[", ".join(list(trace[i:i+3]))] += 1
            except:
                g3_freq[", ".join(list(trace[i:i+3]))] = 1
    return list(set(g3)), g3_freq

In [13]:
#A = abcdefghijk, B = kjihgfedcba
a = ["a","b","c","d","e","f","g","h","i","j","k"]
b = ["k","j","i","h","g","f","e","d","c","b","a"]
c = [a,b]
print(three_grams(c))

#A = abcdefghijk, B = abcdefghijk
a = ["a","b","c","d","e","f","g","h","i","j","k"]
b = ["a","b","c","d","e","f","g","h","i","j","k"]
c = [a,b]
print(three_grams(c))

(['c, b, a', 'j, i, h', 'i, j, k', 'b, c, d', 'g, f, e', 'e, f, g', 'a, b, c', 'c, d, e', 'k, j, i', 'h, i, j', 'g, h, i', 'i, h, g', 'd, e, f', 'h, g, f', 'f, e, d', 'e, d, c', 'd, c, b', 'f, g, h'], {'a, b, c': 1, 'b, c, d': 1, 'c, d, e': 1, 'd, e, f': 1, 'e, f, g': 1, 'f, g, h': 1, 'g, h, i': 1, 'h, i, j': 1, 'i, j, k': 1, 'k, j, i': 1, 'j, i, h': 1, 'i, h, g': 1, 'h, g, f': 1, 'g, f, e': 1, 'f, e, d': 1, 'e, d, c': 1, 'd, c, b': 1, 'c, b, a': 1})
(['i, j, k', 'b, c, d', 'e, f, g', 'a, b, c', 'c, d, e', 'h, i, j', 'g, h, i', 'd, e, f', 'f, g, h'], {'a, b, c': 2, 'b, c, d': 2, 'c, d, e': 2, 'd, e, f': 2, 'e, f, g': 2, 'f, g, h': 2, 'g, h, i': 2, 'h, i, j': 2, 'i, j, k': 2})


In [14]:
#STEP 3: Define the context for symbol a
def define_context(grams):
    
    assert type(grams) == list
    
    context = {}
    for gram in grams:
        x,a,y = gram.split(", ")
        try:
            context[a].append("{0}, {1}".format(x,y))
        except:
            context[a] = []
            context[a].append("{0}, {1}".format(x,y))
            
    #clear dups
    for k in list(context.keys()):
        context[k] = list(set(context[k]))
    
    return context

In [15]:
#test context, should have keys a and b, where each has aa and bb respectively
a = ["a","a","a","a","a","a","a"]
b = ["b","b","b","b","b"]
c = [a,b]
grams, g3_freq = three_grams(c)
print(define_context(grams))

#test context
a = ["a","a","a","a","a","a","a","c","e","f","a","a","b","c"]
b = ["b","b","b","b","b","c","c","a"]
c = [a,b]
grams, g3_freq = three_grams(c)
print(define_context(grams))

{'b': ['b, b'], 'a': ['a, a']}
{'b': ['b, c', 'b, b', 'a, c'], 'e': ['c, f'], 'c': ['a, e', 'b, c', 'c, a'], 'a': ['a, a', 'a, b', 'f, a', 'a, c'], 'f': ['e, a']}


In [16]:
#STEP 4: define pairs of context
def context_pairs (context):
    
    assert type(context) == dict
    
    context_pairs = {}
    for a in list(context.keys()):
        for b in list(context.keys()):
            if a != b:
                context_pairs["{0}, {1}".format(a, b)] = list(set(context[a]).intersection(set(context[b])))
    
    return context_pairs

In [17]:
#test context pairs, should be empty
a = ["a","a","a","a","a","a","a"]
b = ["b","b","b","b","b"]
c = [a,b]
grams, g3_freq = three_grams(c)
context = define_context(grams)
print(context_pairs(context))

#test context pairs, should be aa and bb
a = ["a","a","a","a","b","a"]
b = ["b","b","b","b","a","b"]
c = [a,b]
grams, g3_freq = three_grams(c)
context = define_context(grams)
print(context_pairs(context))

{'b, a': [], 'a, b': []}
{'b, a': ['b, b', 'a, a'], 'a, b': ['b, b', 'a, a']}


In [18]:
#STEP 5: define co-occurrence combinations
def define_cooccurrence(symbols, context_pairs, gram_freq):
    
    assert type(context_pairs) == dict
    assert type(gram_freq) == dict
    assert type(symbols) == list
    
    co_occur = {}
    for k in list(context_pairs.keys()):
        for item in context_pairs[k]:
            for a in symbols:
                for b in symbols:
                    x,y = item.split(", ")[0], item.split(", ")[1]
                    if a == b:
                        try:
                            n = gram_freq["{0}, {1}, {2}".format(x,a,y)]
                            co_occur["{0}, {1}({2}, {3})".format(x,y,a,b)] = (n*(n-1))/2
                        except:
                            co_occur["{0}, {1}({2}, {3})".format(x,y,a,b)] = 0.0
                        
                    elif a != b:
                        try:
                            n_i = gram_freq["{0}, {1}, {2}".format(x,a,y)]
                            n_j = gram_freq["{0}, {1}, {2}".format(x,b,y)]
                            co_occur["{0}, {1}({2}, {3})".format(x,y,a,b)] = n_i*n_j
                        except:
                            co_occur["{0}, {1}({2}, {3})".format(x,y,a,b)] = 0.0
    
    return co_occur
            

In [19]:
#test
a = ["a","a","a","a","b","a","c"]
b = ["b","b","b","b","a","b","c"]
c = [a,b]
grams, g3_freq = three_grams(c)
print(grams)
print(g3_freq)
context = define_context(grams)
print(context)
con_pairs = context_pairs(context)
print(con_pairs)
print(define_cooccurrence(define_symbols(c),con_pairs,g3_freq))

['b, a, c', 'b, b, b', 'a, b, a', 'b, b, a', 'a, b, c', 'a, a, a', 'b, a, b', 'a, a, b']
{'a, a, a': 2, 'a, a, b': 1, 'a, b, a': 1, 'b, a, c': 1, 'b, b, b': 2, 'b, b, a': 1, 'b, a, b': 1, 'a, b, c': 1}
{'a': ['b, c', 'a, b', 'b, b', 'a, a'], 'b': ['a, c', 'b, a', 'b, b', 'a, a']}
{'a, b': ['b, b', 'a, a'], 'b, a': ['b, b', 'a, a']}
{'b, b(c, c)': 0.0, 'b, b(c, a)': 0.0, 'b, b(c, b)': 0.0, 'b, b(a, c)': 0.0, 'b, b(a, a)': 0.0, 'b, b(a, b)': 2, 'b, b(b, c)': 0.0, 'b, b(b, a)': 2, 'b, b(b, b)': 1.0, 'a, a(c, c)': 0.0, 'a, a(c, a)': 0.0, 'a, a(c, b)': 0.0, 'a, a(a, c)': 0.0, 'a, a(a, a)': 1.0, 'a, a(a, b)': 2, 'a, a(b, c)': 0.0, 'a, a(b, a)': 2, 'a, a(b, b)': 0.0}


In [20]:
#STEP 6: Define the count of co-occurrences for symbols a,b for all contexts
def co_occur_combos(symbols, con_pairs, co_occurs):
    assert type(symbols) == list
    assert type(con_pairs) == dict
    assert type(co_occurs) == dict
    
    co_occur_combos = {}
    for a in symbols:
        for b in symbols:
            total = 0.0
            for k in list(con_pairs.keys()):
                for item in con_pairs[k]:
                    total += co_occurs["{0}({1}, {2})".format(item,a,b)]
            co_occur_combos["{0}, {1}".format(a,b)] = total
    
    return co_occur_combos


In [21]:
#test
a = ["a","a","a","a","b","a","a","b","b"]
b = ["a","a","a","a","b","a","a","b","a","a","b","a","a","b","a","a","b","a"]
c = [a,b]
grams, g3_freq = three_grams(c)
print(g3_freq)
context = define_context(grams)
print(context)
con_pairs = context_pairs(context)
print(con_pairs)
co_occurs = define_cooccurrence(define_symbols(c),con_pairs,g3_freq)
print(co_occurs)
print(co_occur_combos(define_symbols(c),con_pairs,co_occurs))

{'a, a, a': 4, 'a, a, b': 7, 'a, b, a': 6, 'b, a, a': 5, 'a, b, b': 1}
{'b': ['a, b', 'a, a'], 'a': ['b, a', 'a, b', 'a, a']}
{'b, a': ['a, b', 'a, a'], 'a, b': ['a, b', 'a, a']}
{'a, b(a, a)': 21.0, 'a, b(a, b)': 7, 'a, b(b, a)': 7, 'a, b(b, b)': 0.0, 'a, a(a, a)': 6.0, 'a, a(a, b)': 24, 'a, a(b, a)': 24, 'a, a(b, b)': 15.0}
{'a, a': 54.0, 'a, b': 62.0, 'b, a': 62.0, 'b, b': 30.0}


In [22]:
#STEP 7: Define norm on the count of co-occur combos
def define_norm (co_combos):
    assert type(co_combos) == dict
    norm = 0.0
    for k in list(co_combos.keys()):
        norm += co_combos[k]
    
    return norm

In [23]:
#test
a = ["a","a","a","a","b","a","a","b","b"]
b = ["a","a","a","a","b","a","a","b","a","a","b","a","a","b","a","a","b","a"]
c = [a,b]
grams, g3_freq = three_grams(c)
context = define_context(grams)
con_pairs = context_pairs(context)
co_occurs = define_cooccurrence(define_symbols(c),con_pairs,g3_freq)
print(define_norm(co_occur_combos(define_symbols(c),con_pairs,co_occurs)))

208.0


In [24]:
#STEP 8: Define matrix M over A x A
def define_matrix (symbols, co_combos, norm):
    assert type(symbols) == list
    assert type(co_combos) == dict
    assert type(norm) == float
    
    mat_M = {}
    for a in symbols:
        for b in symbols:
            mat_M["{0}, {1}".format(a,b)] = co_combos["{0}, {1}".format(a,b)]/norm
    
    return mat_M

In [25]:
a = ["a","a","a","a","b","a","a","b","b"]
b = ["a","a","a","a","b","a","a","b","a","a","b","a","a","b","a","a","b","a"]
c = [a,b]
grams, g3_freq = three_grams(c)
context = define_context(grams)
con_pairs = context_pairs(context)
co_occurs = define_cooccurrence(define_symbols(c),con_pairs,g3_freq)
norm = define_norm(co_occur_combos(define_symbols(c),con_pairs,co_occurs))
co_combos = co_occur_combos(define_symbols(c), con_pairs, co_occurs)
print(define_matrix(define_symbols(c), co_combos, norm))

{'a, a': 0.25961538461538464, 'a, b': 0.2980769230769231, 'b, a': 0.2980769230769231, 'b, b': 0.14423076923076922}


In [26]:
def prob_occur (symbols, mat_M):
    assert type(symbols) == list
    assert type(mat_M) == dict
    
    p = {}
    for a in symbols:
        total = 0
        for b in symbols:
            if a != b:
                total += mat_M["{0}, {1}".format(a,b)]
        total += mat_M["{0}, {1}".format(a,a)]
        p["{0}".format(a)] = total
    
    return p
                

In [27]:
a = ["a","a","a","a","b","a","a","b","b"]
b = ["a","a","a","a","b","a","a","b","a","a","b","a","a","b","a","a","b","a"]
c = [a,b]
grams, g3_freq = three_grams(c)
context = define_context(grams)
con_pairs = context_pairs(context)
co_occurs = define_cooccurrence(define_symbols(c),con_pairs,g3_freq)
norm = define_norm(co_occur_combos(define_symbols(c),con_pairs,co_occurs))
co_combos = co_occur_combos(define_symbols(c), con_pairs, co_occurs)
matM = define_matrix(define_symbols(c), co_combos, norm)
print(prob_occur(define_symbols(c), matM))

{'a': 0.5576923076923077, 'b': 0.4423076923076923}


In [28]:
def exp_val (symbols, prob):
    assert type(symbols) == list
    assert type(prob) == dict
    
    e_val = {}
    for a in symbols:
        for b in symbols:
            if a == b:
                e_val["{0}, {1}".format(a,b)] = prob["{0}".format(a)]**2
            else:
                e_val["{0}, {1}".format(a,b)] = 2*prob["{0}".format(a)]*prob["{0}".format(b)]
    
    return e_val

In [29]:
a = ["a","a","a","a","b","a","a","b","b"]
b = ["a","a","a","a","b","a","a","b","a","a","b","a","a","b","a","a","b","a"]
c = [a,b]
grams, g3_freq = three_grams(c)
context = define_context(grams)
con_pairs = context_pairs(context)
co_occurs = define_cooccurrence(define_symbols(c),con_pairs,g3_freq)
norm = define_norm(co_occur_combos(define_symbols(c),con_pairs,co_occurs))
co_combos = co_occur_combos(define_symbols(c), con_pairs, co_occurs)
matM = define_matrix(define_symbols(c), co_combos, norm)
probs = prob_occur(define_symbols(c), matM)
print(exp_val(define_symbols(c), probs))

{'a, a': 0.3110207100591716, 'a, b': 0.4933431952662722, 'b, a': 0.4933431952662722, 'b, b': 0.1956360946745562}


In [30]:
def sub_scores (traces):
    assert type(traces) == list
    
    symbols = define_symbols(traces)
    three_gs, three_gs_freq = three_grams(traces)
    cons = define_context(three_gs)
    con_pairs = context_pairs(cons)
    co_occurs = define_cooccurrence(symbols, con_pairs, three_gs_freq)
    co_combos = co_occur_combos(symbols, con_pairs, co_occurs)
    norm = define_norm(co_combos)
    matM = define_matrix(symbols, co_combos, norm)
    probs = prob_occur(symbols, matM)
    e_val = exp_val(symbols, probs)
    
    sub_costs = {}
    for a in symbols:
        for b in symbols:
            if a!=b:
                try:
                    sub_costs["{0}, {1}".format(a,b)] = np.log2(matM["{0}, {1}".format(a,b)]/e_val["{0}, {1}".format(a,b)])
                except:
                    sub_costs["{0}, {1}".format(a,b)] = -np.inf
    
    return sub_costs
    

In [31]:
s_score = sub_scores([test1, test2])
s_score



{'give.statement, respond.deny': -1.2189510662537508,
 'give.statement, use.social.convention': -0.7991671206359782,
 'give.statement, recall': -3.279259072099281,
 'give.statement, open.question': -1.1368280394978416,
 'give.statement, deflection': -1.342408219630197,
 'give.statement, relax.atmosphere': -0.7135337660807433,
 'give.statement, closed.question': -0.9102743960234426,
 'give.statement, misc': -1.8514323278005085,
 'give.statement, respond.agree': -0.7666064185194705,
 'give.statement, give.opinion': -1.279163087998234,
 'respond.deny, give.statement': -1.2189510662537508,
 'respond.deny, use.social.convention': -2.536784950339194,
 'respond.deny, recall': -3.546590963390785,
 'respond.deny, open.question': -0.21821161911223122,
 'respond.deny, deflection': 1.0815261709710615,
 'respond.deny, relax.atmosphere': -1.1657398104870977,
 'respond.deny, closed.question': -0.3093607060100491,
 'respond.deny, misc': -inf,
 'respond.deny, respond.agree': 0.8257198911647854,
 'respo

In [32]:
#gut check
print(sub_scores([test1, test2]) == sub_scores([test2, test1]))



False


# Note that with very broad categories of labels, there is going to be little defined similarity between each label, and switching the labels will always be bad (thus negative score for all substitutions), however this is expected, and we can see something like, what are the least harmful switches, to show similarity. Notice closed.question and open.question have higher similarity, which is something expected

In [33]:
#Insertion STEP 4: define Cxy(a) as the count of occurrences of 3-gram xay
def occ_count (symbols, cons, grams, gfreq):
    assert type(symbols) == list
    assert type(grams) == list
    assert type(cons) == dict
    
    o_counts = {}
    for a in list(cons.keys()):
        for pair in cons[a]:
            x = pair.split(", ")[0]
            y = pair.split(", ")[1]
            o_counts["{0}, {1}({2})".format(x,y,a)] = gfreq["{0}, {1}, {2}".format(x,a,y)]
    
    return o_counts

In [34]:
#test occ_count with values easily verifiable
t1 = ["a","a","a","a","a"]
t2 = ["a","a","a","a","b"]
tlog = [t1, t2]
symbols = define_symbols(tlog)
grams, gfreq = three_grams(tlog)
cons = define_context(grams)
oc = occ_count(symbols, cons, grams, gfreq)
print(oc)

{'a, b(a)': 1, 'a, a(a)': 5}


In [35]:
#Insertion STEP 5: define countRgivenL
def countRgL (symbols, ocounts):
    assert type(symbols) == list
    assert type(ocounts) == dict
    
    rgl_counts = {}
    
    for a in symbols:
        for x in symbols:
            #if a !=x:
            total = 0
            for k in list(ocounts.keys()):
                if k.split("(")[0].split(", ")[0] == x and k.split("(")[1] == "{0})".format(a):
                    total += ocounts[k]
            rgl_counts["{0}/{1}".format(a,x)] = total
    
    return rgl_counts

In [36]:
#test with values easily verifiable
t1 = ["a","a","a","a","a","a"]
t2 = ["a","a","a","a","b","a"]
tlog = [t1, t2]
symbols = define_symbols(tlog)
grams, gfreq = three_grams(tlog)
cons = define_context(grams)
oc = occ_count(symbols, cons, grams, gfreq)
rgl = countRgL(symbols, oc)
print(rgl)

#should add c to the results table, but no occurrences since it's at the end, opens 1 more for a/b
t1 = ["a","a","a","a","a","a","c"]
t2 = ["a","a","a","a","b","a","c"]
tlog = [t1, t2]
symbols = define_symbols(tlog)
grams, gfreq = three_grams(tlog)
cons = define_context(grams)
oc = occ_count(symbols, cons, grams, gfreq)
rgl = countRgL(symbols, oc)
print(rgl)

{'a/a': 7, 'a/b': 0, 'b/a': 1, 'b/b': 0}
{'c/c': 0, 'c/a': 0, 'c/b': 0, 'a/c': 0, 'a/a': 8, 'a/b': 1, 'b/c': 0, 'b/a': 1, 'b/b': 0}


In [37]:
#Insertion STEP 6: define norm(a)
def rgl_norm (symbols, rgl_counts):
    assert type(symbols) == list
    assert type(rgl_counts) == dict
    
    rgl_norms = {}
    
    for a in symbols:
        total = 0
        for x in symbols:
            #if a !=x:
            total += rgl_counts["{0}/{1}".format(a,x)]
        rgl_norms["{0}".format(a)] = total
    
    return rgl_norms

In [38]:
#test with values easily verifiable
t1 = ["a","a","a","a","a","a"]
t2 = ["a","a","a","a","b","a"]
tlog = [t1, t2]
symbols = define_symbols(tlog)
grams, gfreq = three_grams(tlog)
cons = define_context(grams)
oc = occ_count(symbols, cons, grams, gfreq)
rgl = countRgL(symbols, oc)
norms = rgl_norm(symbols, rgl)
print(norms)

{'a': 7, 'b': 1}


In [39]:
#Insertion STEP 7: define the probability of all symbols
def rgl_prob (trace):
    assert type(trace) == list
    
    p = {}
    for item in trace:
        for a in item:
            try:
                p["{0}".format(a)] += 1
            except:
                p["{0}".format(a)] = 1
    
    tot_len = 0
    for item in trace:
        tot_len += len(item)
    
    for k in list(p.keys()):
        p[k] = p[k]/tot_len
    
    return p

In [40]:
#test with values easily verifiable
t1 = ["a","a","a","a","a"]
t2 = ["a","a","a","a","b"]
tlog = [t1, t2]
symbols = define_symbols(tlog)
grams, gfreq = three_grams(tlog)
cons = define_context(grams)
oc = occ_count(symbols, cons, grams, gfreq)
rgl = countRgL(symbols, oc)
norms = rgl_norm(symbols, rgl)
probs = rgl_prob(tlog)
print(probs)

{'a': 0.9, 'b': 0.1}


In [41]:
#Insertion STEP 8: define rglNorm
def normed_counts (symbols, rgl, norms):
    assert type(symbols) == list
    assert type(rgl) == dict
    assert type(norms) == dict
    
    normed_rgls = {}
    
    for a in symbols:
        for b in symbols:
            normed_rgls["{0}/{1}".format(a,b)] = rgl["{0}/{1}".format(a,b)]/norms["{0}".format(a)]
    
    return normed_rgls

In [42]:
#test with values easily verifiable
t1 = ["a","a","a","a","a","a"]
t2 = ["a","a","a","a","b","a"]
tlog = [t1, t2]
symbols = define_symbols(tlog)
grams, gfreq = three_grams(tlog)
cons = define_context(grams)
oc = occ_count(symbols, cons, grams, gfreq)
rgl = countRgL(symbols, oc)
norms = rgl_norm(symbols, rgl)
probs = rgl_prob(tlog)
norm_rgls = normed_counts(symbols, rgl, norms)
print(norm_rgls)

{'a/a': 1.0, 'a/b': 0.0, 'b/a': 1.0, 'b/b': 0.0}


In [43]:
def insert_scores (traces):
    assert type(traces) == list
    
    symbols = define_symbols(traces)
    grams, freq = three_grams(traces)
    cons = define_context(grams)
    oc = occ_count(symbols, cons, grams, freq)
    rgl = countRgL(symbols, oc)
    norms = rgl_norm(symbols, rgl)
    probs = rgl_prob(traces)
    norm_rgls = normed_counts(symbols ,rgl, norms)
    
    scores = {}
    for a in symbols:
        for b in symbols:
            scores["{0}/{1}".format(a,b)] = np.log2(norm_rgls["{0}/{1}".format(a,b)]/probs["{0}".format(a)]*probs["{0}".format(b)])
    
    return scores

In [44]:
t1 = ["a","a","a","a","a","a"]
t2 = ["a","a","a","a","a","a"]
tlog = [t1, t2]
print(insert_scores(tlog))

t1 = ["a","a","a","a","a","a"]
t2 = ["a","a","a","a","b","a"]
tlog = [t1, t2]
print(insert_scores(tlog))

t1 = ["a","a","b","a","a","b","c","c"]
t2 = ["a","a","b","a","a","b","a","a"]
tlog = [t1, t2]
print(insert_scores(tlog))

{'a/a': 0.0}
{'a/a': 0.0, 'a/b': -inf, 'b/a': 3.4594316186372973, 'b/b': -inf}
{'c/c': -inf, 'c/a': -inf, 'c/b': 1.0, 'a/c': -inf, 'a/a': -0.8073549220576042, 'a/b': -2.5443205162238103, 'b/c': -inf, 'b/a': 1.3219280948873624, 'b/b': -inf}


  app.launch_new_instance()


# Here this seems correct since adding b given a will make t1 more similar to t2, where the only difference is that t2 has an extra b!

In [45]:
#test on our logs, specifically looking for respond.agree, open.question or respond.agree, closed.question to have high scores, since they usually
#appear next to each other in that order
ins_score = insert_scores([test1,test2])
ins_score

  app.launch_new_instance()


{'give.statement/give.statement': -1.1606828491374637,
 'give.statement/respond.deny': -10.130437092921554,
 'give.statement/use.social.convention': -3.96989332777355,
 'give.statement/recall': -6.486153501265199,
 'give.statement/open.question': -7.563396500197659,
 'give.statement/deflection': -12.622290189251228,
 'give.statement/relax.atmosphere': -4.22622063161136,
 'give.statement/closed.question': -7.151970254471194,
 'give.statement/misc': -10.452365187808915,
 'give.statement/respond.agree': -3.695253020530933,
 'give.statement/give.opinion': -3.1130100211633693,
 'respond.deny/give.statement': 0.6788138413804108,
 'respond.deny/respond.deny': -2.7548875021634687,
 'respond.deny/use.social.convention': -inf,
 'respond.deny/recall': -inf,
 'respond.deny/open.question': -2.187846909439575,
 'respond.deny/deflection': -inf,
 'respond.deny/relax.atmosphere': -inf,
 'respond.deny/closed.question': 1.5455074311742525,
 'respond.deny/misc': -inf,
 'respond.deny/respond.agree': -1.880

# scores generally make sense

# we can define similarity function

In [46]:
#gutcheck
insert_scores([test1,test2]) == insert_scores([test2, test1])

  app.launch_new_instance()


True

In [100]:
#similarity calculations between two traces is defined by the edit distance, modified by the substitution and indel scores
def calc_distance(trace1, trace2):
    
    assert type(trace1) == type(trace2) == list
    
    #switch so that trace1 is always > trace2
    if len(trace1) < len(trace2):
        trace_copy = trace1
        trace1 = trace2
        trace2 = trace_copy

    M = len(trace1)
    N = len(trace2)
    sim_table = np.zeros((M,N)) #establish table
    s_score = sub_scores([trace1, trace2]) #get substitution score
    ins_score = insert_scores([trace1, trace2]) #get insertion score
    
    #fill table, horizontal -> vertical
    for i in range(M):
        for j in range(N):
            
            #original fill horizontal
            if i == 0:
                if j == 0 or j == 1: #first fill
                    sim_table[i][j] = 0
                else: #rest fill, base insert scores
                    sim_table[i][j] = -1/ins_score["{0}/{1}".format(trace2[j], trace2[j-1])] + sim_table[i][j-1]
            
            #original fill vertical
            elif j == 0:
                sim_table[i][j] = i #consider deletion as unit 1 cost
            
            elif trace1[i] == trace2[j]: #no changes
                sim_table[i][j] = sim_table[i-1][j-1]
            
            else: #substitution, insertion or deletion
                
                #determine the max to take
                op = np.argmin([sim_table[i-1][j], sim_table[i][j-1], sim_table[i-1][j-1]]) #in order, removal, insertion, substitution
                if op == 0:
                    sim_table[i][j] = 1 + sim_table[i-1][j] #removal
                elif op == 1:
                    sim_table[i][j] = -1/ins_score["{0}/{1}".format(trace2[j],trace2[j-1])] + sim_table[i][j-1] #insertion
                elif op == 2:
                    sim_table[i][j] = -1/s_score["{0}, {1}".format(trace1[i],trace2[j])] + sim_table[i-1][j-1] #substitution
                
    return sim_table[i][j] #final score
            

In [101]:
a = ["a","a","a","a","b","a","c","c"]
b = ["b","b","b","b","a","b","c","c","c","c"]
c = ["a","a","b","a"]
d = ["b","a","b","b"]
print(calc_distance(a,b))

print(calc_distance(b,c))

print(calc_distance(a,c))

print(calc_distance(d,c))

print(calc_distance(a,a)==0)
print(calc_distance(a,b)==calc_distance(b,a))

7.044083186301329
7.0
4.0
-inf
True
True


  app.launch_new_instance()


In [102]:
calc_distance(test1,test2)

  app.launch_new_instance()


427.81181431085855

In [103]:
calc_distance(test1,test3)

  app.launch_new_instance()


60.98780335263162

In [104]:
calc_distance(test2, test3)

  app.launch_new_instance()


-411.088249513836