In [1]:
import numpy as np
import pandas as pd

Edit-distance operations are defined as 
* (a,a) denotes a match of symbols at the given position
* (a,-) denotes deletion of symbol 'a' at some position
* (-,b) denotes insertion of symbol 'b' at some position
* (a,b) denotes replacement of 'a' with 'b' at some position, and a != b
We assign a separate cost for each of these operations according to our needs

In [2]:
#for simplicity and completeness sake, we define a simple levenshtein distance function first

def basic_distance (trace1, trace2):

    M = len(trace1)
    N = len(trace2)
    edit_table = np.zeros((M,N)) #establish table

    #fill table
    for i in range(M):
        for j in range(N):

            if i == 0:
                edit_table[i][j] = j

            elif j ==0:
                edit_table[i][j] = i

            elif trace1[i-1] == trace2[j-1]:
                edit_table[i][j] = edit_table[i-1][j-1]

            else:
                edit_table[i][j] = 1 + min(edit_table[i-1][j], edit_table[i][j-1], edit_table[i-1][j-1]) #scoring done here

    return edit_table[i][j]

In [15]:
#sample test
dat1 = pd.read_csv('./graham.norton.s22.e08_data.csv')
dat2 = pd.read_csv('./graham.norton.s22.e12_data.csv')
dat3 = pd.read_csv('./blackpink_data.csv')
test1 = list(dat1.L)
test2 = list(dat2.L)
test3 = list(dat3.L)

In [4]:
basic_distance(test1,test2)

517.0

In [5]:
#baseline
import nltk
nltk.edit_distance(test1,test2)

517

# Now we have to alter the function to accomodate for different scoring metrics according to the paper
* substitution of uncorrelated activities should be discouraged
* substitution of contrasting activities should be penalized
* insertion of activities out of context should be discouraged
* substitution of correlated activities should be encouraged in proportion to the degree of similarity

In [18]:
#similarity is calculated to address substitution costs
def sub_cost (log):
    
    g3_freq = {}
    conteX = {}
    
    '''
    g3_freq is a dictionary in the form:
    key: triple of labels, i.e: "respond.agree, open.question, give.opinion"
    value: int of frequency value, i.e: 20
    
    conteX is a dictionary in the form:
    key: single or pair label, i.e: "respond.agree", "respond.agree, open.question"
    value: list of label pairs
    value: set of label pairs, i.e: ["respond.agree, give.opinion", "closed.question, relax.atmosphere"]
    **value depends on whether it is a singular or pair label used as a key
    
    co_occur is a dictionary in the form:
    key: a context and label pair, i.e: "open.question, give.opinion"("relax.atmosphere", "use.social.convention")
    value: int of the co-occurence value i.e: 20
    
    '''
   
    #STEP 1: let A be the alphabet
    symbols = list(set(log))
    
    #set conteX
    for item in symbols:
        conteX[item] = []
        
    #STEP 2: find the 3 grams and their freq 
    for i in range(len(log)-3):
        try:
            g3_freq[", ".join(list(log[i:i+3]))] += 1
        except:
            g3_freq[", ".join(list(log[i:i+3]))] = 1
    
    #STEP 3: determine the set of contexts for each symbols from the 3-grams
    for threegram in list(g3_freq.keys()):
        con_a, sym, con_b = threegram.split(", ")
        conteX[sym].append(con_a + ", " + con_b)

    #STEP 4: determine context for each pair
    conteX_pairs = {}
    for p1 in symbols:
        for p2 in symbols:
            conteX_pairs["{0}, {1}".format(p1, p2)] = list(set(conteX[p1] + conteX[p2]))

    #STEP 5: determine co-occurrence
    co_occur = {}
    for pair in list(conteX_pairs.keys()):
        for a in symbols:
            for b in symbols:
                if a == b:
                    try:
                        n = g3_freq[pair.split(", ")[0] + ", " + a + ", " + pair.split(", ")[1]]
                        co_occur["{0}({1},{2})".format(pair, a, b)] = (n*(n-1))/2
                    except:
                        co_occur["{0}({1},{2})".format(pair, a, b)] = 0
                elif a != b:
                    try:
                        n_i = g3_freq[pair.split(", ")[0] + ", " + a + ", " + pair.split(", ")[1]]
                        n_j = g3_freq[pair.split(", ")[0] + ", " + b + ", " + pair.split(", ")[1]]
                        co_occur["{0}({1},{2})".format(pair, a, b)] = n_i*n_j
                    except:
                        co_occur["{0}({1},{2})".format(pair, a, b)] = 0
    
    #STEP 6: calculate co-occurrence combinations
    co_occur_combo = {}
    for pair in list(conteX_pairs.keys()):
        total = 0
        for a in symbols:
            for b in symbols:
                total += co_occur["{0}({1},{2})".format(pair,a,b)]
        co_occur_combo[pair] = total
        
    #STEP 7: define the norm of all the count of co-occurence combinations
    norm = 0
    for a in symbols:
        for b in symbols:
            norm += co_occur_combo["{0}, {1}".format(a,b)]
    
    #STEP 8: define the matrix
    mat = {}
    for p1 in symbols:
        for p2 in symbols:
            mat["{0}, {1}".format(p1,p2)] = co_occur_combo["{0}, {1}".format(p1, p2)]/norm
    
    #STEP 9: define probability of occurrence
    p_occur = {}
    for s in symbols:
        par = 1
        for b in symbols:
            if s != b:
                try:
                    p_occur[s] += mat["{0}, {1}".format(s,b)]
                    par += 1
                except:
                    p_occur[s] = 0
        p_occur[s] += mat["{0}, {1}".format(s,s)]
        p_occur[s] = p_occur[s]/par
    
    #STEP 10: define expected values matrix
    E_mat = {}
    for a in symbols:
        for b in symbols:
            if a == b:
                E_mat["{0}, {1}".format(a,b)] = p_occur[a]**2
            else:
                E_mat["{0}, {1}".format(a,b)] = 2*p_occur[a]*p_occur[b]
    
    #STEP 11: define matrix of scores
    scores = {}
    for a in symbols:
        for b in symbols:
            if a != b:
                scores["{0}, {1}".format(a,b)] = np.log2(mat["{0}, {1}".format(a,b)]/E_mat["{0}, {1}".format(a,b)])
    
    return scores

In [19]:
print(sub_cost(test1+test2))

{'recall, closed.question': 3.523190915023277, 'recall, give.opinion': 3.8993787254190617, 'recall, use.social.convention': 0.3392726415014651, 'recall, relax.atmosphere': 2.594831926773112, 'recall, respond.deny': -inf, 'recall, deflection': -inf, 'recall, misc': 0.24328603503162152, 'recall, give.statement': 2.3770776437505035, 'recall, respond.agree': 2.4243417895982877, 'recall, open.question': 4.158513172676578, 'closed.question, recall': 2.148795400241779, 'closed.question, give.opinion': 5.2179210354519245, 'closed.question, use.social.convention': 3.261431135587172, 'closed.question, relax.atmosphere': 6.502343644894418, 'closed.question, respond.deny': 7.836851393927039, 'closed.question, deflection': 8.523351921110258, 'closed.question, misc': -inf, 'closed.question, give.statement': 6.133650072118235, 'closed.question, respond.agree': 7.40245668965801, 'closed.question, open.question': 7.546335239111096, 'give.opinion, recall': 3.708822391204619, 'give.opinion, closed.questi



In [20]:
print(sub_cost(test1+test3))

{'recall, closed.question': 3.8035271075966586, 'recall, give.opinion': 5.179544295097426, 'recall, use.social.convention': 1.7085675441562718, 'recall, relax.atmosphere': 3.2031720469263423, 'recall, respond.deny': -inf, 'recall, deflection': -inf, 'recall, x': -inf, 'recall, mis': -inf, 'recall, misc': 4.487676611251478, 'recall, give.statement': 3.4712020622339392, 'recall, respond.agree': 3.941737049015713, 'recall, open.question': 4.7420051322944765, 'closed.question, recall': 2.3661217952893603, 'closed.question, give.opinion': 4.7206766001728715, 'closed.question, use.social.convention': 4.7278556870839, 'closed.question, relax.atmosphere': 9.065832869596106, 'closed.question, respond.deny': 10.111076517203145, 'closed.question, deflection': 11.959073423758095, 'closed.question, x': 6.4215580854394, 'closed.question, mis': -inf, 'closed.question, misc': -inf, 'closed.question, give.statement': 8.948405768388406, 'closed.question, respond.agree': 9.617278986850287, 'closed.questi

