In [15]:
import numpy as np
import pandas as pd

Edit-distance operations are defined as 
* (a,a) denotes a match of symbols at the given position
* (a,-) denotes deletion of symbol 'a' at some position
* (-,b) denotes insertion of symbol 'b' at some position
* (a,b) denotes replacement of 'a' with 'b' at some position, and a != b
We assign a separate cost for each of these operations according to our needs

In [16]:
#for simplicity and completeness sake, we define a simple levenshtein distance function first

def basic_distance (trace1, trace2):

    M = len(trace1)
    N = len(trace2)
    edit_table = np.zeros((M,N)) #establish table

    #fill table
    for i in range(M):
        for j in range(N):

            if i == 0:
                edit_table[i][j] = j

            elif j ==0:
                edit_table[i][j] = i

            elif trace1[i-1] == trace2[j-1]:
                edit_table[i][j] = edit_table[i-1][j-1]

            else:
                edit_table[i][j] = 1 + min(edit_table[i-1][j], edit_table[i][j-1], edit_table[i-1][j-1]) #scoring done here

    return edit_table[i][j]

In [17]:
#sample test
dat1 = pd.read_csv('./graham.norton.s22.e08_data.csv')
dat2 = pd.read_csv('./graham.norton.s22.e12_data.csv')
test1 = list(dat1.L)
test2 = list(dat2.L)

In [17]:
basic_distance(test1,test2)

517.0

In [18]:
#baseline
import nltk
nltk.edit_distance(test1,test2)

517

# Now we have to alter the function to accomodate for different scoring metrics according to the paper
* substitution of uncorrelated activities should be discouraged
* substitution of contrasting activities should be penalized
* insertion of activities out of context should be discouraged
* substitution of correlated activities should be encouraged in proportion to the degree of similarity

In [77]:
#similarity is calculated to address substitution costs
def sub_cost (log):
    
    g3_freq = {}
    conteX = {}
    
    '''
    g3_freq is a dictionary in the form:
    key: triple of labels, i.e: "respond.agree, open.question, give.opinion"
    value: int of frequency value, i.e: 20
    
    conteX is a dictionary in the form:
    key: single or pair label, i.e: "respond.agree", "respond.agree, open.question"
    value: list of label pairs
    value: set of label pairs, i.e: ["respond.agree, give.opinion", "closed.question, relax.atmosphere"]
    **value depends on whether it is a singular or pair label used as a key
    
    co_occur is a dictionary in the form:
    key: a context and label pair, i.e: "open.question, give.opinion"("relax.atmosphere", "use.social.convention")
    value: int of the co-occurence value i.e: 20
    
    '''
   
    #STEP 1: let A be the alphabet
    symbols = set(log)
    
    #set conteX
    for item in symbols:
        conteX[item] = []
        
    #STEP 2: find the 3 grams and their freq 
    for i in range(len(log)-3):
        try:
            g3_freq[", ".join(list(log[i:i+3]))] += 1
        except:
            g3_freq[", ".join(list(log[i:i+3]))] = 1
    
    #STEP 3: determine the set of contexts for each symbols from the 3-grams
    for threegram in list(g3_freq.keys()):
        con_a, sym, con_b = threegram.split(", ")
        conteX[sym].append(con_a + ", " + con_b)

    #STEP 4: determine context for each pair
    for p1 in symbols:
        for p2 in symbols:
            conteX["{0}, {1}".format(p1, p2)] = list(set(conteX[p1] + conteX[p2]))

    #STEP 5: determine co-occurrence
    co_occur = {}
    for p1 in symbols:
        for p2 in symbols:
            if p1 != p2:
                for con in conteX["{0}, {1}".format(p1,p2)]:
                    #consider the 3-grams for each of the context in this pair
                    try:
                        co_occur["{0}({1}, {2})".format(con,p1,p2)] = g3_freq["{0}, {1}, {2}".format(con.split(", ")[0], p1, con.split(", ")[1])] * g3_freq["{0}, {1}, {2}".format(con.split(", ")[0], p2, con.split(", ")[1])]
                    except:
                        co_occur["{0}({1}, {2})".format(con,p1,p2)] = 0
            elif p1 == p2:
                for con in conteX["{0}, {1}".format(p1,p2)]:
                    try:
                        co_occur["{0}({1}, {2})".format(con,p1,p2)] = (g3_freq["{0}, {1}, {2}".format(con.split(", ")[0], p1, con.split(", ")[1])]*(g3_freq["{0}, {1}, {2}".format(con.split(", ")[0], p1, con.split(", ")[1])]-1))//2
                    except:
                        co_occur["{0}({1}, {2})".format(con,p1,p2)] = 0
    #STEP 6: calculate co-occurrence combinations
    co_occur_combo = {}
    for p1 in symbols:
        for p2 in symbols:
            total = 0
            for con in conteX["{0}, {1}".format(p1,p2)]:
                total += co_occur["{0}({1}, {2})".format(con, p1, p2)]
            co_occur_combo["{0}, {1}".format(p1, p2)] = total
        
    #STEP 7: define the norm of all the count of co-occurence combinations
    norm = 0
    for k in co_occur_combo.keys():
        norm += co_occur_combo[k]
    
    #STEP 8: define the matrix
    mat = {}
    for p1 in symbols:
        for p2 in symbols:
            mat["{0}, {1}".format(p1,p2)] = co_occur_combo["{0}, {1}".format(p1, p2)]/norm
    
    #STEP 9: define probability of occurrence
    p_occur = {}
    for s in symbols:
        par = 1
        for b in symbols:
            if s != b:
                try:
                    p_occur[s] += mat["{0}, {1}".format(s,b)]
                    par += 1
                except:
                    p_occur[s] = 0
        p_occur[s] += mat["{0}, {1}".format(s,s)]
        p_occur[s] = p_occur[s]/par
    
    #STEP 10: define expected values matrix
    E_mat = {}
    for a in symbols:
        for b in symbols:
            if a == b:
                E_mat["{0}, {1}".format(a,b)] = p_occur[a]**2
            else:
                E_mat["{0}, {1}".format(a,b)] = 2*p_occur[a]*p_occur[b]
    
    #STEP 11: define matrix of scores
    scores = {}
    for a in symbols:
        for b in symbols:
            if a != b:
                scores["{0}, {1}".format(a,b)] = np.log2(mat["{0}, {1}".format(a,b)]/E_mat["{0}, {1}".format(a,b)])
    
    return scores

In [80]:
print(sub_cost(test1+test2))

{'use.social.convention, respond.agree': 4.307970174951491, 'use.social.convention, relax.atmosphere': 5.522587734962121, 'use.social.convention, give.opinion': 5.0279074603308755, 'use.social.convention, x': 1.680969281578785, 'use.social.convention, deflection': 3.7710490200474336, 'use.social.convention, closed.question': 4.130652341532316, 'use.social.convention, open.question': 4.467716233974865, 'use.social.convention, respond.deny': 3.820834443562954, 'respond.agree, use.social.convention': 4.307970174951491, 'respond.agree, relax.atmosphere': 5.200470411838418, 'respond.agree, give.opinion': 5.296264655822938, 'respond.agree, x': 0.9953794112482077, 'respond.agree, deflection': 5.995774832667948, 'respond.agree, closed.question': 5.524601236186945, 'respond.agree, open.question': 5.669793858904678, 'respond.agree, respond.deny': 7.2558498173775, 'relax.atmosphere, use.social.convention': 5.522587734962121, 'relax.atmosphere, respond.agree': 5.200470411838418, 'relax.atmosphere,