# Calculate Similarities

The following code calculates the Euclidean distance between the Sent2Vec embeddings of every pair of phrases (a smaller distance means the phrases are more similar).

In [1]:
# Read the sent2vec embeddings from file
import numpy as np

def read_embeddings(embeddings_path):
    """Read whitespace-separated embedding vectors from a text file.

    Arguments:
        - embeddings_path: path to the embeddings file; every non-blank line
          holds the components of one vector separated by whitespace

    Returns:
        A list with one list of floats per non-blank line, in file order.
        Blank lines are skipped.

    Raises:
        ValueError: if a component cannot be parsed as a float.
    """
    embeddings = []
    with open(embeddings_path, 'r') as in_stream:
        for line in in_stream:
            components = line.split()
            if not components:
                continue # blank line, e.g. a trailing newline at the end of the file
            # float() instead of eval(): file content is never executed as code,
            # and repeated spaces or tabs between components are tolerated
            embeddings.append([float(component) for component in components])
    return embeddings

embeddings = np.array(read_embeddings("./bddx_embeddings.txt"))

In [5]:
def dist(e1,e2):
    """Return the Euclidean distance between the embeddings e1 and e2."""
    difference = e1-e2
    return np.linalg.norm(difference)

In [7]:
print(embeddings.shape[0])

26539


In [None]:
# Calculate the Euclidean distance between every pair of phrase embeddings.
# The matrix is symmetric with a zero diagonal (already set by np.zeros), so
# only the upper triangle is computed and then mirrored.
# Requires `embeddings` to be a 2-D array (one row per phrase).
matrix_size = embeddings.shape[0]
dist_matrix = np.zeros((matrix_size,matrix_size))

for i in range(matrix_size):
    if i%1000 == 0:
        print(i) # Lets the user track progress
    # Distances from phrase i to all later phrases in one vectorized call
    # instead of one Python-level norm call per pair
    row = np.linalg.norm(embeddings[i+1:]-embeddings[i],axis=1)
    dist_matrix[i,i+1:] = row
    dist_matrix[i+1:,i] = row


In [1]:
# Save the distance matrix to file. Since the matrix is very large, we split it across
# multiple files
def save_to_file(matrix,filename,page_size=1000,path="/Volumes/External HD/Deep Learning"):
    """Pickle `matrix` in pages of `page_size` rows, one file per page.

    Page i holds rows [i*page_size, (i+1)*page_size) and is written to
    "<path>/<filename>_<i>.pkl"; the last page may hold fewer rows.

    Arguments:
        - matrix: array to save; it is split along its first axis
        - filename: base name of the output files (no page index, no extension)
        - page_size: maximum number of rows per file
        - path: existing directory the page files are written to
    """
    import pickle as pkl
    from math import ceil

    page_count = ceil(matrix.shape[0]/page_size)
    for i in range(page_count):
        print("%i / %i"%(i,page_count-1)) # progress: current page / last page
        with open("%s/%s_%i.pkl"%(path,filename,i),"wb") as _output:
            # Slicing past the end is clamped, so the final (short) page
            # needs no special case
            pkl.dump(matrix[i*page_size:(i+1)*page_size],_output)


In [None]:
save_to_file(dist_matrix,"sent2vec_dist")

# Perform Similarity Tests

The following code looks at the results of the calculated similarities.

WARNING: High memory required to open both matrices simultaneously.

In [2]:
# Read the distance matrix from file. Skip this step if you still have the distance matrix from
# the above calculations
import numpy as np
import pickle as pkl
from math import ceil

def load_from_file(filename,page_size=1000,path="/Volumes/External HD/Deep Learning"):
    """Reassemble a square matrix from the page files "<path>/<filename>_<i>.pkl".

    Arguments:
        - filename: base name of the page files (no page index, no extension)
        - page_size: ignored, kept for backward compatibility; the actual page
          size is taken from the number of rows in page 0
        - path: directory that holds the page files

    Returns:
        A (matrix_size, matrix_size) float array, where matrix_size is the
        number of columns of page 0.
    """
    def _read_page(index):
        # NOTE: pickle can execute arbitrary code while loading; only open
        # page files that you produced yourself
        with open("%s/%s_%i.pkl"%(path,filename,index),"rb") as _page_in:
            return pkl.load(_page_in)

    first_page = _read_page(0)
    page_size = first_page.shape[0]
    matrix_size = first_page.shape[1]

    output = np.zeros((matrix_size,matrix_size))
    output[0:page_size] = first_page

    for i in range(1,ceil(matrix_size/page_size)):
        # The slice is clamped at the end, so a short last page still fits
        output[i*page_size:(i+1)*page_size] = _read_page(i)

    return output


In [3]:
dist_matrix = load_from_file("sent2vec_dist")
print(dist_matrix[-1,-2])


3.4841733350580784


In [2]:
sdl_matrix = load_from_file("sdl_dist")
print(sdl_matrix[-1,-2])

1.0


In [4]:
# Scale the Sent2Vec distances to unit standard deviation
d_std = np.std(dist_matrix)

# In-place division: avoids allocating a second full-size matrix
dist_matrix /= d_std

print(d_std)


0.7566601926122294


In [5]:
# Scale the SDL distances to unit standard deviation
s_std = np.std(sdl_matrix)

# In-place division: avoids allocating a second full-size matrix
sdl_matrix /= s_std

print(s_std)


0.4707403197629428


In [6]:
# Element-wise absolute difference between the two distance matrices,
# computed in place so that no additional full-size copy is allocated
dist_matrix-=sdl_matrix

# delete sdl_matrix to save on memory overhead
del sdl_matrix

np.absolute(dist_matrix,out=dist_matrix)

In [9]:
save_to_file(dist_matrix,"diff")

0 / 26
1 / 26
2 / 26
3 / 26
4 / 26
5 / 26
6 / 26
7 / 26
8 / 26
9 / 26
10 / 26
11 / 26
12 / 26
13 / 26
14 / 26
15 / 26
16 / 26
17 / 26
18 / 26
19 / 26
20 / 26
21 / 26
22 / 26
23 / 26
24 / 26
25 / 26
26 / 26


In [2]:
dist_matrix = load_from_file("diff")

print(dist_matrix[-1][-2])

2.4803602957450845


In [3]:
avg = np.average(dist_matrix)
std = np.std(dist_matrix)

print(avg)
print(std)


2.7851664212355964
1.3568757958621107


In [5]:
# Collect index pairs (i, j) with i < j whose value in dist_matrix falls into
# one of two open intervals:
#   similar:     sim_ignore < value < sim_val
#   not_similar: unsim_val  < value < unsim_ignore
similar = []
not_similar = []

sim_val      = 0.005
sim_ignore   = 0.0001
unsim_val    = 5.45
unsim_ignore = 5.50

for i in range(dist_matrix.shape[0]):
    # Only the part of the row right of the diagonal, tested in one
    # vectorized comparison instead of one Python-level test per element
    row = dist_matrix[i,i+1:]
    is_similar = (row < sim_val) & (row > sim_ignore)
    # "similar" takes precedence, as in an if/elif chain
    is_not_similar = (row > unsim_val) & (row < unsim_ignore) & ~is_similar
    # flatnonzero is relative to the slice, hence the i+1 offset
    similar.extend((i,j) for j in (np.flatnonzero(is_similar)+i+1).tolist())
    not_similar.extend((i,j) for j in (np.flatnonzero(is_not_similar)+i+1).tolist())

print(len(similar))
print(len(not_similar))


65267
735795


In [6]:
for sim in similar:
    if sim[0] > 7:
        break
    print(sim)
    
print()
for not_sim in not_similar:
    if not_sim[0] > 0:
        break
    print(not_sim)


(0, 14642)
(1, 4588)
(1, 9011)
(1, 10390)
(2, 25014)
(5, 14833)
(5, 23282)
(6, 10992)
(7, 810)
(7, 1400)
(7, 2752)
(7, 4227)
(7, 8395)
(7, 8852)
(7, 9065)
(7, 9558)
(7, 11504)
(7, 16965)
(7, 18581)
(7, 21277)
(7, 21378)
(7, 24561)

(0, 2632)
(0, 4420)
(0, 5583)
(0, 5606)
(0, 6503)
(0, 9542)
(0, 10978)
(0, 11197)
(0, 11890)
(0, 14553)
(0, 15323)
(0, 16068)
(0, 17037)
(0, 17364)
(0, 18451)
(0, 22149)
(0, 22369)
(0, 22602)
(0, 22696)
(0, 22793)
(0, 23105)
(0, 24275)
(0, 25103)
(0, 25266)


In [15]:
for n in not_similar:
    if n[0] == 1592:
        print(n[1])

3261
5365
6412
7555
8845
9742
10293
10607
11016
11855
12860
13334
13868
14312
14358
14821
14869
15262
16107
16743
19288
20886
20916
21459
23528
23540
24429
24944
25909


In [7]:
with open("sim.pkl","wb") as _output:
    pkl.dump(similar,_output)

with open("not_sim.pkl","wb") as _output:
    pkl.dump(not_similar,_output)


In [None]:
# Reload the pair lists from "sim.pkl" and "not_sim.pkl"
similar = []
not_similar = []

with open("sim.pkl","rb") as _in:
    similar = pkl.load(_in)

with open("not_sim.pkl","rb") as _in:
    not_similar = pkl.load(_in)


In [63]:
sanity_check = []
check_sim = False

# check_sim selects what is inspected: similar pairs whose first index is at
# most 7, or dissimilar pairs whose first index is at most 10
if check_sim:
    pairs, last_first_index = similar, 7
else:
    pairs, last_first_index = not_similar, 10

for pair in pairs:
    if pair[0] > last_first_index:
        break
    sanity_check.append(pair)
    print(phrase_lookup[pair[0]])
    print(phrase_lookup[pair[1]])
    if check_sim:
        print() # the blank separator is only printed for similar pairs



The car stops because it turns to the right.
The car stops bec
The car accelerates slowly to a maintained speed because the light has turned green and traffic is flowing smoothly.
Driver removes camera Driver removes camera
The car accelerates slowly to a maintained speed because the light has turned green and traffic is flowing smoothly.
Driver removes camera Driver removes camera
The car is driving forward as traffic flows freely.
Driver removes camera Driver removes camera
The car is driving forward as traffic flows freely.
Driver removes camera Driver removes camera
The car merges into the lane to its left to get around a slower car in front of it.
Driver removes camera Driver removes camera
The car merges into the lane to its left to get around a slower car in front of it.
Driver removes camera Driver removes camera
The car drives at a normal speed as traffic moves freely.
[Camera is blocked] [Camera is blocked]
The car drives at a normal speed as traffic moves freely.
Driver take

In [7]:
print(np.count_nonzero(dist_matrix <= 2.0))
print(np.count_nonzero(dist_matrix <= 3.0))
print(dist_matrix.shape)

437124287
(26539, 26539)


# Phrase Lookup
This code loads the phrases and arranges them such that the phrases align with the indices of the previous methods (e.g., phrase_lookup\[0\] refers to the phrase associated with dist_matrix\[0\])

In [13]:
# Read phrases from file to be able to manually examine their similarity
import pandas as pd # Pandas library enables data manipulation
data_url = "./revisedBDDX.csv"
def load_bddx_data(csv_name):
    """Read the CSV at `csv_name` into a DataFrame with fixed column names.

    The columns are 'Index' and 'InputVideo', followed by the four columns
    '<k>S', '<k>E', '<k>A', '<k>J' for every k from 1 to 15.
    """
    segment_columns = [str(segment)+suffix
                       for segment in range(1,16)
                       for suffix in ('S','E','A','J')]
    return pd.read_csv(csv_name, names=['Index', 'InputVideo']+segment_columns)

bddx = load_bddx_data(data_url)

# Column-name prefixes '1'..'15', one per segment
segments = [str(segment) for segment in range(1,16)]

# The 'S' and 'E' columns are not needed for the phrases
bddx = bddx.drop([segment+suffix for segment in segments for suffix in ('S','E')], axis=1)
bddx = bddx.fillna("")

# Join the 'A' and 'J' text of every segment into one '<k>AJ' phrase column
for segment in segments:
    bddx[segment+'AJ'] = bddx[[segment+'A', segment+'J']].agg(' '.join, axis=1)

bddx = bddx.drop(['Index']+[segment+suffix for segment in segments for suffix in ('A','J')], axis=1)
bddx = bddx.drop(bddx.index[0]) # discard the first row

sdlList = [row.astype(str) for _, row in bddx.iterrows()]

# One dict per row, mapping '1'..'15' to that segment's phrase
sdlStatements = [{segment: row[segment+'AJ'] for segment in segments} for row in sdlList]

# Flatten row by row and segment by segment; a phrase equal to " " is what the
# join produces when both parts are empty, and is skipped
phrase_lookup = [phrase
                 for statement in sdlStatements
                 for phrase in statement.values()
                 if phrase != " "]

# SDL Ranking Consistency
Checks whether the rankings are consistent between S2V and SDL embeddings

In [12]:
matrix_size = sdl_matrix.shape[0]
output = []
mask = 9999

# For every row, record the indices of all entries that share the row's
# minimum value, with the diagonal entry temporarily masked out
for i in range(matrix_size):
    if i%1000 == 0:
        print(i)
    row = sdl_matrix[i] # view: writes below go through to sdl_matrix
    self_distance = row[i]
    row[i] = mask # ignore self
    output.append(np.flatnonzero(row == row.min()).tolist())
    row[i] = self_distance # restore the diagonal entry

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000


In [27]:
import pickle as pkl

with open("sdl_rank.pkl","wb") as _out:
    pkl.dump(output,_out)


In [None]:
# For every phrase i, keep as many nearest neighbours from dist_matrix as
# there are entries in sdl[i], dropping trailing entries that tie with the
# first entry left out.
# NOTE: `sdl` is the list loaded from "sdl_rank.pkl"; it must be in scope
# before this cell is run.
matrix_size = dist_matrix.shape[0]
output = []
mask = 9999
mismatch = 0

for i in range(matrix_size):
    if i%1000 == 0:
        print(i)
    row = dist_matrix[i] # view: writes below go through to dist_matrix
    self_distance = row[i]
    row[i] = mask # ignore self
    indices = np.argsort(row)
    cutoff = len(sdl[i])-1
    if row[indices[cutoff]] == row[indices[cutoff+1]]:
        mismatch+=1 # Count the number of mismatches for bookkeeping purposes
    # Move the cutoff down until it no longer splits a run of equal distances
    while row[indices[cutoff]] == row[indices[cutoff+1]] and cutoff > 0:
        cutoff-=1
    output.append(indices[0:cutoff+1].tolist())
    # Restore the diagonal only now: the comparisons above rely on the mask
    row[i] = self_distance
        

0
1000
2000
3000
4000


In [14]:
import pickle as pkl

with open("s2v_rank.pkl","wb") as _out:
    pkl.dump(output,_out)


In [15]:
import pickle as pkl

with open("sdl_rank.pkl","rb") as _in:
    sdl = pkl.load(_in)

with open("s2v_rank.pkl","rb") as _in:
    s2v = pkl.load(_in)
    
if len(sdl) != len(s2v):
    print("Warning: list lengths do not match")


In [2]:

# One (TP, FP, FN) tuple per phrase: s2v[i] holds the samples of interest and
# sdl[i] is compared against it
output = []

for i in range(len(s2v)):
    if i%1000 == 0:
        print(i)
    gt = set(s2v[i]) # samples of interest
    pred = set(sdl[i])
    tp = len(gt & pred)  # in both rankings
    fp = len(pred - gt)  # only in the SDL ranking
    fn = len(gt - pred)  # only in the S2V ranking
    output.append((tp,fp,fn))


0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000


In [3]:

with open("rank_analysis.pkl","wb") as _out:
    pkl.dump(output,_out)


In [4]:

with open("rank_analysis.pkl","rb") as _in:
    analysis = pkl.load(_in)


In [5]:
# F1 score per phrase from the (TP, FP, FN) tuples in `analysis`
f1 = []

for i in range(len(analysis)):
    tp,fp,fn = analysis[i]
    if tp == 0:
        if fp == 0 or fn == 0:
            # Degenerate entry: one of the two rankings was empty
            print(i)
            print(analysis[i])
        # Without true positives the score is 0; precision and recall are
        # not computed here because tp+fp or tp+fn may be zero
        f1.append(0)
        continue
    p = tp/(tp+fp) # precision
    r = tp/(tp+fn) # recall
    f1.append(2*(p*r)/(p+r))


In [4]:
# Jaccard index TP / (TP + FP + FN) per phrase
j = [tp/(fp+fn+tp) for tp,fp,fn in analysis]

In [6]:
import numpy as np

n = np.array(f1)

print(n.mean())
print(n.std())

srt = np.argsort(-n)

print("===")
print(f1[srt[int(len(srt)/2)]])


0.24606881080551876
0.16487261998853636
===
0.23393939393939395


In [21]:
z = np.where(n == 0.0)[0]
print(z[0:20])

[ 11  23 123 124 146 168 183 222 258 278 282 293 304 341 357 382 406 438
 443 473]


In [31]:
case = 473

print(phrase_lookup[case])
print("====")
print("S2V:")
for i in s2v[case][0:3]:
    print(phrase_lookup[i])
print("====")
print("SDL:")
for i in sdl[case][0:10]:
    print(phrase_lookup[i])

The car is leaving its parking space. The car is beginning to travel down the road.
====
S2V:
The car is accelerating and making a u-turn to the left. The car is leaving its parking space.
The car is merging left and accelerating. The car is leaving its parking space.
The car is travelling down the road. The car is at an intersection with a red light.
====
SDL:
The car inches forward because the car in front moves forward.
The car is negotiating a left hand curve The car has come to a curve in the road and is turning 180 degrees to the left.
The car veers right to pass a vehicle that stopped in the car's lane.
The car's windshield wipers run because there is rain on the windshield.
The car gets into close proximity of the forward vehicle because the forward vehicle slowed down.
The car brakes quickly because traffic ahead brakes.
The car remains in the same lane and goes the same speed. Because the car is merging onto the highway with no traffic ahead of it.
The car veers left to pass 

# SDL Ranking Average Distance
Calculates the average sent2vec distance between the SDL's most-similar samples

In [9]:
# Average Distance of SDL Ranking
import numpy as np

avg_dist = np.zeros(len(sdl))

for i,sample in enumerate(sdl):
    if i%2000 == 0:
        print(i)
    # Mean of dist_matrix[i] over the indices listed in sdl[i]
    avg_dist[i] = sum(dist_matrix[i][closest] for closest in sample)/len(sample)

# Indices of the samples ordered by ascending average distance
ranking = np.argsort(avg_dist)


0
2000
4000
6000
8000
10000
12000
14000
16000
18000
20000
22000
24000
26000


In [34]:
print(avg_dist[ranking[-1]])
print(len(sdl[ranking[-1]]))


17.79138845407426
2049


# SDL Embedding Code
Calculates the pairwise distance between SDL Objects. This code should be placed in the SDL_Data_Mining notebook and used after the SDL Objects have been created and the descriptions have been data mined.

In [None]:
# Collect the unique actor descriptions, actions, and scene elements, in order
# of first appearance

actors = []
actions = []
scenes = []

for sdl in sdlObjectList:
    # Time steps are keyed by the strings '1' .. str(len(sdl.actors))
    for timestep in range(1,len(sdl.actors)+1):
        key = str(timestep)
        for obj in sdl.actors[key]:
            if obj.description not in actors:
                actors.append(obj.description)
            if obj.action not in actions:
                actions.append(obj.action)
        for element in sdl.scene[key]:
            if element not in scenes:
                scenes.append(element)

# Map every unique value to its position in the corresponding list
actor_encoding = {name: position for position,name in enumerate(actors)}
scene_encoding = {name: position for position,name in enumerate(scenes)}
action_encoding = {name: position for position,name in enumerate(actions)}


In [None]:
'''
- Each sdl object has a 7x21x22 one hot encoding representing its action, actor and scene element
- These one hot encodings are stacked on top of each other to produce a 4D tensor [examples x actor x action x scene]
- Last element of each dimension represents an NaN value (or empty string for action)

For each SDL object in sdlObjectList, 15 (7 x 21 x 22) numpy arrays are generated to represent the 15 time segments in each object

'''
# Fixed lookup tables from actor / action / scene label to its index along the
# corresponding axis of the (7, 21, 22) one-hot array. Assigning them here
# replaces any actor_encoding / action_encoding / scene_encoding built earlier.
actor_encoding = {'light vehicle': 0, 'heavy vehicle': 1, 'cyclist': 2, 'pedestrian': 3, 'traffic': 4, 'ego': 5, 'NaN': 6}

# NOTE(review): index 2 is not assigned to any action ('turn left' is 1 and
# 'turn right' is 3) -- confirm whether the gap is intentional.
action_encoding = {'turn': 0, 'turn left': 1, 'turn right': 3, 'merge': 4, 'accelerate': 5, 'brake': 6, 'stop': 7, 
                   'forward': 8, 'walk': 9, 'park': 10, 'drive': 11, 'reverse': 12, 'merge center': 13, 'merge left': 14, 
                   'merge right': 15, 'turn through': 16, 'merge u turn': 17, 'u-turn': 18, 'NaN': 19, '':20}

scene_encoding = {'intersection': 0, 'crosswalk': 1, 'bridge': 2, 'green light': 3, 'stop sign': 4, 'yield sign': 5, 'sign': 6, 
                  'u-turn': 7, 'traffic light': 8, 'traffic signal': 9, 'turn lane': 10, 'crosswalks': 11, 'green traffic light': 12, 
                  'light': 13, 'lights': 14, 'red light': 15, 'red traffic light': 16, 'signs': 17, 'traffic lights': 18, 
                  'yellow light': 19, 'yellow traffic light': 20, 'NaN': 21}

# Build one (7, 21, 22) array per time segment of every SDL object, with a 1.0
# at [actor, action, scene] for every actor/action pair combined with every
# scene element of that segment. The arrays are appended, as tensors, to
# one_hot_sdlEmbedding in iteration order.
one_hot_sdlEmbedding = []

examples = [3, 413] # not referenced anywhere in this cell
for example in range(len(sdlObjectList)): #loops through 6996 sdl objects in sdlObjectList
    # Each sdl object has a 7x21x22 one hot encoding representing its action, actor and scene element
    for a in range(len(sdlObjectList[example].actors)): #loops through 15 time segments
        indices = np.zeros((7,21,22))
        actorsIndex = str(a+1) # segments are keyed '1', '2', ...
        actor_list = []
        action_list = []
        scene_list = []
        for j in range(len(sdlObjectList[example].actors[actorsIndex])):
            actor_list.append(sdlObjectList[example].actors[actorsIndex][j].description)
            action_list.append(sdlObjectList[example].actors[actorsIndex][j].action)

        # scene_list holds a single item: the segment's whole scene entry
        scene_list.append(sdlObjectList[example].scene[actorsIndex])

        actor_indices = []
        action_indices = []
        scene_indices = []

        if(len(actor_list) != len(action_list)):
            print("Actor and action list don't match up, this may cause 1 to 1 actor to action correspondence errors")
            break # skips the remaining segments of this SDL object

        for a_index in actor_list:
            actor_indices.append(actor_encoding[a_index])

        for act_index in action_list:
            action_indices.append(action_encoding[act_index])

        # NOTE(review): scene_list[0] is the scene entry itself; this test only
        # excludes it when that entry equals the string 'NaN' -- confirm that
        # this is how missing scenes are represented.
        if( ((len(scene_list)) > 0) and scene_list[0] != 'NaN'):
            for i in scene_list:
                for j in i:
                    scene_indices.append(scene_encoding[j])
        else:
            scene_indices.clear()
            scene_indices.append(21) # 21 is the 'NaN' slot of the scene axis

        if(len(actor_indices) != len(action_indices)):
            print("make sure each actor is matched up with an action")
            break # skips the remaining segments of this SDL object

        if(len(scene_indices) > 0):
            for scene_index in scene_indices:
                for i, actor_index in enumerate(actor_indices):
                    # since there is a one to one mapping between actor and actions, we can use the same index
                    action_index = action_indices[i]
                    indices[actor_index][action_index][scene_index] = 1.0
        else:
            # The scene entry was empty: fall back to the 'NaN' scene slot
            scene_index = 21
            for i, actor_index in enumerate(actor_indices):
                action_index = action_indices[i]
                indices[actor_index][action_index][scene_index] = 1.0

        one_hot_sdlEmbedding.append(tf.convert_to_tensor(indices))
        

In [None]:
def dist(e1,e2):
    """Return the Euclidean distance between the embeddings e1 and e2."""
    difference = e1-e2
    return np.linalg.norm(difference)

# Symmetric matrix of pairwise distances between the one-hot SDL embeddings;
# the diagonal keeps the 0.0 it gets from np.zeros
dist_matrix = np.zeros((matrix_size,matrix_size))

for row in range(matrix_size):
    if row%500 == 0:
        print(row)
    for col in range(row+1,matrix_size):
        pair_distance = dist(one_hot_sdlEmbedding[row],one_hot_sdlEmbedding[col])
        dist_matrix[row,col] = pair_distance
        dist_matrix[col,row] = pair_distance


In [None]:
save_to_file(dist_matrix,"sdl_dist")