In [1]:
import pickle
from scipy.spatial.distance import cosine
from wordembeddings.transform_bert import dump_word_vectors

sasho_word_vectors_location = 'vectors_for_cosine_distance/sasho_bert_vectors_for_cosine.pickle'
utah_word_vectors_location = 'vectors_for_cosine_distance/utah_bert_vectors_for_cosine.pickle'

dump_word_vectors(templates_location='../data/openstack/sasho/parsed/logs_aggregated_full.csv_templates',
                  word_embeddings_location=sasho_word_vectors_location,
                  bert_model='finetuning-models')
dump_word_vectors(templates_location='../data/openstack/utah/parsed/openstack_18k_plus_52k_merged_templates',
                  word_embeddings_location=utah_word_vectors_location,
                  bert_model='finetuning-models')

sasho = pickle.load(open(sasho_word_vectors_location, 'rb'))
utah = pickle.load(open(utah_word_vectors_location, 'rb'))
keys_sasho = [key[0] for key in sasho]
keys_utah = [key[0] for key in utah]
intersection = list(set(keys_sasho) & set(keys_utah))

In [2]:
sasho_vectors_intersecting = []
utah_vectors_intersecting = []
utah = dict(utah)
sasho = dict(sasho)
print(utah["[CLS]"])

for intersecting_word in intersection:
    sasho_vectors_intersecting.append(tuple((intersecting_word, sasho[intersecting_word])))
    utah_vectors_intersecting.append(tuple((intersecting_word, utah[intersecting_word])))

cosine_distances_with_scipy = []
for sa, ut in zip (sasho_vectors_intersecting, utah_vectors_intersecting):
    assert sa[0] == ut[0], "strings from intersection should be the same, something is wrong!"
    cosine_distances_with_scipy.append(tuple((sa[0], cosine(sa[1], ut[1]))))

[-0.43093973  0.02270981  0.18791805 ... -0.5942187  -0.11230399
  0.5877969 ]


In [4]:
cos_scipy_sorted = sorted(cosine_distances_with_scipy, key=lambda tup: tup[1])
for i, val in enumerate(cos_scipy_sorted):
    # val[0] word, val[1] distance
    if i == 0: print("Top 10") 
    if i == len(cos_scipy_sorted)-10: print("\nBottom 10")
    if 0 < i < 10 or len(cos_scipy_sorted)-10 < i < len(cos_scipy_sorted):
        print("{} {}".format(val[0], val[1]))
# pickle.dump(cos_scipy_sorted, open("cos_distances_for_finetune.pickle", 'wb'))

Top 10
life 0.0
terminating 0.0
##cycle 0.0
remove 0.0009971261024475098
young 0.001008152961730957
too 0.0010106563568115234
status 0.012241601943969727
nova 0.013717353343963623
##eti 0.01682746410369873

Bottom 10
on 0.4524269700050354
instance 0.45709383487701416
the 0.45835477113723755
" 0.4652404189109802
##d 0.49717140197753906
##m 0.5103933215141296
for 0.5324618518352509
t 0.6007658839225769
' 0.9519659355282784


In [5]:
sum_dist = 0
for v in cos_scipy_sorted:
    sum_dist += v[1]
print("avg distance: {}".format(sum_dist / len(cos_scipy_sorted)))


avg distance: 0.22348313039140916


In [6]:
cos_scipy_sorted_finetune = pickle.load(open("cos_distances_for_finetune.pickle", 'rb'))
cos_scipy_sorted_finetune = dict(cos_scipy_sorted_finetune)
assert type(cos_scipy_sorted_finetune) == dict


differences_list = []
number_of_increases = 0
number_of_decreases = 0
overall_difference = 0
no_changes = 0
for word, no_finetune_distance in cos_scipy_sorted:
    finetune_distance = cos_scipy_sorted_finetune.get(word)
    difference = finetune_distance - no_finetune_distance
    differences_list.append(tuple((word, difference)))
    overall_difference += difference
    if difference < 0:
        number_of_decreases += 1
    elif difference > 0:
        number_of_increases += 1
    else:
       no_changes += 1 

print("Pairwise cosine difference between finetune and non-finetune")
print("{} positive values-> distance increased with finetuning".format(number_of_increases))
print("{} negative values-> distance decreased with finetuning".format(number_of_decreases))
print("{} no changes".format(no_changes))
print("sum of all differences (+ improvement - degradation): {}\n---------------------".format(overall_difference))

      
differences_list_sorted = sorted(differences_list, key=lambda tup: tup[1])

for word, val in differences_list_sorted:
    print("{} {}".format(word, val))


Pairwise cosine difference between finetune and non-finetune
51 positive values-> distance increased with finetuning
79 negative values-> distance decreased with finetuning
4 no changes
sum of all differences (+ improvement - degradation): -0.5508349500596523
---------------------
. -0.28494852781295776
instance -0.2547459006309509
for -0.24548062682151794
while -0.19624817371368408
- -0.1563989520072937
< -0.15595674514770508
found -0.14557427167892456
> -0.14138448238372803
sync -0.12947797775268555
_ -0.12786436080932617
or -0.12340420484542847
* -0.11791259050369263
power -0.10754024982452393
/ -0.10707813501358032
the -0.09971559047698975
file -0.09708404541015625
claim -0.08427786827087402
swap -0.08217859268188477
during -0.07628178596496582
[CLS] -0.057077884674072266
server -0.052199602127075195
failed -0.04528355598449707
successfully -0.045195698738098145
host -0.04375791549682617
has -0.041618406772613525
c -0.03799140453338623
network -0.03641319274902344
disk -0.036107420

In [None]:
import plotly.graph_objects as go

fig = go.Figure(data=go.Heatmap(
                    z=cosine_distances))
fig.show()

