-
Notifications
You must be signed in to change notification settings - Fork 4
/
similarity.py
134 lines (114 loc) · 4.07 KB
/
similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import math
import random
from nltk import word_tokenize
from collections import Counter, defaultdict
# Tokenize sentences with NLTK's word_tokenize to get lists of tokens.
# Represent tf-isf vectors as dictionaries mapping words to weights
# returns index of c in candidates st the tf-isf vector for c is the closest
# to target of all candidates. Use lisf for the inverse sentence freqs
# REQUIRES: candidates is the same list used to calculate lisf!
def closest_sentence(target, candidates, lisf, tiebreak=False):
best_cand_index = 0
best_cosine = 0
best_overlap = 0
# calculate target sentence vector
target_words = word_tokenize(target)
target_tf = tf(target_words)
target_vector = {}
for word in target_tf:
target_vector[word] = target_tf[word] * lisf[word]
# for each candidate:
for cand_index in xrange(len(candidates)):
candidate = candidates[cand_index]
# calculate candidate sentence vector
candidate_words = word_tokenize(candidate)
candidate_tf = tf(candidate_words)
candidate_vector = {}
for word in candidate_tf:
candidate_vector[word] = candidate_tf[word] * lisf[word]
# compare cosine similarity to best so far
cosine_sim = dot(target_vector, candidate_vector) / norm(candidate_vector)
if cosine_sim > best_cosine:
best_cand_index = cand_index
best_cosine = cosine_sim
if tiebreak:
best_overlap = overlap(target_words, candidate_words)
# possible tiebreak with overlap
elif cosine_sim == best_cosine and tiebreak:
candidate_overlap = overlap(target_words, candidate_words)
if candidate_overlap > best_overlap:
best_cand_index = cand_index
# best_cosine is same
best_overlap = candidate_overlap
return best_cand_index
# returns a tuple (sentence, overlap) where sentence is the sentence in
# candidates with the longest contiguous sequence of overlapping words in
# common with target, and overlap is the overlap
def max_overlap_sentence(target, candidates):
best_sentence = None
max_overlap = 0
random.shuffle(candidates) # the final tiebreaker is luck!
# target word list
target_words = word_tokenize(target)
for candidate in candidates:
candidate_words = word_tokenize(candidate)
candidate_overlap = overlap(target_words, candidate_words)
if candidate_overlap > max_overlap:
best_sentence = candidate
max_overlap = candidate_overlap
return (best_sentence, max_overlap)
# TODO: inefficient - use dynamic programming!
# calculates the length of the longest common contiguous sequence of words in
# word lists s1 and s2
def overlap(s1, s2):
length = 0
for i in xrange(len(s1)):
for j in xrange(len(s2)):
prefix = longest_common_prefix(s1, i, s2, j)
if prefix > length:
length = prefix
return length
# calculates the length of the longest common prefix of word lists s1 and s2
def longest_common_prefix(s1, start1, s2, start2):
length = 0
for i in xrange(min(len(s1) - start1, len(s2) - start2)):
if s1[start1 + i] != s2[start2 + i]:
break
length += 1
return length
# calculate the log inverse sentence frequencies of each word in the sentences
# RETURNS: a dictionary mapping words to their inverse sentence frequencies
def calculate_lisf(sentences):
N = len(sentences)
sf = Counter()
# count sentence frequencies of all words
for sentence in sentences:
seen_words = set()
for word in word_tokenize(sentence):
if word not in seen_words:
sf[word] += 1
seen_words.add(word)
lisf = defaultdict(int)
# lisf = log(N / sf)
for word in sf:
lisf[word] = math.log(float(N) / sf[word])
return lisf
# returns a Counter mapping each word to its frequency in word_list
def tf(word_list):
tfs = Counter()
for word in word_list:
tfs[word] += 1
return tfs
# calculates the dot product of vectors v1 and v2
def dot(v1, v2):
total = 0
for dim in v1:
if dim in v2:
total += v1[dim] * v2[dim]
return total
# calculate and return the norm (length) of a vector v
def norm(v):
total = 0
for dim in v:
total += v[dim] ** 2
return math.sqrt(total)