/
simple.py
116 lines (91 loc) · 2.91 KB
/
simple.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# built-in
from itertools import takewhile
# app
from .base import Base as _Base, BaseSimilarity as _BaseSimilarity
# Names exported by ``from <module> import *``: the algorithm classes and the
# lowercase module-level instances of them created at the bottom of the file.
__all__ = [
    'Prefix', 'Postfix', 'Length', 'Identity', 'Matrix',
    'prefix', 'postfix', 'length', 'identity', 'matrix',
]
# Text types accepted by the isinstance checks below. ``unicode`` only exists
# on Python 2; on Python 3 the lookup raises NameError and ``str`` stands alone.
string_types = (str, )
try:
    string_types = string_types + (unicode, )  # noqa: F821
except NameError:
    pass
class Prefix(_BaseSimilarity):
    """Prefix similarity.

    Calling an instance returns the leading run of elements shared by all
    given sequences; ``similarity`` returns the length of that run.
    """

    def __init__(self, qval=1, sim_test=None):
        """Store the configuration.

        :param qval: kept on the instance for ``_get_sequences`` (defined on
            the base class) -- presumably the q-gram size; confirm there.
        :param sim_test: callable receiving one element from each sequence
            and returning a truthy value while they still match. Falls back
            to ``self._ident`` when not given.
        """
        self.qval = qval
        self.sim_test = sim_test or self._ident

    def __call__(self, *sequences):
        """Return the common prefix of ``sequences``.

        The result is a text string when the first prepared sequence is a
        text string, ``bytes`` when it is ``bytes``, and a list otherwise.
        With no sequences at all, ``0`` is returned.
        """
        if not sequences:
            return 0
        sequences = self._get_sequences(*sequences)

        def still_matching(column):
            # ``column`` holds one element from every sequence.
            return self.sim_test(*column)

        result = [
            column[0]
            for column in takewhile(still_matching, zip(*sequences))
        ]

        first = sequences[0]
        if isinstance(first, string_types):
            return ''.join(result)
        if isinstance(first, bytes):
            # Iterating ``bytes`` on Python 3 yields ints, which
            # ``b''.join`` rejects; rebuild the bytes object from them.
            if result and not isinstance(result[0], bytes):
                return bytes(bytearray(result))
            return b''.join(result)
        return result

    def similarity(self, *sequences):
        """Return the length of the common prefix (``0`` for no input)."""
        if not sequences:
            # ``__call__`` returns the int 0 here, which has no ``len``.
            return 0
        return len(self(*sequences))
class Postfix(Prefix):
    """Postfix similarity.

    Calling an instance returns the trailing run of elements shared by all
    given sequences, computed as the common prefix of the reversed inputs.
    """

    def __call__(self, *sequences):
        """Return the common suffix of ``sequences``.

        The result is a text string when the first input is a text string,
        ``bytes`` when it is ``bytes``, and a list otherwise. With no
        sequences at all, ``0`` is returned (same as ``Prefix``).
        """
        if not sequences:
            return 0
        first = sequences[0]
        # A distinct loop name keeps ``first`` intact: on Python 2 a list
        # comprehension variable leaks into the enclosing scope.
        backwards = [reversed(seq) for seq in sequences]
        common = super(Postfix, self).__call__(*backwards)
        # The shared run was collected back-to-front; restore its order.
        result = list(reversed(common))
        if isinstance(first, string_types):
            return ''.join(result)
        if isinstance(first, bytes):
            # Iterating ``bytes`` on Python 3 yields ints, which
            # ``b''.join`` rejects; rebuild the bytes object from them.
            if result and not isinstance(result[0], bytes):
                return bytes(bytearray(result))
            return b''.join(result)
        return result
class Length(_Base):
    """Length distance.

    The distance is the difference between the longest and the shortest
    of the given sequences.
    """

    def __call__(self, *sequences):
        """Return ``max(len) - min(len)`` over ``sequences``.

        Returns ``0`` when called without any sequence, instead of letting
        ``max()``/``min()`` raise ValueError on an empty list.
        """
        if not sequences:
            return 0
        lengths = [len(seq) for seq in sequences]
        return max(lengths) - min(lengths)
class Identity(_BaseSimilarity):
    """Identity similarity.

    The score is the result of ``self._ident`` (provided by the base class;
    presumably an all-equal check -- confirm there) converted to ``int``.
    """

    def maximum(self, *sequences):
        """Return the highest possible score, which is always ``1``."""
        return 1

    def __call__(self, *sequences):
        """Return ``int(self._ident(*sequences))``."""
        same = self._ident(*sequences)
        return int(same)
class Matrix(_BaseSimilarity):
    """Matrix similarity.

    Scores a tuple of items by looking it up in a user-supplied mapping,
    falling back to fixed match / mismatch costs when it is not listed.
    """

    def __init__(self, mat=None, mismatch_cost=0, match_cost=1, symmetric=True):
        """Store the configuration.

        :param mat: container keyed by the tuple of call arguments; may be
            ``None`` or empty, in which case only the fixed costs are used.
        :param mismatch_cost: score returned when nothing else applies.
        :param match_cost: score returned when ``self._ident`` is truthy.
        :param symmetric: when true, the reversed argument tuple is also
            tried as a key.
        """
        self.mat = mat
        self.mismatch_cost = mismatch_cost
        self.match_cost = match_cost
        self.symmetric = symmetric

    def maximum(self, *sequences):
        """Return ``match_cost`` as the upper bound of the score."""
        return self.match_cost

    def __call__(self, *sequences):
        """Return the score for ``sequences``.

        Lookup order: the argument tuple in ``mat``, then (if ``symmetric``)
        the reversed tuple in ``mat``, then ``match_cost`` when
        ``self._ident`` is truthy, and finally ``mismatch_cost``.
        """
        key = sequences
        if self.mat:
            # direct hit in the matrix
            if key in self.mat:
                return self.mat[key]
            # mirrored hit; the reversed tuple is kept for the check below
            if self.symmetric:
                key = tuple(reversed(key))
                if key in self.mat:
                    return self.mat[key]
        # not listed: fall back to the fixed costs
        return self.match_cost if self._ident(*key) else self.mismatch_cost
# Module-level instances of each algorithm, built with default arguments.
prefix, postfix = Prefix(), Postfix()
length = Length()
identity, matrix = Identity(), Matrix()