In [1]:
from linguistics.similarity import Sentence

In [2]:
# Two Sentence objects built from differently formatted name strings:
# the word order differs and the second string contains a comma.
s1 = Sentence('John Joseph Nicholson')
s2 = Sentence('Nicholson, Jack')

In [3]:
# The words that s1 was split into.
s1.words

[John, Joseph, Nicholson]

In [4]:
# The words of s2; the comma from the input string does not show up in the word list.
s2.words

[Nicholson, Jack]

In [5]:
# Sentence-level similarity of s1 and s2 using the default arguments.
s1.get_similarity(s2)

0.29320987654320996

In [6]:
# Word-level similarity of s1.words[1] (Joseph) and s2.words[0] (Nicholson), with case_sensitivity=1.
s1.words[1].get_similarity(s2.words[0], case_sensitivity=1)

0.35185185185185186

In [7]:
# Same word pair with case_sensitivity=0; for this pair the score is unchanged.
s1.words[1].get_similarity(s2.words[0], case_sensitivity=0)

0.35185185185185186

In [8]:
def get_similarity_matrix(s1, s2, case_sensitivity=1, first_char_weight=0):
    """Build a square matrix of pairwise word similarities for two sentences.

    Both arguments are passed through ``Sentence(...)`` before use. The matrix
    has ``max(s1.length, s2.length)`` rows and columns, so the shorter sentence
    is padded: any cell whose row index has no word in the first sentence, or
    whose column index has no word in the second, gets a similarity of 0.

    Args:
        s1: First sentence (anything ``Sentence(...)`` accepts).
        s2: Second sentence (anything ``Sentence(...)`` accepts).
        case_sensitivity: Forwarded unchanged to the word-level
            ``get_similarity`` call.
        first_char_weight: Forwarded unchanged to the word-level
            ``get_similarity`` call.

    Returns:
        A list of rows; the cell at ``[i1][i2]`` is the tuple
        ``(i1, i2, similarity)``.

    Raises:
        Whatever the word-level ``get_similarity`` raises. Only the padding
        case (index beyond the end of a sentence) is mapped to 0.
    """
    s1 = Sentence(s1)
    s2 = Sentence(s2)

    words_1, words_2 = s1.words, s2.words
    size = max(s1.length, s2.length)

    def get_similarity_by_index(i1, i2):
        # Explicit bounds check instead of a bare ``except``: padding cells
        # score 0, while genuine errors from get_similarity (and
        # KeyboardInterrupt) are no longer silently swallowed.
        if i1 >= len(words_1) or i2 >= len(words_2):
            return 0
        return words_1[i1].get_similarity(
            words_2[i2],
            case_sensitivity=case_sensitivity,
            first_char_weight=first_char_weight,
        )

    return [
        [(i1, i2, get_similarity_by_index(i1, i2)) for i2 in range(size)]
        for i1 in range(size)
    ]

In [9]:
def get_similarity_order(s1, s2, case_sensitivity=1, first_char_weight=0):
    """Greedily pair the words of two sentences, most similar pair first.

    All cells of the matrix from ``get_similarity_matrix`` are visited in
    descending order of similarity (ties keep row-major order because the
    sort is stable). A cell is accepted only if neither its row nor its
    column has been used by an earlier, better pair, so each index appears
    in at most one result entry. Indices beyond the end of the shorter
    sentence are reported as ``None`` for both the word and the index.

    Args:
        s1: First sentence; a ``Sentence`` or anything ``Sentence(...)``
            accepts.
        s2: Second sentence; a ``Sentence`` or anything ``Sentence(...)``
            accepts.
        case_sensitivity: Forwarded to ``get_similarity_matrix``.
        first_char_weight: Forwarded to ``get_similarity_matrix``.

    Returns:
        A list of dicts with keys ``'word_1'``, ``'word_2'``,
        ``'similarity'``, ``'index_1'`` and ``'index_2'``, ordered from the
        best pair to the worst.
    """
    # get_similarity_matrix coerces its inputs with Sentence(...), but this
    # function also reads .length/.words directly. Coerce here as well so a
    # plain string works; Sentence instances are used as-is.
    if not isinstance(s1, Sentence):
        s1 = Sentence(s1)
    if not isinstance(s2, Sentence):
        s2 = Sentence(s2)

    similarity_matrix = get_similarity_matrix(
        s1, s2, case_sensitivity=case_sensitivity, first_char_weight=first_char_weight
    )

    # Flatten row by row, then order by descending similarity.
    candidates = sorted(
        (cell for row in similarity_matrix for cell in row),
        key=lambda cell: -cell[2],
    )

    # Rows/columns already claimed by a better pair; this replaces
    # overwriting whole rows and columns of the matrix with None.
    used_rows, used_cols = set(), set()

    result = []
    for i1, i2, similarity in candidates:
        if i1 in used_rows or i2 in used_cols:
            continue
        used_rows.add(i1)
        used_cols.add(i2)

        # The matrix is padded to a square, so an index may have no word
        # behind it in the shorter sentence.
        in_first = i1 < s1.length
        in_second = i2 < s2.length

        result.append({
            'word_1': s1.words[i1] if in_first else None,
            'word_2': s2.words[i2] if in_second else None,
            'similarity': similarity,
            'index_1': i1 if in_first else None,
            'index_2': i2 if in_second else None,
        })

    return result

In [10]:
# Pair up the words of s1 and s2 with the default settings.
get_similarity_order(s1, s2)

[{'word_1': Nicholson,
  'word_2': Nicholson,
  'similarity': 1.0,
  'index_1': 2,
  'index_2': 0},
 {'word_1': John,
  'word_2': Jack,
  'similarity': 0.5,
  'index_1': 0,
  'index_2': 1},
 {'word_1': Joseph,
  'word_2': None,
  'similarity': 0,
  'index_1': 1,
  'index_2': None}]

In [11]:
# Same pairing with case_sensitivity=0; for these inputs the result matches the default call.
get_similarity_order(s1, s2, case_sensitivity=0)

[{'word_1': Nicholson,
  'word_2': Nicholson,
  'similarity': 1.0,
  'index_1': 2,
  'index_2': 0},
 {'word_1': John,
  'word_2': Jack,
  'similarity': 0.5,
  'index_1': 0,
  'index_2': 1},
 {'word_1': Joseph,
  'word_2': None,
  'similarity': 0,
  'index_1': 1,
  'index_2': None}]

In [12]:
# With first_char_weight=1 the John/Jack pair scores 0.75 instead of 0.5; the other pairs are unchanged.
get_similarity_order(s1, s2, first_char_weight=1, case_sensitivity=0)

[{'word_1': Nicholson,
  'word_2': Nicholson,
  'similarity': 1.0,
  'index_1': 2,
  'index_2': 0},
 {'word_1': John,
  'word_2': Jack,
  'similarity': 0.75,
  'index_1': 0,
  'index_2': 1},
 {'word_1': Joseph,
  'word_2': None,
  'similarity': 0,
  'index_1': 1,
  'index_2': None}]

In [21]:
# Sentence.get_similar_pairs yields the same pairing as get_similarity_order(s1, s2) for these inputs.
display(s1.get_similar_pairs(s2))

[{'word_1': Nicholson,
  'word_2': Nicholson,
  'similarity': 1.0,
  'index_1': 2,
  'index_2': 0},
 {'word_1': John,
  'word_2': Jack,
  'similarity': 0.5,
  'index_1': 0,
  'index_2': 1},
 {'word_1': Joseph,
  'word_2': None,
  'similarity': 0,
  'index_1': 1,
  'index_2': None}]

In [14]:
# Sentence-level score from get_unordered_similarity with default arguments.
s1.get_unordered_similarity(s2)

0.5

In [15]:
# Same call with case_sensitivity=0; the score is unchanged for these inputs.
s1.get_unordered_similarity(s2, case_sensitivity=0)

0.5

In [16]:
# Explicit weights=[1, 1].
# NOTE(review): the result equals the plain average of the two best pair scores (1.0 and 0.5),
# so weights presumably apply to pairs in best-match order; confirm against the Sentence implementation.
s1.get_unordered_similarity(s2, case_sensitivity=0, weights=[1, 1])

0.75

In [17]:
# weights=[2, 1] gives a higher score than weights=[1, 1].
# NOTE(review): consistent with the best pair being weighted twice as much; confirm.
s1.get_unordered_similarity(s2, case_sensitivity=0, weights=[2, 1])

0.8333333333333334

In [18]:
# A single weight yields 1.0 here.
# NOTE(review): presumably only the best pair (Nicholson/Nicholson) is counted; confirm.
s1.get_unordered_similarity(s2, case_sensitivity=0, weights=[1])

1.0

In [19]:
# Adding first_char_weight=1 to the weights=[1, 1] call raises the score from 0.75 to 0.875.
s1.get_unordered_similarity(s2, case_sensitivity=0, first_char_weight=1, weights=[1, 1])

0.875