In [1]:
import re
import numpy as np

In [2]:
from typing import List, Tuple

In [3]:
# Precompiled pattern that matches one run of one or more word characters (\w+).
WORD = re.compile(r'\w+')

In [4]:
def regTokenize(text):
    """Split ``text`` into tokens using the module-level ``WORD`` pattern.

    Returns the list of all non-overlapping matches of ``WORD`` in ``text``,
    in the order they occur. Duplicates are kept.
    """
    return WORD.findall(text)

In [5]:
def sim(text1: str, text2: str) -> float:
    """Return the Sørensen–Dice similarity of the token sets of two texts.

    Each text is tokenized with ``regTokenize`` and reduced to its set of
    distinct tokens. The score is ``2 * |A ∩ B| / (|A| + |B|)``, which lies
    in ``[0.0, 1.0]``.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The similarity score. When neither text produces any token the ratio
        is undefined; ``0.0`` is returned in that case instead of raising
        ``ZeroDivisionError``.
    """
    tokens1 = set(regTokenize(text1))
    tokens2 = set(regTokenize(text2))
    total = len(tokens1) + len(tokens2)
    if total == 0:
        # Both token sets are empty: avoid dividing by zero.
        return 0.0
    return 2 * len(tokens1 & tokens2) / total

In [6]:
# Sample paragraphs used below as input to the similarity functions.
mylist = [
    "Apple’s M-series processors are a hit. They deliver spectacular performance and power efficiency—so much so that they instantly caused a huge shakeup in Intel and AMD processors that suddenly found themselves looking slow, hot, and power-hungry.",
    "The inaugural M1 chip revitalized the MacBook and brought us the first exciting desktop Macs in years, including a redesigned iMac and brand-new Mac Studio. But since then? The MacBooks are still keeping pace, but desktop Macs are being left behind.",
    "Let’s start with the iMac. Updated to a new design and a single, 24-inch size just about two years ago, it’s got an M1 chip. It’s been 10 months since the M2 was released–why does the 24-inch iMac still come with an M1? Why does it still cost $1,299, just as it did two years ago?",
    "Apple should have updated all M1-bearing products simultaneously when the M2 came out. There’s little excuse to sell products that haven’t changed for two years without so much as a price adjustment. (In fact, if you live outside the U.S. the price of iMacs has actually gone up!)",
    "The Mac mini took a little longer to get its M2 update because Apple was waiting for the M2 Pro to be ready, offering that as an option. The M2 Pro and M2 Max landed early this year in the MacBook, and Apple dutifully updated the old Mac mini to M2 and M2 Pro.",
]

In [7]:
# Quick check of the sample data: container type, type of the first element, number of texts.
type(mylist), type(mylist[0]), len(mylist)

(list, str, 5)

In [8]:
def get_sim_matrix(list_of_texts:List[str]) -> np.ndarray:
    """Build the symmetric matrix of pairwise ``sim`` scores for the texts.

    Args:
        list_of_texts: Texts to compare; entry ``[i, j]`` of the result
            relates ``list_of_texts[i]`` to ``list_of_texts[j]``.

    Returns:
        A float array of shape ``(n, n)`` with ``n = len(list_of_texts)``.
        The diagonal is fixed at 1.0, and each off-diagonal pair is scored
        once with ``sim`` and mirrored across the diagonal.
    """
    size = len(list_of_texts)
    # Identity matrix: zeros everywhere, ones on the diagonal.
    matrix = np.eye(size)
    # Index pairs strictly above the diagonal, in row-major order.
    upper_rows, upper_cols = np.triu_indices(size, k=1)
    for row, col in zip(upper_rows.tolist(), upper_cols.tolist()):
        score = sim(list_of_texts[row], list_of_texts[col])
        matrix[row, col] = score
        matrix[col, row] = score
    return matrix

In [9]:
# Pairwise similarity matrix for the sample texts; the bare name displays it.
X = get_sim_matrix(mylist)
X

array([[1.        , 0.11594203, 0.07792208, 0.14814815, 0.14285714],
       [0.11594203, 1.        , 0.25      , 0.0952381 , 0.19178082],
       [0.07792208, 0.25      , 1.        , 0.19565217, 0.19753086],
       [0.14814815, 0.0952381 , 0.19565217, 1.        , 0.23529412],
       [0.14285714, 0.19178082, 0.19753086, 0.23529412, 1.        ]])

In [10]:
def get_k_smallest(X: np.ndarray, k: int) -> List[Tuple[int, int, float]]:
    """Return the ``k`` lowest-scoring pairs from the upper triangle of ``X``.

    Only entries strictly above the main diagonal are considered, so each
    unordered pair ``(row, col)`` with ``row < col`` appears at most once and
    self-pairs are never returned. ``X`` is not modified.

    Args:
        X: Two-dimensional score matrix.
        k: Maximum number of pairs to return. Values larger than the number
            of available pairs return all of them; values below zero are
            treated as zero.

    Returns:
        A list of ``(row, col, score)`` tuples sorted by ascending score,
        holding plain Python ``int``/``float`` values. Equal scores keep
        row-major order.
    """
    scores = np.asarray(X)
    # Selecting the strict upper triangle directly avoids masking with
    # infinities (``np.Inf`` no longer exists in NumPy 2.0) and guarantees
    # that masked cells can never leak into the result when k is large.
    rows, cols = np.triu_indices_from(scores, k=1)
    values = scores[rows, cols]
    # A stable sort makes the order of tied scores deterministic.
    order = np.argsort(values, kind="stable")[:max(k, 0)]
    return [(int(rows[i]), int(cols[i]), float(values[i])) for i in order]

In [11]:
def get_k_largest(X: np.ndarray, k: int) -> List[Tuple[int, int, float]]:
    """Return the ``k`` highest-scoring pairs from the upper triangle of ``X``.

    Only entries strictly above the main diagonal are considered, so each
    unordered pair ``(row, col)`` with ``row < col`` appears at most once and
    self-pairs are never returned. ``X`` is not modified.

    Args:
        X: Two-dimensional score matrix.
        k: Maximum number of pairs to return. Values larger than the number
            of available pairs return all of them; values below zero are
            treated as zero.

    Returns:
        A list of ``(row, col, score)`` tuples sorted by descending score,
        holding plain Python ``int``/``float`` values. Equal scores keep
        row-major order.
    """
    scores = np.asarray(X)
    # Selecting the strict upper triangle directly avoids masking with
    # infinities (``np.Inf`` no longer exists in NumPy 2.0) and guarantees
    # that masked cells can never leak into the result when k is large.
    rows, cols = np.triu_indices_from(scores, k=1)
    values = scores[rows, cols]
    # Sort the negated scores ascending to get descending order; a stable
    # sort makes the order of tied scores deterministic.
    order = np.argsort(-values, kind="stable")[:max(k, 0)]
    return [(int(rows[i]), int(cols[i]), float(values[i])) for i in order]

In [12]:
# The three lowest-scoring text pairs, as (row, col, score) tuples.
get_k_smallest(X, 3)

[(0, 2, 0.07792207792207792),
 (1, 3, 0.09523809523809523),
 (0, 1, 0.11594202898550725)]

In [13]:
# The three highest-scoring text pairs. Despite its name, `mytuple` holds a
# list of (row, col, score) tuples.
mytuple = get_k_largest(X, 3)
mytuple

[(1, 2, 0.25), (3, 4, 0.23529411764705882), (2, 4, 0.19753086419753085)]

In [14]:
def find_k_most_similar_texts(list_of_texts:List[str], k:int) -> set[int]:
    """Return the indices of the texts that form the ``k`` most similar pairs.

    The pairwise similarity matrix is built with ``get_sim_matrix`` and the
    ``k`` highest-scoring pairs are taken from ``get_k_largest``. The result
    is the union of the indices that occur in those pairs.

    Args:
        list_of_texts: Texts to compare.
        k: Number of top-scoring pairs to consider.

    Returns:
        A set of positions into ``list_of_texts`` (not the texts themselves),
        as plain Python ints. Because pairs can share a text, the set holds
        at most ``2 * k`` indices.
    """
    similarity = get_sim_matrix(list_of_texts)
    top_pairs = get_k_largest(similarity, k)
    # Flatten both members of every pair; int() strips NumPy scalar types.
    return {int(index) for row, col, _score in top_pairs for index in (row, col)}

In [15]:
# Indices of the sample texts that take part in the two highest-scoring pairs.
find_k_most_similar_texts(mylist, 2)

{1, 2, 3, 4}