In [1]:
import numpy as np
from scipy.sparse import csr_matrix, find

def generalized_jaccard_similarity_sparse(S, T):
    """Calculates the generalized Jaccard similarity between two sparse weighted sets.

    The similarity is ``sum(min(S_ij, T_ij)) / sum(max(S_ij, T_ij))`` taken over
    every position of the two matrices; positions that are not stored count as
    zero. The formula is meant for non-negative weights.

    Args:
        S: A scipy.sparse.csr_matrix representing the first weighted set.
        T: A scipy.sparse.csr_matrix representing the second weighted set.
            Must have the same shape as S.

    Returns:
        The generalized Jaccard similarity between S and T, or 0.0 when the
        sum of element-wise maxima is zero (e.g. both sets are empty).

    Raises:
        ValueError: If S and T do not have the same shape.
    """
    # Normalise the inputs to CSR so the element-wise sparse operations below
    # are available even if a caller passes another sparse format.
    S = csr_matrix(S)
    T = csr_matrix(T)

    if S.shape != T.shape:
        raise ValueError(
            f"S and T must have the same shape, got {S.shape} and {T.shape}"
        )

    # Element-wise min/max keep the result sparse and align entries by their
    # (row, col) position. Building a matrix from concatenated coordinates
    # would instead *sum* duplicate positions, which is not the maximum.
    min_sum = S.minimum(T).sum()
    max_sum = S.maximum(T).sum()

    # Guard against division by zero when neither set carries any weight.
    if max_sum == 0:
        return 0.0
    return min_sum / max_sum


In [8]:
from scipy import sparse

# Two example weighted sets, each stored as a single 1x3 sparse row.
# S has no weight in the middle position; T has weight in all three.
S = sparse.csr_matrix([[1.5, 0.0, 2.0]])
T = sparse.csr_matrix([[1.2, 1.0, 1.0]])


In [9]:
from sklearn.feature_extraction.text import TfidfTransformer


In [10]:
# Display the repr of S (shape, dtype, stored-element count, storage format).
S

<1x3 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [11]:
# Fit a TF-IDF transformer on S alone and display the transformed sparse matrix.
# NOTE(review): the result is not assigned to a name, so it is only displayed.
TfidfTransformer(use_idf=True, smooth_idf=True).fit_transform(S)

<1x3 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [15]:
# Densify S to inspect its values, including the position that is not stored.
S.todense()

matrix([[1.5, 0. , 2. ]])

In [16]:
# Densify T to inspect its values.
T.todense()

matrix([[1.2, 1. , 1. ]])

In [29]:
# Stack S and T as the two rows of one sparse matrix so that column-wise
# reductions compare the two weighted sets position by position.
X = sparse.vstack([S, T])

In [30]:
# Generalized Jaccard similarity of the two rows of X:
# sum of column-wise minima divided by sum of column-wise maxima.
X.min(axis=0).sum() / X.max(axis=0).sum()

0.48888888888888893

In [27]:
# Check the concrete class of S.
type(S)

scipy.sparse._csr.csr_matrix

In [28]:
# Check the dimensions of S (it was built from a flat list of three values).
S.shape

(1, 3)

In [31]:
x = np.arange(3)
y = np.arange(3)
# sparse.vstack needs 2-D sparse blocks; passing the 1-D arrays directly
# raises "ValueError: blocks must be 2-D". Wrap each array as a 1xN sparse
# row first, then stack the rows into a 2x3 matrix.
s = sparse.vstack([sparse.csr_matrix(x), sparse.csr_matrix(y)])