Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Representation: most_similar, closes #45 #147

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 88 additions & 1 deletion texthero/representation.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,14 @@
from sklearn.decomposition import PCA, NMF
from sklearn.cluster import KMeans, DBSCAN, MeanShift
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import normalize as sklearn_normalize
from scipy.sparse import coo_matrix

from typing import Optional, Union, Any
from typing import Optional, Union, Any, List

from texthero import preprocessing
from texthero._types import TextSeries, VectorSeries, RepresentationSeries, InputSeries

import logging
import warnings
Expand Down Expand Up @@ -1019,3 +1021,88 @@ def normalize(s: pd.Series, norm="l2") -> pd.Series:
s_result.index = s.index

return s_result


"""
Most similar
"""


@InputSeries(TextSeries)
def most_similar(
s: TextSeries,
s_represented: Union[VectorSeries, RepresentationSeries],
vector: List[float],
max_number=None,
) -> TextSeries:
"""
Return the most similar vectors in s to the given vector.

To find the most similar documents to a document, first represent
the Pandas Series with the documents, e.g. with
:meth:`hero.representation.tfidf`_ . Then use this function
to find the most similar documents according to the representation.
Similar vectors are returned sorted by similarity descending.

Internally, euclidian distance is used to judge similarity.

Series s can either be a :class:`texthero._types.RepresentationSeries`
or a :class:`texthero._types.VectorSeries`.

Parameters
----------
s : :class:`texthero._types.TextSeries`
The Series in which we want to find similar documents.

s_represented : :class:`texthero._types.RepresentationSeries` or
:class:`texthero._types.VectorSeries`
The Series by which the similarity is calculated.

vector : List[float]
The vector to which we want to find the most similar documents.

max_number: int or None, default 100
Maximum amount of indexes of similar documents to return.
If None, returns all .

Examples
--------
>>> import texthero as hero
>>> import pandas as pd
>>> s = pd.Series(["I like football", "Hey, watch out", "I like sports", "Cool stuff"])
>>> s_pca = s.pipe(hero.tokenize).pipe(hero.tfidf).pipe(hero.flatten).pipe(hero.pca) # TODO: remove flatten when pca is updated w.r.t. Representation Series
>>> # want to find the two most similar to "I like football", which has index 0
>>> s_most_similar = hero.most_similar(s, s_pca, s_pca[0], max_number=2)
>>> s_most_similar
0 I like football
2 I like sports
dtype: object

"""
if _check_is_valid_representation(s_represented):
if pd.api.types.is_sparse(s_represented):
s_represented_coo_matrix = s_represented.sparse.to_coo()[0]
else:
s_represented = s_represented.astype("Sparse")
s_represented_coo_matrix = s_represented.sparse.to_coo()[0]

s_represented_for_vectorization = s_represented_coo_matrix
s_represented_flat_index = s_represented.index.levels[0]

else:
s_represented_for_vectorization = list(s_represented)
s_represented_flat_index = s_represented.index

s_distances = pd.Series(
pairwise_distances(
s_represented_for_vectorization, np.array([vector])
).tolist(),
index=s_represented_flat_index,
)

s_distances = s_distances.sort_values()

if max_number is not None:
return s[s_distances[:max_number].index]
else:
return s[s_distances.index]