In [1]:
# Toy corpus: every string is one document to be indexed.
documents = [
    "Jogging is one of the best sports, but I love football",
    "I love pizza and pasta.",
]

# The free-text query, kept as a one-element list so it can be vectorized
# the same way as the corpus.
query = ["I love sports, concretely football."]

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
# One vectorizer is shared by the corpus and the query so that both end up
# in the same tf-idf feature space.
tf = TfidfVectorizer(
    # --- preprocessing ---
    lowercase=True,           # lowercase all characters before tokenizing
    strip_accents='unicode',  # remove accents during preprocessing
    # --- tokenization ---
    analyzer='word',          # features are words (not character n-grams)
    stop_words='english',     # drop words from the built-in English stop list
    # --- weighting ---
    use_idf=True,             # enable inverse-document-frequency reweighting
    smooth_idf=True,          # add one to document frequencies; prevents zero divisions
    norm='l2',                # scale every output row to unit Euclidean length
)

In [3]:
# Learn the vocabulary and idf weights from the corpus and, in the same step,
# encode each document as one weighted tf-idf row of the resulting matrix.
tfidf_matrix = tf.fit_transform(documents)

# Show the learned mapping from term to column index in the matrix.
term_to_column = tf.vocabulary_
print(term_to_column)

{u'jogging': 2, u'love': 3, u'pizza': 5, u'pasta': 4, u'football': 1, u'best': 0, u'sports': 6}


In [4]:
# Represent the query as a weighted tf-idf vector. Only `transform` is called,
# so the vocabulary and idf weights fitted on the corpus are reused; query
# terms that are not in that vocabulary are ignored.
tfidf_query = tf.transform(query)

# Compute the cosine similarity score for the query vector and each document vector.
# Note: the vectorizer was built with norm='l2', so every row has unit length and
# the cosine is simply the dot product (or scalar product).
#
# `@` is used instead of `*` because `*` is only a matrix product on the legacy
# sparse-matrix classes (it is element-wise on SciPy sparse arrays), and
# `.toarray()` replaces the `.A` shorthand, which was removed in SciPy 1.14.
# The result is a dense array of shape (n_documents, 1): one score per document.
cosine_similarity = (tfidf_matrix @ tfidf_query.T).toarray()
print(cosine_similarity)

[[ 0.74576899]
 [ 0.20199309]]


In [5]:
# TODO: Rank documents with respect to the query by score (the higher, the better)
# TODO: Return best one.