In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from tabulate import tabulate

Usually, when evaluation a information retrieval model, the basic concepts are the documents $d$ and a query $q$. The documents are the static elements, which express ideas about some topic in natural language. On the other hand, the queries represents a variable information need for documents pertaining to some topic. A document is represented by a set of index terms $k_i$. 

For our particular case, the <u>topics</u> represent conceptually the documents, since they are permanent along the model, and the <u>text snippets</u> are the queries, since we pretend to match to which profile is the query related.

In [2]:
# Defining topics.
# Each topic is treated conceptually as a document.
topic_music    = ["music", "sound", "song", "songs", "Taylor", "Swift", "Justin", "Bieber", "Mozart", "pop", "stars", "singing"]
topic_film     = ["movie", "movies", "film", "Tarantino", "Pulp", "Fiction", "actor", "director", "economics"]
topic_sports   = ["football", "star", "soccer", "goal", "messi", "jogging", "swimming", "fitness"]
topic_cars     = ["Ferrari", "Lamborghini", "car", "speed", "high", "enzo", "aventador", "driving"]
topic_politics = ["politics", "economics", "trump", "stock", "market" ]

# Dictionary of topics
topics_values = {topic_music, topic_film, topic_sports, 
          "topic_cars": topic_cars,
          "topic_politics": topic_politics}
topics_names = ["topic_music", "topic_film", "topic_sports", "topic_cars", "topic_politics"]

We match the topics to some fictitious users:  

In [13]:
# Defining the users
user1  = ["topic_cars", "topic_sports", "topic_film"]
user2  = ["topic_music", "topic_politics"]
user3  = ["topic_film"]
users = {"user1": user1,
         "user2": user2,
         "user3": user3}

In [4]:
# Loading in text snippets that need to be classified.
# Every text snippet only has 1 topic per definition.
# Each text snippet is a query
text_snippets = ["Jogging is one of the best sports, but I love football", 
                 "Politics news are strongly affecting economics.",
                 "Tarantino is a bad actor, but a good director.", 
                 "Ferrari Enzo is faster than Lamborghini Aventador.", 
                 "Ferrari builds the best high speed cars.",
                 "The football star Messi is driving a car from Lamborghini.",
                 "Pulp fiction is one of the best movies ever made.",
                 "Taylor Swift and Justin Bieber are pop stars.",
                 "Mozart is much better than Justin Bieber.",
                 "Many people do not like to hear Taylor Swift singing in spite of she might be married with Tarantino."]

In [5]:
tf = TfidfVectorizer(analyzer='word', 
                     strip_accents='unicode', # Remove accents during the preprocessing step
                     stop_words = 'english', 
                     lowercase=True, # Convert all characters to lowercase before tokenizing
                     use_idf=False, # Enable inverse-document-frequency reweightening
                     sublinear_tf = True,
                     norm='l2')

tfidf = TfidfVectorizer(analyzer='word', 
                     strip_accents='unicode', # Remove accents during the preprocessing step
                     stop_words = 'english', 
                     lowercase=True, # Convert all characters to lowercase before tokenizing
                     use_idf=True, # Enable inverse-document-frequency reweightening
                     smooth_idf = True, # Smooth idf weights by adding one to document frequencies. Prevents zero divisions.
                     sublinear_tf = True,
                     norm='l2')

In [6]:
# Represent each document as a weighted tf-idf vector
topics_text = list(map(lambda x: ",".join(x), topics.values()))
tf_matrix = tf.fit_transform(topics_text)
print(tf.vocabulary_)

{'car': 3, 'film': 10, 'taylor': 39, 'stock': 35, 'market': 18, 'jogging': 15, 'driving': 5, 'swimming': 37, 'mozart': 22, 'movies': 21, 'economics': 6, 'goal': 13, 'justin': 16, 'stars': 34, 'tarantino': 38, 'music': 23, 'songs': 30, 'trump': 40, 'football': 12, 'song': 29, 'politics': 24, 'fitness': 11, 'pop': 25, 'sound': 31, 'speed': 32, 'ferrari': 8, 'swift': 36, 'bieber': 2, 'high': 14, 'enzo': 7, 'aventador': 1, 'singing': 27, 'actor': 0, 'fiction': 9, 'director': 4, 'messi': 19, 'lamborghini': 17, 'star': 33, 'soccer': 28, 'pulp': 26, 'movie': 20}


In [7]:
# Represent the query as a weighted tf-idf vector
tfidf_matrix = tfidf.fit_transform(topics_text)
tfidf_query = tfidf.transform(text_snippets)

# Compute the cosine similarity score for the query vector and each document vector.
# Note: Cosine for length-normalized vectors is simply the dot product (or scalar product).
cosine_similarity = (tf_matrix * tfidf_query.T).A

Let us visualize the cosine similarity matrix. It represents, for each of the documents (columns), what are the most similar topics (rows):

In [8]:
headers = ['text1', 'text2', 'text3', 'text4', 'text5', 'text6', 'text7', 'text8', 'text9', 'text10']
body = np.append(np.array([["music"],["films"],["sports"],["cars"],["politics"]])
                 , cosine_similarity, axis=1)
print(tabulate(body, headers=headers, tablefmt='pipe', floatfmt=".2f"))

|          |   text1 |   text2 |   text3 |   text4 |   text5 |   text6 |   text7 |   text8 |   text9 |   text10 |
|:---------|--------:|--------:|--------:|--------:|--------:|--------:|--------:|--------:|--------:|---------:|
| music    |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.71 |    0.50 |     0.43 |
| films    |    0.00 |    0.21 |    0.58 |    0.00 |    0.00 |    0.00 |    0.58 |    0.00 |    0.00 |     0.17 |
| sports   |    0.00 |    0.63 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |    0.00 |     0.00 |
| cars     |    0.00 |    0.00 |    0.00 |    0.71 |    0.61 |    0.43 |    0.00 |    0.00 |    0.00 |     0.00 |
| politics |    0.50 |    0.00 |    0.00 |    0.00 |    0.00 |    0.43 |    0.00 |    0.00 |    0.00 |     0.00 |


In [9]:
# Categorization of the text_snippets
# Rank documents with respect to the query by score (the higher, the better)
# Return best ones.
categories = np.argmax(cosine_similarity, axis=0)

In [10]:
# Assign text_snippets to users
user1_result = []
user2_result = []
user3_result = []

for text_id, category_id in enumerate(categories):
    category = topics_names[category_id]
    if category in user1:
        user1_result.append(text_id)
    if category in user2:
        user2_result.append(text_id)
    if category in user3:
        user3_result.append(text_id)

In [15]:
def print_text_snippets(user, name):
    print(name + ": " + str(users[name]))
    print(list (map(lambda x: text_snippets[x], user)))
    print()
    
print_text_snippets(user1_result, "user1")
print_text_snippets(user2_result, "user2")
print_text_snippets(user3_result, "user3")

user1: ['topic_cars', 'topic_sports', 'topic_film']
['Politics news are strongly affecting economics.', 'Tarantino is a bad actor, but a good director.', 'Ferrari Enzo is faster than Lamborghini Aventador.', 'Ferrari builds the best high speed cars.', 'The football star Messi is driving a car from Lamborghini.', 'Pulp fiction is one of the best movies ever made.']

user2: ['topic_music', 'topic_politics']
['Jogging is one of the best sports, but I love football', 'Taylor Swift and Justin Bieber are pop stars.', 'Mozart is much better than Justin Bieber.', 'Many people do not like to hear Taylor Swift singing in spite of she might be married with Tarantino.']

user3: ['topic_film']
['Tarantino is a bad actor, but a good director.', 'Pulp fiction is one of the best movies ever made.']

