In [1]:
# Text snippets to classify.
# Further down, each snippet is filed under exactly one topic (its best-scoring one).
text_snippets = [
    "Jogging is one of the best sports, but I love football",
    "Politics news are strongly affecting economics.",
    "Tarantino is a bad actor, but a good director.",
    "Ferrari Enzo is faster than Lamborghini Aventador.",
    "Ferrari builds the best high speed cars.",
    "The football star Messi is driving a car from Lamborghini.",
    "Pulp fiction is one of the best movies ever made.",
    "Taylor Swift and Justin Bieber are pop stars.",
    "Mozart is much better than Justin Bieber.",
    "Many people do not like to hear Taylor Swift singing in spite of she is married with Tarantino.",
]

In [2]:
# Keyword list for every topic.
topic_music = [
    "music", "sound", "song", "songs", "Taylor", "swift",
    "Justin", "Bieber", "Mozart", "pop", "stars", "singing",
]
topic_film = [
    "movie", "movies", "film", "tarantino",
    "pulp", "fiction", "actor", "director",
]
topic_sports = [
    "football", "star", "soccer", "goal",
    "messi", "jogging", "swimming", "fitness",
]
topic_cars = [
    "ferrari", "lamborghini", "car", "speed",
    "high", "enzo", "aventador", "driving",
]
topic_politics = ["politics", "economics", "trump", "stock", "market"]

# Human-readable labels; position i here corresponds to position i in `topics`.
topic_name = ["topic_music", "topic_film", "topic_sports", "topic_cars", "topic_politics"]

# One comma-separated keyword string per topic, in the same order as `topic_name`.
topics = [
    ",".join(keywords)
    for keywords in (topic_music, topic_film, topic_sports, topic_cars, topic_politics)
]
print(topics)

['music,sound,song,songs,Taylor,swift,Justin,Bieber,Mozart,pop,stars,singing', 'movie,movies,film,tarantino,pulp,fiction,actor,director', 'football,star,soccer,goal,messi,jogging,swimming,fitness', 'ferrari,lamborghini,car,speed,high,enzo,aventador,driving', 'politics,economics,trump,stock,market']


In [3]:
# Topic subscriptions: each user is described by the topic labels
# (entries of `topic_name`) they want to receive snippets for.
user_javi = [
    "topic_cars",
    "topic_sports",
    "topic_film",
]
user_tolga = [
    "topic_music",
    "topic_politics",
]
user_aitor = [
    "topic_film",
]

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(
    # --- tokenisation / preprocessing ---
    analyzer='word',
    lowercase=True,           # fold everything to lowercase before tokenizing
    strip_accents='unicode',  # drop accents during preprocessing
    stop_words='english',
    # --- term weighting ---
    use_idf=True,             # enable inverse-document-frequency reweighting
    smooth_idf=True,          # add one to document frequencies; prevents zero divisions
    sublinear_tf=True,        # sublinear tf scaling (1 + log(tf) per the scikit-learn docs)
    norm='l2',                # unit-length vectors, so a dot product is a cosine
)

In [5]:
# Fit the vectorizer on the topic keyword strings and represent each topic as a
# weighted tf-idf vector (one row per entry of `topics`).
# The vocabulary is learned from `topics` only, so words that appear in no
# keyword list cannot contribute to any score later on.
tfidf_matrix = tf.fit_transform(topics)
# Show the learned vocabulary (term -> column index in the tf-idf matrix).
print(tf.vocabulary_)

{'music': 23, 'sound': 31, 'song': 29, 'songs': 30, 'taylor': 39, 'swift': 36, 'justin': 16, 'bieber': 2, 'mozart': 22, 'pop': 25, 'stars': 34, 'singing': 27, 'movie': 20, 'movies': 21, 'film': 10, 'tarantino': 38, 'pulp': 26, 'fiction': 9, 'actor': 0, 'director': 4, 'football': 12, 'star': 33, 'soccer': 28, 'goal': 13, 'messi': 19, 'jogging': 15, 'swimming': 37, 'fitness': 11, 'ferrari': 8, 'lamborghini': 17, 'car': 3, 'speed': 32, 'high': 14, 'enzo': 7, 'aventador': 1, 'driving': 5, 'politics': 24, 'economics': 6, 'trump': 40, 'stock': 35, 'market': 18}


In [6]:
# Represent every text snippet as a weighted tf-idf vector, using the vocabulary
# and idf weights that were fitted on the topic keyword lists.
tfidf_query = tf.transform(text_snippets)

# Cosine similarity between every topic (rows) and every text snippet (columns).
# The vectorizer was built with norm='l2', so all vectors have unit length and the
# cosine is simply the dot product.
# - `@` is used for the matrix product because `*` is only a matrix product for the
#   legacy scipy.sparse *matrix* classes (it is element-wise for sparse arrays).
# - `.toarray()` replaces the `.A` shorthand, which was removed from sparse matrices
#   in SciPy 1.14 and raises AttributeError there.
cosine_similarity = (tfidf_matrix @ tfidf_query.T).toarray()
print(cosine_similarity)

[[ 0.          0.          0.          0.          0.          0.          0.
   0.70710678  0.5         0.4330127 ]
 [ 0.          0.          0.61237244  0.          0.          0.
   0.61237244  0.          0.          0.1767767 ]
 [ 0.5         0.          0.          0.          0.          0.4330127
   0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.70710678  0.61237244  0.4330127
   0.          0.          0.          0.        ]
 [ 0.          0.63245553  0.          0.          0.          0.          0.
   0.          0.          0.        ]]


In [7]:
# TODO: Rank documents with respect to the query by score (the higher, the better)
# TODO: Return best one.

# Until the ranking exists, dump the raw score row of every topic
# (one similarity value per text snippet), followed by a blank line.
for topic_idx in range(len(cosine_similarity)):
    print(topic_name[topic_idx], cosine_similarity[topic_idx], "", sep="\n")

topic_music
[ 0.          0.          0.          0.          0.          0.          0.
  0.70710678  0.5         0.4330127 ]

topic_film
[ 0.          0.          0.61237244  0.          0.          0.
  0.61237244  0.          0.          0.1767767 ]

topic_sports
[ 0.5        0.         0.         0.         0.         0.4330127  0.         0.
  0.         0.       ]

topic_cars
[ 0.          0.          0.          0.70710678  0.61237244  0.4330127   0.
  0.          0.          0.        ]

topic_politics
[ 0.          0.63245553  0.          0.          0.          0.          0.
  0.          0.          0.        ]



In [8]:
# Categorise the text snippets: for every snippet (column of the score matrix)
# keep the index of the topic (row) with the highest similarity.
# NOTE: argmax returns the first index on ties, so a snippet that matches no
# keyword at all (an all-zero column) ends up under topic index 0.
import numpy as np
categories = cosine_similarity.argmax(axis=0)

In [9]:
# Hand every text snippet to each user who subscribed to the snippet's topic.
# The result lists hold snippet indices (positions in `text_snippets`).
user_javi_result, user_tolga_result, user_aitor_result = [], [], []

# (subscribed topics, result list) per user, checked in the order Javi, Tolga, Aitor.
subscriptions = (
    (user_javi, user_javi_result),
    (user_tolga, user_tolga_result),
    (user_aitor, user_aitor_result),
)

for text_id, category_id in enumerate(categories):
    category = topic_name[category_id]
    for wanted_topics, inbox in subscriptions:
        if category in wanted_topics:
            inbox.append(text_id)

In [10]:
def print_text_snippets(user, name):
    """Print a user's name followed by the text snippets selected for them.

    Args:
        user: iterable of indices into the module-level ``text_snippets`` list.
        name: label printed on the first line.

    Output is three lines: the name, the list of resolved snippets, and a
    blank separator line.
    """
    selected = [text_snippets[snippet_id] for snippet_id in user]
    print(name)
    print(selected)
    print()
    
# Show the snippets that were selected for each user.
for selected_ids, user_label in (
    (user_javi_result, "Javi"),
    (user_tolga_result, "Tolga"),
    (user_aitor_result, "Aitor"),
):
    print_text_snippets(selected_ids, user_label)

Javi
['Jogging is one of the best sports, but I love football', 'Tarantino is a bad actor, but a good director.', 'Ferrari Enzo is faster than Lamborghini Aventador.', 'Ferrari builds the best high speed cars.', 'The football star Messi is driving a car from Lamborghini.', 'Pulp fiction is one of the best movies ever made.']

Tolga
['Politics news are strongly affecting economics.', 'Taylor Swift and Justin Bieber are pop stars.', 'Mozart is much better than Justin Bieber.', 'Many people do not like to hear Taylor Swift singing in spite of she is married with Tarantino.']

Aitor
['Tarantino is a bad actor, but a good director.', 'Pulp fiction is one of the best movies ever made.']

