In [None]:
import datetime

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Name of the column holding the item identifier; its values key the results lookup.
id_column = "movieId"
# Name of the text column that is fed to the TF-IDF vectorizer.
content_column = "tag"
# Relative path of the CSV file loaded with pandas.read_csv.
document_path = "./ml-20m/tags.csv"

In [None]:
# Read the document
# Load the CSV into a DataFrame and report how long the read took.
t1 = datetime.datetime.now()
print("Start time to read document is: %s" % t1)

# Pass the content column through str while parsing so every value is text
# by the time it reaches the vectorizer.
ds = pd.read_csv(document_path, converters={content_column: str})

t2 = datetime.datetime.now()
print("End time to read document is: %s" % t2)

print("Time taken to read the document is: %s" % (t2 - t1))

# Preview the first rows as the cell output.
ds.head()

In [None]:
# Document vectorization
# Build the TF-IDF vectorizer (word tokens, English stop words removed).
# Only the constructor is timed here; the vocabulary is learned later, when
# fit_transform is called on the content column.

t1 = datetime.datetime.now()
print("Start time to TfIdf is: %s" % t1)

# min_df=1 keeps every term that occurs in at least one document, which is what
# the former integer min_df=0 amounted to. Integer 0 is outside the range
# scikit-learn documents for min_df (int >= 1 or float in [0.0, 1.0]) and is
# rejected by releases that validate constructor parameters.
tf = TfidfVectorizer(analyzer='word', min_df=1, stop_words='english')

t2 = datetime.datetime.now()
print("End time to TfIdf is: %s" % t2)

print("Time taken to TfIdf is: %s" % (t2 - t1))

# Show the configured vectorizer.
print(tf)


In [None]:
# Transform the content as matrix
# Fit the vectorizer on the content column and build the TF-IDF matrix,
# one matrix row per DataFrame row.

t1 = datetime.datetime.now()
print("Start time to compute fit_transform is: %s" % t1)

tfidf_matrix = tf.fit_transform(ds[content_column])

t2 = datetime.datetime.now()
print("End time to compute fit_transform is: %s" % t2)

print("Time taken to compute fit_transform is: %s" % (t2 - t1))

In [None]:
# Compute the similarity
# Pairwise similarity of every row against every other row of the TF-IDF matrix.

t1 = datetime.datetime.now()
print("Start time to compute similarities is: %s" % t1)

# linear_kernel is a plain dot product; it equals cosine similarity here because
# the vectorizer is built with its default L2 row normalisation.
# NOTE(review): the result is a dense (n_rows x n_rows) array by default, so its
# size grows quadratically with the number of rows in the input file -- confirm
# it fits in memory before running on the full dataset.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

t2 = datetime.datetime.now()
print("End time to compute similarities is: %s" % t2)

print("Time taken to compute similarities is: %s" % (t2 - t1))

In [None]:
# Find the result
# For every row, collect its most similar rows as (score, id) pairs and store
# them in `results` under the row's id.

results = {}

t1 = datetime.datetime.now()
print("Start time to calculate result is: %s" % t1)

# Read the ids by position so they line up with the rows of cosine_similarities
# regardless of how the DataFrame index is labelled.
item_ids = ds[id_column].tolist()

for idx, item_id in enumerate(item_ids):
    scores = cosine_similarities[idx]
    # Same candidate window as before: the 99 highest-scoring positions, best first.
    similar_indices = scores.argsort()[:-100:-1]
    # Exclude the row itself by position instead of assuming it sorts first: rows
    # with identical content tie at the top score, so argsort may rank another row
    # ahead of it, and dropping "the first entry" would then discard a real
    # neighbour while keeping the row as its own recommendation.
    similar_items = [(scores[i], item_ids[i]) for i in similar_indices if i != idx]

    # Keep at most 98 entries, the size each list had before.
    # NOTE(review): rows that share an id overwrite each other here, so only the
    # last row seen for a given id survives -- confirm that is intended.
    results[item_id] = similar_items[:98]

t2 = datetime.datetime.now()
print("End time to calculate result is: %s" % t2)

print("Time taken to calculate result is: %s" % (t2 - t1))

def item(id):
    """Return the display text for the first row whose id column equals ``id``.

    The text is the content-column value of that row, cut at the first
    ' - ' separator. Raises IndexError when no row matches.
    """
    matches = ds.loc[ds[id_column] == id, content_column]
    first_text = matches.iloc[0]
    return first_text.split(' - ')[0]

# Just reads the results out of the dictionary.
def recommend(id, num):
    """Print up to ``num`` stored recommendations for ``id``.

    Looks the (score, id) pairs up in ``results`` and prints one line per pair,
    using ``item`` to turn each id into its display text. Raises KeyError when
    ``id`` has no entry in ``results``.
    """
    header = "Recommending " + str(num) + " products similar to " + item(id) + "..."
    print(header)
    print("-------")
    for score, rec_id in results[id][:num]:
        print("Recommended: " + item(rec_id) + " (score:" + str(score) + ")")

In [None]:
# Print the top 10 recommendations for item id 20 and report how long it took.
t1 = datetime.datetime.now()
print("Start time to recommend is: %s" % t1)

recommend(id=20, num=10)

t2 = datetime.datetime.now()
print("End time to recommend is: %s" % t2)

print("Time taken to recommend is: %s" % (t2 - t1))
