In [34]:
import datetime

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

id_column = "id"
content_column = "description"
document_path = "./sample-data.csv"

In [35]:
# Read the document
t1 = datetime.datetime.now()
print("Start time to read document is %s: ", t1)

ds = pd.read_csv(document_path)

t2 = datetime.datetime.now()
print("End time to read document is %s: ", t2)

print("Time taken to read the document is %s: ", t2 - t1)

len(ds)
# ds.head()

Start time to read document is %s:  2020-02-03 10:10:26.211532
End time to read document is %s:  2020-02-03 10:10:26.224452
Time taken to read the document is %s:  0:00:00.012920


500

In [36]:
# Document vectorization

t1 = datetime.datetime.now()
print("Start time to TfIdf is %s: ", t1)

tf = TfidfVectorizer(analyzer='word', min_df=0, stop_words='english')

t2 = datetime.datetime.now()
print("End time to TfIdf is %s: ", t2)

print("Time taken to TfIdf is %s: ", t2 - t1)

print(tf)


Start time to TfIdf is %s:  2020-02-03 10:10:28.111828
End time to TfIdf is %s:  2020-02-03 10:10:28.118228
Time taken to TfIdf is %s:  0:00:00.006400
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=0, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)


In [37]:
# Transform the content as matrix

t1 = datetime.datetime.now()
print("Start time to compute fit_transform is %s: ", t1)

tfidf_matrix = tf.fit_transform(ds[content_column])

t2 = datetime.datetime.now()
print("End time to compute fit_transform is %s: ", t2)
        
print("Time taken to compute fit_transform is %s: ", t2 - t1)

Start time to compute fit_transform is %s:  2020-02-03 10:10:30.038178
End time to compute fit_transform is %s:  2020-02-03 10:10:30.159043
Time taken to compute fit_transform is %s:  0:00:00.120865


In [38]:
# Compute the similarity

t1 = datetime.datetime.now()
print("Start time to compute similarities is %s: ", t1)
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)


t2 = datetime.datetime.now()
print("End time to compute similarities is %s: ", t2)
        
print("Time taken to compute similarities is %s: ", t2 - t1)

Start time to compute similarities is %s:  2020-02-03 10:10:31.358448
End time to compute similarities is %s:  2020-02-03 10:10:31.412595
Time taken to compute similarities is %s:  0:00:00.054147


In [32]:
# Find the result

results = {}

t1 = datetime.datetime.now()
print("Start time to calculate result is %s: ", t1)

for idx, row in ds.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
    similar_items = [(cosine_similarities[idx][i], ds[id_column][i]) for i in similar_indices]

    results[row[id_column]] = similar_items[1:]
    
t2 = datetime.datetime.now()
print("End time to calculate result is %s: ", t2)

print("Time taken to calculate result is %s: ", t2 - t1)

def item(id):
    return ds.loc[ds[id_column] == id][content_column].tolist()[0].split(' - ')[0]

# Just reads the results out of the dictionary.
def recommend(id, num):
    print("Recommending " + str(num) + " products similar to " + item(id) + "...")
    print("-------")
    recs = results[id][:num]
    for rec in recs:
        print("Recommended: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")

Start time to calculate result is %s:  2020-02-03 10:09:37.092305
End time to calculate result is %s:  2020-02-03 10:09:38.436373
Time taken to calculate result is %s:  0:00:01.344068


In [39]:
t1 = datetime.datetime.now()
print("Start time to recommend is %s: ", t1)

recommend(id=20, num=10)

t2 = datetime.datetime.now()
print("End time to recommend is %s: ", t2)

print("Time taken to recommend is %s: ", t2 - t1)


Start time to recommend is %s:  2020-02-03 10:10:33.270830
Recommending 10 products similar to Cap 1 graphic t-shirt...
-------
Recommended: Cap 1 graphic t-shirt (score:0.8346122195247042)
Recommended: Cap 1 graphic crew (score:0.772584528935399)
Recommended: Cap 1 crew (score:0.7081061617160098)
Recommended: Cap 1 t-shirt (score:0.7059855757125564)
Recommended: Cap 1 t-shirt (score:0.6785025221080636)
Recommended: Cap 1 scoop (score:0.6466990537133701)
Recommended: Cap 1 bottoms (score:0.5549528755979624)
Recommended: Cap 1 bottoms (score:0.549142080509303)
Recommended: Cap 1 graphic tee (score:0.540730700077156)
Recommended: Cap 2 cap sleeve (score:0.3706066482662903)
End time to recommend is %s:  2020-02-03 10:10:33.336938
Time taken to recommend is %s:  0:00:00.066108
