In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import pairwise_distances 

# Location of the tag-views CSV, given as a path relative to the working directory.
data_path = '../data/21B_tag_views_dataset.csv'

# Item-based Collaborative Filtering

## Data preparation

Load the data and build the tag × user view-frequency matrix.

In [2]:
# Load the raw tag-view records; later cells read the user_id, tag_id and product_name columns.
data = pd.read_csv(data_path)

In [3]:
# Count how many rows each (user_id, tag_id) pair has in the raw data.
tag_count_df = (
    data
    .groupby(['user_id', 'tag_id'])
    .size()
    .to_frame(name='tag_count')
)
tag_count_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,tag_count
user_id,tag_id,Unnamed: 2_level_1
00000055a78bf6735c4a89358fab1de34104c3cb,5c61cd1b82ec7a4d2918a6de99fcd1577b462f79,1
00000055a78bf6735c4a89358fab1de34104c3cb,8378136c6dd0e03be859a210a0cee03955951fb1,1
00000055a78bf6735c4a89358fab1de34104c3cb,9cc68d8345f675892bcab0fad02f65b4ac7e71ea,1
00000055a78bf6735c4a89358fab1de34104c3cb,a1437d6393ee9535248b16f27a649bbd98c9e2f5,1
00000055a78bf6735c4a89358fab1de34104c3cb,a8272c62cd05d5b882e4f630fb55cfa0ba8491e6,1


In [4]:
# Move the (user_id, tag_id) index levels back into ordinary columns so pivot can use them.
tag_count_df = tag_count_df.reset_index()

# One row per tag, one column per user; (tag, user) pairs with no views become 0.
tags_user_df = (
    tag_count_df
    .pivot(index='tag_id', columns='user_id', values='tag_count')
    .fillna(0)
)
tags_user_df.head()

user_id,00000055a78bf6735c4a89358fab1de34104c3cb,00000bfd1cce5d57bd67ca12b70acc8cd4df4176,000014674d2afbd30b4a89e7f917b67ade3c31c4,00001dbe00e56fc4b1c1b65dda63de2a5ece55f9,00002d886b0027b4cead503a8a6f71b797721dcc,00009c599dca6873a47404fa9b7a9b0a1bd13049,0000a8c248465bc8cd907e112b5c2f1e94424f28,0000e13241f8b242a8a25666d488a7f8661112e2,0000e656186fd0eafc55ac7b9c109a0232cc6d49,0000e774b6e4c5df530c2dca979f8662ff695e07,...,005abc99490c8581a7d764236f0494af60853c97,005ac110b90e5a3d97391afef39c92309f4a609f,005ad182288447b6fd8774b2126551b7867ca00c,005ad88fd63284e847168b5719b99d90e6414bef,005b042fbb71e09988334dc2f200a971e0ce30ff,005b73371f07bd43ca9f824cd5de2f1a1d513201,005b90e15237bedb77b8e903a63f862aa3dcae34,005ba399f064ff0240a7aee3c659fae35ef2da43,005ba490ba9f4de4aeed34ae9500bfdc14014345,005c1ad3eb8ae0f1380e0116a4764ae97c6791ff
tag_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00410345e6d60633a211ebd3755d5c89ea7b5297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
005703ee98894846cde759fbe88f3d7fde830c85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0076e2a45d90991150032dbfaa574b4b7ab21177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0087f6286f5bd4f872620555b3e3b880e21de444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
008a19c4e6b27ade78d422f9deaba16ef195772b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Model engine with cosine distance

In [5]:
# Cosine distance between every pair of tag rows (each row holds one tag's per-user counts).
item_distances = pairwise_distances(
    X=tags_user_df.to_numpy(),
    metric='cosine',
)
item_distances.shape

(1000, 1000)

In [6]:
# Peek at the tag-to-tag distance matrix.
item_distances

array([[0.        , 1.        , 0.80930748, ..., 1.        , 1.        ,
        1.        ],
       [1.        , 0.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [0.80930748, 1.        , 0.        , ..., 1.        , 1.        ,
        1.        ],
       ...,
       [1.        , 1.        , 1.        , ..., 0.        , 1.        ,
        1.        ],
       [1.        , 1.        , 1.        , ..., 1.        , 0.        ,
        1.        ],
       [1.        , 1.        , 1.        , ..., 1.        , 1.        ,
        0.        ]])

Convert the cosine distances to similarity scores (similarity = 1 − distance), so that larger values mean more similar tags.

In [7]:
# Cosine similarity is the complement of cosine distance.
item_similarity = np.subtract(1, item_distances)

In [8]:
# Peek at the similarity matrix before the diagonal is cleared.
item_similarity

array([[1.        , 0.        , 0.19069252, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.19069252, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

Set each tag's similarity with itself (the diagonal of the matrix) to zero, so that a tag is never returned as its own nearest neighbour:

In [9]:
# Overwrite the diagonal in place: a tag should not count as similar to itself.
item_similarity[np.diag_indices_from(item_similarity)] = 0

In [10]:
# Confirm that the diagonal entries are now 0.
item_similarity

array([[0.        , 0.        , 0.19069252, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.19069252, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [11]:
# Label both axes with tag ids so similarities can be looked up by tag.
item_similarity = pd.DataFrame(
    data=item_similarity,
    index=tags_user_df.index,
    columns=tags_user_df.index,
)
item_similarity.head()

tag_id,00410345e6d60633a211ebd3755d5c89ea7b5297,005703ee98894846cde759fbe88f3d7fde830c85,0076e2a45d90991150032dbfaa574b4b7ab21177,0087f6286f5bd4f872620555b3e3b880e21de444,008a19c4e6b27ade78d422f9deaba16ef195772b,0108e35fbb3bb7e60c2045386294914255f137eb,01c1f1173136b005f885c4691db374d4762f15e0,022f8f30c65aaeb13def1fb9d700c1937e30da06,0245aa04713eb538ebcb6d6c5667a55f5920d535,02a5da421359cb69816444a48f35c0cb01806ca6,...,fde652531ae50c7def995f5c5c2b067cd9bdc9de,fe0f9fd1fdfe652523ed4a3a9e57375301bf7144,fe2297da6fbc7992a934f8ce4c848584d73bd1c3,fe5cd317975ca5dada3d9d047133f1925e460053,fe9cd8d22101c48eca5bc3eee51b9dc5e07fe683,fe9e7151b6ae2070c053a80fa37862cfef449cae,fed8505b88adf6d879b4df147b29a068d98faa72,ff0257af2bc7c28cd397a820aa33cde0d04b58b8,ff0d3fb21c00bc33f71187a2beec389e9eff5332,ff664fdace0b1f85828387e81580f871eacb6386
tag_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00410345e6d60633a211ebd3755d5c89ea7b5297,0.0,0.0,0.190693,0.0,0.0,0.0,0.141421,0.108465,0.0,0.258199,...,0.0,0.0,0.129099,0.1,0.0,0.0,0.091287,0.0,0.0,0.0
005703ee98894846cde759fbe88f3d7fde830c85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0076e2a45d90991150032dbfaa574b4b7ab21177,0.190693,0.0,0.0,0.0,0.123091,0.0,0.13484,0.155126,0.0,0.123091,...,0.0,0.083624,0.123091,0.095346,0.0,0.1066,0.087039,0.0,0.0,0.0
0087f6286f5bd4f872620555b3e3b880e21de444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076696,0.0,0.0,...,0.2,0.0,0.0,0.0,0.0,0.0,0.129099,0.182574,0.0,0.0
008a19c4e6b27ade78d422f9deaba16ef195772b,0.0,0.0,0.123091,0.0,0.0,0.0,0.0,0.070014,0.182574,0.0,...,0.0,0.113228,0.0,0.129099,0.0,0.288675,0.0,0.0,0.129099,0.0


### Most similar tags for "ff0d3fb21c00bc33f71187a2beec389e9eff5332"

In [12]:
# Tag used as the worked example in the cells below.
tag1 = "ff0d3fb21c00bc33f71187a2beec389e9eff5332"

In [18]:
# Ten highest similarity scores for tag1, best match first.
(
    item_similarity
    .loc[tag1]
    .sort_values(ascending=False)
    .head(10)
)

tag_id
7ee223009403f7450993fe5d79516f1fc841e75e    0.789352
6b0cd6a8094daf42e766ea257a2af3571831bb32    0.690268
bdf147e99ee57500eb2dabcbf3cfa24e1daef357    0.639010
340f1eaf7ad0c07f1491338ab68cbcab30c315ec    0.632456
c093b1743115b3f9d368b2f7bdf54f367afccc7c    0.422577
61bc35a6401829bd28a8da47a2f235944ba8d2df    0.422577
85ef93bda0f7fb6327bd1b5ad44da26246b4360d    0.408248
dd3c8fd58366b577ce6b1d0f435602f11671c3dc    0.400000
551ec41539d9fb71200d18ec7903b1039cde594f    0.381385
0cbb51f1f43646c1718553da0c5864d4e1a6f037    0.365148
Name: ff0d3fb21c00bc33f71187a2beec389e9eff5332, dtype: float64

In [24]:
def get_tag_predictions(item_similarity_scores, tag, top_n=10):
    """Return the ids of the tags with the highest similarity to ``tag``.

    Parameters
    ----------
    item_similarity_scores : pandas.DataFrame
        Similarity matrix whose row labels and column labels are tag ids.
    tag : hashable
        Row label of the tag to find neighbours for.
    top_n : int, default 10
        Maximum number of tag ids to return.

    Returns
    -------
    pandas.Index
        Up to ``top_n`` column labels, ordered from highest to lowest score.

    Raises
    ------
    KeyError
        If ``tag`` is not a row label of ``item_similarity_scores``.
    """
    # Read from the arguments, not from notebook globals, so the function
    # works for any matrix, any tag and any cut-off.
    tag_scores = item_similarity_scores.loc[tag]
    return tag_scores.sort_values(ascending=False).head(top_n).index

In [27]:
# Ids of the ten tags most similar to tag1.
predicted_tags = get_tag_predictions(
    item_similarity_scores=item_similarity,
    tag=tag1,
    top_n=10,
)
predicted_tags

Index(['7ee223009403f7450993fe5d79516f1fc841e75e',
       '6b0cd6a8094daf42e766ea257a2af3571831bb32',
       'bdf147e99ee57500eb2dabcbf3cfa24e1daef357',
       '340f1eaf7ad0c07f1491338ab68cbcab30c315ec',
       'c093b1743115b3f9d368b2f7bdf54f367afccc7c',
       '61bc35a6401829bd28a8da47a2f235944ba8d2df',
       '85ef93bda0f7fb6327bd1b5ad44da26246b4360d',
       'dd3c8fd58366b577ce6b1d0f435602f11671c3dc',
       '551ec41539d9fb71200d18ec7903b1039cde594f',
       '0cbb51f1f43646c1718553da0c5864d4e1a6f037'],
      dtype='object', name='tag_id')

### Look at the tag descriptions

In [32]:
# Distinct product names recorded for the query tag.
is_query_tag = data['tag_id'] == tag1
data.loc[is_query_tag, 'product_name'].unique()

array(['Cosmos Necklace - Maria Pascual Shop'], dtype=object)

In [30]:
# Distinct product names recorded for any of the predicted tags.
is_predicted_tag = data['tag_id'].isin(predicted_tags)
data.loc[is_predicted_tag, 'product_name'].unique()

array(['Shiny Moon Necklace - Maria Pascual Shop',
       'Sudadera (no Disponible Online)', 'Line Black',
       'Zapatillas Triple S', 'Mini Star Necklace - Maria Pascual Shop',
       'Cintur�_n Estrecho De Cuero Negro Estilo Western De Asos Design',
       'Sombrero Paja', 'Cosmos Necklace - Maria Pascual Shop'],
      dtype=object)

This makes sense: users who viewed the query tag also viewed tags whose products have the same or very similar descriptions (for example, other necklaces from the same shop).