In [1]:
import pandas as pd
import numpy as np
from scipy.sparse.linalg import svds

# Location of the tag-views CSV, given relative to the notebook's working directory.
data_path = '../data/21B_tag_views_dataset.csv'

In [2]:
# Load the raw tag-view records from data_path.
# NOTE(review): several product names in the outputs below contain garbled
# characters (replacement marks, stray diacritics), so the file may have been
# mis-encoded upstream -- confirm against the data source.
data = pd.read_csv(data_path, encoding='utf8')

In [3]:
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0,id,user_id,tag_id,product_name
0,1,00000055a78bf6735c4a89358fab1de34104c3cb,e78de9dad70d230a096f0bbdc3e89b5cae04ba77,La Gar̤onne In Oro Rosa A Maglie/bianco
1,2,00000055a78bf6735c4a89358fab1de34104c3cb,b9a521730141de9bc4fe8ebc9f33713411d0101a,Fishnet Eco Bag
2,3,00000055a78bf6735c4a89358fab1de34104c3cb,8378136c6dd0e03be859a210a0cee03955951fb1,Collarino Essentielle In Oro Interamente A Esa...
3,4,00000055a78bf6735c4a89358fab1de34104c3cb,9cc68d8345f675892bcab0fad02f65b4ac7e71ea,Asos - Vestito A Fascia Con Fondo A Fisarmonica
4,5,00000055a78bf6735c4a89358fab1de34104c3cb,a8272c62cd05d5b882e4f630fb55cfa0ba8491e6,Peggy Off Shoulder Corset Top-white


In [4]:
# Summary of columns, dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9999 entries, 0 to 9998
Data columns (total 4 columns):
id              9999 non-null int64
user_id         9999 non-null object
tag_id          9999 non-null object
product_name    9999 non-null object
dtypes: int64(1), object(3)
memory usage: 312.5+ KB


In [5]:
# Per-column count of missing values.
data.isna().sum()

id              0
user_id         0
tag_id          0
product_name    0
dtype: int64

In [6]:
# Use the record id as the row index.  The guard keeps the cell re-runnable:
# once "id" has become the index it is no longer a column, and a second
# set_index("id") would raise KeyError.
if "id" in data.columns:
    data = data.set_index("id")

In [7]:
# Preview again, now with "id" as the index.
data.head()

Unnamed: 0_level_0,user_id,tag_id,product_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,00000055a78bf6735c4a89358fab1de34104c3cb,e78de9dad70d230a096f0bbdc3e89b5cae04ba77,La Gar̤onne In Oro Rosa A Maglie/bianco
2,00000055a78bf6735c4a89358fab1de34104c3cb,b9a521730141de9bc4fe8ebc9f33713411d0101a,Fishnet Eco Bag
3,00000055a78bf6735c4a89358fab1de34104c3cb,8378136c6dd0e03be859a210a0cee03955951fb1,Collarino Essentielle In Oro Interamente A Esa...
4,00000055a78bf6735c4a89358fab1de34104c3cb,9cc68d8345f675892bcab0fad02f65b4ac7e71ea,Asos - Vestito A Fascia Con Fondo A Fisarmonica
5,00000055a78bf6735c4a89358fab1de34104c3cb,a8272c62cd05d5b882e4f630fb55cfa0ba8491e6,Peggy Off Shoulder Corset Top-white


In [8]:
# Number of rows per product name, most frequent first.
data['product_name'].value_counts()

Look                                                                            240
501� Shorts                                                                     124
Shiny Moon Necklace - Maria Pascual Shop                                        111
Zapatillas Converse Chuck Taylor All Star Lift High Top                          66
Ba̱ador Madeira                                                                  62
Chaqueta Cropped                                                                 61
Vestido                                                                          60
Mono Corto Pinzas                                                                59
Line Black                                                                       59
Vestido Camisero Estampado Serpiente                                             55
Minivestido Skater Con Bordados Y Espalda Abierta De A Star Is Born              53
Vestido Largo Estampado Animal                                              

In [9]:
# Number of distinct tag_ids behind each product name; a value above 1 means
# the same name is attached to several tags.
data.groupby('product_name')['tag_id'].nunique()

product_name
(similar) 'equador' Gold Lurex Tie Side Bikini Two Piece                     2
(similar) Bandolera De Piel De Becerro                                       1
(similar) Falda Asim̩trica Con Volantes                                      1
(simile) Limoncello                                                          1
1950's 701 Cut Off Shorts                                                    2
501 Customized High Rise Short                                               3
501 High Rise Short                                                          1
501� Shorts                                                                 12
501� Skinny Jeans                                                            1
A.bing Shorts                                                                1
Abito Avvolgente Con Stampa Botanica A Volant                                1
Abito Bianco A Quadri                                                        1
Abrigo Jacquard Special Garment        

As stated, there are descriptions that map to multiple tags. Let's create a matrix of item occurrences by user.

In [10]:
# One row per (user_id, tag_id) pair, holding how many records share that pair.
tag_df = (
    data
    .groupby(['user_id', 'tag_id'])
    .size()
    .to_frame('tag_count')
)

In [11]:
# Preview the per-(user, tag) counts.
tag_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,tag_count
user_id,tag_id,Unnamed: 2_level_1
00000055a78bf6735c4a89358fab1de34104c3cb,5c61cd1b82ec7a4d2918a6de99fcd1577b462f79,1
00000055a78bf6735c4a89358fab1de34104c3cb,8378136c6dd0e03be859a210a0cee03955951fb1,1
00000055a78bf6735c4a89358fab1de34104c3cb,9cc68d8345f675892bcab0fad02f65b4ac7e71ea,1
00000055a78bf6735c4a89358fab1de34104c3cb,a1437d6393ee9535248b16f27a649bbd98c9e2f5,1
00000055a78bf6735c4a89358fab1de34104c3cb,a8272c62cd05d5b882e4f630fb55cfa0ba8491e6,1


In [12]:
# Pairs occurring more than once; an empty result means each user only contains unique tags.
tag_df.loc[tag_df['tag_count'].gt(1)]

Unnamed: 0_level_0,Unnamed: 1_level_0,tag_count
user_id,tag_id,Unnamed: 2_level_1


In [13]:
# Flatten the (user_id, tag_id) MultiIndex back into ordinary columns so the
# frame can be pivoted into a user x tag matrix.
tag_count_df = tag_df.reset_index()
# Show a short preview rather than dumping the whole frame into the output.
tag_count_df.head(10)

Unnamed: 0,user_id,tag_id,tag_count
0,00000055a78bf6735c4a89358fab1de34104c3cb,5c61cd1b82ec7a4d2918a6de99fcd1577b462f79,1
1,00000055a78bf6735c4a89358fab1de34104c3cb,8378136c6dd0e03be859a210a0cee03955951fb1,1
2,00000055a78bf6735c4a89358fab1de34104c3cb,9cc68d8345f675892bcab0fad02f65b4ac7e71ea,1
3,00000055a78bf6735c4a89358fab1de34104c3cb,a1437d6393ee9535248b16f27a649bbd98c9e2f5,1
4,00000055a78bf6735c4a89358fab1de34104c3cb,a8272c62cd05d5b882e4f630fb55cfa0ba8491e6,1
5,00000055a78bf6735c4a89358fab1de34104c3cb,b9a521730141de9bc4fe8ebc9f33713411d0101a,1
6,00000055a78bf6735c4a89358fab1de34104c3cb,c093b1743115b3f9d368b2f7bdf54f367afccc7c,1
7,00000055a78bf6735c4a89358fab1de34104c3cb,e78de9dad70d230a096f0bbdc3e89b5cae04ba77,1
8,00000bfd1cce5d57bd67ca12b70acc8cd4df4176,383b1bfd6866f724a373bd5319a8d8def774bc5a,1
9,00000bfd1cce5d57bd67ca12b70acc8cd4df4176,8280fd9adc1f65063789b6a04b5e94e94d28b8a7,1


In [14]:
# Wide user x tag matrix: one row per user_id, one column per tag_id, cells
# hold tag_count (NaN where the pair never occurs).
user_products_df = (
    tag_count_df
    .set_index(['user_id', 'tag_id'])['tag_count']
    .unstack('tag_id')
)

In [15]:
# Show a short preview of the (very wide, mostly empty) matrix instead of
# dumping the whole frame into the output.
user_products_df.head(10)

tag_id,00410345e6d60633a211ebd3755d5c89ea7b5297,005703ee98894846cde759fbe88f3d7fde830c85,0076e2a45d90991150032dbfaa574b4b7ab21177,0087f6286f5bd4f872620555b3e3b880e21de444,008a19c4e6b27ade78d422f9deaba16ef195772b,0108e35fbb3bb7e60c2045386294914255f137eb,01c1f1173136b005f885c4691db374d4762f15e0,022f8f30c65aaeb13def1fb9d700c1937e30da06,0245aa04713eb538ebcb6d6c5667a55f5920d535,02a5da421359cb69816444a48f35c0cb01806ca6,...,fde652531ae50c7def995f5c5c2b067cd9bdc9de,fe0f9fd1fdfe652523ed4a3a9e57375301bf7144,fe2297da6fbc7992a934f8ce4c848584d73bd1c3,fe5cd317975ca5dada3d9d047133f1925e460053,fe9cd8d22101c48eca5bc3eee51b9dc5e07fe683,fe9e7151b6ae2070c053a80fa37862cfef449cae,fed8505b88adf6d879b4df147b29a068d98faa72,ff0257af2bc7c28cd397a820aa33cde0d04b58b8,ff0d3fb21c00bc33f71187a2beec389e9eff5332,ff664fdace0b1f85828387e81580f871eacb6386
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000055a78bf6735c4a89358fab1de34104c3cb,,,,,,,,,,,...,,,,,,,,,,
00000bfd1cce5d57bd67ca12b70acc8cd4df4176,,,,,,,,,,,...,,,,,,,,,,
000014674d2afbd30b4a89e7f917b67ade3c31c4,,,,,,,,,,,...,,,,,,,,,,
00001dbe00e56fc4b1c1b65dda63de2a5ece55f9,,,,,,,,,,,...,,,,,,,,,,
00002d886b0027b4cead503a8a6f71b797721dcc,,,,,,,,,,,...,,,,,,,,,,
00009c599dca6873a47404fa9b7a9b0a1bd13049,,,,,,,,,,,...,,,,,,,,,,
0000a8c248465bc8cd907e112b5c2f1e94424f28,,,,,,,,,,,...,,,,,,,,,,
0000e13241f8b242a8a25666d488a7f8661112e2,,,,,,1.0,,,,,...,,,,,,,,,,
0000e656186fd0eafc55ac7b9c109a0232cc6d49,,,,,,,,,,,...,,,,,,,,,,
0000e774b6e4c5df530c2dca979f8662ff695e07,,,,,,,,,,,...,,,,,,,,,,


In [16]:
# Replace the NaN cells (user/tag pairs that never occur) with 0 so the matrix
# is fully numeric.
user_products_df = user_products_df.fillna(0)

In [17]:
# Preview the zero-filled matrix.
user_products_df.head()

tag_id,00410345e6d60633a211ebd3755d5c89ea7b5297,005703ee98894846cde759fbe88f3d7fde830c85,0076e2a45d90991150032dbfaa574b4b7ab21177,0087f6286f5bd4f872620555b3e3b880e21de444,008a19c4e6b27ade78d422f9deaba16ef195772b,0108e35fbb3bb7e60c2045386294914255f137eb,01c1f1173136b005f885c4691db374d4762f15e0,022f8f30c65aaeb13def1fb9d700c1937e30da06,0245aa04713eb538ebcb6d6c5667a55f5920d535,02a5da421359cb69816444a48f35c0cb01806ca6,...,fde652531ae50c7def995f5c5c2b067cd9bdc9de,fe0f9fd1fdfe652523ed4a3a9e57375301bf7144,fe2297da6fbc7992a934f8ce4c848584d73bd1c3,fe5cd317975ca5dada3d9d047133f1925e460053,fe9cd8d22101c48eca5bc3eee51b9dc5e07fe683,fe9e7151b6ae2070c053a80fa37862cfef449cae,fed8505b88adf6d879b4df147b29a068d98faa72,ff0257af2bc7c28cd397a820aa33cde0d04b58b8,ff0d3fb21c00bc33f71187a2beec389e9eff5332,ff664fdace0b1f85828387e81580f871eacb6386
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000055a78bf6735c4a89358fab1de34104c3cb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00000bfd1cce5d57bd67ca12b70acc8cd4df4176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000014674d2afbd30b4a89e7f917b67ade3c31c4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00001dbe00e56fc4b1c1b65dda63de2a5ece55f9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00002d886b0027b4cead503a8a6f71b797721dcc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Distance estimation using matrix decomposition

In [18]:
# Plain NumPy array of the user x tag matrix, used as the input to svds.
user_views_m = user_products_df.values

In [19]:
n_factors = 10  # number of singular values/vectors requested from svds (its k)

In [20]:
# Truncated SVD of the user x tag matrix, keeping n_factors components.
# sigma comes back as a 1-D vector of singular values.  v is multiplied
# un-transposed in the reconstruction below, so it is presumably already the
# right factor laid out as (n_factors, n_tags) -- see the scipy svds docs.
u, sigma, v = svds(user_views_m, k=n_factors)

In [21]:
# Confirm sigma is a 1-D vector with one entry per factor.
sigma.shape

(10,)

In [22]:
# Expand the 1-D singular values into a diagonal matrix for the matrix product
# below.  np.diag applied to a 2-D array pulls the diagonal back out as a
# vector, so an unconditional call would undo itself when the cell is re-run
# and break the reconstruction; convert only while sigma is still 1-D.
if sigma.ndim == 1:
    sigma = np.diag(sigma)

In [23]:
# Low-rank reconstruction u . sigma . v: one predicted score per (user, tag).
user_predictions = u.dot(sigma).dot(v)

In [24]:
# Predicted scores for the user at row position 5 of the matrix.
user_predictions[5]

array([ 7.78550807e-02, -7.54272860e-03,  8.71386939e-02,  3.78060621e-02,
        8.76071351e-03, -3.84239690e-02,  5.74512005e-02,  6.19515336e-02,
        3.14677541e-03,  7.71713174e-02,  1.10763563e-02,  5.53502918e-04,
        2.58643838e-02,  1.36280261e-02,  7.74270525e-02,  2.08723878e-02,
       -4.86228731e-03, -7.24459194e-04,  7.54368315e-02,  7.36069597e-06,
       -1.01337276e-02,  3.48745497e-04, -9.32666250e-03,  6.64839455e-02,
        5.43298692e-02, -3.54287314e-02,  8.60608540e-02,  1.06419629e-03,
       -1.99955062e-02,  4.52988585e-02,  1.27740862e-01,  4.79425727e-02,
        8.65623902e-02, -1.32074822e-02,  3.25173975e-03,  2.98119475e-02,
        4.39320652e-02,  7.35422098e-02, -1.58710199e-04,  4.65625645e-02,
        6.93619826e-02,  5.28210787e-02,  2.01921872e-02,  1.60288725e-02,
       -3.91459482e-04,  9.62595218e-04,  6.42750160e-04,  7.84720069e-04,
       -6.53347220e-03,  1.50354046e-03,  8.37097048e-03,  1.39459634e-04,
       -5.30897767e-02, -

In [25]:
# Re-attach the user_id / tag_id labels of the input matrix to the predicted scores.
predictions_df = pd.DataFrame(
    data=user_predictions,
    index=user_products_df.index,
    columns=user_products_df.columns,
)

In [26]:
# Preview the labelled prediction matrix.
predictions_df.head()

tag_id,00410345e6d60633a211ebd3755d5c89ea7b5297,005703ee98894846cde759fbe88f3d7fde830c85,0076e2a45d90991150032dbfaa574b4b7ab21177,0087f6286f5bd4f872620555b3e3b880e21de444,008a19c4e6b27ade78d422f9deaba16ef195772b,0108e35fbb3bb7e60c2045386294914255f137eb,01c1f1173136b005f885c4691db374d4762f15e0,022f8f30c65aaeb13def1fb9d700c1937e30da06,0245aa04713eb538ebcb6d6c5667a55f5920d535,02a5da421359cb69816444a48f35c0cb01806ca6,...,fde652531ae50c7def995f5c5c2b067cd9bdc9de,fe0f9fd1fdfe652523ed4a3a9e57375301bf7144,fe2297da6fbc7992a934f8ce4c848584d73bd1c3,fe5cd317975ca5dada3d9d047133f1925e460053,fe9cd8d22101c48eca5bc3eee51b9dc5e07fe683,fe9e7151b6ae2070c053a80fa37862cfef449cae,fed8505b88adf6d879b4df147b29a068d98faa72,ff0257af2bc7c28cd397a820aa33cde0d04b58b8,ff0d3fb21c00bc33f71187a2beec389e9eff5332,ff664fdace0b1f85828387e81580f871eacb6386
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000055a78bf6735c4a89358fab1de34104c3cb,-0.005667,0.053877,-0.014597,-0.00388,0.002558,0.007655,-0.006271,-0.008355,0.013495,-0.005392,...,-0.007034,0.001482,-0.001485,-0.001173,0.004634,0.007185,-0.006497,-0.012688,0.030287,-0.000743
00000bfd1cce5d57bd67ca12b70acc8cd4df4176,0.000254,0.005227,0.00898,0.005358,-0.001119,-0.000366,-0.000429,0.003727,-0.001251,-0.000272,...,0.010663,0.000384,-0.000484,-0.001445,-0.000192,-0.00211,0.000388,0.014269,-0.001107,0.001494
000014674d2afbd30b4a89e7f917b67ade3c31c4,-0.000217,0.018534,0.016876,0.010625,-0.001646,-0.000949,-0.001433,0.010584,-0.002481,-0.001653,...,0.019572,0.002232,-0.001268,-0.004103,-0.000117,-0.00399,0.00077,0.025648,-0.001938,0.002765
00001dbe00e56fc4b1c1b65dda63de2a5ece55f9,0.009275,-0.00087,0.011742,0.004651,0.002063,-0.000288,0.002985,0.019547,-0.00202,0.00796,...,-1.3e-05,0.003821,0.005282,0.004349,-0.00052,0.004067,0.012777,-1.5e-05,-0.001143,-4e-05
00002d886b0027b4cead503a8a6f71b797721dcc,-0.000857,-0.000562,-0.00012,-0.002321,0.00451,0.000563,-0.000185,0.001382,0.004907,-0.001201,...,2.9e-05,-0.001963,-0.000988,0.004496,-0.000169,0.003785,-0.005274,0.00012,0.00383,3.3e-05


### User most similar products

In [27]:
# Example user to inspect: the first user_id in the dataset.
user = '00000055a78bf6735c4a89358fab1de34104c3cb'  # First user

In [28]:
top_n = 10  # Number of top-ranked predictions to get for the user

In [43]:
# Scores of the chosen user for every tag, best first.  reset_index() turns the
# tag_id index into a column and leaves the scores in a column named after the
# user id.
# NOTE(review): this rebinds user_predictions, which an earlier cell defined as
# the full prediction matrix -- re-running the predictions_df cell after this
# one would pick up this per-user frame instead.
user_predictions = predictions_df.loc[user].sort_values(ascending=False).reset_index()

In [30]:
# Highest-scoring tags for the chosen user (seen and unseen alike).
user_predictions.head()

Unnamed: 0,tag_id,00000055a78bf6735c4a89358fab1de34104c3cb
0,778e3136ca5764da2281b4de5712693c5523155d,0.114704
1,3e4d8d24daf15692515999d4c8809eac1a3ee55c,0.097745
2,5d39b3d82fb38bb63c169307b29f8c065f5069e1,0.097172
3,8f9cad3197bec0704f6d8f7817158eff7a10d86a,0.095586
4,e531c1dad33434c9d7a323f7928cae9cf04f9f7a,0.083848


In [31]:
# Tags with a non-zero entry in this user's row of the user x tag matrix.
user_products_df.columns[user_products_df.loc[user] > 0]

Index(['5c61cd1b82ec7a4d2918a6de99fcd1577b462f79',
       '8378136c6dd0e03be859a210a0cee03955951fb1',
       '9cc68d8345f675892bcab0fad02f65b4ac7e71ea',
       'a1437d6393ee9535248b16f27a649bbd98c9e2f5',
       'a8272c62cd05d5b882e4f630fb55cfa0ba8491e6',
       'b9a521730141de9bc4fe8ebc9f33713411d0101a',
       'c093b1743115b3f9d368b2f7bdf54f367afccc7c',
       'e78de9dad70d230a096f0bbdc3e89b5cae04ba77'],
      dtype='object', name='tag_id')

In [40]:
# tag_ids of every record belonging to the chosen user, indexed by record id.
user_seen_products = data.loc[data['user_id'] == user, 'tag_id']
user_seen_products

id
1    e78de9dad70d230a096f0bbdc3e89b5cae04ba77
2    b9a521730141de9bc4fe8ebc9f33713411d0101a
3    8378136c6dd0e03be859a210a0cee03955951fb1
4    9cc68d8345f675892bcab0fad02f65b4ac7e71ea
5    a8272c62cd05d5b882e4f630fb55cfa0ba8491e6
6    5c61cd1b82ec7a4d2918a6de99fcd1577b462f79
7    c093b1743115b3f9d368b2f7bdf54f367afccc7c
8    a1437d6393ee9535248b16f27a649bbd98c9e2f5
Name: tag_id, dtype: object

In [44]:
# Predicted scores (and rank positions) of the tags the user has already seen.
user_predictions.loc[user_predictions['tag_id'].isin(user_seen_products)]

Unnamed: 0,tag_id,00000055a78bf6735c4a89358fab1de34104c3cb
11,9cc68d8345f675892bcab0fad02f65b4ac7e71ea,0.069991
28,a8272c62cd05d5b882e4f630fb55cfa0ba8491e6,0.057822
40,c093b1743115b3f9d368b2f7bdf54f367afccc7c,0.045431
48,e78de9dad70d230a096f0bbdc3e89b5cae04ba77,0.041631
50,8378136c6dd0e03be859a210a0cee03955951fb1,0.040859
58,b9a521730141de9bc4fe8ebc9f33713411d0101a,0.038283
176,a1437d6393ee9535248b16f27a649bbd98c9e2f5,0.012152
435,5c61cd1b82ec7a4d2918a6de99fcd1577b462f79,0.002834


In [48]:
# Keep only the tags the user has not seen yet.  The rows stay ordered by
# score because user_predictions was sorted in descending order.
not_seen_user_preds = user_predictions[~user_predictions['tag_id'].isin(user_seen_products)]
# Show the top_n recommendations; a bare head() would show pandas' default of
# five rows and ignore the top_n setting defined earlier.
not_seen_user_preds.head(top_n)

Unnamed: 0,tag_id,00000055a78bf6735c4a89358fab1de34104c3cb
0,778e3136ca5764da2281b4de5712693c5523155d,0.114704
1,3e4d8d24daf15692515999d4c8809eac1a3ee55c,0.097745
2,5d39b3d82fb38bb63c169307b29f8c065f5069e1,0.097172
3,8f9cad3197bec0704f6d8f7817158eff7a10d86a,0.095586
4,e531c1dad33434c9d7a323f7928cae9cf04f9f7a,0.083848


In [49]:
# Product names of the records the chosen user has seen, for a sanity check.
data.loc[data['user_id'] == user, 'product_name']

id
1              La Gar̤onne In Oro Rosa A Maglie/bianco
2                                      Fishnet Eco Bag
3    Collarino Essentielle In Oro Interamente A Esa...
4      Asos - Vestito A Fascia Con Fondo A Fisarmonica
5                  Peggy Off Shoulder Corset Top-white
6           Uo Suede Contrast Stitch Ecru Pelmet Skirt
7             Shiny Moon Necklace - Maria Pascual Shop
8                 Maglione Jacquard Con Logo 'arantxa'
Name: product_name, dtype: object