In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse.linalg import svds

from sklearn.metrics.pairwise import pairwise_distances 
from sklearn.model_selection import train_test_split

data_path = '../data/21B_tag_views_dataset.csv'

# User-based Collaborative Filtering

## Data preparation

Data loading and user/tag freq matrix

In [2]:
data = pd.read_csv(data_path)

In [3]:
data.head()

Unnamed: 0,id,user_id,tag_id,product_name
0,1,00000055a78bf6735c4a89358fab1de34104c3cb,e78de9dad70d230a096f0bbdc3e89b5cae04ba77,La Gar̤onne In Oro Rosa A Maglie/bianco
1,2,00000055a78bf6735c4a89358fab1de34104c3cb,b9a521730141de9bc4fe8ebc9f33713411d0101a,Fishnet Eco Bag
2,3,00000055a78bf6735c4a89358fab1de34104c3cb,8378136c6dd0e03be859a210a0cee03955951fb1,Collarino Essentielle In Oro Interamente A Esa...
3,4,00000055a78bf6735c4a89358fab1de34104c3cb,9cc68d8345f675892bcab0fad02f65b4ac7e71ea,Asos - Vestito A Fascia Con Fondo A Fisarmonica
4,5,00000055a78bf6735c4a89358fab1de34104c3cb,a8272c62cd05d5b882e4f630fb55cfa0ba8491e6,Peggy Off Shoulder Corset Top-white


In [4]:
tag_count_df = data.groupby(['user_id', 'tag_id']).agg({'tag_id': 'count'}).rename(columns={'tag_id':'tag_count'})
tag_count_df = tag_count_df.reset_index()

In [5]:
tag_count_df.head()

Unnamed: 0,user_id,tag_id,tag_count
0,00000055a78bf6735c4a89358fab1de34104c3cb,5c61cd1b82ec7a4d2918a6de99fcd1577b462f79,1
1,00000055a78bf6735c4a89358fab1de34104c3cb,8378136c6dd0e03be859a210a0cee03955951fb1,1
2,00000055a78bf6735c4a89358fab1de34104c3cb,9cc68d8345f675892bcab0fad02f65b4ac7e71ea,1
3,00000055a78bf6735c4a89358fab1de34104c3cb,a1437d6393ee9535248b16f27a649bbd98c9e2f5,1
4,00000055a78bf6735c4a89358fab1de34104c3cb,a8272c62cd05d5b882e4f630fb55cfa0ba8491e6,1


In [6]:
group_user_tag_count = tag_count_df.groupby('user_id').count()
print("number of users:", group_user_tag_count.shape[0])

number of users: 859


Remove users which have less than 3 tags

In [7]:
idx = group_user_tag_count[(group_user_tag_count['tag_count'] >= 3)].index
tag_count_df = tag_count_df[tag_count_df['user_id'].isin(idx)]

In [8]:
tag_count_df.groupby('user_id').count().shape

(555, 2)

### Create an evaluation set

In [9]:
train_df, test_df = train_test_split(tag_count_df, test_size=0.33, stratify=tag_count_df['user_id'], 
                                     random_state=2019)

In [10]:
train_df.head()

Unnamed: 0,user_id,tag_id,tag_count
3451,001fcc3fbbb0018342194d97453fcac9431b8b8d,911233d3bd7439f816b59c4d0358d5b3d1180db3,1
7677,0046db9c93d12e84b83fb852e48c6ea478d0cf9f,849559737e3aa045f936886bd90aa43633e26172,1
7691,0046eba9057fe724911c1dfb7e1d89efe4180912,b576bce562c9ac7ae1a2b6d8d17b9875f9c1d476,1
7120,0040a6b97d2e61fef5eb404baf24867a4d3448a5,8be5f988077faab8b95e481a7a13f9d895db14f2,1
4756,0028c76d3882ac1de97c710a5c1ec65a54174de7,4ae75b3f956a77966e595767e2a456f00bcd835b,1


In [11]:
train_user_tags_df = train_df.pivot(index='user_id', columns='tag_id', values='tag_count').fillna(0)

In [12]:
test_df = test_df.set_index('user_id')

In [13]:
test_df.head()

Unnamed: 0_level_0,tag_id,tag_count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
002c887f87f2741e34bdabad89b622389dd6b6c8,d51cd4cbf68c39cc679c84fb93b7180757e895d6,1
0058eecfe79ec9c75f7085fd8d80ce796a4fdf6e,9dec895921afac3b79d79caed6528fa0cef16b85,1
00322da4f93da9932f49e5f346beae6e72475c88,9a326d8e613a25001392cc42903bcfe8cdc1e5ff,1
0044a0c70865331db0adf7f350dea93442b869fd,2b0545bd678d20da70950fd780cf371f64946ab7,1
003508ad2971d0413183983061bdb3c2a455d88a,3515d80ed2a0ce83f79afeb031349c261d95f707,1


## Collaborative model engine with matrix decomposition

In [14]:
def matrix_decomposition(n_factors):
    u, sigma, v = svds(train_user_tags_df.values, k=n_factors)
    sigma = np.diag(sigma)
    user_predictions = np.dot(np.dot(u, sigma), v)
    return pd.DataFrame(user_predictions, columns=train_user_tags_df.columns, index=train_user_tags_df.index)

In [15]:
predictions_df = matrix_decomposition(n_factors = 30)
predictions_df.head()

tag_id,00410345e6d60633a211ebd3755d5c89ea7b5297,005703ee98894846cde759fbe88f3d7fde830c85,0076e2a45d90991150032dbfaa574b4b7ab21177,0087f6286f5bd4f872620555b3e3b880e21de444,008a19c4e6b27ade78d422f9deaba16ef195772b,0108e35fbb3bb7e60c2045386294914255f137eb,01c1f1173136b005f885c4691db374d4762f15e0,022f8f30c65aaeb13def1fb9d700c1937e30da06,0245aa04713eb538ebcb6d6c5667a55f5920d535,02a5da421359cb69816444a48f35c0cb01806ca6,...,fd9b6d215102521471bdd20b88eef7e471a5f08c,fde652531ae50c7def995f5c5c2b067cd9bdc9de,fe0f9fd1fdfe652523ed4a3a9e57375301bf7144,fe2297da6fbc7992a934f8ce4c848584d73bd1c3,fe5cd317975ca5dada3d9d047133f1925e460053,fe9cd8d22101c48eca5bc3eee51b9dc5e07fe683,fe9e7151b6ae2070c053a80fa37862cfef449cae,fed8505b88adf6d879b4df147b29a068d98faa72,ff0257af2bc7c28cd397a820aa33cde0d04b58b8,ff0d3fb21c00bc33f71187a2beec389e9eff5332
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000055a78bf6735c4a89358fab1de34104c3cb,-0.002766864,0.01983152,0.0006257567,0.00451168,-0.00247183,-0.003873215,-0.00243934,0.006347881,0.007761667,0.005169462,...,-0.002215744,-0.005257891,-0.005692199,-0.001005089,-0.003894122,3.28162e-05,0.003521641,0.0008114178,-0.008013274,0.01922221
000014674d2afbd30b4a89e7f917b67ade3c31c4,0.0002252433,0.007157366,0.001464573,0.001372593,-0.002531907,0.0001173938,0.0007325514,0.003867707,0.0001712029,-0.002338885,...,-0.0005490988,0.002696509,-0.001377479,-0.0009073557,0.0006164411,5.943258e-05,-0.001013057,-0.003521255,0.005958806,0.0008790847
00001dbe00e56fc4b1c1b65dda63de2a5ece55f9,-0.002236487,0.0009305947,-0.00103281,-0.003682103,0.01232474,-0.01220585,0.001645196,0.0497694,-0.009707537,0.006842038,...,0.03547152,0.0009021001,-0.002038891,-0.004448768,0.004484566,-0.0005378293,0.02065431,0.01694823,0.0010571,0.01005686
00009c599dca6873a47404fa9b7a9b0a1bd13049,0.02226605,0.003202278,0.006859005,0.02563752,-0.001566725,-0.001031952,0.02435927,0.03533251,-0.003126543,0.01172076,...,-0.0232789,-0.001872252,-0.04260436,0.01248163,0.03923148,-0.003448356,0.02389488,0.06294212,-0.004532238,0.00119329
0000a8c248465bc8cd907e112b5c2f1e94424f28,-1.544546e-18,4.7285509999999996e-20,-1.344747e-18,-3.042988e-18,-5.218651e-19,2.443752e-18,9.66456e-19,3.65847e-19,-2.581234e-18,2.437094e-18,...,6.805206e-18,-2.5900709999999996e-19,-2.3796369999999997e-19,7.630618999999999e-19,2.552223e-18,2.878806e-21,4.423756e-18,3.778049e-18,-5.013697999999999e-19,-6.446025e-19


In [16]:
def get_user_predictions(user, predictions, n_preds, not_seen=True):
    """
    user: str user id
    predictions: pd.DataFrame of user_id/tag_id with predictions 'score'
    n_preds: int number of predictions to return
    not_seen: bool flag to return user seen predictions or not
    """
    user_predictions = predictions.loc[user].sort_values(ascending=False).reset_index()
    user_products = data[data['user_id']==user]['tag_id']
    if not_seen:
        user_predictions = user_predictions[~user_predictions['tag_id'].isin(user_products)]
    return user_predictions[:n_preds]

**Note**: The precision metric here is a bit useless, si nce it will highly depend on how many tags we want to show, and how many tags the user has seen which could be less than those we want to show, and therefore lowering the metric.

In [17]:
def eval_model(predictions, test):
    recall_list = []
    precision_list = []
    for i, user_id in enumerate(predictions.index.unique().values):
        test_user_tags = test.loc[user_id]['tag_id']
        user_preds = get_user_predictions(user_id, predictions, 10, False)
        # Match tags in test with those in the predictions
        matched_tags_bool = user_preds['tag_id'].isin(pd.Series(test_user_tags))
        
        true_positives = matched_tags_bool.sum()
        if type(test_user_tags) == str:
            user_recall = true_positives / (true_positives + 1 - true_positives) # sometimes it will get only 1 element which is a string
            user_precision = true_positives / (1 + len(user_preds) - true_positives)
        else:
            user_recall = true_positives / (true_positives + len(test_user_tags) - true_positives)
            user_precision = true_positives / (len(test_user_tags) + len(user_preds) - true_positives)
        recall_list.append(user_recall)
        precision_list.append(user_precision)
        
    recall = sum(recall_list)/test.shape[0]
    precision = sum(precision_list)/test.shape[0]
    print("Recall:", recall*100)
    print("Precision:", precision*100)

        
eval_model(predictions_df, test_df)

Recall: 2.131267282196077
Precision: 0.5635654652483083


In [18]:
preds_15_df = matrix_decomposition(n_factors = 15)
eval_model(preds_15_df, test_df)

Recall: 1.6832546230632066
Precision: 0.5132753391297613


In [19]:
preds_50_df = matrix_decomposition(n_factors = 50)
eval_model(preds_50_df, test_df)

Recall: 2.5653930499902065
Precision: 0.6030131716814621


In [20]:
preds_50_df = matrix_decomposition(n_factors = 60)
eval_model(preds_50_df, test_df)

Recall: 2.599453118410464
Precision: 0.5956516307638045


## Collaborative model engine with cosine distance

In [21]:
def cosine_distance_model():
    item_similarity = pairwise_distances(train_user_tags_df.values.T, metric='cosine')
    user_predictions = train_user_tags_df.values.dot(item_similarity)/np.abs(item_similarity.sum(axis=1))
    # Here we are dealing with distances, so the lower the distance the best, which is the inverse of the SDV 
    user_predictions = 1 / (1 + user_predictions)
    return pd.DataFrame(user_predictions, columns=train_user_tags_df.columns, index=train_user_tags_df.index)

In [22]:
cosine_predictions_df = cosine_distance_model()
cosine_predictions_df.head()

tag_id,00410345e6d60633a211ebd3755d5c89ea7b5297,005703ee98894846cde759fbe88f3d7fde830c85,0076e2a45d90991150032dbfaa574b4b7ab21177,0087f6286f5bd4f872620555b3e3b880e21de444,008a19c4e6b27ade78d422f9deaba16ef195772b,0108e35fbb3bb7e60c2045386294914255f137eb,01c1f1173136b005f885c4691db374d4762f15e0,022f8f30c65aaeb13def1fb9d700c1937e30da06,0245aa04713eb538ebcb6d6c5667a55f5920d535,02a5da421359cb69816444a48f35c0cb01806ca6,...,fd9b6d215102521471bdd20b88eef7e471a5f08c,fde652531ae50c7def995f5c5c2b067cd9bdc9de,fe0f9fd1fdfe652523ed4a3a9e57375301bf7144,fe2297da6fbc7992a934f8ce4c848584d73bd1c3,fe5cd317975ca5dada3d9d047133f1925e460053,fe9cd8d22101c48eca5bc3eee51b9dc5e07fe683,fe9e7151b6ae2070c053a80fa37862cfef449cae,fed8505b88adf6d879b4df147b29a068d98faa72,ff0257af2bc7c28cd397a820aa33cde0d04b58b8,ff0d3fb21c00bc33f71187a2beec389e9eff5332
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000055a78bf6735c4a89358fab1de34104c3cb,0.994815,0.995689,0.994745,0.994867,0.994706,0.994721,0.994748,0.994716,0.994974,0.994776,...,0.994722,0.994913,0.994751,0.994762,0.994871,0.99496,0.99473,0.994782,0.994918,0.995171
000014674d2afbd30b4a89e7f917b67ade3c31c4,0.993785,0.994361,0.993701,0.993846,0.993653,0.993672,0.993704,0.993801,0.993766,0.993738,...,0.993673,0.994354,0.993708,0.99372,0.993852,0.993958,0.993683,0.993925,0.99411,0.993833
00001dbe00e56fc4b1c1b65dda63de2a5ece55f9,0.99792,0.997961,0.9981,0.99794,0.998154,0.998039,0.998055,0.998373,0.997913,0.998065,...,0.998347,0.997959,0.997988,0.998054,0.998054,0.997978,0.998255,0.998072,0.997961,0.998048
00009c599dca6873a47404fa9b7a9b0a1bd13049,0.979654,0.979187,0.979387,0.979635,0.979166,0.978754,0.979388,0.979343,0.978883,0.979796,...,0.979164,0.978976,0.978511,0.979543,0.979694,0.979169,0.979751,0.979471,0.978997,0.979219
0000a8c248465bc8cd907e112b5c2f1e94424f28,0.99792,0.997961,0.997891,0.99794,0.997875,0.997882,0.997893,0.99788,0.997913,0.997904,...,0.997882,0.997959,0.997894,0.997898,0.997942,0.997978,0.997885,0.997906,0.997961,0.997936


In [23]:
get_user_predictions(cosine_predictions_df.index[0], cosine_predictions_df, 10)

Unnamed: 0,tag_id,00000055a78bf6735c4a89358fab1de34104c3cb
5,7920408a188a51d437a48a707f9e71277c9d8335,0.996253
6,e531c1dad33434c9d7a323f7928cae9cf04f9f7a,0.996221
7,a75c8e7b72c5e7d363911e1b02932b0563fa5a6a,0.996176
8,3f736ea31dc289439c2868ef54b0fcb8ea3be3b9,0.996087
9,6979b2dff5cd1530f7fe498d96efb1bff3b549cf,0.996036
10,9f5cd26abfc96a97f8ee874d132c526a0fccb382,0.995998
11,a4f57d548918c1aabeec80a99078853e849bd65c,0.995918
12,ceb10adfb4b5b50f5bd2e01d1cc797c71e01a77c,0.995893
13,3e4d8d24daf15692515999d4c8809eac1a3ee55c,0.995891
14,29d400c6bada3de9543bcd931729848b5a95cdd6,0.995847


In [24]:
eval_model(cosine_predictions_df, test_df)

Recall: 2.9036334913112163
Precision: 0.4884231092761901


## Collaborative model engine with euclidean distance

In [25]:
def euclidean_distance_model():
    item_similarity = pairwise_distances(train_user_tags_df.values.T, metric='euclidean')
    user_predictions = train_user_tags_df.values.dot(item_similarity)/np.abs(item_similarity.sum(axis=1))
    user_predictions = 1 / (1 + user_predictions)
    return pd.DataFrame(user_predictions, columns=train_user_tags_df.columns, index=train_user_tags_df.index)

In [26]:
euclidean_predictions_df = euclidean_distance_model()
euclidean_predictions_df.head()

tag_id,00410345e6d60633a211ebd3755d5c89ea7b5297,005703ee98894846cde759fbe88f3d7fde830c85,0076e2a45d90991150032dbfaa574b4b7ab21177,0087f6286f5bd4f872620555b3e3b880e21de444,008a19c4e6b27ade78d422f9deaba16ef195772b,0108e35fbb3bb7e60c2045386294914255f137eb,01c1f1173136b005f885c4691db374d4762f15e0,022f8f30c65aaeb13def1fb9d700c1937e30da06,0245aa04713eb538ebcb6d6c5667a55f5920d535,02a5da421359cb69816444a48f35c0cb01806ca6,...,fd9b6d215102521471bdd20b88eef7e471a5f08c,fde652531ae50c7def995f5c5c2b067cd9bdc9de,fe0f9fd1fdfe652523ed4a3a9e57375301bf7144,fe2297da6fbc7992a934f8ce4c848584d73bd1c3,fe5cd317975ca5dada3d9d047133f1925e460053,fe9cd8d22101c48eca5bc3eee51b9dc5e07fe683,fe9e7151b6ae2070c053a80fa37862cfef449cae,fed8505b88adf6d879b4df147b29a068d98faa72,ff0257af2bc7c28cd397a820aa33cde0d04b58b8,ff0d3fb21c00bc33f71187a2beec389e9eff5332
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000055a78bf6735c4a89358fab1de34104c3cb,0.994949,0.99541,0.994907,0.994975,0.994893,0.994884,0.994917,0.994885,0.995028,0.994931,...,0.99489,0.994994,0.994898,0.994923,0.994972,0.995018,0.994903,0.994923,0.994998,0.995137
000014674d2afbd30b4a89e7f917b67ade3c31c4,0.994602,0.994809,0.994339,0.994629,0.994469,0.994185,0.994567,0.994129,0.994458,0.994583,...,0.994258,0.995042,0.994267,0.994574,0.994451,0.994768,0.994422,0.994316,0.994627,0.994439
00001dbe00e56fc4b1c1b65dda63de2a5ece55f9,0.997159,0.99726,0.997491,0.997173,0.997356,0.997603,0.997226,0.997931,0.997286,0.997234,...,0.99772,0.996982,0.99749,0.997227,0.997423,0.997109,0.997486,0.997602,0.997319,0.997417
00009c599dca6873a47404fa9b7a9b0a1bd13049,0.977724,0.97765,0.978102,0.977687,0.977648,0.978058,0.977549,0.978645,0.977626,0.977776,...,0.978102,0.976915,0.977747,0.977656,0.978182,0.977269,0.978092,0.978414,0.977681,0.97788
0000a8c248465bc8cd907e112b5c2f1e94424f28,0.998639,0.998574,0.998391,0.998646,0.998543,0.998252,0.998631,0.998154,0.998492,0.998635,...,0.998321,0.998914,0.998324,0.998632,0.998453,0.998765,0.998482,0.998285,0.998511,0.99845


In [27]:
eval_model(euclidean_predictions_df, test_df)

Recall: 3.0084254870984717
Precision: 0.5127284511170768


### Final comments:
- Euclidean distance model showed the best performance of all, but close to the cosine.
- Matrix decomposition is a good approach for high dimentional datasets, but underperforms against simple distance metrics
- A better evaluation scheme should be used to draw final conclusions, for instance, with fold cross validation