In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse.linalg import svds

from sklearn.model_selection import train_test_split

data_path = '../data/21B_tag_views_dataset.csv'

# Collaborative Filtering

## Data preparation

Data loading and user/tag freq matrix

In [2]:
data = pd.read_csv(data_path)

In [3]:
data.head()

Unnamed: 0,id,user_id,tag_id,product_name
0,1,00000055a78bf6735c4a89358fab1de34104c3cb,e78de9dad70d230a096f0bbdc3e89b5cae04ba77,La Gar̤onne In Oro Rosa A Maglie/bianco
1,2,00000055a78bf6735c4a89358fab1de34104c3cb,b9a521730141de9bc4fe8ebc9f33713411d0101a,Fishnet Eco Bag
2,3,00000055a78bf6735c4a89358fab1de34104c3cb,8378136c6dd0e03be859a210a0cee03955951fb1,Collarino Essentielle In Oro Interamente A Esa...
3,4,00000055a78bf6735c4a89358fab1de34104c3cb,9cc68d8345f675892bcab0fad02f65b4ac7e71ea,Asos - Vestito A Fascia Con Fondo A Fisarmonica
4,5,00000055a78bf6735c4a89358fab1de34104c3cb,a8272c62cd05d5b882e4f630fb55cfa0ba8491e6,Peggy Off Shoulder Corset Top-white


In [4]:
tag_count_df = data.groupby(['user_id', 'tag_id']).agg({'tag_id': 'count'}).rename(columns={'tag_id':'tag_count'})
tag_count_df = tag_count_df.reset_index()

In [5]:
tag_count_df.head()

Unnamed: 0,user_id,tag_id,tag_count
0,00000055a78bf6735c4a89358fab1de34104c3cb,5c61cd1b82ec7a4d2918a6de99fcd1577b462f79,1
1,00000055a78bf6735c4a89358fab1de34104c3cb,8378136c6dd0e03be859a210a0cee03955951fb1,1
2,00000055a78bf6735c4a89358fab1de34104c3cb,9cc68d8345f675892bcab0fad02f65b4ac7e71ea,1
3,00000055a78bf6735c4a89358fab1de34104c3cb,a1437d6393ee9535248b16f27a649bbd98c9e2f5,1
4,00000055a78bf6735c4a89358fab1de34104c3cb,a8272c62cd05d5b882e4f630fb55cfa0ba8491e6,1


In [6]:
group_user_tag_count = tag_count_df.groupby('user_id').count()
print("number of users:", group_user_tag_count.shape[0])

number of users: 859


Remove users which have less than 3 tags

In [7]:
idx = group_user_tag_count[(group_user_tag_count['tag_count'] >= 3)].index
tag_count_df = tag_count_df[tag_count_df['user_id'].isin(idx)]

In [8]:
tag_count_df.groupby('user_id').count().shape

(555, 2)

### Create an evaluation set

In [9]:
train_df, test_df = train_test_split(tag_count_df, test_size=0.33, stratify=tag_count_df['user_id'], 
                                     random_state=2019)

In [10]:
train_df.head()

Unnamed: 0,user_id,tag_id,tag_count
3451,001fcc3fbbb0018342194d97453fcac9431b8b8d,911233d3bd7439f816b59c4d0358d5b3d1180db3,1
7677,0046db9c93d12e84b83fb852e48c6ea478d0cf9f,849559737e3aa045f936886bd90aa43633e26172,1
7691,0046eba9057fe724911c1dfb7e1d89efe4180912,b576bce562c9ac7ae1a2b6d8d17b9875f9c1d476,1
7120,0040a6b97d2e61fef5eb404baf24867a4d3448a5,8be5f988077faab8b95e481a7a13f9d895db14f2,1
4756,0028c76d3882ac1de97c710a5c1ec65a54174de7,4ae75b3f956a77966e595767e2a456f00bcd835b,1


In [11]:
train_user_tags_df = train_df.pivot(index='user_id', columns='tag_id', values='tag_count').fillna(0)

In [12]:
test_df = test_df.set_index('user_id')

In [19]:
test_df.head()

Unnamed: 0_level_0,tag_id,tag_count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
002c887f87f2741e34bdabad89b622389dd6b6c8,d51cd4cbf68c39cc679c84fb93b7180757e895d6,1
0058eecfe79ec9c75f7085fd8d80ce796a4fdf6e,9dec895921afac3b79d79caed6528fa0cef16b85,1
00322da4f93da9932f49e5f346beae6e72475c88,9a326d8e613a25001392cc42903bcfe8cdc1e5ff,1
0044a0c70865331db0adf7f350dea93442b869fd,2b0545bd678d20da70950fd780cf371f64946ab7,1
003508ad2971d0413183983061bdb3c2a455d88a,3515d80ed2a0ce83f79afeb031349c261d95f707,1


## Collaborative model engine with matrix decomposition

In [55]:
def matrix_decomposition(n_factors):
    u, sigma, v = svds(train_user_tags_df.values, k=n_factors)
    sigma = np.diag(sigma)
    user_predictions = np.dot(np.dot(u, sigma), v)
    return pd.DataFrame(user_predictions, columns=train_user_tags_df.columns, index=train_user_tags_df.index)

In [56]:
predictions_df = matrix_decomposition(n_factors = 30)
predictions_df.head()

tag_id,00410345e6d60633a211ebd3755d5c89ea7b5297,005703ee98894846cde759fbe88f3d7fde830c85,0076e2a45d90991150032dbfaa574b4b7ab21177,0087f6286f5bd4f872620555b3e3b880e21de444,008a19c4e6b27ade78d422f9deaba16ef195772b,0108e35fbb3bb7e60c2045386294914255f137eb,01c1f1173136b005f885c4691db374d4762f15e0,022f8f30c65aaeb13def1fb9d700c1937e30da06,0245aa04713eb538ebcb6d6c5667a55f5920d535,02a5da421359cb69816444a48f35c0cb01806ca6,...,fd9b6d215102521471bdd20b88eef7e471a5f08c,fde652531ae50c7def995f5c5c2b067cd9bdc9de,fe0f9fd1fdfe652523ed4a3a9e57375301bf7144,fe2297da6fbc7992a934f8ce4c848584d73bd1c3,fe5cd317975ca5dada3d9d047133f1925e460053,fe9cd8d22101c48eca5bc3eee51b9dc5e07fe683,fe9e7151b6ae2070c053a80fa37862cfef449cae,fed8505b88adf6d879b4df147b29a068d98faa72,ff0257af2bc7c28cd397a820aa33cde0d04b58b8,ff0d3fb21c00bc33f71187a2beec389e9eff5332
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000055a78bf6735c4a89358fab1de34104c3cb,-0.002766864,0.01983152,0.0006257567,0.00451168,-0.00247183,-0.003873215,-0.00243934,0.006347881,0.007761667,0.005169462,...,-0.002215744,-0.005257891,-0.005692199,-0.001005089,-0.003894122,3.28162e-05,0.003521641,0.0008114178,-0.008013274,0.01922221
000014674d2afbd30b4a89e7f917b67ade3c31c4,0.0002252433,0.007157366,0.001464573,0.001372593,-0.002531907,0.0001173938,0.0007325514,0.003867707,0.0001712029,-0.002338885,...,-0.0005490988,0.002696509,-0.001377479,-0.0009073557,0.0006164411,5.943258e-05,-0.001013057,-0.003521255,0.005958806,0.0008790847
00001dbe00e56fc4b1c1b65dda63de2a5ece55f9,-0.002236487,0.0009305947,-0.00103281,-0.003682103,0.01232474,-0.01220585,0.001645196,0.0497694,-0.009707537,0.006842038,...,0.03547152,0.0009021001,-0.002038891,-0.004448768,0.004484566,-0.0005378293,0.02065431,0.01694823,0.0010571,0.01005686
00009c599dca6873a47404fa9b7a9b0a1bd13049,0.02226605,0.003202278,0.006859005,0.02563752,-0.001566725,-0.001031952,0.02435927,0.03533251,-0.003126543,0.01172076,...,-0.0232789,-0.001872252,-0.04260436,0.01248163,0.03923148,-0.003448356,0.02389488,0.06294212,-0.004532238,0.00119329
0000a8c248465bc8cd907e112b5c2f1e94424f28,-2.918515e-18,-1.965776e-19,-4.0549039999999997e-19,1.673772e-18,1.737332e-18,-7.332031999999999e-19,9.005554e-19,1.0811610000000002e-17,6.250691e-20,1.493647e-18,...,3.394955e-18,-1.757823e-18,1.799081e-18,2.379094e-18,-4.639275e-19,-1.7712099999999998e-19,2.5804220000000002e-18,1.1936420000000002e-17,-3.085584e-18,-6.383953e-18


In [33]:
def get_user_predictions(user, predictions, n_preds, not_seen=True):
    """
    user: str user id
    predictions: pd.DataFrame of user_id/tag_id with predictions 'score'
    n_preds: int number of predictions to return
    not_seen: bool flag to return user seen predictions or not
    """
    user_predictions = predictions.loc[user].sort_values(ascending=False).reset_index()
    user_products = data[data['user_id']==user]['tag_id']
    if not_seen:
        user_predictions = user_predictions[~user_predictions['tag_id'].isin(user_products)]
    return user_predictions[:n_preds]

In [54]:
def eval_model(predictions, test):
    recall_list = []
    for i, user_id in enumerate(predictions.index.unique().values):
        test_user_tags = test.loc[user_id]['tag_id']
        user_preds = get_user_predictions(user_id, predictions, 15, False)
        matched_tags_bool = user_preds['tag_id'].isin(pd.Series(test_user_tags))
        
        if type(test_user_tags) == str:
            user_recall = matched_tags_bool.sum()  # sometimes it will get only 1 element which is a string
#             print(a.sum(), 1)
        else:
            user_recall = matched_tags_bool.sum()/len(test_user_tags)
#             print(a.sum(), len(test_user_tags))
        """ debug
        if i == 7:
            print('test user tags:')
            print(test_user_tags)
            print('predicted user tags:')
            print(user_preds)
        """

        recall_list.append(user_recall)
    recall = sum(recall_list)/test.shape[0]
    print(recall*100)


        
eval_model(predictions_df, test_df)

2.9097389389269432


In [57]:
preds_15_df = matrix_decomposition(n_factors = 15)
eval_model(preds_15_df, test_df)

2.469109184510183


In [58]:
preds_50_df = matrix_decomposition(n_factors = 50)
eval_model(preds_50_df, test_df)

3.2542263336102204


In [60]:
preds_50_df = matrix_decomposition(n_factors = 60)
eval_model(preds_50_df, test_df)

3.612987021754795
