# LightFM 範例

In [1]:
import numpy as np
import pandas as pd

In [2]:
rating_df = pd.read_csv('../ecommerce-dataset/events_small.csv')
rating_df.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221373311,781127,view,21989,
1,1433222147345,1076270,view,262799,
2,1433221380636,849453,view,123990,
3,1433223176926,629333,view,128394,
4,1433222897013,492414,view,279976,


In [3]:
# 將數劇集按照時間大小由小排到大
# sort data frame by timestemp for splitting
rating_df = rating_df.sort_values('timestamp')
rating_df.head(10)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
34513,1430622118534,584571,view,436195,
34476,1430622162554,837890,view,2519,
34471,1430622330806,990356,view,369532,
34484,1430622469247,584571,view,436195,
34470,1430622609378,1002397,view,77392,
34514,1430622790487,1375898,view,64152,
34472,1430622933406,823085,view,131879,
34477,1430623098261,1061274,view,356129,
34485,1430623224021,823085,view,214519,
34473,1430623241016,122517,view,232129,


In [4]:
# map user id and movie id to integer starting from 0 to N (num of users) and M (num of items)
from sklearn.preprocessing import LabelEncoder
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

rating_df['visitorid'] = user_encoder.fit_transform(rating_df.visitorid)
rating_df['itemid'] = item_encoder.fit_transform(rating_df.itemid)

In [5]:
num_users = rating_df.visitorid.max()+1
num_items = rating_df.itemid.max()+1
num_users, num_items

(27372, 9098)

In [6]:
# 按照visitorid分組，呼叫rank函數可以得到該點擊是該使用者第幾個點擊，這裡用asecending=False使最後一次點擊的appearance=1
# group by visitorid and call "rank" to know the number of click for that user. We set ascending=false so that the
# last click has appearance = 1

rating_df['appearance'] = rating_df.groupby('visitorid').timestamp.rank(ascending=False)
rating_df.head(15)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,appearance
34513,1430622118534,11315,view,8460,,2.0
34476,1430622162554,16213,view,69,,1.0
34471,1430622330806,19226,view,7229,,2.0
34484,1430622469247,11315,view,8460,,1.0
34470,1430622609378,19479,view,1573,,2.0
34514,1430622790487,26737,view,1315,,1.0
34472,1430622933406,15931,view,2638,,3.0
34477,1430623098261,20675,view,6962,,1.0
34485,1430623224021,15931,view,4263,,2.0
34473,1430623241016,2310,view,4602,,4.0


In [7]:
# train / val split
train_df = rating_df.loc[rating_df.appearance>1]
val_df = rating_df.loc[rating_df.appearance==1]
train_df.shape, val_df.shape

((38766, 6), (27371, 6))

In [8]:
# 移除重複的(visitorid,itemid) pair
# remove duplicate (visitorid,itemid) pair

train_df.drop_duplicates(['visitorid','itemid'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [9]:
from scipy.sparse import csr_matrix
# 建立 user-item 評分矩陣
# construct user-item rating matrix
user2item = csr_matrix( (np.ones(len(train_df)), (train_df.visitorid.values ,train_df.itemid.values)), shape=(num_users, num_items), dtype=np.float32 )
user2item.sum()

22743.0

In [10]:
from lightfm import LightFM
# 訓練 LightFM
# training

model = LightFM(no_components=32, loss='warp')
model.fit(interactions=user2item,epochs=150)



<lightfm.lightfm.LightFM at 0x1105afe10>

In [11]:
'''
Credit to https://gist.github.com/bwhite/3726239
'''
def mean_reciprocal_rank(rs):
    """Score is reciprocal of the rank of the first relevant item
    First element is 'rank 1'.  Relevance is binary (nonzero is relevant).
    Example from http://en.wikipedia.org/wiki/Mean_reciprocal_rank
    >>> rs = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.61111111111111105
    >>> rs = np.array([[0, 0, 0], [0, 1, 0], [1, 0, 0]])
    >>> mean_reciprocal_rank(rs)
    0.5
    >>> rs = [[0, 0, 0, 1], [1, 0, 0], [1, 0, 0]]
    >>> mean_reciprocal_rank(rs)
    0.75
    Args:
        rs: Iterator of relevance scores (list or numpy) in rank order
            (first element is the first item)
    Returns:
        Mean reciprocal rank
    """
    rs = (np.asarray(r).nonzero()[0] for r in rs)
    return np.mean([1. / (r[0] + 1) if r.size else 0. for r in rs])


In [12]:
# 將validation set中每一個使用者和物品的id拿出來
# obtain the user and item ids in the validation set
val_user_ids = val_df.visitorid.values
val_gts = val_df.itemid.values

In [13]:
from tqdm import tqdm
def evaluate_prediction(model, val_user_ids, val_gts):
    '''
    Return the average mrr for each users
    args:
        predictions: np.array user-item predictions
    returns:
        ndcg: float, computed MRR
    '''

    hits = []
    
    # 迴圈跑過validation set每一個使用者和物品的pair
    # iterate over user and item pair in the validation set
    for target_user, val_gt in tqdm(zip(val_user_ids, val_gts)):

        
        predictions = model.predict(user_ids= np.array([target_user] * num_items), item_ids=np.arange(num_items) )
        rankings = np.flip(np.argsort(predictions))
        hits.append(rankings == val_gt)
        
    mrr = mean_reciprocal_rank(hits)

    return mrr
mrr = evaluate_prediction(model, val_user_ids,val_gts)
mrr

27371it [00:56, 481.16it/s]


0.29570898917115823