In [1]:
import pandas as pd
import numpy as np
import datetime
from sklearn import preprocessing
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score
from lightfm.cross_validation import random_train_test_split 



In [2]:
def get_data():
    events = pd.read_csv("ecommerce-dataset/events.csv")
    events = events[:27561]
    events = events.assign(date=pd.Series(datetime.datetime.fromtimestamp(i//1000).date() for i in events.timestamp))
    events = events.sort_values('date').reset_index(drop=True)
    events = events[['visitorid', 'itemid', 'event', 'date']]
    start_date = '2015-06-01'
    end_date = '2015-06-03'
    fd = lambda x: datetime.datetime.strptime(x, '%Y-%m-%d').date()
    events = events[(events.date >= fd(start_date)) & (events.date <= fd(end_date))]
    
    
    cate_enc = preprocessing.LabelEncoder()
    #users = cate_enc.fit_transform(events['visitorid'])
    users = events['visitorid']
    users = pd.DataFrame(users)
    
    cate_enc = preprocessing.LabelEncoder()
    #items = cate_enc.fit_transform(events['itemid'])
    items = events['itemid']
    items = pd.DataFrame(items)
    
    cate_enc = preprocessing.LabelEncoder()
    events['event'] = cate_enc.fit_transform(events.event)
    
    #cate_enc = preprocessing.LabelEncoder()
    #events['visitorid'] = cate_enc.fit_transform(events.visitorid)
    
    #cate_enc = preprocessing.LabelEncoder()
    #events['itemid'] = cate_enc.fit_transform(events.itemid)
    
    
    #print(type(items),type(events),type(users))
    #print(items.head())
    #print(events.head())
    #print(users.head())
    return (events, items, users)


In [3]:
datatup = get_data()
def get_ratings():
    return datatup[0]

def get_book_features():
    return datatup[1]

def get_user_features():
    return datatup[2]

In [4]:
check = get_ratings()
check['itemid'].head(7).values



array([ 84160, 351450, 367588, 134698, 288795, 315756, 399491],
      dtype=int64)

## Building Dataset

In [5]:
dataset = Dataset()
dataset.fit((get_ratings().visitorid.values),
            (get_ratings().itemid.values))

num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))


dataset.fit_partial(users=(get_ratings().visitorid.values),
                    items=(get_ratings().itemid.values),
                    item_features=None,
                    user_features=None)

ratings = get_ratings()
split_point = np.int(np.round(ratings.shape[0]*0.8))
train = ratings.iloc[0:split_point]
test = ratings.iloc[split_point::]
test = test[(test['visitorid'].isin(train['visitorid'])) & (test['itemid'].isin(train['itemid']))]
 
    
(interactions, weights) = dataset.build_interactions(((train['visitorid'][ind], train['itemid'][ind],train['event'][ind])
                                                      for ind in train.index))

(test_interactions, test_weights) = dataset.build_interactions(((test['visitorid'][ind], test['itemid'][ind],test['event'][ind])
                                                      for ind in test.index))

print(repr(interactions))
print(repr(weights))

Num users: 16684, num_items 15848.
<16684x15848 sparse matrix of type '<class 'numpy.int32'>'
	with 22049 stored elements in COOrdinate format>
<16684x15848 sparse matrix of type '<class 'numpy.float32'>'
	with 22049 stored elements in COOrdinate format>


In [8]:
labels = np.unique([ratings['itemid'][ind] for ind in ratings.index])
print(len(labels))

15848


In [10]:
model = LightFM(no_components=10, loss='warp')

#(train, test) = random_train_test_split(interactions=interactions, test_percentage=0.2)
#(trainW, testW) = random_train_test_split(interactions=weights, test_percentage=0.2)

model.fit(interactions,sample_weight=weights, epochs=100, num_threads=1)

### model performnce evaluation
train.size

88196

In [64]:
dataset.mapping()

({1305393: 0,
  568149: 1,
  1166026: 2,
  1186298: 3,
  570614: 4,
  549695: 5,
  531661: 6,
  253694: 7,
  665281: 8,
  678672: 9,
  1136743: 10,
  279117: 11,
  852494: 12,
  130434: 13,
  1190789: 14,
  694154: 15,
  1172184: 16,
  569395: 17,
  253439: 18,
  1382935: 19,
  1402681: 20,
  11757: 21,
  329743: 22,
  106989: 23,
  1109619: 24,
  463825: 25,
  8002: 26,
  1368631: 27,
  1330849: 28,
  733586: 29,
  571170: 30,
  476106: 31,
  878877: 32,
  880719: 33,
  286616: 34,
  264190: 35,
  1128376: 36,
  680869: 37,
  945184: 38,
  108890: 39,
  1235292: 40,
  1178822: 41,
  772072: 42,
  934718: 43,
  962218: 44,
  594034: 45,
  1082453: 46,
  583501: 47,
  1407340: 48,
  1176899: 49,
  168062: 50,
  1319481: 51,
  1161163: 52,
  1066225: 53,
  253731: 54,
  206598: 55,
  509379: 56,
  648140: 57,
  843120: 58,
  1045411: 59,
  265603: 60,
  1236753: 61,
  928676: 62,
  44118: 63,
  492270: 64,
  75401: 65,
  825045: 66,
  598544: 67,
  1123525: 68,
  1080827: 69,
  389493: 7

In [36]:
train_precision = precision_at_k(model, weights, k=10).mean()
test_precision = precision_at_k(model, test_weights,k=10).mean()

train_auc = auc_score(model, weights).mean()
test_auc = auc_score(model, test_weights).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision*100, test_precision*100))
print('AUC: train %.2f, test %.2f.' % (train_auc*100, test_auc*100))

print("testing testing testing")

Precision: train 11.91, test 7.66.
AUC: train 99.79, test 88.90.
testing testing testing


In [52]:
# building manual dataset
users = [9672]
items = [41979,185570,19572,56361,160211,234255]
actions = [8,9,10,11,12,13]

print(actions)
print(items)


[8, 9, 10, 11, 12, 13]
[41979, 185570, 19572, 56361, 160211, 234255]


In [45]:
from scipy.sparse import coo_matrix
rate_mat = coo_matrix((actions,(users, items)), shape=(16684,15848))

ValueError: column index exceeds matrix dimensions

In [53]:
dataset.fit_partial(users=(users),
                    items=(items),
                    item_features=None,
                    user_features=None)

(new_interactions, new_weights) = dataset.build_interactions(((users[0], items[ind],actions[ind])
                                                      for ind in range(5)))

print(new_interactions, new_weights)

  (176, 26)	1
  (176, 27)	1
  (176, 28)	1
  (176, 29)	1
  (176, 30)	1   (176, 26)	8.0
  (176, 27)	9.0
  (176, 28)	10.0
  (176, 29)	11.0
  (176, 30)	12.0


In [54]:
model.fit(new_interactions,sample_weight=new_weights,epochs=100)

<lightfm.lightfm.LightFM at 0xc8aeed0>

In [37]:
model.predict_rank(new_interactions)

<16684x15848 sparse matrix of type '<class 'numpy.float32'>'
	with 5 stored elements in Compressed Sparse Row format>

In [55]:
ans = model.predict(9672,np.arange(len(labels)))
pred = pd.Series(ans,index=labels)
recommends = list(pd.Series(pred.sort_values(ascending=False).index))
print(recommends[:10])

[911, 860, 869, 907, 882, 116248, 65378, 374490, 35929, 141969]


In [51]:
import pickle
pickled = {
     'model' : model,
     'dataset': dataset
 }
pickle.dump(pickled,open('model2'+".p","wb"))

In [27]:
model.fit_partial(test_interactions,sample_weight=test_weights, epochs=2)

<lightfm.lightfm.LightFM at 0xc8aeed0>

In [33]:
train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test,k=10).mean()

train_auc = auc_score(model, train).mean()
test_auc = auc_score(model, test).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

print("testing testing testing")

ValueError: The user feature matrix specifies more features than there are estimated feature embeddings: 16685 vs 22049.

In [54]:
model.fit_partial(3347,)

  (13640, 3069)	1.0
  (3347, 4001)	1.0
  (4983, 5801)	1.0
  (9221, 9565)	1.0
  (4744, 1004)	1.0
  (9182, 9525)	1.0
  (15004, 14728)	1.0
  (12484, 12475)	1.0
  (8208, 8651)	1.0
  (13918, 13817)	1.0
  (13795, 1103)	1.0
  (1139, 1353)	1.0
  (8165, 8612)	1.0
  (1233, 5366)	1.0
  (6818, 14784)	1.0
  (3020, 3561)	1.0
  (4561, 5425)	1.0
  (5213, 8987)	1.0
  (4663, 5513)	1.0
  (9451, 9773)	1.0
  (6003, 6682)	1.0
  (1265, 1499)	1.0
  (4693, 3501)	1.0
  (9314, 9647)	1.0
  (9900, 1971)	1.0
  :	:
  (1708, 2014)	1.0
  (34, 10586)	1.0
  (1103, 2327)	1.0
  (10434, 10643)	1.0
  (476, 1224)	1.0
  (2834, 2058)	1.0
  (12979, 12934)	1.0
  (11592, 11679)	1.0
  (8794, 788)	1.0
  (4832, 4430)	1.0
  (3318, 3961)	1.0
  (8986, 9341)	1.0
  (5897, 8552)	1.0
  (670, 2295)	1.0
  (6128, 8878)	1.0
  (1321, 1566)	1.0
  (5938, 6622)	1.0
  (5260, 6035)	1.0
  (15041, 14260)	1.0
  (74, 9298)	1.0
  (11245, 11366)	1.0
  (15159, 14841)	1.0
  (4412, 5263)	1.0
  (34, 6369)	1.0
  (2857, 3287)	1.0
