In [1]:
# import all necessary libraries

import pandas as pd
import numpy as np

import tqdm

from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k
from lightfm.cross_validation import random_train_test_split



In [2]:
# Load datasets
df_trans = pd.read_csv('data/transactions_train.csv', dtype={'article_id': str}, parse_dates=['t_dat'])
df_user = pd.read_csv('data/customers.csv')
df_item = pd.read_csv('data/articles.csv', dtype={'article_id': str})

In [3]:
# Capture Seasonal Effect by limiting the transaction date to 5 weeks
df_5week = df_trans[df_trans['t_dat'] >= '2020-08-19'] 
df_5week

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
30488290,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,0784053005,0.050831,2
30488291,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,0784053005,0.050831,2
30488292,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,0926921001,0.033881,2
30488293,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,0868038001,0.050831,2
30488294,2020-08-19,00080403a669b3b89d1bef1ec73ea466d95e39698d6dde...,0868038001,0.050831,2
...,...,...,...,...,...
31788319,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,0929511001,0.059305,2
31788320,2020-09-22,fff2282977442e327b45d8c89afde25617d00124d0f999...,0891322004,0.042356,2
31788321,2020-09-22,fff380805474b287b05cb2a7507b9a013482f7dd0bce0e...,0918325001,0.043203,1
31788322,2020-09-22,fff4d3a8b1f3b60af93e78c30a7cb4cf75edaf2590d3e5...,0833459002,0.006763,1


In [4]:
# Initialize a LightFM Dataset() class

dataset = Dataset()
dataset.fit(users=df_user['customer_id'], 
            items=df_item['article_id'])

dataset.interactions_shape()

(1371980, 105542)

In [5]:
# Build the user-item interaction matrix

(interactions, weights) = dataset.build_interactions(df_5week.iloc[:, 1:3].values) 

In [6]:
# Split the interaction matrix into train and val (80% : 20%) using LightFM random_train_test_split()

u_i_train, u_i_val = random_train_test_split(interactions, test_percentage=0.2, random_state=1)
u_i_train, u_i_val

(<1371980x105542 sparse matrix of type '<class 'numpy.int32'>'
 	with 1040027 stored elements in COOrdinate format>,
 <1371980x105542 sparse matrix of type '<class 'numpy.int32'>'
 	with 260007 stored elements in COOrdinate format>)

In [7]:
%%time

# Train a LightFM model without user features and item features, and calculate the MAP@12
param = {'no_components': 200,
   'loss': 'warp',
   'learning_schedule': 'adagrad',
   'learning_rate': 0.025,
   'max_sampled': 10}

model = LightFM(**param, 
         random_state=1)
model.fit(u_i_train, # spase matrix representing whether user u and item i interacted
      epochs=100, verbose=1)
val_precision = precision_at_k(model, u_i_val, k=12).mean()
print('MAP@12: ', val_precision) 

Epoch: 100%|█████████████████████████████████████████████████████████████████████████| 100/100 [15:11<00:00,  9.12s/it]


MAP@12:  0.03190462
Wall time: 1h 14min 18s


***Looks so great!!!***

***Problem here: Because some customer may buy the same article at different time, the randomly selected train and test set may share some interactions. This will cause incorrect evaluation (data leakage???). One solution is to group by customers and articles, and aggregate on price or on count of purchases. In this way, we can eliminate repeated purchases, consequently the train and test data will not share any intersection. Otherwise it will raise an error when you include train_interactions in precision_at_k(). The error says "Test interactions matrix and train interactions matrix share 43426 interactions. This will cause incorrect evaluation, check your data split". If that happens, one has to set check_intersections=False for the code to run. However, the evaluation is still not right.***

**Now try to add user features and item features**

**User features**

In [8]:
# Preprocessing of df_user

df_user_cp = df_user.copy()
df_user_cp.drop('postal_code', inplace=True, axis=1) # drop postal_code, too high cardinality, also not useful for predicting purchase
df_user_cp.fillna('NA', inplace=True) # fill all missing value with NA
df_user_cp.fashion_news_frequency.replace('NONE', 'NA', inplace=True) # replace all remaining NONE, None with NA
df_user_cp.fashion_news_frequency.replace('None', 'NA', inplace=True)

In [9]:
# Function for mapping customers into age groups, replace NA with age group 3

def age_group(x):
    if x == 'NA':
        x = 3
    elif float(x)<20:
        x = 1
    elif float(x)>=20 and float(x)<40:
        x = 2
    elif float(x)>=40 and float(x)<60:
        x = 3
    elif float(x)>=60 and float(x)<80:
        x = 4
    else:
        x = 5
    return x

In [10]:
df_user_cp['age_group'] = df_user_cp.age.map(age_group)
df_user_cp.drop('age', axis=1, inplace=True)
df_user_cp

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age_group
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,,3
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,,2
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,,2
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,,3
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,3
...,...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,,,ACTIVE,,2
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,,,ACTIVE,,2
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,ACTIVE,Regularly,2
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,ACTIVE,Regularly,1


In [11]:
# create user feature name list

uf, cols, unique_f = [], [], []

for col in df_user_cp.columns[1:]:
    cols += [col]*len(df_user_cp[col].unique())
    unique_f += list(df_user_cp[col].unique())                  

for x,y in zip(cols, unique_f):
    res = str(x)+ ":" +str(y)
    uf.append(res)
    print(res)

FN:NA
FN:1.0
Active:NA
Active:1.0
club_member_status:ACTIVE
club_member_status:NA
club_member_status:PRE-CREATE
club_member_status:LEFT CLUB
fashion_news_frequency:NA
fashion_news_frequency:Regularly
fashion_news_frequency:Monthly
age_group:3
age_group:2
age_group:4
age_group:1
age_group:5


In [12]:
# Building user features

def user_feature_colon_value(my_list):
    """
    Takes as input a list and prepends the columns names to respective values in the list.
    For example: if my_list = [1,1,0,'del'],
    resultant output = ['f1:1', 'f2:1', 'f3:0', 'loc:del']
   
    """
    result = []
    ll = ['FN:', 'Active:', 'club_member_status:', 'fashion_news_frequency:', 'age_group:']
    aa = my_list
    for x,y in zip(ll,aa):
        res = str(x) +""+ str(y)
        result.append(res)
    return result

In [13]:
ad_subset = df_user_cp[['FN', 'Active', 'club_member_status', 'fashion_news_frequency', 'age_group']] 
ad_list = [list(x) for x in ad_subset.values]
user_feature_list = []
for item in ad_list:
    user_feature_list.append(user_feature_colon_value(item))
    
user_tuple = list(zip(df_user_cp.customer_id, user_feature_list))
user_tuple[0]

('00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657',
 ['FN:NA',
  'Active:NA',
  'club_member_status:ACTIVE',
  'fashion_news_frequency:NA',
  'age_group:3'])

**Item features**

Many article features have very high cardinality. Some of them can be groupped and actually they are groupped. For example, product_code, prod_name, product_type_no, and product_type_name are redundant and can be groupped as product_group_name. Some other features have similar problems. Based on this observation, I will choose **product_group_name, perceived_colour_value_name, index_group_name, garment_group_name** as the item features used for LightFM. detail_desc has too high cardinality, will not use it. May use NLP to process the detail_desc text in the future.

In [14]:
# create item feature name list

af, cols, unique_f = [], [], []

for col in ['product_group_name', 'perceived_colour_value_name', 'index_group_name', 'garment_group_name']:
    cols += [col]*len(df_item[col].unique())
    unique_f += list(df_item[col].unique())                  

for x,y in zip(cols, unique_f):
    res = str(x)+ ":" +str(y)
    af.append(res)
    print(res)

product_group_name:Garment Upper body
product_group_name:Underwear
product_group_name:Socks & Tights
product_group_name:Garment Lower body
product_group_name:Accessories
product_group_name:Items
product_group_name:Nightwear
product_group_name:Unknown
product_group_name:Underwear/nightwear
product_group_name:Shoes
product_group_name:Swimwear
product_group_name:Garment Full body
product_group_name:Cosmetic
product_group_name:Interior textile
product_group_name:Bags
product_group_name:Furniture
product_group_name:Garment and Shoe care
product_group_name:Fun
product_group_name:Stationery
perceived_colour_value_name:Dark
perceived_colour_value_name:Light
perceived_colour_value_name:Dusty Light
perceived_colour_value_name:Medium Dusty
perceived_colour_value_name:Bright
perceived_colour_value_name:Medium
perceived_colour_value_name:Undefined
perceived_colour_value_name:Unknown
index_group_name:Ladieswear
index_group_name:Baby/Children
index_group_name:Menswear
index_group_name:Sport
index_gro

In [15]:
# Building item features

def item_feature_colon_value(my_list):
    """
    Takes as input a list and prepends the columns names to respective values in the list.
    For example: if my_list = [1,1,0,'del'],
    resultant output = ['f1:1', 'f2:1', 'f3:0', 'loc:del']
   
    """
    result = []
    ll = ['product_group_name:', 'perceived_colour_value_name:', 'index_group_name:', 'garment_group_name:']
    aa = my_list
    for x,y in zip(ll,aa):
        res = str(x) +""+ str(y)
        result.append(res)
    return result

In [16]:
ad_subset = df_item[['product_group_name', 'perceived_colour_value_name', 'index_group_name', 'garment_group_name']] 
ad_list = [list(x) for x in ad_subset.values]
item_feature_list = []
for item in ad_list:
    item_feature_list.append(item_feature_colon_value(item))
    
item_tuple = list(zip(df_item.article_id, item_feature_list))
item_tuple[0]

('0108775015',
 ['product_group_name:Garment Upper body',
  'perceived_colour_value_name:Dark',
  'index_group_name:Ladieswear',
  'garment_group_name:Jersey Basic'])

In [17]:
# Create a new dataset instance, and call fit to supply userid, item id and/or user/item feature names

dataset = Dataset()
dataset.fit(
        users=df_user['customer_id'], # all the users
        items=df_item['article_id'], # all the items
        user_features = uf, # all user features
        item_features = af) # selected item features

dataset.interactions_shape()

(1371980, 105542)

In [18]:
# use dataset.build_user_features and build_item_features() to build the real user and item features

user_features = dataset.build_user_features(user_tuple, normalize=False)
item_features = dataset.build_item_features(item_tuple, normalize=False)
user_features, item_features

(<1371980x1371996 sparse matrix of type '<class 'numpy.float32'>'
 	with 8231880 stored elements in Compressed Sparse Row format>,
 <105542x105595 sparse matrix of type '<class 'numpy.float32'>'
 	with 527710 stored elements in Compressed Sparse Row format>)

In [19]:
%%time

# Train a model with user features and item features

param = {'no_components': 200,
   'loss': 'warp',
   'learning_schedule': 'adagrad',
   'learning_rate': 0.025,
   'max_sampled': 10}

model = LightFM(**param, 
         random_state=1)

model.fit(interactions=u_i_train, 
      user_features= user_features, # we have built the sparse matrix above
      item_features= item_features, # we have built the sparse matrix above
      epochs=100, verbose=1)

val_precision = precision_at_k(model, u_i_val, k=12, user_features=user_features, item_features=item_features).mean()
print(f' MAP@12: {val_precision}')

Epoch: 100%|█████████████████████████████████████████████████████████████████████████| 100/100 [30:34<00:00, 18.35s/it]


 MAP@12: 0.01114843599498272
Wall time: 2h 24min 37s


***Adding user features and item features greatly degrades the model performance. Will not use them.***

**Use tuned hyperparameters (see another notebook LightFM_2.ipynb) to build final model without user features and item features**

In [20]:
%%time

param = {'no_components': 400,
   'loss': 'warp',
   'learning_schedule': 'adagrad',
   'learning_rate': 0.025,
   'max_sampled': 15}

final_model = LightFM(**param, 
         random_state=1)
final_model.fit(u_i_train, # spase matrix representing whether user u and item i interacted
      epochs=100, verbose=1)
val_precision = precision_at_k(final_model, u_i_val, k=12).mean()
print('MAP@12: ', val_precision) 

Epoch: 100%|█████████████████████████████████████████████████████████████████████████| 100/100 [29:02<00:00, 17.43s/it]


MAP@12:  0.03363116
Wall time: 2h 16min 38s


### Prepare predictions for all the users and the submission file

In [21]:
df_sample = pd.read_csv('data/sample_submission.csv')
df_sample

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0706016001 0706016002 0372860001 0610776002 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0706016001 0706016002 0372860001 0610776002 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0706016001 0706016002 0372860001 0610776002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0706016002 0372860001 0610776002 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0706016001 0706016002 0372860001 0610776002 07...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0706016001 0706016002 0372860001 0610776002 07...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0706016001 0706016002 0372860001 0610776002 07...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0706016001 0706016002 0372860001 0610776002 07...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0706016001 0706016002 0372860001 0610776002 07...


In [22]:
df_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371980 entries, 0 to 1371979
Data columns (total 2 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   customer_id  1371980 non-null  object
 1   prediction   1371980 non-null  object
dtypes: object(2)
memory usage: 20.9+ MB


In [23]:
# We already have user_id_map, user_feature_map, item_id_map, item_feature_map

user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()
dataset.mapping()

({'00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657': 0,
  '0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa': 1,
  '000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318': 2,
  '00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c5feb1ca5dff07c43e': 3,
  '00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a': 4,
  '000064249685c11552da43ef22a5030f35a147f723d5b02ddd9fd22452b1f5a6': 5,
  '0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d8cd0c725276a467a2a': 6,
  '00007d2de826758b65a93dd24ce629ed66842531df6699338c5570910a014cc2': 7,
  '00007e8d4e54114b5b2a9b51586325a8d0fa74ea23ef77334eaec4ffccd7ebcc': 8,
  '00008469a21b50b3d147c97135e25b4201a8c58997f78782a0cc706645e14493': 9,
  '0000945f66de1a11d9447609b8b41b1bc987ba185a5496ae8831e8493afa24ff': 10,
  '000097d91384a0c14893c09ed047a963c4fc6a5c021044eec603b323e8c82d1d': 11,
  '00009c2aeae8761f738e4f937d9be6b49861a66339c2b1c3b1cc6e322729a370': 12,
  '00009d946eec3ea54add5ba56d5210ea898def4b46c68

In [24]:
# create inverse mappings

inv_user_id_map = {v:k for k, v in user_id_map.items()}
inv_item_id_map = {v:k for k, v in item_id_map.items()}

In [26]:
%%time

# Try to predict for one customer
user_id = user_id_map['00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657']
scores = final_model.predict(user_ids=user_id, item_ids=np.arange(u_i_train.shape[1]))
pred = np.argsort(scores)[::-1][:12]
pred_articles = [inv_item_id_map[p] for p in pred]
pred_articles

Wall time: 90.6 ms


['0568601043',
 '0762846031',
 '0568601006',
 '0779781015',
 '0858856005',
 '0568601044',
 '0762846027',
 '0728156024',
 '0568601023',
 '0568601030',
 '0568601007',
 '0568597007']

For one customer, it takes 90.6 ms for the prediction. For 1371980 customers, it will in theory take more than 34.5 hours but actually took about 25.5 hours (see another notebook).

In [25]:
def submit(model, df_sample, submission_name="Result_LightFM/submissions.csv"):
    preds = []
    for customer_id in df_sample['customer_id'].values:
        user_id = user_id_map[customer_id]
        scores = model.predict(user_ids=user_id, item_ids=np.arange(u_i_train.shape[1]))
        # sort() and argsort() in numpy are all in ascending order, so has to use [::-1] to convert to descending
        pred = np.argsort(scores)[::-1][:12]
        # map pred to article_id and convert to string type
        pred_str = ' '.join([inv_item_id_map[p] for p in pred]).strip()
        preds.append((customer_id, pred_str))
        
    df_preds = pd.DataFrame(preds, columns=['customer_id', 'prediction'])
    df_preds.to_csv(submission_name, index=False)
    
    display(df_preds.head())
    print(df_preds.shape)
    
    return df_preds

In [None]:
%%time

df_predictions = submit(final_model, df_sample)
df_predictions

***Takes 25 hours to run. Didn't run here in this notebook. See running result in LightFM_2.ipynb***

***When predicting, have to use a for loop to predict each customer one by one, cannot input a customer_id array, otherwise, will raise an AssertionError when the predict() method checking "assert len(user_ids) == len(item_ids)". See explanation here: https://github.com/lyst/lightfm/issues/226***

In [29]:
# Given an item, recommend similar items, see here: https://github.com/lyst/lightfm/issues/244
 
def similar_items(item_id, model, N=10):
    # Cosine similarity
    item_bias, item_embeddings = model.get_item_representations() 
    scores = item_embeddings.dot(item_embeddings[item_id, :])
    item_norms = np.linalg.norm(item_embeddings, axis=1)
    scores /= item_norms

    best = np.argpartition(scores, -N)[-N:]
    best_article_id = [inv_item_id_map[b] for b in best]
    return sorted(zip(best_article_id, scores[best] / item_norms[item_id]), 
                  key=lambda x: -x[1])

In [30]:
item_id = item_id_map['0778064038']
sim_items = similar_items(item_id=item_id, model=final_model)
sim_items

[('0778064038', 1.0000001),
 ('0778064041', 0.70167756),
 ('0778064045', 0.6579587),
 ('0778064001', 0.614941),
 ('0778064005', 0.598418),
 ('0800691013', 0.55788076),
 ('0778064033', 0.53323054),
 ('0800691016', 0.5317068),
 ('0778064051', 0.53089154),
 ('0778064029', 0.52106225)]

In [31]:
submitted = pd.read_csv('Result_LightFM/submissions.csv')
submitted

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0762846031 0568601006 0779781015 08...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0915526001 0751471001 0898694001 0915529003 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0915526001 0751471001 0898694001 0915529003 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0915526001 0751471001 0898694001 0915529003 09...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0915526001 0751471001 0898694001 0915529003 09...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0557599022 0740922009 0804992033 0791587015 07...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0915526001 0751471001 0898694001 0915529003 09...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0762846027 0794819001 0884081001 0689365050 08...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0915526001 0751471001 0898694001 0915529003 09...


***After submitting to Kaggle, got a private score of 0.01754 and public score of 0.01729. Not too bad.***

**Save all model and variables needed for future prediction**

In [37]:
import pickle

# Save the user and item mapping dictionaries for future prediction

with open('Result_LightFM/user_id_map.pkl', "wb") as out_file:
    pickle.dump(user_id_map, out_file)

with open('Result_LightFM/inv_user_id_map.pkl', "wb") as out_file:
    pickle.dump(inv_user_id_map, out_file)

with open('Result_LightFM/item_id_map.pkl', "wb") as out_file:
    pickle.dump(item_id_map, out_file)
    
with open('Result_LightFM/inv_item_id_map.pkl', "wb") as out_file:
    pickle.dump(inv_item_id_map, out_file)


In [49]:
# Save the final model for future prediction

with open('Result_LightFM/final_model.pkl', "wb") as out_file:
    pickle.dump(final_model, out_file)

***Please note, had problem dumping this model object because it is too big, and raises MemoryError. Had to wait for memory being released out, and closed some other programs. Then I was able to dump the model object.***

In [41]:
# import joblib

# filename = 'Result_LightFM/final_model.sav'
# joblib.dump(final_model, filename)  

# # # Can later load saved model
# # loaded_model = joblib.load(filename)

['Result_LightFM/final_model.sav']

***Using joblib.dump, it was really easy to dump the model object. However, had problem re-loading the saved model "MemoryError: Unable to allocate 2.04 GiB for an array with shape (548792000,) and data type float32".***

***Then I tried (see the end of this notebook): if I restart the computer and re-load the saved model using joblib.load(), I am able to load it quickly and successfully. So, joblib is actually faster and better for large object as long as you have enough memory.***

**Try to see if loading saved model will work or not**

In [50]:
with open('Result_LightFM/final_model.pkl', "rb") as in_file:
    loaded_model = pickle.load(in_file)

In [43]:
with open('Result_LightFM/user_id_map.pkl', "rb") as in_file:
    loaded_uidmap = pickle.load(in_file)

with open('Result_LightFM/inv_user_id_map.pkl', "rb") as in_file:
    loaded_inv_uidmap = pickle.load(in_file)

with open('Result_LightFM/item_id_map.pkl', "rb") as in_file:
    loaded_iidmap = pickle.load(in_file)

with open('Result_LightFM/inv_item_id_map.pkl', "rb") as in_file:
    loaded_inv_iidmap = pickle.load(in_file)

In [51]:
%%time

# Try to predict for one customer
user_id = loaded_uidmap['00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657']
scores = loaded_model.predict(user_ids=user_id, item_ids=np.arange(len(loaded_iidmap)))
pred = np.argsort(scores)[::-1][:12]
pred_articles = [loaded_inv_iidmap[p] for p in pred]
pred_articles

Wall time: 178 ms


['0568601043',
 '0762846031',
 '0568601006',
 '0779781015',
 '0858856005',
 '0568601044',
 '0762846027',
 '0728156024',
 '0568601023',
 '0568601030',
 '0568601007',
 '0568597007']

***The save model and dictionaries can be loaded and used to make predicitions.***

In [1]:
import joblib

filename = 'Result_LightFM/final_model.sav'
loaded_model = joblib.load(filename)



In [None]:
# If low memory, can also try to use the following code to delete all varaibles, clear the global namespace.
# %reset -f