# Documentation

This notebook builds a self-contained prediction function. On every call the function rebuilds the customer/article ID mappings and retrains the model from scratch before producing recommendations.

In [1]:
import dill as pickle

def save_pkl(object_to_store, name: str, path: str = './'):
    """
    Serialise an object to ``<path>/<name>.pkl``.

    :param object_to_store: object to pickle
    :param name: file name of the pickle, without the ``.pkl`` extension
    :param path: directory to save the pickle in
    """
    import os

    # os.path.join avoids the doubled separator that plain string formatting
    # produced for the default './' and keeps an empty path relative to the
    # working directory instead of pointing at the filesystem root.
    target = os.path.join(path, '{}.pkl'.format(name))
    with open(target, 'wb') as file:
        pickle.dump(object_to_store, file)

def local_load_pkl_model(model_name: str, path: str = './'):
    """
    Load a pickled object from ``<path>/<model_name>.pkl``.

    Unpickling can execute arbitrary code, so only load files from a trusted source.

    :param model_name: name of pickled model to load, without the ``.pkl`` extension
    :param path: directory that contains the pickled model
    :return: the unpickled object
    """
    import os

    # Build the location with os.path.join so that './' does not yield './/x.pkl'
    # and an empty path stays relative to the working directory.
    source = os.path.join(path, '{}.pkl'.format(model_name))
    with open(source, 'rb') as file:
        return pickle.load(file)

## Load Data

In [2]:
from fastavro import parse_schema, json_reader
from pandas import DataFrame
import pandas as pd
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.width', 10000)

In [3]:
# Parsed Avro record schemas, keyed by collection name; load() looks them up here.
schemas = {}

# --- article collection ---
schema = dict(
    doc='Article collection schema',
    name='Article',
    namespace='test',
    type='record',
    fields=[
        dict(name=field_name, type=avro_type)
        for field_name, avro_type in (
            ('article_id', 'int'),
            ('product_code', 'int'),
            ('prod_name', 'string'),
            ('product_type_no', 'int'),
            ('product_type_name', 'string'),
            ('product_group_name', 'string'),
            ('graphical_appearance_no', 'int'),
            ('graphical_appearance_name', 'string'),
            ('colour_group_code', 'int'),
            ('colour_group_name', 'string'),
            ('perceived_colour_value_id', 'int'),
            ('perceived_colour_value_name', 'string'),
            ('perceived_colour_master_id', 'int'),
            ('perceived_colour_master_name', 'string'),
            ('department_no', 'int'),
            ('department_name', 'string'),
            ('index_code', 'string'),
            ('index_name', 'string'),
            ('index_group_no', 'int'),
            ('index_group_name', 'string'),
            ('section_no', 'int'),
            ('section_name', 'string'),
            ('garment_group_no', 'int'),
            ('garment_group_name', 'string'),
            ('detail_desc', 'string'),
            ('article_url', 'string'),
        )
    ],
)
schemas['article'] = parse_schema(schema)

# --- customer collection ---
schema = dict(
    doc='Customer collection schema',
    name='Customer',
    namespace='test',
    type='record',
    fields=[
        dict(name=field_name, type=avro_type)
        for field_name, avro_type in (
            ('customer_id', 'string'),
            ('FN', 'string'),
            ('Active', 'string'),
            ('club_member_status', 'string'),
            ('fashion_news_frequency', 'string'),
            ('age', 'int'),
            ('postal_code', 'string'),
        )
    ],
)
schemas['customer'] = parse_schema(schema)

# --- transaction collection ---
# NOTE(review): 'price' is declared as 'int' - confirm the collection file
# really stores integer prices.
schema = dict(
    doc='Transaction collection schema',
    name='Transaction',
    namespace='test',
    type='record',
    fields=[
        dict(name=field_name, type=avro_type)
        for field_name, avro_type in (
            ('t_dat', 'string'),
            ('customer_id', 'string'),
            ('article_id', 'int'),
            ('price', 'int'),
            ('sales_channel_id', 'long'),
        )
    ],
)
schemas['transaction'] = parse_schema(schema)

def load(collection):
    """
    Read every record of one collection from ``./collections/<collection>.json``.

    :param collection: collection name; selects both the file to read and the
        entry of the module-level ``schemas`` dict used to decode it
    :return: list with all records yielded by fastavro's ``json_reader``
    """
    # JSON text is UTF-8; without an explicit encoding open() falls back to the
    # platform's locale encoding, which can mis-decode non-ASCII text.
    with open('./collections/' + collection + '.json', 'r', encoding='utf-8') as fo:
        # Materialise the reader while the file is still open.
        return list(json_reader(fo, schemas[collection]))


# Payload for the prediction function: a scalar organisation id plus each
# collection's records wrapped in a one-element list.
data = {
    'org_id': 1,  # organization id (added by the "loader")
    **{
        collection_name: [load(collection_name)]
        for collection_name in ('article', 'customer', 'transaction')
    },
}

## Create feature_names.pkl

In [4]:
# Names of the collections that the prediction function reads from its input.
feature_names = [
    'customer',
    'transaction',
    'article',
]

In [5]:
# Persist the collection names as feature_names.pkl in the default directory.
save_pkl(feature_names, "feature_names")

## Create prediction.pkl

In [6]:
def prediction(data):
    """
    Train a LightFM recommender on the supplied collections and recommend articles.

    The function is kept fully self-contained (all imports and helpers live
    inside the body) because it is pickled and later called from another cell
    or session, where notebook-level names may not exist.

    :param data: dict (or anything ``pandas.DataFrame`` accepts) providing the
        columns 'org_id', 'customer', 'article' and 'transaction'; each
        collection cell holds a list of record dicts, or the string
        representation of such a list
    :return: DataFrame with the columns 'customer_id' and 'prediction', where
        'prediction' is a space-separated string of the K best-scoring article ids
    """
    import os
    import ast
    import tqdm
    import numpy as np
    import pandas as pd
    from lightfm import LightFM
    from lightfm.data import Dataset

    SEED = 42
    np.random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)

    # default number of recommendations
    K = 10
    EPOCHS = 1
    # model learning rate
    LEARNING_RATE = 0.25
    # no of latent factors
    NO_COMPONENTS = 20
    # Regularisation (item_alpha / user_alpha) and num_threads are left at the
    # LightFM defaults.

    ## Prepare Dataset

    # Use the locally imported pandas; a bare `DataFrame` would only resolve if
    # the calling session happened to import that name globally.
    frame = pd.DataFrame(data)

    def _explode(collection):
        """Flatten the nested records of one collection into one row per record."""
        # Drop missing payloads first, and only parse cells that arrive as text;
        # cells that already hold record lists are used as they are.
        nested = frame[collection].dropna().apply(
            lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
        )
        records = pd.concat([pd.DataFrame(rows) for rows in nested], keys=nested.index)
        return (
            frame[['org_id']]
            .join(records.reset_index(level=1, drop=True))
            .reset_index(drop=True)
        )

    customer = _explode('customer')

    article = _explode('article')
    article['article_id'] = article['article_id'].astype(str)

    transaction = _explode('transaction')
    transaction['t_dat'] = pd.to_datetime(transaction['t_dat'])
    transaction['article_id'] = transaction['article_id'].astype(str)

    dataset = Dataset()
    dataset.fit(users=customer['customer_id'].unique(),
                items=article['article_id'].unique())

    num_users, num_topics = dataset.interactions_shape()
    print(f'Number of users: {num_users}, Number of topics: {num_topics}.')

    # Select the (user, item) pairs by column name rather than by position so
    # the result does not depend on the field order of the incoming records.
    interactions, _ = dataset.build_interactions(
        transaction[['customer_id', 'article_id']].values
    )

    light_fm = LightFM(loss='warp', no_components=NO_COMPONENTS,
                       learning_rate=LEARNING_RATE,
                       random_state=np.random.RandomState(SEED))
    light_fm.fit(interactions=interactions, epochs=EPOCHS, verbose=1)

    # uid_map: customer_id -> internal user index
    # iid_map: article_id  -> internal item index
    uid_map, _, iid_map, _ = dataset.mapping()
    inv_iid_map = {internal: external for external, internal in iid_map.items()}

    # Loop-invariant: the internal ids of every item that gets scored.
    item_ids = np.array(list(iid_map.values()))

    customer_ids = customer['customer_id'].tolist()
    user_indices = [uid_map[customer_id] for customer_id in customer_ids]
    print(len(user_indices))

    preds = []
    for user_index in tqdm.tqdm(user_indices, total=len(user_indices)):
        scores = light_fm.predict(np.full(len(item_ids), user_index), item_ids)
        # argsort yields positions within item_ids; translate them back to item
        # ids instead of assuming that position and id coincide.
        best_items = item_ids[np.argsort(-scores)[:K]]
        preds.append(' '.join(inv_iid_map[item] for item in best_items).strip())

    final_preds = pd.DataFrame(
        {'customer_id': customer_ids, 'prediction': preds},
        columns=['customer_id', 'prediction'],
    )
    return final_preds

# Pickle the prediction function itself as prediction.pkl in the default directory.
save_pkl(prediction, "prediction")

# Predictions ☂️

In [7]:
import dill as pickle

def save_pkl(object_to_store, name: str, path: str = './'):
    """
    Write an object to ``<path>/<name>.pkl`` using the module-level pickler.

    :param object_to_store: object to pickle
    :param name: file name of the pickle, without the ``.pkl`` extension
    :param path: directory to save the pickle in
    """
    import os

    # Joining with os.path.join keeps the default './' free of a doubled
    # separator and stops an empty path from resolving to the filesystem root.
    destination = os.path.join(path, '{}.pkl'.format(name))
    with open(destination, 'wb') as file:
        pickle.dump(object_to_store, file)

def local_load_pkl_model(model_name: str, path: str = './'):
    """
    Read back the object stored in ``<path>/<model_name>.pkl``.

    Unpickling can execute arbitrary code, so only load files from a trusted source.

    :param model_name: name of pickled model to load, without the ``.pkl`` extension
    :param path: directory that contains the pickled model
    :return: the unpickled object
    """
    import os

    # os.path.join prevents './/x.pkl' for the default path and keeps an empty
    # path relative to the working directory.
    location = os.path.join(path, '{}.pkl'.format(model_name))
    with open(location, 'rb') as file:
        return pickle.load(file)

## Load Data

In [8]:
from fastavro import parse_schema, json_reader
from pandas import DataFrame
import pandas as pd
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.width', 10000)

In [9]:
# Read the collection names back from feature_names.pkl.
feature_names = local_load_pkl_model('feature_names')
print('Loading collection features: {}'.format(feature_names))

Loading collection features: ['customer', 'transaction', 'article']


In [10]:
# Parsed Avro record schemas, keyed by collection name; load() looks them up here.
schemas = {}

# --- article collection ---
schema = dict(
    doc='Article collection schema',
    name='Article',
    namespace='test',
    type='record',
    fields=[
        dict(name=field_name, type=avro_type)
        for field_name, avro_type in (
            ('article_id', 'int'),
            ('product_code', 'int'),
            ('prod_name', 'string'),
            ('product_type_no', 'int'),
            ('product_type_name', 'string'),
            ('product_group_name', 'string'),
            ('graphical_appearance_no', 'int'),
            ('graphical_appearance_name', 'string'),
            ('colour_group_code', 'int'),
            ('colour_group_name', 'string'),
            ('perceived_colour_value_id', 'int'),
            ('perceived_colour_value_name', 'string'),
            ('perceived_colour_master_id', 'int'),
            ('perceived_colour_master_name', 'string'),
            ('department_no', 'int'),
            ('department_name', 'string'),
            ('index_code', 'string'),
            ('index_name', 'string'),
            ('index_group_no', 'int'),
            ('index_group_name', 'string'),
            ('section_no', 'int'),
            ('section_name', 'string'),
            ('garment_group_no', 'int'),
            ('garment_group_name', 'string'),
            ('detail_desc', 'string'),
            ('article_url', 'string'),
        )
    ],
)
schemas['article'] = parse_schema(schema)

# --- customer collection ---
schema = dict(
    doc='Customer collection schema',
    name='Customer',
    namespace='test',
    type='record',
    fields=[
        dict(name=field_name, type=avro_type)
        for field_name, avro_type in (
            ('customer_id', 'string'),
            ('FN', 'string'),
            ('Active', 'string'),
            ('club_member_status', 'string'),
            ('fashion_news_frequency', 'string'),
            ('age', 'int'),
            ('postal_code', 'string'),
        )
    ],
)
schemas['customer'] = parse_schema(schema)

# --- transaction collection ---
# NOTE(review): 'price' is declared as 'int' - confirm the collection file
# really stores integer prices.
schema = dict(
    doc='Transaction collection schema',
    name='Transaction',
    namespace='test',
    type='record',
    fields=[
        dict(name=field_name, type=avro_type)
        for field_name, avro_type in (
            ('t_dat', 'string'),
            ('customer_id', 'string'),
            ('article_id', 'int'),
            ('price', 'int'),
            ('sales_channel_id', 'long'),
        )
    ],
)
schemas['transaction'] = parse_schema(schema)

def load(collection):
    """
    Read every record of one collection from ``./collections/<collection>.json``.

    :param collection: collection name; selects both the file to read and the
        entry of the module-level ``schemas`` dict used to decode it
    :return: list with all records yielded by fastavro's ``json_reader``
    """
    # JSON text is UTF-8; without an explicit encoding open() falls back to the
    # platform's locale encoding, which can mis-decode non-ASCII text.
    with open('./collections/' + collection + '.json', 'r', encoding='utf-8') as fo:
        # Materialise the reader while the file is still open.
        return list(json_reader(fo, schemas[collection]))


# Payload for the prediction function: a scalar organisation id plus each
# collection's records wrapped in a one-element list.
data = {
    'org_id': 1,  # organization id (added by the "loader")
    **{
        collection_name: [load(collection_name)]
        for collection_name in ('article', 'customer', 'transaction')
    },
}

In [11]:
# Restore the pickled prediction function from prediction.pkl.
prediction_function = local_load_pkl_model('prediction')
print('Loading prediction function')

Loading prediction function


In [None]:
# Call the unpickled function on the loaded collections. The function pickled
# earlier in this notebook fits the model on every call, so this cell can take
# a while. The result is bound to `prediction` for the preview cell below.
print('Supply predcition function with collection data')
prediction = prediction_function(data)

Supply predcition function with collection data




Number of users: 10001, Number of topics: 50000.


Epoch: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.28it/s]


10001


 66%|███████████████████████████████████████████████████████████████████████████████████▏                                          | 6598/10001 [01:25<00:40, 83.60it/s]

In [None]:
# Preview the first rows of the object returned by the prediction function.
prediction.head()

## requirements.txt

The final step is to list every library that the pkl objects use. For example, the prediction function in this notebook imports **lightfm**, so that library must be declared. The **requirements.txt** file corresponds to the libraries and packages used in your environment:

lightfm <br>
numpy <br>
pandas <br>
tqdm <br>
dill

**For the example above this is straightforward: the contents of the file are the libraries listed above.**

Note that if you have some version dependency of a specific library make sure to define the specific version
required. For example:

lightgbm == 3.3.2 <br>
scikit-learn == 0.24.2 <br>
pandas == 1.3.4 <br>
dill

## Important Notes

Practical recommender systems need to be retrained periodically to refresh the model with new interaction data. To achieve high model fidelity, it is usually desirable to retrain the model on both historical and new data, since this accounts for both long-term and short-term user preferences.