# Documentation

This notebook builds a prediction algorithm from a plain script with rule-based logic; no trained model is involved.

### Define pickle function

This section is optional and depends on your preferences, but for ease of use we suggest writing a helper function that takes care of creating the pickle file: define it once and call it whenever it is needed.

In [25]:
import dill as pickle

def save_pkl(object_to_store, name: str, path: str = './'):
    """
    Serialize an object to ``<path>/<name>.pkl``.

    :param object_to_store: object to pickle
    :param name: file name to write, without the ``.pkl`` extension
    :param path: directory that receives the pickle file
    """
    target = f'{path}/{name}.pkl'
    with open(target, 'wb') as handle:
        pickle.dump(object_to_store, handle)

def local_load_pkl_model(model_name: str, path: str = './'):
    """
    Read back the object stored in ``<path>/<model_name>.pkl``.

    Unpickling runs code contained in the file, so only load pickles that
    come from a trusted source.

    :param model_name: name of the pickle file, without the ``.pkl`` extension
    :param path: directory that holds the pickle file
    :return: the unpickled object
    """
    source = f'{path}/{model_name}.pkl'
    with open(source, 'rb') as handle:
        return pickle.load(handle)

## Load Data

In [26]:
from fastavro import parse_schema, json_reader
from pandas import DataFrame
import pandas as pd
# Relax pandas' display limits so large frames print in full in the notebook.
# Show up to 5000 rows before the output is truncated.
pd.set_option('display.max_rows', 5000)
# Cell text longer than 50 characters is cut with an ellipsis.
pd.set_option('display.max_colwidth', 50)
# Very wide display width so rows are not wrapped across several lines.
pd.set_option('display.width', 10000)

In [27]:
# Parsed Avro schemas, keyed by collection name.
schemas = {}

# Article collection: one record per article with its catalogue attributes.
schema = {
    'doc': 'Article collection schema',
    'name': 'Article',
    'namespace': 'test',
    'type': 'record',
    'fields': [
        {'name': field_name, 'type': field_type}
        for field_name, field_type in (
            ('article_id', 'int'),
            ('product_code', 'int'),
            ('prod_name', 'string'),
            ('product_type_no', 'int'),
            ('product_type_name', 'string'),
            ('product_group_name', 'string'),
            ('graphical_appearance_no', 'int'),
            ('graphical_appearance_name', 'string'),
            ('colour_group_code', 'int'),
            ('colour_group_name', 'string'),
            ('perceived_colour_value_id', 'int'),
            ('perceived_colour_value_name', 'string'),
            ('perceived_colour_master_id', 'int'),
            ('perceived_colour_master_name', 'string'),
            ('department_no', 'int'),
            ('department_name', 'string'),
            ('index_code', 'string'),
            ('index_name', 'string'),
            ('index_group_no', 'int'),
            ('index_group_name', 'string'),
            ('section_no', 'int'),
            ('section_name', 'string'),
            ('garment_group_no', 'int'),
            ('garment_group_name', 'string'),
            ('detail_desc', 'string'),
            ('article_url', 'string'),
        )
    ],
}
schemas['article'] = parse_schema(schema)


# Customer collection: one record per customer.
schema = {
    'doc': 'Customer collection schema',
    'name': 'Customer',
    'namespace': 'test',
    'type': 'record',
    'fields': [
        {'name': field_name, 'type': field_type}
        for field_name, field_type in (
            ('customer_id', 'string'),
            ('FN', 'string'),
            ('Active', 'string'),
            ('club_member_status', 'string'),
            ('fashion_news_frequency', 'string'),
            ('age', 'int'),
            ('postal_code', 'string'),
        )
    ],
}
schemas['customer'] = parse_schema(schema)


# Transaction collection: one record per purchased article.
schema = {
    'doc': 'Transaction collection schema',
    'name': 'Transaction',
    'namespace': 'test',
    'type': 'record',
    'fields': [
        {'name': field_name, 'type': field_type}
        for field_name, field_type in (
            ('t_dat', 'string'),
            ('customer_id', 'string'),
            ('article_id', 'int'),
            ('price', 'int'),
            ('sales_channel_id', 'long'),
        )
    ],
}
schemas['transaction'] = parse_schema(schema)

def load(collection):
    """
    Read every record of ``./collections/<collection>.json`` into a list.

    The file is decoded with the schema registered under ``collection`` in
    the module-level ``schemas`` dict.

    :param collection: collection name; selects both the file and the schema
    :return: list of decoded records
    """
    source_path = './collections/' + collection + '.json'
    with open(source_path, 'r') as source:
        # Consume the reader while the file is still open.
        return list(json_reader(source, schemas[collection]))


# Input payload: the organization id plus, for every collection, a
# one-element list wrapping all records loaded for that collection.
data = {
    'org_id': 1,  # organization id (added by the "loader")
    **{
        collection_name: [load(collection_name)]
        for collection_name in ('article', 'customer', 'transaction')
    },
}

## Create feature_names.pkl

In [28]:
# Names of the collections stored in feature_names.pkl.
feature_names = ['customer', 'transaction']

In [29]:
# Persist the feature_names list as ./feature_names.pkl via save_pkl.
save_pkl(object_to_store=feature_names, name="feature_names")

## Create prediction.pkl

In [30]:
def prediction(data):
    """
    Recommend up to 12 article ids per customer from recent purchases.

    Three look-back windows are built from the transactions: on/after
    2020-09-15, on/after 2020-09-07 and on/after 2020-08-31. For each
    customer the first of these windows (in that order) containing one of
    their purchases is used: their articles are ranked by purchase count
    and, if fewer than 12, padded with that window's most-purchased
    articles. Customers absent from every window get the most-purchased
    articles of the 2020-09-07 window.

    :param data: anything ``pandas.DataFrame`` accepts, providing an
        ``org_id`` column plus ``customer`` and ``transaction`` columns.
        Each cell of those two columns holds a list of record dicts, or the
        string repr of such a list. Customer records need ``customer_id``;
        transaction records need ``t_dat`` (``YYYY-MM-DD``),
        ``customer_id`` and ``article_id``.
    :return: DataFrame with columns ``customer_id`` and ``prediction``
        (a list of article-id strings, at most 12 long).
    """
    # Imports and helpers live inside the function so the pickled callable
    # is self-contained when it is loaded somewhere else.
    import ast
    import pandas as pd

    top_n = 12

    def expand_collection(frame, column):
        """Flatten the nested records of ``frame[column]`` into one table carrying ``org_id``."""
        # Drop missing cells *before* parsing: casting to str first would
        # turn them into text and make dropna() a no-op.
        cells = frame[column].dropna()
        tables = [
            # Only string cells need parsing; lists of records are used as-is.
            pd.DataFrame(ast.literal_eval(cell) if isinstance(cell, str) else cell)
            for cell in cells
        ]
        stacked = pd.concat(tables, keys=cells.index)
        # Keep only the outer index level so rows align with their parent row.
        stacked = stacked.reset_index(level=1, drop=True)
        return frame[['org_id']].join(stacked).reset_index(drop=True)

    def count_purchases(window):
        """Return {customer_id: {article_id: purchase count}} for one window."""
        counts = {}
        for cust_id, art_id in zip(window['customer_id'], window['article_id']):
            per_customer = counts.setdefault(cust_id, {})
            per_customer[art_id] = per_customer.get(art_id, 0) + 1
        return counts

    def most_purchased(window):
        """Return the window's ``top_n`` article ids by number of transactions."""
        return list(window['article_id'].value_counts().index)[:top_n]

    data = pd.DataFrame(data)
    customer = expand_collection(data, 'customer')
    transaction = expand_collection(data, 'transaction')
    transaction['t_dat'] = pd.to_datetime(transaction['t_dat'], format='%Y-%m-%d')
    transaction['article_id'] = transaction['article_id'].astype(str)

    # Window start dates, most recent first; a customer is served from the
    # first window that contains one of their purchases.
    window_starts = ('2020-09-15', '2020-09-07', '2020-08-31')
    windows = []
    for start in window_starts:
        recent = transaction[transaction['t_dat'] >= pd.to_datetime(start)]
        windows.append((count_purchases(recent), most_purchased(recent)))

    # Customers without any purchase in the windows fall back to the
    # most-purchased articles of the 2020-09-07 window.
    fallback = windows[1][1]

    predictions = []
    for cust_id in customer['customer_id'].tolist():
        picks = fallback
        for counts, popular in windows:
            if cust_id in counts:
                # Stable sort: equal counts keep first-purchase order.
                ranked = sorted(counts[cust_id].items(), key=lambda item: item[1], reverse=True)
                # NOTE(review): padding may repeat an article already in the
                # customer's own list; confirm whether duplicates are wanted.
                picks = [art_id for art_id, _ in ranked] + popular
                break
        # Slicing both caps the length and gives each row its own list.
        predictions.append(picks[:top_n])

    result = customer[['customer_id']].copy()
    result['prediction'] = pd.Series(predictions, index=result.index)
    return result

# Persist the prediction function itself as ./prediction.pkl via save_pkl.
save_pkl(object_to_store=prediction, name="prediction")

# Test prediction flow

In [31]:
import dill as pickle

def save_pkl(object_to_store, name: str, path: str = './'):
    """
    Serialize an object to ``<path>/<name>.pkl``.

    :param object_to_store: object to pickle
    :param name: file name to write, without the ``.pkl`` extension
    :param path: directory that receives the pickle file
    """
    target = f'{path}/{name}.pkl'
    with open(target, 'wb') as handle:
        pickle.dump(object_to_store, handle)

def local_load_pkl_model(model_name: str, path: str = './'):
    """
    Read back the object stored in ``<path>/<model_name>.pkl``.

    Unpickling runs code contained in the file, so only load pickles that
    come from a trusted source.

    :param model_name: name of the pickle file, without the ``.pkl`` extension
    :param path: directory that holds the pickle file
    :return: the unpickled object
    """
    source = f'{path}/{model_name}.pkl'
    with open(source, 'rb') as handle:
        return pickle.load(handle)

## Load Data

In [32]:
from fastavro import parse_schema, json_reader
from pandas import DataFrame
import pandas as pd
# Relax pandas' display limits so large frames print in full in the notebook.
# Show up to 5000 rows before the output is truncated.
pd.set_option('display.max_rows', 5000)
# Cell text longer than 50 characters is cut with an ellipsis.
pd.set_option('display.max_colwidth', 50)
# Very wide display width so rows are not wrapped across several lines.
pd.set_option('display.width', 10000)

In [33]:
# Restore the list of collection names from ./feature_names.pkl.
feature_names = local_load_pkl_model(model_name='feature_names')
print(f'Loading collection features: {feature_names}')

Loading collection features: ['customer', 'transaction']


In [34]:
# Parsed Avro schemas, keyed by collection name.
schemas = {}

# Customer collection: one record per customer.
schema = {
    'doc': 'Customer collection schema',
    'name': 'Customer',
    'namespace': 'test',
    'type': 'record',
    'fields': [
        {'name': field_name, 'type': field_type}
        for field_name, field_type in (
            ('customer_id', 'string'),
            ('FN', 'string'),
            ('Active', 'string'),
            ('club_member_status', 'string'),
            ('fashion_news_frequency', 'string'),
            ('age', 'int'),
            ('postal_code', 'string'),
        )
    ],
}
schemas['customer'] = parse_schema(schema)


# Transaction collection: one record per purchased article.
schema = {
    'doc': 'Transaction collection schema',
    'name': 'Transaction',
    'namespace': 'test',
    'type': 'record',
    'fields': [
        {'name': field_name, 'type': field_type}
        for field_name, field_type in (
            ('t_dat', 'string'),
            ('customer_id', 'string'),
            ('article_id', 'int'),
            ('price', 'int'),
            ('sales_channel_id', 'long'),
        )
    ],
}
schemas['transaction'] = parse_schema(schema)

def load(collection):
    """
    Read every record of ``./collections/<collection>.json`` into a list.

    The file is decoded with the schema registered under ``collection`` in
    the module-level ``schemas`` dict.

    :param collection: collection name; selects both the file and the schema
    :return: list of decoded records
    """
    source_path = './collections/' + collection + '.json'
    with open(source_path, 'r') as source:
        # Consume the reader while the file is still open.
        return list(json_reader(source, schemas[collection]))


# Input payload: the organization id plus, for every collection, a
# one-element list wrapping all records loaded for that collection.
data = {
    'org_id': 1,  # organization id (added by the "loader")
    **{
        collection_name: [load(collection_name)]
        for collection_name in ('customer', 'transaction')
    },
}

In [35]:
# Restore the pickled prediction callable from ./prediction.pkl.
prediction_function = local_load_pkl_model(model_name='prediction')
print('Loading prediction function')

# Run the restored callable on the collections loaded above.
print('Supply predcition with collection data')
prediction = prediction_function(data)

Loading prediction function
Supply predcition with collection data


In [36]:
# Preview the first rows of the result returned by the prediction function.
prediction.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"[568601043, 448509014, 372860001, 579541001, 6..."
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"[448509014, 372860001, 673677002, 536139068, 2..."
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[448509014, 372860001, 673677002, 536139068, 2..."
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,"[448509014, 372860001, 673677002, 536139068, 2..."
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[448509014, 372860001, 673677002, 536139068, 2..."


## requirements.txt

The final step is to provide us with all the libraries that those pkl objects use. For example, if the model code cell imports **lightgbm**, we need to know about this library. The **requirements.txt** file lists the libraries and packages used in your environment:

lightgbm <br>
scikit-learn <br>
pandas <br>
dill

**So for the above example it is pretty straightforward: the contents of this file are the four libraries listed above.**

Note that if you have some version dependency of a specific library make sure to define the specific version
required. For example:

lightgbm == 3.3.2 <br>
scikit-learn == 0.24.2 <br>
pandas == 1.3.4 <br>
dill

## Important Notes

Practical recommender systems need to be retrained periodically so that the model is refreshed with new interaction data. To achieve high model fidelity, it is usually desirable to retrain the model on both historical and new data, since this accounts for both long-term and short-term user preferences.