In [1]:
# nb-ncf-all-segs.ipynb

# Разработка рекомендательной системы для электронной коммерции на основе гибридного подхода машинного обучения с учётом пользовательского поведения и контентных характеристик

In [2]:
import sys
import numpy as np
import pandas as pd
import plotly.express as px
import time
import warnings
import random
warnings.simplefilter("ignore")
from tqdm.notebook import tqdm
#from tqdm import tqdm
warnings.simplefilter("ignore")
from settings import *
from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.models.ncf.ncf_singlenode import NCF
from recommenders.models.ncf.dataset import Dataset as NCFDataset
from recommenders.utils.timer import Timer
from recommenders.utils.plot import line_graph
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
import pickle
import tensorflow as tf
import random
tf.get_logger().setLevel('ERROR') # only show error messages

.\cache\2019-Nov.csv
.\cache\2019-Oct.csv
.\cache\test.csv
.\cache\test_full.csv
.\cache\train.csv



In [3]:
# Wall-clock reference point; the last cell subtracts it from time.time() to report total runtime.
start_time = time.time()

In [4]:
# Record the interpreter and pandas versions alongside the results for reproducibility.
print(f'System version: {sys.version}')
print(f'Pandas version: {pd.__version__}')

System version: 3.9.19 (main, Mar 21 2024, 17:21:27) [MSC v.1916 64 bit (AMD64)]
Pandas version: 1.5.3


In [5]:
def toratings(ds):
    """Convert a wide per-user category table into long rating rows.

    For every category column present in ``ds`` one output row is produced
    per user whose value in that column is non-zero.

    Reads the notebook-level ``dataset`` and ``SPLIT_CATEGORY`` to obtain the
    full, sorted list of category names.

    Args:
        ds: Wide DataFrame with one column per category plus a
            ``last_purchase`` column. ``user_id`` must become a column after
            ``reset_index()`` (presumably it is the index -- verify against
            the caller). The frame is not modified.

    Returns:
        pandas.DataFrame: columns ``userID``, ``category``, ``rating``,
        ``timestamp`` (``last_purchase`` parsed with ``pd.to_datetime``) and
        ``itemID`` (integer code of ``category`` from ``pd.factorize``).
    """
    all_categories = np.sort(dataset[SPLIT_CATEGORY].unique()).tolist()

    # Work on a copy with the index turned into a column, so the caller's
    # frame is left untouched and the function can be called repeatedly.
    ds = ds.reset_index()

    # Build a filtered list rather than removing from the list being iterated:
    # removing during iteration skips the element after every removed one.
    present = set(ds.columns)
    cols = [col for col in all_categories if col in present]

    # Collect the per-category pieces and concatenate once.
    frames = []
    for col in cols:
        part = ds.loc[ds[col] != 0, ['user_id', col, 'last_purchase']].rename(columns={col: 'rating'})
        part.insert(1, 'category', col)
        frames.append(part)

    if frames:
        ratings = pd.concat(frames, ignore_index=True)
    else:
        # No category column matched: return an empty frame with the same schema.
        ratings = pd.DataFrame(columns=['user_id', 'category', 'rating', 'last_purchase'])

    ratings['last_purchase'] = pd.to_datetime(ratings['last_purchase'])
    ratings['itemID'] = pd.factorize(ratings['category'])[0]
    print(ratings.head(1))
    return ratings.rename(columns={'last_purchase': 'timestamp', 'user_id': 'userID'})

In [6]:
# Ranking cut-off: number of top-scored items considered per user
TOP_K = 5

# Training hyper-parameters passed to the NCF model
EPOCHS, BATCH_SIZE = 100, 256

# Seed handed to the NCF dataset and model constructors
SEED = 42

In [7]:
# Evaluate how well NCF performs
# The ranking metrics are used for evaluation.
def eval_ncf(test_df=None, predictions=None, k=None):
    """Compute top-k ranking metrics for the predictions and print them.

    Every argument is optional; when omitted, the notebook-level variable of
    the original zero-argument version is used, so ``eval_ncf()`` keeps
    working unchanged.

    Args:
        test_df: Held-out interactions passed to the metric functions as the
            ground truth. Defaults to the notebook-level ``test``.
        predictions: Scored user/item pairs whose score column is named
            ``prediction``. Defaults to the notebook-level ``all_predictions``.
        k: Cut-off for the ranking metrics. Defaults to ``TOP_K``.

    Returns:
        list: ``[MAP, NDCG, Precision@K, Recall@K]`` in that order.
    """
    if test_df is None:
        test_df = test
    if predictions is None:
        predictions = all_predictions
    if k is None:
        k = TOP_K

    # All four metrics share the same call signature.
    eval_map, eval_ndcg, eval_precision, eval_recall = [
        metric(test_df, predictions, col_prediction='prediction', k=k)
        for metric in (map_at_k, ndcg_at_k, precision_at_k, recall_at_k)
    ]

    print("MAP:\t\t%f" % eval_map,
          "NDCG:\t\t%f" % eval_ndcg,
          "Precision@K:\t%f" % eval_precision,
          "Recall@K:\t%f" % eval_recall, sep='\n')
    return [eval_map, eval_ndcg, eval_precision, eval_recall]

In [8]:
# Render floats with thousands separators and two decimals in DataFrame output.
pd.set_option("display.float_format", "{:,.2f}".format)
# Accumulator for the per-segment metric rows.
prec = list()

In [9]:
%%time
# Seed the stdlib RNG that picks the example users printed for each segment.
random.seed(SEED)
# NOTE: unpickling can execute arbitrary code; only load cache files produced by this project.
dataset = pd.read_pickle(ds_flt_file)  # full event data after filtering
ds = pd.read_pickle(showcase_seg_file)  # segmented user showcase
prec = []      # one [MAP, NDCG, Precision@K, Recall@K] row per evaluated segment
segments = []  # segment label of each row in `prec`, kept in the same order

# Iterate in sorted order: set iteration order is unspecified, and the metric
# rows are labelled with the segment they were computed for.
for seg in sorted(ds['Segment'].unique().tolist()):
    seg_users = ds[ds['Segment'] == seg].reset_index(level='user_id')

    # Implicit-feedback interactions: every purchase by a user of this segment gets rating 1.
    is_purchase = dataset['event_type'] == 'purchase'
    in_segment = dataset['user_id'].isin(seg_users['user_id'])
    df = dataset.loc[is_purchase & in_segment, ['user_id', 'category_code_level2', 'event_time']].rename(
        columns={'user_id': 'userID', 'category_code_level2': 'category', 'event_time': 'timestamp'}
    )
    df['rating'] = 1
    # factor[0] holds the integer codes, factor[1] the category names (used to decode itemID below).
    factor = pd.factorize(df['category'])
    df['itemID'] = factor[0]

    # Chronological split with ratio 0.75 for train, the rest for test.
    train, test = python_chrono_split(df, ratio=0.75)

    # Filter out any users or items in the test set that do not appear in the training set.
    test = test[test["userID"].isin(train["userID"].unique())]
    test = test[test["itemID"].isin(train["itemID"].unique())]

    # Write datasets to csv files.
    train_file = cache_dir + "./train.csv"
    test_file = cache_dir + "./test.csv"
    train.to_csv(train_file, index=False)
    test.to_csv(test_file, index=False)

    # Generate an NCF dataset object from the data subsets.
    data = NCFDataset(train_file=train_file, test_file=test_file, seed=SEED)

    model = NCF(
        n_users=data.n_users,
        n_items=data.n_items,
        model_type="NeuMF",
        n_factors=4,
        layer_sizes=[16, 8, 4],
        n_epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        learning_rate=1e-3,
        verbose=0,
        seed=SEED
    )

    with Timer() as train_time:
        model.fit(data)
    print(f'Took {train_time} seconds for training.')

    with Timer() as test_time:
        # Score every (train user, train item) pair.
        users, items, preds = [], [], []
        train_items = list(train.itemID.unique())
        for train_user in train.userID.unique():
            user_batch = [train_user] * len(train_items)
            users.extend(user_batch)
            items.extend(train_items)
            preds.extend(list(model.predict(user_batch, train_items, is_list=True)))

        all_predictions = pd.DataFrame(data={"userID": users, "itemID": items, "prediction": preds})

        # Keep only pairs that do not occur in train: after the outer merge
        # those are the rows whose train-side `rating` is null.
        merged = pd.merge(train, all_predictions, on=["userID", "itemID"], how="outer")
        all_predictions = merged[merged.rating.isnull()].drop('rating', axis=1)
    print(f'Took {test_time} seconds for prediction.')

    # eval_ncf() reads the `test` and `all_predictions` variables assigned above.
    prec.append(eval_ncf())
    segments.append(seg)

    # Print a couple of randomly chosen users with their top recommendations and actual purchases.
    print(f'\nСегмент: {seg}')
    n_examples = min(2, len(all_predictions))  # random.sample raises if asked for more rows than exist
    for n in random.sample(range(len(all_predictions)), n_examples):
        user_id = all_predictions['userID'].iloc[n]
        print(f'Пользователь: {user_id}')
        print(f'Рекомендовано: {[factor[1][idx] for idx in all_predictions[all_predictions["userID"]==user_id].sort_values("prediction",ascending = False).head(TOP_K)["itemID"]]}')
        print(f'Куплено: {set(dataset[(dataset["user_id"]==user_id) & (dataset["event_type"]=="purchase")]["category_code_level2"])}\n')

# Index the metric rows by the actual segment label rather than by position.
prec = pd.DataFrame(
    prec,
    columns=['MAP', 'NDCG', 'Precision@K', 'Recall@K'],
    index=pd.Index(segments, name='Segment'),
)

INFO:recommenders.models.ncf.dataset:Indexing .\cache\./train.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test_full.csv ...


Took 57.4914 seconds for training.
Took 0.2942 seconds for prediction.
MAP:		0.070100
NDCG:		0.084068
Precision@K:	0.028873
Recall@K:	0.124707

Сегмент: 0
Пользователь: 517883807
Рекомендовано: ['water_heater', 'headphone', 'notebook', 'desktop', 'tv']
Куплено: {'subwoofer'}

Пользователь: 555991595
Рекомендовано: ['desktop', 'monitor', 'hdd', 'juicer', 'headphone']
Куплено: {'tv', 'carriage'}



INFO:recommenders.models.ncf.dataset:Indexing .\cache\./train.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test_full.csv ...


Took 178.7956 seconds for training.
Took 0.8803 seconds for prediction.
MAP:		0.043661
NDCG:		0.052908
Precision@K:	0.017664
Recall@K:	0.079416

Сегмент: 1
Пользователь: 530238994
Рекомендовано: ['bath', 'pillow', 'wallet', 'camera', 'compressor']
Куплено: {'projector'}

Пользователь: 546095675
Рекомендовано: ['tv', 'headphone', 'desktop', 'blender', 'kettle']
Куплено: {'shoes'}



INFO:recommenders.models.ncf.dataset:Indexing .\cache\./train.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test_full.csv ...


Took 43.9143 seconds for training.
Took 0.2416 seconds for prediction.
MAP:		0.119790
NDCG:		0.145906
Precision@K:	0.054545
Recall@K:	0.217452

Сегмент: 2
Пользователь: 514046566
Рекомендовано: ['keyboard', 'ironing_board', 'washer', 'hdd', 'headphone']
Куплено: {'mouse'}

Пользователь: 536135060
Рекомендовано: ['hdd', 'videocards', 'cpu', 'motherboard', 'desktop']
Куплено: {'memory'}



INFO:recommenders.models.ncf.dataset:Indexing .\cache\./train.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test_full.csv ...


Took 1904.2036 seconds for training.
Took 2.1050 seconds for prediction.
MAP:		0.024797
NDCG:		0.032525
Precision@K:	0.013467
Recall@K:	0.052281

Сегмент: 3
Пользователь: 536479302
Рекомендовано: ['skates', 'vacuum', 'bag', 'tablet', 'tv']
Куплено: {'air_heater'}

Пользователь: 564697582
Рекомендовано: ['clocks', 'tv', 'skates', 'vacuum', 'notebook']
Куплено: {'headphone'}



INFO:recommenders.models.ncf.dataset:Indexing .\cache\./train.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test_full.csv ...


Took 65.5106 seconds for training.
Took 0.4111 seconds for prediction.
MAP:		0.136565
NDCG:		0.155102
Precision@K:	0.046734
Recall@K:	0.205611

Сегмент: 4
Пользователь: 542867919
Рекомендовано: ['headphone', 'toster', 'desktop', 'ironing_board', 'grill']
Куплено: {'kettle', 'clocks', 'iron'}

Пользователь: 516315889
Рекомендовано: ['hood', 'refrigerators', 'meat_grinder', 'tv', 'microwave']
Куплено: {'oven', 'hob'}



INFO:recommenders.models.ncf.dataset:Indexing .\cache\./train.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test_full.csv ...


Took 23.3342 seconds for training.
Took 0.1594 seconds for prediction.
MAP:		0.120267
NDCG:		0.138945
Precision@K:	0.047826
Recall@K:	0.171649

Сегмент: 5
Пользователь: 554382639
Рекомендовано: ['generator', 'refrigerators', 'telephone', 'hood', 'meat_grinder']
Куплено: {'tv', 'headphone', 'clocks', 'desktop', 'water_heater', 'microwave'}

Пользователь: 538414176
Рекомендовано: ['dishwasher', 'washer', 'refrigerators', 'tablet', 'vacuum']
Куплено: {'tv', 'air_conditioner', 'iron'}



INFO:recommenders.models.ncf.dataset:Indexing .\cache\./train.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test_full.csv ...


Took 700.4988 seconds for training.
Took 1.0445 seconds for prediction.
MAP:		0.046479
NDCG:		0.061087
Precision@K:	0.023618
Recall@K:	0.101532

Сегмент: 6
Пользователь: 560196463
Рекомендовано: ['bed', 'sewing_machine', 'chair', 'washer', 'microwave']
Куплено: {'tv', 'iron'}

Пользователь: 523606047
Рекомендовано: ['microwave', 'chair', 'sewing_machine', 'bed', 'tv']
Куплено: {'iron'}



INFO:recommenders.models.ncf.dataset:Indexing .\cache\./train.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test_full.csv ...


Took 52.6622 seconds for training.
Took 0.2977 seconds for prediction.
MAP:		0.087466
NDCG:		0.095210
Precision@K:	0.029752
Recall@K:	0.113407

Сегмент: 7
Пользователь: 520003920
Рекомендовано: ['player', 'drill', 'washer', 'tv', 'vacuum']
Куплено: {'tv', 'player', 'subwoofer'}

Пользователь: 518565207
Рекомендовано: ['subwoofer', 'refrigerators', 'clocks', 'telephone', 'headphone']
Куплено: {'chair', 'subwoofer'}



INFO:recommenders.models.ncf.dataset:Indexing .\cache\./train.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test_full.csv ...


Took 80.4033 seconds for training.
Took 0.4488 seconds for prediction.
MAP:		0.071262
NDCG:		0.090275
Precision@K:	0.034146
Recall@K:	0.140159

Сегмент: 8
Пользователь: 518066620
Рекомендовано: ['scales', 'iron', 'vacuum', 'microwave', 'water_heater']
Куплено: {'kettle', 'washer'}

Пользователь: 557068849
Рекомендовано: ['washer', 'tv', 'water_heater', 'tablet', 'juicer']
Куплено: {'notebook', 'printer'}



INFO:recommenders.models.ncf.dataset:Indexing .\cache\./train.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test_full.csv ...


Took 167.4046 seconds for training.
Took 0.5574 seconds for prediction.
MAP:		0.057373
NDCG:		0.072222
Precision@K:	0.027810
Recall@K:	0.112399

Сегмент: 9
Пользователь: 534746575
Рекомендовано: ['tv', 'mixer', 'microwave', 'notebook', 'vacuum']
Куплено: {'keds'}

Пользователь: 512471414
Рекомендовано: ['blender', 'notebook', 'meat_grinder', 'headphone', 'hair_cutter']
Куплено: {'refrigerators', 'player'}



INFO:recommenders.models.ncf.dataset:Indexing .\cache\./train.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test_full.csv ...


Took 28.2764 seconds for training.
Took 0.3219 seconds for prediction.
MAP:		0.026753
NDCG:		0.037444
Precision@K:	0.019048
Recall@K:	0.061839

Сегмент: 10
Пользователь: 513349635
Рекомендовано: ['refrigerators', 'grill', 'generator', 'washer', 'keds']
Куплено: {'bag', 'ironing_board', 'iron', 'umbrella'}

Пользователь: 515452337
Рекомендовано: ['coffee_machine', 'mixer', 'printer', 'iron', 'notebook']
Куплено: {'vacuum', 'mouse'}



INFO:recommenders.models.ncf.dataset:Indexing .\cache\./train.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test_full.csv ...


Took 72.3776 seconds for training.
Took 0.4454 seconds for prediction.
MAP:		0.072099
NDCG:		0.087308
Precision@K:	0.034429
Recall@K:	0.122663

Сегмент: 11
Пользователь: 568320937
Рекомендовано: ['clocks', 'keds', 'tv', 'acoustic', 'videoregister']
Куплено: {'shoes', 'clocks'}

Пользователь: 564038011
Рекомендовано: ['refrigerators', 'keds', 'notebook', 'vacuum', 'water_heater']
Куплено: {'tv', 'carriage', 'washer'}



INFO:recommenders.models.ncf.dataset:Indexing .\cache\./train.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test.csv ...
INFO:recommenders.models.ncf.dataset:Indexing .\cache\./test_full.csv ...


Took 35.3669 seconds for training.
Took 0.1866 seconds for prediction.
MAP:		0.025751
NDCG:		0.029129
Precision@K:	0.011159
Recall@K:	0.044349

Сегмент: 12
Пользователь: 517157331
Рекомендовано: ['generator', 'carriage', 'toys', 'headphone', 'microwave']
Куплено: {'headphone', 'bicycle'}

Пользователь: 552801943
Рекомендовано: ['carriage', 'generator', 'saw', 'bicycle', 'swing']
Куплено: {'toys'}

CPU times: total: 57min 45s
Wall time: 57min 35s


In [12]:
from user_func import prdf
# Show the per-segment metrics with segments as columns, then the mean of each metric.
prec.index.name = 'Segment'
print('Метрики ранжирования')
prdf(prec.T)
for metric in prec:
    print(metric, prec[metric].mean())

Метрики ранжирования


Segment,0,1,2,3,4,5,6,7,8,9,10,11,12
MAP,0.07,0.04,0.12,0.02,0.14,0.12,0.05,0.09,0.07,0.06,0.03,0.07,0.03
NDCG,0.08,0.05,0.15,0.03,0.16,0.14,0.06,0.1,0.09,0.07,0.04,0.09,0.03
Precision@K,0.03,0.02,0.05,0.01,0.05,0.05,0.02,0.03,0.03,0.03,0.02,0.03,0.01
Recall@K,0.12,0.08,0.22,0.05,0.21,0.17,0.1,0.11,0.14,0.11,0.06,0.12,0.04


MAP 0.06941244818478082
NDCG 0.08324066200210387
Precision@K 0.029928574440131396
Recall@K 0.1190356131253955


In [11]:
# Total notebook runtime since start_time, printed as minutes:seconds.
seconds = int(time.time() - start_time)
print("%d:%02d" % divmod(seconds, 60))

57:35
