In [1]:
import datetime
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import mean_average_precision_at_k
from lightfm import LightFM
from lightfm.evaluation import auc_score, precision_at_k
from scipy.sparse import coo_matrix
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report,
                             precision_score)
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")



In [10]:
# Создадим датафреймы на основе предоставленных данных
events_raw = pd.read_csv("./data/events.csv")
# Приведем к временному формату данные из timestapm
events_raw["timestamp"] = pd.to_datetime(events_raw["timestamp"], unit="ms")

events_raw.rename(columns={"timestamp": "date"}, inplace=True)

events_raw["date"] = pd.to_datetime(events_raw["date"].dt.date)

In [11]:
events = events_raw.copy()
events = events.sort_values('date').reset_index(drop=True)
events = events[['visitorid','itemid','event', 'date']]
events

Unnamed: 0,visitorid,itemid,event,date
0,689859,421640,view,2015-05-03
1,935582,203248,view,2015-05-03
2,696326,194830,view,2015-05-03
3,131668,395045,view,2015-05-03
4,595484,129111,view,2015-05-03
...,...,...,...,...
2756096,699799,73200,view,2015-09-18
2756097,362806,230348,view,2015-09-18
2756098,85274,120740,view,2015-09-18
2756099,637990,54521,view,2015-09-18


In [12]:
top_items = pd.DataFrame(events.groupby("itemid")["event"].value_counts()).reset_index()

top_items = top_items[top_items["count"] > 30][["itemid", "count"]]

top_items = top_items.groupby("itemid")["count"].sum().to_dict()

len(top_items)

18823

In [13]:
events["num_occur"] = events["itemid"].map(top_items)
events

Unnamed: 0,visitorid,itemid,event,date,num_occur
0,689859,421640,view,2015-05-03,
1,935582,203248,view,2015-05-03,166.0
2,696326,194830,view,2015-05-03,46.0
3,131668,395045,view,2015-05-03,86.0
4,595484,129111,view,2015-05-03,43.0
...,...,...,...,...,...
2756096,699799,73200,view,2015-09-18,121.0
2756097,362806,230348,view,2015-09-18,
2756098,85274,120740,view,2015-09-18,
2756099,637990,54521,view,2015-09-18,70.0


In [14]:
events[events['num_occur'] > 500]["itemid"].value_counts()

itemid
187946    3412
461686    2978
5411      2334
370653    1854
219512    1800
257040    1647
298009    1642
96924     1633
309778    1628
384302    1608
320130    1507
7943      1489
111530    1447
441668    1433
335975    1428
234255    1307
190000    1231
151444    1230
37029     1227
369447    1214
9877      1162
312728    1155
65273     1153
142466    1136
112782    1126
48030     1122
315543    1103
445351    1073
161623    1072
434782    1069
29196     1067
400946    1039
91755     1024
102306    1013
Name: count, dtype: int64

In [15]:
events_processed = events[events['num_occur'] > 1000]
events_processed = events_processed.drop(columns="num_occur")
events_processed = events_processed.drop_duplicates().reset_index(drop=True)
events_processed

Unnamed: 0,visitorid,itemid,event,date
0,634873,37029,view,2015-05-03
1,1148414,37029,view,2015-05-03
2,1180720,369447,addtocart,2015-05-03
3,737241,369447,view,2015-05-03
4,967527,37029,view,2015-05-03
...,...,...,...,...
40073,1265892,434782,view,2015-09-18
40074,1388583,461686,view,2015-09-18
40075,273995,91755,view,2015-09-18
40076,337534,91755,view,2015-09-18


In [44]:
events_train = events_processed[events_processed["date"].dt.month < 9]
events_test = events_processed[events_processed["date"].dt.month >= 9]
events_test = events_test[
    (events_test["visitorid"].isin(events_train["visitorid"]))
    & (events_test["itemid"].isin(events_train["itemid"]))
]

In [47]:
id_cols=['visitorid','itemid']
trans_cat_train=dict()
trans_cat_test=dict()

for k in id_cols:
    cate_enc=preprocessing.LabelEncoder()
    trans_cat_train[k]=cate_enc.fit_transform(events_train[k].values)
    trans_cat_test[k]=cate_enc.transform(events_test[k].values)

In [49]:
ratings = dict()

cate_enc=preprocessing.LabelEncoder()
ratings['train'] = cate_enc.fit_transform(events_train.event)
ratings['test'] = cate_enc.transform(events_test.event)

In [51]:
n_users=len(np.unique(trans_cat_train['visitorid']))
n_items=len(np.unique(trans_cat_train['itemid']))

In [52]:
rate_matrix = dict()

rate_matrix["train"] = coo_matrix(
    (ratings["train"], (trans_cat_train["visitorid"], trans_cat_train["itemid"])),
    shape=(n_users, n_items),
)

rate_matrix["test"] = coo_matrix(
    (ratings["test"], (trans_cat_test["visitorid"], trans_cat_test["itemid"])),
    shape=(n_users, n_items),
)

In [53]:
model = LightFM(no_components=10, loss="warp")
model.fit(rate_matrix["train"], epochs=100, num_threads=8)

<lightfm.lightfm.LightFM at 0x31c602950>

In [56]:
auc_score(model, rate_matrix['train'], num_threads=8).mean()

0.9974265

In [57]:
auc_score(model, rate_matrix['test'], num_threads=10).mean()

0.8639968

In [15]:
# Создадим датафреймы на основе предоставленных данных
events_raw = pd.read_csv("./data/events.csv")

# Приведем к временному формату данные из timestapm
events_raw["timestamp"] = pd.to_datetime(events_raw["timestamp"], unit="ms")
events_raw.rename(columns={"timestamp": "date"}, inplace=True)
events_raw["date"] = pd.to_datetime(events_raw["date"].dt.date)

events = events_raw.copy()
events = events.sort_values("date").reset_index(drop=True)
events = events[["visitorid", "itemid", "event", "date"]]

# Фильтруем по количеству больше 30 и оставляем только itemid и count
top_items = pd.DataFrame(events.groupby("itemid")["event"].value_counts()).reset_index()
top_items = top_items[top_items["count"] > 30][["itemid", "count"]]
top_items = top_items.groupby("itemid")["count"].sum().to_dict()

# Создаем новый столбец num_occur, в котором хранится количество событий для каждого itemid
events["num_occur"] = events["itemid"].map(top_items)

# Фильтруем события, оставляем только те, у которых num_occur больше 1000
events_processed = events[events["num_occur"] > 1000]
events_processed = events_processed.drop(columns="num_occur")
events_processed = events_processed.drop_duplicates().reset_index(drop=True)

# Выделяем обучающий набор данных до сентября
events_train = events_processed[events_processed["date"].dt.month < 8]
# Выделяем тестовый набор данных с сентября и позже
events_test = events_processed[events_processed["date"].dt.month >= 8]

# Фильтруем тестовый набор данных
events_test = events_test[
    (events_test["visitorid"].isin(events_train["visitorid"]))
    & (events_test["itemid"].isin(events_train["itemid"]))
]

# Список категориальных признаков
id_cols = ["visitorid", "itemid"]

# Создаем словарь для закодированных значений обучающего набора
trans_cat_train = dict()
# Создаем словарь для закодированных значений тестового набора
trans_cat_test = dict()

for k in id_cols:
    cate_enc = preprocessing.LabelEncoder()
    trans_cat_train[k] = cate_enc.fit_transform(
        events_train[k].values
    )  # Кодируем значения обучающего набора
    trans_cat_test[k] = cate_enc.transform(
        events_test[k].values
    )  # Кодируем значения тестового набора

# Создаем словарь для закодированных значений целевой переменной
ratings = dict()

cate_enc_2 = preprocessing.LabelEncoder()
ratings["train"] = cate_enc_2.fit_transform(
    events_train.event
)  # Кодируем целевую переменную для обучающего набора
ratings["test"] = cate_enc_2.transform(
    events_test.event
)  # Кодируем целевую переменную для тестового набора

# Вычисляем количество уникальных пользователей
n_users = len(np.unique(trans_cat_train["visitorid"]))
# Вычисляем количество уникальных товаров
n_items = len(np.unique(trans_cat_train["itemid"]))

# Создаем словарь для матриц оценок
rate_matrix = dict()

# Создаем разреженную матрицу для обучающего набора
rate_matrix["train"] = coo_matrix(
    (ratings["train"], (trans_cat_train["visitorid"], trans_cat_train["itemid"])),
    shape=(n_users, n_items),
)
# Создаем разреженную матрицу для тестового набора
rate_matrix["test"] = coo_matrix(
    (
        ratings["test"],  # данные
        (trans_cat_test["visitorid"], trans_cat_test["itemid"]),
    ),  # индексы строк (trans_cat_test[“visitorid”]) и индексы столбцов (trans_cat_test[“itemid”])
    shape=(n_users, n_items),
)

# Создаем модель LightFM с указанием параметров
model = LightFM(no_components=10, loss="warp")
# Обучаем модель на обучающей матрице
model.fit(rate_matrix["train"], epochs=100, num_threads=8)

# Вычисляем среднюю точность на тестовой матрице для k=3
map_at3 = precision_at_k(model, rate_matrix["test"], k=3).mean()
print(f"Mean Average Precision at 3: {round(map_at3*100, 3)} %")

# Выводим среднюю площадь под ROC-кривой для обучающей матрицы
print(auc_score(model, rate_matrix["train"], num_threads=8).mean())
# Выводим среднюю площадь под ROC-кривой для тестовой матрицы
print(auc_score(model, rate_matrix["test"], num_threads=10).mean())

#top_items = (
#    pd.DataFrame(
#        model.predict(user_ids=np.arange(n_users),
#                     item_ids=np.arange(n_items),
#                     num_threads=8)
#    )
#  .assign(visitorid=lambda x: id_cols["visitorid"].take(x[0.0]))
#  .sort_values(by=[0.0], ascending=False)
#  .groupby("visitorid")
#  .head(3)
#)

# Print the top 3 items for each user
#print(top_items)

Mean Average Precision at 3: 29.707 %
0.9977242
0.91089493


In [37]:
# Используем обученную модель для предсказания предпочтений пользователей для товаров в тестовой выборке
predicted_scores = model.predict(
    trans_cat_test["visitorid"], trans_cat_test["itemid"], num_threads=8
)

# Создаем функцию для получения предсказанных оценок для всех возможных пар (пользователь, товар) в тестовой выборке
def get_predicted_ratings(visitor_ids, item_ids, scores):
    predicted_ratings = pd.DataFrame({
        "visitorid": visitor_ids,
        "itemid": item_ids,
        "predicted_score": scores
    })
    return predicted_ratings

# Преобразуем полученные предсказанные оценки в датафрейм с колонками visitorid и itemid
predicted_ratings_df = get_predicted_ratings(
    events_test["visitorid"], events_test["itemid"], predicted_scores
)
#predicted_ratings_df["predicted_score"] = predicted_ratings_df["predicted_score"]*-1
predicted_ratings_df = predicted_ratings_df[predicted_ratings_df["predicted_score"] < 0]
predicted_ratings_df

Unnamed: 0,visitorid,itemid,predicted_score
24889,952946,37029,-0.128135
25011,1293358,219512,-1.378590
25183,1297062,190000,-2.818779
25202,1297062,461686,-0.095341
25244,844369,234255,-1.131769
...,...,...,...
39469,316850,320130,-3.281291
39709,693709,29196,-1.363712
39731,1126569,190000,-1.556001
39870,895999,7943,-3.105774


In [40]:
test = pd.pivot_table(
    data=predicted_ratings_df,
    index="visitorid",
    columns="itemid",
    values="predicted_score",
    aggfunc="sum",
)
test

itemid,5411,7943,9877,29196,37029,48030,65273,91755,112782,161623,...,257040,312728,315543,320130,370653,384302,400946,441668,445351,461686
visitorid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
79627,,,-0.471864,,,,-1.43723,,,,...,,,-0.441799,,,,,,,
218828,,,,-2.555792,,,,,,,...,,,,,,,,,,
234313,,,,,-0.456719,,,,,,...,,,,,,,,,,
244756,,,,,,,,,,,...,,,,,,,,,,
311205,,,,,,,,,,,...,,,,,,,,,,-1.635829
316850,,-1.965459,,,,,,,,,...,,,,-3.281291,,,,,,
361041,-1.627082,,,,,,,,,,...,,-2.224741,,,,,,,,
430913,,,,,,,,,,,...,,,,,,,,,,
468309,,,,,,,,,,,...,,,,,,,,,-2.206437,
475172,,,,,,,,,,,...,,,,,,,,-0.588515,,


In [85]:
users = test.index.to_list()
items = test.columns.to_list()
scores = np.array(test)

test_array = scores[2].reshape(1,-1)
min_indexes = np.argsort(test_array, axis=1)[:, :3].tolist()
min_indexes

[[4, 0, 20]]

In [70]:
items[0].reshape(1,-1)

array([[        nan,         nan, -0.47186366,         nan,         nan,
                nan, -1.4372302 ,         nan,         nan,         nan,
                nan,         nan,         nan,         nan,         nan,
        -0.44179928,         nan,         nan,         nan,         nan,
                nan,         nan,         nan]], dtype=float32)

In [62]:
events_test[(events_test["visitorid"] == 1150086)]

Unnamed: 0,visitorid,itemid,event,date
26686,1150086,461686,view,2015-08-06
27082,1150086,320130,view,2015-08-07
27173,1150086,9877,addtocart,2015-08-07
27237,1150086,9877,view,2015-08-07
27331,1150086,9877,view,2015-08-08
27457,1150086,320130,view,2015-08-08
27463,1150086,461686,view,2015-08-08
27663,1150086,461686,view,2015-08-09
27730,1150086,7943,view,2015-08-09
29132,1150086,190000,addtocart,2015-08-13


In [42]:
events_test[(events_test["event"] == "transaction")]

Unnamed: 0,visitorid,itemid,event,date
25151,844369,190000,transaction,2015-08-02
25339,179210,48030,transaction,2015-08-02
25595,1297062,9877,transaction,2015-08-03
26981,642130,461686,transaction,2015-08-07
27276,705542,320130,transaction,2015-08-07
28009,530559,257040,transaction,2015-08-10
28584,871359,219512,transaction,2015-08-11
29191,1150086,190000,transaction,2015-08-13
29786,530559,315543,transaction,2015-08-14
30030,464410,461686,transaction,2015-08-15


In [108]:
scores = model.predict(2, np.arange(rate_matrix["train"].shape[1]))
top_3_items = np.argsort(-scores)[:3]
top_3_items

array([29, 15, 13])

In [136]:
import pandas as pd
from scipy.sparse import coo_matrix
from lightfm import LightFM

# Create dictionaries for encoding categorical columns
id_cols = {
    "visitorid": pd.factorize(events_processed["visitorid"])[0],
    "itemid": pd.factorize(events_processed["itemid"])[0],
}

# Create a dictionary for encoding the target variable
ratings = pd.factorize(events_processed["event"])[0]

# Create a dictionary for storing the sparse matrices
rate_matrix = {}

# Create a train/test split
train_index = events_processed["date"].dt.month < 8
test_index = events_processed["date"].dt.month >= 8

matrix_shape = (len(set(id_cols["visitorid"])), len(set(id_cols["itemid"])))
matrix_shape

# Create a sparse matrix for the training data
rate_matrix["train"] = coo_matrix(
    (
        ratings[train_index],
        (
            id_cols["visitorid"][train_index],
            id_cols["itemid"][train_index],
        ),
    ),
    shape=(matrix_shape[0], matrix_shape[1]),
)

# Create a sparse matrix for the test data
rate_matrix["test"] = coo_matrix(
    (
        ratings[test_index],
        (
            id_cols["visitorid"][test_index],
            id_cols["itemid"][test_index],
        ),
    ),
    shape=(matrix_shape[0], matrix_shape[1]),
)

# Create a LightFM model with specified parameters
model = LightFM(no_components=30, loss="warp")

# Train the model iteratively in batches
batch_size = 1000
for i in range(0, len(rate_matrix["train"].data), batch_size):
    model.fit_partial(
        rate_matrix["train"].tocsr()[i : i + batch_size],
        epochs=10,
        num_threads=8,
    )

n_users = len(set(id_cols["visitorid"]))
n_items = len(set(id_cols["itemid"]))

top_items = (
    pd.DataFrame(
        model.predict(user_ids=np.arange(n_users),
                     item_ids=np.arange(n_items),
                     num_threads=8)
    )
  .assign(visitorid=lambda x: id_cols["visitorid"].take(x[0.0]))
  .sort_values(by=[0.0], ascending=False)
  .groupby("visitorid")
  .head(3)
)

# Print the top 3 items for each user
print(top_items)

ValueError: Expected the number of user IDs (32870) to equal the number of item IDs (34)

In [13]:
# Создадим датафреймы на основе предоставленных данных
events_raw = pd.read_csv("./data/events.csv")

# Приведем к временному формату данные из timestapm
events_raw["timestamp"] = pd.to_datetime(events_raw["timestamp"], unit="ms")
events_raw.rename(columns={"timestamp": "date"}, inplace=True)
events_raw["date"] = pd.to_datetime(events_raw["date"].dt.date)

events = events_raw.copy()
events = events.sort_values("date").reset_index(drop=True)
events = events[["visitorid", "itemid", "event", "date"]]

# Фильтруем по количеству больше 30 и оставляем только itemid и count
top_items = pd.DataFrame(events.groupby("itemid")["event"].value_counts()).reset_index()
top_items = top_items[top_items["count"] > 30][["itemid", "count"]]
top_items = top_items.groupby("itemid")["count"].sum().to_dict()

# Создаем новый столбец num_occur, в котором хранится количество событий для каждого itemid
events["num_occur"] = events["itemid"].map(top_items)

# Фильтруем события, оставляем только те, у которых num_occur больше 1000
events_processed = events[events["num_occur"] > 1000]
events_processed = events_processed.drop(columns="num_occur")
events_processed = events_processed.drop_duplicates().reset_index(drop=True)


# Create dictionaries for encoding categorical columns
id_cols = {
    "visitorid": pd.factorize(events["visitorid"])[0],
    "itemid": pd.factorize(events["itemid"])[0],
}

# Create a dictionary for encoding the target variable
ratings = pd.factorize(events["event"])[0]

# Create a dictionary for storing the sparse matrices
rate_matrix = {}

# Create a train/test split
train_index = events["date"].dt.month < 8
test_index = events["date"].dt.month >= 8

matrix_shape = (len(set(id_cols["visitorid"])), len(set(id_cols["itemid"])))
matrix_shape

# Create a sparse matrix for the training data
rate_matrix["train"] = coo_matrix(
    (
        ratings[train_index],
        (
            id_cols["visitorid"][train_index],
            id_cols["itemid"][train_index],
        ),
    ),
    shape=(matrix_shape[0], matrix_shape[1]),
)

# Create a sparse matrix for the test data
rate_matrix["test"] = coo_matrix(
    (
        ratings[test_index],
        (
            id_cols["visitorid"][test_index],
            id_cols["itemid"][test_index],
        ),
    ),
    shape=(matrix_shape[0], matrix_shape[1]),
)

# Create a LightFM model with specified parameters
model = LightFM(no_components=30, loss="warp")

# Train the model iteratively in batches
batch_size = 1000
for i in range(0, len(rate_matrix["train"].data), batch_size):
    model.fit_partial(
        rate_matrix["train"].tocsr()[i : i + batch_size],
        epochs=10,
        num_threads=8,
    )

n_users = len(set(id_cols["visitorid"]))
n_items = len(set(id_cols["itemid"]))

top_items = (
    pd.DataFrame(
        model.predict(user_ids=np.arange(n_users),
                     item_ids=np.arange(n_items),
                     num_threads=8)
    )
  .assign(visitorid=lambda x: id_cols["visitorid"].take(x[0.0]))
  .sort_values(by=[0.0], ascending=False)
  .groupby("visitorid")
  .head(3)
)

# Print the top 3 items for each user
print(top_items)

ValueError: Incorrect number of features in user_features

In [14]:
map_at3 = precision_at_k(model, rate_matrix["test"], k=3).mean()
print(f"Mean Average Precision at 3: {round(map_at3*100, 3)} %")

# Выводим среднюю площадь под ROC-кривой для обучающей матрицы
print(auc_score(model, rate_matrix["train"], num_threads=8).mean())
# Выводим среднюю площадь под ROC-кривой для тестовой матрицы
print(auc_score(model, rate_matrix["test"], num_threads=10).mean())

ValueError: The user feature matrix specifies more features than there are estimated feature embeddings: 1000 vs 1407580.