In [3]:
import warnings
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from sklearn.metrics import precision_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from implicit.als import AlternatingLeastSquares
from implicit.evaluation import mean_average_precision_at_k
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

warnings.filterwarnings("ignore")



In [4]:
# Создадим датафреймы на основе предоставленных данных
events_raw = pd.read_csv("./data/events.csv")
# Приведем к временному формату данные из timestapm
events_raw["timestamp"] = pd.to_datetime(events_raw["timestamp"], unit="ms")

events_raw.rename(columns={"timestamp": "date_time"}, inplace=True)

events_raw["date_time"] = pd.to_datetime(events_raw["date_time"].dt.date)

In [66]:
display(events_raw.head())
display(events_raw.info())
events_raw.shape

Unnamed: 0,date_time,visitorid,event,itemid,transactionid
0,2015-06-02,257597,view,355908,
1,2015-06-02,992329,view,248676,
2,2015-06-02,111016,view,318965,
3,2015-06-02,483717,view,253185,
4,2015-06-02,951259,view,367447,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2756101 entries, 0 to 2756100
Data columns (total 5 columns):
 #   Column         Dtype         
---  ------         -----         
 0   date_time      datetime64[ns]
 1   visitorid      int64         
 2   event          object        
 3   itemid         int64         
 4   transactionid  float64       
dtypes: datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 105.1+ MB


None

(2756101, 5)

In [67]:
events = events_raw.copy()
events.drop(columns="transactionid", inplace=True)
events["event"] = events["event"].apply(lambda x: 0 if x != "transaction" else 1)
display(events.head())
events.shape

Unnamed: 0,date_time,visitorid,event,itemid
0,2015-06-02,257597,0,355908
1,2015-06-02,992329,0,248676
2,2015-06-02,111016,0,318965
3,2015-06-02,483717,0,253185
4,2015-06-02,951259,0,367447


(2756101, 4)

In [68]:
train_events = events[
    (events["date_time"].dt.month >= 7) & (events["date_time"].dt.month < 9)
]
test_events = events[(events["date_time"].dt.month == 9)]

shell_events = train_events = events[(events["date_time"].dt.month >= 7)]

In [69]:
top_sales = train_events.groupby("itemid")["event"].sum().to_dict()

In [70]:
sales_bound = 22

In [71]:
train_events["num_of_sales"] = train_events["itemid"].map(top_sales)
train_events = train_events[train_events["num_of_sales"] > sales_bound]
display(train_events.head())
print(train_events.shape)

Unnamed: 0,date_time,visitorid,event,itemid,num_of_sales
590425,2015-07-01,850455,0,48030,25
591668,2015-07-01,1315777,0,48030,25
593868,2015-07-01,897679,0,9877,23
593930,2015-07-01,850455,0,48030,25
595139,2015-07-01,1315777,0,48030,25


(10428, 5)


In [72]:
test_events = test_events[
    (test_events["visitorid"].isin(train_events["visitorid"].values))
    & (test_events["itemid"].isin(train_events["itemid"].values))
]
display(test_events.head())
print(test_events.shape)

Unnamed: 0,date_time,visitorid,event,itemid
1146713,2015-09-01,619929,0,320130
1149177,2015-09-01,1264015,0,461686
1149499,2015-09-01,1249730,0,320130
1149525,2015-09-01,765484,0,320130
1149624,2015-09-01,730608,0,320130


(1695, 4)


In [73]:
train_events_matrix = pd.pivot_table(
    data=train_events,
    index="visitorid",
    columns="itemid",
    values="event",
    aggfunc="sum",
)

train_events_matrix

itemid,546,9877,48030,119736,213834,248455,312728,320130,334401,420960,441852,445351,461686
visitorid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
379,,,,,,,,0.0,,,,,
482,,,0.0,,,,,,,,,,
485,,,,,,0.0,,,,,,,
524,,,,,,,,,,,,0.0,
878,,,,,,,,,,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1404445,,,,,,,,0.0,,,,,
1404805,,,,,,,,,,,,0.0,
1406047,,,,0.0,,,,,,,,,
1407281,,,,,,,,,,,0.0,,


In [74]:
test_index = train_events_matrix.index.to_list()
test_columns = train_events_matrix.columns.to_list()

In [75]:
test_events = test_events[
    (test_events["visitorid"].isin(test_index))
    & (test_events["itemid"].isin(test_columns))
]
test_events

Unnamed: 0,date_time,visitorid,event,itemid
1146713,2015-09-01,619929,0,320130
1149177,2015-09-01,1264015,0,461686
1149499,2015-09-01,1249730,0,320130
1149525,2015-09-01,765484,0,320130
1149624,2015-09-01,730608,0,320130
...,...,...,...,...
1461425,2015-09-17,748562,0,213834
1461797,2015-09-17,82425,0,320130
1461841,2015-09-17,388200,0,320130
1461978,2015-09-17,335524,0,320130


In [76]:
test_events_matrix = pd.pivot_table(
    data=test_events,
    index="visitorid",
    columns="itemid",
    values="event",
    aggfunc="sum",
)

test_events_matrix

itemid,546,9877,48030,119736,213834,248455,312728,320130,334401,420960,441852,445351,461686
visitorid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2671,,,,,,,,,,0.0,,,
3942,,,,,,,,,,,,,0.0
4162,,,,,,,,,,,,,0.0
6952,,,,,,,,,,,,,1.0
8874,,0.0,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1401366,,,,,,,,,,,0.0,,
1402917,,,,,,,,0.0,,,,,
1403769,,,,,,,0.0,,,,,,
1403819,,,,,,,,,,,,,0.0


In [77]:
shell_events = shell_events[
    (shell_events["visitorid"].isin(test_index))
    & (shell_events["itemid"].isin(test_columns))
]

shell_matrix = pd.pivot_table(
    data=shell_events,
    index="visitorid",
    columns="itemid",
    values="event",
    aggfunc=lambda x: 0,
)

shell_matrix

itemid,546,9877,48030,119736,213834,248455,312728,320130,334401,420960,441852,445351,461686
visitorid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
379,,,,,,,,0.0,,,,,
482,,,0.0,,,,,,,,,,
485,,,,,,0.0,,,,,,,
524,,,,,,,,,,,,0.0,
878,,,,,,,,,,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1404445,,,,,,,,0.0,,,,,
1404805,,,,,,,,,,,,0.0,
1406047,,,,0.0,,,,,,,,,
1407281,,,,,,,,,,,0.0,,


In [78]:
train_events_matrix = shell_matrix + train_events_matrix
test_events_matrix = shell_matrix + test_events_matrix

train_pivot = train_events_matrix.fillna(0)
test_pivot = test_events_matrix.fillna(0)

print(train_pivot.shape)
print(test_pivot.shape)

(4776, 13)
(4776, 13)


In [79]:
train_pivot_sparse = scipy.sparse.csr_matrix(train_pivot.values)
test_pivot_sparse = scipy.sparse.csr_matrix(test_pivot.values)

---
ALS

In [80]:
model = AlternatingLeastSquares(factors=100, random_state=42)
model.fit(train_pivot_sparse)

  0%|          | 0/15 [00:00<?, ?it/s]

In [81]:
map_at3 = mean_average_precision_at_k(model, train_pivot_sparse, test_pivot_sparse, K=3)
print(f"Mean Average Precision at 3: {round(map_at3*100, 3)} %")

  0%|          | 0/75 [00:00<?, ?it/s]

Mean Average Precision at 3: 0.0 %


----
LightFM

In [82]:
model = LightFM(no_components=100, loss="warp", random_state=42)
model.fit(train_pivot_sparse, epochs=30)

<lightfm.lightfm.LightFM at 0x157a22b00>

In [83]:
map_at3 = precision_at_k(model, test_pivot_sparse, k=3).mean()
print(f"Mean Average Precision at 3: {round(map_at3*100, 3)} %")

Mean Average Precision at 3: 34.667 %


In [None]:
# Создадим датафреймы на основе предоставленных данных
events_raw = pd.read_csv("./data/events.csv")
# Приведем к временному формату данные из timestapm
events_raw["timestamp"] = pd.to_datetime(events_raw["timestamp"], unit="ms")

events_raw.rename(columns={"timestamp": "date_time"}, inplace=True)

events_raw["date_time"] = pd.to_datetime(events_raw["date_time"].dt.date)

events = events_raw.copy()
events.drop(columns="transactionid", inplace=True)
events["event"] = events["event"].apply(lambda x: 0 if x != "transaction" else 1)

train_events = events[
    (events["date_time"].dt.month >= 7) & (events["date_time"].dt.month < 9)
]
test_events = events[(events["date_time"].dt.month == 9)]

shell_events = train_events = events[(events["date_time"].dt.month >= 7)]

top_sales = train_events.groupby("itemid")["event"].sum().to_dict()

train_events["num_of_sales"] = train_events["itemid"].map(top_sales)
train_events = train_events[train_events["num_of_sales"] > sales_bound]

train_events_matrix = pd.pivot_table(
    data=train_events,
    index="visitorid",
    columns="itemid",
    values="event",
    aggfunc="sum",
)

test_index = train_events_matrix.index.to_list()
test_columns = train_events_matrix.columns.to_list()

test_events = test_events[
    (test_events["visitorid"].isin(test_index))
    & (test_events["itemid"].isin(test_columns))
]

test_events_matrix = pd.pivot_table(
    data=test_events,
    index="visitorid",
    columns="itemid",
    values="event",
    aggfunc="sum",
)

shell_events = shell_events[
    (shell_events["visitorid"].isin(test_index))
    & (shell_events["itemid"].isin(test_columns))
]

shell_matrix = pd.pivot_table(
    data=shell_events,
    index="visitorid",
    columns="itemid",
    values="event",
    aggfunc=lambda x: 0,
)

user_ids = test_events_matrix.index
item_ids = test_events_matrix.columns


train_events_matrix = shell_matrix + train_events_matrix
test_events_matrix = shell_matrix + test_events_matrix

train_pivot = train_events_matrix.fillna(0)
test_pivot = test_events_matrix.fillna(0)

train_pivot_sparse = scipy.sparse.csr_matrix(train_pivot.values)
test_pivot_sparse = scipy.sparse.csr_matrix(test_pivot.values)

model = LightFM(no_components=100, loss="warp", random_state=42)
model.fit(train_pivot_sparse, epochs=30)

map_at3 = precision_at_k(model, test_pivot_sparse, k=3).mean()
print(f"Mean Average Precision at 3: {round(map_at3*100, 3)} %")

: 

In [114]:
# Проверяем длину разреженного массива
sparse_array_length = test_pivot_sparse.getnnz()

# Приводим списки идентификаторов пользователей и товаров к одному размеру
user_ids = test_pivot_sparse.indices
item_ids = test_pivot_sparse.indices
if len(user_ids) != sparse_array_length:
    user_ids = user_ids[:sparse_array_length]
if len(item_ids) != sparse_array_length:
    item_ids = item_ids[:sparse_array_length]

# Предсказываем предпочтения для тестовой выборки
test_predictions = model.predict(user_ids, item_ids)

# Преобразуем предсказания к ожидаемому формату (список или строка)
test_predictions = [str(pred) for pred in test_predictions]

# Создаем новый датафрейм с колонками visitorid и itemid
recommendation_df = pd.DataFrame(test_predictions, columns=["itemid"])

# Добавляем колонку с предложенными товарами
recommendation_df["recommended_items"] = recommendation_df["itemid"].apply(lambda x: x[:3])

# Удаляем колонку itemid, так как она больше не нужна
recommendation_df.drop(columns="itemid", inplace=True)

# Выводим датафрейм с предложениями
print(recommendation_df)


   recommended_items
0                0.6
1                -0.
2                0.6
3                -0.
4                -0.
..               ...
76               0.5
77               -0.
78               0.5
79               0.6
80               0.6

[81 rows x 1 columns]
