In [3]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from sklearn.metrics import precision_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from implicit.als import AlternatingLeastSquares
from implicit.evaluation import mean_average_precision_at_k
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

warnings.filterwarnings("ignore")

In [4]:
# Создадим датафреймы на основе предоставленных данных
events_raw = pd.read_csv("./data/events.csv")

category_tree = pd.read_csv("./data/category_tree.csv")

properties_1 = pd.read_csv("./data/item_properties_part1.csv")
properties_2 = pd.read_csv("./data/item_properties_part2.csv")
item_properties_raw = pd.concat([properties_1, properties_2])

# Приведем к временному формату данные из timestapm
events_raw["timestamp"] = pd.to_datetime(events_raw["timestamp"], unit="ms")
item_properties_raw["timestamp"] = pd.to_datetime(item_properties_raw["timestamp"], unit="ms")

events_raw.rename(columns={"timestamp":"date_time"}, inplace=True)
item_properties_raw.rename(columns={"timestamp":"date_time"}, inplace=True)


In [5]:
display(
    events_raw.head(),
    item_properties_raw.head(),
    category_tree.head()
)

Unnamed: 0,date_time,visitorid,event,itemid,transactionid
0,2015-06-02 05:02:12.117,257597,view,355908,
1,2015-06-02 05:50:14.164,992329,view,248676,
2,2015-06-02 05:13:19.827,111016,view,318965,
3,2015-06-02 05:12:35.914,483717,view,253185,
4,2015-06-02 05:02:17.106,951259,view,367447,


Unnamed: 0,date_time,itemid,property,value
0,2015-06-28 03:00:00,460429,categoryid,1338
1,2015-09-06 03:00:00,206783,888,1116713 960601 n277.200
2,2015-08-09 03:00:00,395014,400,n552.000 639502 n720.000 424566
3,2015-05-10 03:00:00,59481,790,n15360.000
4,2015-05-17 03:00:00,156781,917,828513


Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0
3,1691,885.0
4,536,1691.0


---
> events — датасет с событиями. Колонки:

• timestamp — время события

• visitorid — идентификатор пользователя

• event — тип события

• itemid — идентификатор объекта

• transactionid — идентификатор транзакции, если она проходила


> category_tree   — файл с деревом категорий (можно восстановить дерево).

• category_id — идентификатор категорий

• parent_id — идентификатор родительской категории



> item_properties - файл с свойствами товаров.

• timestamp — момент записи значения свойства

• item_id — идентификатор объекта

• property — свойство, кажется, они все, кроме категории, захешированы

• value — значение свойства


----

In [17]:
# Функция для классификации
def classification_reconstructor(events_df, prop_df, cat_df):
    # Фичи для data
    def event_features(data_raw):
        data = data_raw.copy()
        data.drop(["transactionid"], axis=1, inplace=True)

        data["day_of_week"] = data["date_time"].map(lambda x: x.weekday())

        data["Day"] = data["date_time"].map(lambda x: x.day)
        data["Hour"] = data["date_time"].map(lambda x: x.hour)
        data["minute"] = data["date_time"].map(lambda x: x.minute)

        def get_time_periods(hour):
            if hour >= 3 and hour < 7:
                return "Dawn"
            elif hour >= 7 and hour < 12:
                return "Morning"
            elif hour >= 12 and hour < 16:
                return "Afternoon"
            elif hour >= 16 and hour < 22:
                return "Evening"
            else:
                return "Night"

        data["day_period"] = data["Hour"].map(get_time_periods)

        data["date_time"] = data["date_time"].dt.date
        data.drop(["date_time"], axis=1, inplace=True)
        data["event"] = data["event"].apply(lambda x: 1 if x == "transaction" else 0)

        data.drop_duplicates(inplace=True)

        data = pd.get_dummies(data=data, columns=["day_period", "day_of_week"], dtype=int)

        return data

    # Фичи для items
    # Возьмем только самые распространенные proprties, например топ 20
    def items_features(data_raw, category_tree):
        data = data_raw.copy()

        data.drop(["date_time"], axis=1, inplace=True)

        data = data[data["property"] == "categoryid"]
        data.drop(["property"], axis=1, inplace=True)
        data = data.drop_duplicates().reset_index(drop=True)
        data.rename(columns={"value": "categoryid"}, inplace=True)

        num_cate = data.groupby("itemid").count()
        num_cate.reset_index(inplace=True)
        num_cate.rename(columns={"categoryid": "num_categoryid"}, inplace=True)

        data["categoryid"] = data["categoryid"].astype(int)
        item = data.merge(category_tree, how="inner", on="categoryid")

        item_grouped = (
            item.groupby(by="itemid")[["categoryid", "parentid"]].count()
        ).reset_index()

        return item_grouped

    events = event_features(events_df)
    item_properties = items_features(prop_df, cat_df)

    df = pd.merge(
        left=events, right=item_properties, left_on="itemid", right_on="itemid"
    )
    
    df.drop(["Day", "Hour", "minute"], axis=1, inplace=True)
    
    return df

---
#### Выделение сэмплов для работы

In [18]:
# Сэмплируем, ибо нет мощей для всего датасета
events_sample = events_raw[events_raw["date_time"].dt.month <= 8]
items_sample = item_properties_raw[item_properties_raw["date_time"].dt.month <= 8]


events_sample.reset_index(drop=True, inplace=True)
items_sample.reset_index(drop=True, inplace=True)

In [19]:
sample = classification_reconstructor(events_sample, items_sample, category_tree)
sample.head()

Unnamed: 0,visitorid,event,itemid,day_period_Afternoon,day_period_Dawn,day_period_Evening,day_period_Morning,day_period_Night,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,categoryid,parentid
0,257597,0,355908,0,1,0,0,0,0,1,0,0,0,0,0,1,1
1,992329,0,248676,0,1,0,0,0,0,1,0,0,0,0,0,1,1
2,483717,0,253185,0,1,0,0,0,0,1,0,0,0,0,0,1,1
3,951259,0,367447,0,1,0,0,0,0,1,0,0,0,0,0,2,2
4,972639,0,22556,0,1,0,0,0,0,1,0,0,0,0,0,1,1


In [20]:
X_sample = sample.drop("event", axis=1)
y_sample = sample["event"]

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    X_sample, y_sample, test_size=0.2, stratify=y_sample
)

In [25]:
model_clf = XGBClassifier()
model_clf.fit(X_train, y_train)

y_pred = model_clf.predict(X_test)
score_test = round(accuracy_score(y_test, y_pred), 3)
print(f"Accuracy on test : {score_test}")

Accuracy on test : 0.991


In [26]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00    427256
           1       0.00      0.00      0.00      3927

    accuracy                           0.99    431183
   macro avg       0.50      0.50      0.50    431183
weighted avg       0.98      0.99      0.99    431183



In [13]:
# Валидируем
events_val = events_raw[events_raw["date_time"].dt.month > 8]
items_val = item_properties_raw[item_properties_raw["date_time"].dt.month > 8]

validation = classification_reconstructor(events_val, items_val, category_tree)
display(validation.head())

X_val = validation.drop("event", axis=1)
y_val = validation["event"]

y_val_pred = model_clf.predict(X_val)
score_val = round(accuracy_score(y_val, y_val_pred), 3)
print(f"Accuracy on validation : {score_val}")
print(classification_report(y_val, y_val_pred))

Unnamed: 0,visitorid,event,itemid,Day,Hour,day_period_Afternoon,day_period_Dawn,day_period_Evening,day_period_Morning,day_period_Night,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,categoryid,parentid
0,860215,0,403576,1,5,0,1,0,0,0,0,1,0,0,0,0,0,1,1
1,533732,0,229527,1,5,0,1,0,0,0,0,1,0,0,0,0,0,1,1
2,719549,0,35615,1,5,0,1,0,0,0,0,1,0,0,0,0,0,1,1
3,920933,0,98467,1,5,0,1,0,0,0,0,1,0,0,0,0,0,1,1
4,137333,0,223064,1,5,0,1,0,0,0,0,1,0,0,0,0,0,1,1


Accuracy on validation : 0.991
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     25826
           1       0.00      0.00      0.00       232

    accuracy                           0.99     26058
   macro avg       0.50      0.50      0.50     26058
weighted avg       0.98      0.99      0.99     26058



----