In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from sklearn.metrics import precision_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from implicit.als import AlternatingLeastSquares
from implicit.evaluation import mean_average_precision_at_k
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

warnings.filterwarnings("ignore")



In [4]:
# Load the raw tables: the event log, the category tree and the item
# properties (shipped as two CSV parts that are stacked into one frame).
events_df = pd.read_csv("./data/events.csv")
category_tree = pd.read_csv("./data/category_tree.csv")
properties_1 = pd.read_csv("./data/item_properties_part1.csv")
properties_2 = pd.read_csv("./data/item_properties_part2.csv")
item_properties_df = pd.concat([properties_1, properties_2])

# Parse the millisecond "timestamp" column into datetimes and expose it
# under the name "date_time" (column position is unchanged).
events_df = events_df.assign(
    timestamp=pd.to_datetime(events_df["timestamp"], unit="ms")
).rename(columns={"timestamp": "date_time"})
item_properties_df = item_properties_df.assign(
    timestamp=pd.to_datetime(item_properties_df["timestamp"], unit="ms")
).rename(columns={"timestamp": "date_time"})


In [7]:
# Every distinct visitor id seen in the event log, sorted ascending.
all_visitors = np.sort(events_df["visitorid"].unique())

# Distinct ids of visitors with at least one "transaction" event, sorted ascending.
buying_visitors = np.sort(
    events_df.loc[events_df["event"] == "transaction", "visitorid"].unique()
)

# Visitors who never bought anything. np.setdiff1d returns sorted unique
# values, so the ordering is guaranteed rather than an accident of set
# iteration order; this matters because a later cell slices the first N
# entries of this list.
viewing_visitors_list = np.setdiff1d(all_visitors, buying_visitors).tolist()

In [8]:
def create_dataframe(visitor_list, events=None):
    """Build one row of view/purchase activity features per visitor.

    The aggregation is done with a single filter plus group-by passes over
    the event log instead of re-scanning the whole log for every visitor.

    Parameters
    ----------
    visitor_list : iterable
        Visitor ids to featurize. The output keeps this order (including
        any duplicates); ids with no matching events get all-zero counts.
    events : pandas.DataFrame, optional
        Event log with ``visitorid``, ``event`` and ``itemid`` columns.
        Defaults to the notebook-level ``events_df``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``visitor_list`` with the columns
        ``visitorid``, ``num_items_viewed`` (distinct items among "view"
        events), ``view_count`` (number of "view" events), ``bought_count``
        (number of "transaction" events) and ``purchased`` (1 if
        ``bought_count`` is non-zero, otherwise 0).
    """
    if events is None:
        events = events_df

    visitor_ids = pd.Index(list(visitor_list))

    # Restrict the log to the requested visitors once, up front.
    relevant = events[events["visitorid"].isin(visitor_ids)]
    viewed_items = relevant.loc[relevant["event"] == "view"].groupby("visitorid")[
        "itemid"
    ]
    purchases = relevant.loc[relevant["event"] == "transaction"].groupby("visitorid")

    def _aligned(per_visitor):
        # Put the per-visitor aggregate back in the caller's order; visitors
        # without any matching event get a count of 0.
        return per_visitor.reindex(visitor_ids, fill_value=0).to_numpy()

    bought_count = _aligned(purchases.size())

    return pd.DataFrame(
        {
            "visitorid": visitor_ids.to_numpy(),
            # dropna=False mirrors Series.unique(), which keeps NaN as a value.
            "num_items_viewed": _aligned(viewed_items.nunique(dropna=False)),
            "view_count": _aligned(viewed_items.size()),
            "bought_count": bought_count,
            "purchased": (bought_count > 0).astype("int64"),
        },
        columns=[
            "visitorid",
            "num_items_viewed",
            "view_count",
            "bought_count",
            "purchased",
        ],
    )

In [9]:
# Featurize every visitor who has at least one "transaction" event.
buying_visitors_df = create_dataframe(buying_visitors)

In [11]:
# Display the per-visitor feature table for buyers.
buying_visitors_df

Unnamed: 0,visitorid,num_items_viewed,view_count,bought_count,purchased
0,172,22,33,2,1
1,186,1,2,1,1
2,264,2,3,2,1
3,419,3,4,1,1
4,539,1,4,1,1
...,...,...,...,...,...
11714,1406787,3,20,1,1
11715,1406981,4,4,1,1
11716,1407070,1,1,1,1
11717,1407110,2,7,1,1


In [10]:
# (rows, columns) of the buyer feature table.
buying_visitors_df.shape

(11719, 5)

In [12]:
# Featurize only the first 50,000 entries of the non-buying visitor list.
# NOTE(review): presumably capped to bound runtime and class imbalance — confirm;
# which visitors are picked depends on the order of viewing_visitors_list.
viewing_visitors_df = create_dataframe(viewing_visitors_list[0:50000])

In [13]:
# Display the per-visitor feature table for the non-buying sample.
viewing_visitors_df

Unnamed: 0,visitorid,num_items_viewed,view_count,bought_count,purchased
0,0,3,3,0,0
1,1,1,1,0,0
2,2,4,8,0,0
3,3,1,1,0,0
4,4,1,1,0,0
...,...,...,...,...,...
49995,50389,1,3,0,0
49996,50390,1,1,0,0
49997,50391,1,1,0,0
49998,50392,1,1,0,0


In [14]:
# (rows, columns) of the non-buying feature table.
viewing_visitors_df.shape

(50000, 5)

In [15]:
# Stack buyers on top of the non-buying sample; ignore_index=True gives the
# combined frame a fresh 0..n-1 index instead of repeating each part's labels.
main_df = pd.concat([buying_visitors_df, viewing_visitors_df], ignore_index=True)

In [16]:
# Display the combined buyer + non-buyer feature table.
main_df

Unnamed: 0,visitorid,num_items_viewed,view_count,bought_count,purchased
0,172,22,33,2,1
1,186,1,2,1,1
2,264,2,3,2,1
3,419,3,4,1,1
4,539,1,4,1,1
...,...,...,...,...,...
61714,50389,1,3,0,0
61715,50390,1,1,0,0
61716,50391,1,1,0,0
61717,50392,1,1,0,0
