In [3]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import mean_average_precision_at_k


from lightfm import LightFM
from lightfm.evaluation import precision_at_k

warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split

In [4]:
# Directory that holds the raw CSV files; change it here to relocate the dataset.
DATA_DIR = "./data"

# Interaction log.
events = pd.read_csv(f"{DATA_DIR}/events.csv")

# Item properties come split across two files; stack them into one frame.
# ignore_index=True gives the combined frame a unique 0..n-1 index -- without
# it each part keeps its own 0-based labels and the index contains duplicates.
properties = pd.concat(
    [
        pd.read_csv(f"{DATA_DIR}/item_properties_part1.csv"),
        pd.read_csv(f"{DATA_DIR}/item_properties_part2.csv"),
    ],
    ignore_index=True,
)

# Category tree table.
categories = pd.read_csv(f"{DATA_DIR}/category_tree.csv")

In [5]:
# Both frames carry a millisecond epoch 'timestamp' column; add a parsed
# datetime column next to it in each of them.
for frame in (events, properties):
    frame['event_datetime'] = pd.to_datetime(frame['timestamp'], unit='ms')

In [6]:
# Split the event datetime into calendar parts using the vectorised .dt
# accessor rather than calling a Python lambda once per row for every column.
# The values are the same as with the element-wise version; the integer width
# of the resulting columns may differ depending on the pandas version.
events['day_of_week'] = events['event_datetime'].dt.weekday  # Monday=0 ... Sunday=6
events['Year'] = events['event_datetime'].dt.year
events['Month'] = events['event_datetime'].dt.month
events['Day'] = events['event_datetime'].dt.day
events['Hour'] = events['event_datetime'].dt.hour
events['minute'] = events['event_datetime'].dt.minute

In [7]:
def get_time_periods(hour):
    """Return the part-of-day label for an hour value.

    The ranges are half-open: [3, 7) -> 'Dawn', [7, 12) -> 'Morning',
    [12, 16) -> 'Afternoon', [16, 22) -> 'Evening'. Any value outside
    those ranges yields 'Night'.
    """
    bands = (
        (3, 7, 'Dawn'),
        (7, 12, 'Morning'),
        (12, 16, 'Afternoon'),
        (16, 22, 'Evening'),
    )
    for start, stop, label in bands:
        if start <= hour < stop:
            return label
    # Nothing matched: the hour falls in the remaining part of the day.
    return 'Night'
    
# Label every event with its part of the day, then display how many events
# fall under each label.
events['Day Period'] = events['Hour'].apply(get_time_periods)
events['Day Period'].value_counts()

Evening      1078199
Night         765924
Dawn          494588
Afternoon     293490
Morning       123900
Name: Day Period, dtype: int64

In [8]:
# Keep only the most widespread properties: the TOP_N_PROPERTIES properties
# attached to the largest number of items. The cut-off is a knob that can be
# tuned while building the model.
TOP_N_PROPERTIES = 10
top_properties = (
    properties
    .drop_duplicates(['itemid', 'property'])  # count each item once per property
    .groupby("property")['itemid']
    .count()
    .sort_values(ascending=False)
    .head(TOP_N_PROPERTIES)
)

In [9]:
# Keep only the rows whose property is one of the retained top properties,
# then show the size of the result.
is_top_property = properties['property'].isin(top_properties.index)
properties_filtered = properties[is_top_property]
properties_filtered.shape

(9889797, 5)

In [10]:
# Further transformations should be chosen to suit the algorithm being used
properties_filtered.head(10)

Unnamed: 0,timestamp,itemid,property,value,event_datetime
0,1435460400000,460429,categoryid,1338,2015-06-28 03:00:00
1,1441508400000,206783,888,1116713 960601 n277.200,2015-09-06 03:00:00
3,1431226800000,59481,790,n15360.000,2015-05-10 03:00:00
5,1436065200000,285026,available,0,2015-07-05 03:00:00
10,1439089200000,450113,888,1038400 45956 n504.000,2015-08-09 03:00:00
14,1434250800000,169055,790,n21000.000,2015-06-14 03:00:00
15,1437274800000,186518,available,0,2015-07-19 03:00:00
16,1435460400000,178601,790,n5400.000,2015-06-28 03:00:00
17,1436670000000,319291,888,1292080,2015-07-12 03:00:00
21,1431226800000,344365,159,519769,2015-05-10 03:00:00


In [11]:
# Hold out the last 30% of rows as the test set. With shuffle=False the split
# simply follows row order, so sort by timestamp first: that makes the
# hold-out chronological (no training event is later than any test event)
# instead of relying on the CSV already being time-ordered. The stable sort
# keeps rows with equal timestamps in their original relative order, and the
# original index labels are preserved.
train, test = train_test_split(
    events.sort_values('timestamp', kind='stable'),
    test_size=0.3,
    shuffle=False,
)