In [116]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from sklearn.metrics import precision_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from implicit.als import AlternatingLeastSquares
from implicit.evaluation import mean_average_precision_at_k
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

# NOTE(review): with no category/module arguments this silences every warning
# for the rest of the session, including deprecation warnings raised by the
# libraries imported above. Consider narrowing the filter to the specific
# category that is noisy so real problems stay visible.
warnings.filterwarnings("ignore")

In [90]:
# Item properties are delivered as two CSV parts (presumably with the same
# schema -- verify); stack them into a single frame.
items1 = pd.read_csv('./data/item_properties_part1.csv')
items2 = pd.read_csv('./data/item_properties_part2.csv')
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each part
# keeps its own 0-based labels, so labels repeat and label-based lookups
# (.loc) or index-aligned assignments would touch rows from both parts.
items = pd.concat([items1, items2], ignore_index=True)
# Raw timestamps are interpreted as Unix epoch milliseconds.
items["timestamp"] = pd.to_datetime(items["timestamp"], unit="ms")
items.head(10)

Unnamed: 0,timestamp,itemid,property,value
0,2015-06-28 03:00:00,460429,categoryid,1338
1,2015-09-06 03:00:00,206783,888,1116713 960601 n277.200
2,2015-08-09 03:00:00,395014,400,n552.000 639502 n720.000 424566
3,2015-05-10 03:00:00,59481,790,n15360.000
4,2015-05-17 03:00:00,156781,917,828513
5,2015-07-05 03:00:00,285026,available,0
6,2015-06-14 03:00:00,89534,213,1121373
7,2015-05-17 03:00:00,264312,6,319724
8,2015-06-07 03:00:00,229370,202,1330310
9,2015-06-14 03:00:00,98113,451,1141052 n48.000


In [101]:
events = pd.read_csv('./data/events.csv')
# Interpret the raw values as Unix epoch milliseconds and truncate to the day.
# .dt.normalize() keeps the column as datetime64[ns] (time set to midnight);
# .dt.date would instead build an object column of Python `date` instances,
# which is slow and memory-hungry on a multi-million-row log and has to be
# re-parsed before any further `.dt` access.
events["timestamp"] = pd.to_datetime(events["timestamp"], unit="ms").dt.normalize()
events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,2015-06-02,257597,view,355908,
1,2015-06-02,992329,view,248676,
2,2015-06-02,111016,view,318965,
3,2015-06-02,483717,view,253185,
4,2015-06-02,951259,view,367447,


In [92]:
# Load the category hierarchy table and preview its first rows.
category_tree = pd.read_csv(filepath_or_buffer='./data/category_tree.csv')
category_tree.head(n=5)

Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0
3,1691,885.0
4,536,1691.0


In [106]:
# Order the interaction log by day, renumber the rows, keep only the columns
# used from here on, and make sure `timestamp` is a datetime64 column.
events = (
    events
    .sort_values('timestamp')
    .reset_index(drop=True)
    [['visitorid', 'itemid', 'event', 'timestamp']]
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]))
)
events

Unnamed: 0,visitorid,itemid,event,timestamp
0,689859,421640,view,2015-05-03
1,595484,129111,view,2015-05-03
2,596477,233611,view,2015-05-03
3,1224313,68470,view,2015-05-03
4,412359,77602,view,2015-05-03
...,...,...,...,...
2756096,699799,73200,view,2015-09-18
2756097,362806,230348,view,2015-09-18
2756098,85274,120740,view,2015-09-18
2756099,1261556,124708,view,2015-09-18


In [107]:
# Sanity check: print column dtypes and the memory footprint of `events`
# (confirms `timestamp` is datetime64 before the time-based split).
events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2756101 entries, 0 to 2756100
Data columns (total 4 columns):
 #   Column     Dtype         
---  ------     -----         
 0   visitorid  int64         
 1   itemid     int64         
 2   event      object        
 3   timestamp  datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 84.1+ MB


In [108]:
# Chronological hold-out: rows strictly before the cutoff form the training
# set, rows on or after the cutoff form the test set.
# Comparing against a full date (rather than `dt.month <= 8`) keeps the split
# correct if the log ever spans more than one calendar year; a month-only
# test would, for example, put January of a later year into the training set.
SPLIT_DATE = pd.Timestamp("2015-09-01")

events_train = events[events["timestamp"] < SPLIT_DATE]

events_test = events[events["timestamp"] >= SPLIT_DATE]



In [109]:
# Display the training split (pandas truncates the rendering to head/tail rows).
events_train

Unnamed: 0,visitorid,itemid,event,timestamp
0,689859,421640,view,2015-05-03
1,595484,129111,view,2015-05-03
2,596477,233611,view,2015-05-03
3,1224313,68470,view,2015-05-03
4,412359,77602,view,2015-05-03
...,...,...,...,...
2452386,746619,422425,view,2015-08-31
2452387,1227013,241134,view,2015-08-31
2452388,309188,70042,view,2015-08-31
2452389,645525,23347,view,2015-08-31


In [115]:
# Keep a test row only when both its visitor and its item already occur in the
# training events; encoders fitted on the training ids cannot transform ids
# they have never seen.
seen_visitor = events_test["visitorid"].isin(events_train["visitorid"])
seen_item = events_test["itemid"].isin(events_train["itemid"])
events_test = events_test.loc[seen_visitor & seen_item]

events_test

Unnamed: 0,visitorid,itemid,event,timestamp
2452391,79627,172206,view,2015-09-01
2452392,1236028,105869,view,2015-09-01
2452398,80333,283498,view,2015-09-01
2452399,560891,417156,addtocart,2015-09-01
2452404,79627,356384,view,2015-09-01
...,...,...,...,...
2756079,994820,89323,view,2015-09-18
2756085,1146164,165348,view,2015-09-18
2756086,152963,366177,view,2015-09-18
2756090,819670,234361,view,2015-09-18


In [None]:
# Map raw visitor/item ids to contiguous integer codes (0..n_classes-1).
id_cols = ['visitorid', 'itemid']
trans_cat_train = dict()
trans_cat_test = dict()
# Fitted encoder per id column. Kept so that integer codes can later be mapped
# back to the original ids via `id_encoders[col].inverse_transform(codes)`;
# previously the encoder was overwritten on each iteration and lost.
id_encoders = dict()

for k in id_cols:
    cate_enc = LabelEncoder()
    # Fit on the training ids only so the code space is defined by train.
    trans_cat_train[k] = cate_enc.fit_transform(events_train[k].values)
    # transform() raises ValueError for labels not seen during fit, so
    # events_test must already be restricted to ids present in events_train.
    trans_cat_test[k] = cate_enc.transform(events_test[k].values)
    id_encoders[k] = cate_enc