In [1]:
import pandas as pd
import torch
import pytorch_lightning as pl
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import torch.utils.data as data
from pytorch_lightning.callbacks import ModelCheckpoint
import numpy as np
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from torchrecsys.datasets import InteractionsDataset, SequenceDataset
from torchrecsys.models import BaseModel
from torchrecsys.task import Ranking
from torchrecsys.layers import BruteForceLayer
import torchrecsys as trs
import matplotlib 
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Item-level lookup tables (no timestamp column is parsed for these).
candidates = pd.read_csv("data/candidate_items.csv")
features = pd.read_csv("data/item_features.csv")

# Interaction logs; the `date` column is parsed as dates while reading.
train_sessions = pd.read_csv(
    "data/train_sessions.csv",
    parse_dates=["date"],
)
train_purchases = pd.read_csv(
    "data/train_purchases.csv",
    parse_dates=["date"],
)
test_sessions = pd.read_csv(
    "data/test_leaderboard_sessions.csv",
    parse_dates=["date"],
)

# The final-test sessions are read as-is, without date parsing.
final_test = pd.read_csv("data/test_final_sessions.csv")

# Stack train views, train purchases and leaderboard sessions into one frame
# with a fresh 0..n-1 index.
all_interactions = pd.concat(
    [train_sessions, train_purchases, test_sessions],
    ignore_index=True,
)
aux = all_interactions.copy()

# One past the largest item id that appears in any interaction.
n_items = all_interactions["item_id"].max() + 1

In [3]:
import math

# Cycle length of each calendar field this notebook derives via the `.dt`
# accessor. Using a fixed period (instead of the column's observed maximum)
# keeps the two ends of the cycle distinct and makes the encoding identical
# for every frame it is applied to.
_CALENDAR_PERIODS = {
    "hour": 24,       # values 0..23
    "day": 366,       # day of year, values 1..366
    "dayofweek": 7,   # values 0..6
    "weekn": 53,      # ISO week number, values 1..53
    "month": 12,      # values 1..12
}


def encode_cyclically(df, columnname, period=None):
    """Replace a periodic column with its sine/cosine encoding, in place.

    Adds ``sin_<columnname>`` and ``cos_<columnname>`` to ``df`` and drops
    ``columnname``. ``df`` is mutated; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to encode.
    columnname : str
        Name of the numeric column to encode.
    period : float, optional
        Length of one full cycle. When omitted, the value is looked up in
        ``_CALENDAR_PERIODS`` by column name; columns that are not listed
        there fall back to the column's maximum.

    Raises
    ------
    ValueError
        If an explicit or looked-up ``period`` is not positive.
    """
    if period is None:
        period = _CALENDAR_PERIODS.get(columnname)

    if period is None:
        # Unknown column: keep the data-driven scaling. Note that this maps
        # the maximum value onto the same angle as zero.
        period = df[columnname].max()
    elif period <= 0:
        raise ValueError(f"period must be positive, got {period!r}")

    angles = 2 * math.pi * df[columnname] / period
    df["sin_" + columnname] = np.sin(angles)
    df["cos_" + columnname] = np.cos(angles)
    df.drop([columnname], axis=1, inplace=True)

# Break the purchase timestamp into calendar fields.
train_purchases["hour"] = train_purchases["date"].dt.hour
train_purchases["day"] = train_purchases["date"].dt.dayofyear
train_purchases["dayofweek"] = train_purchases["date"].dt.dayofweek
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` is the replacement. It returns a nullable UInt32
# column, so cast to float64 to get ordinary float sin/cos features.
train_purchases["weekn"] = train_purchases["date"].dt.isocalendar().week.astype("float64")
train_purchases["month"] = train_purchases["date"].dt.month
train_purchases["year"] = train_purchases["date"].dt.year

# Swap every periodic field for its sin_/cos_ pair (the raw column is dropped).
for columnname in ["hour", "day", "dayofweek", "weekn", "month"]:
    encode_cyclically(train_purchases, columnname)

# Columns fed to the model: the raw year plus the sin/cos pair of each field
# listed in aux_names.
# NOTE(review): "weekn" is encoded above but is not listed here, so
# sin_weekn / cos_weekn are not used as features — confirm this is intended.
session_feature_names = ["year"]
aux_names = ["hour", "day", "dayofweek", "month"]
for columnname in aux_names:
    session_feature_names.append("sin_"+columnname)
    session_feature_names.append("cos_"+columnname)

# The purchased item is the prediction target; store it as a categorical.
train_purchases["item_id"] = train_purchases["item_id"].astype('category')

In [4]:
# count_series = train_purchases[["weekn","item_id"]].groupby(["weekn","item_id"]).size()
# new_df = count_series.to_frame(name = 'size').reset_index()

In [5]:
# new_df.sort_values(by=['size'], ascending=False)

In [6]:
# Model

In [7]:
# Target (purchased item) and feature matrix, both taken from train_purchases.
y = train_purchases["item_id"].values
X = train_purchases[session_feature_names].values

# Hold out half of the rows; the seed is fixed so the split is repeatable.
x_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.50,
)

# Datasets to evaluate at each iteration: here, the full (X, y) pair.
evalset = [
    (X, y),
]

In [8]:
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, top_k_accuracy_score

In [9]:
# accuracy_score(y_test, y_pred)

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Depth-limited random forest, single worker, fixed seed.
clf = RandomForestClassifier(
    n_jobs=1,
    random_state=0,
    max_depth=5,
)

# Kept as the cell's final expression so the fitted estimator is displayed.
clf.fit(x_train, y_train)

RandomForestClassifier(max_depth=5, n_jobs=1, random_state=0)

In [None]:
# # evaluate random forest algorithm for classification
# from numpy import mean
# from numpy import std
# from sklearn.datasets import make_classification
# from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import RepeatedStratifiedKFold
# from sklearn.ensemble import RandomForestClassifier
# # define dataset
# X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=3)
# # define the model
# model = RandomForestClassifier()
# # evaluate the model
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# # report performance
# print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# Score

In [None]:
# For now, collapse to one row per session: keep the first row seen for each
# session_id and retain only the session id and its date.
test_sessions = test_sessions.drop_duplicates(subset="session_id")[["session_id", "date"]]


In [None]:
# Reload the leaderboard sessions from disk.
# NOTE(review): this re-read replaces the per-session de-duplicated frame built
# in the previous cell — confirm which of the two is meant to be scored.
test_sessions = pd.read_csv("data/test_leaderboard_sessions.csv",  parse_dates=['date'])

# `encode_cyclically` is defined once, in the training feature-engineering
# cell, and is deliberately not redefined here so that train and test rows are
# always encoded by the same code.

# Break the session timestamp into the same calendar fields as the train set.
test_sessions["hour"] = test_sessions["date"].dt.hour
test_sessions["day"] = test_sessions["date"].dt.dayofyear
test_sessions["dayofweek"] = test_sessions["date"].dt.dayofweek
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
# `dt.isocalendar().week` is the replacement. It returns a nullable UInt32
# column, so cast to float64 to get ordinary float sin/cos features.
test_sessions["weekn"] = test_sessions["date"].dt.isocalendar().week.astype("float64")
test_sessions["month"] = test_sessions["date"].dt.month
test_sessions["year"] = test_sessions["date"].dt.year

# Swap every periodic field for its sin_/cos_ pair (the raw column is dropped).
for columnname in ["hour", "day", "dayofweek", "weekn", "month"]:
    encode_cyclically(test_sessions, columnname)

# Rebuild the model's feature list: the raw year plus the sin/cos pair of each
# field in aux_names ("weekn" is encoded above but not listed).
session_feature_names = ["year"]
aux_names = ["hour", "day", "dayofweek", "month"]
for columnname in aux_names:
    session_feature_names.append("sin_"+columnname)
    session_feature_names.append("cos_"+columnname)
    

In [None]:
# Display the test-session frame after the calendar features have been added.
test_sessions