In [1]:
!pip install -q kaggle
!pip install keras-tuner

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras-tuner
  Downloading keras_tuner-1.3.5-py3-none-any.whl (176 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.1/176.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.3.5 kt-legacy-1.0.5


In [2]:
# Standard library
import json
import os
from datetime import datetime

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
# `kerastuner` is the deprecated alias of the KerasTuner package and triggers a
# warning on import; `keras_tuner` is the supported module name.
from keras_tuner.tuners import BayesianOptimization
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Project-local helpers
from model import build_model
from utils import create_chunks, does_file_exists, normalize_data

  from kerastuner.tuners import BayesianOptimization


In [3]:
if not does_file_exists("./kaggle.json"):
  from google.colab import files
  print("Upload your kaggle json api key.")
  files.upload()
  print("api key uploaded successfully!")
  !mkdir ~/.kaggle
  !cp kaggle.json ~/.kaggle/
  !chmod 600 ~/.kaggle/kaggle.json
  !mkdir chunks
else:
  print("api key already exists")

Upload your kaggle json api key.


Saving kaggle.json to kaggle.json
api key uploaded successfully!


In [4]:
if not does_file_exists("./otto-recommender-system.zip"):
  !kaggle competitions download -c otto-recommender-system
  !unzip ./otto-recommender-system.zip
else:
  print("dataset already exists")

Downloading otto-recommender-system.zip to /content
 99% 1.91G/1.94G [00:27<00:00, 54.7MB/s]
100% 1.94G/1.94G [00:27<00:00, 76.7MB/s]
Archive:  ./otto-recommender-system.zip
  inflating: sample_submission.csv   
  inflating: test.jsonl              
  inflating: train.jsonl             


In [5]:
# This project only uses the first 100_000 sessions of the training data,
# which amounts to roughly 5_000_000 event rows in total.
CHUNK_SIZE = 100_000
TOTAL_NUMBER_OF_CHUNKS = 1

# create_chunks is imported from the project's utils module. It presumably
# writes ./chunks/chunk_<n>.json (chunk_1.json is read in the next cell) —
# confirm against utils.
create_chunks(CHUNK_SIZE, TOTAL_NUMBER_OF_CHUNKS)

creating the 1 chunk.


In [6]:
# Load chunk 1 from ./chunks into a DataFrame and preview its first rows.
df = pd.read_json("./chunks/chunk_1.json")
df.head()

Unnamed: 0,session,aid,type,timestamp
0,0,1517085,clicks,2022-07-31 22:00:00.025
1,0,1563459,clicks,2022-07-31 22:01:44.511
2,0,1309446,clicks,2022-08-01 15:23:59.426
3,0,16246,clicks,2022-08-01 15:28:39.997
4,0,1781822,clicks,2022-08-01 15:31:11.344


In [7]:
# (rows, columns) of the loaded chunk.
df.shape

(5227653, 4)

In [8]:
# Number of missing values in each column (isna is the pandas alias of isnull).
null_counts = df.isna().sum()
null_counts

session      0
aid          0
type         0
timestamp    0
dtype: int64

In [9]:
# Encode the product ids (aid) as consecutive integer labels.
# The raw aid column is overwritten; keep label_encoder around, since
# label_encoder.inverse_transform is the way back to the original ids.
label_encoder = LabelEncoder()
df['aid'] = label_encoder.fit_transform(df['aid'])

In [10]:
# Number of events per event type.
df['type'].value_counts()

clicks    4770172
carts      364579
orders      92902
Name: type, dtype: int64

In [11]:
# Distinct session and product counts; both are passed to build_model further down.
# NOTE(review): a distinct count equals (max id + 1) only when the ids run
# 0..n-1 without gaps. That holds for aid after label encoding; session ids are
# not re-encoded — confirm they are contiguous from 0 if build_model uses these
# counts as embedding sizes.
total_unique_sessions = df['session'].nunique()
total_unique_products = df['aid'].nunique()
total_unique_sessions, total_unique_products

(100000, 663079)

In [12]:
# Turn the event type into two 0/1 indicator columns. Any type other than
# 'clicks' or 'orders' (e.g. 'carts') ends up as 0 in both columns.
# A vectorized comparison replaces the per-row Python lambda, which is slow on
# a multi-million-row frame; the result stays int64 with the same values.
df['type_clicks'] = (df['type'] == 'clicks').astype('int64')
df['type_orders'] = (df['type'] == 'orders').astype('int64')
# Reassign instead of inplace=True: same resulting frame, no hidden in-place mutation.
# This cell still consumes the 'type' column, so it can only run once per load.
df = df.drop(columns='type')
df.head()

Unnamed: 0,session,aid,timestamp,type_clicks,type_orders
0,0,541856,2022-07-31 22:00:00.025,1,0
1,0,558586,2022-07-31 22:01:44.511,1,0
2,0,467564,2022-08-01 15:23:59.426,1,0
3,0,5802,2022-08-01 15:28:39.997,1,0
4,0,636874,2022-08-01 15:31:11.344,1,0


In [13]:
# Parse the timestamp column with pandas. The training and evaluation cells
# below feed only session and aid to the model, so this column is not a model
# input there.
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [14]:
# Separate the two indicator targets from the remaining columns, then hold out
# 20% of the rows for testing, stratified on the target combination.
target_columns = ['type_clicks', 'type_orders']
targets = df[target_columns]
features = df.drop(columns=target_columns)

X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=1337,  # fixed seed so the split is reproducible
    stratify=targets,
)

In [15]:
# Sanity check: shapes of the train/test features and targets.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((4182122, 3), (4182122, 2), (1045531, 3), (1045531, 2))

In [16]:
# build_model comes from the project's model module; it receives the distinct
# session and product counts computed earlier. summary() prints the layer table.
model = build_model(total_unique_sessions, total_unique_products)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 session_input (InputLayer)     [(None, 1)]          0           []                               
                                                                                                  
 product_input (InputLayer)     [(None, 1)]          0           []                               
                                                                                                  
 session_embedding (Embedding)  (None, 1, 5)         500000      ['session_input[0][0]']          
                                                                                                  
 product_embedding (Embedding)  (None, 1, 5)         3315395     ['product_input[0][0]']          
                                                                                              

In [17]:
# Train on the (session, product) id pairs; the two indicator columns are the targets.
train_inputs = [X_train['session'].values, X_train['aid'].values]
model.fit(train_inputs, y_train, epochs=10, batch_size=131072)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f7753c76770>

In [18]:
# Score the trained model on the held-out split and report both metrics.
test_inputs = [X_test['session'].values, X_test['aid'].values]
loss, accuracy = model.evaluate(test_inputs, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Test Loss: 0.10571574419736862, Test Accuracy: 0.9822291135787964
