In [None]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from glob import glob
from tqdm import tqdm
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam

2023-07-04 03:30:38.219234: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Set the path to our data
data_path = Path('dataset/')

In [None]:
# Set the size of the chunks we'll load from the JSONL file
chunksize = 100_000

# Set whether to save the processed chunks to disk as Parquet files
save = True

# Load the JSONL file in chunks
chunks = pd.read_json(data_path / 'train.jsonl', lines=True, chunksize=chunksize)

In [None]:
# Create the output directory; exist_ok=True keeps the cell safe to re-run
os.makedirs('train_parquet', exist_ok=True)

In [None]:
# Flatten every chunk of sessions into one row per event and, if requested,
# persist it as a Parquet file.
# NOTE(review): total=129 only sets the progress-bar length; it presumably is the
# number of chunks of train.jsonl at chunksize=100_000 -- confirm if either changes.
for e, chunk in enumerate(tqdm(chunks, total=129)):

    # Column-oriented buffers: one entry per event of this chunk
    event_dict = {
        'session': [],
        'aid': [],
        'ts': [],
        'type': [],
    }

    # Each row of the chunk carries a session id and that session's list of events
    for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):

        # Repeat the session id for each of its events
        for event in events:
            event_dict['session'].append(session)
            event_dict['aid'].append(event['aid'])
            event_dict['ts'].append(event['ts'])
            event_dict['type'].append(event['type'])

    # Zero-padded row range covered by this chunk, used in the file name
    start = str(e*chunksize).zfill(9)
    end = str(e*chunksize+chunksize).zfill(9)

    # Convert the event dictionary to a DataFrame
    event_df = pd.DataFrame(event_dict)

    # Persist the chunk only when saving is enabled
    if save:

        # The file name includes the range of indices included in this chunk
        event_df.to_parquet(f"train_parquet/{start}_{end}.parquet")

100%|██████████| 129/129 [17:41<00:00,  8.23s/it]


Collect sorted list of parquet files in the directory

In [None]:
file_paths = sorted(glob('train_parquet/*'))[:20]
dataframes = []

Loop over each file

In [None]:
# Read every selected Parquet file and collect the resulting frames
dataframes.extend(pd.read_parquet(parquet_file) for parquet_file in tqdm(file_paths))

100%|██████████| 5/5 [00:05<00:00,  1.04s/it]


Concatenate all the dataframes into a single dataframe

In [None]:
# Stack all loaded frames and renumber the rows 0..n-1
dataframe_copy = pd.concat(dataframes, ignore_index=True)

In [None]:
del dataframes

In [None]:
dataframe_copy

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800025,clicks
1,0,1563459,1659304904511,clicks
2,0,1309446,1659367439426,clicks
3,0,16246,1659367719997,clicks
4,0,1781822,1659367871344,clicks
...,...,...,...,...
22547351,499999,218792,1659716490375,clicks
22547352,499999,687266,1659716504581,clicks
22547353,499999,218792,1659716518923,clicks
22547354,499999,862274,1659716743631,clicks


# Deep Learning Approach

### Exploring the Deep Learning Approach

Our recommendation model involves several key steps, each serving a crucial role in the process.

1. **Loading and Preprocessing Data:** The first step involves loading data from multiple Parquet files into a combined pandas DataFrame. We then create a copy of the DataFrame. After this, we map the unique aid identifiers (aid) to integers, enhancing computational efficiency for the later steps.

2. **Splitting the Dataset:** Next, we split the dataset into a training set and a test set based on unique sessions, reserving the last 20% of unique sessions for the test set. The training set and test set are split in such a way that all records associated with a particular session remain together in either the training set or the test set.

3. **Preparing Data for Training:** In this step, we first group the training data by 'session' and 'type', creating lists of aid IDs for each group. We do the same for the test data. We then convert these lists into NumPy arrays and apply padding to ensure all arrays have the same length.

4. **Defining the Model:** Here, we define a Sequential model with Keras, starting with an Embedding layer that takes the unique aid identifiers as input and outputs a dense 20-dimensional vector. We then add a Bidirectional LSTM layer. LSTMs (Long Short-Term Memory) are a type of Recurrent Neural Network (RNN) that are great for sequence prediction problems because they can store past information. This LSTM layer has 64 units and includes dropout and recurrent dropout for regularization. The final layer is a Dense output layer with a softmax activation function, which outputs a probability distribution over all unique aid IDs.

5. **Compiling and Training the Model:** We use the Adam optimizer with a learning rate of 0.01, which is a popular choice as it combines the best properties of the AdaGrad and RMSProp algorithms. The model is compiled with a categorical crossentropy loss function, which is suitable for multiclass classification problems. After compiling, we train the model on the training data.

6. **Generating Recommendations:** For each session in the test set, we use the trained model to predict the next aid to be visited. This prediction is based on the sequence of aids visited in the session so far. The output is a probability distribution over all aids, and we select the top 20 aids with the highest probabilities to recommend.

7. **Preparing Submission:** Finally, we map the predicted aid IDs back to their original values and prepare a DataFrame for submission, which includes the session, type, and predicted labels.

Filter out sessions that do not include all types

In [None]:
# Number of distinct event types seen in each session
counts = dataframe_copy.groupby('session')['type'].nunique()
# Keep only the sessions in which three distinct event types occur
full_sessions = counts[counts == 3].index
# .copy() detaches the filtered frame from the original, so adding columns to it
# later does not raise SettingWithCopyWarning
dataframe_copy = dataframe_copy[dataframe_copy['session'].isin(full_sessions)].copy()

In [None]:
dataframe_copy

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800025,clicks
1,0,1563459,1659304904511,clicks
2,0,1309446,1659367439426,clicks
3,0,16246,1659367719997,clicks
4,0,1781822,1659367871344,clicks
...,...,...,...,...
22547307,499997,1123617,1661270771250,clicks
22547308,499997,1123617,1661270784442,carts
22547309,499997,1123617,1661270822296,clicks
22547310,499997,1573056,1661342245430,clicks


Generate an array of unique IDs equal to the number of unique aid ids

In [None]:
unique_ids = np.arange(dataframe_copy.aid.nunique())

Shuffle the array to prevent any correlation between new labels and outcome

In [None]:
# Shuffle with a fixed seed so the aid -> aid_id mapping is the same on every run.
# The model is saved and reloaded later, and its class indices can only be mapped
# back to aids with the mapping that was used for training.
np.random.default_rng(42).shuffle(unique_ids)

In [None]:
unique_ids

array([710336, 804044, 819830, ..., 488958, 243820,  43201])

In [None]:
dataframe_copy

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800025,clicks
1,0,1563459,1659304904511,clicks
2,0,1309446,1659367439426,clicks
3,0,16246,1659367719997,clicks
4,0,1781822,1659367871344,clicks
...,...,...,...,...
22547307,499997,1123617,1661270771250,clicks
22547308,499997,1123617,1661270784442,carts
22547309,499997,1123617,1661270822296,clicks
22547310,499997,1573056,1661342245430,clicks


Create a dictionary mapping each unique aid id to a new unique integer ID

In [None]:
# Pair every distinct aid (in order of first appearance) with one shuffled integer id
map_aid = dict(zip(dataframe_copy.aid.unique(), unique_ids))

Add a new column to the dataframe with the new integer IDs for each aid

In [None]:
# assign() returns a new frame, so the column is never written into a slice of
# another DataFrame (which is what triggers SettingWithCopyWarning)
dataframe_copy = dataframe_copy.assign(aid_id=dataframe_copy['aid'].map(map_aid))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe_copy['aid_id'] = dataframe_copy['aid'].map(map_aid)


In [None]:
dataframe_copy

Unnamed: 0,session,aid,ts,type,aid_id
0,0,1517085,1659304800025,clicks,710336
1,0,1563459,1659304904511,clicks,804044
2,0,1309446,1659367439426,clicks,819830
3,0,16246,1659367719997,clicks,1006226
4,0,1781822,1659367871344,clicks,656039
...,...,...,...,...,...
22547307,499997,1123617,1661270771250,clicks,783490
22547308,499997,1123617,1661270784442,carts,783490
22547309,499997,1123617,1661270822296,clicks,783490
22547310,499997,1573056,1661342245430,clicks,140632


# Splitting

Test contains the last 20% of total sessions because we need to evaluate our model's ability to predict aid values for future sessions.

In [None]:
# Calculate the number of sessions for the test set (Last 20 % of total sessions)
num_sessions = dataframe_copy['session'].nunique()
test_sessions = int(num_sessions * 0.2)

In [None]:
test_sessions

24098

In [None]:
# Session ids in order of first appearance; the last `test_sessions` of them
# form the test set.
all_session_ids = dataframe_copy['session'].unique()
# Slice from an explicit start index: `[-test_sessions:]` would select *every*
# session when test_sessions is 0.
test_session_ids = all_session_ids[len(all_session_ids) - test_sessions:]

# Split the DataFrame based on the session IDs (membership computed only once)
in_test = dataframe_copy['session'].isin(test_session_ids)
train_df = dataframe_copy[~in_test]
test_df = dataframe_copy[in_test]

# Print the sizes of train and test sets (number of event rows)
print("Train set size:", len(train_df))
print("Test set size:", len(test_df))

Train set size: 10349751
Test set size: 2216025


In [None]:
train_df

Unnamed: 0,session,aid,ts,type,aid_id
0,0,1517085,1659304800025,clicks,758765
1,0,1563459,1659304904511,clicks,171974
2,0,1309446,1659367439426,clicks,265567
3,0,16246,1659367719997,clicks,56166
4,0,1781822,1659367871344,clicks,876049
...,...,...,...,...,...
18414088,391632,352410,1661284901978,clicks,241797
18414089,391632,352410,1661284952683,carts,241797
18414090,391632,1168501,1661284958323,clicks,743324
18414091,391632,352410,1661284962669,clicks,241797


In [None]:
test_df

Unnamed: 0,session,aid,ts,type,aid_id
18414219,391642,1188834,1659347558823,clicks,1009404
18414220,391642,1512980,1659374468005,clicks,121138
18414221,391642,668046,1659374492037,clicks,158322
18414222,391642,1264540,1659374561357,clicks,908091
18414223,391642,1264540,1659374588919,clicks,908091
...,...,...,...,...,...
22547307,499997,1123617,1661270771250,clicks,48395
22547308,499997,1123617,1661270784442,carts,48395
22547309,499997,1123617,1661270822296,clicks,48395
22547310,499997,1573056,1661342245430,clicks,685633


Group the dataframe by session and event type, aggregating the aid_ids of each group into a list

In [None]:
# One row per (session, type) pair; 'aid_id' holds that group's aid_ids as a list
group_keys = ['session', 'type']
train_dataframe = train_df.groupby(group_keys).agg({'aid_id': list})
test_dataframe = test_df.groupby(group_keys).agg({'aid_id': list})

In [None]:
train_dataframe

Unnamed: 0_level_0,Unnamed: 1_level_0,aid_id
session,type,Unnamed: 2_level_1
0,carts,"[898872, 318948, 893739, 895985, 895985, 89598..."
0,clicks,"[758765, 171974, 265567, 56166, 876049, 254363..."
0,orders,"[631232, 318948, 10188, 371619]"
3,carts,"[767594, 492848, 8064, 204961, 186935, 347, 53..."
3,clicks,"[767594, 492848, 492848, 767594, 492848, 96887..."
...,...,...
391629,clicks,"[614459, 690972, 690972, 651886, 836486, 69902..."
391629,orders,[690972]
391632,carts,"[199400, 343272, 441064, 548834, 548834, 54883..."
391632,clicks,"[88697, 213951, 406029, 607814, 406029, 8467, ..."


Filter out any (session, type) groups with more than 20 aid_ids

In [None]:
# Keep only the groups whose aid_id list has at most 20 entries
train_is_short = train_dataframe.aid_id.str.len() <= 20
test_is_short = test_dataframe.aid_id.str.len() <= 20
train_dataframe = train_dataframe[train_is_short]
test_dataframe = test_dataframe[test_is_short]

1- Train

`map(len, ...)` is applied to the 'aid_id' column of 'train_dataframe', which yields the length of each aid_id list.
The `max` function then picks the largest of these lengths.
So, 'max_length' is the length of the longest aid_id sequence in the training dataframe.
Every sequence is left-padded with zeros to this length; the last element of each padded sequence becomes the label and the preceding elements become the model input.

In [None]:
train_sequences = list(train_dataframe.aid_id)
max_length = max(len(seq) for seq in train_sequences)

# Right-align every sequence in a zero-filled int32 matrix (i.e. left-pad with 0).
# NOTE(review): 0 is also a valid aid_id (the ids come from np.arange), so padding
# cannot be told apart from that item -- confirm whether this is acceptable.
X = np.zeros((len(train_sequences), max_length), dtype='int32')
for row, seq in enumerate(train_sequences):
    X[row, max_length - len(seq):] = seq

# The last aid_id of each sequence is the label; everything before it is the input
X_train, y_train_label = X[:, :-1], X[:, -1]

In [None]:
X_train

array([[     0,      0,      0, ..., 337156,  84168, 239188],
       [     0,      0,      0, ..., 319146, 235313, 239188],
       [     0,      0,      0, ...,  67924, 175678, 123385],
       ...,
       [     0,      0,      0, ...,  38526,  69426, 125793],
       [     0,      0,      0, ...,  29366, 173884,  29366],
       [     0,      0,      0, ..., 173884,  21916, 262289]], dtype=int32)

2- Test

In [None]:
# Length of the longest test sequence (kept for inspection only)
max_length_test = max(map(len, test_dataframe.aid_id))

# Pad -- or, if a test sequence is longer, truncate to its most recent items --
# to the *training* width `max_length`, so X_test has exactly as many columns as
# X_train, which is the input length the model is built with.
X_t = np.asarray([
    [0] * (max_length - len(xi)) + xi[-max_length:]
    for xi in test_dataframe.aid_id
]).astype('int32')

# The last aid_id of each sequence is the label; everything before it is the input
X_test = X_t[:, :-1]
y_test_label = X_t[:, -1]

In [None]:
# One-hot encode the labels; the class count is the number of distinct aids.
# to_categorical builds a dense (n_samples, num_classes) array for each label set.
num_classes = dataframe_copy.aid.nunique()
y_train = tf.keras.utils.to_categorical(y_train_label, num_classes=num_classes)
y_test = tf.keras.utils.to_categorical(y_test_label, num_classes=num_classes)

In [None]:
# Free intermediates that are not used again.
# dataframe_copy, test_df and test_dataframe are deliberately kept: the model
# definition, the submission cell and the ground-truth cells further down still
# read them, and deleting them here makes those cells fail with NameError.
del train_df
del train_dataframe
del y_train_label
del y_test_label

# Define the DL model

In [None]:
# Number of classes, taken from the width of the one-hot label matrix rather than
# from dataframe_copy, which may already have been deleted to free memory.
num_aids = y_train.shape[1]

model = tf.keras.models.Sequential()

# Learn a 20-dimensional vector for every aid_id
model.add(layers.Embedding(input_dim=num_aids, output_dim=20, input_length=X_train.shape[1]))

# Bidirectional LSTM over the aid_id sequence, with input and recurrent dropout
model.add(layers.Bidirectional(layers.LSTM(64, dropout=0.2, recurrent_dropout=0.2)))

# Probability distribution over all aid_ids for the next item
model.add(layers.Dense(num_aids, activation='softmax'))

optimizer = Adam(learning_rate=0.01)

model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

2023-07-01 07:07:17.851253: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [None]:
history = model.fit(X_train, y_train, epochs=32, verbose=1)

2023-07-01 07:07:29.911934: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 46870099232 exceeds 10% of free system memory.


Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


Save the model after training

In [None]:
model.save('DNN_model.h5')

# Loading the model

In [None]:
# Specify the path to the saved model directory
model_path = 'DNN_model.h5'

# Load the model
model = tf.keras.models.load_model(model_path)

2023-07-03 08:59:01.138145: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


Make predictions using the trained model

In [None]:
import gc
gc.collect()

21

In [None]:
# Reverse the mapping dictionary (aid_id -> original aid)
reverse_map = {v: k for k, v in map_aid.items()}

# Use the model to predict a probability for every aid_id, per test sequence
y_test_pred = model.predict(X_test)

# For each prediction, find the 20 most probable aid_ids. argpartition selects
# them without fully sorting every row; their order within the 20 is arbitrary.
# (Requires at least 20 classes.)
top_20_aid_ids = np.argpartition(y_test_pred, -20, axis=1)[:, -20:]

# Map the `aid_id` values back to `aid` values
top_20_aids = np.vectorize(reverse_map.get)(top_20_aid_ids)

# Sort each row numerically by aid so the output is deterministic.
# Note that this is NOT a ranking by predicted probability.
top_20_aids_sorted = np.sort(top_20_aids, axis=1)



2023-07-03 08:59:31.128637: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 9424207936 exceeds 10% of free system memory.


In [None]:
# One key per test group, e.g. "<session>_<type>", taken from the (session, type) index
session_type_keys = [f"{sess}_{kind}" for sess, kind in test_dataframe.index]

# Space-separated list of the recommended aids for each group
label_strings = [' '.join(str(aid) for aid in row) for row in top_20_aids_sorted]

results = pd.DataFrame({'session_type': session_type_keys, 'labels': label_strings})

# Write the results to a CSV file (without the row index)
results.to_csv('submission.csv', index=False)

In [None]:
df = pd.read_csv('submission.csv')

Generating labels for the test data from historical events (function adapted from the official OTTO GitHub repository)

In [None]:
from typing import List, Dict

def ground_truth(events: List[Dict]):
    """Attach to every event the aids that occur after it, grouped by event type.

    Walks ``events`` from last to first. Every event dict is mutated in place: it
    gains a ``"labels"`` dict that holds, for each of ``clicks``/``carts``/``orders``
    with at least one later event, a list of the aids of those later events in
    their original order.

    Args:
        events: Event dicts, each providing at least ``"type"`` and ``"aid"``.

    Returns:
        The same (now labelled) event dicts without the last one.
    """
    later_aids = {"clicks": [], "carts": [], "orders": []}

    for current in events[::-1]:
        # Snapshot the non-empty lists so later insertions do not leak into this event
        current["labels"] = {
            label: list(aids) for label, aids in later_aids.items() if aids
        }

        # Prepend, so each list stays in chronological order while walking backwards
        event_type = current["type"]
        if event_type in later_aids:
            later_aids[event_type].insert(0, current["aid"])

    return events[:-1]

In [None]:
test_dataframe.reset_index(inplace=True)

In [None]:
test = test_df[test_df['session'].isin(test_dataframe['session'])]

In [None]:
# Overwrite 'aid' with the integer 'aid_id' and drop the helper column.
# assign()/drop() build new frames, which avoids the SettingWithCopyWarning that
# item assignment on the filtered `test` slice produces.
# NOTE(review): the labels written from this frame therefore carry aid_id values,
# while the submission maps predictions back to original aids via reverse_map --
# confirm that both files are meant to use the same id space.
test = test.assign(aid=test['aid_id']).drop(columns='aid_id')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['aid'] = test['aid_id']


In [None]:
# Convert the DataFrame to a list of dictionaries
test_events = test.to_dict('records')

In [None]:
# Call the function with the list of event dictionaries
# NOTE(review): test_events is one flat list holding the events of every test
# session, and ground_truth() has no notion of sessions. Labels of an event can
# therefore include aids from other sessions, and only the very last event overall
# is dropped. Confirm whether labelling each session separately was intended.
ground_truth_events = ground_truth(test_events)

In [None]:
# Serialise every labelled event as one JSON document per line (JSON Lines)
import json

with open('test_labels.jsonl', 'w') as f:
    f.writelines(json.dumps(event) + '\n' for event in ground_truth_events)

In [None]:
import json

# Show only the first record of the labels file as a sanity check
with open('test_labels.jsonl', 'r') as f:
    first_line = next(f, None)

if first_line is not None:
    event = json.loads(first_line)
    print('--- Event', 1, '---')
    print(json.dumps(event, indent=4))  # Print formatted JSON

--- Event 1 ---
{
    "session": 69527,
    "aid": 302080,
    "ts": 1659314407307,
    "type": "clicks",
    "labels": {
        "clicks": [
            149420,
            275872,
            189898,
            183306,
            189935,
            183306,
            338648,
            338565,
            21553,
            200407,
            145606,
            221304,
            136446,
            145606,
            200407,
            173115,
            298985,
            77380,
            222224,
            20352,
            302080,
            50068,
            113698,
            107256,
            260040,
            260040,
            50068,
            169087,
            169087,
            113698,
            55219,
            302080,
            83625,
            132976,
            105693,
            105693,
            105693,
            113698,
            149420,
            249687,
            276645,
            249687,
            310602,
     

In [None]:
!pip install beartype

Collecting beartype
  Downloading beartype-0.14.1-py3-none-any.whl (739 kB)
[K     |████████████████████████████████| 739 kB 7.0 MB/s eta 0:00:01
[?25hInstalling collected packages: beartype
Successfully installed beartype-0.14.1


In [None]:
!python -m evaluate --test-labels test_labels.jsonl --predictions submission.csv

INFO:root:Reading labels from test_labels.jsonl
Preparing labels: 100%|█████████████████| 10000/10000 [00:05<00:00, 1800.38it/s]
INFO:root:Read 80 labels
INFO:root:Reading predictions from submission.csv
Preparing predictions: 100%|███████████| 29998/29998 [00:01<00:00, 26007.25it/s]
INFO:root:Read 6022 predictions
INFO:root:Calculating scores
Evaluating sessions: 100%|███████████████████| 80/80 [00:00<00:00, 69227.22it/s]
INFO:root:Scores: {'clicks': 0.1435137547052574, 'carts': 0.5789133247089263, 'orders': 0.6167213114754099, 'total': 0.5580581597684495}


# Co-visitation Matrix

### Understanding the Item-Item Collaborative Filtering Approach

Our recommendation approach involves several steps, each serving an essential role in the process.

1. **Mapping unique aid identifiers:** This is our preprocessing stage where we convert each unique aid ID in the dataset into an integer. This conversion is vital for enhanced performance, particularly because we need to generate a matrix, i.e., the co-visitation matrix, that utilizes these IDs.

2. **Splitting the dataset:** Next, we partition our data into a training set and a test set, based on unique sessions. We opt for sessions, rather than individual records, to ensure all records associated with a particular session remain together in either the training or test set.

3. **Creating a co-visitation matrix:** This step embodies the essence of Item-Item collaborative filtering. We construct a co-visitation matrix using the training data, where the entries account for the frequency at which each pair of aids (items) were visited during the same session. Consequently, this matrix encapsulates the "similarity" between pairs of items grounded on their co-visitation count. This similarity is then leveraged to generate recommendations.

4. **Generating recommendations:** For each unique session in the test set, we calculate a "score" for each aid. This score is the sum of the co-visitation counts of that aid with all other aids visited during that session, as documented in the co-visitation matrix. Subsequently, we select the top 20 aids boasting the highest scores. These aids are those that are most "similar" (i.e., frequently co-visited) to the aids in the current session, and are therefore recommended.

5. **Evaluating the model:** Finally, we assess our model's performance using recall. For each session and event type, the algorithm determines what fraction of the actual aids of that type feature among the top 20 recommended aids. These recall values are averaged per event type, and the final performance metric is the weighted sum of the three averages (clicks 0.1, carts 0.3, orders 0.6).

This approach stands out as an Item-Item Collaborative Filtering technique, where recommendations are based on the similarity between items. It differs from User-User Collaborative Filtering, where the similarity between users (sessions, in this context) would drive the recommendation process.


In [None]:
from collections import Counter
from itertools import combinations
from scipy import sparse as sps
from tqdm import tqdm

In [None]:
# Set the path to our data
data_path = Path('dataset/')

In [None]:
file_paths = sorted(glob('train_parquet/*'))[:20]
dataframes = []

In [None]:
for file_path in tqdm(file_paths):
    dataframes.append(pd.read_parquet(file_path))

100%|██████████| 5/5 [00:06<00:00,  1.22s/it]


In [None]:
dataframes = pd.concat(dataframes).reset_index(drop=True)
dataframes

In [None]:
dataframe_copy = dataframes.copy()

In [None]:
del dataframes

In [None]:
dataframe_copy

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800025,clicks
1,0,1563459,1659304904511,clicks
2,0,1309446,1659367439426,clicks
3,0,16246,1659367719997,clicks
4,0,1781822,1659367871344,clicks
...,...,...,...,...
22547351,499999,218792,1659716490375,clicks
22547352,499999,687266,1659716504581,clicks
22547353,499999,218792,1659716518923,clicks
22547354,499999,862274,1659716743631,clicks


In [None]:
# Map the unique aid ids to integers
unique_ids = np.arange(dataframe_copy.aid.nunique())
# Fixed seed: the aid -> aid_id mapping (and everything indexed by aid_id, such as
# the co-visitation matrix) is then identical on every run
np.random.default_rng(42).shuffle(unique_ids)
map_aid = dict(zip(dataframe_copy.aid.unique(), unique_ids))

In [None]:
# Reverse mapping for later use
reverse_map_aid = {v: k for k, v in map_aid.items()}

In [None]:
# Assign each aid an integer id
dataframe_copy['aid_id'] = dataframe_copy['aid'].map(map_aid)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe_copy['aid_id'] = dataframe_copy['aid'].map(map_aid)


In [None]:
# Splitting data into training and test set
num_sessions = dataframe_copy['session'].nunique()  # total number of sessions
test_sessions = int(num_sessions * 0.01)  # 1% of total sessions for testing
# Session ids in order of first appearance; take the last `test_sessions` of them.
# An explicit start index is used because `[-test_sessions:]` would select *every*
# session when test_sessions is 0.
all_session_ids = dataframe_copy['session'].unique()
test_session_ids = all_session_ids[len(all_session_ids) - test_sessions:]  # session ids for test set

In [None]:
# Rows whose session was selected for testing go to test_df, all others to train_df
is_test_row = dataframe_copy['session'].isin(test_session_ids)
train_df = dataframe_copy[~is_test_row]
test_df = dataframe_copy[is_test_row]

In [None]:
# Count co-visitations in the training data: for every session, each pair of
# aid_ids (earlier event first, later event second) increments that pair's counter.
co_visits = Counter()
for _session, session_aids in train_df.groupby('session')['aid_id']:
    co_visits.update(combinations(session_aids.values, 2))

In [None]:
# Turn the pair counter into a square sparse matrix indexed by aid_id:
# entry (r, c) is the count stored for the pair (r, c)
n_items = len(unique_ids)
rows, cols = zip(*co_visits)  # iterating a Counter yields its keys, i.e. the pairs
data = [*co_visits.values()]
co_matrix = sps.coo_matrix((data, (rows, cols)), shape=(n_items, n_items))
# CSR supports the row slicing and summing used when generating recommendations
co_matrix_csr = co_matrix.tocsr()

In [None]:
def recall_per_session(predicted_aids, actual_aids):
    """Return the fraction of distinct actual aids that were recommended.

    Both inputs are reduced to sets, so an aid that was visited several times
    counts once in the numerator *and* in the denominator.

    Args:
        predicted_aids: Iterable of recommended aids.
        actual_aids: Iterable of aids that were actually visited.

    Returns:
        Recall in [0, 1]; 0.0 when ``actual_aids`` is empty.
    """
    actual = set(actual_aids)
    if not actual:
        # Nothing to recall; avoid dividing by zero
        return 0.0
    hits = len(actual & set(predicted_aids))
    return hits / len(actual)

def generate_recommendations(df):
    """Recommend 20 aid_ids for every (session, type) group of ``df``.

    For each group, the rows of the module-level ``co_matrix_csr`` belonging to the
    group's aid_ids are summed, which gives one score per aid_id; the 20 aid_ids
    with the highest scores are kept.

    Args:
        df: DataFrame with 'session', 'type' and 'aid_id' columns.

    Returns:
        dict mapping (session, type) to an array of 20 aid_ids, in no particular
        order.
    """
    recommendations = {}
    grouped = df.groupby(['session', 'type'])
    for (session, type_), group in tqdm(grouped, "Generating recommendations"):
        visited = group['aid_id'].values
        # Sum the co-visitation rows of all visited aid_ids -> one score per aid_id
        summed_rows = co_matrix_csr[visited].sum(axis=0)
        scores = np.asarray(summed_rows).squeeze()
        # argpartition places the 20 largest scores last, unordered among themselves
        recommendations[(session, type_)] = np.argpartition(scores, -20)[-20:]
    return recommendations

In [None]:
# Generate recommendations for test set
test_recommendations = generate_recommendations(test_df)

Generating recommendations: 100%|██████████| 3612/3612 [01:03<00:00, 57.14it/s]


In [None]:
# Print the top 20 recommendations for each session and type in the test set.
# The recommendations are stored as internal aid_ids, so they are mapped back to
# the original aid values before printing.
for (session_id, type_), top20_aid_ids in test_recommendations.items():
    top20_aids = np.array([reverse_map_aid[aid_id] for aid_id in top20_aid_ids])
    print(f"Session ID: {session_id}, Type: {type_}")
    print(f"Recommended aid values: {top20_aids}")
    print("\n")

Session ID: 494453, Type: carts
Recommended aid values: [ 538784  565928  203840  212138  913250 1005065  498029  101103  748427
  108118  947482  726955  789397  569482  523262  639091  299897  136961
  591416   66250]


Session ID: 494453, Type: clicks
Recommended aid values: [712196 212816 913312 789397  50983 445871 746656 845728 443845  78158
 585963 448448 224751 523262 898705 158628 661375 833121 952600 591416]


Session ID: 494453, Type: orders
Recommended aid values: [341262 172443 579818 153585 479047 843431 140250 746149 162460 254988
 450537 144646 573789 158628 593683 836864 968772 745548 213673 661339]


Session ID: 494458, Type: carts
Recommended aid values: [846676 262774 840721 353615 642322 378195 818726 569785 638657  32538
 934602 263906 794144 443784 946352 746761 160030 595866 883477 429208]


Session ID: 494458, Type: clicks
Recommended aid values: [846676 262774 840721 934602 160030 818726 946352 569785 429208 638657
 353615 595866 746761  32538 794144 263906 37

Session ID: 498735, Type: carts
Recommended aid values: [   2264  178665  805833  982657  735208  578705  887673  668889  159915
 1004135  282161  728310  132072  485375  761780  321911   19560  948728
  473473   87997]


Session ID: 498735, Type: clicks
Recommended aid values: [ 473059  980914  639068  485375  159915  363640  291629  658445  948728
  132072  282161  449859   19560  733526  736232  224751  100952 1004135
  220048   87997]


Session ID: 498735, Type: orders
Recommended aid values: [  41281  321911   44393  736321  761780  582621  159915  282161  393943
  218205  178665  485375  527088  668889  713034  923049   87997  132072
  360371 1004135]


Session ID: 498736, Type: carts
Recommended aid values: [442346 120427 357964 486220 555124 237872 682605 391801 506312 540975
 683935 281866 261637 230287   6717 248941 793043 454170 146603 290306]


Session ID: 498736, Type: clicks
Recommended aid values: [201292 717751 824774 854542 330946 132116 524998 591416 888323 411370
 28

In [None]:
# Calculate recall for test set
recalls = {"clicks": [], "carts": [], "orders": []}
weights = {"clicks": 0.1, "carts": 0.3, "orders": 0.6}

In [None]:
#for (session, type_), actual in test_df.groupby(['session', 'type'])['aid_id']:
#    if (session, type_) in test_recommendations:
#        predicted = test_recommendations[(session, type_)]
#        recalls[type_].append(recall_per_session(predicted, actual))

In [None]:
# Calculate recall for test set.
# recalls must be a dict keyed by event type: it is indexed with `type_` below and
# iterated with .items(), neither of which works on a list.
recalls = {"clicks": [], "carts": [], "orders": []}
# NOTE(review): `actual` is the same set of aid_ids that generate_recommendations
# received as input for this group, so this measures how well a group's own items
# are recovered, not how well unseen items are predicted -- confirm this is intended.
for (session, type_), actual in test_df.groupby(['session', 'type'])['aid_id']:
    if (session, type_) in test_recommendations:
        predicted = test_recommendations[(session, type_)]
        recalls[type_].append(recall_per_session(predicted, actual))

# Calculate weighted recall for test set: mean recall per type, weighted and summed.
# Types without any evaluated group are skipped instead of contributing NaN.
weighted_recall = sum(
    weights[type_] * np.mean(type_recalls)
    for type_, type_recalls in recalls.items()
    if type_recalls
)

print("Weighted recall on test set: ", weighted_recall)

Weighted Recall on test set:  0.4178251472236578


# Matrix Factorization

### Understanding the Matrix Factorization Approach

Our recommendation approach entails several critical steps to build and evaluate a Matrix Factorization model for our session-based recommendation task. Here's a breakdown of these steps:

1. **Mapping unique aid and session identifiers:** The preprocessing stage involves converting each unique aid ID and session ID in the dataset into an integer. This transformation ensures improved performance during the creation of an interaction matrix. This interaction matrix represents the session-aid interactions where rows represent sessions and columns represent aids. Entries in this matrix are binary, indicating whether an aid was visited during a particular session.

2. **Splitting the dataset:** Next, we partition our data into a training set and a test set, based on unique sessions. We opt for sessions, rather than individual records, to ensure all records associated with a particular session remain together in either the training or test set.

3. **Creating a Session-aid Interaction Matrix:** We create an interaction matrix, where the rows represent unique sessions, and the columns represent unique aids. If a particular aid is visited during a specific session, the corresponding entry in the matrix is marked as 1; otherwise, it remains 0. This matrix plays a crucial role in characterizing the interactions between sessions and aids.

4. **Matrix Factorization using Non-negative Matrix Factorization (NMF):** At the heart of our approach is the application of Non-negative Matrix Factorization (NMF) on the interaction matrix. NMF decomposes the interaction matrix into two lower-rank matrices: one representing the 'session-latent factor' relationships and the other representing 'aid-latent factor' relationships. The latent factors can be thought of as underlying patterns that explain the observed interactions between sessions and aids.

5. **Generating Recommendations:** For each unique session, we compute a 'score' for each aid based on the dot product of the session's latent factor vector and the latent factor vectors of all aids. This score measures the likelihood of the session interacting with each aid. We then select the top 20 aids with the highest scores as our recommendations.

6. **Evaluating the Model:** Finally, we assess the performance of our model using recall. For each session, the algorithm calculates how many of the actual aids visited during that session are included in the top 20 recommended aids. The final performance metric is the average of these recall values across all sessions.

Matrix Factorization, unlike the item-item (neighbourhood-based) collaborative filtering used above, doesn't rely on explicit similarity measures between items or users. Instead, it infers latent factors from the interaction data, which captures the underlying patterns driving the interactions. These inferred patterns are then used to make recommendations.

In [None]:
from scipy import sparse as sps
from sklearn.decomposition import NMF
from tqdm import tqdm

In [None]:
# Set the path to our data: the directory holding the raw dataset files
# (the same location train.jsonl was read from earlier in the notebook)
data_path = Path('dataset/')

In [None]:
# Accumulator for the per-file DataFrames that the next cell reads in
dataframes = list()

# Keep only the first 20 parquet chunks, taken in sorted filename order
file_paths = sorted(glob('train_parquet/*'))
file_paths = file_paths[:20]

In [None]:
# Read every selected parquet chunk (with a progress bar) and collect the frames
dataframes.extend(pd.read_parquet(path) for path in tqdm(file_paths))

100%|██████████| 5/5 [00:06<00:00,  1.25s/it]


In [None]:
# Stack the per-file frames into one DataFrame with a fresh 0..n-1 row index
dataframes = pd.concat(dataframes, ignore_index=True)
dataframes

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800025,clicks
1,0,1563459,1659304904511,clicks
2,0,1309446,1659367439426,clicks
3,0,16246,1659367719997,clicks
4,0,1781822,1659367871344,clicks
...,...,...,...,...
22547351,499999,218792,1659716490375,clicks
22547352,499999,687266,1659716504581,clicks
22547353,499999,218792,1659716518923,clicks
22547354,499999,862274,1659716743631,clicks


In [None]:
# Work on a copy: the cells below add columns (aid_id, session_type, session_id)
# to dataframe_copy, which leaves the concatenated `dataframes` frame unmodified.
dataframe_copy = dataframes.copy()

In [None]:
# Display the working copy (no columns have been added to it yet)
dataframe_copy

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800025,clicks
1,0,1563459,1659304904511,clicks
2,0,1309446,1659367439426,clicks
3,0,16246,1659367719997,clicks
4,0,1781822,1659367871344,clicks
...,...,...,...,...
22547351,499999,218792,1659716490375,clicks
22547352,499999,687266,1659716504581,clicks
22547353,499999,218792,1659716518923,clicks
22547354,499999,862274,1659716743631,clicks


In [None]:
# Give every distinct aid a random integer id in [0, number of distinct aids).
# The ids are shuffled with NumPy's global RNG, so the assignment changes from
# run to run unless that RNG was seeded beforehand.
unique_ids = np.arange(dataframe_copy['aid'].nunique())
np.random.shuffle(unique_ids)
map_aid = dict(zip(dataframe_copy['aid'].unique(), unique_ids))

In [None]:
# Inverse lookup (integer id -> original aid), kept for later use
reverse_map_aid = dict(zip(map_aid.values(), map_aid.keys()))

# Attach the integer id of each row's aid
dataframe_copy['aid_id'] = dataframe_copy.aid.map(map_aid)

# Build a "<session>_<type>" key identifying each session & type pair
dataframe_copy['session_type'] = (
    dataframe_copy.session.astype(str) + '_' + dataframe_copy['type']
)

In [None]:
# NOTE: the 'aid_id' column is not re-assigned here; the previous cell already
# created it from map_aid, and repeating the map over every row was redundant.

# Map the unique session ids to integers in [0, number of distinct sessions).
# The shuffle uses NumPy's global RNG, so the mapping varies between runs
# unless that RNG was seeded beforehand.
unique_sessions = np.arange(dataframe_copy.session.nunique())
np.random.shuffle(unique_sessions)
map_session = {i:j for i, j in zip(dataframe_copy.session.unique(), unique_sessions)}

# Assign each session an integer id
dataframe_copy['session_id'] = dataframe_copy['session'].map(map_session)

In [None]:
# Splitting data into training and test set.
# Whole sessions (not individual rows) are assigned to a side, so every event
# of a session ends up in the same split.
num_sessions = dataframe_copy['session'].nunique()  # total number of sessions
test_sessions = int(num_sessions * 0.2)  # 20% of total sessions for testing

# Take the last `test_sessions` sessions (in order of first appearance).
# An explicit start index is used because `[-test_sessions:]` would select
# *every* session when test_sessions is 0.
all_session_ids = dataframe_copy['session'].unique()
test_session_ids = all_session_ids[len(all_session_ids) - test_sessions:]  # session ids for test set

# Create training and test dataframes from a single membership mask
is_test_row = dataframe_copy['session'].isin(test_session_ids)
train_df = dataframe_copy[~is_test_row]
test_df = dataframe_copy[is_test_row]

In [None]:
# Preview the training split (rows whose session is not in test_session_ids)
train_df

Unnamed: 0,session,aid,ts,type,aid_id,session_type,session_id
0,0,1517085,1659304800025,clicks,1237022,0_clicks,347641
1,0,1563459,1659304904511,clicks,312397,0_clicks,347641
2,0,1309446,1659367439426,clicks,64457,0_clicks,347641
3,0,16246,1659367719997,clicks,462668,0_clicks,347641
4,0,1781822,1659367871344,clicks,705412,0_clicks,347641
...,...,...,...,...,...,...,...
18739942,399999,680003,1661371629957,clicks,1299238,399999_clicks,476997
18739943,399999,1810497,1661371643038,clicks,612305,399999_clicks,476997
18739944,399999,1810497,1661703105912,clicks,612305,399999_clicks,476997
18739945,399999,1073953,1661703159544,clicks,884634,399999_clicks,476997


In [None]:
# Preview the test split (rows whose session is in test_session_ids)
test_df

Unnamed: 0,session,aid,ts,type,aid_id,session_type,session_id
18739947,400000,1365182,1659348034546,clicks,1245076,400000_clicks,207441
18739948,400000,394152,1659348055819,clicks,979322,400000_clicks,207441
18739949,400000,394152,1659348182607,clicks,979322,400000_clicks,207441
18739950,400000,394152,1659348194817,clicks,979322,400000_clicks,207441
18739951,400000,394152,1659348203227,clicks,979322,400000_clicks,207441
...,...,...,...,...,...,...,...
22547351,499999,218792,1659716490375,clicks,1056345,499999_clicks,332485
22547352,499999,687266,1659716504581,clicks,446698,499999_clicks,332485
22547353,499999,218792,1659716518923,clicks,1056345,499999_clicks,332485
22547354,499999,862274,1659716743631,clicks,42506,499999_clicks,332485


In [None]:
# Create session-aid matrix (rows = session_id, columns = aid_id).
# Keep one entry per distinct (session, aid) pair so every stored value is a
# binary 1 flag; duplicate coordinates would otherwise be summed into counts.
session_aid_pairs = train_df[['session_id', 'aid_id']].drop_duplicates()
rows, cols = session_aid_pairs['session_id'], session_aid_pairs['aid_id']
data = np.ones(len(rows))

# Give the matrix an explicit shape covering every mapped session and aid id,
# not just the largest ids that happen to occur in train_df. The factor
# matrices then have a row for every session_id and a column for every aid_id.
# NOTE(review): sessions held out in test_df have all-zero rows here, because
# only train_df interactions are inserted.
matrix = sps.coo_matrix(
    (data, (rows, cols)),
    shape=(len(map_session), len(map_aid)),
)

# The factorization cell reads the matrix under this name.
session_aid_matrix = matrix.tocsr()

In [None]:
# Perform matrix factorization using Non-negative Matrix Factorization (NMF)
# NOTE(review): this cell reads `session_aid_matrix`, which must already be
# defined by an earlier cell (presumably the sparse session x aid matrix).
n_components = 1000 # Number of components to keep
# init='random' together with a fixed random_state makes the starting factors repeatable
nmf = NMF(n_components=n_components, init='random', random_state=0)
# Per scikit-learn's NMF API: W has one row per matrix row and n_components
# columns; H has n_components rows and one column per matrix column.
W = nmf.fit_transform(session_aid_matrix)  # Session-latent factors matrix
H = nmf.components_  # aid-latent factors matrix



In [None]:
# Function to calculate recall
def recall_per_session(predicted_aids, actual_aids):
    """Return the fraction of distinct actual aids that appear in the predictions.

    Args:
        predicted_aids: Iterable of recommended aid identifiers.
        actual_aids: Iterable of aid identifiers actually seen; repeats are
            allowed and are counted once.

    Returns:
        A float in [0, 1]: ``|predicted ∩ actual| / |distinct actual|``.
        Returns 0.0 when ``actual_aids`` is empty instead of dividing by zero.
    """
    # Deduplicate the ground truth: hits are counted on sets, so the
    # denominator must be too, otherwise repeated aids deflate the recall.
    actual_unique = set(actual_aids)
    if not actual_unique:
        return 0.0
    hits = len(set(predicted_aids) & actual_unique)
    return hits / len(actual_unique)

# Function to generate recommendations for each session
def generate_recommendations(df, top_k=20):
    """Recommend the highest-scoring aids for every session present in ``df``.

    Each session is scored against all aids with the dot product of its row in
    the module-level ``W`` and the module-level ``H`` (the NMF factors).

    Args:
        df: DataFrame with a ``session_id`` column; its values are used as row
            indices into ``W``.
        top_k: Number of aids to recommend per session (default 20). It is
            capped at the number of columns of ``H``, so a small catalogue no
            longer makes ``np.argpartition`` fail.

    Returns:
        dict mapping each session_id to a NumPy array of at most ``top_k``
        column indices of ``H`` (presumably ``aid_id`` values), ordered from
        the highest score to the lowest.
    """
    n_aids = H.shape[1]
    k = min(top_k, n_aids)
    recommendations = dict()
    for session in tqdm(df['session_id'].unique(), "Generating recommendations"):
        session_vector = W[session]
        scores = np.dot(session_vector, H)
        if k <= 0:
            top_aid_ids = np.empty(0, dtype=np.intp)
        else:
            # argpartition finds the k best in linear time but leaves them unordered
            top_aid_ids = np.argpartition(scores, -k)[-k:]
            # order only those k candidates by descending score
            top_aid_ids = top_aid_ids[np.argsort(scores[top_aid_ids])[::-1]]
        recommendations[session] = top_aid_ids
    return recommendations

In [None]:
# Generate recommendations for each session
# (result: dict keyed by session_id, holding one array of recommended aid indices per test session)
recommendations = generate_recommendations(test_df)

Generating recommendations: 100%|██████████| 12/12 [00:00<00:00, 336.31it/s]


In [None]:
# Get the session-type for each session_id
session_type_df = test_df[['session', 'type', 'session_id']].drop_duplicates()

# Print the top 20 recommendations for each session and type in the test set.
# `recommendations` holds internal aid_id indices, so translate them back to
# the original aid values with reverse_map_aid before printing them as
# "aid values".
for _, row in session_type_df.iterrows():
    if row['session_id'] in recommendations:
        recommended_aids = np.array(
            [reverse_map_aid[aid_id] for aid_id in recommendations[row['session_id']]]
        )
        print(f"Session ID: {row['session']}, Type: {row['type']}")
        print(f"Recommended aid values: {recommended_aids}")
        print("\n")

Session ID: 1245, Type: carts
Recommended aid values: [15163 15165 15164 15167 15162 15168 15160 15159 15166 15158 15177 15169
 15170 15171 15172 15173 15174 15175 15176 45512]


Session ID: 1245, Type: clicks
Recommended aid values: [15163 15165 15164 15167 15162 15168 15160 15159 15166 15158 15177 15169
 15170 15171 15172 15173 15174 15175 15176 45512]


Session ID: 1246, Type: clicks
Recommended aid values: [15163 15165 15164 15167 15162 15168 15160 15159 15166 15158 15177 15169
 15170 15171 15172 15173 15174 15175 15176 45512]


Session ID: 1247, Type: clicks
Recommended aid values: [15163 15165 15164 15167 15162 15168 15160 15159 15166 15158 15177 15169
 15170 15171 15172 15173 15174 15175 15176 45512]


Session ID: 1247, Type: carts
Recommended aid values: [15163 15165 15164 15167 15162 15168 15160 15159 15166 15158 15177 15169
 15170 15171 15172 15173 15174 15175 15176 45512]


Session ID: 1248, Type: clicks
Recommended aid values: [15163 15165 15164 15167 15162 15168 15160 1515

In [None]:
# To calculate the weighted recall with our specified weights, the grouping in
# the code needs to take into account the 'type' of the action.

# Generate recommendations for test set
test_recommendations = generate_recommendations(test_df)

# Calculate recall for test set
recalls = {"clicks": [], "carts": [], "orders": []}
weights = {"clicks": 0.1, "carts": 0.3, "orders": 0.6}

# Recommendations are keyed by session_id alone (one array per session), so
# look them up by session and score that array against the aids of each
# event type. A (session, type) tuple key never matches and would leave every
# recall list empty.
for (session, type_), group in test_df.groupby(['session_id', 'type']):
    actual = group['aid_id'].values
    if session in test_recommendations:
        predicted = test_recommendations[session]
        recalls[type_].append(recall_per_session(predicted, actual))

In [None]:
# Calculate weighted recall for test set:
# average the recalls of each event type, scale by that type's weight, add up
weighted_recall = sum(
    np.mean(type_recalls) * weights[event_type]
    for event_type, type_recalls in recalls.items()
)

print("Weighted recall on test set: ", weighted_recall)

Weighted Recall on test set: 0.3358752896514369
