In [1]:
import pandas as pd

In [2]:
data_file = "./data/mini_sparkify_event_data.parquet"

In [3]:
df = pd.read_parquet(data_file)

In [4]:
columns = df.columns

In [5]:
columns

Index(['artist', 'auth', 'firstName', 'gender', 'itemInSession', 'lastName',
       'length', 'level', 'location', 'method', 'page', 'registration',
       'sessionId', 'song', 'status', 'ts', 'userAgent', 'userId'],
      dtype='object')

In [6]:
groupBy = "userId"
orderBy = "ts"

In [7]:
# copy the `userId` column into a new column as a deep copy


In [8]:
# print original df out as a string so it can be copied into a test
# sample only the first 5 rows
from tabulate import tabulate
original_sample = df.head(5)
print(tabulate(original_sample, headers='keys', tablefmt='psql'))

+----+------------------+-----------+-------------+----------+-----------------+------------+----------+---------+--------------------------------+----------+----------+----------------+-------------+-------------------------------+----------+---------------+-----------------------------------------------------------------------------------------------------------------+----------+
|    | artist           | auth      | firstName   | gender   |   itemInSession | lastName   |   length | level   | location                       | method   | page     |   registration |   sessionId | song                          |   status |            ts | userAgent                                                                                                       |   userId |
|----+------------------+-----------+-------------+----------+-----------------+------------+----------+---------+--------------------------------+----------+----------+----------------+-------------+------------------------------

In [9]:
grouped = df.sort_values(orderBy).groupby(groupBy)

In [10]:
sorted_df = grouped.apply(lambda x: x.to_dict("records")).reset_index()

  sorted_df = grouped.apply(lambda x: x.to_dict("records")).reset_index()


In [11]:
sorted_df.columns

Index(['userId', 0], dtype='object')

In [12]:
len(sorted_df)

226

In [13]:
sorted_df.head(5)

Unnamed: 0,userId,0
0,,"[{'artist': None, 'auth': 'Logged Out', 'first..."
1,10.0,"[{'artist': 'Sea Wolf', 'auth': 'Logged In', '..."
2,100.0,"[{'artist': 'Evanescence', 'auth': 'Logged In'..."
3,100001.0,"[{'artist': None, 'auth': 'Logged In', 'firstN..."
4,100002.0,"[{'artist': None, 'auth': 'Logged In', 'firstN..."


In [14]:
from sentence_transformers import SentenceTransformer

In [15]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [16]:
name_cache = {} # maps things like 'artist' to the embedding of 'artist'

In [17]:
data_cache = {} # maps the data to the embedding of the data

In [18]:
import torch

In [19]:
from tqdm.auto import tqdm

In [20]:
def encode_row(row):
    result = []
    for d in row[0]: # row[1] contains the actual data
        for k, v in d.items():
            if k not in [groupBy, orderBy]:
                if k not in name_cache:
                    name_cache[k] = model.encode(k, convert_to_tensor=True)
                v = str(v)
                if v not in data_cache:
                    data_cache[v] = model.encode(str(v), convert_to_tensor=True)
                k_emb = name_cache[k]
                v_emb = data_cache[v]
                summed = k_emb + v_emb
                result.append(summed)
    return torch.stack(result)

In [21]:
tensors = []

In [22]:
# processed = sorted_df.apply(encode_row, axis=1)
for index, row in tqdm(sorted_df.iterrows(), total=len(sorted_df)):
    tensors.append(encode_row(row))

  0%|          | 0/226 [00:00<?, ?it/s]

8346
18


In [24]:
tensors[0].shape

torch.Size([133536, 384])

In [None]:
unique_pages_file = "./data/unique_pages.json"

In [None]:
import json

In [None]:
with open(unique_pages_file) as f:
    unique_pages = json.load(f)

In [None]:
def encode_row_targets(row):
    result = []
    for d in row[0]:
        for k, v in d.items():
            if k == "page":
                idx = unique_pages[v]
            else:
                idx = -1
            result.append(idx)
    return torch.tensor(result)

In [None]:
targets = []
for index, row in tqdm(sorted_df.iterrows(), total=len(sorted_df)):
    targets.append(encode_row_targets(row))
targets = torch.stack(targets)