# Processing for Spotify data

#### This version is based on the "**WSDM Cup: The Music Streaming Sessions Dataset**" at https://research.atspotify.com/datasets/, using "Test_Set.tar.gz" (14 GB). After unzipping, we randomly choose one file for creating the simulator ("log_prehistory_20180918_000000000000.csv", 648.37 MB).

In [1]:
import os
import random

import numpy as np
import pandas as pd
import torch

In [2]:
# Only these columns are used downstream; parsing just them avoids loading
# every column of the ~650 MB log into memory before discarding most of them.
columns_to_keep = ['session_id', 'session_position', 'session_length', 'track_id_clean', 'skip_3']
df = pd.read_csv(
    filepath_or_buffer='./log_prehistory_20180918_000000000000.csv',
    usecols=columns_to_keep,
)
# `usecols` keeps the file's own column order, so re-select to pin the order explicitly.
df = df[columns_to_keep]
print(f"Read {len(df)} rows.")
df.head(20)

Read 3942478 rows.


Unnamed: 0,session_id,session_position,session_length,track_id_clean,skip_3
0,65_00000cda-eb14-42b5-be70-64afd00b159e,1,20,t_442fb6aa-1c6a-48db-81f2-f820df5bc4c6,False
1,65_00000cda-eb14-42b5-be70-64afd00b159e,2,20,t_f8fc3210-d734-416a-aac9-0ad43c78e9b4,False
2,65_00000cda-eb14-42b5-be70-64afd00b159e,3,20,t_7e9d453e-035c-46b4-a05d-dc631ec42eff,True
3,65_00000cda-eb14-42b5-be70-64afd00b159e,4,20,t_442fb6aa-1c6a-48db-81f2-f820df5bc4c6,False
4,65_00000cda-eb14-42b5-be70-64afd00b159e,5,20,t_f8fc3210-d734-416a-aac9-0ad43c78e9b4,False
5,65_00000cda-eb14-42b5-be70-64afd00b159e,6,20,t_7e9d453e-035c-46b4-a05d-dc631ec42eff,True
6,65_00000cda-eb14-42b5-be70-64afd00b159e,7,20,t_2426ed3d-aa65-49cf-b323-66d5cd7bedae,True
7,65_00000cda-eb14-42b5-be70-64afd00b159e,8,20,t_5c1d21a3-0d09-4098-b783-09c180e2dcb5,True
8,65_00000cda-eb14-42b5-be70-64afd00b159e,9,20,t_2426ed3d-aa65-49cf-b323-66d5cd7bedae,False
9,65_00000cda-eb14-42b5-be70-64afd00b159e,10,20,t_889887f2-02b4-4cb7-8207-c7df6b115854,True


#### Note: for some reason, session_length is 20 but session_position only covers half of it (e.g. positions 1–10 for a session of length 20). This is consistent across the dataset.

In [3]:
# One row per session is enough to study how session lengths are distributed.
unique_sessions = df.drop_duplicates(subset=['session_id'])
session_lengths = unique_sessions['session_length']
print(f"Before filter out sequence length, we have {session_lengths.unique()} possible sequence length.")
session_lengths.describe()

Before filter out sequence length, we have [20 15 10 11 13 14 17 12 18 19 16] possible sequence length.


count    483798.000000
mean         16.544361
std           3.835032
min          10.000000
25%          13.000000
50%          18.000000
75%          20.000000
max          20.000000
Name: session_length, dtype: float64

#### It turns out that a sequence length of 20 is the most frequent, so we choose 20 as the default sequence length.

In [4]:
# Keep only the rows that belong to sessions whose declared length is 20.
has_length_20 = df['session_length'] == 20
df_20 = df[has_length_20]
kept_percentage = np.round(len(df_20)/len(df)*100,decimals=2)
print(f"Keep {len(df_20)} rows, which is {kept_percentage}% of initial length.")
df_20.head(20)

Keep 2284600 rows, which is 57.95% of initial length.


Unnamed: 0,session_id,session_position,session_length,track_id_clean,skip_3
0,65_00000cda-eb14-42b5-be70-64afd00b159e,1,20,t_442fb6aa-1c6a-48db-81f2-f820df5bc4c6,False
1,65_00000cda-eb14-42b5-be70-64afd00b159e,2,20,t_f8fc3210-d734-416a-aac9-0ad43c78e9b4,False
2,65_00000cda-eb14-42b5-be70-64afd00b159e,3,20,t_7e9d453e-035c-46b4-a05d-dc631ec42eff,True
3,65_00000cda-eb14-42b5-be70-64afd00b159e,4,20,t_442fb6aa-1c6a-48db-81f2-f820df5bc4c6,False
4,65_00000cda-eb14-42b5-be70-64afd00b159e,5,20,t_f8fc3210-d734-416a-aac9-0ad43c78e9b4,False
5,65_00000cda-eb14-42b5-be70-64afd00b159e,6,20,t_7e9d453e-035c-46b4-a05d-dc631ec42eff,True
6,65_00000cda-eb14-42b5-be70-64afd00b159e,7,20,t_2426ed3d-aa65-49cf-b323-66d5cd7bedae,True
7,65_00000cda-eb14-42b5-be70-64afd00b159e,8,20,t_5c1d21a3-0d09-4098-b783-09c180e2dcb5,True
8,65_00000cda-eb14-42b5-be70-64afd00b159e,9,20,t_2426ed3d-aa65-49cf-b323-66d5cd7bedae,False
9,65_00000cda-eb14-42b5-be70-64afd00b159e,10,20,t_889887f2-02b4-4cb7-8207-c7df6b115854,True


In [5]:
# Report how many distinct sessions and tracks survive the length filter.
session_count = df_20['session_id'].nunique()
track_count = df_20['track_id_clean'].nunique()
print(f"After filter out sequence length, we have {session_count} session")
print(f"After filter out sequence length, we have {track_count} track_id")

After filter out sequence length, we have 228460 session


After filter out sequence length, we have 301783 track_id


In [6]:
# Seed shared by the stochastic steps of this notebook.
global_seed = 42

def set_seed(seed):
    """Seed the global RNGs of Python's `random`, NumPy and PyTorch with `seed`."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

#### Dealing with a very large and sparse matrix is memory-consuming, so we wish to work with a smaller one. To limit the matrix size, we sample 6000 unique session_ids to create the sparse matrix. This is not strictly necessary, but it helps us downsize the matrix.


In [7]:
population_size = 6000

# Draw a reproducible random subset of session ids.
sampled_session_ids = df_20['session_id'].drop_duplicates().sample(n=population_size, random_state=global_seed)
# Build an independent frame (no in-place reset on a filtered slice): new columns
# are added to df_20_filtered later, and writing into a slice of df_20 is what
# triggers pandas' SettingWithCopyWarning.
df_20_filtered = df_20[df_20['session_id'].isin(sampled_session_ids)].reset_index(drop=True).copy()
print(f"With a subpopulation of {population_size}, we have {df_20_filtered['session_id'].nunique()} session and {df_20_filtered['track_id_clean'].nunique()} track_id.")


With a subpopulation of 6000, we have 6000 session and 26707 track_id.


In [8]:
# Session x track count matrix: each cell holds how many rows of the sampled
# log pair that session with that track (0 when the pair never occurs).
session_track_matrix = pd.pivot_table(
    df_20_filtered,
    index='session_id',
    columns='track_id_clean',
    aggfunc='size',
    fill_value=0,
)
session_track_matrix

track_id_clean,t_0000496c-869f-4350-83f1-bd1c14ea79ba,t_00042d9b-e795-41a9-89ad-504373dd4287,t_000518e0-996c-46b0-9167-f831f5f8f513,t_00059b93-803a-4c3c-8ea6-d9938bd17264,t_0007a9bf-2faf-4345-b005-388b1d9e3d94,t_000b1102-79f5-497b-848b-6e793089b0ea,t_00174427-afef-4a1c-be96-38c2b6f01396,t_0017689d-4a22-480d-b313-93e38bb63ac1,t_0019b9be-3ea4-4722-800f-2b37cbe5a4b8,t_001e01c6-e635-4a72-8837-dad6c2389b8d,...,t_ffdf73bf-7b3b-476a-9031-e262fa3339e6,t_ffe1e63d-7f6d-47d9-9ecf-65db05ae7ccb,t_ffe3816d-baf0-4ddf-81ba-7598a03b2973,t_ffe4f45a-c85b-487e-b909-c9a401ab9130,t_ffeb7c85-d909-4b98-8fa4-1d7488faea99,t_ffecf968-2ea1-4fda-98e0-ebd43e313425,t_ffefb779-813a-4cd4-ba52-9fa2fa2bdf38,t_fff12ea1-83ef-468b-9fa4-eb80c8c1f8d4,t_fff9b035-3943-4557-bdd0-15fc2c5dc03e,t_fffe4528-f29c-467c-a751-e9db03964faa
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
65_0008dac3-8379-4369-90a1-b7ceccbb2d51,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65_0025a3cc-485b-407e-9fc6-2b58fc07b957,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65_0026980c-c0de-46bd-860b-02b4050bf72c,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65_00285fa4-2c95-45af-a57e-1b73de06b895,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65_0028933d-c089-494a-8d17-900f12b7af10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65_ffe2d9f1-1ffa-4a39-bbff-53c7b7a382ee,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65_ffe678b7-a111-4f84-b3a5-ad559509fc7f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65_ffeb15fb-1e48-4469-808d-ca069506f21d,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65_ffefaee3-5f51-4cd3-bb0b-68143cd4cec4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Dense NumPy view of the session x track count matrix (rows: sessions, columns: tracks).
session_track_matrix_ = session_track_matrix.to_numpy()

# Reduced SVD (full_matrices=False): rows of `u` line up with the sessions and
# columns of `vt` with the tracks of session_track_matrix; `s` holds the singular values.
u, s, vt = np.linalg.svd(session_track_matrix_, full_matrices=False)

#### Looking at the data, the numerical values are too extreme, so we standardize them first. This also helps speed up clustering later.

In [10]:
def standardize(embeddings):
    """Z-score `embeddings` feature-wise (along axis 0).

    Parameters
    ----------
    embeddings : array-like
        Samples along the first axis. A 1-D input is treated as a single
        feature; plain Python sequences are accepted as well.

    Returns
    -------
    numpy.ndarray
        `(embeddings - mean) / std` computed per feature. Features with zero
        standard deviation are divided by 1 instead, so they come out as 0
        rather than NaN/inf.
    """
    embeddings = np.asarray(embeddings)
    mean_vals = np.mean(embeddings, axis=0)
    std_vals = np.std(embeddings, axis=0)
    # np.where also works when std_vals is a NumPy scalar (1-D input), where
    # boolean-mask item assignment would raise a TypeError.
    safe_std = np.where(std_vals == 0, 1.0, std_vals)
    return (embeddings - mean_vals) / safe_std

In [11]:
# Standardize the 50 leading singular directions on the session side (u)
# and on the track side (rows of vt, transposed so tracks become rows).
n_components = 50
normalised_u = standardize(u[:, :n_components])
normalised_v = standardize(vt[:n_components].T)

In [12]:
# Embedding dimension: number of leading standardized components that are kept.
l = 10

session_embeddings, track_embeddings = normalised_u[:, :l], normalised_v[:, :l]

print(f"Track embedding shape of {track_embeddings.shape} and session embedding shape of {session_embeddings.shape}")

Track embedding shape of (26707, 10) and session embedding shape of (6000, 10)


In [13]:
# Peek at the first row of session_embeddings as a sanity check.
session_embeddings[0]

array([ 0.3865183 , -0.04937551,  0.07676365, -0.08082001,  0.12558457,
       -0.02435866, -0.05186017,  0.07997957, -0.11185562, -0.11709471])

In [14]:
# Peek at the first row of track_embeddings as a sanity check.
track_embeddings[0]

array([ 0.04759581, -0.08667425,  0.04949918, -0.00480566,  0.02343548,
        0.03090921,  0.0017227 , -0.01937166, -0.00455287, -0.02484083])

In [15]:
from sklearn.cluster import KMeans

# The number of clusters is set equal to the embedding dimension `l`.
num_clusters = l

# Plain (unconstrained) k-means on the track embeddings; fit() returns the estimator itself.
kmeans = KMeans(n_clusters=num_clusters, n_init='auto', random_state=1234).fit(track_embeddings)
track_clusters = kmeans.labels_
print(track_clusters.shape)

# Count how many tracks land in each cluster.
cluster_distribution = np.bincount(track_clusters, minlength=num_clusters)
print("Cluster distribution:", cluster_distribution)

(26707,)
Cluster distribution: [26661    12    11     1     1     6     1     4     6     4]


#### We observe that using a standard K-means algorithm without constraints results in highly imbalanced clusters. With additional constraints on the cluster sizes, we get more reasonable results.

In [16]:
from k_means_constrained import KMeansConstrained

num_clusters = l
# Every cluster may deviate by at most 50 tracks from a perfectly even split.
even_split = len(track_embeddings) // num_clusters
min_size, max_size = even_split - 50, even_split + 50

clf = KMeansConstrained(
    n_clusters=num_clusters,
    size_min=min_size,
    size_max=max_size,
    random_state=1234,
)
clf.fit(track_embeddings)
track_clusters_balanced = clf.labels_

# Count how many tracks land in each size-constrained cluster.
cluster_distribution = np.bincount(track_clusters_balanced)
print("Balanced cluster distribution:", cluster_distribution)


Balanced cluster distribution: [2720 2627 2720 2720 2720 2620 2620 2620 2720 2620]


In [17]:
# Cluster centres of the size-constrained k-means fit; row k is the centroid of cluster k.
centroid_vectors = clf.cluster_centers_
print("Centroid vectors for each cluster:\n for instance, the first one: \n", centroid_vectors[0])

Centroid vectors for each cluster:
 for instance, the first one: 
 [ 0.06953203 -0.01117762  0.01909637 -0.02113779  0.03288418 -0.00680944
 -0.01537758  0.02478403 -0.03482736 -0.036354  ]


In [18]:
# Columns of session_track_matrix are the track ids, in the same order as the
# rows of track_embeddings that were clustered, so they pair up with the labels.
track_id_to_cluster = dict(zip(session_track_matrix.columns, track_clusters_balanced))
len(track_id_to_cluster)

26707

In [19]:
def track_to_cluster_vector(track_id, track_id_to_cluster, num_clusters):
    """Return the one-hot cluster indicator of a track.

    Parameters
    ----------
    track_id : hashable
        Track identifier to look up.
    track_id_to_cluster : mapping
        Maps each track id to its integer cluster index.
    num_clusters : int
        Length of the returned vector.

    Returns
    -------
    numpy.ndarray
        Integer vector of length `num_clusters` with a single 1 at the
        track's cluster index.

    Raises
    ------
    KeyError
        If `track_id` is not in `track_id_to_cluster`.
    """
    # Index directly instead of using .get(): a missing id would yield None, and
    # `cluster_vector[None] = 1` is a NumPy newaxis assignment that silently
    # sets *every* entry to 1.
    cluster_index = track_id_to_cluster[track_id]
    cluster_vector = np.zeros(num_clusters, dtype=int)
    cluster_vector[cluster_index] = 1
    return cluster_vector

# One-hot cluster indicator for the track played in each row.
df_20_filtered['F'] = df_20_filtered['track_id_clean'].map(
    lambda track_id: track_to_cluster_vector(track_id, track_id_to_cluster, num_clusters)
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_20_filtered['F'] = df_20_filtered['track_id_clean'].apply(lambda x: track_to_cluster_vector(x, track_id_to_cluster, num_clusters))


In [20]:
def get_centroid_vector(track_id, track_id_to_cluster, centroid_vectors):
    """Return the centroid of the cluster a track belongs to.

    Parameters
    ----------
    track_id : hashable
        Track identifier to look up.
    track_id_to_cluster : mapping
        Maps each track id to its integer cluster index.
    centroid_vectors : numpy.ndarray
        Array indexed by cluster index along its first axis.

    Returns
    -------
    numpy.ndarray
        `centroid_vectors[cluster_index]` for the track's cluster.

    Raises
    ------
    KeyError
        If `track_id` is not in `track_id_to_cluster`.
    """
    # Index directly instead of using .get(): a missing id would yield None, and
    # `centroid_vectors[None]` silently returns the whole array with an extra axis.
    return centroid_vectors[track_id_to_cluster[track_id]]

# Centroid of the cluster that the track played in each row belongs to.
df_20_filtered['F_vector'] = df_20_filtered['track_id_clean'].map(
    lambda track_id: get_centroid_vector(track_id, track_id_to_cluster, centroid_vectors)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_20_filtered['F_vector'] = df_20_filtered['track_id_clean'].apply(lambda x: get_centroid_vector(x, track_id_to_cluster, centroid_vectors))


In [21]:
# Inspect the frame after the F (one-hot cluster) and F_vector (centroid) columns were added.
df_20_filtered.head(20)

Unnamed: 0,session_id,session_position,session_length,track_id_clean,skip_3,F,F_vector
0,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,1,20,t_8eecd43d-3146-47ab-9ff6-3d300abe3216,True,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0695320313643396, -0.011177622068879926, 0...."
1,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,2,20,t_47040682-2572-4b22-a3c3-7a3426634371,True,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]","[0.06953126222753003, -0.011178203525205396, 0..."
2,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,3,20,t_c3e3086e-b38f-467b-b05a-9aa511a8aef5,True,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]","[0.06953172399542647, -0.011177516303562769, 0..."
3,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,4,20,t_22435160-213e-4aac-903e-9e60d643d761,True,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0.0695004329733046, -0.011181794135149432, 0...."
4,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,5,20,t_8163fb61-4a3f-4b45-9b3f-f125da29c86b,True,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0695320313643396, -0.011177622068879926, 0...."
5,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,6,20,t_e241fa6f-b682-4114-98ad-70f79a0dd99b,True,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]","[0.05367344872695791, -0.0060483777861048615, ..."
6,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,7,20,t_7acef85a-fc13-4038-afe6-00d31c373f4b,True,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0695320313643396, -0.011177622068879926, 0...."
7,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,8,20,t_2bb41382-4b9c-4d5e-98ee-b321b675a6ff,True,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0.0695004329733046, -0.011181794135149432, 0...."
8,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,9,20,t_758b880c-dbb8-403e-b7c5-e3932f707aa9,True,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0695320313643396, -0.011177622068879926, 0...."
9,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,10,20,t_27da1c72-64d5-432b-b520-2071e99e8ffe,False,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]","[0.05197004493717251, -0.011174131555338772, -..."


#### We now create the X value based on skip_3. In the official documentation, skip_3 is described as "**Skip3: Boolean indicating if most of the track was played**". To reflect this value, we convert it to a reward: if it is True, the reward is 1; otherwise, the reward is 0.

In [22]:
def assign_skip_value(row):
    """Map the truthiness of `row['skip_3']` to an integer reward: 1 if truthy, else 0."""
    return int(bool(row['skip_3']))

set_seed(global_seed)
# Vectorised form of the 1-if-truthy-else-0 reward: casting the skip_3 column
# avoids a slow Python-level pass over every row (DataFrame.apply with axis=1).
df_20_filtered['skip_value'] = df_20_filtered['skip_3'].astype(bool).astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_20_filtered['skip_value'] = df_20_filtered.apply(assign_skip_value, axis=1)


In [23]:
# Inspect the frame after the skip_value reward column was added.
df_20_filtered.head(40)

Unnamed: 0,session_id,session_position,session_length,track_id_clean,skip_3,F,F_vector,skip_value
0,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,1,20,t_8eecd43d-3146-47ab-9ff6-3d300abe3216,True,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0695320313643396, -0.011177622068879926, 0....",1
1,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,2,20,t_47040682-2572-4b22-a3c3-7a3426634371,True,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]","[0.06953126222753003, -0.011178203525205396, 0...",1
2,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,3,20,t_c3e3086e-b38f-467b-b05a-9aa511a8aef5,True,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]","[0.06953172399542647, -0.011177516303562769, 0...",1
3,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,4,20,t_22435160-213e-4aac-903e-9e60d643d761,True,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0.0695004329733046, -0.011181794135149432, 0....",1
4,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,5,20,t_8163fb61-4a3f-4b45-9b3f-f125da29c86b,True,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0695320313643396, -0.011177622068879926, 0....",1
5,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,6,20,t_e241fa6f-b682-4114-98ad-70f79a0dd99b,True,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]","[0.05367344872695791, -0.0060483777861048615, ...",1
6,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,7,20,t_7acef85a-fc13-4038-afe6-00d31c373f4b,True,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0695320313643396, -0.011177622068879926, 0....",1
7,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,8,20,t_2bb41382-4b9c-4d5e-98ee-b321b675a6ff,True,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0.0695004329733046, -0.011181794135149432, 0....",1
8,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,9,20,t_758b880c-dbb8-403e-b7c5-e3932f707aa9,True,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0695320313643396, -0.011177622068879926, 0....",1
9,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,10,20,t_27da1c72-64d5-432b-b520-2071e99e8ffe,False,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]","[0.05197004493717251, -0.011174131555338772, -...",0


#### We further augment the skip_3 value into an aggregated effect over the sequence. The aggregation is applied within each session_id, and we name the resulting effect X. (Note: the implementation below uses a running cumulative sum plus Gaussian noise; no decay factor is currently applied.)

In [24]:
def compute_X(group):
    """Add a noisy running total of `skip_value` as column `X`.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one session, with a `skip_value` column.

    Returns
    -------
    pandas.DataFrame
        A new frame equal to `group` plus an `X` column holding the cumulative
        sum of `skip_value` with N(0, 0.5) noise added to each entry. `group`
        itself is left unmodified.
    """
    # Running total of the rewards up to (and including) each row.
    skip_cumsum = group['skip_value'].cumsum()
    # Noise comes from NumPy's global RNG, so seed it beforehand for reproducibility.
    noise = np.random.normal(0, 0.5, size=len(skip_cumsum))
    # assign() returns a new frame instead of mutating the group handed in by groupby.apply.
    return group.assign(X=skip_cumsum + noise)

# Re-seed right before drawing noise so the X column is reproducible.
set_seed(global_seed)
# Apply compute_X session by session, then flatten the index back to 0..n-1.
# NOTE(review): this relies on groupby.apply keeping the 'session_id' grouping column
# in the frames passed to compute_X; newer pandas versions deprecate that — confirm
# against the pandas version used.
df_20_filtered = df_20_filtered.groupby('session_id').apply(compute_X).reset_index(drop=True)

In [25]:
# Inspect the frame after the noisy cumulative reward column X was added.
df_20_filtered.head()

Unnamed: 0,session_id,session_position,session_length,track_id_clean,skip_3,F,F_vector,skip_value,X
0,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,1,20,t_8eecd43d-3146-47ab-9ff6-3d300abe3216,True,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0695320313643396, -0.011177622068879926, 0....",1,1.248357
1,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,2,20,t_47040682-2572-4b22-a3c3-7a3426634371,True,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]","[0.06953126222753003, -0.011178203525205396, 0...",1,1.930868
2,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,3,20,t_c3e3086e-b38f-467b-b05a-9aa511a8aef5,True,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]","[0.06953172399542647, -0.011177516303562769, 0...",1,3.323844
3,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,4,20,t_22435160-213e-4aac-903e-9e60d643d761,True,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0.0695004329733046, -0.011181794135149432, 0....",1,4.761515
4,65_0008dac3-8379-4369-90a1-b7ceccbb2d51,5,20,t_8163fb61-4a3f-4b45-9b3f-f125da29c86b,True,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0.0695320313643396, -0.011177622068879926, 0....",1,4.882923


In [26]:
# Keep only the columns needed for the simulator. The explicit copy makes df_final
# independent of df_20_filtered, so adding or overwriting columns on it later does
# not raise pandas' SettingWithCopyWarning.
df_final = df_20_filtered[['F', 'X', 'F_vector', 'session_id']].copy()
df_final.head()

Unnamed: 0,F,X,F_vector,session_id
0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1.248357,"[0.0695320313643396, -0.011177622068879926, 0....",65_0008dac3-8379-4369-90a1-b7ceccbb2d51
1,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]",1.930868,"[0.06953126222753003, -0.011178203525205396, 0...",65_0008dac3-8379-4369-90a1-b7ceccbb2d51
2,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]",3.323844,"[0.06953172399542647, -0.011177516303562769, 0...",65_0008dac3-8379-4369-90a1-b7ceccbb2d51
3,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",4.761515,"[0.0695004329733046, -0.011181794135149432, 0....",65_0008dac3-8379-4369-90a1-b7ceccbb2d51
4,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",4.882923,"[0.0695320313643396, -0.011177622068879926, 0....",65_0008dac3-8379-4369-90a1-b7ceccbb2d51


In [27]:
z_dim = 5

unique_sessions = df_final['session_id'].unique()
# Row i of session_embeddings belongs to row i of session_track_matrix (the matrix
# that was factorised), so key the lookup by that index. Enumerating the sessions in
# the order they happen to appear in df_final only gives the right pairing when the
# two orders coincide.
session_to_Z = {session: session_embeddings[i][:z_dim] for i, session in enumerate(session_track_matrix.index)}

# assign() builds a new frame, so no column is written into a slice of another frame.
df_final = df_final.assign(Z=df_final['session_id'].map(session_to_Z))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['Z'] = df_final['session_id'].map(session_to_Z)


In [28]:
# Inspect the final frame after the per-session latent vector Z was attached.
df_final.head()

Unnamed: 0,F,X,F_vector,session_id,Z
0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",1.248357,"[0.0695320313643396, -0.011177622068879926, 0....",65_0008dac3-8379-4369-90a1-b7ceccbb2d51,"[0.3865182998978819, -0.04937550520932488, 0.0..."
1,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]",1.930868,"[0.06953126222753003, -0.011178203525205396, 0...",65_0008dac3-8379-4369-90a1-b7ceccbb2d51,"[0.3865182998978819, -0.04937550520932488, 0.0..."
2,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]",3.323844,"[0.06953172399542647, -0.011177516303562769, 0...",65_0008dac3-8379-4369-90a1-b7ceccbb2d51,"[0.3865182998978819, -0.04937550520932488, 0.0..."
3,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]",4.761515,"[0.0695004329733046, -0.011181794135149432, 0....",65_0008dac3-8379-4369-90a1-b7ceccbb2d51,"[0.3865182998978819, -0.04937550520932488, 0.0..."
4,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",4.882923,"[0.0695320313643396, -0.011177622068879926, 0....",65_0008dac3-8379-4369-90a1-b7ceccbb2d51,"[0.3865182998978819, -0.04937550520932488, 0.0..."


In [29]:
# N: number of sessions.
N = df_final['session_id'].nunique()
# T: number of time steps per session. Take the longest session rather than the
# first one, so every session is guaranteed to fit along the time axis of the
# (N, T, ...) arrays allocated from these dimensions.
T = df_final.groupby('session_id').size().max()
# l: length of the one-hot cluster vector F; z_dim: length of the latent vector Z.
l = len(df_final['F'].iloc[0])
z_dim = len(df_final['Z'].iloc[0])

print(f'We have a time series of {N} X {T} X {l}. and z of dimension {z_dim}')

We have a time series of 6000 X 10 X 10. and z of dimension 5


In [30]:
# Pre-allocate the dense outputs; any time step a session does not fill stays 0.
F_array, F_vector_array = np.zeros((N, T, l)), np.zeros((N, T, l))
X_array = np.zeros((N, T))
Z_array = np.zeros((N, z_dim))

for i, (session_id, group) in enumerate(df_final.groupby('session_id')):
    # Restore the row order of df_final inside the session.
    group = group.sort_index()
    n_steps = len(group)
    F_array[i, :n_steps, :] = np.stack(group['F'].tolist())
    F_vector_array[i, :n_steps, :] = np.stack(group['F_vector'].tolist())
    X_array[i, :n_steps] = group['X'].to_numpy()
    # Z is constant within a session, so its first entry is representative.
    Z_array[i] = group['Z'].iloc[0]

In [31]:
# np.save does not create missing directories, so make sure ./data exists first.
os.makedirs('./data', exist_ok=True)

np.save('./data/x.npy', X_array)
np.save('./data/f_vec.npy', F_vector_array)
np.save('./data/f.npy', F_array)
np.save('./data/z.npy', Z_array)

In [32]:
import json

# Array-valued columns are serialised as JSON lists so they survive the CSV round trip.
for column in ('F', 'F_vector', 'Z'):
    df_final[column] = df_final[column].apply(lambda values: json.dumps(values.tolist()))

df_final.to_csv('spotify_data.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['F'] = df_final['F'].apply(lambda x: json.dumps(x.tolist()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['F_vector'] = df_final['F_vector'].apply(lambda x: json.dumps(x.tolist()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['Z'] = df_final['Z'].apply(lambda x: json.