# Data Preprocessing

## Dataloading

In [25]:
import numpy as np
import pandas as pd
import pickle

In [26]:
# Load the raw playlist export. Rows that cannot be parsed are skipped instead
# of raising, so the loaded frame may contain fewer rows than the file.
raw_csv_path = "../data/csv/playlists_dataset/spotify_dataset.csv"
base_df = pd.read_csv(raw_csv_path, on_bad_lines="skip")

In [27]:
# Keep only the four columns used by the rest of the notebook.
kept_columns = ["trackname", "playlistname", "artistname", "user_id"]
clean_df = base_df.loc[:, kept_columns]

## Dropping rows with missing values

In [28]:
# Discard every row in which at least one of the kept columns is missing.
clean_df = clean_df.dropna(axis="index", how="any")

## Cleaning the var types

In [29]:
# Cast all four text columns to pandas' dedicated string dtype in one call.
text_columns = ["trackname", "playlistname", "artistname", "user_id"]
clean_df = clean_df.astype({column: "string" for column in text_columns})

print(clean_df.dtypes)

trackname       string[python]
playlistname    string[python]
artistname      string[python]
user_id         string[python]
dtype: object


## Combining trackname and artistname

Combining the trackname and artistname into one string to identify individual songs. The playlistname and user_id are combined in the same way to identify individual playlists.

In [30]:
# Build two space-separated composite keys:
#   track_and_artist     -> identifies a song (title + artist)
#   playlist_and_user_id -> identifies a playlist (playlist name + owning user)
song_key = clean_df["trackname"] + " " + clean_df["artistname"]
playlist_key = clean_df["playlistname"] + " " + clean_df["user_id"]
clean_df = clean_df.assign(
    track_and_artist=song_key,
    playlist_and_user_id=playlist_key,
)

## Filtering wrong playlists

The extremely long playlists most likely exist because different playlists share the same name, such as "love" or "rock". 
Still to investigate: whether every user ID is connected to exactly one playlist, and what the data looks like when multiple users contribute to one playlist.

In [31]:
# Number of rows (i.e. tracks) recorded under each playlist/user key.
playlist_sizes = clean_df.groupby("playlist_and_user_id").size()

# Size cut-offs: the 20th and 98th percentiles of the playlist-size distribution.
lower_n_percent_threshold = playlist_sizes.quantile(0.20)
upper_n_percent_threshold = playlist_sizes.quantile(0.98)

# Playlists whose size lies within the cut-offs (both bounds inclusive).
size_in_range = playlist_sizes.between(
    lower_n_percent_threshold, upper_n_percent_threshold
)
valid_playlists = playlist_sizes.loc[size_in_range].index

# Retain only the rows that belong to one of the valid playlists.
belongs_to_valid_playlist = clean_df["playlist_and_user_id"].isin(valid_playlists)
filtered_df = clean_df.loc[belongs_to_valid_playlist]

clean_df = filtered_df

In [32]:
# Sanity check on the filter: count the rows per playlist again on the filtered
# frame and report the length of the shortest playlist that survived.
final_playlist_sizes = clean_df.groupby("playlist_and_user_id").size()
smallest_playlist_length = final_playlist_sizes.min()

print("Smallest playlist length:", smallest_playlist_length)

Smallest playlist length: 10


## Saving the cleaned Data

In [33]:
# Persist the filtered frame. The row index is written as well (pandas' default),
# so the file starts with an extra, unnamed index column.
cleaned_csv_path = "../data/csv/playlists_dataset/playlist_data_v3.csv"
clean_df.to_csv(cleaned_csv_path, index=True)

In [34]:
# Read the saved CSV back from disk.
# NOTE(review): `df` is not referenced by the following cells in this section,
# which keep working on `clean_df` — confirm whether this read-back is needed.
saved_csv_path = "../data/csv/playlists_dataset/playlist_data_v3.csv"
df = pd.read_csv(saved_csv_path)

## Tokenize Data

In [35]:
# Gather each playlist's songs into one Python list, keyed by playlist/user.
# Despite the `_np` suffix, the result is a pandas Series of lists.
songs_grouped = clean_df.groupby("playlist_and_user_id")["track_and_artist"]
tokenized_playlist_np = songs_grouped.apply(list)

In [36]:
# Plain list of playlists (each playlist itself a list of song keys).
tokenized_playlist = list(tokenized_playlist_np)

## Extrapolate data

In [37]:
# Augment the dataset: for every playlist, add `n` randomly shuffled copies.
n = 3

# Seeded generator so the augmented dataset is reproducible across kernel restarts.
shuffle_rng = np.random.default_rng(42)

temp_arr = tokenized_playlist_np

# Start from a fresh list built from the grouped Series instead of aliasing
# `tokenized_playlist`. Aliasing made the appends below mutate that list in
# place, so every re-run of this cell grew the data by another n copies.
extrapolated_data = tokenized_playlist_np.tolist()

# Originals come first, followed by n shuffled copies of each playlist in order.
for subarray in temp_arr:
    for _ in range(n):
        extrapolated_data.append(shuffle_rng.permutation(subarray).tolist())

# Keep `tokenized_playlist` pointing at the augmented list, as it did before via
# aliasing, so existing downstream uses of that name see the same data.
tokenized_playlist = extrapolated_data

In [38]:
# Total number of sequences: the original playlists plus their shuffled copies.
len(extrapolated_data)

735012

## Training and test split

In [39]:
# The augmented list is the data that gets split into train and test.
tokenized_data = extrapolated_data

# Fraction of the data that goes to the training set.
split_fac = 0.9

# Split position: the chosen fraction of the last valid index, truncated to an int.
max_idx = len(tokenized_data) - 1
anchor = int(split_fac * max_idx)

In [40]:
# Slice the same list that `anchor` was computed from. Slicing
# `tokenized_playlist` instead was only correct while it happened to be the
# very same list object as `tokenized_data`.
# Everything before `anchor` is training data, the remainder is test data.
# NOTE(review): the shuffled copies are appended after all original playlists,
# so with this ordered split the test set consists of reshuffled versions of
# playlists that are also in the training set — confirm this is intended.
training_set = tokenized_data[:anchor]
test_set = tokenized_data[anchor:]

In [41]:
# Serialize both splits with pickle, one file per split.
split_outputs = (
    ("../data/tokenized_data/playlist_names/dataset_train_v3.pkl", training_set),
    ("../data/tokenized_data/playlist_names/dataset_test_v3.pkl", test_set),
)

for pickle_path, split_data in split_outputs:
    with open(pickle_path, "wb") as f:
        pickle.dump(split_data, f)