# Data Preprocessing

## Dataloading

In [25]:
import numpy as np
import pandas as pd
import pickle

In [26]:
# Read the raw playlist export. Rows the CSV parser cannot handle are skipped
# (on_bad_lines="skip") instead of aborting the load.
base_df = pd.read_csv(
    filepath_or_buffer="../data/csv/playlists_dataset/spotify_dataset.csv",
    on_bad_lines="skip",
)

In [27]:
# Keep only the four columns the rest of the preprocessing works with.
clean_df = base_df.loc[:, ["trackname", "playlistname", "artistname", "user_id"]]

## Dropping rows with missing values

In [28]:
# Discard every row in which at least one of the selected columns is missing.
clean_df = clean_df.dropna(axis="index", how="any")

## Cleaning the variable types

In [29]:
# Cast each text column to pandas' dedicated "string" dtype, one column at a time.
for text_column in ("trackname", "playlistname", "artistname", "user_id"):
    clean_df[text_column] = clean_df[text_column].astype("string")

# Show the resulting dtypes as a quick sanity check.
print(clean_df.dtypes)

trackname       string[python]
playlistname    string[python]
artistname      string[python]
user_id         string[python]
dtype: object


## Combining trackname and artistname

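Combine trackname and artistname into a single string that identifies an individual song; likewise, combine playlistname and user_id into a single string that identifies an individual playlist.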

In [30]:
# Build the composite keys by joining both parts with a single space.
clean_df["track_and_artist"] = clean_df["trackname"].str.cat(
    clean_df["artistname"], sep=" "
)
clean_df["playlist_and_user_id"] = clean_df["playlistname"].str.cat(
    clean_df["user_id"], sep=" "
)

## Filtering wrong playlists

The extremely long playlists most likely exist because different playlists share the same name, such as "love" or "rock". 
TODO: investigate whether each user ID is tied to exactly one playlist, and how playlists edited by multiple users appear in the data.

In [31]:
# Number of rows (tracks) recorded for every playlist/user key.
playlist_sizes = clean_df.groupby("playlist_and_user_id").size()

# Size cut-offs: the 20th and 98th percentiles of the playlist-size distribution.
lower_n_percent_threshold, upper_n_percent_threshold = playlist_sizes.quantile(
    [0.20, 0.98]
)

# Keys of the playlists whose size lies inside the inclusive [lower, upper] band.
valid_playlists = playlist_sizes.loc[
    playlist_sizes.between(lower_n_percent_threshold, upper_n_percent_threshold)
].index

# Restrict the rows to those playlists and carry on with the filtered frame.
filtered_df = clean_df.loc[clean_df["playlist_and_user_id"].isin(valid_playlists)]
clean_df = filtered_df

In [32]:
# Recompute the group sizes in the filtered DataFrame
final_playlist_sizes = clean_df.groupby("playlist_and_user_id").size()

# Find the smallest playlist length
smallest_playlist_length = final_playlist_sizes.min()

print("Smallest playlist length:", smallest_playlist_length)

Smallest playlist length: 10


## Saving the cleaned Data

In [33]:
# Persist the filtered frame as CSV. index=True (the default) writes the row
# index as the first, unnamed column.
clean_df.to_csv(
    path_or_buf="../data/csv/playlists_dataset/playlist_data_v3.csv",
    index=True,
)

In [34]:
# Read the file that was just written back into `df`.
df = pd.read_csv(
    filepath_or_buffer="../data/csv/playlists_dataset/playlist_data_v3.csv"
)

## Tokenize Data

In [35]:
# One entry per playlist/user key: the list of its "track artist" strings.
# Despite the "_np" suffix this is a pandas Series (a groupby result).
tokenized_playlist_np = (
    clean_df.groupby("playlist_and_user_id")["track_and_artist"]
    .apply(lambda tracks: tracks.tolist())
)

In [36]:
# Plain Python list of playlists; each playlist is itself a list of track strings.
tokenized_playlist = list(tokenized_playlist_np)

## Extrapolate data

In [37]:
# Number of shuffled copies to generate for every playlist.
n = 3

# Fixed seed so the augmented data set is identical on every run.
rng = np.random.default_rng(42)

temp_arr = tokenized_playlist_np

# Start from a fresh list of the original playlists. Previously
# `extrapolated_data` was bound to the very same list object as
# `tokenized_playlist`, so every re-run of this cell kept appending to an
# already augmented list.
extrapolated_data = list(temp_arr)

# Append n independent shuffles of each original playlist after the originals.
for subarray in temp_arr:
    for _ in range(n):
        extrapolated_data.append(rng.permutation(subarray).tolist())

# `tokenized_playlist` used to end up referring to the augmented list as well;
# keep that binding so any code that still reads it sees the same data.
tokenized_playlist = extrapolated_data

In [38]:
# Total number of playlists after augmentation (originals plus shuffled copies).
len(extrapolated_data)

735012

## Training and test split

In [39]:
tokenized_data = extrapolated_data

# Fraction of the data that goes into the training split.
split_fac = 0.9

# Index of the last element (kept for reference).
max_idx = len(tokenized_data) - 1

# First index of the test split. It is derived from the number of items rather
# than from the last index: int(max_idx * split_fac) could cut one element
# early (for 10 items it yielded 8 training items instead of 9).
anchor = int(len(tokenized_data) * split_fac)

In [40]:
# Slice the same list the anchor was computed from. Slicing `tokenized_playlist`
# only gave the right result while that name referred to the very same list
# object as `tokenized_data`.
#
# NOTE(review): the shuffled copies are appended after all original playlists,
# so this tail slice holds reshuffled versions of playlists that are also part
# of the training slice. Consider splitting before augmenting.
training_set = tokenized_data[:anchor]
test_set = tokenized_data[anchor:]

In [41]:
# Serialise both splits with pickle, one file per split.
for pickle_path, split in (
    ("../data/tokenized_data/playlist_names/dataset_train_v3.pkl", training_set),
    ("../data/tokenized_data/playlist_names/dataset_test_v3.pkl", test_set),
):
    with open(pickle_path, "wb") as f:
        pickle.dump(split, f)

# Data Preprocessing - V2 
This is another part of the notebook, which works with the freshly pulled Spotify dataset.

In [4]:
import pandas as pd
import numpy as np
import pickle
import io
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Load the freshly pulled Spotify playlist data.
df = pd.read_csv(
    filepath_or_buffer="../data/csv/playlists_dataset/playlist_data_spotify_fresh.csv"
)

In [None]:
# Exploratory overview of the freshly pulled playlist data. This cell only
# prints summaries; it does not modify `df`.

print("--- Basic DataFrame Info ---")
df.info()
print("\n")  # real newline; "\\n" printed a literal backslash-n

print("--- Descriptive Statistics for Numerical Columns ---")
# Default describe() output, so this header is followed by an actual summary.
print(df.describe())
print("\n")

print("--- Descriptive Statistics for Object/Categorical Columns ---")
print(df.describe(include=["object"]))
print("\n")

# --- Value Counts for Key Columns ---
print("--- Value Counts for Key Columns ---")
print("Number of unique playlists:")
print(df["playlist_id"].nunique())

print("Number of unique tracks (by ID):")
print(df["track_id"].nunique())
print("\n")

print("Number of unique track names:")
print(df["track_name"].nunique())
print("\n")

# --- Artist Analysis: a row may list several comma-separated artists ---
print("--- Artist Analysis ---")
# Work on a separate Series and drop missing names up front. `df` keeps its
# original column, and no real artist is discarded for matching the text "nan".
artist_names = df["artist_name"].dropna().astype(str)
# Split on a comma followed by optional whitespace, one artist per row.
all_artists = artist_names.str.split(r",\s*").explode().str.strip()

print("Number of unique individual artists:")
print(all_artists.nunique())
print("\nTop 10 most frequent individual artists:")
print(all_artists.value_counts().nlargest(10))
print("\n")

# --- Playlist Analysis: Tracks per Playlist ---
print("--- Tracks per Playlist Analysis ---")
tracks_per_playlist = (
    df.groupby("playlist_id")["track_id"].count().sort_values(ascending=False)
)
print(tracks_per_playlist)
print("\n")

# --- Most Frequent Track Names ---
print("--- Most Frequent Track Names ---")
# Track names are not necessarily unique (e.g. remixes, covers, same name by a
# different artist), so these counts are per name, not per track ID.
print(df["track_name"].value_counts().nlargest(10))
print("\n")

# --- Missing Values ---
print("--- Missing Values per Column ---")
print(df.isnull().sum())
print("\n")

print(
    "Basic analysis complete. You can expand on this with more specific questions about your data!"
)

In [6]:
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,playlist_id,track_id,track_name,track_external_urls,release_date,artist_name
0,5owFYKDZnehRxuuOLk36iS,1MDyUzZgyrdeQVmV1FU3WQ,Als du gingst - edit,https://open.spotify.com/track/1MDyUzZgyrdeQVm...,,"Contec, Lina Maly"
1,5owFYKDZnehRxuuOLk36iS,3t854jxXLppSGbOEYGQ3mI,Shake That,https://open.spotify.com/track/3t854jxXLppSGbO...,,"Sonny Wern, Danimal, okafuwa"
2,5owFYKDZnehRxuuOLk36iS,5yujUAF2VoPWKfYMKoBqSK,Wie? - Techno,https://open.spotify.com/track/5yujUAF2VoPWKfY...,,"FUTURAMI, XEKNO!"
3,5owFYKDZnehRxuuOLk36iS,4zRKaWqz94saRIpW5hPcHC,STARGAZING - TECHNO,https://open.spotify.com/track/4zRKaWqz94saRIp...,,"XEKNO!, TEKTOSHI, VXLTAGE"
4,5owFYKDZnehRxuuOLk36iS,0bb1R14dsWjFO01iJ6GqF3,SHE DOESN'T MIND - TECHNO,https://open.spotify.com/track/0bb1R14dsWjFO01...,,"PSYKADELIK, TEKTOSHI, Phantom X"


In [8]:
# Row count before de-duplication.
print(df.shape[0])

# Keep only the first occurrence of every (playlist_id, track_id) pair.
df_cleaned = df.loc[
    ~df.duplicated(subset=["playlist_id", "track_id"], keep="first")
]

# Row count after de-duplication.
print(df_cleaned.shape[0])

5362389
3492451


In [9]:
# One list of track IDs per playlist, collected into a plain Python list.
tokenized_data = list(
    df_cleaned.groupby("playlist_id")["track_id"].apply(lambda ids: ids.tolist())
)

In [10]:
# Number of playlists (one token list per playlist_id).
len(tokenized_data)

46515

In [13]:
# Number of playlists available for splitting.
data_length = len(tokenized_data)

# Boundaries of the 90 % / 5 % / 5 % train / validation / test split.
split_90_percent, split_95_percent = (
    int(data_length * fraction) for fraction in (0.90, 0.95)
)

In [14]:
# Contiguous slices: first 90 % train, next 5 % validation, final 5 % test.
training_set, validation_set, test_set = (
    tokenized_data[:split_90_percent],
    tokenized_data[split_90_percent:split_95_percent],
    tokenized_data[split_95_percent:],
)

In [15]:
# Serialise the three splits with pickle, one file per split.
for pickle_path, split in (
    ("../data/tokenized_data/playlist_names/fresh_dataset_train_v1.pkl", training_set),
    ("../data/tokenized_data/playlist_names/fresh_dataset_val_v1.pkl", validation_set),
    ("../data/tokenized_data/playlist_names/fresh_dataset_test_v1.pkl", test_set),
):
    with open(pickle_path, "wb") as f:
        pickle.dump(split, f)