# Data Preprocessing

## Dataloading

In [6]:
import numpy as np
import pandas as pd
import pickle

In [7]:
# Load the raw playlist export; rows pandas cannot parse are skipped
# instead of aborting the read.
raw_csv_path = "../data/csv/playlists_dataset/spotify_dataset.csv"
base_df = pd.read_csv(raw_csv_path, on_bad_lines="skip")

In [8]:
# Keep only the three columns the rest of the notebook works with.
selected_columns = ["trackname", "playlistname", "artistname"]
clean_df = base_df[selected_columns]

## Dropping rows with missing values

In [9]:
# Discard every row that has a missing value in any column.
clean_df = clean_df.dropna(axis="index", how="any")

## Cleaning the var types

In [10]:
# Cast the three text columns to pandas' "string" dtype in a single call.
# `astype` with a column -> dtype mapping returns a new frame, which avoids
# assigning columns one by one on a frame that was derived by slicing and
# filtering another frame.
text_columns = ["trackname", "playlistname", "artistname"]
clean_df = clean_df.astype({column: "string" for column in text_columns})

print(clean_df.dtypes)

trackname       string[python]
playlistname    string[python]
artistname      string[python]
dtype: object


## Combining trackname and artistname

Combine the track name and artist name into one string to identify individual songs.

In [11]:
# Song identifier: "<trackname> <artistname>", joined with a single space.
clean_df["track_and_artist"] = clean_df["trackname"].str.cat(
    clean_df["artistname"], sep=" "
)

## Filtering wrong playlists

The extremely long playlists most likely exist because different playlists share the same name (e.g. "love" or "rock") and are therefore merged when grouping by `playlistname`. 
TODO: investigate whether each user id is connected to exactly one playlist, and how it works out when multiple users work on one playlist. 

In [None]:
# Number of rows (tracks) recorded under each playlist name.
playlist_sizes = clean_df.groupby("playlistname").size()

# Playlist size at the 95th percentile; anything larger is treated as suspect.
upper_5_percent_threshold = playlist_sizes.quantile(q=0.95)

# Names of the playlists whose size does not exceed that threshold.
valid_playlists = playlist_sizes.loc[
    playlist_sizes.le(upper_5_percent_threshold)
].index

# Keep only the rows that belong to one of the retained playlists.
belongs_to_valid_playlist = clean_df["playlistname"].isin(valid_playlists)
filtered_df = clean_df.loc[belongs_to_valid_playlist]

clean_df = filtered_df

## Saving the cleaned Data

In [13]:
# Persist the cleaned frame. The DataFrame index is written as well
# (pandas' default), spelled out here so the file layout is explicit.
clean_df.to_csv("../data/csv/playlists_dataset/playlist_data.csv", index=True)

In [14]:
# Read the saved CSV back in.
# NOTE(review): `df` is not referenced by the later cells shown in this
# notebook, which keep using `clean_df` -- confirm whether this reload is needed.
cleaned_csv_path = "../data/csv/playlists_dataset/playlist_data.csv"
df = pd.read_csv(cleaned_csv_path)

## Tokenize Data

In [15]:
# Collect the song identifiers of every playlist name into one Python list.
# Despite the `_np` suffix the result is a pandas Series indexed by playlistname.
tracks_by_playlist = clean_df.groupby("playlistname")["track_and_artist"]
tokenized_playlist_np = tracks_by_playlist.apply(list)

In [16]:
# Plain list of playlists (each a list of song identifiers), without the index.
tokenized_playlist = tokenized_playlist_np.to_list()

## Extrapolate data

In [17]:
n = 3  # number of shuffled copies generated per playlist

# Seeded generator so the augmentation is reproducible across kernel restarts.
rng = np.random.default_rng(42)

# Start from a fresh list of the original playlists on every run. Appending to
# the existing `tokenized_playlist` list made the cell non-idempotent: each
# re-run stacked another batch of shuffles onto the same list.
extrapolated_data = list(tokenized_playlist_np)

# Append n random permutations of every original playlist, playlist by playlist.
extrapolated_data.extend(
    rng.permutation(subarray).tolist()
    for subarray in tokenized_playlist_np
    for _ in range(n)
)

# The train/test split cell slices `tokenized_playlist`, so keep that name
# bound to the augmented list (previously this happened implicitly because
# both names referred to the same list object).
tokenized_playlist = extrapolated_data

In [18]:
# Total number of playlists after augmentation (originals plus shuffled copies).
len(extrapolated_data)

629280

## training and test split

In [19]:
tokenized_data = extrapolated_data

# Fraction of the playlists that goes into the training set.
split_fac = 0.9

# `anchor` is used as an exclusive slice end, so it equals the number of
# training items. Derive it from the element count rather than from the last
# index, which under-sized the training set (e.g. 8 of 10 items at 0.9).
anchor = int(len(tokenized_data) * split_fac)

In [20]:
# Slice `tokenized_data`, the sequence `anchor` was computed from. Slicing
# `tokenized_playlist` only gave the intended result while that name happened
# to refer to the same list object as the augmented data.
# NOTE(review): the shuffled copies are appended after the originals and the
# data is not shuffled before splitting, so the test slice consists of shuffled
# copies of playlists that also appear in the training slice. Consider
# splitting the original playlists first and augmenting only the training part.
training_set = tokenized_data[:anchor]
test_set = tokenized_data[anchor:]

In [21]:
# Serialize both splits with pickle, one file per split.
pickle_targets = {
    "../data/tokenized_data/playlist_names/dataset_train.pkl": training_set,
    "../data/tokenized_data/playlist_names/dataset_test.pkl": test_set,
}

for target_path, split in pickle_targets.items():
    with open(target_path, "wb") as f:
        pickle.dump(split, f)