In [22]:
import pandas as pd
import numpy as np

In [None]:
# Load the playlist dataset from CSV into a DataFrame; the cells below read from `df`.
playlist_csv_path = "../data/csv/playlists_dataset/playlist_data_v3.csv"
df = pd.read_csv(playlist_csv_path)

In [None]:
# Count the distinct values in the "trackname" column.
# Series.unique() keeps NaN as a value, so a missing track name (if any) is counted once.
distinct_tracknames = df["trackname"].unique()
print(distinct_tracknames.shape)

(2004520,)


In [None]:
# Total number of rows in the dataset, measured as the length of the "trackname" column.
num_of_entrys = len(df["trackname"])
print(num_of_entrys)

12856831


In [26]:
# Number of distinct playlists, keyed by the "playlist_and_user_id" column.
distinct_playlist_keys = df["playlist_and_user_id"].unique()
num_of_playlists = len(distinct_playlist_keys)
print(num_of_playlists)
# Reference counts noted by the author (not recomputed in this cell):
#   157505 playlists without combining playlistname and the user_id
#   220023 playlists with user_id

231560


In [27]:
# Average number of dataset rows per playlist: the total row count divided by
# the number of distinct "playlist_and_user_id" values.
avg_playlist_len = num_of_entrys / num_of_playlists
print(avg_playlist_len)

55.52267662808775


In [28]:
# Collect the "track_and_artist" values of every playlist into one list per playlist key.
#   tokenized_playlist_np : pandas Series indexed by "playlist_and_user_id", one list per entry
#   tokenized_playlist    : the same lists as a plain Python list of lists
tracks_by_playlist = df.groupby("playlist_and_user_id")["track_and_artist"]
tokenized_playlist_np = tracks_by_playlist.apply(list)
tokenized_playlist = tokenized_playlist_np.tolist()

In [20]:
# Summary statistics for the number of entries in each playlist (no filtering).
# The reductions use NumPy's vectorized array methods instead of the Python
# builtins sum/min/max, which would walk the array one element at a time.
playlist_lengths = np.array([len(playlist) for playlist in tokenized_playlist])

total_playlists = len(tokenized_playlist)
total_songs = playlist_lengths.sum()

average_songs_per_playlist = playlist_lengths.mean()

# var()/std() use NumPy's default ddof=0, i.e. the population variance / std-dev.
variance_songs_per_playlist = playlist_lengths.var()
std_dev_songs_per_playlist = playlist_lengths.std()

shortest_playlist = playlist_lengths.min()
longest_playlist = playlist_lengths.max()

# Label -> value pairs, printed below in insertion order.
metrics = {
    "Total Playlists": total_playlists,
    "Total Songs": total_songs,
    "Average Songs per Playlist": average_songs_per_playlist,
    "Variance of Songs per Playlist": variance_songs_per_playlist,
    "Standard Deviation of Songs per Playlist": std_dev_songs_per_playlist,
    "Shortest Playlist Length": shortest_playlist,
    "Longest Playlist Length": longest_playlist,
}

for key, value in metrics.items():
    print(f"{key}: {value}")

Total Playlists: 231560
Total Songs: 12856831
Average Songs per Playlist: 55.52267662808775
Variance of Songs per Playlist: 73546.52178668605
Standard Deviation of Songs per Playlist: 271.1946197598434
Shortest Playlist Length: 1
Longest Playlist Length: 47362


In [34]:
# Remove extreme outliers (very long playlists) with a percentile-based fence,
# then recompute the summary statistics on the remaining playlist lengths.
#
# NOTE: the fence has the same shape as the IQR rule
# (bound = percentile -/+ 1.5 * spread), but it is NOT the textbook IQR filter:
# the anchors are the 0th and 98th percentiles, not the 25th and 75th.
# The names Q1 / Q3 / IQR are kept so that any cell referring to them still works.
LOWER_PERCENTILE = 0  # the 0th percentile is the shortest playlist length
UPPER_PERCENTILE = 98
FENCE_FACTOR = 1.5

Q1 = np.percentile(playlist_lengths, LOWER_PERCENTILE)
Q3 = np.percentile(playlist_lengths, UPPER_PERCENTILE)
IQR = Q3 - Q1  # spread between the two percentiles above (not a true interquartile range)

# Q1 is the minimum length and IQR >= 0, so lower_bound <= minimum: the lower
# fence never removes anything and only the upper fence actually filters.
lower_bound = Q1 - FENCE_FACTOR * IQR
upper_bound = Q3 + FENCE_FACTOR * IQR

within_bounds = (playlist_lengths >= lower_bound) & (playlist_lengths <= upper_bound)
filtered_playlist_lengths = playlist_lengths[within_bounds]

# The names below are rebound to describe the filtered data. Reductions use the
# vectorized ndarray methods rather than the Python builtins sum/min/max.
total_playlists = len(filtered_playlist_lengths)
total_songs = filtered_playlist_lengths.sum()
average_songs_per_playlist = filtered_playlist_lengths.mean()
# var()/std() use NumPy's default ddof=0, i.e. the population variance / std-dev.
variance_songs_per_playlist = filtered_playlist_lengths.var()
std_dev_songs_per_playlist = filtered_playlist_lengths.std()
shortest_playlist = filtered_playlist_lengths.min()
longest_playlist = filtered_playlist_lengths.max()

# Label -> value pairs, printed below in insertion order.
metrics = {
    "Total Playlists (after filtering)": total_playlists,
    "Total Songs (after filtering)": total_songs,
    "Average Songs per Playlist (after filtering)": average_songs_per_playlist,
    "Variance of Songs per Playlist (after filtering)": variance_songs_per_playlist,
    "Standard Deviation of Songs per Playlist (after filtering)": std_dev_songs_per_playlist,
    "Shortest Playlist Length (after filtering)": shortest_playlist,
    "Longest Playlist Length (after filtering)": longest_playlist,
}

for key, value in metrics.items():
    print(f"{key}: {value}")

Total Playlists (after filtering): 230201
Total Songs (after filtering): 9641891
Average Songs per Playlist (after filtering): 41.88466166524038
Variance of Songs per Playlist (after filtering): 6527.9404983117975
Standard Deviation of Songs per Playlist (after filtering): 80.7956712844927
Shortest Playlist Length (after filtering): 1
Longest Playlist Length (after filtering): 926


In [11]:
# Text to look for inside the "trackname" column.
# Series.str.contains treats the pattern as a regular expression by default.
search_string = "die young"

# Boolean mask: True where the track name contains the text, ignoring case.
# na=False makes rows with a missing track name count as non-matches.
trackname_matches = df["trackname"].str.contains(search_string, case=False, na=False)

# Keep only the matching rows and print them.
result = df.loc[trackname_matches]
print(result)

         Unnamed: 0                     trackname  \
1206           2663  One Of Us Is Gonna Die Young   
1610          11625  One Of Us Is Gonna Die Young   
2961          13174  One Of Us Is Gonna Die Young   
4234          14636             The Bad Die Young   
4758          16064                     Die Young   
...             ...                           ...   
6101595    12873308     Better Hope You Die Young   
6103736    12875906  Live Fast Die Young - B-Side   
6106449    12879435              Rather Die Young   
6112702    12890321                  We Die Young   
6113062    12890681                     Die Young   

                              playlistname       artistname  \
1206               tove's spamlista (okok)          The Ark   
1610                          January 2014          The Ark   
2961                       Svenska Hjärtan          The Ark   
4234                   For England, James?    Nicholas Dodd   
4758                             Summering      