In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('../data/csv/playlists_dataset/playlist_data.csv')

In [4]:
print(df['trackname'].unique().shape)
# 2032044 unique songs

(2032044,)


In [5]:
num_of_entrys = df['trackname'].shape[0]
print(num_of_entrys)
# 12891680 entrys of songs

12891680


In [6]:
num_of_playlists = df['playlistname'].unique().shape[0]
print(num_of_playlists)
# 157505 playlists

157505


In [7]:
avg_playlist_len = num_of_entrys / num_of_playlists
print(avg_playlist_len)

81.84933811625028


In [8]:
tokenized_playlist_np = df.groupby("playlistname")["trackname"].apply(list)
tokenized_playlist = tokenized_playlist_np.tolist()

In [9]:
import numpy as np 

playlist_lengths = np.array([len(playlist) for playlist in tokenized_playlist])

total_playlists = len(tokenized_playlist)

total_songs = sum(playlist_lengths)

average_songs_per_playlist = np.mean(playlist_lengths)

variance_songs_per_playlist = np.var(playlist_lengths)
std_dev_songs_per_playlist = np.std(playlist_lengths)

shortest_playlist = min(playlist_lengths)
longest_playlist = max(playlist_lengths)

metrics = {
    "Total Playlists": total_playlists,
    "Total Songs": total_songs,
    "Average Songs per Playlist": average_songs_per_playlist,
    "Variance of Songs per Playlist": variance_songs_per_playlist,
    "Standard Deviation of Songs per Playlist": std_dev_songs_per_playlist,
    "Shortest Playlist Length": shortest_playlist,
    "Longest Playlist Length": longest_playlist
}

for key, value in metrics.items():
    print(f"{key}: {value}")

Total Playlists: 157504
Total Songs: 12890434
Average Songs per Playlist: 81.84194687119057
Variance of Songs per Playlist: 11685923.924633183
Standard Deviation of Songs per Playlist: 3418.468066931909
Shortest Playlist Length: 1
Longest Playlist Length: 1337085


In [10]:
# Remove extreme outliers (e.g., playlists with lengths beyond a reasonable range)
# Using Interquartile Range (IQR) method for outlier detection
Q1 = np.percentile(playlist_lengths, 25)  # First quartile (25th percentile)
Q3 = np.percentile(playlist_lengths, 75)  # Third quartile (75th percentile)
IQR = Q3 - Q1  # Interquartile range

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

filtered_playlist_lengths = playlist_lengths[(playlist_lengths >= lower_bound) & (playlist_lengths <= upper_bound)]

total_playlists = len(filtered_playlist_lengths)
total_songs = sum(filtered_playlist_lengths)
average_songs_per_playlist = np.mean(filtered_playlist_lengths)
variance_songs_per_playlist = np.var(filtered_playlist_lengths)
std_dev_songs_per_playlist = np.std(filtered_playlist_lengths)
shortest_playlist = min(filtered_playlist_lengths)
longest_playlist = max(filtered_playlist_lengths)

metrics = {
    "Total Playlists (after filtering)": total_playlists,
    "Total Songs (after filtering)": total_songs,
    "Average Songs per Playlist (after filtering)": average_songs_per_playlist,
    "Variance of Songs per Playlist (after filtering)": variance_songs_per_playlist,
    "Standard Deviation of Songs per Playlist (after filtering)": std_dev_songs_per_playlist,
    "Shortest Playlist Length (after filtering)": shortest_playlist,
    "Longest Playlist Length (after filtering)": longest_playlist
}

for key, value in metrics.items():
    print(f"{key}: {value}")


Total Playlists (after filtering): 138847
Total Songs (after filtering): 3519750
Average Songs per Playlist (after filtering): 25.349845513406844
Variance of Songs per Playlist (after filtering): 512.453400175634
Standard Deviation of Songs per Playlist (after filtering): 22.637433604002773
Shortest Playlist Length (after filtering): 1
Longest Playlist Length (after filtering): 103


In [11]:
# Search string
search_string = "die young"

# Case-insensitive search in a column
result = df[df['trackname'].str.contains(search_string, case=False, na=False)]

print(result)

          Unnamed: 0                     trackname  \
1509            1509          Too Old To Die Young   
2107            2107                If I Die Young   
2663            2663  One Of Us Is Gonna Die Young   
4884            4884                     Die Young   
8162            8162  One Of Us Is Gonna Die Young   
...              ...                           ...   
12875906    12875906  Live Fast Die Young - B-Side   
12879435    12879435              Rather Die Young   
12890321    12890321                  We Die Young   
12890681    12890681                     Die Young   
12890984    12890984                     Die Young   

                               playlistname                    artistname  
1509                          Work playlist  Brother Dege (AKA Dege Legg)  
2107                                Starred                The Band Perry  
2663                tove's spamlista (okok)                       The Ark  
4884                     Everything at once    

In [17]:
playlist_lengths.sort()
print(playlist_lengths[-100:])

[   1188    1189    1190    1190    1190    1191    1192    1192    1193
    1195    1198    1199    1199    1201    1201    1202    1203    1203
    1207    1207    1207    1209    1210    1212    1212    1213    1214
    1215    1215    1217    1217    1218    1218    1219    1220    1220
    1220    1222    1224    1226    1226    1226    1227    1231    1232
    1232    1234    1234    1235    1236    1237    1237    1238    1239
    1239    1239    1239    1239    1240    1241    1245    1245    1247
    1247    1248    1249    1249    1249    1250    1250    1251    1253
    1254    1254    1254    1256    1257    1257    1257    1257    1257
    1259    1260    1260    1262    1264    1265    1267    1269    1269
    1274    1276    1277    1277    1278    1279    1279    1279    1280
    1280    1280    1281    1281    1282    1284    1284    1284    1284
    1284    1285    1285    1287    1287    1287    1288    1289    1290
    1290    1292    1293    1294    1297    1297   