In [2]:
import os
import json

### Filter data

In [45]:
input_dir = "../data"
output_dir = "../data2"

# Minimum follower count a playlist needs in order to be kept.
# Sizes noted from earlier runs:
#   k = 9: 9k playlists, ~110 MB
#   k = 8: 11k playlists, ~136 MB
k = 9

# Create the output folder if it doesn't exist
output_folder = output_dir
os.makedirs(output_folder, exist_ok=True)

for filename in os.listdir(input_dir):
    # Only JSON files are processed; anything else in the folder is skipped
    if not filename.endswith(".json"):
        continue

    # Load the JSON file
    with open(os.path.join(input_dir, filename)) as f:
        data = json.load(f)

    # Keep playlists with at least k followers.
    # The threshold must come from k: a hard-coded literal here would make
    # editing k above silently have no effect.
    playlists = data["playlists"]
    filtered_playlists = [playlist for playlist in playlists if playlist["num_followers"] >= k]
    data["playlists"] = filtered_playlists

    # Save the filtered JSON under the same file name in the output directory
    with open(os.path.join(output_dir, filename), "w") as f:
        json.dump(data, f)

# ~3m runtime

### Clean data

In [8]:
# Input and output point at the same folder, so each file is rewritten in place
input_dir = "../data2"
output_dir = "../data2"

# Keys stripped from every playlist object and from every track object
playlist_fields_to_remove = ["num_tracks", "collaborative", "modified_at", "num_albums", "num_edits", "duration_ms", "num_artists"]
track_fields_to_remove = ["pos", "album_name", "duration_ms", "artist_uri", "album_uri"]

# Make sure the destination folder is there before anything is written
output_folder = output_dir
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

for filename in os.listdir(input_dir):
    # Skip anything that is not a JSON file
    if not filename.endswith(".json"):
        continue

    with open(os.path.join(input_dir, filename)) as f:
        data = json.load(f)

    for playlist in data["playlists"]:
        # Drop the unwanted playlist-level keys (absent keys are ignored)
        for field in playlist_fields_to_remove:
            playlist.pop(field, None)
        # Drop the unwanted keys from each of the playlist's tracks
        for track in playlist["tracks"]:
            for field in track_fields_to_remove:
                track.pop(field, None)

    # Write the trimmed JSON back under the same file name
    with open(os.path.join(output_dir, filename), "w") as f:
        json.dump(data, f)

### Relabel PID

In [5]:
# Directory containing the JSON files whose playlists get new ids
input_dir = "../data2"
# Directory where the relabelled files are saved (same folder: files are rewritten in place)
output_dir = "../data2"

# Create the output folder if it doesn't exist
output_folder = output_dir
os.makedirs(output_folder, exist_ok=True)

# Running counter: the next pid to hand out. After the loop it equals the
# total number of playlists relabelled.
pid = 0
# os.listdir() returns names in arbitrary order, so iterate in sorted order
# to make the pid given to each playlist reproducible from run to run.
for filename in sorted(os.listdir(input_dir)):
    # Make sure we're only working with JSON files
    if filename.endswith(".json"):
        # Load the JSON file
        with open(os.path.join(input_dir, filename)) as f:
            data = json.load(f)
        # Overwrite each playlist's "pid" with consecutive integers that
        # continue across files
        playlists = data["playlists"]
        for playlist in playlists:
            playlist["pid"] = pid
            pid += 1
        # Save the edited JSON file to the output directory
        with open(os.path.join(output_dir, filename), "w") as f:
            json.dump(data, f)

In [6]:
# pid starts at 0 and is incremented once per playlist in the relabel cell,
# so its final value is the total number of playlists that were relabelled.
print(pid)

11173


In [3]:
# Count the regular files (sub-directories excluded) directly inside folder_path.
# NOTE(review): "../data3" is only created by the "Combine files" cell further
# down, so this cell fails on a fresh top-to-bottom run unless that folder
# already exists. The recorded count also looks like it belongs to a different
# folder - confirm the intended path.
folder_path = "../data3/"
file_count = sum(
    1
    for entry_name in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, entry_name))
)
print("Number of files in folder:", file_count)

Number of files in folder: 1000


### Combine files

In [4]:
# Path to the input folder with JSON files
input_folder = '../data2'

# Path to the output folder to save the new combined JSON file
output_folder = '../data3'

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Single container that accumulates the playlists of every input file
all_playlists = {"playlists": []}

# os.listdir() returns names in arbitrary order; sorting makes the order of
# playlists in the combined file deterministic across runs.
for filename in sorted(os.listdir(input_folder)):
    if filename.endswith('.json'):
        # Load the contents of the file into a dictionary
        with open(os.path.join(input_folder, filename), 'r') as f:
            data = json.load(f)
        # Append all playlists in the file to the combined list
        all_playlists["playlists"].extend(data["playlists"])

# Save the new combined JSON file to the output folder
output_file = os.path.join(output_folder, 'all_playlists.json')
with open(output_file, 'w') as f:
    json.dump(all_playlists, f)
