# Notebook to download and process media data defined by a base URL

First, download the artist and track metadata using `curl` from a terminal:

```bash
mkdir -p data/artists/{json,media} data/tracks/json

# read base_url from base_url.txt
base_url=$(cat base_url.txt)

pushd data/artists/json
# We know that there are 22 pages of artists by inspecting the total count from the first page
curl -sSfo "page_#1.json" "$base_url/artists?order_key=&page=[1-22]"
popd

pushd data/tracks/json
# We know that there are 58 pages of tracks by inspecting the total count from the first page
curl -sSfo "page_#1.json" "$base_url/tracks?Language=AR&Method=tracks&order_key=&page=[1-58]"
popd
```

The commands above should produce JSON files in both the `data/artists/json` and `data/tracks/json` directories.

## Install dependencies

```bash
pip install -r requirements.txt
```

## Define common functions


In [1]:
import os
import json
import threading
import requests

# List the names of all json files in a directory
def read_json_files(path):
    """Return the names of the entries in `path` that end with ".json".

    Despite the function's name, the files are not opened or parsed: only
    bare file names (not full paths) are returned, in the order produced by
    `os.listdir`.
    """
    return [name for name in os.listdir(path) if name.endswith(".json")]

# Do a GET request to a url and return the response content
def get_request(url, timeout=60):
    """Fetch `url` with HTTP GET and return the raw response body as bytes.

    Args:
        url: The URL to request.
        timeout: Seconds to wait for the server to connect / send data
            before giving up. Without a timeout a stalled server would
            block the calling worker thread forever.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status (via `raise_for_status`).
        requests.exceptions.Timeout: If the server does not respond within
            `timeout` seconds.
    """
    response = requests.get(
        url,
        headers={
            'Accept': '*/*',
            # The request identifies itself as curl, like the metadata
            # download done from the terminal.
            'User-Agent': 'curl/7.54.0'
        },
        timeout=timeout,
    )
    response.raise_for_status()
    return response.content

# Execute a callable on each json file in a separate thread
def launch_threads(json_files, callable):
    """Run `callable(file)` for every entry of `json_files` concurrently.

    One thread is created per entry; the function returns only after every
    thread has finished.
    """
    workers = [
        threading.Thread(target=callable, args=(name,))
        for name in json_files
    ]
    for worker in workers:
        worker.start()
    # Block until all workers are done
    for worker in workers:
        worker.join()

# Get the artist id from a track object
def get_artist_id_from_track_object(obj):
    """Return the id of the first artist listed on a track, or None.

    A warning is printed and None is returned when the track has no usable
    artist: the "artists" entry is missing, null or empty, or the first
    artist has no id (missing or null).

    Args:
        obj: A track dictionary; must contain "id" and may contain
            "artists" (a list of dictionaries with an "id" key).
    """
    track_id = obj["id"]
    # `or []` treats a missing key and an explicit null like an empty list
    track_artists = obj.get("artists") or []
    artist_id = track_artists[0].get("id") if track_artists else None
    if artist_id is None:
        print(f'Warning: track: {track_id} has no artists, processing media file on root directory.')
    return artist_id

# Read base_url from file
def read_base_url(base_url_file):
    """Return the first line of `base_url_file` without surrounding whitespace.

    The trailing newline must be removed: the value is interpolated into
    request URLs, and a newline in the middle of a URL breaks the request.
    """
    with open(base_url_file, 'r') as f:
        return f.readline().strip()
    
# Base URL read once from base_url.txt (relative to the working directory);
# the track download cell interpolates it into the download-link request URL.
base_url = read_base_url("base_url.txt")

## Download artists artwork

In [None]:
def download_images_from_json_file(file):
    """Download the artwork of every artist listed in one artists JSON page.

    For each object in the page, the image at `image.web_url` is saved as
    `data/artists/media/<artist id>/artwork.<ext>`, where `<ext>` is the
    text after the last "." of the image URL.

    Args:
        file: Name of a JSON file inside `data/artists/json`.

    Raises:
        requests.exceptions.HTTPError: If an image request fails; the
            remaining artists of the page are then not processed.
    """
    with open(f'data/artists/json/{file}') as json_file:
        data = json.load(json_file)
    for obj in data["object"]:
        artist_id = obj["id"]
        image_url = obj["image"]["web_url"]
        ext = image_url.split(".")[-1]
        # create directory if it doesn't exist
        artist_dir = f'data/artists/media/{artist_id}'
        os.makedirs(artist_dir, exist_ok=True)
        # Fetch before opening the target file: opening in 'wb' truncates it,
        # so a failed request must not leave an empty artwork file behind.
        image = get_request(image_url)
        # save image to disk
        with open(f'{artist_dir}/artwork.{ext}', 'wb') as f:
            f.write(image)
    print(f'Processed file: {file} containing {len(data["object"])} objects.')

# Download artwork for all artist pages, one thread per JSON page;
# launch_threads returns once every page has been processed.
json_files = read_json_files("data/artists/json")
launch_threads(json_files, download_images_from_json_file)

## Download tracks media

In [None]:
def download_media_from_json_file(file):
    """Download the mp3 of every track listed in one tracks JSON page.

    Each track is saved as `data/artists/media/<artist id>/<track id>.mp3`,
    or directly under `data/artists/media/` when the track has no artist.
    Tracks whose file already exists with the expected size are skipped, so
    the cell can be re-run to resume an interrupted download.

    Args:
        file: Name of a JSON file inside `data/tracks/json`.
    """
    with open(f'data/tracks/json/{file}') as json_file:
        data = json.load(json_file)
    for obj in data["object"]:
        track_id = obj["id"]
        artist_id = get_artist_id_from_track_object(obj)
        artist_id = "" if artist_id is None else f'{artist_id}/'
        size = obj["size"]
        # check if file already exists
        media_path = f'data/artists/media/{artist_id}{track_id}.mp3'
        if os.path.exists(media_path) and os.path.getsize(media_path) == size:
            continue
        download_url = f'{base_url}/tracks/{track_id}/url?Method=GetTrackDownloadLink&TrackID={track_id}&Type=2'
        # Both requests are guarded: an HTTP error on either one skips this
        # track instead of aborting the rest of the page.
        try:
            # request download url
            download_metadata = json.loads(get_request(download_url))
            media_url = download_metadata["object"]["url"]
            # remove backslashes from url
            media_url = media_url.replace("\\", "")
            # download media file
            response = get_request(media_url)
        except requests.exceptions.HTTPError as e:
            print(f'Warning: track: {track_id} in file {file} raised HTTP error {e}')
            continue
        # The artist directory is otherwise only created by the artwork
        # download; make sure it exists so the write below cannot fail.
        os.makedirs(os.path.dirname(media_path), exist_ok=True)
        # save media file to disk
        with open(media_path, 'wb') as f:
            f.write(response)
        # verify file size
        actual_size = os.path.getsize(media_path)
        if size != actual_size:
            print(f'Warning: media file: {media_path} in json file {file} has actual size: {actual_size} but expected size: {size}.')

    print(f'Processed file: {file} containing {len(data["object"])} objects.')
    
# Download media for all track pages, one thread per JSON page;
# launch_threads returns once every page has been processed.
json_files = read_json_files("data/tracks/json")
launch_threads(json_files, download_media_from_json_file)

To monitor the progress of the download, run the following command in a terminal:

```bash
find data/artists/media -type f -name "*.mp3" -size +0 -exec ls -l {} \; | wc -l
```

This will print the number of non-empty files in the `data/artists/media` directory matching the `*.mp3` pattern.
The total number of tracks can be found in any of the `data/tracks/json/page_*.json` files under key `meta_data.total_count`.

## Set ID3 tags for tracks

In [5]:
import music_tag

# Load artists data in memory including artwork indexed by id.
# `artists` maps an artist id to its JSON object, extended with an "artwork"
# key holding the raw bytes of the image saved by the artwork download cell.
artists = {}
for file in read_json_files("data/artists/json"):
    with open(f'data/artists/json/{file}') as json_file:
        data = json.load(json_file)
    for obj in data["object"]:
        # Not named `id`: a top-level assignment would rebind the builtin
        # `id` for the rest of the notebook session.
        artist_key = obj["id"]
        artists[artist_key] = obj
        # Same extension rule as used when the artwork was saved
        ext = obj["image"]["web_url"].split(".")[-1]
        with open(f'data/artists/media/{artist_key}/artwork.{ext}', 'rb') as img:
            artists[artist_key]["artwork"] = img.read()
print(f'Loaded {len(artists)} artists.')
# Fallback used for tracks without an artist or with an unknown artist.
default_artist = {
    "title": "قارئ"
}
# NOTE(review): no visible cell creates this default artwork file; it
# presumably has to be provided manually -- confirm.
with open('data/artists/media/artwork.png', 'rb') as img:
    default_artist["artwork"] = img.read()

def set_id3_tags_from_json_file(file):
    """Write title, artist, album and artwork tags for one page of tracks.

    For every track in `data/tracks/json/<file>` the matching mp3 is looked
    up under `data/artists/media/` (inside the artist's directory when the
    artist is known, otherwise at the root) and tagged with the artist's
    data, or with `default_artist` when the artist is unknown. Tracks whose
    media file is missing are reported and skipped.
    """
    with open(f'data/tracks/json/{file}') as json_file:
        page = json.load(json_file)
    for track in page["object"]:
        track_id = track["id"]
        artist_id = get_artist_id_from_track_object(track)
        known_artist = artist_id is not None and artist_id in artists
        artist = artists[artist_id] if known_artist else default_artist
        subdir = f'{artist_id}/' if known_artist else ""

        media_path = f'data/artists/media/{subdir}{track_id}.mp3'
        if not os.path.exists(media_path):
            print(f'Warning: media file: {media_path} not found.')
            continue

        # set id3 tags
        tags = music_tag.load_file(media_path)
        # Preserving the original title in the comment field is disabled:
        # tags["comment"] = tags["title"]
        tags["tracktitle"] = track["title"]
        # The artist title doubles as album artist and album name
        artist_title = artist["title"]
        for field in ("artist", "albumartist", "album"):
            tags[field] = artist_title
        try:
            tags["artwork"] = artist["artwork"]
        except Exception as e:
            # Best effort: fall back to the default artwork
            print(f'Warning: could not set artwork for track: {track_id} in file {file}. Error: {e}. Setting default artwork.')
            tags["artwork"] = default_artist["artwork"]
        tags["genre"] = ""
        # save id3 tags
        tags.save()

# Process tracks in parallel: one thread per tracks JSON page, each tagging
# the media files of the tracks listed in that page.
json_files = read_json_files("data/tracks/json")
launch_threads(json_files, set_id3_tags_from_json_file)