#### Step 0: Dependencies

Run this one time when starting, then restart the kernel

In [None]:
# One-time setup: uncomment the line below, run it once, then restart the kernel
# so the freshly installed packages are picked up.
# !pip install pandas pandas-gbq==0.12.0 --user

# Data prep

## In this notebook we will load the songs from the zip file and perform transformations to prepare the data for two-tower training
Steps
1. Extract from the zip file
2. Upload to BQ
3. Enrich features for the playlist songs
4. Cross-join songs with features (expected rows = n_songs x n_playlists)
5. Remove after-the-fact (later position songs) from the newly generated samples
6. Create a clean train table, and flatten structs or use arrays

#### Unzip the file and upload to BQ
[Source of data if you want to download zip](https://www.aicrowd.com/challenges/spotify-million-playlist-dataset-challenge/dataset_files)

In [3]:
import time

# Notebook-wide settings -- change these to match your own environment.
PROJECT_ID = 'hybrid-vertex'   # project used for the BigQuery client and in every table name
REGION = 'us-central1'         # passed as the job location to the upload, load and export calls
bq_dataset = 'mdp_eda_test'    # BigQuery dataset that receives every table built below
# GCS prefix holding the source files. Keep the trailing slash: file names are
# appended to it directly (e.g. f"{SOURCE_BUCKET}unique_track_features.json").
SOURCE_BUCKET = 'gs://spotify-builtin-2t/source_data/'

In [4]:
from google.cloud import bigquery

# Shared BigQuery client bound explicitly to PROJECT_ID; the query and load
# cells below submit their jobs through it.
bigquery_client = bigquery.Client(project=PROJECT_ID)

In [2]:
# Optional: uncomment to copy the dataset archive from SOURCE_BUCKET and unzip it
# in the working directory. The upload cell below reads its JSON files from a
# local `data` directory (presumably what the archive extracts to -- confirm).
# !gsutil cp {SOURCE_BUCKET}spotify_million_playlist_dataset.zip .
# !unzip spotify_million_playlist_dataset.zip

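#### This step can take a long time: every JSON file in `data/` is uploaded to BigQuery one at a time (TODO: record the measured duration)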

In [None]:
%%time
import os
import json
import pandas as pd
data_files = os.listdir('data')

#make sure there is not already existing data in the playlists table
#loops over json files - converts to pandas then upload/appends
for filename in data_files:
    with open(f'data/{filename}') as f:
        json_dict = json.load(f)
        df = pd.DataFrame(json_dict['playlists'])
        df.to_gbq(
        destination_table=f'{bq_dataset}.playlists', 
        project_id=PROJECT_ID, # TODO: param
        location=REGION, 
        progress_bar=False, 
        reauth=True, 
        if_exists='append'
        ) 

### Import bigquery and run parameterized queries to shape the data

This query parses the JSON strings into BigQuery structs so they can be manipulated in subsequent queries

In [None]:
%%time

# Build `playlists_nested` from the raw `playlists` table.
# The `tracks` column is split into its elements with JSON_EXTRACT_ARRAY, and each
# element is turned into a STRUCT of its scalar fields. JSON_EXTRACT_SCALAR yields
# strings, so every struct field (including `pos` and `duration_ms`) is a STRING
# at this stage; the enrichment query casts them later.
# All other playlist columns are carried over unchanged.
json_extract_query = f"""create or replace table `{PROJECT_ID}.{bq_dataset}.playlists_nested` as (
with json_parsed as (SELECT * except(tracks), JSON_EXTRACT_ARRAY(tracks) as json_data FROM `{PROJECT_ID}.{bq_dataset}.playlists` )

select json_parsed.* except(json_data),
ARRAY(SELECT AS STRUCT
JSON_EXTRACT_SCALAR(json_data, "$.pos") as pos, 
JSON_EXTRACT_SCALAR(json_data, "$.artist_name") as artist_name,
JSON_EXTRACT_SCALAR(json_data, "$.track_uri") as track_uri,
JSON_EXTRACT_SCALAR(json_data, "$.artist_uri") as artist_uri,
JSON_EXTRACT_SCALAR(json_data, "$.track_name") as track_name,
JSON_EXTRACT_SCALAR(json_data, "$.album_uri") as album_uri,
JSON_EXTRACT_SCALAR(json_data, "$.duration_ms") as duration_ms,
JSON_EXTRACT_SCALAR(json_data, "$.album_name") as album_name
from json_parsed.json_data
) as tracks,
from json_parsed) """

# .result() blocks until the query job has finished.
bigquery_client.query(json_extract_query).result()

## Now enrich the playlist songs with the new features

`unique_track_features` - create from file

+

`unique_artist_features` - create from file

These are additional files where features were added 

In [34]:
# Load the artist feature file (newline-delimited JSON in GCS) into the
# `unique_artists_features` table, letting BigQuery infer the schema.
job_config = bigquery.LoadJobConfig(
    autodetect=True,
    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    # Load jobs append by default. Truncate instead, so re-running this cell
    # cannot duplicate artist rows (duplicates would multiply rows in the
    # enrichment join on artist_uri).
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)
uri = f"{SOURCE_BUCKET}unique_artists_features.json"

table_id = f"{PROJECT_ID}.{bq_dataset}.unique_artists_features"

load_job = bigquery_client.load_table_from_uri(
    uri,
    table_id,
    location=REGION,  # Must match the destination dataset location.
    job_config=job_config,
)  # Make an API request.

load_job.result()  # Waits for the job to complete.

# Report the row count of the destination table as a quick sanity check.
destination_table = bigquery_client.get_table(table_id)
print("Loaded {} rows.".format(destination_table.num_rows))

Loaded 295860 rows.


In [35]:
# Load the track feature file (newline-delimited JSON in GCS) into the
# `unique_track_features` table, letting BigQuery infer the schema.
job_config = bigquery.LoadJobConfig(
    autodetect=True,
    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    # Load jobs append by default. Truncate instead, so re-running this cell
    # cannot duplicate track rows (duplicates would multiply rows in the
    # enrichment join on track_uri).
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)
uri = f"{SOURCE_BUCKET}unique_track_features.json"

table_id = f"{PROJECT_ID}.{bq_dataset}.unique_track_features"

load_job = bigquery_client.load_table_from_uri(
    uri,
    table_id,
    location=REGION,  # Must match the destination dataset location.
    job_config=job_config,
)  # Make an API request.

load_job.result()  # Waits for the job to complete.

# Report the row count of the destination table as a quick sanity check.
destination_table = bigquery_client.get_table(table_id)
print("Loaded {} rows.".format(destination_table.num_rows))

Loaded 2262292 rows.


In [37]:
%%time
enrich_query = f"""CREATE OR REPLACE TABLE
  `{PROJECT_ID}.{bq_dataset}.enriched_data` AS (
    SELECT
    a.* except(tracks),
      ARRAY(
    SELECT
      AS STRUCT CAST(track.pos AS int64) AS pos_can,
      case when track.artist_name = '' then 'NONE' else track.artist_name end AS artist_name_can,
      case when track.track_uri = '' then 'NONE' else track.track_uri  end AS track_uri_can,
      case when track.track_name = '' then 'NONE' else track.track_name  end AS track_name_can,
      case when track.album_uri = '' then 'NONE' else track.album_uri  end AS album_uri_can,
      case when track.artist_uri = '' then 'NONE' else track.artist_uri  end AS artist_uri_can,
      CAST(track.duration_ms AS float64) / 1.0 AS duration_ms_can,
      case when track.album_name = '' then 'NONE' else track.album_name end AS album_name_can,
      CAST(IFNULL(tf.track_pop, 0.0) as float64) / 1.0 AS track_pop_can,
      CAST(IFNULL(af.artist_pop, 0.0) as float64) / 1.0  AS artist_pop_can,
      case when af.artist_genres[OFFSET(0)] = '' then ['NONE'] else af.artist_genres end AS artist_genres_can,
      CAST(IFNULL(af.artist_followers, 0.0) as float64) / 1.0 AS artist_followers_can
    FROM
      UNNEST(tracks) as track
    INNER JOIN
      `{PROJECT_ID}.{bq_dataset}.unique_track_features` AS tf --track features
    ON
      (track.track_uri = tf.track_uri)
    INNER JOIN
      `{PROJECT_ID}.{bq_dataset}.unique_artists_features` AS af --artist features
      ON
      (track.artist_uri = af.artist_uri)
      ) AS tracks
  FROM 
  `{PROJECT_ID}.{bq_dataset}.playlists_nested` as a)"""

bigquery_client.query(enrich_query).result()

CPU times: user 31.1 ms, sys: 221 µs, total: 31.3 ms
Wall time: 48.6 s


<google.cloud.bigquery.table._EmptyRowIterator at 0x7f47581f7430>

## Cross join + get rid of after-the-fact `pos` data in playlist

cross_join_songxplaylist_struct_query

Output table: `{PROJECT_ID}.{bq_dataset}.ordered_position_training` (originally built as `hybrid-vertex.spotify_train_3.ordered_position_training`)

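We create a data structure that holds unique song-playlist combos (every possible one, via cross-join). For each candidate song, only the playlist songs at earlier positions are kept as the seed playlist, and the song immediately before the candidate (position `pos_can - 1`) is pulled out as the "seed track"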

In [None]:
%%time
# Build `ordered_position_training`: one row per (playlist, candidate track).
#
# 1. `unnest_cross` cross-joins every playlist row of `enriched_data` with its own
#    `tracks` array, so each track becomes a candidate row (`*_can` columns) that
#    still carries the full `tracks` array of its playlist. `pid_pos_id`
#    ("<pid>-<pos>") identifies the candidate. The playlist-level `duration_ms`
#    column is excluded here.
# 2. The final SELECT drops the playlist-level counters and the raw `tracks` array
#    and rebuilds, per candidate:
#      * `seed_playlist_tracks` -- the tracks whose position is strictly smaller
#        than the candidate's, ordered by position (`*_pl` fields), so nothing
#        that comes after the candidate leaks into its features;
#      * the `*_seed_track` columns -- the track at position `pos_can - 1`.
#    The comma join with UNNEST plus the WHERE clause acts as an inner join, so a
#    candidate with no track at `pos_can - 1` (e.g. the first track of a playlist)
#    produces no row.
cross_join_query = f"""
  CREATE OR REPLACE TABLE
  `{PROJECT_ID}.{bq_dataset}.ordered_position_training` AS (
  WITH
    -- get every combination of song and its parent playlist
    unnest_cross AS(
    SELECT
      b.*,
      CONCAT(b.pid,"-",track.pos_can) AS pid_pos_id,
      CAST(track.pos_can AS int64) AS pos_can,
      track.artist_name_can ,
      track.track_uri_can ,
      track.album_uri_can,
      track.track_name_can ,
      track.artist_uri_can ,
      CAST(track.duration_ms_can AS float64) AS duration_ms_can,
      track.album_name_can ,
      track.track_pop_can ,
      track.artist_pop_can,
      track.artist_genres_can ,
      track.artist_followers_can 
    FROM (
      SELECT
        * EXCEPT(duration_ms)
      FROM
        `{PROJECT_ID}.{bq_dataset}.enriched_data`) AS b
    CROSS JOIN
      UNNEST(tracks) AS track)
  SELECT
    a.* EXCEPT(tracks,
      num_tracks,
      num_artists,
      num_albums,
      num_followers,
      num_edits),
    ARRAY(
    SELECT
      AS STRUCT CAST(track.pos_can AS int64) AS pos_pl,
      track.artist_name_can AS artist_name_pl,
      track.track_uri_can AS track_uri_pl,
      track.track_name_can AS track_name_pl,
      track.album_uri_can AS album_uri_pl,
      track.artist_uri_can AS artist_uri_pl,
      CAST(track.duration_ms_can AS float64) AS duration_ms_pl,
      track.album_name_can AS album_name_pl,
      track.track_pop_can AS track_pop_pl,
      track.artist_pop_can AS artist_pop_pl,
      track.artist_genres_can AS artist_genres_pl,
      track.artist_followers_can AS artist_followers_pl,
    FROM
      UNNEST(tracks) AS track
    WHERE
      CAST(track.pos_can AS int64) < a.pos_can ORDER BY CAST(track.pos_can AS int64)) AS seed_playlist_tracks,
    ----- seed track part
    trx.pos_can AS pos_seed_track,
    trx.artist_name_can AS artist_name_seed_track,
    trx.artist_uri_can AS artist_uri_seed_track,
    trx.track_name_can AS track_name_seed_track,
    trx.track_uri_can AS track_uri_seed_track,
    trx.album_name_can AS album_name_seed_track,
    trx.album_uri_can AS album_uri_seed_track,
    trx.duration_ms_can AS duration_seed_track,
    trx.track_pop_can AS track_pop_seed_track,
    trx.artist_pop_can AS artist_pop_seed_track,
    trx.artist_genres_can as artist_genres_seed_track,
    trx.artist_followers_can as artist_followers_seed_track
  FROM
    unnest_cross AS a -- with statement
    ,
    UNNEST(a.tracks) AS trx
  WHERE
    CAST(trx.pos_can AS int64) = a.pos_can-1);
    """

# .result() blocks until the query job has finished.
bigquery_client.query(cross_join_query).result()

## Update the playlist metadata with the new samples created above

Trainv3-clean-track-features

Get new metadata for the tracks now that there are updated track counts, durations, etc...

Output table: `{PROJECT_ID}.{bq_dataset}.train_dif_artist` (originally built as `hybrid-vertex.spotify_train_3.train`)


In [6]:
%%time
# Build `train_dif_artist` from `ordered_position_training`.
#
# `playlist_features_clean` recomputes playlist-level statistics from the seed
# playlist only (the tracks before the candidate), grouped by `pid_pos_id`:
# total duration, number of songs, distinct artist names and distinct album URIs.
# Dividing by 1.0 turns the integer counts into FLOAT64 values.
#
# The final SELECT joins those statistics back onto each sample and re-emits a
# handful of columns with defaults: only the first genre of the candidate and of
# the seed track is kept ("NONE" when NULL), and NULL popularity/follower values
# become 0.0.
#
# The WHERE clause keeps only samples whose candidate differs from the seed track
# in both album URI and artist URI.
get_new_metadata_query = f"""
create or replace table `{PROJECT_ID}.{bq_dataset}.train_dif_artist` as (
WITH
  playlist_features_clean AS (
  SELECT
    pid_pos_id,
    SUM(trx.duration_ms_pl) / 1.0 AS duration_ms_seed_pl,
    COUNT(1) / 1.0 AS n_songs_pl,
    COUNT(DISTINCT trx.artist_name_pl) / 1.0 AS num_artists_pl,
    COUNT(DISTINCT trx.album_uri_pl) /1.0 AS num_albums_pl,
  FROM
    `{PROJECT_ID}.{bq_dataset}.ordered_position_training`,
    UNNEST(seed_playlist_tracks) AS trx
  GROUP BY
    pid_pos_id)
    
SELECT
  a.* except(artist_genres_can, artist_genres_seed_track, track_pop_can, artist_pop_can, artist_followers_can,
            track_pop_seed_track, artist_pop_seed_track),
  b.* except(pid_pos_id),
  IFNULL(a.artist_genres_can[OFFSET(0)], "NONE") as artist_genres_can,
  IFNULL(a.artist_genres_seed_track[OFFSET(0)], "NONE") as artist_genres_seed_track,
  IFNULL(a.track_pop_can, 0.0) / 1.0 as  track_pop_can, 
  IFNULL(a.artist_pop_can, 0.0) / 1.0 as artist_pop_can,
  IFNULL(a.artist_followers_can, 0.0) / 1.0 as artist_followers_can,
  IFNULL(a.track_pop_seed_track, 0.0) / 1.0 as track_pop_seed_track,
  IFNULL(a.artist_pop_seed_track, 0.0) / 1.0 as artist_pop_seed_track,
  
FROM
  `{PROJECT_ID}.{bq_dataset}.ordered_position_training` a
INNER JOIN
  playlist_features_clean b
ON
  a.pid_pos_id = b.pid_pos_id 
  WHERE album_uri_can != album_uri_seed_track and artist_uri_seed_track != artist_uri_can)
  """

# .result() blocks until the query job has finished.
bigquery_client.query(get_new_metadata_query).result()

CPU times: user 52.1 ms, sys: 7.18 ms, total: 59.3 ms
Wall time: 3min 4s


<google.cloud.bigquery.table._EmptyRowIterator at 0x7fdb52da8850>

## For TFRecords
Get rid of structs by creating new table with arrays from playlist_seed


In [37]:
%%time
train_flatten_query = f"""
create or replace table `{PROJECT_ID}.{bq_dataset}.train_flatten_dif_artist` as (
SELECT a.* except(seed_playlist_tracks, description),
    RAND() as random,
    IFNULL(a.description, "") as description_pl,
    ARRAY(select t.pos_pl from UNNEST(seed_playlist_tracks) t) as pos_pl,
    ARRAY(select t.artist_name_pl from UNNEST(seed_playlist_tracks) t) as artist_name_pl,
    ARRAY(select t.track_uri_pl from UNNEST(seed_playlist_tracks) t) as track_uri_pl,
    ARRAY(select t.track_name_pl from UNNEST(seed_playlist_tracks) t) as track_name_pl,
    ARRAY(select t.duration_ms_pl from UNNEST(seed_playlist_tracks) t) as duration_ms_songs_pl,
    ARRAY(select t.album_name_pl from UNNEST(seed_playlist_tracks) t) as album_name_pl,
    ARRAY(select cast(t.artist_pop_pl as FLOAT64) from UNNEST(seed_playlist_tracks) t) as artist_pop_pl,
    ARRAY(select t.artist_followers_pl from UNNEST(seed_playlist_tracks) t) as artists_followers_pl,
    ARRAY(select case when t.track_pop_pl is null then 0. else t.track_pop_pl end from UNNEST(seed_playlist_tracks) t) as track_pop_pl,
    ARRAY(select t.artist_genres_pl[OFFSET(0)] from UNNEST(seed_playlist_tracks) t) as artist_genres_pl
    from `{PROJECT_ID}.{bq_dataset}.train_dif_artist` a
)
"""

bigquery_client.query(train_flatten_query).result()

CPU times: user 38 ms, sys: 8.46 ms, total: 46.4 ms
Wall time: 53.3 s


<google.cloud.bigquery.table._EmptyRowIterator at 0x7fdb45b4d690>

## We will use this table to export jsonl in the next notebook

Links on the built-in two-tower algorithm and its data [requirements](https://cloud.google.com/vertex-ai/docs/matching-engine/train-embeddings-two-tower#training_data)

In [38]:
%%time
# Fraction of samples routed to the validation table.
VALIDATION_P = 0.1

# Validation split: rows of `train_flatten_dif_artist` whose `random` value is
# below VALIDATION_P. The helper column `random` itself is not copied over.
validation_creation = f"""
CREATE OR REPLACE TABLE
  `{PROJECT_ID}.{bq_dataset}.train_flatten_dif_artist_valid` AS (
    SELECT * except(random)
  FROM
    `{PROJECT_ID}.{bq_dataset}.train_flatten_dif_artist` where random < {VALIDATION_P})"""

# .result() blocks until the query job has finished.
bigquery_client.query(validation_creation).result()

CPU times: user 20 ms, sys: 3.23 ms, total: 23.2 ms
Wall time: 20.3 s


<google.cloud.bigquery.table._EmptyRowIterator at 0x7fdb45b43ad0>

In [39]:
%%time
# NOTE(review): VALIDATION_P is declared again here. It must hold the same value
# as in the validation-split cell, otherwise the two tables overlap or leave a gap.
VALIDATION_P = 0.1

# Train split: the complement of the validation split, i.e. rows whose `random`
# value is at or above VALIDATION_P. The helper column `random` is not copied over.
# NOTE(review): the variable is still called `validation_creation` although this
# query builds the *train* table.
validation_creation = f"""
CREATE OR REPLACE TABLE
  `{PROJECT_ID}.{bq_dataset}.train_flatten_dif_artist_train` AS (
    SELECT * except(random)
  FROM
    `{PROJECT_ID}.{bq_dataset}.train_flatten_dif_artist` where random >= {VALIDATION_P})"""

# .result() blocks until the query job has finished.
bigquery_client.query(validation_creation).result()

CPU times: user 15.1 ms, sys: 11.6 ms, total: 26.7 ms
Wall time: 35.3 s


<google.cloud.bigquery.table._EmptyRowIterator at 0x7fdb45cf30d0>

In [40]:
### Export train
# Export the train split table from BigQuery to GCS as sharded Parquet files.

BUCKET = 'gs://spotify-builtin-2t'
PROJECT = PROJECT_ID
DATASET_ID = bq_dataset
TABLE = 'train_flatten_dif_artist_train'
LOCATION = REGION

from google.cloud import bigquery
# Bind the client to the notebook's project explicitly (as the shared
# `bigquery_client` does) instead of relying on whatever default project the
# environment happens to provide.
client = bigquery.Client(project=PROJECT)

# The `*` wildcard lets BigQuery split the export across several files.
# NOTE(review): no compression is set on the job config; confirm the exported
# files really are Snappy-compressed, as the `.snappy.parquet` suffix suggests.
destination_uri = f"{BUCKET}/train_data_parquet_dif_artist_train/*.snappy.parquet"
dataset_ref = bigquery.DatasetReference(PROJECT, DATASET_ID)
table_ref = dataset_ref.table(TABLE)
job_config = bigquery.job.ExtractJobConfig()
job_config.destination_format = bigquery.DestinationFormat.PARQUET
extract_job = client.extract_table(
    table_ref,
    destination_uri,
    job_config=job_config,
    # Location must match that of the source table.
    location=LOCATION,
)  # API request
extract_job.result()  # Waits for job to complete.

ExtractJob<project=hybrid-vertex, location=us-central1, id=686bf6ab-7b5b-4dd7-846d-6c5831b836c5>

In [None]:
### Export validation
# Export the validation split table from BigQuery to GCS as sharded Parquet files.

BUCKET = 'gs://spotify-builtin-2t'
PROJECT = PROJECT_ID
DATASET_ID = bq_dataset
TABLE = 'train_flatten_dif_artist_valid'
LOCATION = REGION

from google.cloud import bigquery
# Bind the client to the notebook's project explicitly (as the shared
# `bigquery_client` does) instead of relying on whatever default project the
# environment happens to provide.
client = bigquery.Client(project=PROJECT)

# The `*` wildcard lets BigQuery split the export across several files.
# NOTE(review): no compression is set on the job config; confirm the exported
# files really are Snappy-compressed, as the `.snappy.parquet` suffix suggests.
destination_uri = f"{BUCKET}/train_data_parquet_dif_artist_valid/*.snappy.parquet"
dataset_ref = bigquery.DatasetReference(PROJECT, DATASET_ID)
table_ref = dataset_ref.table(TABLE)
job_config = bigquery.job.ExtractJobConfig()
job_config.destination_format = bigquery.DestinationFormat.PARQUET
extract_job = client.extract_table(
    table_ref,
    destination_uri,
    job_config=job_config,
    # Location must match that of the source table.
    location=LOCATION,
)  # API request
extract_job.result()  # Waits for job to complete.