# TODOs
* clean up notebook
* parameterize
* offer large and small options for producing dataset (create optionals)

#### Step 0: Dependencies

Run this one time when starting, then restart the kernel

In [1]:
# !pip install pandas pandas-gbq==0.12.0 --user

# Data prep

## In this notebook we will load the songs from the zip file, and perform transformations to prepare the data for two-tower training
Steps
1. Extract from the zip file
2. Upload to BQ
3. Enrich features for the playlist songs
4. Cross-join songs with features (expected rows = n_songs x n_playlists)
5. Remove after-the-fact (later position songs) from the newly generated samples
6. Create a clean train table, and flatten structs or use arrays

#### Unzip the file and upload to BQ
Source of data if you want to download zip: gs://spotify-million-playlist-dataset

In [15]:
import time

from google.cloud import bigquery

# Notebook-wide settings: point these at your own project, region and dataset.
PROJECT_ID = "hybrid-vertex"                             # project passed to the BigQuery client
REGION = "us-central1"                                   # location for the client and the load jobs
bq_dataset = "a_spotify_ds"                              # dataset the tables below are written to
SOURCE_BUCKET = "gs://spotify-million-playlist-dataset"  # bucket holding the source files

# A single client shared by the queries and load jobs further down.
bigquery_client = bigquery.Client(location=REGION, project=PROJECT_ID)

# Echo the configured location as the cell output.
bigquery_client.location

'us-central1'

In [16]:

# Create the destination dataset in the client's default location.
# `exists_ok=True` keeps the cell re-runnable: without it a second run raises
# `google.api_core.exceptions.Conflict` because the dataset already exists.
dataset = bigquery_client.create_dataset(bq_dataset, exists_ok=True, timeout=30)  # Make an API request.

# Worded so the message is accurate whether the dataset was just created or already existed.
print(f"Dataset {bigquery_client.project}.{dataset.dataset_id} is ready in location: {dataset.location}")

Created dataset hybrid-vertex.a_spotify_ds in location: us-central1


## Next create unique artist and song tables
These tables contain features obtained via the public Spotify API. Features such as track and artist popularity are in this data. For more detail on loading json data to Bigquery, [see here](https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-json)

![](img/unique-songs.png)

In [19]:
# Destination table for the per-track features file.
table_id = f"{PROJECT_ID}.{bq_dataset}.unique_track_features"

job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField("track_pop", "INTEGER"),
        bigquery.SchemaField("tracks_playlist_titles", "STRING"),
        bigquery.SchemaField("track_uri", "STRING"),
    ],
    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    # Load jobs append by default, so re-running this cell would duplicate every
    # row (and inflate the joins below). Replace the table contents instead.
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)
uri = f"{SOURCE_BUCKET}/unique_track_features.gzip"

load_job = bigquery_client.load_table_from_uri(
    uri,
    table_id,
    location=REGION,  # Must match the destination dataset location
    job_config=job_config,
)  # Make an API request.

load_job.result()  # Waits for the job to complete.

# Read the row count back from the table metadata as a sanity check.
destination_table = bigquery_client.get_table(table_id)
print(f"Loaded {destination_table.num_rows} rows.")

Loaded 4524584 rows.


### Unique artists

![](img/unique-artists.png)

In [21]:
start_time = time.time()

# Destination table for the per-artist features file.
table_id = f"{PROJECT_ID}.{bq_dataset}.unique_artist_features"

job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField("artist_genres", "STRING", "REPEATED"),
        bigquery.SchemaField("artist_pop", "INTEGER"),
        bigquery.SchemaField("artist_followers", "INTEGER"),
        bigquery.SchemaField("artist_uri", "STRING"),
        bigquery.SchemaField("artist_name", "STRING"),
    ],
    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    # Load jobs append by default, so re-running this cell would duplicate every
    # row (and inflate the joins below). Replace the table contents instead.
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)
uri = f"{SOURCE_BUCKET}/unique_artist_features.gzip"

load_job = bigquery_client.load_table_from_uri(
    uri,
    table_id,
    location=REGION,  # Must match the destination dataset location
    job_config=job_config,
)  # Make an API request.

load_job.result()  # Waits for the job to complete.

# Whole minutes only: a load that finishes in under a minute reports 0.
end_time = time.time()
runtime_mins = int((end_time - start_time) / 60)
print(f"elapsed time (mins): {runtime_mins}")

# Read the row count back from the table metadata as a sanity check.
destination_table = bigquery_client.get_table(table_id)
print(f"Loaded {destination_table.num_rows} rows.")

Loaded 295860 rows.
elapsed time (mins): 0


### Track `audio_features`

* remove `nulls`; replace with `-1` etc.

In [None]:
# CREATE OR REPLACE TABLE `hybrid-vertex.a_spotify_ds.unique_track_features_plus` AS (
#   SELECT 
#     unis.track_pop as track_pop,
#     unis.tracks_playlist_titles as tracks_playlist_titles,
#     unis.track_uri as track_uri,
#     auds.danceability as danceability,
#     auds.energy as energy,
#     auds.key as key,
#     auds.loudness as loudness,
#     auds.mode as mode,
#     auds.speechiness as speechiness,
#     auds.acousticness as acousticness,
#     auds.instrumentalness as instrumentalness,
#     auds.liveness as liveness,
#     auds.valence as valence,
#     auds.tempo as tempo,
#     auds.time_signature as time_signature,
#   -- uri,
#   FROM 
#     `hybrid-vertex.a_spotify_ds.unique_track_features` as unis
#   LEFT JOIN  
#     `hybrid-vertex.spotify_mpd.track_audio` as auds
#   ON
#     unis.track_uri = auds.uri
# );

converting track key, mode, and time signature to string...

In [None]:
# Build `unique_track_features_plus_v3`: the track features loaded above, left-joined
# with per-track audio features. `key`, `mode` and `time_signature` are cast to
# STRING so they can be treated as categorical features downstream.
#
# The statement is submitted through the BigQuery client (like the other queries in
# this notebook) because bare SQL in a Python cell is a SyntaxError on "Run All".
# The destination/source tables in this notebook's dataset use PROJECT_ID / bq_dataset.
#
# NOTE(review): `hybrid-vertex.spotify_mpd.track_audio` is not created anywhere in the
# visible part of this notebook, so its name is left exactly as written — confirm
# where it comes from before running in another project.
track_features_plus_query = f"""CREATE OR REPLACE TABLE `{PROJECT_ID}.{bq_dataset}.unique_track_features_plus_v3` AS (
  SELECT 
    unis.track_pop as track_pop,
    unis.tracks_playlist_titles as tracks_playlist_titles,
    unis.track_uri as track_uri,
    auds.track_name as track_name,
    auds.danceability as danceability,
    auds.energy as energy,
    CAST(auds.key AS STRING) as key,
    auds.loudness as loudness,
    CAST(auds.mode AS STRING) as mode,
    auds.speechiness as speechiness,
    auds.acousticness as acousticness,
    auds.instrumentalness as instrumentalness,
    auds.liveness as liveness,
    auds.valence as valence,
    auds.tempo as tempo,
    CAST(auds.time_signature AS STRING) as time_signature,
  FROM 
    `{PROJECT_ID}.{bq_dataset}.unique_track_features` as unis
  LEFT JOIN  
    `hybrid-vertex.spotify_mpd.track_audio` as auds
  ON
    unis.track_uri = auds.uri
);"""

bigquery_client.query(track_features_plus_query).result()

##### The data is now in BQ

## The tables are set for feature enrichment
We will revisit these tables later. For now, let's load the Million Playlist Dataset locally and push it to BigQuery using `pandas-gbq` (see the requirements installation at the top).

In [22]:
# Print the current working directory (shell escape); the cells below read the
# extracted dataset through the relative path `data/data`.
!pwd
# gsutil cp gs://spotify-million-playlist-dataset/spotify_million_playlist_dataset.zip .

/home/jupyter/jw-repo/spotify_mpd_two_tower


In [24]:
# start_time = time.time()

# !gsutil cp {SOURCE_BUCKET}/spotify_million_playlist_dataset.zip .
# !unzip spotify_million_playlist_dataset.zip

# end_time = time.time()
# runtime_mins = int((end_time - start_time) / 60)
# print(f"elapsed time (mins): {runtime_mins}")

#### This step can take up to 30 minutes

In [28]:
import os

# List everything under data/data and show the first two names as the cell output.
data_files = os.listdir('data/data')
data_files[:2]

['mpd.slice.906000-906999.json', 'mpd.slice.253000-253999.json']

In [29]:
%%time
import os
import json
import pandas as pd

# Keep only the JSON slice files: `os.listdir` returns every directory entry, and a
# stray non-JSON entry (for example a notebook checkpoint folder) would crash the loop.
# Sorting makes the upload order deterministic.
data_files = sorted(name for name in os.listdir('data/data') if name.endswith('.json'))

# Rows are APPENDED to `<bq_dataset>.playlists`: make sure that table does not already
# hold data before running this cell, or the playlists will be duplicated.
# Each slice file is parsed, converted to a DataFrame, and uploaded on its own.
for filename in data_files:
    # Close the file as soon as it is parsed instead of holding it open during the upload.
    with open(os.path.join('data/data', filename)) as f:
        json_dict = json.load(f)

    df = pd.DataFrame(json_dict['playlists'])
    df.to_gbq(
        destination_table=f'{bq_dataset}.playlists',
        project_id=PROJECT_ID,
        location=REGION,
        progress_bar=False,
        # `reauth=True` forces re-authentication on every call, i.e. once per slice
        # file in this loop; reuse the existing credentials instead.
        reauth=False,
        if_exists='append',
    )

CPU times: user 13min 10s, sys: 36.4 s, total: 13min 46s
Wall time: 1h 52min 19s


Now the data is loaded, but each playlist's tracks are stored as one large string that needs to be parsed. We will use BigQuery's JSON functions to address this.

![](img/tracks-string.png)

### Import bigquery and run parameterized queries to shape the data

This query formats the json strings to be read as Bigquery structs, to be manipulated in subsequent queries

In [None]:
%%time

# Build `playlists_nested` from the raw `playlists` table.
# The `tracks` column is split into an array with JSON_EXTRACT_ARRAY, then each
# element is turned into a STRUCT whose fields (pos, artist_name, track_uri,
# artist_uri, track_name, album_uri, duration_ms, album_name) are pulled out with
# JSON_EXTRACT_SCALAR. Every extracted field is therefore a STRING at this stage;
# all other playlist columns are carried through unchanged.
json_extract_query = f"""create or replace table `{PROJECT_ID}.{bq_dataset}.playlists_nested` as (
with json_parsed as (SELECT * except(tracks), JSON_EXTRACT_ARRAY(tracks) as json_data FROM `{PROJECT_ID}.{bq_dataset}.playlists` )

select json_parsed.* except(json_data),
ARRAY(SELECT AS STRUCT
JSON_EXTRACT_SCALAR(json_data, "$.pos") as pos, 
JSON_EXTRACT_SCALAR(json_data, "$.artist_name") as artist_name,
JSON_EXTRACT_SCALAR(json_data, "$.track_uri") as track_uri,
JSON_EXTRACT_SCALAR(json_data, "$.artist_uri") as artist_uri,
JSON_EXTRACT_SCALAR(json_data, "$.track_name") as track_name,
JSON_EXTRACT_SCALAR(json_data, "$.album_uri") as album_uri,
JSON_EXTRACT_SCALAR(json_data, "$.duration_ms") as duration_ms,
JSON_EXTRACT_SCALAR(json_data, "$.album_name") as album_name
from json_parsed.json_data
) as tracks,
from json_parsed) """

# Submit the statement and block until the table has been (re)created.
bigquery_client.query(json_extract_query).result()

Now `playlists_nested` has parsed the string data to a struct with arrays that will allow us to process the data much more easily

![](img/playlists-nested.png)

## Now enrich the playlist songs with the new features

`unique_track_features` - created from file above

+

`unique_artist_features` - created from file above

These are additional tables where features were added in the beginning of the notebook

In [37]:
%%time
# Build `enriched_data`: every playlist from `playlists_nested`, with its `tracks`
# array rebuilt so that each track carries typed "candidate" (`*_can`) fields plus
# the track / artist features loaded at the top of the notebook.
#
# Fixes relative to the previous version of this cell:
#   * the artist join now targets `unique_artist_features`, the table this notebook
#     actually creates (it previously referenced `unique_artists_features`, which
#     does not exist after a fresh run);
#   * `artist_genres[SAFE_OFFSET(0)]` replaces `[OFFSET(0)]`, which raises an
#     out-of-bounds error on an empty genres array. An empty array is now mapped to
#     ['NONE'], same as an array whose first element is ''.
#
# Both joins are INNER JOINs, so tracks without a matching track or artist feature
# row are dropped from the rebuilt array.
enrich_query = f"""CREATE OR REPLACE TABLE
  `{PROJECT_ID}.{bq_dataset}.enriched_data` AS (
    SELECT
    a.* except(tracks),
      ARRAY(
    SELECT
      AS STRUCT CAST(track.pos AS int64) AS pos_can,
      case when track.artist_name = '' then 'NONE' else track.artist_name end AS artist_name_can,
      case when track.track_uri = '' then 'NONE' else track.track_uri  end AS track_uri_can,
      case when track.album_uri = '' then 'NONE' else track.album_uri  end AS album_uri_can,
      case when track.artist_uri = '' then 'NONE' else track.artist_uri  end AS artist_uri_can,
      CAST(track.duration_ms AS float64) / 1.0 AS duration_ms_can,
      case when track.album_name = '' then 'NONE' else track.album_name end AS album_name_can,
      CAST(IFNULL(tf.track_pop, 0.0) as float64) / 1.0 AS track_pop_can,
      CAST(IFNULL(af.artist_pop, 0.0) as float64) / 1.0  AS artist_pop_can,
      case when IFNULL(af.artist_genres[SAFE_OFFSET(0)], '') = '' then ['NONE'] else af.artist_genres end AS artist_genres_can,
      CAST(IFNULL(af.artist_followers, 0.0) as float64) / 1.0 AS artist_followers_can
    FROM
      UNNEST(tracks) as track
    INNER JOIN
      `{PROJECT_ID}.{bq_dataset}.unique_track_features` AS tf --track features
    ON
      (track.track_uri = tf.track_uri)
    INNER JOIN
      `{PROJECT_ID}.{bq_dataset}.unique_artist_features` AS af --artist features
      ON
      (track.artist_uri = af.artist_uri)
      ) AS tracks
  FROM 
  `{PROJECT_ID}.{bq_dataset}.playlists_nested` as a)"""

# Submit the statement and block until the table has been (re)created.
bigquery_client.query(enrich_query).result()

CPU times: user 31.1 ms, sys: 221 µs, total: 31.3 ms
Wall time: 48.6 s


<google.cloud.bigquery.table._EmptyRowIterator at 0x7f47581f7430>

### JT -adding audio features

In [None]:

# CREATE OR REPLACE TABLE `hybrid-vertex.a_spotify_ds.enriched_data_jt` AS (
#   SELECT
#   a.* except(tracks),
#       ARRAY(
#     SELECT
#       AS STRUCT CAST(track.pos AS int64) AS pos_can,
#       case when track.artist_name = '' then 'NONE' else track.artist_name end AS artist_name_can,
#       case when track.track_uri = '' then 'NONE' else track.track_uri  end AS track_uri_can,
#       case when track.album_uri = '' then 'NONE' else track.album_uri  end AS album_uri_can,
#       case when track.artist_uri = '' then 'NONE' else track.artist_uri  end AS artist_uri_can,
#       CAST(track.duration_ms AS float64) / 1.0 AS duration_ms_can,
#       case when track.album_name = '' then 'NONE' else track.album_name end AS album_name_can,
#       CAST(IFNULL(tf.track_pop, 0.0) as float64) / 1.0 AS track_pop_can,

#       CASE WHEN tf.tracks_playlist_titles = '' then 'NONE' ELSE tf.tracks_playlist_titles END AS tracks_playlist_titles,
#       CAST(IFNULL(tf.danceability, 0.0) as float64) / 1.0 AS track_danceability,
#       CAST(IFNULL(tf.energy, 0.0) as float64) / 1.0 AS track_energy,
#       CAST(IFNULL(tf.key, 0.0) as float64) / 1.0 AS track_key,
#       CAST(IFNULL(tf.loudness, 0.0) as float64) / 1.0 AS track_loudness,
#       CAST(IFNULL(tf.mode, 0.0) as float64) / 1.0 AS track_mode,
#       CAST(IFNULL(tf.speechiness, 0.0) as float64) / 1.0 AS track_speechiness,
#       CAST(IFNULL(tf.acousticness, 0.0) as float64) / 1.0 AS track_acousticness,
#       CAST(IFNULL(tf.instrumentalness, 0.0) as float64) / 1.0 AS track_instrumentalness,
#       CAST(IFNULL(tf.liveness, 0.0) as float64) / 1.0 AS track_liveness,
#       CAST(IFNULL(tf.valence, 0.0) as float64) / 1.0 AS track_valence,
#       CAST(IFNULL(tf.tempo, 0.0) as float64) / 1.0 AS track_tempo,
#       CAST(IFNULL(tf.time_signature, 0.0) as float64) / 1.0 AS time_signature,

#       CAST(IFNULL(af.artist_pop, 0.0) as float64) / 1.0  AS artist_pop_can,
#       case when af.artist_genres[OFFSET(0)] = '' then ['NONE'] else af.artist_genres end AS artist_genres_can,
#       CAST(IFNULL(af.artist_followers, 0.0) as float64) / 1.0 AS artist_followers_can
#     FROM
#       UNNEST(tracks) as track
#     INNER JOIN
#       `hybrid-vertex.a_spotify_ds.unique_track_features_plus` AS tf --track features
#     ON
#       (track.track_uri = tf.track_uri)
#     INNER JOIN
#       `hybrid-vertex.a_spotify_ds.unique_artist_features` AS af --artist features
#       ON
#       (track.artist_uri = af.artist_uri)
#       ) AS tracks
#   FROM 
#   `hybrid-vertex.mdp_eda_test.playlists_nested` as a)

Converting `mode`, `key`, and `time_signature` to categorical/string

In [None]:
# Build `enriched_data_jt_v3`: same shape as `enriched_data`, but each track also
# carries the audio features from `unique_track_features_plus_v3`. `key`, `mode`
# and `time_signature` are kept as strings (categorical), with NULL mapped to 'NONE';
# the numeric audio features default to 0.0 when NULL.
#
# Changes relative to the previous version of this cell:
#   * the statement is submitted through the BigQuery client (bare SQL in a Python
#     cell is a SyntaxError on "Run All") and uses PROJECT_ID / bq_dataset instead
#     of hard-coded names;
#   * playlists are read from this notebook's own `playlists_nested` table rather
#     than `hybrid-vertex.mdp_eda_test.playlists_nested`, which nothing in the
#     visible notebook creates — NOTE(review): confirm the two tables are equivalent;
#   * `artist_genres[SAFE_OFFSET(0)]` replaces `[OFFSET(0)]`, which raises an
#     out-of-bounds error on an empty genres array. An empty array is now mapped to
#     ['NONE'], same as an array whose first element is ''.
enrich_audio_query = f"""CREATE OR REPLACE TABLE `{PROJECT_ID}.{bq_dataset}.enriched_data_jt_v3` AS (
  SELECT
  a.* except(tracks),
      ARRAY(
    SELECT
      AS STRUCT CAST(track.pos AS int64) AS pos_can,
      case when track.artist_name = '' then 'NONE' else track.artist_name end AS artist_name_can,
      case when track.track_uri = '' then 'NONE' else track.track_uri  end AS track_uri_can,
      case when track.track_name = '' then 'NONE' else track.track_name  end AS track_name_can,
      case when track.album_uri = '' then 'NONE' else track.album_uri  end AS album_uri_can,
      case when track.artist_uri = '' then 'NONE' else track.artist_uri  end AS artist_uri_can,
      CAST(track.duration_ms AS float64) / 1.0 AS duration_ms_can,
      case when track.album_name = '' then 'NONE' else track.album_name end AS album_name_can,
      CAST(IFNULL(tf.track_pop, 0.0) as float64) / 1.0 AS track_pop_can,

      CASE WHEN tf.tracks_playlist_titles = '' then 'NONE' ELSE tf.tracks_playlist_titles END AS tracks_playlist_titles,
      CAST(IFNULL(tf.danceability, 0.0) as float64) / 1.0 AS track_danceability,
      CAST(IFNULL(tf.energy, 0.0) as float64) / 1.0 AS track_energy,
      CASE WHEN tf.key IS NULL THEN 'NONE' ELSE tf.key END AS track_key,
      CAST(IFNULL(tf.loudness, 0.0) as float64) / 1.0 AS track_loudness,
      CASE WHEN tf.mode IS NULL THEN 'NONE' ELSE tf.mode END AS track_mode,
      CAST(IFNULL(tf.speechiness, 0.0) as float64) / 1.0 AS track_speechiness,
      CAST(IFNULL(tf.acousticness, 0.0) as float64) / 1.0 AS track_acousticness,
      CAST(IFNULL(tf.instrumentalness, 0.0) as float64) / 1.0 AS track_instrumentalness,
      CAST(IFNULL(tf.liveness, 0.0) as float64) / 1.0 AS track_liveness,
      CAST(IFNULL(tf.valence, 0.0) as float64) / 1.0 AS track_valence,
      CAST(IFNULL(tf.tempo, 0.0) as float64) / 1.0 AS track_tempo,
      CASE WHEN tf.time_signature IS NULL THEN 'NONE' ELSE tf.time_signature END AS time_signature,

      CAST(IFNULL(af.artist_pop, 0.0) as float64) / 1.0  AS artist_pop_can,
      case when IFNULL(af.artist_genres[SAFE_OFFSET(0)], '') = '' then ['NONE'] else af.artist_genres end AS artist_genres_can,
      CAST(IFNULL(af.artist_followers, 0.0) as float64) / 1.0 AS artist_followers_can
    FROM
      UNNEST(tracks) as track
    INNER JOIN
      `{PROJECT_ID}.{bq_dataset}.unique_track_features_plus_v3` AS tf --track features
    ON
      (track.track_uri = tf.track_uri)
    INNER JOIN
      `{PROJECT_ID}.{bq_dataset}.unique_artist_features` AS af --artist features
      ON
      (track.artist_uri = af.artist_uri)
      ) AS tracks
  FROM 
  `{PROJECT_ID}.{bq_dataset}.playlists_nested` as a)"""

# Submit the statement and block until the table has been (re)created.
bigquery_client.query(enrich_audio_query).result()

## Cross join + get rid of after-the-fact `pos` data in playlist

cross_join_songxplaylist_struct_query

`hybrid-vertex.spotify_train_3.ordered_position_training`

We create a table of unique song-playlist combinations (every possible pairing, via a cross join). There is also a step that pulls the last song in the playlist (the track just before the candidate) as the "seed track".
________
### Note on the approach

Semantic matching requires pairs or triplets (tuples, generally) of co-occurring items. This is a very broad definition, and with this newer approach many new use cases are being explored. A simple example is finding pairs of user queries and purchases. Each training example is a pair: (the features we know from the user query, the features we know about the product they ultimately purchased).

There are other approaches where triples are considered, and there are advanced techniques on negative sampling, finding “bad” examples of query, product pairs, which we will not cover here.

Note there are other sampling techniques we highlight below (different artist/album)

The chosen task was predicting the next song on a playlist, given the playlist's existing order. The approach taken was to create pairs for all child songs and their parent playlists. We did this by leveraging BigQuery's `UNNEST` and `CROSS JOIN`.

We also had rich features for playlists, albums and songs in another table that was later used to enrich post `CROSS JOIN`. This was done to optimize the computation since the cross-joining is expensive and it was subsequently much quicker to enrich after this step.

Now that we had completed this step, we had all combinations of (child song, playlist) pairs. The song was the candidate label, but the playlist still contained the candidate label and all songs after it. Additional criteria were added to remove the candidate song and all songs that occur after the candidate in the playlist. For the sake of performance we also only considered the last 5 played songs. Other sampling configurations are available in the example notebook as well (for example, only predicting when there are album and artist switches).

What this results in is a training dataset that has all possible child song candidates joined with the full playlist data, and the playlist data is properly censored as to only contain songs up to before the candidate song.

![](img/semantic-pair.png)

In [3]:
# %%time
# cross_join_query = f"""
#   CREATE OR REPLACE TABLE
#   `{PROJECT_ID}.{bq_dataset}.ordered_position_training` AS (
#   WITH
#     -- get every combination of song and its parent playlist
#     unnest_cross AS(
#     SELECT
#       b.*,
#       CONCAT(b.pid,"-",track.pos_can) AS pid_pos_id,
#       CAST(track.pos_can AS int64) AS pos_can,
#       IFNULL(track.artist_name_can, "NONE") as artist_name_can ,
#       track.track_uri_can ,
#       track.album_uri_can,
#       IFNULL(track.track_name_can, "NONE") as track_name_can ,
#       track.artist_uri_can ,
#       CAST(track.duration_ms_can AS float64) AS duration_ms_can,
#       track.album_name_can ,
#       track.track_pop_can ,
#       track.artist_pop_can,
#       track.artist_genres_can ,
#       track.artist_followers_can 
#     FROM (
#       SELECT
#         * EXCEPT(duration_ms)
#       FROM
#         `{PROJECT_ID}.{bq_dataset}.enriched_data`) AS b
#     CROSS JOIN
#       UNNEST(tracks) AS track)
#   SELECT
#     a.* EXCEPT(tracks,
#       num_tracks,
#       num_artists,
#       num_albums,
#       num_followers,
#       num_edits),
#     ARRAY(
#     SELECT
#       AS STRUCT CAST(track.pos_can AS int64) AS pos_pl,
#       track.artist_name_can AS artist_name_pl,
#       track.track_uri_can AS track_uri_pl,
#       track.track_name_can AS track_name_pl,
#       track.album_uri_can AS album_uri_pl,
#       track.artist_uri_can AS artist_uri_pl,
#       CAST(track.duration_ms_can AS float64) AS duration_ms_pl,
#       track.album_name_can AS album_name_pl,
#       track.track_pop_can AS track_pop_pl,
#       track.artist_pop_can AS artist_pop_pl,
#       track.artist_genres_can AS artist_genres_pl,
#       track.artist_followers_can AS artist_followers_pl,
#     FROM
#       UNNEST(tracks) AS track
#     WHERE
#       CAST(track.pos_can AS int64) < a.pos_can ORDER BY CAST(track.pos_can AS int64)) AS seed_playlist_tracks,
#     ----- seed track part
#     trx.pos_can AS pos_seed_track,
#     trx.artist_name_can AS artist_name_seed_track,
#     trx.artist_uri_can AS artist_uri_seed_track,
#     trx.track_name_can AS track_name_seed_track,
#     trx.track_uri_can AS track_uri_seed_track,
#     trx.album_name_can AS album_name_seed_track,
#     trx.album_uri_can AS album_uri_seed_track,
#     trx.duration_ms_can AS duration_seed_track,
#     trx.track_pop_can AS track_pop_seed_track,
#     trx.artist_pop_can AS artist_pop_seed_track,
#     trx.artist_genres_can as artist_genres_seed_track,
#     trx.artist_followers_can as artist_followers_seed_track
#   FROM
#     unnest_cross AS a -- with statement
#     ,
#     UNNEST(a.tracks) AS trx
#   WHERE
#     CAST(trx.pos_can AS int64) = a.pos_can-1);
#     """

# bigquery_client.query(cross_join_query).result()

CPU times: user 71.8 ms, sys: 15.7 ms, total: 87.5 ms
Wall time: 8min 12s


<google.cloud.bigquery.table._EmptyRowIterator at 0x7fc8861f7b10>

In [None]:
# CREATE OR REPLACE TABLE
# `hybrid-vertex.a_spotify_ds_1m.all_seed_can_pl_tracks` AS (
#   SELECT 
#     left_pls.pid AS pid_src,
#     left_pls.name AS pl_name_src,
#     left_pls.collaborative AS pl_collaborative_src,
#     left_pls.num_tracks AS num_pl_tracks_src,
#     left_pls.num_albums AS num_pl_albums_src,
#     left_pls.num_artists AS num_pl_artists_src,
#     left_pls.num_followers AS num_pl_followers_src,
#     left_pls.duration_ms AS pl_duration_ms_src,
#     IFNULL(left_pls.description, "NONE") AS pl_description_src,
#     -- nested tracks
#     ARRAY(
#       SELECT
#         AS STRUCT CAST(un_tracks.pos_can AS int64) AS pos_pl,
#         un_tracks.artist_name_can AS artist_name_pl,
#         un_tracks.track_uri_can AS track_uri_pl,
#         un_tracks.track_name AS track_name_pl,
#         un_tracks.album_uri_can AS album_uri_pl,
#         un_tracks.artist_uri_can AS artist_uri_pl,
#         CAST(un_tracks.duration_ms_can AS float64) AS duration_ms_pl,
#         un_tracks.album_name_can AS album_name_pl,
#         IFNULL(un_tracks.track_pop_can, 0.0) / 1.0 AS track_pop_pl,
#         IFNULL(un_tracks.artist_pop_can, 0.0) / 1.0 AS artist_pop_pl,
#         IFNULL(un_tracks.artist_genres_can[OFFSET(0)], "NONE") AS artist_genres_pl,
#         IFNULL(un_tracks.artist_followers_can, 0.0) / 1.0 AS artist_followers_pl,
#         un_tracks.tracks_playlist_titles AS tracks_playlist_titles_pl,
#         IFNULL(un_tracks.track_danceability, 0.0) / 1.0 AS track_danceability_pl,
#         IFNULL(un_tracks.track_energy, 0.0) / 1.0 as track_enrgy_pl,
#         IFNULL(un_tracks.track_key, 0.0) / 1.0 as track_key_pl,
#         IFNULL(un_tracks.track_loudness, 0.0) / 1.0 as track_loudness_pl,
#         IFNULL(un_tracks.track_mode, 0.0) / 1.0 as track_mode_pl,
#         IFNULL(un_tracks.track_speechiness, 0.0) / 1.0 as track_speechiness_pl,
#         IFNULL(un_tracks.track_acousticness, 0.0) / 1.0 as track_acousticness_pl,
#         IFNULL(un_tracks.track_liveness, 0.0) / 1.0 as track_liveness_pl,
#         IFNULL(un_tracks.track_valence, 0.0) / 1.0 as track_valence_pl,
#         IFNULL(un_tracks.track_tempo, 0.0) / 1.0 as track_tempo_pl,
#         IFNULL(un_tracks.time_signature, 0.0) / 1.0 as time_signature_pl,
#       FROM
#         UNNEST(left_pls.tracks) AS un_tracks
#     ) AS seed_playlist_tracks,
#     -- candidate track
#     seeds_and_cans.pid_pos_id,
#     seeds_and_cans.pos_can,
#     seeds_and_cans.track_uri_can,
#     seeds_and_cans.track_name_can,
#     seeds_and_cans.artist_uri_can,
#     seeds_and_cans.artist_name_can,
#     seeds_and_cans.album_uri_can,
#     seeds_and_cans.album_name_can,
#     IFNULL(seeds_and_cans.duration_ms_can, 0.0) / 1.0 AS duration_ms_can,
#     IFNULL(seeds_and_cans.track_pop_can, 0.0) / 1.0 AS track_pop_can,
#     IFNULL(seeds_and_cans.artist_pop_can, 0.0) / 1.0 AS artist_pop_can,
#     IFNULL(seeds_and_cans.artist_genres_can[OFFSET(0)], "NONE") AS artist_genres_can,
#     IFNULL(seeds_and_cans.artist_followers_can, 0.0) / 1.0 AS artist_followers_can,
#     IFNULL(can_feat_plus.tracks_playlist_titles, "NONE") AS track_pl_titles_can,
#     IFNULL(can_feat_plus.danceability, 0.0) / 1.0 AS track_danceability_can,
#     IFNULL(can_feat_plus.energy, 0.0) / 1.0 AS track_energy_can,
#     IFNULL(can_feat_plus.key, 0.0) / 1.0 AS track_key_can,
#     IFNULL(can_feat_plus.loudness, 0.0) / 1.0 AS track_loudness_can,
#     IFNULL(can_feat_plus.mode, 0.0) / 1.0 AS track_mode_can,
#     IFNULL(can_feat_plus.speechiness, 0.0) / 1.0 AS track_speechiness_can,
#     IFNULL(can_feat_plus.acousticness, 0.0) / 1.0 AS track_acousticness_can,
#     IFNULL(can_feat_plus.instrumentalness, 0.0) / 1.0 AS track_instrumentalness_can,
#     IFNULL(can_feat_plus.liveness, 0.0) / 1.0 AS track_liveness_can,
#     IFNULL(can_feat_plus.valence, 0.0) / 1.0 AS track_valence_can,
#     IFNULL(can_feat_plus.tempo, 0.0) / 1.0 AS track_tempo_can,
#     IFNULL(can_feat_plus.time_signature, 0.0) / 1.0 AS track_time_signature_can,
#     -- seed tracks
#     seeds_and_cans.pos_seed_track,
#     seeds_and_cans.track_uri_seed_track,
#     seeds_and_cans.track_name_seed_track,
#     seeds_and_cans.artist_uri_seed_track,
#     seeds_and_cans.artist_name_seed_track,
#     seeds_and_cans.album_uri_seed_track,
#     seeds_and_cans.album_name_seed_track,
#     IFNULL(seeds_and_cans.duration_seed_track, 0.0) / 1.0 AS duration_seed_track,
#     IFNULL(seeds_and_cans.track_pop_seed_track, 0.0) / 1.0 AS track_pop_seed_track,
#     IFNULL(seeds_and_cans.artist_pop_seed_track, 0.0) / 1.0 AS artist_pop_seed_track,
#     IFNULL(seeds_and_cans.artist_genres_seed_track[OFFSET(0)], "NONE") AS artist_genres_seed_track,
#     IFNULL(seeds_and_cans.artist_followers_seed_track, 0.0) / 1.0 AS artist_followers_seed_track,
#     IFNULL(seed_feat_plus.tracks_playlist_titles, "NONE") AS track_pl_titles_seed_track,
#     IFNULL(seed_feat_plus.danceability, 0.0) / 1.0 AS danceability_seed_track,
#     IFNULL(seed_feat_plus.energy, 0.0) / 1.0 AS energy_seed_track,
#     IFNULL(seed_feat_plus.key, 0.0) / 1.0 AS key_seed_track,
#     IFNULL(seed_feat_plus.loudness, 0.0) / 1.0 AS loudness_seed_track,
#     IFNULL(seed_feat_plus.mode, 0.0) / 1.0 AS mode_seed_track,
#     IFNULL(seed_feat_plus.speechiness, 0.0) / 1.0 AS speechiness_seed_track,
#     IFNULL(seed_feat_plus.acousticness, 0.0) / 1.0 AS acousticness_seed_track,
#     IFNULL(seed_feat_plus.instrumentalness, 0.0) / 1.0 AS instrumentalness_seed_track,
#     IFNULL(seed_feat_plus.liveness, 0.0) / 1.0 AS liveness_seed_track,
#     IFNULL(seed_feat_plus.valence, 0.0) / 1.0 AS valence_seed_track,
#     IFNULL(seed_feat_plus.tempo, 0.0) / 1.0 AS tempo_seed_track,
#     IFNULL(seed_feat_plus.time_signature, 0.0) / 1.0 AS time_signature_seed_track,
#   FROM 
#     -- `hybrid-vertex.mdp_eda_test.playlists` as left_pls
#     `hybrid-vertex.a_spotify_ds.enriched_data_jt` as left_pls
#   LEFT JOIN 
#     `hybrid-vertex.mdp_eda_test.ordered_position_training` AS seeds_and_cans
#   ON 
#     left_pls.pid = seeds_and_cans.pid 
#     AND left_pls.num_tracks = (seeds_and_cans.pos_can + 1)
#   LEFT JOIN 
#     `hybrid-vertex.a_spotify_ds.unique_track_features_plus` AS can_feat_plus
#   ON
#    seeds_and_cans.track_uri_can = can_feat_plus.track_uri
#   LEFT JOIN 
#     `hybrid-vertex.a_spotify_ds.unique_track_features_plus` AS seed_feat_plus
#   ON
#    seeds_and_cans.track_uri_can = seed_feat_plus.track_uri

#     );

converting data types...

In [None]:
# CREATE OR REPLACE TABLE
# `hybrid-vertex.a_spotify_ds_1m.all_seed_can_pl_tracks_v3` AS (
#   SELECT 
#     left_pls.pid AS pid_src,
#     left_pls.name AS pl_name_src,
#     left_pls.collaborative AS pl_collaborative_src,
#     left_pls.num_tracks AS num_pl_tracks_src,
#     left_pls.num_albums AS num_pl_albums_src,
#     left_pls.num_artists AS num_pl_artists_src,
#     left_pls.num_followers AS num_pl_followers_src,
#     left_pls.duration_ms AS pl_duration_ms_src,
#     IFNULL(left_pls.description, "NONE") AS pl_description_src,
#     -- nested tracks
#     ARRAY(
#       SELECT
#         AS STRUCT CAST(un_tracks.pos_can AS int64) AS pos_pl,
#         un_tracks.artist_name_can AS artist_name_pl,
#         un_tracks.track_uri_can AS track_uri_pl,
#         un_tracks.track_name_can AS track_name_pl,
#         un_tracks.album_uri_can AS album_uri_pl,
#         un_tracks.artist_uri_can AS artist_uri_pl,
#         CAST(un_tracks.duration_ms_can AS float64) AS duration_ms_pl,
#         un_tracks.album_name_can AS album_name_pl,
#         IFNULL(un_tracks.track_pop_can, 0.0) / 1.0 AS track_pop_pl,
#         IFNULL(un_tracks.artist_pop_can, 0.0) / 1.0 AS artist_pop_pl,
#         IFNULL(un_tracks.artist_genres_can[OFFSET(0)], "NONE") AS artist_genres_pl,
#         IFNULL(un_tracks.artist_followers_can, 0.0) / 1.0 AS artist_followers_pl,
#         un_tracks.tracks_playlist_titles AS tracks_playlist_titles_pl,
#         IFNULL(un_tracks.track_danceability, 0.0) / 1.0 AS track_danceability_pl,
#         IFNULL(un_tracks.track_energy, 0.0) / 1.0 as track_energy_pl,
#         IFNULL(un_tracks.track_key, "NONE") as track_key_pl,
#         IFNULL(un_tracks.track_loudness, 0.0) / 1.0 as track_loudness_pl,
#         IFNULL(un_tracks.track_mode, "NONE") as track_mode_pl,
#         IFNULL(un_tracks.track_speechiness, 0.0) / 1.0 as track_speechiness_pl,
#         IFNULL(un_tracks.track_acousticness, 0.0) / 1.0 as track_acousticness_pl,
#         IFNULL(un_tracks.track_liveness, 0.0) / 1.0 as track_liveness_pl,
#         IFNULL(un_tracks.track_valence, 0.0) / 1.0 as track_valence_pl,
#         IFNULL(un_tracks.track_tempo, 0.0) / 1.0 as track_tempo_pl,
#         IFNULL(un_tracks.time_signature, "NONE") as time_signature_pl,
#       FROM
#         UNNEST(left_pls.tracks) AS un_tracks
#     ) AS seed_playlist_tracks,
#     -- candidate track
#     seeds_and_cans.pid_pos_id,
#     seeds_and_cans.pos_can,
#     seeds_and_cans.track_uri_can,
#     seeds_and_cans.track_name_can,
#     seeds_and_cans.artist_uri_can,
#     seeds_and_cans.artist_name_can,
#     seeds_and_cans.album_uri_can,
#     seeds_and_cans.album_name_can,
#     IFNULL(seeds_and_cans.duration_ms_can, 0.0) / 1.0 AS duration_ms_can,
#     IFNULL(seeds_and_cans.track_pop_can, 0.0) / 1.0 AS track_pop_can,
#     IFNULL(seeds_and_cans.artist_pop_can, 0.0) / 1.0 AS artist_pop_can,
#     IFNULL(seeds_and_cans.artist_genres_can[OFFSET(0)], "NONE") AS artist_genres_can,
#     IFNULL(seeds_and_cans.artist_followers_can, 0.0) / 1.0 AS artist_followers_can,
#     IFNULL(can_feat_plus.tracks_playlist_titles, "NONE") AS track_pl_titles_can,
#     IFNULL(can_feat_plus.danceability, 0.0) / 1.0 AS track_danceability_can,
#     IFNULL(can_feat_plus.energy, 0.0) / 1.0 AS track_energy_can,
#     IFNULL(can_feat_plus.key, "NONE") AS track_key_can,
#     IFNULL(can_feat_plus.loudness, 0.0) / 1.0 AS track_loudness_can,
#     IFNULL(can_feat_plus.mode, "NONE") AS track_mode_can,
#     IFNULL(can_feat_plus.speechiness, 0.0) / 1.0 AS track_speechiness_can,
#     IFNULL(can_feat_plus.acousticness, 0.0) / 1.0 AS track_acousticness_can,
#     IFNULL(can_feat_plus.instrumentalness, 0.0) / 1.0 AS track_instrumentalness_can,
#     IFNULL(can_feat_plus.liveness, 0.0) / 1.0 AS track_liveness_can,
#     IFNULL(can_feat_plus.valence, 0.0) / 1.0 AS track_valence_can,
#     IFNULL(can_feat_plus.tempo, 0.0) / 1.0 AS track_tempo_can,
#     IFNULL(can_feat_plus.time_signature, "NONE") AS track_time_signature_can,
#     -- seed tracks
#     seeds_and_cans.pos_seed_track,
#     seeds_and_cans.track_uri_seed_track,
#     seeds_and_cans.track_name_seed_track,
#     seeds_and_cans.artist_uri_seed_track,
#     seeds_and_cans.artist_name_seed_track,
#     seeds_and_cans.album_uri_seed_track,
#     seeds_and_cans.album_name_seed_track,
#     IFNULL(seeds_and_cans.duration_seed_track, 0.0) / 1.0 AS duration_seed_track,
#     IFNULL(seeds_and_cans.track_pop_seed_track, 0.0) / 1.0 AS track_pop_seed_track,
#     IFNULL(seeds_and_cans.artist_pop_seed_track, 0.0) / 1.0 AS artist_pop_seed_track,
#     IFNULL(seeds_and_cans.artist_genres_seed_track[OFFSET(0)], "NONE") AS artist_genres_seed_track,
#     IFNULL(seeds_and_cans.artist_followers_seed_track, 0.0) / 1.0 AS artist_followers_seed_track,
#     IFNULL(seed_feat_plus.tracks_playlist_titles, "NONE") AS track_pl_titles_seed_track,
#     IFNULL(seed_feat_plus.danceability, 0.0) / 1.0 AS danceability_seed_track,
#     IFNULL(seed_feat_plus.energy, 0.0) / 1.0 AS energy_seed_track,
#     IFNULL(seed_feat_plus.key, "NONE") AS key_seed_track,
#     IFNULL(seed_feat_plus.loudness, 0.0) / 1.0 AS loudness_seed_track,
#     IFNULL(seed_feat_plus.mode, "NONE") AS mode_seed_track,
#     IFNULL(seed_feat_plus.speechiness, 0.0) / 1.0 AS speechiness_seed_track,
#     IFNULL(seed_feat_plus.acousticness, 0.0) / 1.0 AS acousticness_seed_track,
#     IFNULL(seed_feat_plus.instrumentalness, 0.0) / 1.0 AS instrumentalness_seed_track,
#     IFNULL(seed_feat_plus.liveness, 0.0) / 1.0 AS liveness_seed_track,
#     IFNULL(seed_feat_plus.valence, 0.0) / 1.0 AS valence_seed_track,
#     IFNULL(seed_feat_plus.tempo, 0.0) / 1.0 AS tempo_seed_track,
#     IFNULL(seed_feat_plus.time_signature,"NONE") AS time_signature_seed_track,
#   FROM 
#     -- `hybrid-vertex.mdp_eda_test.playlists` as left_pls
#     `hybrid-vertex.a_spotify_ds.enriched_data_jt_v3` as left_pls
#   LEFT JOIN 
#     `hybrid-vertex.mdp_eda_test.ordered_position_training` AS seeds_and_cans
#   ON 
#     left_pls.pid = seeds_and_cans.pid 
#     AND left_pls.num_tracks = (seeds_and_cans.pos_can + 1)
#   LEFT JOIN 
#     `hybrid-vertex.a_spotify_ds.unique_track_features_plus_v3` AS can_feat_plus
#   ON
#    seeds_and_cans.track_uri_can = can_feat_plus.track_uri
#   LEFT JOIN 
#     `hybrid-vertex.a_spotify_ds.unique_track_features_plus_v3` AS seed_feat_plus
#   ON
#    seeds_and_cans.track_uri_can = seed_feat_plus.track_uri

#     );

In [None]:
# CREATE OR REPLACE TABLE
# `hybrid-vertex.a_spotify_ds.ordered_position_training_jt` AS (
# WITH
#   -- get every combination of song and its parent playlist
#   unnest_cross AS(
#   SELECT
#     b.*,
#     CONCAT(b.pid,"-",track.pos_can) AS pid_pos_id,
#     CAST(track.pos_can AS int64) AS pos_can,
#     IFNULL(track.artist_name_can, "NONE") as artist_name_can,
#     track.track_uri_can,
#     track.album_uri_can,
#     IFNULL(track.track_name, "NONE") as track_name_can,
#     track.artist_uri_can,
#     CAST(track.duration_ms_can AS float64) AS duration_ms_can,
#     track.album_name_can,
#     track.track_pop_can,
#     track.artist_pop_can,
#     track.artist_genres_can,
#     track.artist_followers_can,

#   FROM (
#     SELECT
#       * EXCEPT(duration_ms)
#     FROM
#       `hybrid-vertex.a_spotify_ds.enriched_data_jt`) AS b
#   CROSS JOIN
#     UNNEST(tracks) AS track)
# SELECT
#   a.* EXCEPT(tracks,
#     num_tracks,
#     num_artists,
#     num_albums,
#     num_followers,
#     num_edits),
#   ARRAY(
#   SELECT
#     AS STRUCT CAST(track.pos_can AS int64) AS pos_pl,
#     track.artist_name_can AS artist_name_pl,
#     track.track_uri_can AS track_uri_pl,
#     track.track_name AS track_name_pl,
#     track.album_uri_can AS album_uri_pl,
#     track.artist_uri_can AS artist_uri_pl,
#     CAST(track.duration_ms_can AS float64) AS duration_ms_pl,
#     track.album_name_can AS album_name_pl,
#     track.track_pop_can AS track_pop_pl,
#     track.artist_pop_can AS artist_pop_pl,
#     track.artist_genres_can AS artist_genres_pl,
#     track.artist_followers_can AS artist_followers_pl,
#     track.tracks_playlist_titles as tracks_playlist_titles,
#     track.track_danceability as track_danceability,
#     track.track_energy as track_enrgy,
#     track.track_key as track_key,
#     track.track_loudness as track_loudness,
#     track.track_mode as track_mode,
#     track.track_speechiness as track_speechiness,
#     track.track_acousticness as track_acousticness,
#     track.track_liveness as track_liveness,
#     track.track_valence as track_valence,
#     track.track_tempo as track_tempo,
#     track.time_signature as time_signature,
#   FROM
#     UNNEST(tracks) AS track
#   WHERE
#     CAST(track.pos_can AS int64) < a.pos_can ORDER BY CAST(track.pos_can AS int64)) AS seed_playlist_tracks,
#   ----- seed track part
#   trx.pos_can AS pos_seed_track,
#   trx.artist_name_can AS artist_name_seed_track,
#   trx.artist_uri_can AS artist_uri_seed_track,
#   trx.track_name AS track_name_seed_track,
#   trx.track_uri_can AS track_uri_seed_track,
#   trx.album_name_can AS album_name_seed_track,
#   trx.album_uri_can AS album_uri_seed_track,
#   trx.duration_ms_can AS duration_seed_track,
#   trx.track_pop_can AS track_pop_seed_track,
#   trx.artist_pop_can AS artist_pop_seed_track,
#   trx.artist_genres_can as artist_genres_seed_track,
#   trx.artist_followers_can as artist_followers_seed_track,
#   trx.tracks_playlist_titles as tracks_playlist_titles,
#   trx.track_danceability as track_danceability,
#   trx.track_energy as track_enrgy,
#   trx.track_key as track_key,
#   trx.track_loudness as track_loudness,
#   trx.track_mode as track_mode,
#   trx.track_speechiness as track_speechiness,
#   trx.track_acousticness as track_acousticness,
#   trx.track_liveness as track_liveness,
#   trx.track_valence as track_valence,
#   trx.track_tempo as track_tempo,
#   trx.time_signature as time_signature,
# FROM
#   unnest_cross AS a -- with statement
#   ,
#   UNNEST(a.tracks) AS trx
# WHERE
#   CAST(trx.pos_can AS int64) = a.pos_can-1);

## Update the playlist metadata with the new samples created above

Trainv3-clean-track-features

Recompute the playlist-level metadata for each new sample, since the seed playlists now have updated track counts, durations, etc.

`hybrid-vertex.spotify_train_3.train`


In [4]:
%%time
get_new_metadata_query = f"""
create or replace table `{PROJECT_ID}.{bq_dataset}.train` as (
WITH
  playlist_features_clean AS (
  SELECT
    pid_pos_id,
    SUM(trx.duration_ms_pl) / 1.0 AS duration_ms_seed_pl,
    COUNT(1) / 1.0 AS n_songs_pl,
    COUNT(DISTINCT trx.artist_name_pl) / 1.0 AS num_artists_pl,
    COUNT(DISTINCT trx.album_uri_pl) /1.0 AS num_albums_pl,
  FROM
    `{PROJECT_ID}.{bq_dataset}.ordered_position_training`,
    UNNEST(seed_playlist_tracks) AS trx
  GROUP BY
    pid_pos_id)
    
SELECT
  a.* except(artist_genres_can, artist_genres_seed_track, track_pop_can, artist_pop_can, artist_followers_can,
            track_pop_seed_track, artist_pop_seed_track),
  b.* except(pid_pos_id),
  IFNULL(a.artist_genres_can[OFFSET(0)], "NONE") as artist_genres_can,
  IFNULL(a.artist_genres_seed_track[OFFSET(0)], "NONE") as artist_genres_seed_track,
  IFNULL(a.track_pop_can, 0.0) / 1.0 as  track_pop_can, 
  IFNULL(a.artist_pop_can, 0.0) / 1.0 as artist_pop_can,
  IFNULL(a.artist_followers_can, 0.0) / 1.0 as artist_followers_can,
  IFNULL(a.track_pop_seed_track, 0.0) / 1.0 as track_pop_seed_track,
  IFNULL(a.artist_pop_seed_track, 0.0) / 1.0 as artist_pop_seed_track,
  
FROM
  `{PROJECT_ID}.{bq_dataset}.ordered_position_training` a
INNER JOIN
  playlist_features_clean b
ON
  a.pid_pos_id = b.pid_pos_id )
  """

bigquery_client.query(get_new_metadata_query).result()

CPU times: user 35.3 ms, sys: 1.54 ms, total: 36.8 ms
Wall time: 2min 57s


<google.cloud.bigquery.table._EmptyRowIterator at 0x7fc885937950>

In [5]:
%%time
### Get candidates

get_unique_candidates = f"""
create or replace table `{PROJECT_ID}.{bq_dataset}.candidates` as (
SELECT DISTINCT
  track_name_can,
  artist_name_can,
  album_name_can,
  track_uri_can,
  album_uri_can,
  artist_uri_can,
  track_pop_can,
  artist_genres_can,
  artist_followers_can,
  duration_ms_can,
  artist_pop_can
FROM
  `{PROJECT_ID}.{bq_dataset}.train`
  )
  """

bigquery_client.query(get_unique_candidates).result()

CPU times: user 14.1 ms, sys: 0 ns, total: 14.1 ms
Wall time: 12 s


<google.cloud.bigquery.table._EmptyRowIterator at 0x7fc88d7e7d10>

In [None]:
# JT candidates v8
#
# This cell used to contain bare SQL, which is a SyntaxError in a Python cell
# and stops "Restart & Run All". The statement is kept as a plain string for
# reference and is NOT executed: it reads from
# `hybrid-vertex.a_spotify_hack.full_train_pre_split`, a table this notebook
# does not create. Uncomment the last line to run it against that project.
candidates_v8_query = """
CREATE OR REPLACE TABLE `hybrid-vertex.a_spotify_hack.candidates_v8` AS (
SELECT DISTINCT
    track_uri_can,
    IFNULL(track_name, "NONE") AS track_name_can,
    IFNULL(artist_uri_can, "NONE") AS artist_uri_can,
    IFNULL(artist_name_can, "NONE") AS artist_name_can,
    IFNULL(album_uri_can, "NONE") AS album_uri_can,
    IFNULL(album_name_can, "NONE") AS album_name_can,
    IFNULL(duration_ms_can, 0.0) / 1.0 AS duration_ms_can,
    IFNULL(track_pop, 0.0) / 1.0 AS track_pop_can,
    IFNULL(artist_pop_can, 0.0) / 1.0 AS artist_pop_can,
    IFNULL(artist_genres_can, "NONE") AS artist_genres_can,
    IFNULL(artist_followers_can, 0.0) / 1.0 AS artist_followers_can,
    IFNULL(tracks_playlist_titles, "NONE") AS track_pl_titles_can,
    IFNULL(danceability, 0.0) / 1.0 AS track_danceability_can,
    IFNULL(energy, 0.0) / 1.0 AS track_energy_can,
    IFNULL(key, "NONE") AS track_key_can,
    IFNULL(loudness, 0.0) / 1.0 AS track_loudness_can,
    IFNULL(mode, "NONE") AS track_mode_can,
    IFNULL(speechiness, 0.0) / 1.0 AS track_speechiness_can,
    IFNULL(acousticness, 0.0) / 1.0 AS track_acousticness_can,
    IFNULL(instrumentalness, 0.0) / 1.0 AS track_instrumentalness_can,
    IFNULL(liveness, 0.0) / 1.0 AS track_liveness_can,
    IFNULL(valence, 0.0) / 1.0 AS track_valence_can,
    IFNULL(tempo, 0.0) / 1.0 AS track_tempo_can,
    IFNULL(time_signature, "NONE") AS track_time_signature_can,
  -- track_uri_can,
  -- track_name_can,
  -- artist_uri_can,
  -- artist_name_can,
  -- album_uri_can,
  -- album_name_can,
  
  -- track_pop_can,
  -- artist_genres_can,
  -- artist_followers_can,
  -- duration_ms_can,
  -- artist_pop_can
FROM
  `hybrid-vertex.a_spotify_hack.full_train_pre_split`
  )
"""

# bigquery_client.query(candidates_v8_query).result()

### JT adapt

In [None]:
# -- feats to summarize playlist's tracks/artists
# --     > avg popularity
# --     > avg duration, acousticness, danceability, loudness, energy, speechiness
# -- playlist-track homogeneity 
# --     > artist/genre/album
# --     > key, mode

# CREATE OR REPLACE TABLE
# `hybrid-vertex.a_spotify_ds_1m.feats_1_all_seed_can_pl_track` AS (
# WITH
#   playlist_features_clean AS (
#   SELECT
#     pid_src,
#     SUM(trx.duration_ms_pl) / 1.0 AS pl_duration_ms_new,
#     COUNT(1) / 1.0 AS n_songs_pl_new,
#     COUNT(DISTINCT trx.artist_uri_pl) / 1.0 AS num_pl_artists_new,
#     COUNT(DISTINCT trx.album_uri_pl) / 1.0 AS num_pl_albums_new,
#     AVG(trx.track_pop_pl) AS avg_track_pop_pl_new,
#     AVG(trx.artist_pop_pl) AS avg_artist_pop_pl_new,
#     AVG(trx.artist_followers_pl) AS avg_art_followers_pl_new,
#   FROM
#     `hybrid-vertex.a_spotify_ds_1m.all_seed_can_pl_tracks`,
#     UNNEST(seed_playlist_tracks) AS trx
#   GROUP BY
#     pid_src)
#   SELECT
#     a.* 
#     ,
#     b.* EXCEPT(pid_src),
#   FROM
#     `hybrid-vertex.a_spotify_ds_1m.all_seed_can_pl_tracks` a
#   INNER JOIN
#     playlist_features_clean b
#   ON  
#     a.pid_src = b.pid_src 
# );

In [None]:
# JT variant (v3) of the playlist-metadata step.
#
# This cell used to contain bare SQL, which is a SyntaxError in a Python cell
# and stops "Restart & Run All". The statement is kept as a plain string for
# reference and is NOT executed: it reads from
# `hybrid-vertex.a_spotify_ds_1m.all_seed_can_pl_tracks_v3`, a table this
# notebook does not create. Uncomment the last line to run it.
#
# The CTE aggregates each playlist's `seed_playlist_tracks` per `pid_src`
# (duration, song count, distinct artists/albums, average popularity and
# followers) and the outer SELECT joins those aggregates back onto every row.
feats_1_all_seed_can_pl_track_v3_query = """
CREATE OR REPLACE TABLE
`hybrid-vertex.a_spotify_ds_1m.feats_1_all_seed_can_pl_track_v3` AS (
WITH
  playlist_features_clean AS (
  SELECT
    pid_src,
    SUM(trx.duration_ms_pl) / 1.0 AS pl_duration_ms_new,
    COUNT(1) / 1.0 AS n_songs_pl_new,
    COUNT(DISTINCT trx.artist_uri_pl) / 1.0 AS num_pl_artists_new,
    COUNT(DISTINCT trx.album_uri_pl) / 1.0 AS num_pl_albums_new,
    AVG(trx.track_pop_pl) AS avg_track_pop_pl_new,
    AVG(trx.artist_pop_pl) AS avg_artist_pop_pl_new,
    AVG(trx.artist_followers_pl) AS avg_art_followers_pl_new,
  FROM
    `hybrid-vertex.a_spotify_ds_1m.all_seed_can_pl_tracks_v3`,
    UNNEST(seed_playlist_tracks) AS trx
  GROUP BY
    pid_src)
  SELECT
    a.* 
    ,
    b.* EXCEPT(pid_src),
  FROM
    `hybrid-vertex.a_spotify_ds_1m.all_seed_can_pl_tracks_v3` a
  INNER JOIN
    playlist_features_clean b
  ON  
    a.pid_src = b.pid_src 
);
"""

# bigquery_client.query(feats_1_all_seed_can_pl_track_v3_query).result()

### train_flatten pre-split

In [None]:
# CREATE OR REPLACE TABLE `hybrid-vertex.a_spotify_ds_1m.train_flatten_pre_split_last_5_feats_v1` as (
#   SELECT a.* EXCEPT(
#     seed_playlist_tracks, 
#     pl_description_src,
#     num_pl_tracks_src,
#     num_pl_albums_src,
#     num_pl_artists_src,
#     pl_duration_ms_src
#   ),
#   ARRAY(select t.pos_pl FROM UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as pos_pl,
#   ARRAY(select t.track_uri_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_uri_pl,
#   ARRAY(select CASE WHEN 
#                     t.track_name_pl IS NULL THEN 'NONE' 
#                     ELSE t.track_name_pl 
#                     END 
#                     FROM UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_name_pl,
#   ARRAY(select t.artist_uri_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as artist_uri_pl,
#   ARRAY(select t.artist_name_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as artist_name_pl,
#   ARRAY(select t.album_uri_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as album_uri_pl,
#   ARRAY(select t.album_name_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as album_name_pl,
#   ARRAY(select t.duration_ms_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as duration_ms_songs_pl,
  
#   ARRAY(select t.track_pop_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_pop_pl,
#   ARRAY(select CAST(t.artist_pop_pl as FLOAT64) from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as artist_pop_pl,
#   ARRAY(select t.artist_genres_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as artist_genres_pl,
#   ARRAY(select t.artist_followers_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as artists_followers_pl,
#   ARRAY(select t.tracks_playlist_titles_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as tracks_playlist_titles_pl,

#   ARRAY(select t.track_danceability_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_danceability_pl,
#   ARRAY(select t.track_key_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_key_pl,
#   ARRAY(select t.track_loudness_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_loudness_pl,
#   ARRAY(select t.track_mode_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_mode_pl,
#   ARRAY(select t.track_speechiness_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_speechiness_pl,
#   ARRAY(select t.track_acousticness_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_acousticness_pl,
#   ARRAY(select t.track_liveness_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_liveness_pl,
#   ARRAY(select t.track_valence_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_valence_pl,
#   ARRAY(select t.track_tempo_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_tempo_pl,
#   ARRAY(select t.time_signature_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as time_signature_pl,
# FROM 
#   `hybrid-vertex.a_spotify_ds_1m.feats_1_all_seed_can_pl_track` a
# )

remove seed_tracks and use converted data types...

In [None]:
# CREATE OR REPLACE TABLE `hybrid-vertex.a_spotify_ds_1m.train_flatten_pre_split_last_5_feats_v3` as (
#   SELECT a.* EXCEPT(
#     seed_playlist_tracks, 
#     pl_description_src,
#     num_pl_tracks_src,
#     num_pl_albums_src,
#     num_pl_artists_src,
#     pl_duration_ms_src,
#     track_uri_seed_track,
#     track_name_seed_track,
#     artist_uri_seed_track,
#     artist_name_seed_track,
#     album_uri_seed_track,
#     album_name_seed_track,
#     artist_genres_seed_track,
#     track_pl_titles_seed_track,
#     duration_seed_track,
#     track_pop_seed_track,
#     artist_pop_seed_track,
#     artist_followers_seed_track,
#     danceability_seed_track,
#     key_seed_track,
#     loudness_seed_track,
#     mode_seed_track,
#     speechiness_seed_track,
#     acousticness_seed_track,
#     liveness_seed_track,
#     valence_seed_track,
#     tempo_seed_track,
#     time_signature_seed_track,
#     pos_seed_track,
#     energy_seed_track,
#     instrumentalness_seed_track
#   ),
#   ARRAY(select t.pos_pl FROM UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as pos_pl,
#   ARRAY(select t.track_uri_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_uri_pl,
#   ARRAY(select CASE WHEN 
#                     t.track_name_pl IS NULL THEN 'NONE' 
#                     ELSE t.track_name_pl 
#                     END 
#                     FROM UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_name_pl,
#   ARRAY(select t.artist_uri_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as artist_uri_pl,
#   ARRAY(select t.artist_name_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as artist_name_pl,
#   ARRAY(select t.album_uri_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as album_uri_pl,
#   ARRAY(select t.album_name_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as album_name_pl,
#   ARRAY(select t.duration_ms_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as duration_ms_songs_pl,
  
#   ARRAY(select t.track_pop_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_pop_pl,
#   ARRAY(select CAST(t.artist_pop_pl as FLOAT64) from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as artist_pop_pl,
#   ARRAY(select t.artist_genres_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as artist_genres_pl,
#   ARRAY(select t.artist_followers_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as artists_followers_pl,
#   ARRAY(select t.tracks_playlist_titles_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as tracks_playlist_titles_pl,

#   ARRAY(select t.track_danceability_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_danceability_pl,
#   ARRAY(select t.track_energy_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_energy_pl,
#   ARRAY(select t.track_key_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_key_pl,
#   ARRAY(select t.track_loudness_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_loudness_pl,
#   ARRAY(select t.track_mode_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_mode_pl,
#   ARRAY(select t.track_speechiness_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_speechiness_pl,
#   ARRAY(select t.track_acousticness_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_acousticness_pl,
#   ARRAY(select t.track_liveness_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_liveness_pl,
#   ARRAY(select t.track_valence_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_valence_pl,
#   ARRAY(select t.track_tempo_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_tempo_pl,
#   ARRAY(select t.time_signature_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as time_signature_pl,
# FROM 
#   `hybrid-vertex.a_spotify_ds_1m.feats_1_all_seed_can_pl_track_v3` a
# )

In [None]:
# CREATE OR REPLACE TABLE `hybrid-vertex.a_spotify_ds_1m.candidates` AS (
# SELECT
#     a.track_uri_can AS track_uri_can,
#     a.track_name_can AS track_name_can,
#     a.artist_uri_can AS artist_uri_can,
#     a.artist_name_can AS artist_name_can,
#     a.album_uri_can AS album_uri_can,
#     a.album_name_can AS album_name_can,
#     IFNULL(a.duration_ms_can, 0.0) / 1.0 AS duration_ms_can,
#     IFNULL(a.track_pop_can, 0.0) / 1.0 AS track_pop_can,
#     IFNULL(a.artist_pop_can, 0.0) / 1.0 AS artist_pop_can,
#     IFNULL(a.artist_genres_can, "NONE") AS artist_genres_can,
#     IFNULL(a.artist_followers_can, 0.0) / 1.0 AS artist_followers_can,
#     IFNULL(b.tracks_playlist_titles, "NONE") AS track_pl_titles_can,
#     IFNULL(b.danceability, 0.0) / 1.0 AS track_danceability_can,
#     IFNULL(b.energy, 0.0) / 1.0 AS track_energy_can,
#     IFNULL(b.key, 0.0) / 1.0 AS track_key_can,
#     IFNULL(b.loudness, 0.0) / 1.0 AS track_loudness_can,
#     IFNULL(b.mode, 0.0) / 1.0 AS track_mode_can,
#     IFNULL(b.speechiness, 0.0) / 1.0 AS track_speechiness_can,
#     IFNULL(b.acousticness, 0.0) / 1.0 AS track_acousticness_can,
#     IFNULL(b.instrumentalness, 0.0) / 1.0 AS track_instrumentalness_can,
#     IFNULL(b.liveness, 0.0) / 1.0 AS track_liveness_can,
#     IFNULL(b.valence, 0.0) / 1.0 AS track_valence_can,
#     IFNULL(b.tempo, 0.0) / 1.0 AS track_tempo_can,
#     IFNULL(b.time_signature, 0.0) / 1.0 AS track_time_signature_can,
# FROM
#   `hybrid-vertex.mdp_eda_test.candidates` as a
# LEFT JOIN
#   `hybrid-vertex.a_spotify_ds.unique_track_features_plus` as b
# ON a.track_uri_can = b.track_uri
# )

## For TFRecords
Get rid of structs by creating a new table with array columns built from the fields of `seed_playlist_tracks`


# Only selecting last 5 songs

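`song_history` is settable, but it must be kept in sync with `MAX_PLAYLIST_LENGTH` in `src/two_tower.py`. Note that the output table names keep the `_last_5` suffix regardless of the value.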

In [6]:
%%time
song_history=5
train_flatten_query = f"""
create or replace table `{PROJECT_ID}.{bq_dataset}.train_flatten_pre_split_last_5` as (
SELECT a.* except(seed_playlist_tracks, description),
    IFNULL(a.description, "") as description_pl,
    ARRAY(select t.pos_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - {song_history}) as pos_pl,
    ARRAY(select t.artist_name_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - {song_history}) as artist_name_pl,
    ARRAY(select t.track_uri_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - {song_history}) as track_uri_pl,
    ARRAY(select t.track_name_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - {song_history}) as track_name_pl,
    ARRAY(select t.duration_ms_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - {song_history}) as duration_ms_songs_pl,
    ARRAY(select t.album_name_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - {song_history}) as album_name_pl,
    ARRAY(select cast(t.artist_pop_pl as FLOAT64) from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - {song_history}) as artist_pop_pl,
    ARRAY(select t.artist_followers_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - {song_history}) as artists_followers_pl,
    ARRAY(select case when t.track_pop_pl is null then 0. else t.track_pop_pl end from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - {song_history}) as track_pop_pl,
    ARRAY(select t.artist_genres_pl[OFFSET(0)] from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - {song_history}) as artist_genres_pl
    from `{PROJECT_ID}.{bq_dataset}.train` a
)
"""

bigquery_client.query(train_flatten_query).result()

CPU times: user 14.1 ms, sys: 375 µs, total: 14.5 ms
Wall time: 19.2 s


<google.cloud.bigquery.table._EmptyRowIterator at 0x7fc88583b7d0>

### JT adapts

In [None]:
# Settings for the work-in-progress "JT adapts" cells.
# NOTE(review): no live cell in this part of the notebook reads either name, and
# the commented-out TODO loop iterates `playlist_histories` rather than
# `seed_histories` -- confirm which name is intended before enabling it.
SUFFIX = "feats_1"
seed_histories = [
    5, 10, 25, 30, 50, 75, 100,
]

In [None]:
## TODO

# for seed_length in playlist_histories:
#     train_flatten_query = f'''
#     CREATE OR REPLACE TABLE `hybrid-vertex.a_spotify_ds_1m.train_flatten_pre_split_last_5_feats_v1` as (
#       SELECT a.* EXCEPT(
#         seed_playlist_tracks, 
#         pl_description_src,
#         num_pl_tracks_src,
#         num_pl_albums_src,
#         num_pl_artists_src,
#         pl_duration_ms_src
#       ),
#       ARRAY(select t.pos_pl FROM UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as pos_pl,
#       ARRAY(select t.track_uri_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_uri_pl,
#       ARRAY(select CASE WHEN 
#                         t.track_name_pl IS NULL THEN 'NONE' 
#                         ELSE t.track_name_pl 
#                         END 
#                         FROM UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_name_pl,
#       ARRAY(select t.artist_uri_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as artist_uri_pl,
#       ARRAY(select t.artist_name_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as artist_name_pl,
#       ARRAY(select t.album_uri_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as album_uri_pl,
#       ARRAY(select t.album_name_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as album_name_pl,
#       ARRAY(select t.duration_ms_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as duration_ms_songs_pl,

#       ARRAY(select t.track_pop_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_pop_pl,
#       ARRAY(select CAST(t.artist_pop_pl as FLOAT64) from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as artist_pop_pl,
#       ARRAY(select t.artist_genres_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as artist_genres_pl,
#       ARRAY(select t.artist_followers_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as artists_followers_pl,
#       ARRAY(select t.tracks_playlist_titles_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as tracks_playlist_titles_pl,

#       ARRAY(select t.track_danceability_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_danceability_pl,
#       ARRAY(select t.track_key_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_key_pl,
#       ARRAY(select t.track_loudness_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_loudness_pl,
#       ARRAY(select t.track_mode_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_mode_pl,
#       ARRAY(select t.track_speechiness_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_speechiness_pl,
#       ARRAY(select t.track_acousticness_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_acousticness_pl,
#       ARRAY(select t.track_liveness_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_liveness_pl,
#       ARRAY(select t.track_valence_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_valence_pl,
#       ARRAY(select t.track_tempo_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as track_tempo_pl,
#       ARRAY(select t.time_signature_pl from UNNEST(seed_playlist_tracks) t where pos_pl >= pos_can - 5) as time_signature_pl,
#     FROM 
#       `hybrid-vertex.a_spotify_ds_1m.feats_1_all_seed_can_pl_track` a
#     )
#     '''

## Important for validation strategy
Different playlist ids were selected for validation to prevent cross-contamination with the sampling approach.

In [7]:
%%time
VALIDATION_P = 0.01
song_history=5

validation_creation = f"""
CREATE OR REPLACE TABLE
  `{PROJECT_ID}.{bq_dataset}.train_flatten_valid_last_5` AS (
    SELECT * 
  FROM
    `{PROJECT_ID}.{bq_dataset}.train_flatten_pre_split_last_5` where MOD(pid, 100) = 0
    AND ARRAY_LENGTH(pos_pl) = {song_history})""" #complete examples only

bigquery_client.query(validation_creation).result()

CPU times: user 10.3 ms, sys: 3.57 ms, total: 13.9 ms
Wall time: 13 s


<google.cloud.bigquery.table._EmptyRowIterator at 0x7fc8857e8d10>

In [8]:
%%time
VALIDATION_P = 0.01

validation_creation = f"""
CREATE OR REPLACE TABLE
  `{PROJECT_ID}.{bq_dataset}.train_flatten_last_5` AS (
    SELECT * 
  FROM
    `{PROJECT_ID}.{bq_dataset}.train_flatten_pre_split_last_5` where MOD(pid, 100) != 0
    AND ARRAY_LENGTH(pos_pl) = {song_history})""" #complete examples only"""

bigquery_client.query(validation_creation).result()

CPU times: user 13.5 ms, sys: 0 ns, total: 13.5 ms
Wall time: 17.8 s


<google.cloud.bigquery.table._EmptyRowIterator at 0x7fc88621cc10>

### JT Adapts

In [None]:
# CREATE OR REPLACE TABLE
#   `hybrid-vertex.a_spotify_ds_1m.train_flatten_valid_last_5_feats_v1` AS (
#     SELECT * 
#   FROM
#     `hybrid-vertex.a_spotify_ds_1m.train_flatten_pre_split_last_5_feats_v1` where MOD(pid_src, 100) = 0
#     AND ARRAY_LENGTH(pos_pl) = 5
#   )

In [None]:
# CREATE OR REPLACE TABLE
#   `hybrid-vertex.a_spotify_ds_1m.train_flatten_last_5_feats_v1` AS (
#     SELECT * 
#   FROM
#     `hybrid-vertex.a_spotify_ds_1m.train_flatten_pre_split_last_5_feats_v1` where MOD(pid_src, 100) != 0
#     -- AND ARRAY_LENGTH(pos_pl) = 5
#   )

## Done - you can move on to the [next notebook](01-tfrecord-beam-pipeline.ipynb) unless you want to do the optional sampling strategy. 

___________

# Optional different artist sampling strategy

In this section, you can create another dataset that only keeps the examples where both the artist and the album change between the seed track and the candidate. This avoids training on cases where an album is played end to end.

In [6]:
%%time
get_new_metadata_query = f"""
create or replace table `{PROJECT_ID}.{bq_dataset}.train_dif_artist` as (
WITH
  playlist_features_clean AS (
  SELECT
    pid_pos_id,
    SUM(trx.duration_ms_pl) / 1.0 AS duration_ms_seed_pl,
    COUNT(1) / 1.0 AS n_songs_pl,
    COUNT(DISTINCT trx.artist_name_pl) / 1.0 AS num_artists_pl,
    COUNT(DISTINCT trx.album_uri_pl) /1.0 AS num_albums_pl,
  FROM
    `{PROJECT_ID}.{bq_dataset}.ordered_position_training`,
    UNNEST(seed_playlist_tracks) AS trx
  GROUP BY
    pid_pos_id)
    
SELECT
  a.* except(artist_genres_can, artist_geexcpectedres_seed_track, track_pop_can, artist_pop_can, artist_followers_can,
            track_pop_seed_track, artist_pop_seed_track),
  b.* except(pid_pos_id),
  IFNULL(a.artist_genres_can[OFFSET(0)], "NONE") as artist_genres_can,
  IFNULL(a.artist_genres_seed_track[OFFSET(0)], "NONE") as artist_genres_seed_track,
  IFNULL(a.track_pop_can, 0.0) / 1.0 as  track_pop_can, 
  IFNULL(a.artist_pop_can, 0.0) / 1.0 as artist_pop_can,
  IFNULL(a.artist_followers_can, 0.0) / 1.0 as artist_followers_can,
  IFNULL(a.track_pop_seed_track, 0.0) / 1.0 as track_pop_seed_track,
  IFNULL(a.artist_pop_seed_track, 0.0) / 1.0 as artist_pop_seed_track,
  
FROM
  `{PROJECT_ID}.{bq_dataset}.ordered_position_training` a
INNER JOIN
  playlist_features_clean b
ON
  a.pid_pos_id = b.pid_pos_id 
  WHERE album_uri_can != album_uri_seed_track and artist_uri_seed_track != artist_uri_can)
  """

bigquery_client.query(get_new_metadata_query).result()

CPU times: user 52.1 ms, sys: 7.18 ms, total: 59.3 ms
Wall time: 3min 4s


<google.cloud.bigquery.table._EmptyRowIterator at 0x7fdb52da8850>

## For TFRecords
Get rid of structs by creating a new table with array columns built from the fields of `seed_playlist_tracks`


In [17]:
%%time
train_flatten_query = f"""
create or replace table `{PROJECT_ID}.{bq_dataset}.train_flatten_pre_split_dif_artist` as (
SELECT a.* except(seed_playlist_tracks, description),
    IFNULL(a.description, "") as description_pl,
    ARRAY(select t.pos_pl from UNNEST(seed_playlist_tracks) t) as pos_pl,
    ARRAY(select t.artist_name_pl from UNNEST(seed_playlist_tracks) t) as artist_name_pl,
    ARRAY(select t.track_uri_pl from UNNEST(seed_playlist_tracks) t) as track_uri_pl,
    ARRAY(select t.track_name_pl from UNNEST(seed_playlist_tracks) t) as track_name_pl,
    ARRAY(select t.duration_ms_pl from UNNEST(seed_playlist_tracks) t) as duration_ms_songs_pl,
    ARRAY(select t.album_name_pl from UNNEST(seed_playlist_tracks) t) as album_name_pl,
    ARRAY(select cast(t.artist_pop_pl as FLOAT64) from UNNEST(seed_playlist_tracks) t) as artist_pop_pl,
    ARRAY(select t.artist_followers_pl from UNNEST(seed_playlist_tracks) t) as artists_followers_pl,
    ARRAY(select case when t.track_pop_pl is null then 0. else t.track_pop_pl end from UNNEST(seed_playlist_tracks) t) as track_pop_pl,
    ARRAY(select t.artist_genres_pl[OFFSET(0)] from UNNEST(seed_playlist_tracks) t) as artist_genres_pl
    from `{PROJECT_ID}.{bq_dataset}.train_dif_artist` a
)
"""

bigquery_client.query(train_flatten_query).result()

CPU times: user 39 ms, sys: 0 ns, total: 39 ms
Wall time: 32.3 s


<google.cloud.bigquery.table._EmptyRowIterator at 0x7f81200ea410>

In [19]:
%%time
VALIDATION_P = 0.1

validation_creation = f"""
CREATE OR REPLACE TABLE
  `{PROJECT_ID}.{bq_dataset}.train_flatten_dif_artist_valid` AS (
    SELECT * 
  FROM
    `{PROJECT_ID}.{bq_dataset}.train_flatten_pre_split_dif_artist` where MOD(pid, 100) = 0)"""

bigquery_client.query(validation_creation).result()

CPU times: user 17.3 ms, sys: 285 µs, total: 17.5 ms
Wall time: 16 s


<google.cloud.bigquery.table._EmptyRowIterator at 0x7f8120095410>

In [21]:
%%time
VALIDATION_P = 0.1

validation_creation = f"""
CREATE OR REPLACE TABLE
  `{PROJECT_ID}.{bq_dataset}.train_flatten_dif_artist_train` AS (
    SELECT * 
  FROM
    `{PROJECT_ID}.{bq_dataset}.train_flatten_pre_split_dif_artist` where MOD(pid, 100) != 0)"""

bigquery_client.query(validation_creation).result()

CPU times: user 26.1 ms, sys: 741 µs, total: 26.9 ms
Wall time: 38.6 s


<google.cloud.bigquery.table._EmptyRowIterator at 0x7f812007c590>

### Fin
You are all set - go on to [`01-tfrecord-beam-pipeline.ipynb`](01-tfrecord-beam-pipeline.ipynb)