# Data prep

## In this notebook we will load the songs from the zip file and perform transformations to prepare the data for two-tower training
Steps
1. Extract from the zip file
2. Upload to BQ
3. Enrich features for the playlist songs
4. Cross-join songs with features (expected rows = n_songs x n_playlists)
5. Remove after-the-fact (later position songs) from the newly generated samples
6. Create a clean train table, and flatten structs or use arrays

#### Unzip the file and upload to BQ

In [None]:
# Disabled one-off step: copy the dataset archive from GCS into the working
# directory and unzip it. Uncomment both lines to re-run the extraction.
# !gsutil cp gs://matching-engine-content/spotify-million-playlist/spotify_million_playlist_dataset.zip .
# !unzip spotify_million_playlist_dataset.zip

In [None]:
# Disabled one-off load: for every file under `data/`, parse the JSON, build a
# DataFrame from its 'playlists' entry and append it to the
# `{bq_dataset}.playlists` BigQuery table.
# NOTE(review): if re-enabled, this cell also needs `pd` (pandas) imported and
# PROJECT_ID defined -- neither is set up in the visible cells (the PROJECT_ID
# assignment below is itself commented out one level deeper).
# import os
# import json

# data_files = os.listdir('data')
# # PROJECT_ID = 'jtotten-project'
# bq_dataset = 'mdp_eda'

# for filename in data_files:
#   with open(f'data/{filename}') as f:
#     print(f)
#     json_dict = json.load(f)
#     df = pd.DataFrame(json_dict['playlists'])
#     df.to_gbq(
#     destination_table=f'{bq_dataset}.playlists', 
#     project_id=f'{PROJECT_ID}', # TODO: param
#     location='us-central1', 
#     progress_bar=True, 
#     reauth=True, 
#     if_exists='append'
#     ) 

## Prep queries for the source tables below

`hybrid-vertex.spotify_train_3.unique_track_features`

+

`hybrid-vertex.spotify_train_3.unique_artist_features`

+

`hybrid-vertex.spotify_mpd.playlists_nested`

Trainv3-create-enriched-data


In [None]:
%%bigquery
-- Creates `enriched_data`: one row per playlist of `playlists_nested`, keeping
-- every playlist column except `tracks`, which is rebuilt so that each element
-- carries the track fields (renamed with a `_seed` suffix) plus the joined
-- track-level and artist-level features.
--
-- NOTE: both feature joins are INNER joins, so a track with no matching row in
-- either feature table is left out of the rebuilt array.
CREATE OR REPLACE TABLE
  `hybrid-vertex.spotify_train_3.enriched_data` AS (
  SELECT
    a.* EXCEPT(tracks),
    ARRAY(
      SELECT AS STRUCT
        CAST(track.pos AS int64) AS pos_seed,
        track.artist_name AS artist_name_seed,
        track.track_uri AS track_uri_seed,
        track.track_name AS track_name_seed,
        track.album_uri AS album_uri_seed,
        track.artist_uri AS artist_uri_seed,
        CAST(track.duration_ms AS float64) AS duration_ms_seed,
        track.album_name AS album_name_seed,
        tf.tracks_playlist_titles AS tracks_playlist_titles,
        tf.track_pop AS track_pop,
        af.artist_pop AS artist_pop,
        af.artist_genres AS artist_genres,
        af.artist_followers AS artist_followers
      FROM
        UNNEST(a.tracks) AS track
      INNER JOIN
        `hybrid-vertex.spotify_train_3.unique_track_features` AS tf --track features
      ON
        (track.track_uri = tf.track_uri)
      INNER JOIN
        `hybrid-vertex.spotify_train_3.unique_artist_features` AS af --artist features
      ON
        (track.artist_uri = af.artist_uri)
      -- Element order of an ARRAY subquery is not guaranteed without ORDER BY
      -- (even less so after joins); keep the tracks in playlist order.
      ORDER BY
        CAST(track.pos AS int64)
    ) AS tracks
  FROM
    `hybrid-vertex.spotify_mpd.playlists_nested` AS a)


cross_join_songxplaylist_struct_query

`hybrid-vertex.spotify_train_3.ordered_position_training`



In [None]:
%%bigquery
-- Creates `ordered_position_training`: one training example per
-- (playlist, track) pair for which the track at the preceding position exists.
--   * columns coming from `unnest_cross` describe the playlist and the track
--     found at position `pos`;
--   * `seed_playlist_tracks` holds every track of the same playlist whose
--     position is lower than `pos`, ordered by position;
--   * the `*_seed_track` columns describe the track at position `pos - 1`.
-- Pairs without a track at `pos - 1` produce no output row, because the
-- final join on `trx` finds no match for them.
CREATE OR REPLACE TABLE
  `hybrid-vertex.spotify_train_3.ordered_position_training` AS (
  WITH
    -- get every combination of song and its parent playlist
    unnest_cross AS (
    SELECT
      b.*,
      CONCAT(b.pid, "-", track.pos_seed) AS pid_pos_id,  -- playlist id + track position
      CAST(track.pos_seed AS int64) AS pos,
      track.artist_name_seed,
      track.track_uri_seed,
      track.artist_uri_seed,
      track.track_name_seed,
      track.album_uri_seed,
      CAST(track.duration_ms_seed AS float64) AS duration_ms_seed,
      track.album_name_seed,
      track.tracks_playlist_titles AS tracks_playlist_titles,
      track.track_pop AS track_pop_seed,
      track.artist_pop AS artist_pop_seed,
      track.artist_genres AS artist_genres_seed,
      track.artist_followers AS artist_followers_seed
    FROM (
      -- rename the playlist-level duration so it cannot be confused with the
      -- per-track `duration_ms_seed`
      SELECT
        * EXCEPT(duration_ms),
        duration_ms AS duration_ms_playlist
      FROM
        `hybrid-vertex.spotify_train_3.enriched_data`) AS b
    CROSS JOIN
      UNNEST(b.tracks) AS track)
  SELECT
    a.* EXCEPT(tracks,
      num_tracks,
      num_artists,
      num_albums,
      num_followers,
      num_edits),
    ARRAY(
      SELECT AS STRUCT
        CAST(track.pos_seed AS int64) AS pos_seed,
        track.artist_name_seed AS artist_name_seed,
        track.track_uri_seed AS track_uri_seed,
        track.track_name_seed AS track_name_seed,
        track.album_uri_seed AS album_uri_seed,
        track.artist_uri_seed AS artist_uri_seed,
        CAST(track.duration_ms_seed AS float64) AS duration_ms_seed,
        track.album_name_seed AS album_name_seed,
        track.tracks_playlist_titles AS tracks_playlist_titles,
        track.track_pop AS track_pop_seed,
        track.artist_pop AS artist_pop_seed,
        track.artist_genres AS artist_genres_seed,
        track.artist_followers AS artist_followers_seed
      FROM
        UNNEST(a.tracks) AS track
      WHERE
        -- keep only the tracks that come before the current one
        CAST(track.pos_seed AS int64) < a.pos
      -- Element order of an ARRAY subquery is not guaranteed without ORDER BY;
      -- store the preceding tracks in playlist order.
      ORDER BY
        CAST(track.pos_seed AS int64)
    ) AS seed_playlist_tracks,
    ----- seed track part
    trx.pos_seed AS pos_seed_track,
    trx.artist_name_seed AS artist_name_seed_track,
    trx.artist_uri_seed AS artist_uri_seed_track,
    trx.track_name_seed AS track_name_seed_track,
    trx.track_uri_seed AS track_uri_seed_track,
    trx.album_name_seed AS album_name_seed_track,
    trx.album_uri_seed AS album_uri_seed_track,
    trx.duration_ms_seed AS duration_seed_track
  FROM
    unnest_cross AS a -- with statement
    ,
    UNNEST(a.tracks) AS trx
  WHERE
    -- the seed track is the one immediately before the current track
    CAST(trx.pos_seed AS int64) = a.pos-1 );

Trainv3-clean-track-features

Get new metadata for the tracks now that there are updated track counts, durations, etc...

`hybrid-vertex.spotify_train_3.train`


In [None]:
%%bigquery
-- Creates `train`: every row of `ordered_position_training` plus aggregate
-- features recomputed over its `seed_playlist_tracks` array (total duration,
-- number of songs, distinct artists, distinct albums).
create or replace table `hybrid-vertex.spotify_train_3.train` as (
WITH
  playlist_features_clean AS (
  SELECT
    pid_pos_id,
    SUM(trx.duration_ms_seed) AS duration_ms_seed_pl,
    COUNT(1) AS n_songs,
    -- Count artists by URI, like albums below: names are not unique
    -- identifiers, so counting names can merge different artists.
    COUNT(DISTINCT trx.artist_uri_seed) AS num_artists,
    COUNT(DISTINCT trx.album_uri_seed) AS num_albums
  FROM
    `hybrid-vertex.spotify_train_3.ordered_position_training`,
    UNNEST(seed_playlist_tracks) AS trx
  GROUP BY
    pid_pos_id)
SELECT
  a.*,
  b.* EXCEPT(pid_pos_id)
FROM
  `hybrid-vertex.spotify_train_3.ordered_position_training` a
INNER JOIN
  playlist_features_clean b
ON
  a.pid_pos_id = b.pid_pos_id)

## Get rid of structs by creating a new table with arrays built from `seed_playlist_tracks`


In [10]:
%%bigquery
-- Creates `train_flatten`: same rows as `train`, with the array-of-structs
-- column `seed_playlist_tracks` replaced by one plain array per struct field.
--
-- Every array is built by its own subquery, and UNNEST does not guarantee
-- element order, so each subquery orders by `pos_seed`. That keeps the arrays
-- index-aligned with one another (element i of every array describes the same
-- track) and in playlist order.
create or replace table `hybrid-vertex.spotify_train_3.train_flatten` as (
SELECT
    a.* EXCEPT(seed_playlist_tracks),
    ARRAY(SELECT t.pos_seed FROM UNNEST(seed_playlist_tracks) AS t ORDER BY t.pos_seed) AS pos_seed_pl,
    ARRAY(SELECT t.artist_name_seed FROM UNNEST(seed_playlist_tracks) AS t ORDER BY t.pos_seed) AS artist_name_seed_pl,
    ARRAY(SELECT t.track_uri_seed FROM UNNEST(seed_playlist_tracks) AS t ORDER BY t.pos_seed) AS track_uri_seed_pl,
    ARRAY(SELECT t.track_name_seed FROM UNNEST(seed_playlist_tracks) AS t ORDER BY t.pos_seed) AS track_name_seed_pl,
    ARRAY(SELECT t.duration_ms_seed FROM UNNEST(seed_playlist_tracks) AS t ORDER BY t.pos_seed) AS duration_ms_seed_songs_pl,
    ARRAY(SELECT t.album_name_seed FROM UNNEST(seed_playlist_tracks) AS t ORDER BY t.pos_seed) AS album_name_seed_pl,
    ARRAY(SELECT t.artist_pop_seed FROM UNNEST(seed_playlist_tracks) AS t ORDER BY t.pos_seed) AS artist_pop_seed_pl,
    ARRAY(SELECT t.artist_followers_seed FROM UNNEST(seed_playlist_tracks) AS t ORDER BY t.pos_seed) AS artists_followers_seed_pl,
    -- a missing track popularity is stored as 0 instead of NULL
    ARRAY(SELECT IFNULL(t.track_pop_seed, 0) FROM UNNEST(seed_playlist_tracks) AS t ORDER BY t.pos_seed) AS track_pop_seed_pl
    --- skip nested artist_genres_seed
FROM
    `hybrid-vertex.spotify_train_3.train` a
)

Query complete after 0.00s: 100%|██████████| 3/3 [00:00<00:00, 1596.21query/s]                        
