#### Example

```
{"query": {"user_id": ["1"], "bucketized_user_age": [18.0], "raw_user_age": [24.0], "timestamp": [0.012986151007588376], "user_gender": ["Male"], "user_occupation_label": [18], "user_occupation_text": ["technician"], "user_zip_code": ["85711"]}, "candidate": {"movie_id": ["1"], "movie_title": ["Toy Story"], "year": [1995], "movie_genres": ["2", "3", "4"]}}
```

In [18]:
# Cloud resources referenced by the cells below.
PROJECT = "hybrid-vertex"            # project passed to the BigQuery dataset reference
LOCATION = "us-central1"             # location passed to the BigQuery extract job
BUCKET = "gs://spotify-builtin-2t"   # GCS bucket receiving train_data/ and schema.json
DATASET_ID = "spotify_train_3"       # BigQuery dataset holding the table to export
TABLE = "train_json_export_table"    # BigQuery table exported as JSONL

In [26]:
# !gsutil mb -l us-central1 $BUCKET

Creating gs://spotify-builtin-2t/...


### More info: [Export a table to a JSON file (BigQuery sample)](https://cloud.google.com/bigquery/docs/samples/bigquery-extract-table-json)

In [45]:
# !gsutil -m rm -r $BUCKET/train_data/* #cleanup if needed

In [46]:
from google.cloud import bigquery

# Export the BigQuery table to newline-delimited JSON files under
# {BUCKET}/train_data/ and block until the extract job has finished.
client = bigquery.Client()

# Wildcard destination URI: the export is written as one or more *.jsonl files.
destination_uri = f"{BUCKET}/train_data/*.jsonl"

# Fully qualified reference to PROJECT.DATASET_ID.TABLE.
dataset_ref = bigquery.DatasetReference(PROJECT, DATASET_ID)
table_ref = dataset_ref.table(TABLE)

job_config = bigquery.ExtractJobConfig(
    destination_format=bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON,
)

# Issues the API request; the location must match that of the source table.
extract_job = client.extract_table(
    source=table_ref,
    destination_uris=destination_uri,
    job_config=job_config,
    location=LOCATION,
)

# Waits for the job to complete; as the last expression it is also the cell output.
extract_job.result()

ExtractJob<project=hybrid-vertex, location=us-central1, id=c68eb054-125d-4c85-b239-88eaa0ae367a>

## Define the schema for two-tower (2t) training

In [34]:
# Counts recorded for the seed URI columns:
#   track_uri_seed counts:  2249561
#   artist_uri_seed counts: 294110
#   album_uri_seed counts:  730377

# Each entry is (feature_name, feature_type) or, with a bucket count,
# (feature_name, feature_type, num_buckets). Commented-out entries are
# features currently excluded from the schema.

# Candidate-side features.
candidate_types = [
    ("artist_name_can", "Text"),
    ("track_uri_can", "Id", 10000),
    # ("album_uri_can", "Id", 1000),
    ("track_name_can", "Text"),
    # ("artist_uri_can", "Id", 1000),
    ("duration_ms_can", "Numeric"),
    ("album_name_can", "Text"),
    ("track_pop_can", "Numeric"),
    ("artist_pop_can", "Numeric"),
    ("artist_followers_can", "Numeric"),
    ("artist_genres_can", "Text"),
]

# Query-side features.
query_types = [
    ("id_pl", "Id", 10000),
    ("name", "Text"),
    ("collaborative", "Text"),
    ("duration_ms_playlist", "Numeric"),
    ("artist_name_seed_track", "Text"),
    # ("artist_uri_seed_track", "Id", 1000),
    ("track_name_seed_track", "Text"),
    # ("track_uri_seed_track", "Id", 10000),
    ("album_name_seed_track", "Text"),
    # ("album_uri_seed_track", "Id", 1000),
    ("duration_seed_track", "Numeric"),
    ("track_pop_seed_track", "Numeric"),
    ("artist_pop_seed_track", "Numeric"),
    ("artist_followers_seed_track", "Numeric"),
    ("duration_ms_seed_pl", "Numeric"),
    ("n_songs_pl", "Numeric"),
    ("num_artists_pl", "Numeric"),
    ("num_albums_pl", "Numeric"),
    ("artist_genres_seed_track", "Text"),
    ("description_pl", "Text"),
    ("artist_name_pl", "Text"),
    # ("track_uri_pl", "Id", 10000),
    ("track_name_pl", "Text"),
    ("duration_ms_songs_pl", "Numeric"),
    ("album_name_pl", "Text"),
    ("artist_pop_pl", "Numeric"),
    ("artists_followers_pl", "Numeric"),
    ("track_pop_pl", "Numeric"),
    ("artist_genres_pl", "Text"),
]

In [35]:
# Build the query/candidate feature schema from the type lists and write it
# to train_schema.json as a single JSON object.
import json


def _feature_schema(fields):
    """Map (name, feature_type[, num_buckets]) tuples to their schema entries."""
    entries = {}
    for spec in fields:
        entry = {"feature_type": spec[1]}
        if len(spec) == 3:
            # A third element is the bucket count for this feature.
            entry["config"] = {"num_buckets": spec[2]}
        entries[spec[0]] = entry
    return entries


schema = {
    "query": _feature_schema(query_types),
    "candidate": _feature_schema(candidate_types),
}

# The context manager closes the file on exit.
with open('train_schema.json', 'w') as schema_file:
    json.dump(schema, schema_file)

In [36]:
# Delete the existing schema.json object from the bucket before the regenerated schema is uploaded.
!gsutil rm $BUCKET/schema.json

Removing gs://spotify-builtin-2t/schema.json...
/ [1 objects]                                                                   
Operation completed over 1 objects.                                              


In [37]:
# Upload the locally written train_schema.json to the bucket as schema.json.
!gsutil cp train_schema.json $BUCKET/schema.json

Copying file://train_schema.json [Content-Type=application/json]...
/ [1 files][  1.7 KiB/  1.7 KiB]                                                
Operation completed over 1 objects/1.7 KiB.                                      


## Old single-threaded approach (no longer used; kept commented out for reference)

In [None]:
# import json


# from google.cloud import bigquery

# # Construct a BigQuery client object.
# client = bigquery.Client()

# query = "select * from `hybrid-vertex.spotify_train_3.train_flatten`"
# # data = client.query(query)
# # data.allow_large_results = True
# data = client.list_rows("hybrid-vertex.spotify_train_3.train_flatten")#, max_results=100)

# query_fields = [
#     'name',
#     'collaborative',
#     'description',
#     'duration_ms_playlist',
#     'artist_name_seed_track',
#     'artist_uri_seed_track',
#     'track_name_seed_track',
#     'track_uri_seed_track',
#     'album_name_seed_track',
#     'album_uri_seed_track',
#     'duration_seed_track',
#     'duration_ms_seed_pl',
#     'n_songs',
#     'num_artists',
#     'num_albums',
#     'artist_name_seed_pl',
#     'track_uri_seed_pl',
#     'track_name_seed_pl',
#     'duration_ms_seed_songs_pl',
#     'album_name_seed_pl',
#     'artist_pop_seed_pl',
#     'artists_followers_seed_pl',
#     'track_pop_seed_pl',
# ]

# candidate_fields = [
#     'artist_name_seed',
#     'track_uri_seed',
#     'artist_uri_seed',
#     'track_name_seed',
#     'album_uri_seed',
#     'duration_ms_seed',
#     'album_name_seed',
#     'track_pop_seed',
#     'artist_pop_seed',
#     'artist_genres_seed',
#     'artist_followers_seed',
# ]
# !rm train_data.jsonl #remove the file if needed
# import multiprocessing
# p = multiprocessing.Pool(2) # use all available CPUs

# #iterate over the bq rows and update corresponding data jsonl data
# def format_data(data, filename='train_data.jsonl'):
#     for row in data:
#         line = {"query" : {},
#                "candidate" : {}}
#         for field in query_fields:
#             if type(row[field]) == list:
#                 line["query"].update({field: row[field]})
#             else:
#                 line["query"].update({field: [row[field]]}) #if not a list, put variable in list
#         for field in candidate_fields:
#             if type(row[field]) == list:
#                 line["candidate"].update({field: row[field]})
#             else:
#                 line["candidate"].update({field: [row[field]]})
#     with open(filename, 'w') as file:
#         file.write(json.dumps(line) + ",\n")
#     file.close()

# # run in the current process (the multiprocessing pool call below is disabled)
# format_data(data)
# # p.imap(write_jsonl, lines)