# Approach: word2vec skip-gram embeddings model

Based on [tensorflow's word2vec tutorial](https://www.tensorflow.org/tutorials/text/word2vec)

# Importing 2017 data
Copying cells from `load_2017_data.ipynb` notebook, which produces the following:

Loads the following csv's into **DataFrames**:
- `playlist_2017.csv` $\rightarrow$ `playlist_df`
- `tracks_2017.csv` $\rightarrow$ `track_df`
- `albums_2017.csv` $\rightarrow$ `album_df`
- `artist_2017.csv` $\rightarrow$ `artist_df`

Then, the following **dictionaries** are created:
- `playlist_dict`
- `track_dict`
- `album_dict`
- `artist_dict`

As well as the following **functions**:
- `get_playlist_feature(PID, feature)`
- `get_track_feature(URI, feature)`
- `get_album_feature(URI, feature)`
- `get_artist_feature(URI, feature)`

In [75]:
import os
import csv
import tqdm

import numpy as np
import pandas as pd

import tensorflow as tf


### Update data path as needed

In [4]:
# path to the DIRECTORY where CSV files are saved relative to this notebook
path = '../../data'
output_filepath_root = os.path.relpath(path)
print(output_filepath_root)

../../data


## Load 2017 CSVs to DFs

### Playlists

In [5]:
playlist_df = pd.read_csv(os.path.join(output_filepath_root, 'playlists_2017.csv'), index_col='pid')
playlist_df.head(5)

Unnamed: 0_level_0,name,description,modified_at,num_artists,num_albums,num_tracks,num_followers,num_edits,duration_ms,collaborative,...,track_14_album_uri,track_14_artist_uri,track_15_uri,track_15_album_uri,track_15_artist_uri,modified_at_date,modified_at_year,modified_at_month,modified_at_day,modified_at_dow
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
434000,Sad,,1488240000,24,26,27,1,6,6081757,False,...,spotify:album:4M9Ti6t5h54aDMX4SizDfT,spotify:artist:4vVfuZfXWu18vk5Z4C7wbm,spotify:track:3yrVRdwCbEeKODZgG2mVZX,spotify:album:3SCJmoy3Z45p84IfuaM9YQ,spotify:artist:2EO56JK4txid1Pss9GVbOL,2017-02-28,2017,2,28,1
434001,pb&j,faves tbh,1487808000,35,38,39,1,9,8959761,False,...,spotify:album:6deiaArbeoqp1xPEGdEKp1,spotify:artist:0L8ExT028jH3ddEcZwqJJ5,spotify:track:5E30LdtzQTGqRvNd7l6kG5,spotify:album:18iFxjZugvKhuNNMbLjZJF,spotify:artist:77SW9BnxLY8rJ0RciFqkHh,2017-02-23,2017,2,23,3
434004,Whatever,,1506816000,36,57,79,1,46,18874072,False,...,spotify:album:2Tyx5dLhHYkx6zeAdVaTzN,spotify:artist:4LLpKhyESsyAXpc4laK94U,spotify:track:0htTZnlk6okQ1HIq4EvFQ6,spotify:album:6liIoWzpvrff945pUI7fHt,spotify:artist:02kJSzxNuaWGqwubyUba0Z,2017-10-01,2017,10,1,6
434005,roadtrip,,1492905600,30,35,70,1,18,15696608,False,...,spotify:album:6DwdzG4UGYLxJ2p7bd483v,spotify:artist:2Q0MyH5YMI5HPQjFjlq5g3,spotify:track:4XvKjZWIqsHvvza89lMTAH,spotify:album:6izXZb0VGaUHqm5GaXq4YC,spotify:artist:5IXalAOiV9I8LgLMGZydmt,2017-04-23,2017,4,23,6
434007,Cumbias,,1507507200,23,42,50,1,21,10255699,False,...,spotify:album:392uNMyh5D6fqkBS385XJd,spotify:artist:3zzeZVLuOeetfimOd4k8rE,spotify:track:1yOLrH7nF0R7MWDuuva6va,spotify:album:2rXbAorimO8C06RkqS2oq5,spotify:artist:0OhiQFSqbnnmB52NWEpsO5,2017-10-09,2017,10,9,0


### Tracks

In [6]:
track_df = pd.read_csv(os.path.join(output_filepath_root, 'tracks_2017.csv'), index_col='track_uri')
track_df.head(5)

Unnamed: 0_level_0,track_name,album_name,album_uri,artist_name,artist_uri,duration_ms
track_uri,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
spotify:track:6SbAbLqAWf2tnTdUy6Gmm5,FUCKING BEST SONG EVERRR,FUCKING BEST SONG EVERRR,spotify:album:1hmvZb81DAeTx67G1FaTjZ,Wallpaper.,spotify:artist:6NMcnx3vKGSAeqSMbySlpw,217800
spotify:track:1MvpPH6BTP3IrLnTjEA2gw,#STUPiDFACEDD,#STUPiDFACEDD,spotify:album:1c7wJm9mghFyIKnQJOobW8,Wallpaper.,spotify:artist:6NMcnx3vKGSAeqSMbySlpw,184026
spotify:track:5rgy6ghBq1eRApCkeUdJXf,We Are Young (feat. Janelle Monáe) - feat. Jan...,Some Nights,spotify:album:7m7F7SQ3BXvIpvOgjW51Gp,fun.,spotify:artist:5nCi3BB41mBaMH9gfr6Su0,250626
spotify:track:07dYGGSrzPeg6a3KZjWX65,Boyfriend,Believe,spotify:album:7BWK3eXcbAdwYeulyQj5Kw,Justin Bieber,spotify:artist:1uNFoZAHBGtllmzznpCI3s,171333
spotify:track:1NpW5kyvO4XrNJ3rnfcNy3,Wild Ones (feat. Sia),Wild Ones,spotify:album:7eLwoxxWs6lfkVYJGkGNbk,Flo Rida,spotify:artist:0jnsk9HBra6NMjO2oANoPY,232946


### Albums

In [7]:
album_df = pd.read_csv(os.path.join(output_filepath_root, 'albums_2017.csv'), index_col='album_uri')
album_df.head(5)

Unnamed: 0_level_0,album_name,artist_name
album_uri,Unnamed: 1_level_1,Unnamed: 2_level_1
spotify:album:1hmvZb81DAeTx67G1FaTjZ,FUCKING BEST SONG EVERRR,Wallpaper.
spotify:album:1c7wJm9mghFyIKnQJOobW8,#STUPiDFACEDD,Wallpaper.
spotify:album:7m7F7SQ3BXvIpvOgjW51Gp,Some Nights,fun.
spotify:album:7BWK3eXcbAdwYeulyQj5Kw,Believe,Justin Bieber
spotify:album:7eLwoxxWs6lfkVYJGkGNbk,Wild Ones,Flo Rida


### Artists

In [8]:
artist_df = pd.read_csv(os.path.join(output_filepath_root, 'artists_2017.csv'), index_col='artist_uri')
artist_df.head(5)

Unnamed: 0_level_0,artist_name
artist_uri,Unnamed: 1_level_1
spotify:artist:6NMcnx3vKGSAeqSMbySlpw,Wallpaper.
spotify:artist:5nCi3BB41mBaMH9gfr6Su0,fun.
spotify:artist:1uNFoZAHBGtllmzznpCI3s,Justin Bieber
spotify:artist:0jnsk9HBra6NMjO2oANoPY,Flo Rida
spotify:artist:4AK6F7OLvEQ5QYCBNiQWHq,One Direction


## Create Dictionaries

### Playlists

In [9]:
playlist_dict = playlist_df.to_dict()

In [10]:
def get_playlist_feature(pid, feature='name', dictionary=playlist_dict):
    """
    Given a playlist PID (as a integer) and the playlist dictionary, return the requested feature.
    
    Features include:
        - 'name'
        - 'description'
        - 'modified_at'
        - 'modified_at_date'
        - 'modified_at_year'
        - 'modified_at_month'
        - 'modified_at_day'
        - 'modified_at_dow'
        - 'num_artists'
        - 'num_albums'
        - 'num_tracks'
        - 'num_followers'
        - 'num_edits'
        - 'duration_ms'
        - 'collaborative'
        - 'track_X_uri'
        - 'track_X_album_uri'
        - 'track_X_artist_uri'
    """
    # Convert PID to integer
    try:
        pid = int(pid)
    except Exception as e:
        return e
    # Lookup in dictionary
    try:
        return dictionary[feature][pid]
    except Exception as e:
        return e

In [11]:
get_playlist_feature('434004', 'name')

'Whatever'

### Tracks

In [12]:
track_dict = track_df.to_dict()

In [13]:
def get_track_feature(uri, feature='track_name', dictionary=track_dict):
    """
    Given a track uri (as a string) and the track dictionary, return the requested feature.
    
    Features include:
        - 'track_name'
        - 'album_name'
        - 'album_uri'
        - 'artist_name'
        - 'artist_uri'
        - 'duration_ms'
    """
    try:
        return dictionary[feature][uri]
    except Exception as e:
        return e

In [14]:
get_track_feature('spotify:track:5rgy6ghBq1eRApCkeUdJXf', 'album_name')

'Some Nights'

### Albums

In [15]:
album_dict = album_df.to_dict()

In [16]:
def get_album_feature(uri, feature='album_name', dictionary=album_dict):
    """
    Given an album uri (as a string) and the album dictionary, return the requested feature.
    
    Features include:
        - 'album_name'
        - 'artist_name'
    """
    try:
        return dictionary[feature][uri]
    except Exception as e:
        return e

In [17]:
get_album_feature('spotify:album:7BWK3eXcbAdwYeulyQj5Kw', 'artist_name')

'Justin Bieber'

### Artists

In [18]:
artist_dict = artist_df.to_dict()

In [19]:
def get_artist_feature(uri, feature='artist_name', dictionary=artist_dict):
    """
    Given an artist uri (as a string) and the artist dictionary, return the requested feature.
    
    Features include:
        - 'artist_name'
    """
    try:
        return dictionary[feature][uri]
    except Exception as e:
        return e

In [20]:
get_artist_feature('spotify:artist:1uNFoZAHBGtllmzznpCI3s')

'Justin Bieber'

# Setting up word2vec

## Test Playlist

In [57]:
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

In [34]:
# df = playlist_df.loc[:, playlist_df.columns.str.startswith('track_')]
track_uri_cols = [col for col in playlist_df.columns
                  if 'uri' in col
                  and 'album' not in col
                  and 'artist' not in col]
track_uri_cols

['track_1_uri',
 'track_2_uri',
 'track_3_uri',
 'track_4_uri',
 'track_5_uri',
 'track_6_uri',
 'track_7_uri',
 'track_8_uri',
 'track_9_uri',
 'track_10_uri',
 'track_11_uri',
 'track_12_uri',
 'track_13_uri',
 'track_14_uri',
 'track_15_uri']

In [37]:
track_uri_df = playlist_df[track_uri_cols]
print(track_uri_df.shape)
track_uri_df.head()

(570806, 15)


Unnamed: 0_level_0,track_1_uri,track_2_uri,track_3_uri,track_4_uri,track_5_uri,track_6_uri,track_7_uri,track_8_uri,track_9_uri,track_10_uri,track_11_uri,track_12_uri,track_13_uri,track_14_uri,track_15_uri
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
434000,spotify:track:5JuA3wlm0kn7IHfbeHV0i6,spotify:track:5p79cffxQgsOSF7hraBV1M,spotify:track:32FJ75MXWMYWmmTw3NvKKy,spotify:track:0tV8pOpiNsKqUys0ilUcXz,spotify:track:3Th56VIq2sEaEmPPETu7p5,spotify:track:5lUbIsKvBfML00F1UNV1i2,spotify:track:6nIE1oCE4udqMGv3bqVNVb,spotify:track:4dvNIeJawEWqMe1ZfYKcXt,spotify:track:4tLBp1HnrrCvbJbeD4g0f3,spotify:track:3VXY8vQ3NJZ76iLtAg51QR,spotify:track:5uCax9HTNlzGybIStD3vDh,spotify:track:2iXBZ32Fz5VDCLeE0JIdX5,spotify:track:1aBO5KPwxqLESNTTJBR6VP,spotify:track:2iFvY1l5o2mmUAjBq1L9Mh,spotify:track:3yrVRdwCbEeKODZgG2mVZX
434001,spotify:track:6y6jbcPG4Yn3Du4moXaenr,spotify:track:6Nl5jjykRxa7salssZ0aER,spotify:track:4SYUUlkScpNR1QvPscXf8t,spotify:track:3xKsf9qdS1CyvXSMEid6g8,spotify:track:46Y1VsEbWAQ7dIZe1gpnve,spotify:track:0tICYNayWWhH9GPeFrfjfD,spotify:track:4djIFfof5TpbSGRZUpsTXq,spotify:track:1snNAXmmPXCn0dkF9DaPWw,spotify:track:5ho74ZlMvEbyhFutCd8SGg,spotify:track:2UAWCPSBKXi33Q0YMoDRRS,spotify:track:4dASQiO1Eoo3RJvt74FtXB,spotify:track:41on8RwRh22IHcChAN2gm8,spotify:track:7BHPGtpuuWWsvE7cCaMuEU,spotify:track:3ZOEytgrvLwQaqXreDs2Jx,spotify:track:5E30LdtzQTGqRvNd7l6kG5
434004,spotify:track:6JG0qhINKVwiHxqN85j7RG,spotify:track:1kMuU3TNQvHbqvXCWBodmP,spotify:track:0lwkTJnLBVWvEnxtku7Msy,spotify:track:5LME7YULt0enp6UAB8VoDn,spotify:track:1JFx9R87En9oJOi4DRH1e9,spotify:track:3UYhSPI2lVFfbnC6usruPj,spotify:track:0jx8zY5JQsS4YEQcfkoc5C,spotify:track:6TaqooOXAEcijL6G1AWS2K,spotify:track:0z5ZPs57J2KERwM1tBM2GF,spotify:track:25khomWgBVamSdKw7hzm3l,spotify:track:5yJ0kIxrrYra3QexH58UVj,spotify:track:6Ius4TC0L3cN74HT7ENE6e,spotify:track:1yYzuNd0KRyHVJ3NH8apBt,spotify:track:6GnhWMhgJb7uyiiPEiEkDA,spotify:track:0htTZnlk6okQ1HIq4EvFQ6
434005,spotify:track:2M4K2cYkZHN7OMJSZhPa5i,spotify:track:76S8gvsdaaH7obtBwXvVRL,spotify:track:7mZH0W1tFV2Hgl94ifGT0S,spotify:track:7uRTRMqtfXaOrbHqId0BSF,spotify:track:2uxnPcTJa2UExlj1LwYZp6,spotify:track:57nNNkgk768QVXq3uHxu5e,spotify:track:5CeWLQg987gjDTPF9jQptP,spotify:track:5IxS93bfuITJyNNkvkJ4q5,spotify:track:0mQnLhHH5WwRc5gtnzLtK6,spotify:track:66fVx0KicQduW94cDNM59V,spotify:track:66hayvUbTotekKU3H4ta1f,spotify:track:4WjH9Bzt3kx7z8kl0awxh4,spotify:track:5Hag6lqkkjgbzoyMEZO95y,spotify:track:4SuwafV4Ell3N4unnIzJSI,spotify:track:4XvKjZWIqsHvvza89lMTAH
434007,spotify:track:67RNKBsqRiyMOtor2RxfWR,spotify:track:2hqc4KkXhPVSuKNatapCmD,spotify:track:5VKELcHc3xqRq0tySxMVrB,spotify:track:7Eo1JOdPvYYZqveKzXh8pN,spotify:track:3KdOm1GJJWYDTQmIbRIkcL,spotify:track:5GgC8oLIpJjQ5FlshQvtv3,spotify:track:3ODSy5RSRPmo9WbuVYuWZx,spotify:track:0cMIrWqljfkcl99TIYTtqf,spotify:track:0zYYWBqEFWrQjg48Nmf71B,spotify:track:3CNdjIf1JnMwOrn2Gggbyf,spotify:track:1bYd8H3uOQA1Er17QN1V6B,spotify:track:4x85BXuxs7vAPZqJ2pRDvc,spotify:track:23H9bXxwd5L4CdTrvU7SYl,spotify:track:3JxCHQHvao3OO5x7gGiral,spotify:track:1yOLrH7nF0R7MWDuuva6va


In [42]:
test_playlist = track_uri_df.loc[434000]
test_playlist.tolist()

['spotify:track:5JuA3wlm0kn7IHfbeHV0i6',
 'spotify:track:5p79cffxQgsOSF7hraBV1M',
 'spotify:track:32FJ75MXWMYWmmTw3NvKKy',
 'spotify:track:0tV8pOpiNsKqUys0ilUcXz',
 'spotify:track:3Th56VIq2sEaEmPPETu7p5',
 'spotify:track:5lUbIsKvBfML00F1UNV1i2',
 'spotify:track:6nIE1oCE4udqMGv3bqVNVb',
 'spotify:track:4dvNIeJawEWqMe1ZfYKcXt',
 'spotify:track:4tLBp1HnrrCvbJbeD4g0f3',
 'spotify:track:3VXY8vQ3NJZ76iLtAg51QR',
 'spotify:track:5uCax9HTNlzGybIStD3vDh',
 'spotify:track:2iXBZ32Fz5VDCLeE0JIdX5',
 'spotify:track:1aBO5KPwxqLESNTTJBR6VP',
 'spotify:track:2iFvY1l5o2mmUAjBq1L9Mh',
 'spotify:track:3yrVRdwCbEeKODZgG2mVZX']

In [46]:
vocab, index = {}, 1  # start indexing from 1
vocab['<pad>'] = 0  # add a padding token
for token in test_playlist:
    if token not in vocab:
        vocab[token] = index
        index += 1
vocab_size = len(vocab)
print(vocab)

{'<pad>': 0, 'spotify:track:5JuA3wlm0kn7IHfbeHV0i6': 1, 'spotify:track:5p79cffxQgsOSF7hraBV1M': 2, 'spotify:track:32FJ75MXWMYWmmTw3NvKKy': 3, 'spotify:track:0tV8pOpiNsKqUys0ilUcXz': 4, 'spotify:track:3Th56VIq2sEaEmPPETu7p5': 5, 'spotify:track:5lUbIsKvBfML00F1UNV1i2': 6, 'spotify:track:6nIE1oCE4udqMGv3bqVNVb': 7, 'spotify:track:4dvNIeJawEWqMe1ZfYKcXt': 8, 'spotify:track:4tLBp1HnrrCvbJbeD4g0f3': 9, 'spotify:track:3VXY8vQ3NJZ76iLtAg51QR': 10, 'spotify:track:5uCax9HTNlzGybIStD3vDh': 11, 'spotify:track:2iXBZ32Fz5VDCLeE0JIdX5': 12, 'spotify:track:1aBO5KPwxqLESNTTJBR6VP': 13, 'spotify:track:2iFvY1l5o2mmUAjBq1L9Mh': 14, 'spotify:track:3yrVRdwCbEeKODZgG2mVZX': 15}


In [47]:
inverse_vocab = {index: token for token, index in vocab.items()}
print(inverse_vocab)

{0: '<pad>', 1: 'spotify:track:5JuA3wlm0kn7IHfbeHV0i6', 2: 'spotify:track:5p79cffxQgsOSF7hraBV1M', 3: 'spotify:track:32FJ75MXWMYWmmTw3NvKKy', 4: 'spotify:track:0tV8pOpiNsKqUys0ilUcXz', 5: 'spotify:track:3Th56VIq2sEaEmPPETu7p5', 6: 'spotify:track:5lUbIsKvBfML00F1UNV1i2', 7: 'spotify:track:6nIE1oCE4udqMGv3bqVNVb', 8: 'spotify:track:4dvNIeJawEWqMe1ZfYKcXt', 9: 'spotify:track:4tLBp1HnrrCvbJbeD4g0f3', 10: 'spotify:track:3VXY8vQ3NJZ76iLtAg51QR', 11: 'spotify:track:5uCax9HTNlzGybIStD3vDh', 12: 'spotify:track:2iXBZ32Fz5VDCLeE0JIdX5', 13: 'spotify:track:1aBO5KPwxqLESNTTJBR6VP', 14: 'spotify:track:2iFvY1l5o2mmUAjBq1L9Mh', 15: 'spotify:track:3yrVRdwCbEeKODZgG2mVZX'}


In [48]:
example_sequence = [vocab[word] for word in test_playlist]
print(example_sequence)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


In [51]:
window_size = 2
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence,
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=0)
print(len(positive_skip_grams))

54


In [54]:
for target, context in positive_skip_grams[:5]:
    print(f"({target}, {context}): ({get_track_feature(inverse_vocab[target])}, {get_track_feature(inverse_vocab[context])})")

(6, 7): (Almost Lover, Half a Heart)
(12, 13): (Better in Time, Thank You for the Broken Heart)
(5, 6): (All Of The Stars, Almost Lover)
(15, 13): (Dead Hearts, Thank You for the Broken Heart)
(5, 3): (All Of The Stars, Start Again)


In [59]:
# Get target and context words for one positive skip-gram.
target_word, context_word = positive_skip_grams[0]

# Set the number of negative samples per positive context.
num_ns = 4

context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,  # class that should be sampled as 'positive'
    num_true=1,  # each positive skip-gram has 1 positive context class
    num_sampled=num_ns,  # number of negative context words to sample
    unique=True,  # all the negative samples should be unique
    range_max=vocab_size,  # pick index of the samples from [0, vocab_size]
    seed=SEED,  # seed for reproducibility
    name="negative_sampling"  # name of this operation
)
print(negative_sampling_candidates)
print([inverse_vocab[index.numpy()] for index in negative_sampling_candidates])

tf.Tensor([0 1 3 2], shape=(4,), dtype=int64)
['<pad>', 'spotify:track:5JuA3wlm0kn7IHfbeHV0i6', 'spotify:track:32FJ75MXWMYWmmTw3NvKKy', 'spotify:track:5p79cffxQgsOSF7hraBV1M']


In [60]:
# Add a dimension so you can use concatenation (in the next step).
negative_sampling_candidates = tf.expand_dims(negative_sampling_candidates, 1)

# Concatenate a positive context word with negative sampled words.
context = tf.concat([context_class, negative_sampling_candidates], 0)

# Label the first context word as `1` (positive) followed by `num_ns` `0`s (negative).
label = tf.constant([1] + [0]*num_ns, dtype="int64")

# Reshape the target to shape `(1,)` and context and label to `(num_ns+1,)`.
target = tf.squeeze(target_word)
context = tf.squeeze(context)
label = tf.squeeze(label)

In [64]:
print(f"target_index    : {target}")
print(f"target_word     : {get_track_feature(inverse_vocab[target_word])}")
print(f"context_indices : {context}")
print(f"context_words   : {[get_track_feature(inverse_vocab[c.numpy()]) for c in context]}")
print(f"label           : {label}")

target_index    : 6
target_word     : Almost Lover
context_indices : [7 0 1 3 2]
context_words   : ['Half a Heart', KeyError('<pad>'), 'All I Want', 'Start Again', 'Sad']
label           : [1 0 0 0 0]


___
## For Real This Time

In [68]:
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(size=10)
print(sampling_table)

[0.00315225 0.00315225 0.00547597 0.00741556 0.00912817 0.01068435
 0.01212381 0.01347162 0.01474487 0.0159558 ]


In [69]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (playlists) in the dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (playlist).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with a positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
    
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=SEED,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      negative_sampling_candidates = tf.expand_dims(
          negative_sampling_candidates, 1)

      context = tf.concat([context_class, negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [72]:
vocab_size = track_df.shape[0]
vocab_size

645802

In [81]:
all_track_uris = list(track_df.index)
all_track_uris[:5]

['spotify:track:6SbAbLqAWf2tnTdUy6Gmm5',
 'spotify:track:1MvpPH6BTP3IrLnTjEA2gw',
 'spotify:track:5rgy6ghBq1eRApCkeUdJXf',
 'spotify:track:07dYGGSrzPeg6a3KZjWX65',
 'spotify:track:1NpW5kyvO4XrNJ3rnfcNy3']

In [77]:
# Use StringLookup layer to translate URI strings into integer outputs
# via a table-based vocab lookup
vectorize_layer = tf.keras.layers.experimental.preprocessing.StringLookup(
    max_tokens=vocab_size
)

In [89]:
track_uri_df.reset_index().drop(['pid'], axis=1).head()

Unnamed: 0,track_1_uri,track_2_uri,track_3_uri,track_4_uri,track_5_uri,track_6_uri,track_7_uri,track_8_uri,track_9_uri,track_10_uri,track_11_uri,track_12_uri,track_13_uri,track_14_uri,track_15_uri
0,spotify:track:5JuA3wlm0kn7IHfbeHV0i6,spotify:track:5p79cffxQgsOSF7hraBV1M,spotify:track:32FJ75MXWMYWmmTw3NvKKy,spotify:track:0tV8pOpiNsKqUys0ilUcXz,spotify:track:3Th56VIq2sEaEmPPETu7p5,spotify:track:5lUbIsKvBfML00F1UNV1i2,spotify:track:6nIE1oCE4udqMGv3bqVNVb,spotify:track:4dvNIeJawEWqMe1ZfYKcXt,spotify:track:4tLBp1HnrrCvbJbeD4g0f3,spotify:track:3VXY8vQ3NJZ76iLtAg51QR,spotify:track:5uCax9HTNlzGybIStD3vDh,spotify:track:2iXBZ32Fz5VDCLeE0JIdX5,spotify:track:1aBO5KPwxqLESNTTJBR6VP,spotify:track:2iFvY1l5o2mmUAjBq1L9Mh,spotify:track:3yrVRdwCbEeKODZgG2mVZX
1,spotify:track:6y6jbcPG4Yn3Du4moXaenr,spotify:track:6Nl5jjykRxa7salssZ0aER,spotify:track:4SYUUlkScpNR1QvPscXf8t,spotify:track:3xKsf9qdS1CyvXSMEid6g8,spotify:track:46Y1VsEbWAQ7dIZe1gpnve,spotify:track:0tICYNayWWhH9GPeFrfjfD,spotify:track:4djIFfof5TpbSGRZUpsTXq,spotify:track:1snNAXmmPXCn0dkF9DaPWw,spotify:track:5ho74ZlMvEbyhFutCd8SGg,spotify:track:2UAWCPSBKXi33Q0YMoDRRS,spotify:track:4dASQiO1Eoo3RJvt74FtXB,spotify:track:41on8RwRh22IHcChAN2gm8,spotify:track:7BHPGtpuuWWsvE7cCaMuEU,spotify:track:3ZOEytgrvLwQaqXreDs2Jx,spotify:track:5E30LdtzQTGqRvNd7l6kG5
2,spotify:track:6JG0qhINKVwiHxqN85j7RG,spotify:track:1kMuU3TNQvHbqvXCWBodmP,spotify:track:0lwkTJnLBVWvEnxtku7Msy,spotify:track:5LME7YULt0enp6UAB8VoDn,spotify:track:1JFx9R87En9oJOi4DRH1e9,spotify:track:3UYhSPI2lVFfbnC6usruPj,spotify:track:0jx8zY5JQsS4YEQcfkoc5C,spotify:track:6TaqooOXAEcijL6G1AWS2K,spotify:track:0z5ZPs57J2KERwM1tBM2GF,spotify:track:25khomWgBVamSdKw7hzm3l,spotify:track:5yJ0kIxrrYra3QexH58UVj,spotify:track:6Ius4TC0L3cN74HT7ENE6e,spotify:track:1yYzuNd0KRyHVJ3NH8apBt,spotify:track:6GnhWMhgJb7uyiiPEiEkDA,spotify:track:0htTZnlk6okQ1HIq4EvFQ6
3,spotify:track:2M4K2cYkZHN7OMJSZhPa5i,spotify:track:76S8gvsdaaH7obtBwXvVRL,spotify:track:7mZH0W1tFV2Hgl94ifGT0S,spotify:track:7uRTRMqtfXaOrbHqId0BSF,spotify:track:2uxnPcTJa2UExlj1LwYZp6,spotify:track:57nNNkgk768QVXq3uHxu5e,spotify:track:5CeWLQg987gjDTPF9jQptP,spotify:track:5IxS93bfuITJyNNkvkJ4q5,spotify:track:0mQnLhHH5WwRc5gtnzLtK6,spotify:track:66fVx0KicQduW94cDNM59V,spotify:track:66hayvUbTotekKU3H4ta1f,spotify:track:4WjH9Bzt3kx7z8kl0awxh4,spotify:track:5Hag6lqkkjgbzoyMEZO95y,spotify:track:4SuwafV4Ell3N4unnIzJSI,spotify:track:4XvKjZWIqsHvvza89lMTAH
4,spotify:track:67RNKBsqRiyMOtor2RxfWR,spotify:track:2hqc4KkXhPVSuKNatapCmD,spotify:track:5VKELcHc3xqRq0tySxMVrB,spotify:track:7Eo1JOdPvYYZqveKzXh8pN,spotify:track:3KdOm1GJJWYDTQmIbRIkcL,spotify:track:5GgC8oLIpJjQ5FlshQvtv3,spotify:track:3ODSy5RSRPmo9WbuVYuWZx,spotify:track:0cMIrWqljfkcl99TIYTtqf,spotify:track:0zYYWBqEFWrQjg48Nmf71B,spotify:track:3CNdjIf1JnMwOrn2Gggbyf,spotify:track:1bYd8H3uOQA1Er17QN1V6B,spotify:track:4x85BXuxs7vAPZqJ2pRDvc,spotify:track:23H9bXxwd5L4CdTrvU7SYl,spotify:track:3JxCHQHvao3OO5x7gGiral,spotify:track:1yOLrH7nF0R7MWDuuva6va


In [98]:
adapt_data = tf.constant(track_uri_df.reset_index().drop(['pid'], axis=1))
adapt_data.shape

TensorShape([570806, 15])

In [99]:
# call .adapt() on all of the data (with repeating tracks) to create vocabulary
vectorize_layer.adapt(adapt_data)

In [101]:
inverse_vocab = vectorize_layer.get_vocabulary()
display(inverse_vocab[:20])
print(len(inverse_vocab))
print(track_df.shape)

['',
 '[UNK]',
 '0',
 'spotify:track:7KXjTSCq5nL1LoYtL7XAwS',
 'spotify:track:3a1lNhkSLSkpJE4MSHpDu9',
 'spotify:track:152lZdxL1OR0ZMW6KquMif',
 'spotify:track:7yyRTcZmCiyzzJlNzGC9Ol',
 'spotify:track:2EEeOnHehOozLq4aS0n6SL',
 'spotify:track:7GX5flRQZVHRAGd6B4TmDO',
 'spotify:track:4Km5HrUvYTaSUfiSGPJeQR',
 'spotify:track:0SGkqnVQo9KPytSri1H6cF',
 'spotify:track:7BKLCZ1jbUBVqRi2FVlTVw',
 'spotify:track:5hTpBe8h35rJ67eAWHQsJx',
 'spotify:track:7qiZfU4dY1lWllzX7mPBI3',
 'spotify:track:0VgkVdmE4gld66l8iyGjgx',
 'spotify:track:343YBumqHu19cGoGARUTsd',
 'spotify:track:5CtI0qwDJkDQGwXD1H1cLb',
 'spotify:track:6gBFPUFcJLzWGx4lenP6h2',
 'spotify:track:3DXncPQOG4VBw3QHh3S817',
 'spotify:track:1xznGGDReH1oQq0xzbwXa3']

645802
(645802, 6)


In [None]:
# Vectorize the data in playlists.
# trying to recreate this function:
# track_vector = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()


# stopping here bc we're going with Lawis' model