In [1]:
import pandas as pd
from tqdm.auto import tqdm; tqdm.pandas()

import os
import sys; sys.path.insert(0, os.path.abspath('..'))

from annoy import AnnoyIndex
from overrides import overrides

from allennlp.common import Params
from allennlp.common.util import JsonDict
from allennlp.data import Instance, DatasetReader
from allennlp.data.vocabulary import Vocabulary
from allennlp.predictors.predictor import Predictor
from allennlp.models import Model, load_archive

from vibecheck.models import BERTMoji
from vibecheck.dataset_readers import TwitterDatasetReader
from vibecheck.predictors.knn_predictor import KNNPredictor

  from pandas import Panel


Pipeline: vocab -> predict -> index -> save

## Data

In [2]:
# Local cache directory for the artifacts this notebook writes (vocab files, index file).
# Resolved from the current user's home directory instead of a hard-coded absolute path,
# so the notebook also runs under other accounts/machines.
DATA_DIR = os.path.expanduser("~/.cache/spotify_data/")
VOCAB_DIR = os.path.join(DATA_DIR, 'vocab')
INDEX_DIR = os.path.join(DATA_DIR, 'index.tree')

# Song metadata with a `lyrics` column; rows without lyrics are dropped and the
# index is renumbered so positional and label lookups agree.
SONGS_CSV_URL = 'https://storage.googleapis.com/jacobdanovitch/spotify_lyrics/spotify_with_genius.csv'
songs = pd.read_csv(SONGS_CSV_URL)
songs = songs[songs.lyrics.notnull()].reset_index(drop=True)

# One list of lyric lines per song: split on one or two consecutive newlines.
lyrics = songs.lyrics.str.split(pat='\n{1,2}')

## Vocab

In [3]:
os.makedirs(VOCAB_DIR, exist_ok=True)

# Write one unique track id per line. The ids are joined into a single string, so
# `write` is the right call here (`writelines` would iterate it character by character).
with open(os.path.join(VOCAB_DIR, 'track_ids.txt'), 'w', encoding='utf-8') as f:
    f.write('\n'.join(songs.track_id.unique()))

In [4]:
# Read the track-id file into the "labels" namespace (no padding entry),
# then display the resulting vocabulary as the cell output.
track_ids_file = os.path.join(VOCAB_DIR, 'track_ids.txt')
vocab = Vocabulary()
vocab.set_from_file(track_ids_file, is_padded=False, namespace="labels")
vocab

Vocabulary with namespaces:  labels, Size: 7662 || Non Padded Namespaces: {'*tags', '*labels'}

In [5]:
# Persist the vocabulary under VOCAB_DIR/vocab; the predictor cell is given this same path.
vocab_out_dir = os.path.join(VOCAB_DIR, 'vocab')
vocab.save_to_files(vocab_out_dir)



## Predictor

In [3]:
# Build the dataset reader from the `dataset_reader` section of the experiment config.
params = Params.from_file('/home/jacobgdt/bertmoji/experiments/baseline_clf.jsonnet')
reader_params = params.pop('dataset_reader')
reader = DatasetReader.from_params(reader_params)

# Load the archived model onto CUDA device 0 and wrap it in the KNN predictor.
# Positional arguments: model, dataset reader, saved-vocab directory, songs CSV.
archive = load_archive('https://jacobdanovitch.blob.core.windows.net/datasets/model.tar.gz', cuda_device=0)
predictor = KNNPredictor(
    archive.model,
    reader,
    os.path.join(VOCAB_DIR, 'vocab'),
    'https://storage.googleapis.com/jacobdanovitch/spotify_lyrics/spotify_with_genius.csv',
    # None, #DATA_DIR+'index.tree',
)
# predictor._model.eval();

In [7]:
import re
import numpy as np
# from scipy.special import softmax, expit as sigmoid

# Matches lines of at least two characters that neither start with '[' nor end
# with ']' (e.g. "[Chorus]" is rejected). Written as a raw string so the bracket
# escapes reach the regex engine without invalid-escape warnings; compiled once.
LYRIC_LINE_PATTERN = re.compile(r'^[^\[].*[^\]]$', flags=re.M)


def encode_song(lyrics):
    """Encode one song as a single vector by max-pooling its per-line predictions.

    Args:
        lyrics: iterable of lyric lines (strings) belonging to one song.

    Returns:
        The element-wise maximum over the ``'logits'`` vectors that
        ``predictor.predict_batch_json`` returns for the kept lines, or ``None``
        when no line survives the filter (treated as an instrumental track).
    """
    queries = [
        {'query': line}
        for line in lyrics
        if line and LYRIC_LINE_PATTERN.match(line)
    ]
    if not queries:  # instrumental
        return None
    pred = predictor.predict_batch_json(queries)
    # Stack to (n_lines, n_outputs) and keep the strongest value per output.
    return np.stack([p['logits'] for p in pred]).max(axis=0)

In [11]:
# Run encode_song over every song's list of lyric lines. `progress_apply` is the
# tqdm-wrapped `apply` registered by `tqdm.pandas()` in the import cell, so a
# progress bar is shown. Songs with no usable lines yield None.
encoded = lyrics.progress_apply(encode_song)

HBox(children=(FloatProgress(value=0.0, max=7662.0), HTML(value='')))




In [16]:
# Pair each track id with its encoded vector, skipping songs that produced no vector.
index_data = [pair for pair in zip(songs.track_id, encoded) if pair[1] is not None]

# Keep a JSON-lines copy of the encodings next to the other cached artifacts.
encoded_path = os.path.join(DATA_DIR, 'encoded.jsonl')
encoded_frame = pd.DataFrame(index_data, columns=['track_id', 'features'])
encoded_frame.to_json(encoded_path, lines=True, orient='records')

# Show the first (track_id, vector) pair as a sanity check.
index_data[0]

('14msK75pk3pA33pzPVNtBF',
 array([1.        , 0.03960975, 0.99987054, 0.63999385, 0.01389084,
        0.0057368 , 0.01067287, 0.00612334, 0.01035467, 0.99978787,
        0.00768574, 0.07313313, 0.490284  , 0.23276588, 0.00912741,
        0.00683836, 0.13735737, 0.03423967, 0.00721115, 0.23394252,
        0.98473328, 0.03093073, 0.15337715, 0.00703134, 0.02795666,
        0.02635283, 0.04038235, 0.01321106, 0.0073832 , 0.01052561,
        0.0075404 , 0.01267227, 0.06510044, 0.00816377, 0.01103137,
        0.09679794, 0.00549329, 0.02697893, 0.00698734, 0.00605245,
        0.005598  , 0.39631853, 0.00546321, 0.99695349, 0.01396334,
        0.00556226, 0.00588284, 0.01053785, 0.00768726, 0.00501487]))

In [17]:
# Build the index from the (track_id, vector) pairs collected in index_data.
# NOTE(review): presumably this also writes the index to INDEX_DIR (the next cell
# uploads that file) — confirm in KNNPredictor.build_index.
predictor.build_index(INDEX_DIR, index_data)

HBox(children=(FloatProgress(value=0.0, max=7652.0), HTML(value='')))




In [18]:
# Upload the index file to the GCS bucket; IPython substitutes {INDEX_DIR} with the Python value.
!gsutil cp {INDEX_DIR} gs://jacobdanovitch/spotify_lyrics/index.tree

Copying file:///home/jacobgdt/.cache/spotify_data/index.tree [Content-Type=application/octet-stream]...
- [1 files][  3.2 MiB/  3.2 MiB]                                                
Operation completed over 1 objects/3.2 MiB.                                      


In [None]:
# Scratch cell: the original expression here was the truncated tab-completion stub
# `predictor.vocab.get_`, which is not a complete attribute name and would stop a
# Restart & Run All with an AttributeError. Left as a note until the intended
# vocabulary lookup is filled in.

## Predictions

In [9]:
# Called with the path only (no data) this time.
# NOTE(review): presumably this loads the index previously saved at INDEX_DIR
# instead of rebuilding it — confirm in KNNPredictor.build_index.
predictor.build_index(INDEX_DIR)

In [5]:
# Free-text query demo. Other queries tried:
#   "Going to the gym", "Out and about", "My friend is proposing 💍"
query = "What's going on"
matches = predictor.predict_json({'query': query})
# Display the returned tracks as "name - artist", alphabetically.
sorted(f"{t['track_name']} - {t['artist_name']}" for t in matches)
# predictor.predict_json({'query': query})['logits']

['Atlas - COIN',
 "Don't Panic - Coldplay",
 'Fireproof - The National',
 'Gravity - A Perfect Circle',
 'I Can Be Somebody - Deorro',
 'If You Wanna - The Vaccines',
 'Knife - Grizzly Bear',
 'LITE SPOTS - KAYTRANADA',
 'Quit Playing Games (With My Heart) - Backstreet Boys',
 'Shimmy - System Of A Down']

In [14]:
# Track-to-track demo: print the seed song, then list the tracks returned for its id.
i = 8
seed = songs.loc[i]
print(' - '.join(seed[['track_name', 'artist_name']]))
neighbours = predictor.predict_json({'track_id': seed['track_id']})['tracks']
[f"{t['track_name']} - {t['artist_name']}" for t in neighbours]

MIDDLE CHILD - J. Cole


['OTW - Khalid',
 "03' Adolescence - J. Cole",
 'Above The Law - Bad Meets Evil',
 'Last Call - Kanye West',
 'Opportunity Cost - G-Eazy',
 'Babylon (feat. Kendrick Lamar) - SZA',
 "Let's Go (feat. Big D & Twista) - Trick Daddy",
 'Spice Up Your Life - Spice Girls',
 'Jump Off The Roof - Vince Staples',
 'One Headlight - The Wallflowers']