In [2]:
import pandas as pd
import os

from allennlp.data.vocabulary import Vocabulary
from annoy import AnnoyIndex

## Data

In [3]:
# Cache directory for all notebook artefacts, resolved from the current user's
# home instead of hardcoding one machine's absolute path.  The final "" keeps
# the trailing path separator, which later `DATA_DIR + ...` concatenations in
# this notebook rely on.
DATA_DIR = os.path.join(os.path.expanduser("~"), ".cache", "spotify_data", "")

# Both inputs are JSON-lines files stored in DATA_DIR.
lyrics = pd.read_json(os.path.join(DATA_DIR, "lyrics.jsonl"), lines=True)
preds = pd.read_json(os.path.join(DATA_DIR, "pred.jsonl"), lines=True)

# Each entry of `preds.scores` is a sequence of (label, score) pairs; turn it
# into a {label: score} mapping, then expand the mappings into one column per label.
score_dicts = preds["scores"].apply(lambda pairs: {label: score for label, score in pairs})
scores = score_dicts.apply(pd.Series)

# Place the score columns next to the lyrics rows and drop the raw text column.
encoded = pd.concat([lyrics, scores], axis=1).drop(columns='sentence')

print(encoded.shape)
encoded.head()

(7662, 51)


Unnamed: 0,track_id,:face_with_rolling_eyes:,:weary_face:,:face_with_tears_of_joy:,:crying_face:,:rolling_on_the_floor_laughing:,:loudly_crying_face:,N/A,:sparkles:,:smiling_face_with_smiling_eyes:,...,:speaking_head:,:police_car_light:,:white_heavy_check_mark:,:double_exclamation_mark:,:backhand_index_pointing_down:,:clapping_hands:,:heavy_check_mark:,:thumbs_up:,:trophy:,:male_sign:
0,14msK75pk3pA33pzPVNtBF,0.958762,0.038948,0.013138,0.01164,0.009979,0.009508,0.009136,0.009093,0.008988,...,0.005385,0.005343,0.00528,0.00513,0.005087,0.005,0.004933,0.004654,0.004593,0.004406
1,6MWtB6iiXyIwun0YzU6DFP,0.058186,0.01388,0.016922,0.014814,0.036876,0.010906,0.010361,0.009912,0.007909,...,0.0091,0.007564,0.006588,0.00758,0.006248,0.007638,0.005953,0.006309,0.007166,0.005947
2,4kV4N9D1iKVxx1KLvtTpjS,0.961466,0.016553,0.007651,0.014832,0.015082,0.010368,0.008913,0.008206,0.009855,...,0.005967,0.005367,0.004822,0.005313,0.004804,0.00564,0.004726,0.004607,0.004154,0.004595
3,6Qs4SXO9dwPj5GKvVOv8Ki,0.010327,0.013254,0.029442,0.012612,0.214775,0.539007,0.006198,0.005502,0.005793,...,0.005634,0.004837,0.004321,0.005133,0.005103,0.005312,0.004426,0.005006,0.004481,0.004035
4,25sgk305KZfyuqVBQIahim,0.014516,0.011302,0.00771,0.830548,0.008575,0.005515,0.007044,0.008473,0.007685,...,0.005833,0.00474,0.004794,0.004886,0.004434,0.005601,0.004644,0.004812,0.004598,0.004199


In [4]:
# Track metadata is read straight from the public bucket.
songs = pd.read_csv('https://storage.googleapis.com/jacobdanovitch/spotify_lyrics/spotify_with_genius.csv')

# Inner join (the pandas default) on track_id attaches the score columns to
# each song's metadata.
df = songs.merge(encoded, how='inner', on='track_id')

print(df.shape)
df.head()

(7662, 97)


Unnamed: 0,track_id,genre,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,...,:speaking_head:,:police_car_light:,:white_heavy_check_mark:,:double_exclamation_mark:,:backhand_index_pointing_down:,:clapping_hands:,:heavy_check_mark:,:thumbs_up:,:trophy:,:male_sign:
0,14msK75pk3pA33pzPVNtBF,Pop,Ariana Grande,7 rings,100,0.578,0.725,178640,0.321,0.0,...,0.005385,0.005343,0.00528,0.00513,0.005087,0.005,0.004933,0.004654,0.004593,0.004406
1,6MWtB6iiXyIwun0YzU6DFP,Rap,Post Malone,Wow.,99,0.163,0.833,149520,0.539,2e-06,...,0.0091,0.007564,0.006588,0.00758,0.006248,0.007638,0.005953,0.006309,0.007166,0.005947
2,4kV4N9D1iKVxx1KLvtTpjS,Pop,Ariana Grande,"break up with your girlfriend, i'm bored",99,0.0421,0.726,190440,0.554,0.0,...,0.005967,0.005367,0.004822,0.005313,0.004804,0.00564,0.004726,0.004607,0.004154,0.004595
3,6Qs4SXO9dwPj5GKvVOv8Ki,Pop,Sam Smith,Dancing With A Stranger (with Normani),97,0.45,0.741,171030,0.52,2e-06,...,0.005634,0.004837,0.004321,0.005133,0.005103,0.005312,0.004426,0.005006,0.004481,0.004035
4,25sgk305KZfyuqVBQIahim,Pop,Ava Max,Sweet but Psycho,97,0.0691,0.719,187436,0.704,0.0,...,0.005833,0.00474,0.004794,0.004886,0.004434,0.005601,0.004644,0.004812,0.004598,0.004199


In [71]:
# Write the merged frame into DATA_DIR, leaving the row index out of the file.
encoded_csv_path = os.path.join(DATA_DIR, 'encoded.csv')
df.to_csv(encoded_csv_path, index=False)

In [89]:
# Delete the exported CSV with the standard library rather than shelling out
# to `rm`: the path is never interpreted by a shell and the cell also works on
# platforms without `rm`.  A file that is already gone is skipped silently.
stale_csv = os.path.join(DATA_DIR, 'encoded.csv')
if os.path.exists(stale_csv):
    os.remove(stale_csv)

## Vocab

In [12]:
# Directory that holds the vocabulary source file and the saved vocabulary.
VOCAB_DIR = os.path.join(DATA_DIR, 'vocab')
os.makedirs(VOCAB_DIR, exist_ok=True)

# One unique track id per line, no trailing newline.  `write` is used because
# the payload is a single string; `writelines` would iterate it character by
# character.
with open(os.path.join(VOCAB_DIR, 'track_ids.txt'), 'w', encoding='utf-8') as f:
    f.write('\n'.join(df.track_id.unique()))

In [16]:
# Look the method up on the class: the `vocab` instance is only created in the
# next cell, so `help(vocab.set_from_file)` fails on a fresh kernel.
help(Vocabulary.set_from_file)

Help on method set_from_file in module allennlp.data.vocabulary:

set_from_file(filename: str, is_padded: bool = True, oov_token: str = '@@UNKNOWN@@', namespace: str = 'tokens') method of allennlp.data.vocabulary.Vocabulary instance
    If you already have a vocabulary file for a trained model somewhere, and you really want to
    use that vocabulary file instead of just setting the vocabulary from a dataset, for
    whatever reason, you can do that with this method.  You must specify the namespace to use,
    and we assume that you want to use padding and OOV tokens for this.
    
    # Parameters
    
    filename : `str`
        The file containing the vocabulary to load.  It should be formatted as one token per
        line, with nothing else in the line.  The index we assign to the token is the line
        number in the file (1-indexed if `is_padded`, 0-indexed otherwise).  Note that this
        file should contain the OOV token string!
    is_padded : `bool`, optional (default=

In [17]:
# Load the track ids into the "labels" namespace.  With is_padded=False the
# index assigned to a token is its 0-based line number in the file (see the
# help text above).
track_ids_path = os.path.join(VOCAB_DIR, 'track_ids.txt')
vocab = Vocabulary()
vocab.set_from_file(track_ids_path, is_padded=False, namespace="labels")
vocab

Vocabulary with namespaces:  labels, Size: 7662 || Non Padded Namespaces: {'*tags', '*labels'}

In [18]:
# Serialise the vocabulary into the `vocab` sub-directory of VOCAB_DIR.
vocab_save_dir = os.path.join(VOCAB_DIR, 'vocab')
vocab.save_to_files(vocab_save_dir)

In [100]:
# Display the directory the vocabulary was saved to.
os.path.join(VOCAB_DIR, 'vocab')

'/home/jacobgdt/.cache/spotify_data/vocab/vocab'

## Index

In [42]:
# Score vectors keyed by track id, so a row can be looked up per track.
vectors = encoded.set_index("track_id")

# One Annoy dimension per score column; neighbours are ranked by angular distance.
n_dims = scores.shape[-1]
index = AnnoyIndex(n_dims, metric='angular')

In [43]:
from tqdm.auto import tqdm

# The track-id -> vocabulary-index mapping does not change while the index is
# being filled, so fetch it once instead of on every iteration.
track_to_item = vocab.get_token_to_index_vocabulary("labels")

# Each track is stored under its vocabulary index so that Annoy item ids can
# be mapped back to track ids through the same vocabulary.
for track in tqdm(df.track_id.unique()):
    index.add_item(track_to_item[track], vectors.loc[track])

HBox(children=(FloatProgress(value=0.0, max=7662.0), HTML(value='')))




In [46]:
# NOTE(review): -1 is passed as the tree count; presumably this lets Annoy pick
# the number of trees itself — confirm against the Annoy documentation.
index.build(-1)

# Persist the built index inside DATA_DIR; the call's result is the cell output.
index_path = os.path.join(DATA_DIR, 'index.tree')
index.save(index_path)

True

In [107]:
# Sanity check: ask the index for 10 neighbours of item 0.
query_item, neighbour_count = 0, 10
nns = index.get_nns_by_item(query_item, neighbour_count)
nns

[6259, 1408, 543, 3927, 7032, 1928, 4376, 3756, 1781, 3327]

In [103]:
# List the files in the saved-vocabulary directory without shelling out to
# `ls`, so the cell is portable and the path is not interpreted by a shell.
sorted(os.listdir(os.path.join(VOCAB_DIR, 'vocab')))

labels.txt  non_padded_namespaces.txt


## Predictor

In [93]:
from overrides import overrides

from allennlp.common.util import JsonDict
from allennlp.data import Instance, DatasetReader
from allennlp.predictors.predictor import Predictor
from allennlp.models import Model

import pandas as pd
from annoy import AnnoyIndex

# @Predictor.register('knn')
class KNNPredictor(Predictor):
    """Predictor that also serves nearest-neighbour lookups over tracks.

    A request containing a ``track_id`` key is answered from a pre-built Annoy
    index and returns metadata rows for the neighbouring tracks.  Any other
    request is converted to an instance from its ``query`` text and passed to
    the wrapped model.
    """

    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 vocab: Vocabulary,
                 annoy_index_fp: str,
                 dim: int,
                 df_fp: str,
                 n_neighbors: int = 10
                ) -> None:
        """Load the track metadata and the Annoy index.

        # Parameters

        model : `Model`
            Model used for requests that carry free text.
        dataset_reader : `DatasetReader`
            Reader whose `text_to_instance` builds instances from the text.
        vocab : `Vocabulary`
            Vocabulary whose "labels" namespace maps track ids to Annoy item ids.
        annoy_index_fp : `str`
            Path of the saved Annoy index to load.
        dim : `int`
            Dimensionality the Annoy index was built with.
        df_fp : `str`
            Path or URL of a CSV with a `track_id` column; it becomes the
            lookup table for neighbour metadata.
        n_neighbors : `int`, optional (default = 10)
            Number of neighbours returned when a request does not specify one.
        """
        super().__init__(model, dataset_reader)

        self.vocab = vocab
        self.df = pd.read_csv(df_fp).set_index("track_id")
        self.n_neighbors = n_neighbors

        self.index = AnnoyIndex(dim, metric='angular')
        self.index.load(annoy_index_fp)

    def predict_json(self, inputs: JsonDict) -> JsonDict:
        """Answer a neighbour lookup (`track_id`) or a text prediction (`query`).

        For a `track_id` request the result is a list of metadata records, one
        per neighbouring track, in the order returned by the index; the
        `track_id` column itself is dropped from the records.  An optional
        `n_neighbors` key overrides the default neighbour count.  A `track_id`
        missing from the vocabulary raises `KeyError`.
        """
        if 'track_id' in inputs:
            idx = self.vocab.get_token_to_index_vocabulary("labels")[inputs['track_id']]
            # Query the index for the requested track.  The previous version
            # never ran this query and read the notebook globals `nns` and
            # `vocab`, so it returned the same neighbours for every track.
            n_neighbors = inputs.get('n_neighbors', self.n_neighbors)
            nns = self.index.get_nns_by_item(idx, n_neighbors)
            tracks = [self.vocab.get_token_from_index(i, "labels") for i in nns]
            return self.df.loc[tracks].reset_index(drop=True).to_dict(orient='records')

        instance = self._json_to_instance(inputs)
        output_dict = self.predict_instance(instance)
        #label_dict = self._model.vocab.get_index_to_token_vocabulary('labels')
        #all_labels = [label_dict[i] for i in range(len(label_dict))]
        #output_dict["all_labels"] = all_labels
        return output_dict

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """Build an instance from the request's `query` text."""
        return self._dataset_reader.text_to_instance(text=json_dict['query'])


In [83]:
# %cd ~/bertmoji

/home/jacobgdt/bertmoji


In [94]:
from allennlp.models import load_archive
from my_library.models import BERTMoji
from my_library.dataset_readers import TwitterDatasetReader

# Paths are built with os.path.join, like the rest of the notebook, so this
# cell does not depend on DATA_DIR ending with a path separator.  The index
# dimensionality is taken from the score matrix, exactly as when the index was
# created, instead of repeating it as a literal.
knn_pred = KNNPredictor(
    model=load_archive(os.path.join(DATA_DIR, 'saved_model')).model,
    dataset_reader=TwitterDatasetReader(),
    vocab=vocab,
    annoy_index_fp=os.path.join(DATA_DIR, 'index.tree'),
    dim=scores.shape[-1],
    df_fp='https://storage.googleapis.com/jacobdanovitch/spotify_lyrics/spotify_with_genius.csv',
)

In [97]:
# Neighbour lookup for the first track of the merged frame.
request = {'track_id': df.track_id[0]}
knn_pred.predict_json(request)

[{'genre': 'Alternative',
  'artist_name': 'Nine Inch Nails',
  'track_name': 'Something I Can Never Have',
  'popularity': 41,
  'acousticness': 0.815,
  'danceability': 0.501,
  'duration_ms': 354933,
  'energy': 0.131,
  'instrumentalness': 6.12e-05,
  'key': 'F',
  'liveness': 0.111,
  'loudness': -16.824,
  'mode': 'Major',
  'speechiness': 0.0302,
  'tempo': 108.051,
  'time_signature': '4/4',
  'valence': 0.0391,
  'Alternative': 1,
  'Anime': 0,
  'Blues': 0,
  'Children’s Music': 0,
  'Classical': 0,
  'Country': 0,
  'Dance': 0,
  'Electronic': 0,
  'Folk': 0,
  'Hip-Hop': 0,
  'Indie': 0,
  'Jazz': 0,
  'Movie': 0,
  'Pop': 0,
  'R&B': 0,
  'Rap': 0,
  'Reggae': 0,
  'Reggaeton': 0,
  'Rock': 0,
  'Ska': 0,
  'Soul': 0,
  'Soundtrack': 0,
  'World': 0,
  'api_path': '/songs/174672',
  'apple_music_id': 900248433.0,
  'apple_music_player_url': 'https://genius.com/songs/174672/apple_music_player',
  'lyrics_url': 'https://genius.com/Nine-inch-nails-something-i-can-never-have-l

In [98]:
# Show the signature and docstring of Predictor.from_archive for reference.
help(Predictor.from_archive)

Help on method from_archive in module allennlp.predictors.predictor:

from_archive(archive: allennlp.models.archival.Archive, predictor_name: str = None, dataset_reader_to_load: str = 'validation') -> 'Predictor' method of builtins.type instance
    Instantiate a `Predictor` from an [`Archive`](../models/archival.md);
    that is, from the result of training a model. Optionally specify which `Predictor`
    subclass; otherwise, we try to find a corresponding predictor in `DEFAULT_PREDICTORS`, or if
    one is not found, the base class (i.e. `Predictor`) will be used. Optionally specify
    which [`DatasetReader`](../data/dataset_readers/dataset_reader.md) should be loaded;
    otherwise, the validation one will be used if it exists followed by the training dataset reader.



In [109]:
# Display the track id stored under label 0 of the merged frame.
df.track_id[0]

'14msK75pk3pA33pzPVNtBF'