In [None]:
# Directory holding this dataset's files: later cells read metadata.json and
# synsets.json from it and write the pickled label mappings back into it.
data_root = 'data/drums/'

In [None]:
from os.path import join
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import collections
import pickle
import sklearn
import json
import re

In [None]:
# Load the per-sample metadata. Later cells read the 'name', 'description'
# and 'tags' fields of every entry.
# Use a context manager so the file handle is closed as soon as parsing ends
# instead of being left open for the lifetime of the kernel.
with open(join(data_root, 'metadata.json')) as f:
    metadata = json.load(f)

In [None]:
# Build one text document per sample: its name, description and tags joined
# by single spaces.
docs = []
for entry in metadata:
    fields = [entry['name'], entry['description']] + entry['tags']
    # replace dash with underscore (presumably so hyphenated names such as
    # "hi-hat" survive tokenization as the single token "hi_hat")
    docs.append(' '.join(fields).replace('-', '_'))

In [None]:
# print the `limit` most common tokens
limit = 50
# binary=True records presence/absence per document, so each column sum below
# is the number of documents that contain the token (min_df=10 drops tokens
# seen in fewer than 10 documents).
vectorizer = CountVectorizer(min_df=10, stop_words='english', binary=True)
vectors = vectorizer.fit_transform(docs)
freqs = [(vectors.getcol(idx).sum(), word) for word, idx in vectorizer.vocabulary_.items()]
# Most frequent first; ties are broken alphabetically so the listing does not
# depend on dictionary iteration order.
for freq, token in sorted(freqs, key=lambda pair: (-pair[0], pair[1]))[:limit]:
    # A single %-formatted string inside print(...) emits the same
    # "<freq> <token>" line under both Python 2 and Python 3; the bare
    # `print freq, token` statement is a SyntaxError on Python 3.
    print('%d %s' % (freq, token))

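`synsets.json` should be a list of synsets, where each synset is a list of synonyms. Each synonym must be a single token: it should not contain characters, such as spaces, that a tokenizer would split into two tokens. Dashes in the documents are replaced with underscores before tokenizing, so write `hi_hat` rather than `hi-hat`. An example: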

```json
[
 ["bass"],
 ["kick", "kickdrum", "kicks"],
 ["hat", "hihat", "hi_hat"],
 ["snare", "snares"]
]
```

In [None]:
# vectorize only using words in the synsets
# Use a context manager so the synsets file is closed right after parsing
# instead of leaking the handle.
with open(join(data_root, 'synsets.json')) as f:
    synsets = json.load(f)
# Flatten the synsets in order. Passing the vocabulary as a list fixes the
# column order of `vectors` to this list, which the synset-index mapping
# built later in the notebook relies on.
vocabulary = [synonym for synset in synsets for synonym in synset]
vectorizer = CountVectorizer(min_df=1, stop_words='english', binary=True, vocabulary=vocabulary)
vectors = vectorizer.fit_transform(docs)

In [None]:
# Vocabulary index -> synset index. Each synset index is repeated once per
# synonym, following the order in which the flat vocabulary was built.
vocabulary_to_synset = []
for synset_idx, synonyms in enumerate(synsets):
    vocabulary_to_synset.extend([synset_idx] * len(synonyms))

# Accumulate both directions of the sample <-> label relation in sets, so a
# label matched through several of its synonyms is recorded only once.
label_sets = [set() for sample in metadata]
sample_sets = [set() for synonyms in synsets]
for sample_idx, row in enumerate(vectors):
    # each row is a 1 x vocabulary sparse vector; the second nonzero() array
    # holds the column (vocabulary) indices present in this sample
    token_columns = row.nonzero()[1]
    for column in token_columns:
        label = vocabulary_to_synset[column]
        label_sets[sample_idx].add(label)
        sample_sets[label].add(sample_idx)

# Store plain lists of lists rather than lists of sets.
samples_to_labels = [list(label_set) for label_set in label_sets]
labels_to_samples = [list(sample_set) for sample_set in sample_sets]

In [None]:
# Persist both mappings next to the input data, one pickle file per mapping.
outputs = (
    ('labels_to_samples.pkl', labels_to_samples),
    ('samples_to_labels.pkl', samples_to_labels),
)
for filename, mapping in outputs:
    with open(join(data_root, filename), 'wb') as f:
        pickle.dump(mapping, f)