In [None]:
# Root folder of the dataset. Inputs (metadata.json, synsets.txt) are read from it
# and derived files (the pickles, searchable.txt) are written back into it.
# The trailing slash matters: several cells build paths with plain `data_root + name`.
data_root = 'data/drums/'

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
from os.path import join
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from difflib import SequenceMatcher
from tqdm import tqdm
import collections
import sklearn
import json
import re

In [None]:
%time metadata = json.load(open(join(data_root, 'metadata.json')))

In [None]:
# Number of records, followed by the first record as the cell's displayed value.
print(len(metadata))
metadata[0]

In [None]:
# Split the records into parallel, index-aligned lists (one entry per record).
# geotags = [m['geotag'] for m in metadata if m['geotag'] is not None]
usernames, names, tags, descriptions = [], [], [], []
for m in metadata:
    usernames.append(m['username'])
    names.append(m['name'])
    tags.append(m['tags'])
    descriptions.append(m['description'])

In [None]:
# How many sounds did each user upload? Print total vs. distinct uploaders, then
# plot the upload counts of the 1000 most prolific users on log-log axes.
c = Counter(usernames)
print('%d %d' % (len(usernames), len(c)))
top_counts = [pair[1] for pair in c.most_common(1000)]
plt.plot(top_counts)
plt.xscale('log')
plt.yscale('log')
plt.show()

In [None]:
# Build one text "fingerprint" per record from its name, description and tags.

# File-extension tokens are removed only when they stand alone (not glued to other
# letters or digits). A bare 'wav|aif|mp3' pattern also eats the middle of ordinary
# words: 'wave' -> 'e', 'aiff' -> 'f'.
_EXTENSION_RE = re.compile(r'(?<![A-Za-z0-9])(?:wav|aiff?|mp3)(?![A-Za-z0-9])')
# Non-greedy match of an HTML anchor, from '<a' to the nearest '/a>'.
_ANCHOR_RE = re.compile('<a.+?/a>')


def _clean_fingerprint(text):
    """Normalise one raw fingerprint string for the vectorizer."""
    text = text.replace('_', ' ')  # split on underscore (vectorizer doesn't by default)
    text = text.replace('-', '_')  # don't split on dash (vectorizer does by default)
    text = _EXTENSION_RE.sub('', text)  # remove wav, aif/aiff, mp3
    text = _ANCHOR_RE.sub('', text)  # remove embedded links
    return text


fingerprints = [
    _clean_fingerprint(' '.join([m['name'], m['description']] + m['tags']))
    for m in metadata
]
# alternatively, this could be done with a whitelist

In [None]:
# Group fingerprints by uploader, keeping the original record order within each user.
username_dict = collections.defaultdict(list)
for owner, text in zip(usernames, fingerprints):
    bucket = username_dict[owner]
    bucket.append(text)

In [None]:
# Eyeball three consecutive fingerprints from the uploader of record 3000.
sample_fingerprints = username_dict[usernames[3000]]
for i in range(10, 13):
    print(sample_fingerprints[i])

In [None]:
# De-duplicate fingerprints within each user: a fingerprint is kept only if it is
# less than `ratio_cutoff` similar to every fingerprint already kept for that user.
all_unique = []
ratios = []
ratio_cutoff = 0.99
matcher = SequenceMatcher(isjunk=lambda x: x in '. \t\n', autojunk=False)


def _bounded_ratio(matcher, cutoff):
    """Similarity of the matcher's current pair, computed as cheaply as possible.

    real_quick_ratio() and quick_ratio() are only *upper bounds* on ratio()
    (the first depends on nothing but the two lengths). They can prove that a
    pair is below the cutoff, but never that it is above it. So each bound is
    tried in order of cost and returned if it already rules the pair out; only
    otherwise is the exact ratio() computed.
    """
    for estimate in (matcher.real_quick_ratio, matcher.quick_ratio):
        upper_bound = estimate()
        if upper_bound < cutoff:
            return upper_bound
    return matcher.ratio()


for username in tqdm(username_dict, leave=True):
    unique = []
    for candidate in username_dict[username]:
        # SequenceMatcher caches its analysis of seq2, so the fingerprint that is
        # compared against many others goes there and seq1 is the one that varies.
        matcher.set_seq2(candidate)
        is_duplicate = False
        for kept in unique:
            matcher.set_seq1(kept)
            ratio = _bounded_ratio(matcher, ratio_cutoff)
            ratios.append(ratio)
            if ratio >= ratio_cutoff:
                # One near-identical match is enough; skip the remaining comparisons.
                is_duplicate = True
                break
        if not is_duplicate:
            unique.append(candidate)
    all_unique.extend(unique)

In [None]:
# Report how much the de-duplication kept, then plot every recorded similarity
# ratio in ascending order.
kept_summary = [str(len(all_unique)), 'out of', str(len(set(descriptions))), 'unique']
print(' '.join(kept_summary))
print(' '.join(['did ratio', str(len(ratios)), 'ratio comparisons']))
ratios.sort()  # in place: `ratios` stays sorted for any later cell
plt.plot(ratios)
plt.show()

In [None]:
# Spot-check one of the fingerprints that survived de-duplication.
print(all_unique[1000])

In [None]:
# Bag-of-words over every fingerprint: binary presence/absence, English stop words
# dropped, and terms that occur in fewer than two documents ignored.
# Idea for later: vectorize per user with a low max_df cutoff, then combine the
# vectorized results.
vectorizer = CountVectorizer(
    min_df=2,
    stop_words='english',
    binary=True,
)
vectors = vectorizer.fit_transform(fingerprints)
vectors.shape

In [None]:
# Document frequency of every vocabulary term. A single column-wise sum over the
# sparse matrix replaces one getcol() extraction per term, each of which has to
# walk the whole row-oriented matrix again.
column_totals = vectors.sum(axis=0).A1  # flatten the 1 x n_terms matrix to 1-D
freqs = [(word, column_totals[idx]) for word, idx in vectorizer.vocabulary_.items()]
# sort from largest to smallest
print(sorted(freqs, key=lambda x: -x[1])[:100])

In [None]:
# One "count term" line for each of the 1000 most frequent terms.
ranked = sorted(freqs, key=lambda pair: pair[1], reverse=True)
for word, count in ranked[:1000]:
    print('%s %s' % (count, word))

In [None]:
# Each line of synsets.txt is a comma-separated list of synonymous terms.
# For every synset, collect the indexes of the documents that contain any of its
# terms; terms of documents that match no synset at all are tallied in `leftovers`.
with open(join(data_root, 'synsets.txt')) as f:
    synsets = [l.strip().replace(' ', '').split(',') for l in f]
synset_examples = [set() for s in synsets]  # make one set per synset

# term -> indexes of the synsets that list it, so each document term is resolved
# with one dictionary lookup instead of a scan over every synonym of every synset.
synsets_by_term = collections.defaultdict(set)
for synset_index, synset in enumerate(synsets):
    for syn in synset:
        synsets_by_term[syn].add(synset_index)
synsets_by_term = dict(synsets_by_term)  # plain dict: lookups must not insert keys

leftovers = Counter()
for doc_index, doc in enumerate(tqdm(vectorizer.inverse_transform(vectors), leave=True)):
    matches = 0
    for term in doc:
        for synset_index in synsets_by_term.get(term, ()):
            synset_examples[synset_index].add(doc_index)
            matches += 1
    if matches == 0:
        leftovers.update(doc)
leftovers.most_common()[:50]

In [None]:
# Number of synsets, then each synset next to how many documents matched it.
print(len(synset_examples))
for matched_docs, synset in zip(synset_examples, synsets):
    print('%s %s' % (synset, len(matched_docs)))

In [None]:
import pickle

# Persist the synsets and their matching document indexes next to the data,
# using the highest pickle protocol available.
for filename, payload in (('synset_examples.pkl', synset_examples),
                          ('synsets.pkl', synsets)):
    with open(data_root + filename, 'wb') as f:
        pickle.dump(payload, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Peek at 100 consecutive vocabulary terms starting at index i.
i = 1000
feature_slice = vectorizer.get_feature_names()[i:i + 100]
print(feature_slice)

In [None]:
# Average number of stored (non-zero) entries per row of `vectors`, i.e. how many
# distinct vocabulary terms a document contains on average.
float(vectors.nnz) / vectors.shape[0]

In [None]:
lda = LatentDirichletAllocation(n_topics=100, max_iter=10)
%time fit = lda.fit_transform(vectors)

In [None]:
def print_top_words(model, feature_names, n_top_words=10):
    """Print the highest-weighted words of every topic in a fitted topic model.

    Parameters
    ----------
    model : object exposing ``components_``, an iterable of per-topic weight
        arrays (each must support ``argsort()``).
    feature_names : sequence mapping a column index to its word.
    n_top_words : how many words to print per topic, heaviest first.

    Prints a "Topic #k:" header and one space-separated line of words per topic,
    followed by a single blank line after the last topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading n_top_words.
        top_indices = topic.argsort()[::-1][:n_top_words]
        print("Topic #%d:" % topic_idx)
        print(" ".join(feature_names[i] for i in top_indices))
    # An explicit empty string gives a blank line under both print semantics;
    # a bare `print()` is the empty tuple "()" when print is a statement.
    print("")

In [None]:
# Show the ten heaviest words of each LDA topic.
print_top_words(lda, feature_names=vectorizer.get_feature_names())

In [None]:
# NOTE(review): a, b and c are not read by any visible cell, and `c` rebinds the
# name that earlier held the username Counter.
a = 0
b = 1
c = 2
# Row i of `fit` is the topic mixture of fingerprints[i] (`vectors` was built from
# `fingerprints`), so the text shown beside each plot must come from the same list.
# `all_unique` is shorter and ordered differently, so it does not line up with `fit`.
ref = fingerprints
n = range(len(fit[0]))  # one bar position per topic

for i in [6, 7, 8, 9, 10]:
    print(ref[i])
    plt.bar(n, fit[i])
    plt.show()

In [None]:
# Topic mixture the model assigns to the single-word query 'laser'.
# Relies on `n` (the per-topic bar positions) defined in the previous cell.
vector = vectorizer.transform(['laser'])
print(vector)
topic_weights = lda.transform(vector)[0]
plt.bar(n, topic_weights)
plt.show()

The lesson is that many sounds don't have very good descriptions, so the fingerprint alone isn't enough to distinguish them from each other. We definitely can't run t-SNE on the LDA results; it might work on the (huge) sparse matrix instead...

The right thing to do as a next step is to use the tags to do multiclass classification on some categories we decide on in advance, do data augmentation, etc. Treat it as a typical supervised learning problem, then run t-SNE on the "soft", pre-output vectors.

In [None]:
# One searchable line per record: the name, a space, then the space-joined tags.
searchable = [' '.join([m['name'], ' '.join(m['tags'])]) for m in metadata]

In [None]:
# Write the searchable strings to disk, one UTF-8 encoded entry per line.
with open(data_root + 'searchable.txt', 'w') as f:
    f.writelines(line.encode('utf8') + '\n' for line in searchable)