In [1]:
# Generic Imports
import sys 
import numpy as np
import json
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import collections

# Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument 

# Vader and Stopwords
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Clustering, DR and tfidf
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [2]:
# Load the track list from disk. Later cells read each entry as
# entry[0] (the song's word list) and entry[1][0] (the track id).
with open('tracks.json','r') as f:
    tracks = json.load(f)
# Report how many tracks were loaded.
print(len(tracks))

123984


In [3]:
def plot_histograms(documents):
    """Plot word-count histograms for a corpus and print summary statistics.

    Parameters
    ----------
    documents : sequence
        One entry per song; ``entry[0]`` is the song's list of words.

    Draws two stacked histograms (words per song, distinct words per song)
    and prints the number of songs, the size of the corpus vocabulary, the
    longest song and the truncated average word counts.  For an empty corpus
    only the song count is printed and nothing is plotted.  Returns ``None``.
    """
    if len(documents) == 0:
        # Nothing to plot, and the averages below would divide by zero.
        print("#Songs: ", 0)
        return

    amt_of_words = []
    amt_of_unique_words = []
    unique_words = set()

    for track in documents:
        words = track[0]
        amt_of_words.append(len(words))
        amt_of_unique_words.append(len(set(words)))
        # Grow the vocabulary in place; rebuilding it as list + set for every
        # song made this loop quadratic in the corpus size.
        unique_words.update(words)

    _fig, axs = plt.subplots(2,figsize=(4,6))
    axs[0].hist(amt_of_words, bins = 500)
    axs[1].hist(amt_of_unique_words, bins = 500)
    plt.show()

    avg_amt_of_words = sum(amt_of_words) / len(amt_of_words)
    avg_amt_of_unique_words = sum(amt_of_unique_words) / len(amt_of_unique_words)
    print("#Songs: ", len(documents))
    print("#Unique Words: ", len(unique_words))
    print("Longest Song: ", max(amt_of_words))
    print("Avg Wordcount: ", int(avg_amt_of_words))
    print("Avg Unique Wordcount: ", int(avg_amt_of_unique_words))

#plot_histograms(tracks)         

In [4]:
def train_model(documents):
    """Train a Doc2Vec model on the corpus and save it as ``doc2vec.model``.

    Every document contributes its word list (``doc[0]``) tagged with its id
    (``doc[1][0]``).  The trained model is returned.
    """
    tagged_corpus = []
    for entry in documents:
        tokens, doc_id = entry[0], entry[1][0]
        tagged_corpus.append(TaggedDocument(tokens, [doc_id]))

    hyperparams = dict(
        vector_size=10,
        epochs=10,
        min_count=2,
        workers=6,
        window=5,
        sample=0.000001,
    )
    doc2vec = Doc2Vec(documents=tagged_corpus, **hyperparams)
    doc2vec.save("doc2vec.model")
    return doc2vec

def infer_vectors(documents, model):
    """Collect each document's vector from the model and cache the mapping.

    Reads ``model.dv[doc_id]`` for every document id (``doc[1][0]``), pickles
    the resulting ``{doc_id: vector}`` dict to ``inferred_vectors_dict.p`` and
    returns that dict.
    """
    doc_ids = [entry[1][0] for entry in documents]
    vectors_by_id = {doc_id: model.dv[doc_id] for doc_id in doc_ids}

    with open("inferred_vectors_dict.p", "wb") as cache_file:
        pickle.dump(vectors_by_id, cache_file)
    return vectors_by_id

In [5]:
def build_tfidf_df(documents):
    """Build a dense tf-idf DataFrame for the corpus and pickle it.

    Parameters
    ----------
    documents : sequence
        One entry per document; ``doc[0]`` is the word list and ``doc[1][0]``
        the document id.

    Returns
    -------
    pandas.DataFrame
        One row per document id, one column per vocabulary term.  The same
        frame is written to ``tfidf_df.p``.
    """
    corpus = [' '.join(x[0]) for x in documents]
    doc_ids = [x[1][0] for x in documents]

    vectorizer = TfidfVectorizer()
    tfidf_x = vectorizer.fit_transform(corpus)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; only fall back to it on releases that lack the *_out variant.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()

    # NOTE: this densifies the whole sparse matrix (n_docs x n_terms).
    df_countvect = pd.DataFrame(data=tfidf_x.toarray(), index=doc_ids, columns=vocabulary)
    with open("tfidf_df.p","wb") as tfidfp:
        pickle.dump(df_countvect, tfidfp)
    return df_countvect

In [6]:
def do_clustering(df, inferred, n_clusters=20, random_state=None):
    """Cluster the document vectors with k-means and attach the labels to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by document id.  An existing 'cluster' column is
        replaced rather than causing a column-overlap error in the join.
    inferred : dict
        ``{doc_id: vector}``; the vectors are clustered and the keys are used
        as the index of the resulting labels.
    n_clusters : int, optional
        Number of k-means clusters (default 20, the previously fixed value).
    random_state : int or None, optional
        Forwarded to ``KMeans``; pass an int for reproducible labels.

    Returns
    -------
    pandas.DataFrame
        ``df`` joined with a 'cluster' column.  The result is also pickled to
        ``tfidf_df.p``.  The input frame is not modified.
    """
    kmeans_model = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10,
                          random_state=random_state)
    kmeans_model.fit(np.array(list(inferred.values())))
    clust_df = pd.DataFrame({'cluster': kmeans_model.labels_}, index=list(inferred.keys()))

    # DataFrame.join refuses overlapping column names, so re-clustering a
    # frame that already carries labels must discard the stale column first.
    if 'cluster' in df.columns:
        df = df.drop(columns=['cluster'])
    new_df = df.join(clust_df)

    with open("tfidf_df.p","wb") as out_file:
        pickle.dump(new_df, out_file)
    return new_df

In [7]:
def add_genre_todf(df):
    """Join a numeric 'genre' column onto ``df`` and pickle the result.

    Genre ids come from ``test_with_genres_dict.json`` (the 'index' field of
    each entry, keyed by the same ids as ``df``'s index).  Rows without an
    entry receive genre 7.  The joined frame is written to ``tfidf_df.p`` and
    returned.
    """
    with open('test_with_genres_dict.json') as genre_file:
        genre_lookup = json.load(genre_file)

    track_ids = list(genre_lookup.keys())
    genre_ids = [entry['index'] for entry in genre_lookup.values()]
    genre_column = pd.DataFrame({'genre': genre_ids}, index=track_ids)

    labelled = df.join(genre_column)
    filled = labelled['genre'].replace(np.nan, 7)
    labelled['genre'] = pd.to_numeric(filled, downcast='signed')

    with open("tfidf_df.p", "wb") as out_file:
        pickle.dump(labelled, out_file)
    return labelled

In [8]:
def sentiment_analyzer(documents):
    """Score every document with VADER and cache the scores on disk.

    Parameters
    ----------
    documents : iterable
        One entry per document; ``doc[0]`` is the word list and ``doc[1][0]``
        the document id.

    Returns
    -------
    dict
        ``{doc_id: polarity_scores}`` where each value is whatever
        ``SentimentIntensityAnalyzer.polarity_scores`` returns for the
        document's distinct words joined by spaces.  The same dict is pickled
        to ``sia_scores.p``.
    """
    sia = SentimentIntensityAnalyzer()
    polarity_scores = {}
    for doc in documents:
        # De-duplicate but keep first-occurrence order: joining a bare set
        # handed the analyzer a differently ordered text on every run.
        unique_words = dict.fromkeys(doc[0])
        polarity_scores[doc[1][0]] = sia.polarity_scores(' '.join(unique_words))

    # Saved as "sia_scores.p", the file name compute_test_with_topwords loads.
    with open("sia_scores.p","wb") as siaf:
        pickle.dump(polarity_scores, siaf)
    return polarity_scores

#sia_scores = sentiment_analyzer(docs)

In [9]:
def compute_test_with_topwords(X_embedded=None, df_count_vect=None, new_clustering=False, new_tsne=False):
    """Assemble the embedding, cluster ids and top-word labels for the test tracks.

    Loads ``test_dict.p``, ``inferred_vectors_dict.p`` and ``sia_scores.p``
    (and ``tfidf_df.p`` when ``df_count_vect`` is not given).

    Parameters
    ----------
    X_embedded : array-like or None
        Existing 2-D embedding to reuse.  Recomputed with t-SNE from the test
        tracks' vectors when ``None`` or when ``new_tsne`` is true.
    df_count_vect : pandas.DataFrame or None
        tf-idf frame indexed by track id carrying 'cluster' and 'genre'
        columns; loaded from ``tfidf_df.p`` when ``None``.
    new_clustering : bool
        Re-run ``do_clustering`` on all stored vectors and use the new labels.
    new_tsne : bool
        Force a fresh t-SNE embedding even if ``X_embedded`` was supplied.

    Returns
    -------
    tuple
        ``(X_embedded, clustering, cluster_labels, genre_lab, test_dictionary,
        genre_topwords, X_sia)`` where ``clustering`` is the cluster id of each
        test track, ``cluster_labels`` / ``genre_topwords`` hold the 30
        highest mean tf-idf terms for cluster / genre ids 0..19,
        ``genre_lab`` is the printable form of ``genre_topwords`` and
        ``X_sia`` is the array of sentiment scores of the test tracks.
    """
    n_groups = 20  # number of cluster ids and genre ids that get a label
    meta_columns = ['cluster', 'genre']

    def _load_pickle(path):
        # NOTE: unpickling runs arbitrary code; only load files this notebook wrote.
        with open(path, "rb") as handle:
            return pickle.load(handle)

    test_dictionary = _load_pickle("test_dict.p")
    all_inferred = _load_pickle("inferred_vectors_dict.p")
    sia_scores = _load_pickle("sia_scores.p")
    if df_count_vect is None:
        df_count_vect = _load_pickle("tfidf_df.p")

    test_ids = list(test_dictionary.keys())
    genre_labels = {i['index']: i['name'] for i in test_dictionary.values()}

    # Sentiment scores and document vectors of the test tracks, in test_ids order.
    X_sia = np.array([list(sia_scores[doc_id].values()) for doc_id in test_ids])
    X = np.array([all_inferred[doc_id] for doc_id in test_ids])

    if new_tsne or X_embedded is None:
        X_embedded = TSNE(n_components=2, perplexity=100, learning_rate=200).fit_transform(X)

    if new_clustering:
        # Keep the re-clustered frame (it used to be bound to a misspelled
        # name and dropped), and remove stale labels first so the join inside
        # do_clustering cannot collide with an existing 'cluster' column.
        if 'cluster' in df_count_vect.columns:
            df_count_vect = df_count_vect.drop(columns=['cluster'])
        df_count_vect = do_clustering(df_count_vect, all_inferred)

    clustering = list(df_count_vect.loc[test_ids, 'cluster'].to_numpy())
    if new_clustering:
        print(set(clustering))

    def _top_words(column, value):
        # Mean tf-idf weight per term over the rows of one cluster/genre.  The
        # metadata columns are removed so their (large) means do not show up
        # among the top "words".
        means = df_count_vect[df_count_vect[column] == value].mean()
        return means.drop(meta_columns, errors='ignore').sort_values().iloc[-30:]

    cluster_labels = [_top_words('cluster', i) for i in range(n_groups)]
    genre_topwords = [_top_words('genre', i) for i in range(n_groups)]

    # Genres missing from the test set fall back to their numeric id.
    genre_lab = [genre_labels.get(i, str(i)) + ': ' + ' , '.join(list(genre_topwords[i].keys().to_numpy()))
                 for i in range(n_groups)]

    return X_embedded, clustering, cluster_labels, genre_lab, test_dictionary, genre_topwords, X_sia

In [10]:
# Palette of 20 distinct hex colours; position i is the colour used for
# cluster / genre id i by the drawing helpers.
colors = [
    '#e6194b', '#3cb44b', '#ffe119', '#ffd8b1', '#aaffc3',
    '#fffac8', '#808080', '#4363d8', '#f58231', '#911eb4',
    '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080',
    '#e6beff', '#808000', '#9a6324', '#000075', '#800000',
]


def draw_scatter(data, cs, n_clust=20):
    """Scatter-plot 2-D points on a new 25x25 figure, coloured by group id.

    ``data`` is a sequence of points (``point[0]`` is x, ``point[1]`` is y)
    and ``cs`` gives, for each point, an index into the module-level
    ``colors`` palette.  ``n_clust`` is accepted but not used.
    """
    plt.figure(figsize=(25, 25))
    xs = [point[0] for point in data]
    ys = [point[1] for point in data]
    point_colors = [colors[group] for group in cs]
    plt.scatter(xs, ys, color=point_colors, s=30)
    
def draw_legend(labels, n_clust=20):
    """Draw a colour legend on a new 25x10 figure.

    One square marker per group id ``0..n_clust-1`` is stacked vertically in
    that group's ``colors`` entry, with its label annotated next to it.

    Parameters
    ----------
    labels : sequence
        ``labels[i]`` is either a ready-made string or an object whose
        ``keys().to_numpy()`` yields strings (e.g. a pandas Series of top
        words); the latter is rendered as a comma-separated list.
    n_clust : int, optional
        Number of legend rows (default 20).
    """
    plt.figure(figsize=(25, 10))
    rows = range(n_clust)

    # color= already fixes each marker's colour; the cmap argument that used
    # to accompany it was ignored by matplotlib and only produced a warning.
    plt.scatter([-10] * n_clust, [i * 3 for i in rows],
                color=[colors[i] for i in rows], s=200, marker="s")
    # Small extra marker at the origin, kept as in the original layout
    # (presumably there to stretch the x-range for the text; verify).
    plt.scatter([0],[0],c=[0], cmap='tab20', s=5, marker="s")

    for i in rows:
        label = labels[i]
        txt = label if isinstance(label, str) else ', '.join(list(label.keys().to_numpy()))
        plt.annotate(txt, (-9.8,  i*3),fontsize=24, va='center')


In [11]:
# Use this function to initialize all data for the first time
def run_all_initial():
    """Build every artefact from scratch using the global ``tracks`` list.

    Trains and saves the Doc2Vec model, collects the document vectors, builds
    the tf-idf frame and adds the 'genre' and 'cluster' columns to it when
    they are missing.  Each helper writes its own cache file.

    Returns
    -------
    tuple
        ``(model, inferred_vectors_dict, df_countvect)``
    """
    model = train_model(tracks)
    # infer_vectors returns exactly what it pickles, so the dict is used
    # directly instead of being read back through a never-closed file handle.
    inferred_vectors_dict = infer_vectors(tracks, model)

    df_countvect = build_tfidf_df(tracks)
    if 'genre' not in df_countvect:
        df_countvect = add_genre_todf(df_countvect)
    if 'cluster' not in df_countvect:
        df_countvect = do_clustering(df_countvect, inferred_vectors_dict)
    return model, inferred_vectors_dict, df_countvect

In [12]:
# Use this function to reload the saved model, vectors and tf-idf frame; the 'genre' and 'cluster' columns are computed only if they are missing.
def run_all_secondary():
    """Reload previously saved artefacts instead of recomputing them.

    Loads the Doc2Vec model from ``doc2vec.model``, the document vectors from
    ``inferred_vectors_dict.p`` and the tf-idf frame from ``tfidf_df.p``; the
    'genre' and 'cluster' columns are added only when the loaded frame lacks
    them.

    Returns
    -------
    tuple
        ``(model, inferred_vectors_dict, df_countvect)``
    """
    model = Doc2Vec.load("doc2vec.model")

    # Context managers close the cache files once read (they used to be left
    # open).  NOTE: unpickling runs arbitrary code; only load files written by
    # this notebook.
    with open("inferred_vectors_dict.p", "rb") as vectors_file:
        inferred_vectors_dict = pickle.load(vectors_file)
    with open("tfidf_df.p", "rb") as tfidf_file:
        df_countvect = pickle.load(tfidf_file)

    if 'genre' not in df_countvect:
        df_countvect = add_genre_todf(df_countvect)
    if 'cluster' not in df_countvect:
        df_countvect = do_clustering(df_countvect, inferred_vectors_dict)
    return model, inferred_vectors_dict, df_countvect

In [13]:
def visualize_genre_clusters():
    """Plot the test tracks twice: coloured by cluster, then by genre.

    Fetches the embedding and labels from ``compute_test_with_topwords``
    (cached data, no re-clustering, no forced t-SNE) and draws a scatter plot
    followed by its legend for each of the two colourings.
    """
    (embedding, cluster_ids, cluster_words, genre_legend,
     test_tracks, _genre_words, _sentiment) = compute_test_with_topwords(
        X_embedded=None, df_count_vect=None, new_clustering=False, new_tsne=False)

    # First pair of figures: points coloured by k-means cluster.
    draw_scatter(embedding, cluster_ids)
    draw_legend(cluster_words)

    # Second pair: the same points coloured by each track's genre index.
    genre_ids = [entry['index'] for entry in test_tracks.values()]
    draw_scatter(embedding, genre_ids)
    draw_legend(genre_legend)