In [1]:
import json
import numpy as np
import pandas as pd
from time import sleep
import matplotlib.pyplot as plt
from tqdm.notebook import trange, tqdm
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from my_utils import *

In [2]:
# Single-character marker codes and colour names, one per series
# (presumably for matplotlib plots later in the notebook — TODO confirm usage).
markers = list('d*s+^xo')
colors = 'b g r c m y brown'.split()

In [3]:
# Locations of the pre-processed CSV inputs, relative to the notebook's working directory.
streaming_file, artists_file, tracks_file = (
    "data/processed/streaming.csv",
    "data/processed/artists.csv",
    "data/processed/tracks_with_lyrics.csv",
)

In [4]:
# Load the three processed tables; index_col=None keeps pandas' default integer index.
streaming_df, artists_df, tracks_df = (
    pd.read_csv(csv_path, index_col=None)
    for csv_path in (streaming_file, artists_file, tracks_file)
)

In [5]:
# `convert_to_datetime` is not defined in this notebook; presumably it arrives via
# the `from my_utils import *` star import and converts the named column to a
# datetime dtype — TODO confirm against my_utils.
streaming_df = convert_to_datetime(streaming_df, 'dateTime')
tracks_df = convert_to_datetime(tracks_df, 'timeAdded')

In [6]:
# Peek at the lyrics of the row labelled 1. A single `.loc[row, column]` lookup
# replaces the chained `.loc[1]['lyrics']` form, which builds the whole row
# Series first and only then indexes into it.
tracks_df.loc[1, 'lyrics']

'Sailing the Sunset ****, I\'m a bit of a king Granny would take a trip, I\'ve been bending the strings Got hammers in both my hands, such a delicate touch They say I\'m from Amsterdam, does that make me Dutch?   Please don\'t remember me, for what I did last night, oh Please don\'t remember me Lord and children Please don\'t remember me, it\'s only 1980 It\'s only 1983  Smoking the ****, feeling my own light My brother\'s a keeper, I married a TV wife The devil\'s Camaro, parked in thе high school lot A little sombrero \'cause tеacher was way too hot   Please don\'t remember me, for what I did last night, oh Please don\'t remember me Listen Lord now Please don\'t remember me, it\'s only 1980 It\'s only 1983   Tell my love "But leave me never" Can\'t complain about the weather Snowing at the rainbow, have a ball Cut my teeth down at the whiskey GTO\'s tried to kiss me One more song, they have seen it all     Please don\'t remember me, for what I did last night, oh Please don\'t remembe

* [Perplexity procedure](https://www.mathworks.com/help/textanalytics/ug/choose-number-of-topics-for-LDA-model.html)
* [Perplexity definition](http://qpleple.com/perplexity-to-evaluate-topic-models/)

In [8]:
import re
import gensim
from gensim.utils import simple_preprocess
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import wordpunct_tokenize as tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [9]:
# Vocabulary whitelist: every entry of the NLTK `words` corpus.
allowed_words = words.words()

# NLTK's English stop words plus hand-picked lyric filler. Entries such as
# 'dont', 'im' and 'ill' have no apostrophe because the lyric cleaning step
# strips apostrophes before tokenising.
stop_words = [
    *stopwords.words('english'),
    'song', 'intro', 'chorus', 'bridge', 'outro', 'ft', 'get', 'take',
    'oh', 'ooh', 'ee', 'lil', 'like', 'let', 'got', 'one', 'know', 'yeah',
    'go', 'might', 'could', 'never', 'mr', 'ms', 'dan', 'much', 'come', 'back',
    'wanna', 'would', 'yuh', 'la', 'le', 'make', 'remix', 'de', 'feat', 'na',
    'bring', 'ah', 'right', 'nothing', 'try', 'mm', 'aa', 'aah', 'nah', 'im',
    'huh', 'tell', 'spotify', 'dont', 'ill',
]

In [10]:
def format_lyrics_for_lda(tracks_df, allowed_words, stop_words):
    '''Clean the lyrics column of a dataframe for Latent Dirichlet Allocation.

    For every non-missing entry of ``tracks_df['lyrics']`` this:
      1. removes underscores, commas, periods, ``!``, ``?``, digits, asterisks
         (censored profanity shows up as runs of ``*``) and apostrophes,
      2. lower-cases the text,
      3. tokenises it, and
      4. keeps only tokens that are in ``allowed_words`` and not in ``stop_words``.

    Parameters
    ----------
    tracks_df : DataFrame with a ``lyrics`` column of strings (missing values are skipped).
    allowed_words : iterable of str, the vocabulary whitelist.
    stop_words : iterable of str, words to discard even if whitelisted.

    Returns
    -------
    list of str
        One space-joined string of surviving tokens per song, in the original row order.
    '''
    # Membership is tested once per token, so use hash lookups instead of
    # scanning the (potentially very long) input lists every time.
    allowed_lookup = frozenset(allowed_words)
    stop_lookup = frozenset(stop_words)
    unwanted_chars = re.compile('[_,\\.!?\\d*\']')

    new_lyrics = []
    for song in tqdm(tracks_df['lyrics'].dropna().tolist()):
        cleaned = unwanted_chars.sub('', song).lower()
        allowed_tokens = [
            t for t in tokenize(cleaned)
            if t in allowed_lookup and t not in stop_lookup
        ]
        new_lyrics.append(' '.join(allowed_tokens))
    return new_lyrics

In [11]:
# lda_lyrics = format_lyrics_for_lda(tracks_df, allowed_words, stop_words)

In [13]:
import os
import pickle

lda_lyrics_file = "data/processed/lda_input_lyrics.p"

# Cleaning the lyrics is slow, so the result is cached on disk. Load the cache
# when it exists; otherwise compute it and write it, so the notebook still runs
# from a fresh checkout instead of failing on a missing file.
if os.path.exists(lda_lyrics_file):
    # NOTE(security): unpickling executes arbitrary code — only load a cache
    # file that this notebook itself produced.
    with open(lda_lyrics_file, 'rb') as read_file:
        lda_lyrics = pickle.load(read_file)['lyrics']
else:
    lda_lyrics = format_lyrics_for_lda(tracks_df, allowed_words, stop_words)
    with open(lda_lyrics_file, 'wb') as write_file:
        pickle.dump({'lyrics': lda_lyrics}, write_file)

In [14]:
# Bag-of-words counts. With an integer min_df a term must occur in at least 40
# documents; with a float max_df it is dropped if it occurs in more than 60% of them.
count_vect = CountVectorizer(
    stop_words=stop_words,
    lowercase=True,
    min_df=40,
    max_df=0.6,
)
x_counts = count_vect.fit_transform(lda_lyrics)

In [15]:
# Re-weight the raw term counts with scikit-learn's TfidfTransformer (default settings).
tfidf_transformer = TfidfTransformer()
x_tfidf = tfidf_transformer.fit_transform(x_counts)

In [16]:
dimension = 25  # number of topics to extract

# LDA fitting starts from a random initialisation; pinning random_state makes
# the topics (and everything derived from them) reproducible across runs.
# NOTE(review): the model is fitted on tf-idf weights rather than raw counts;
# LDA is formulated for count data, so consider fitting on x_counts instead.
lda = LDA(n_components=dimension, random_state=0)
lda_array = lda.fit_transform(x_tfidf)

In [17]:
# One weight vector (over the vocabulary) per topic.
components = list(lda.components_)

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back for old versions.
# The result is kept as a plain list so list-only operations downstream still work.
if hasattr(count_vect, 'get_feature_names_out'):
    features = count_vect.get_feature_names_out().tolist()
else:
    features = count_vect.get_feature_names()

In [18]:
def get_essential_words(features, components, nwords = 5):
    '''Return (and print) the highest-weighted words of every topic.

    Parameters
    ----------
    features : sequence of words; ``features[i]`` is the word for column ``i``.
        May be a list or a NumPy array.
    components : sequence of per-topic weight vectors, each indexable by the
        same positions as ``features``.
    nwords : how many top words to keep per topic (fewer if the vocabulary is smaller).

    Returns
    -------
    list of lists: for each topic, its ``nwords`` words ordered by decreasing
    weight. Ties keep vocabulary order.
    '''
    important_words = []
    vocabulary_positions = range(len(features))
    for weights in components:
        # Rank positions rather than words: this avoids a linear
        # `features.index(word)` scan per comparison key and also works when
        # `features` has no `.index` method (e.g. a NumPy array).
        top_positions = sorted(
            vocabulary_positions, key=lambda i: weights[i], reverse=True
        )[:nwords]
        iw = [features[i] for i in top_positions]
        print(iw)
        important_words.append(iw)
    return important_words

In [19]:
# Top five terms per LDA topic (the helper also prints each list as it goes).
important_words = get_essential_words(features, components, nwords = 5)

['notice', 'river', 'expensive', 'supposed', 'drip']
['woman', 'happen', 'anything', 'met', 'night']
['way', 'care', 'somewhere', 'breathing', 'ne']
['da', 'young', 'black', 'jay', 'west']
['daylight', 'youve', 'ha', 'se', 'free']
['high', 'rat', 'die', 'live', 'amen']
['bye', 'beware', 'worried', 'runaway', 'awake']
['groove', 'came', 'doubt', 'spoke', 'bigger']
['ay', 'bloom', 'wonderful', 'following', 'series']
['di', 'stoner', 'son', 'velvet', 'sea']
['really', 'answer', 'bad', 'bit', 'sending']
['breathe', 'loose', 'waiting', 'pure', 'thank']
['th', 'sunshine', 'alright', 'bae', 'baby']
['closer', 'easily', 'laura', 'keep', 'wrap']
['guitar', 'horizon', 'pause', 'carried', 'lick']
['love', 'baby', 'time', 'want', 'see']
['low', 'dancing', 'glow', 'bout', 'prepared']
['tear', 'push', 'stretch', 'alive', 'paradise']
['ancient', 'sleep', 'funky', 'guitar', 'staring']
['un', 'tu', 'mi', 'en', 'te']
['waiting', 'shut', 'swallow', 'loving', 'lying']
['pa', 'four', 'three', 'frank', 'two

#### BERTopic

In [20]:
# Raw lyrics as a plain list, skipping tracks whose lyrics are missing.
lyrics = tracks_df['lyrics'].dropna().tolist()

In [21]:
from bertopic import BERTopic

In [22]:
# BERTopic model with library defaults; verbose=True enables progress logging.
model = BERTopic(verbose=True)

In [None]:
# Fit BERTopic on the raw lyrics. Per the BERTopic API this returns one topic
# id per document together with the associated probabilities.
topics, probabilities = model.fit_transform(lyrics)

Batches:   0%|          | 0/116 [00:00<?, ?it/s]

2022-10-12 18:14:48,843 - BERTopic - Transformed documents to Embeddings
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [None]:
# First 11 rows of BERTopic's topic-frequency table.
model.get_topic_freq().head(11)

In [None]:
# Inspect what BERTopic reports for topic 0.
model.get_topic(0)

In [None]:
# Sorted set of every space-separated token found in the artists' genre strings.
# Splitting on ' ' means a multi-word genre contributes one entry per word.
all_genres = sorted(
    set(' '.join(artists_df.dropna()['genres'].tolist()).split(' '))
)

In [None]:
def gen_artist_genres_df(artists_df, genres):
    '''Build a per-artist table of boolean genre flags.

    Rows of ``artists_df`` that contain any missing value are dropped first.

    Parameters
    ----------
    artists_df : DataFrame with at least a ``name`` column and a string
        ``genres`` column.
    genres : iterable of genre names to flag.

    Returns
    -------
    DataFrame indexed like the surviving rows of ``artists_df``, with an
    ``artistName`` column followed by one boolean column per entry of
    ``genres``. A flag is True when the genre name occurs as a literal,
    case-sensitive substring of the artist's ``genres`` text (so 'pop' also
    matches e.g. 'k-pop').
    '''
    artists_with_genres_df = artists_df.dropna()
    genres_df = artists_with_genres_df[['name']].rename(columns={"name": "artistName"})
    for genre in genres:
        # regex=False: genre names are plain text, so characters such as '+'
        # or '(' must not be interpreted as regular-expression syntax. The
        # vectorised match also replaces the former cell-by-cell `.at` loop.
        genres_df[genre] = artists_with_genres_df['genres'].str.contains(genre, regex=False)
    return genres_df

In [None]:
# Hand-picked genre names to flag for each artist.
genres = [
    'rock',
    'rap',
    'hip hop',
    'jazz',
    'house',
    'folk',
    'alternative',
    'pop',
    'r&b',
    'soul',
]

In [None]:
# Per-artist boolean genre flags, restricted to the hand-picked `genres` list.
artist_genres_df = gen_artist_genres_df(artists_df, genres)

In [None]:
def gen_initial_track_genres_df(tracks_df, artist_genres_df):
    '''Attach each artist's genre flags to that artist's tracks.

    Rows of ``tracks_df`` containing any missing value are dropped first.
    For every row of ``artist_genres_df`` (in order), each remaining track
    whose ``artistName`` equals that row's ``artistName`` yields one output
    row, in the track's original order.

    Parameters
    ----------
    tracks_df : DataFrame with ``uri``, ``trackName``, ``lyrics`` and
        ``artistName`` columns.
    artist_genres_df : DataFrame with an ``artistName`` column plus genre
        flag columns.

    Returns
    -------
    DataFrame with columns ``uri``, ``trackName``, ``lyrics`` followed by all
    columns of ``artist_genres_df``. It is empty (but has those columns) when
    no artist has a track.
    '''
    tracks_with_lyrics_df = tracks_df.dropna()
    artist_genre_columns = list(artist_genres_df.columns)
    track_columns = ['uri', 'trackName', 'lyrics']
    columns = track_columns + artist_genre_columns

    # Index the tracks by artist once instead of filtering the whole track
    # table for every artist. sort=False plus groupby's within-group ordering
    # keeps each artist's tracks in their original row order.
    tracks_by_artist = {
        artist_name: artist_tracks[track_columns].to_dict('records')
        for artist_name, artist_tracks in tracks_with_lyrics_df.groupby('artistName', sort=False)
    }

    # Collect plain dict records and build the frame once; growing a DataFrame
    # with pd.concat inside the loop copies every existing row on each append.
    records = []
    for genres_dict in artist_genres_df.to_dict('records'):
        for track_dict in tracks_by_artist.get(genres_dict['artistName'], ()):
            records.append({**track_dict, **genres_dict})
    return pd.DataFrame(records, columns=columns)

In [None]:
# One row per (artist, track) match, carrying the artist's genre flags.
initial_track_genres_df = gen_initial_track_genres_df(tracks_df, artist_genres_df)

In [None]:
# Number of rows flagged with each genre (flags cast to 0/1, then summed per column).
initial_track_genres_df[genres].astype(int).sum()