In [1]:
import re
import string
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords
from typing import List
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
from IPython.display import clear_output
import tqdm


# English stopword list from the NLTK corpus, held as a set for membership
# tests in is_stopword().
STOPWORDS = set(stopwords.words('english'))
# Shared stemmer / lemmatizer instances used by tokenize().
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

def clean(word):
    """Strip every `string.punctuation` character from `word` and lowercase it.

    Only the ASCII punctuation listed in `string.punctuation` is removed;
    any other character is kept. The result may be an empty string.
    """
    # A deletion-only translation table removes the same characters that a
    # per-character filter would.
    without_punctuation = word.translate(str.maketrans('', '', string.punctuation))
    return without_punctuation.lower()


def tokenize_lyrics(lyrics: str):
    """Split raw lyrics into a list of words.

    Text enclosed in square brackets on a single line is removed first
    (the pattern does not span newlines). The remainder is split on
    whitespace, and every token is split again on hyphens. Empty fragments
    produced by leading, trailing or doubled hyphens are discarded.

    Args:
        lyrics: Raw lyrics text.

    Returns:
        List of non-empty word strings in their original order. Case and
        all punctuation other than hyphens are left untouched.
    """
    replacement_patterns = (r'\[.*?\]',    # Between Brackets
                           )
    lyrics_clean = re.sub('|'.join(replacement_patterns), '', lyrics)
    tokenizer = WhitespaceTokenizer()

    # Build the result in a single pass. The previous approach located each
    # hyphenated word with list.index() and rebuilt the list by slicing while
    # iterating over the old list, which is quadratic in the number of tokens.
    tokens = []
    for word in tokenizer.tokenize(lyrics_clean):
        tokens.extend(piece for piece in word.split('-') if piece)
    return tokens


def is_stopword(word: str):
    """Tell whether `word` is a member of the module-level STOPWORDS collection."""
    listed = word in STOPWORDS
    return listed


def tokenize(lyrics, stopwords=True, lemandstem=True):
    """Turn raw lyrics into a list of normalised tokens.

    Pipeline: tokenize_lyrics() -> clean() each token -> drop empty tokens
    -> optionally drop stopwords -> optionally lemmatize, stem, lemmatize.

    Args:
        lyrics: Raw lyrics text.
        stopwords: When true, tokens found in STOPWORDS are removed. The
            name shadows the imported `stopwords` corpus inside this
            function only; it is kept so keyword callers keep working.
        lemandstem: When true, each token is passed through
            lemmatizer.lemmatize, stemmer.stem and lemmatizer.lemmatize,
            in that order.

    Returns:
        List of token strings, never containing the empty string produced
        by cleaning a punctuation-only token.
    """
    # A token made only of punctuation (e.g. "...") becomes '' after clean().
    # Drop those here, consistent with tokenize_lyrics() discarding empties.
    cleaned = (clean(token) for token in tokenize_lyrics(lyrics))
    tokens = [token for token in cleaned if token]

    if stopwords:
        # The check runs on the cleaned form, i.e. after punctuation removal.
        tokens = [token for token in tokens if not is_stopword(token)]

    if lemandstem:
        tokens = [
            lemmatizer.lemmatize(stemmer.stem(lemmatizer.lemmatize(token)))
            for token in tokens
        ]
    return tokens


def add_tokenization_to_track(track):
    """Attach the raw lyrics text and its token list to `track` in place.

    Reads the text at track['lyrics']['result']['track']['text'], stores it
    under 'raw_lyrics', stores tokenize(text) under 'tokens', and returns
    the same `track` object.
    """
    lyrics_text = track['lyrics']['result']['track']['text']
    track['raw_lyrics'] = lyrics_text
    track['tokens'] = tokenize(lyrics_text)
    return track

def lyric_emotion(song):
    """Average the VADER compound sentiment score over the lines of a song.

    Args:
        song: Mapping holding the lyrics text at
            song['lyrics']['result']['track']['text'].

    Returns:
        Mean of the per-line 'compound' scores. Every line produced by
        splitting on "\\n" is scored and counted in the denominator,
        including blank ones.

    Raises:
        KeyError: if any key on the lyrics path is missing.
    """
    lines = song['lyrics']['result']['track']['text'].split("\n")

    # The analyzer does not depend on the line being scored, so build it
    # once per call rather than once per lyric line.
    analyzer = SentimentIntensityAnalyzer()

    compound_total = 0
    for line in lines:
        compound_total += analyzer.polarity_scores(line)['compound']

    # str.split always yields at least one element, so len(lines) >= 1.
    return compound_total / len(lines)   # AVG compound score


def update_song_emotion(song):
    """Store lyric_emotion(song) under song['emotion'] and return `song`."""
    score = lyric_emotion(song)
    song['emotion'] = score
    return song

def parallel_update_song_emotions(songs):
    """Compute the 'emotion' score of every song using a process pool.

    Args:
        songs: Sized collection of song mappings accepted by
            update_song_emotion(); len(songs) sets the progress-bar total.

    Returns:
        List of the songs returned by the workers, in completion order
        (imap_unordered), which may differ from the input order.
    """
    # Imported locally so the function works even when the cell that imports
    # Pool at module level has not been run yet.
    from multiprocessing import Pool

    updated = []
    with Pool() as pool:
        results = pool.imap_unordered(update_song_emotion, songs)
        for song in tqdm.tqdm(results, total=len(songs)):
            # Clear the cell output on each result so progress lines do not pile up.
            clear_output(wait=True)
            updated.append(song)
    return updated



In [10]:
from pymongo import MongoClient
from pprint import pprint
from typing import List
from functools import reduce
from operator import add
from multiprocessing import Pool
from operator import add

# DB CONFIG
# Connects to the MongoDB server at host 'mongo', port 27017, and selects
# the `music_db` database.
client = MongoClient('mongo', 27017)
db = client.music_db
# Names of the per-genre source collections. Note that 'rap-song' is
# singular, unlike the others; the recorded counts output of this notebook
# lists a collection under exactly that name.
COLLECTIONS =   ['pop-songs',
                 'latin-songs',
                 'country-songs',
                 'rock-songs',
                 'jazz-songs',
                 'christian-songs',
                 'rap-song']

def has_spotify(song):
    """Return the song's Spotify audio features, or None when unavailable.

    Args:
        song: Song mapping that may carry a 'spotify' entry.

    Returns:
        The value stored at song['spotify']['audio_features'] (truthy when
        features are present), or None when the 'spotify' entry is missing
        or is not a mapping-like object (e.g. None or a placeholder string).
    """
    spotify = song.get('spotify')
    # A present-but-unusable value (None, 'null', ...) has no .get(); treat
    # it like a missing entry instead of raising AttributeError.
    if not hasattr(spotify, 'get'):
        return None
    return spotify.get('audio_features')

def has_lyrics(song):
    """Report whether `song` carries a usable lyrics entry.

    Returns False when the 'lyrics' entry is missing or falsy, equals the
    string 'null', or holds a truthy 'error' field; True otherwise.
    """
    lyrics = song.get('lyrics')
    if not lyrics or lyrics == 'null':
        return False
    # A truthy 'error' field marks the lyrics entry as unusable.
    return not lyrics.get('error')


def update_collections(collection_names, require_spotify=True, require_lyrics=True):
    """Load songs from the given collections, enrich them and filter them.

    Every document gets a 'genre' field set to its collection name. When
    `require_spotify` is true and has_spotify() passes, the audio features
    are merged into the top level of the song. When `require_lyrics` is true
    and has_lyrics() passes, add_tokenization_to_track() is applied. With a
    flag set to False the matching enrichment is skipped as well.

    Args:
        collection_names: Iterable of collection names to read from `db`.
        require_spotify: Keep only songs for which has_spotify() is truthy.
        require_lyrics: Keep only songs for which has_lyrics() is truthy.

    Returns:
        List of the songs that satisfy every enabled requirement.
    """
    kept = []
    for genre in collection_names:
        for position, song in enumerate(db[genre].find()):
            song['genre'] = genre

            if require_spotify and has_spotify(song):
                audio_features = song['spotify']['audio_features']
                song.update(audio_features)

            if require_lyrics and has_lyrics(song):
                add_tokenization_to_track(song)
                print("Added", position, end=" ")
                clear_output(wait=True)

            # Skip songs that fail an enabled requirement (spotify is checked first).
            if require_spotify and not has_spotify(song):
                continue
            if require_lyrics and not has_lyrics(song):
                continue
            kept.append(song)

    return kept





# Number of documents per genre collection. count_documents({}) counts on
# the server, so the documents themselves are not transferred to the client
# just to be counted (requires PyMongo >= 3.7).
{name: db[name].count_documents({}) for name in COLLECTIONS}

#songs = update_collections(COLLECTIONS, False, False) 

{'pop-songs': 3374,
 'latin-songs': 4126,
 'country-songs': 9722,
 'rock-songs': 4560,
 'jazz-songs': 1596,
 'christian-songs': 4480,
 'rap-song': 5914}

In [47]:
# NOTE(review): `songs` is read here, but no visible cell assigns it (the only
# assignment in view is commented out), so this relies on kernel state and
# will fail on a fresh Restart & Run All. The name is also rebound to the
# result, so re-running this cell feeds the previous output back in.
songs = parallel_update_song_emotions(songs)



100%|██████████| 356/356 [00:41<00:00,  8.54it/s][A[A

In [48]:
# Handle for the merged 'all-songs' collection; it is obtained before the
# drop and reused for the insert afterwards.
collection = db['all-songs']
# Destructive: removes any existing 'all-songs' collection before it is repopulated.
db.drop_collection("all-songs")

def insert_all_songs(songs, collection):
    """Bulk-insert `songs` into `collection`.

    Args:
        songs: Iterable of song documents.
        collection: Object exposing insert_many(documents), e.g. a PyMongo
            collection.

    Returns:
        None. Nothing is sent to the collection when `songs` is empty.
    """
    documents = list(songs)
    # insert_many rejects an empty document list, so an empty batch is a no-op.
    if not documents:
        return
    collection.insert_many(documents)

# Write the processed songs into the 'all-songs' collection dropped above.
insert_all_songs(songs, collection)