In [55]:
import requests
from pymongo import MongoClient
from urllib.parse import urljoin
from pprint import pprint
import nltk
import json

# Retrieve Lyrics Via API

In [56]:
# DB CONFIG
# Connect to the MongoDB server under host name 'mongo' on port 27017 and keep
# handles to the music database and its songs collection for the cells below.
client = MongoClient(host='mongo', port=27017)
db = client['music_db']
songs_collection = db['songs']

In [57]:
# Pull all songs from DB
# The empty filter matches every document; list() drains the cursor so the
# documents stay in memory for the following cells.
songs = list(songs_collection.find({}))

In [58]:
def has_lyrics(song):
    """Tell whether `song` carries a usable lyrics payload.

    Args:
        song: Mapping for one song document; its 'lyrics' entry is inspected.

    Returns:
        False when 'lyrics' is missing or empty, when it is a value without a
        ``.get`` method (for example the placeholder string 'null' or any
        other plain string), or when the payload has a truthy 'error' entry.
        True otherwise.
    """
    lyrics = song.get('lyrics')

    # Missing key, None, '' and {} all mean "no lyrics".
    if not lyrics:
        return False

    # Strings such as 'null' have no .get(); calling it would raise
    # AttributeError, so treat every such value as "no lyrics".
    if not hasattr(lyrics, 'get'):
        return False

    # A payload that recorded an error is not usable either.
    return not lyrics.get('error')

# Keep only the songs for which has_lyrics() reports a usable payload.
songs = list(filter(has_lyrics, songs))

# Clean Lyrics

In [59]:
import re
import string
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords
from typing import List

# English stop words, stored as a set; is_stopword() tests membership in it.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded
# beforehand -- confirm.
STOPWORDS = set(stopwords.words('english'))
# Shared stemmer and lemmatizer instances used by tokenize().
# NOTE(review): the WordNet lemmatizer presumably needs the 'wordnet' corpus
# to be downloaded beforehand -- confirm.
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

def clean(word):
    """Return `word` without ASCII punctuation characters, in lowercase.

    Every character found in ``string.punctuation`` is dropped; the remaining
    characters are joined back together and lowercased. A word made up only
    of punctuation therefore becomes the empty string.
    """
    stripped = "".join(ch for ch in word if ch not in string.punctuation)
    return stripped.lower()


def tokenize_lyrics(lyrics: str) -> List[str]:
    """Split raw lyrics into whitespace-delimited words.

    Text between square brackets is removed first. The remaining text is
    split on whitespace, and every token containing '-' is further split on
    the hyphens. Empty fragments (from leading, trailing or doubled hyphens)
    are dropped.

    Args:
        lyrics: The lyrics as a single string.

    Returns:
        The words in their original order, with case and punctuation other
        than the removed hyphens left as they were.
    """
    replacement_patterns = (r'\[.*?\]',    # Between Brackets
                           )
    lyrics_clean = re.sub('|'.join(replacement_patterns), '', lyrics)
    tokenizer = WhitespaceTokenizer()

    # Split words with hyphens in one linear pass. str.split('-') returns the
    # word itself when it has no hyphen, so every token takes the same path,
    # and no list is rebuilt or searched while it is being iterated.
    tokens = []
    for word in tokenizer.tokenize(lyrics_clean):
        tokens.extend(part for part in word.split('-') if part != '')
    return tokens


def is_stopword(word: str) -> bool:
    """Return True when `word` is a member of the module-level STOPWORDS set.

    The test is an exact match: `word` is not lowercased or otherwise
    normalised here, so callers must pass it in the form they want compared.
    """
    return word in STOPWORDS


def tokenize(lyrics):
    """Turn raw lyrics into a list of normalised, stop-word-free tokens.

    Pipeline: split with tokenize_lyrics(), drop stop words, strip
    punctuation and lowercase with clean(), drop empty tokens and stop words
    again, then lemmatize, stem and lemmatize once more.

    Args:
        lyrics: The lyrics as a single string.

    Returns:
        List of processed tokens in their original order. The list never
        contains empty strings.
    """
    tokens = tokenize_lyrics(lyrics)

    # First stop-word pass, before punctuation is removed: the stop-word list
    # holds contractions with their apostrophe (e.g. "don't"), which could no
    # longer match once clean() has turned the token into "dont". Only
    # surrounding punctuation is stripped for this comparison.
    tokens = [token for token in tokens
              if not is_stopword(token.lower().strip(string.punctuation))]

    tokens = map(clean, tokens)

    # clean() reduces punctuation-only tokens such as "..." to '', so drop
    # those here together with the remaining stop words.
    tokens = [word for word in tokens if word and not is_stopword(word)]

    tokens = map(lemmatizer.lemmatize, tokens)
    tokens = map(stemmer.stem, tokens)
    tokens = map(lemmatizer.lemmatize, tokens)
    return list(tokens)


def add_tokenization_to_track(track):
    """Attach the tokenised lyrics to `track` and return the same object.

    Reads the text at track['lyrics']['result']['track']['text'], runs it
    through tokenize() and stores the result under track['lyrics']['tokens'].
    `track` is modified in place.
    """
    lyrics_entry = track['lyrics']
    raw_text = lyrics_entry['result']['track']['text']
    lyrics_entry['tokens'] = tokenize(raw_text)
    return track
    
# Tokenise every remaining song; each song dict gains a 'tokens' entry in place.
tokenizations = [add_tokenization_to_track(song) for song in songs]