In [70]:
import os
import sys
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model
import pickle
import tensorflow_hub as hub
import matplotlib.pyplot as plt
from pprint import pprint
import re

In [21]:
#CONSTANTS
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
VALIDATION_SPLIT = 0.2
learning_rate = .0000001
max_grad_norm = 1.
dropout = 0.5
EMBEDDING_DIM = 200

In [83]:
# PATH CONSTANTS
PICKLE_ROOT = 'data/lyrics/'
CHRISTIAN_PATH = 'Christian.pickle'
POP_PATH = 'Pop.pickle'
ROCK_PATH = 'Rock.pickle'
COUNTRY_PATH = 'Country.pickle'
RAP_PATH = 'Rap.pickle'

LYRIC_PATHS = [CHRISTIAN_PATH,POP_PATH,ROCK_PATH,COUNTRY_PATH,RAP_PATH]

EMBEDDING_PATH = 'data/glove_embeddings/'
EMBEDDING_FILE = 'glove.6B.'+str(EMBEDDING_DIM)+'d.txt'

In [68]:
# Embedding
# Elmo could improve the word embeddings - need more research
# elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
if not os.path.exists(EMBEDDING_PATH+EMBEDDING_FILE):
    print('Embeddings not found, downloading now')
    ! cd EMBEDDING_PATH
    ! wget http://nlp.stanford.edu/data/glove.6B.zip
    ! unzip glove.6B.zip
    ! cd ../..c

glove_embeddings = {}
with open(EMBEDDING_PATH+EMBEDDING_FILE) as emb_f:
    for line in emb_f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_embeddings[word] = vector


In [84]:
# Pickle extraction
# pickle looks like -> pickle_lyrics['lyrics'][('song_title', 'artist')]['lyrics']
# or - > pickle_lyrics['genre']
pickle_lyrics = []
genre_index = {}
max_length = 0
for i,l_path in enumerate(LYRIC_PATHS):
    if not os.path.exists(PICKLE_ROOT+l_path):
        print('problem occured looking for %s' %(PICKLE_ROOT+l_path))
        sys.exit()
    print(os.getcwd()+PICKLE_ROOT+l_path)
    loaded_lyrics = pickle.load(open(PICKLE_ROOT+l_path, "rb" ))
    genre_index[loaded_lyrics['genre']] = i
    pickle_lyrics.append(loaded_lyrics)
    print(len(loaded_lyrics['lyrics']))
    for key, song_info in loaded_lyrics['lyrics'].items():
        if len(song_info['lyrics'].split()) > max_length:
            max_length = len(song_info['lyrics'].split())
            print(key)
            print(max_length)
            print(i)
print(len(pickle_lyrics))
print(genre_index)
# print(max_length)
# print(pickle_lyrics[0]['lyrics']['Cabin Essence: Chorus', 'The Beach Boys']['lyrics'])

/Users/Joe/Applications/OneDrive/School/Spring 2019 - senior/NLP/project/tag_my_lyricsdata/lyrics/Christian.pickle
10186
('Never Gonna Let Me Go', 'Tauren Wells')
334
0
('Forward Motion', 'Thousand Foot Krutch')
379
0
('Love Brought Me Back', 'Helen Baylor')
491
0
('Moments Like This', 'B.Reith')
502
0
('Independence Day', 'Whiteheart')
5957
0
('Pray For My Enemies', 'Bruce Carroll')
10828
0
('Great Things', 'Dallas Holm')
35571
0
('Open Heaven Lord', 'Steve Camp')
37985
0
('One Time In Each Forever', 'Wayne Watson')
50503
0
('Wisdom Way', 'Aaron Jeoffrey')
57599
0
("Big Man's Hat", 'Charlie Peacock')
84663
0
('Answer To Prayer', 'Bruce Carroll')
108479
0
('Carried Away (Safe On The Wings Of The Lord)', 'Whiteheart')
109095
0
('God Has Another Plan', 'Babbie Mason')
121583
0
('Star Of The Ages', 'Babbie Mason')
148554
0
('A Consistent Ethic Of Human Life', 'Derek Webb')
154678
0
/Users/Joe/Applications/OneDrive/School/Spring 2019 - senior/NLP/project/tag_my_lyricsdata/lyrics/Pop.pickle

In [91]:
def check_validity(data):
    valid_count = 0
    max_len_key = ''
    max_len = 0
    total_words = []
    for key, song_info in data['lyrics'].items():
        title, artist = key
        inner_title = song_info['title']
        inner_artist = song_info['artist']
        song_lyrics = song_info['lyrics']
        song_lyrics_norm = re.sub(r'[^a-zA-Z0-9-\']', ' ', song_lyrics).strip().lower()
        song_lyrics_split = song_lyrics_norm.split()         
        if title == inner_title and artist == inner_artist and len(song_lyrics_split) <= 2500:
            if len(song_lyrics_split) > max_len:
                max_len = len(song_lyrics_split)
                max_len_key = key
            valid_count+=1
            total_words = list(set(total_words+song_lyrics_split))
    print(max_len_key)
    print(max_len)
    return valid_count, total_words

for data in pickle_lyrics:
    print(data['genre'])
    total_songs = len(data['lyrics'])
    total_words_set = []
    valid, total_words = check_validity(data)
    total_words_set  = list(set(total_words_set+total_words))
    print(total_songs, ' : ', valid)
print(len(total_words_set))

Christian
('Price Tag', "Da' T.R.U.T.H.")
1104
10186  :  6895
Pop
('Tropico', 'Lana Del Rey')
1129
8618  :  5818
Rock
('The Real Slim Shady', 'Eminem')
1013
8054  :  5596
Country
('The Haircut Song', 'Ray Stevens')
877
7516  :  5368
Rap
('Mortal Man', 'Kendrick Lamar')
2234
8247  :  4747
51519


In [75]:
def strip_word(words): 
    word_list = []
    word_norm = re.sub(r'[^\w]', '', words).strip()
    if word_norm.isdigit():
        word_list.append(word_norm)
    else:
        word_norm = re.sub(r'[^a-zA-Z]', ' ', words).strip()
        word_list = word_norm.split(' ')
    return word_list

def clean_data(data):
    song_list = []
    for key, song_info in data['lyrics'].items():
        title, artist = key
        inner_title = song_info['title']
        inner_artist = song_info['artist']
        song_lyrics = song_info['lyrics']
        song_lyrics_norm = re.sub(r'[^a-zA-Z0-9-\']', ' ', song_lyrics).strip()
        song_lyrics_split = song_lyrics_norm.split()         
        if title == inner_title and artist == inner_artist and len(song_lyrics_split) <= 2500:       
            song_list.append(song_lyrics_norm)
            
    return song_list
            
# initial data pre-processing
# assuming a list of tokenized data 
# vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_document_len)
lyrics = []
lyrics_labels = []
for data in pickle_lyrics:
    genre = data['genre']
    for key, song_info in data['lyrics'].items():
        song_lyrics = song_info['lyrics']
        song_lyrics_norm = re.sub(r'[^a-zA-Z0-9-\']', ' ', song_lyrics).strip()
        song_lyrics_split = song_lyrics_norm.split() 
        print(song_lyrics)
        print()
        print(song_lyrics_norm)
        print()
        print(song_lyrics_split)
        sys.exit()
        lyrics_labels.append(genre_index[genre])



All my life, I've had questions
Who, what, when, where, why?
Wasting time looking for answers
Going out my mind
I wanted to see behind the scenes
But now I'm letting go
'Cause the safest place that I can be
Is in Your great unknown


I'm ready to chase impossible
Walking away from what I know
(In everything I know)
You're never gonna let me go
I'm tearing apart my master plan
Trading it for my master's hand
(In everything I know)
You're never gonna let me go


Let me go, You're never gonna let me go
Let me go, You're never gonna let me go


Take me out into Your waters
Let's get lost at sea (lost at sea)
I feel the voice of You my Father
Calm the storm in me
I'm not looking for the distant shore
Or how the story ends
My life is Yours, I'm living for
The moment that I'm in


I'm ready to chase impossible
Walking away from what I know
(In everything I know)
You're never gonna let me go
I'm tearing apart my master plan
Trading it for my master's hand
(In everything I know )
You're never 

SystemExit: 