In [69]:
import os
import re
import pickle 
import itertools
from functools import partial

from lyricsgenius import Genius
from datasets import Dataset, DatasetDict
import numpy as np

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
# Module-level GPT-2 tokenizer; tokenize_function and create_lm_dataset below read it as a global.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [70]:
# The Genius API token is read from the environment so it is never stored in the notebook.
# NOTE(review): a token was previously hardcoded in this cell; treat it as leaked and revoke it.
TOKEN = os.environ.get("GENIUS_ACCESS_TOKEN")
if not TOKEN:
    raise RuntimeError(
        "Set the GENIUS_ACCESS_TOKEN environment variable to your Genius API token"
    )

# Shared API client used by get_all_songs.
genius = Genius(TOKEN)
genius.skip_non_songs = True
genius.timeout = 10
genius.retries = 3

In [71]:
def check_artist_path(artist, override = False):
    """
    Locate the cached lyrics pickle for an artist.

    The file name is the lower-cased artist name with spaces turned into
    hyphens, under ../artists/.

    Returns:
    -The relative path of the pickle when it exists and override is falsy
    -None otherwise
    """
    slug = artist.lower().replace(' ', '-')
    pickle_path = f'../artists/{slug}.p'

    # A forced rebuild or a missing file both mean "no usable cache".
    if override or not os.path.exists(pickle_path):
        return None

    print("Lyrics have already been created for this artist")
    return pickle_path
# Quick check: path of the cached pickle for this artist, or None when there is no cache file.
artist_path = check_artist_path('Taylor Swift')

Lyrics have already been created for this artist


In [72]:
def get_all_songs(artist):
    """
    Search Genius for an artist and derive a model name from the artist URL.

    Parameters:
    -artist: name to search for with the module-level `genius` client

    Returns:
    -(artist_found, model_name): the object returned by genius.search_artist
      and the lower-cased last path segment of its url

    Raises:
    -ValueError when the search returns no artist
    """
    artist_found = genius.search_artist(artist)
    if artist_found is None:
        # Fail with a clear message instead of an AttributeError on `.name` below.
        raise ValueError(f"No Genius artist found for {artist!r}")

    if artist_found.name.lower() != artist.lower():
        print(f"Please note that the artist requested was {artist}")
        print(f"However, the artist used to find search results is {artist_found.name}")
        print("If this seems incorrect, please go back to source code and review!")

    # Everything after the final '/' of the artist URL, lower-cased.
    model_name = artist_found.url.rsplit('/', 1)[-1].lower()
    print(artist_found.id)
    return artist_found, model_name
# ls, model_name = get_all_songs('Laura Stevenson')

In [73]:
def limit_songs(artist_found):
    """
    Keep only the songs whose primary artist name contains the artist's name.

    The comparison is case-insensitive. The song count is printed before and
    after filtering. `artist_found.songs` is replaced in place and the same
    object is returned.
    """
    all_songs = artist_found.songs
    print(len(all_songs))

    wanted = artist_found.name.lower()
    artist_found.songs = [
        song for song in all_songs
        if wanted in song.primary_artist.name.lower()
    ]

    # Report the filtered count (the original re-printed the unfiltered length).
    print(len(artist_found.songs))
    return artist_found
# ls2 = limit_songs(ls)

In [74]:
def standardize_song_name(name):
    """
    Normalize a song title so that variants of the same song share one key.

    Removes zero-width spaces, drops any "(...)" and "[...]" segments, then
    trims surrounding whitespace and lower-cases the result.
    """
    # Drop zero-width spaces first: str.strip() does not treat them as whitespace,
    # so removing them afterwards could leave stray leading/trailing spaces.
    name = name.replace('\u200b', '')
    name = re.sub(r'\([^)]*\)', '', name)
    # Exclude ']' inside the class so each bracketed tag is removed on its own
    # instead of matching from the first '[' to the last ']'.
    name = re.sub(r'\[[^\]]*\]', '', name)
    return name.strip().lower()

def clean_lyrics(lyrics):
    """
    Strip section tags, page boilerplate and stray symbols from raw lyrics text.

    The substitutions run in a fixed order; later steps rely on earlier ones
    (for example the header removal runs after bracketed tags are gone).
    Prints a warning when the word "lyrics" still appears in the first 100
    characters, and dumps the text when "You might also like" survives.

    Parameters:
    -lyrics: raw lyrics string

    Returns:
    -The cleaned lyrics string
    """
    # Bracketed section tags such as [Verse 1]
    lyrics = re.sub(r'(\[.*?\])*', '', lyrics)
    lyrics = re.sub(r'\n{2}', '\n', lyrics)  # Gaps between verses

    # A standalone "You might also like" line is dropped but the line break is kept,
    # so the lyric lines around it are not glued together. Two passes cover the
    # case where the first pass leaves a second standalone line behind.
    lyrics = re.sub('\nYou might also like\n', '\n', lyrics)
    lyrics = re.sub('\nYou might also like\n', '\n', lyrics)
    # "You might also like" fused onto the start of the following text
    lyrics = re.sub(r'(You might also like)([\S]+)', r'\g<2>', lyrics)

    # Want to get consistent apostrophes to get consistent treatment of contractions
    lyrics = lyrics.replace("'", "’")

    lyrics = lyrics.strip("\n")
    lyrics = lyrics.replace("EmbedShare URLCopyEmbedCopy", "")
    # Anything left in round or square brackets on a single line
    lyrics = re.sub(r"[\(\[].*?[\)\]]", "", lyrics)
    # Trailing digits at the very end of the text
    lyrics = re.sub(r'\d+$', '', lyrics)
    lyrics = lyrics.strip()
    lyrics = lyrics.replace("\n\n", "\n")
    lyrics = lyrics.replace("\n\n", "\n")
    lyrics = re.sub(' +', ' ', lyrics)
    lyrics = lyrics.replace('"', "")
    lyrics = lyrics.replace("*", "")

    # Remove text at front of lyrics, claiming this is start of lyrics
    # NOTE(review): the match is greedy, so it removes everything up to the LAST
    # " Lyrics" in the text, not just a leading header line - confirm this is intended.
    lyrics = re.sub(r'(^[\s\S]* Lyrics[\n]*)', '', lyrics)
    lyrics = re.sub(r'(^[\s\S]* Lyrics\n)', '', lyrics)
    if 'lyrics' in lyrics[:100].lower():
        print("Lyrics are not properly cleaned!")

    # Remove some random symbols that represent spaces
    lyrics = re.sub('\u2005', ' ', lyrics)

    # Remove non-lyrics text that sometimes appears in lyrics
    lyrics = re.sub(r'\d*Embed$', '', lyrics)
    lyrics = re.sub('(You might also like$)', '', lyrics)
    if 'You might also like' in lyrics:
        print("Check out lyrics here...")
        print(lyrics)
        print()
    return lyrics
    
def get_lyrics(artist_found):
    """
    Build a {standardized title: cleaned lyrics} dict from artist_found.songs.

    When several songs standardize to the same title, only the first one
    encountered is kept; later duplicates are skipped without being cleaned.
    """
    lyrics_by_title = {}
    for track in artist_found.songs:
        key = standardize_song_name(track.title)
        if key in lyrics_by_title:
            continue
        lyrics_by_title[key] = clean_lyrics(track.lyrics)
    return lyrics_by_title
# lyric_dict = get_lyrics(ls2)

In [75]:
def write_pickle(artist, lyric_dict):
    """
    Pickle `lyric_dict` to ../artists/<artist-slug>.p.

    The slug is the lower-cased artist name with spaces replaced by hyphens.
    The ../artists directory is created when missing, so a long scrape is not
    lost to a FileNotFoundError at write time.
    """
    artist_f_id = '-'.join(artist.lower().split(' '))
    out_dir = '../artists'
    os.makedirs(out_dir, exist_ok=True)
    with open(f'{out_dir}/{artist_f_id}.p', 'wb') as handle:
        pickle.dump(lyric_dict, handle)

In [76]:
def get_and_save_artist_lyrics(artist, override = False):
    """
    Load an artist's lyrics from the pickle cache, or fetch, clean and cache them.

    Parameters:
    -artist: artist name, used for the search and for the cache file name
    -override: when truthy, ignore any cached pickle and rebuild it

    Returns:
    -On a cache hit, whatever object was pickled for this artist
    -Otherwise a (lyric_dict, model_name) tuple, where lyric_dict holds a
        title : lyric
      pair for all songs kept for the artist; that same tuple is what gets pickled
    """
    cached_path = check_artist_path(artist, override = override)
    cache_usable = bool(cached_path) and not override
    if cache_usable:
        # NOTE: pickle.load can execute code embedded in the file; only load trusted caches.
        with open(cached_path, 'rb') as handle:
            return pickle.load(handle)

    print(f"Getting songs for {artist}...")
    artist_found, model_name = get_all_songs(artist)

    print("Limiting songs...")
    artist_found = limit_songs(artist_found)

    print("Extracting and cleaning lryics...")
    lyric_dict = get_lyrics(artist_found)

    print("Writing to pickle")
    result = (lyric_dict, model_name)
    write_pickle(artist, result)

    return result

In [77]:
def create_prelim_dataset(lyric_dict):
    """
    Split the lyrics into train / valid / test datasets with a 'text' column.

    Songs are taken in the dict's iteration order (no shuffling): the first
    85% go to 'train', the rest up to 100% go to 'valid', and whatever is
    left after that goes to 'test'.

    Parameters:
    -lyric_dict: mapping whose values are the lyric strings

    Returns:
    -A DatasetDict with 'train', 'valid' and 'test' entries
    """
    lyrics = list(lyric_dict.values())
    n_songs = len(lyrics)

    train_percentage = 0.85
    validation_percentage = 0.15

    # Plain list slicing; no throwaway Dataset or numpy array is needed to find the cut points.
    train_end = int(n_songs * train_percentage)
    valid_end = int(n_songs * (train_percentage + validation_percentage))

    return DatasetDict(
        {
            'train': Dataset.from_dict({'text': lyrics[:train_end]}),
            'valid': Dataset.from_dict({'text': lyrics[train_end:valid_end]}),
            'test': Dataset.from_dict({'text': lyrics[valid_end:]}),
        }
    )

# ls_datasets = create_prelim_dataset(ls_lyrics)

In [78]:
def regroup_text(examples, block_size):
    """
    Concatenate every column of a batch and re-cut it into fixed-size blocks.

    Each value in `examples` is a list of sequences. They are flattened per
    key, and every key is split into consecutive blocks of `block_size`
    items. The number of blocks is determined by the flattened 'input_ids'
    length; the trailing remainder that does not fill a block is dropped.

    Parameters:
    -examples: mapping of column name -> list of sequences; must contain 'input_ids'
    -block_size: length of each output block

    Returns:
    -A dict with the same keys plus 'labels', an independent copy of the
      'input_ids' blocks
    """
    combined = {
        k: list(itertools.chain.from_iterable(examples[k]))
        for k in examples.keys()
    }
    n_blocks = len(combined['input_ids']) // block_size

    new_dict = {
        k: [v[i * block_size:(i + 1) * block_size] for i in range(n_blocks)]
        for k, v in combined.items()
    }

    # Copy each block so that mutating labels can never alter input_ids.
    new_dict['labels'] = [list(block) for block in new_dict['input_ids']]

    return new_dict

In [79]:
def tokenize_function(examples):
    """Run the module-level tokenizer over the 'text' entry of `examples`."""
    texts = examples['text']
    return tokenizer(texts)

def create_lm_dataset(prelim_dataset):
    """
    Tokenize a text dataset and regroup the tokens into fixed-size blocks.

    First maps tokenize_function over `prelim_dataset` (batched, dropping the
    'text' column), then maps regroup_text over the result with a block size
    of one quarter of the tokenizer's model_max_length.

    Returns:
    -The result of the second map call
    """
    tokenized = prelim_dataset.map(tokenize_function, batched = True, remove_columns = ['text'])

    # Blocks are a quarter of the tokenizer's maximum model length.
    chunk_len = int(tokenizer.model_max_length / 4)

    map_options = {'batched': True, 'batch_size': 100, 'num_proc': 1}
    return tokenized.map(partial(regroup_text, block_size = chunk_len), **map_options)

# lm_datasets = create_lm_dataset(ls_datasets)

In [80]:
def save_datasets(ls_datasets, model_name):
    """
    Pickle `ls_datasets` to ../models/<model_name>/datasets.p.

    The target folder, including a missing ../models parent, is created when
    needed. Returns None.
    """
    folder_path = f'../models/{model_name}'
    # makedirs also creates the ../models parent and tolerates an existing folder,
    # which os.mkdir behind an exists() check did not.
    os.makedirs(folder_path, exist_ok=True)
    new_file = os.path.join(folder_path, 'datasets.p')

    with open(new_file, 'wb') as handle:
        pickle.dump(ls_datasets, handle)

# save_datasets(ls_datasets, model_name)

In [86]:
# `artist` must be assigned in this cell: with the assignment commented out the
# call below only worked off a leftover kernel variable and raises NameError on
# Restart & Run All.
# NOTE(review): 'Kendrick Lamar' is taken from the previously commented-out line -
# confirm it is the artist this run was meant to use.
artist = 'Kendrick Lamar'
ls_lyrics, model_name = get_and_save_artist_lyrics(artist)
# ls_datasets = create_prelim_dataset(ls_lyrics)
# lm_datasets = create_lm_dataset(ls_datasets)
# save_datasets(lm_datasets, model_name)

Lyrics have already been created for this artist
