In [1]:
# imports
import numpy as np
import pandas as pd

from tqdm import tqdm
import multiprocessing as mp

import lyricsgenius as lg

In [2]:
print("Number of processors: ", mp.cpu_count())

Number of processors:  8


In [3]:
genius=lg.Genius(access_token='Insert access_token',
                 skip_non_songs=True, remove_section_headers=True,sleep_time=0.1,verbose=False)

# Genre Selection

In [4]:
df = pd.read_csv('../data/track_audio_dataset_no_lyrics.csv', index_col=0)

In [5]:
df.genre.unique()

array(['acoustic', 'afrobeat', 'alt-rock', 'alternative', 'ambient',
       'anime', 'black-metal', 'bluegrass', 'blues', 'brazil',
       'breakbeat', 'british', 'cantopop', 'chicago-house', 'children',
       'chill', 'classical', 'club', 'comedy', 'country', 'dance',
       'dancehall', 'death-metal', 'deep-house', 'detroit-techno',
       'disco', 'disney', 'drum-and-bass', 'dub', 'dubstep', 'edm',
       'electro', 'electronic', 'emo', 'folk', 'forro', 'french', 'funk',
       'garage', 'german', 'gospel', 'goth', 'grindcore', 'groove',
       'grunge', 'guitar', 'happy', 'hard-rock', 'hardcore', 'hardstyle',
       'heavy-metal', 'hip-hop', 'honky-tonk', 'house', 'idm', 'indian',
       'indie', 'indie-pop', 'industrial', 'iranian', 'j-dance', 'j-idol',
       'j-pop', 'j-rock', 'jazz', 'k-pop', 'kids', 'latin', 'latino',
       'malay', 'mandopop', 'metal', 'metalcore', 'minimal-techno', 'mpb',
       'new-age', 'opera', 'pagode', 'party', 'piano', 'pop', 'pop-film',
       'pow

In [6]:
# Create df of selected genres to sample lyrics from
mask = (df['genre'] == 'alternative') | (df['genre'] == 'country') | (df['genre'] == 'edm') | (df['genre'] == 'hip-hop') | (df['genre'] == 'r-n-b') | (df['genre'] == 'rock')
df_subset = df[mask]
# Gather track and artists
tracks = df_subset.track_name.copy().to_list()
artists = df_subset.artist_name.copy().to_list()

In [7]:
len(df_subset)

3274

In [8]:
def lyric_generator(track_lst,artist_lst):
    """
    Takes two ordered, same-size lists of tracks and artists and
    returns a list of each track's lyrics.
    
    Parameters:
    -----------
    track_lst: list of song titles
    artist_lst: list of artist names
    
    Output:
    -------
    Returns a list of lyrics (strings) for each song passed in.
    """
    lyric_lst = []
    # Initialize API object
    genius=lg.Genius(access_token='Insert access_token',
                     skip_non_songs=True, 
                     remove_section_headers=True,
                     sleep_time=0.1,
                     verbose=False)
    
    # Iterate through tracks and store lyrics
    for t, a in tqdm(zip(track_lst, artist_lst)):
        song = genius.search_song(title = t,
                                  artist = a,
                                  get_full_info = False)
        if song is not None:
            lyric_lst.append(song.lyrics)
        else:
            lyric_lst.append('')
        
    return lyric_lst

In [17]:
def single_song_lyric_generator(genius_API, track, artist):
    """
    Takes a track and its corresponding artist and
    returns the song's lyrics using the provided
    genius API object. If there aren't any lyrics,
    it returns a blank string.
    
    Parameters:
    -----------
    genius_API: genius API object
    track: song title
    artist: artist name
    
    Output:
    -------
    Returns the lyrics (string) for the specified song.
    """
    # Pull song from API
    song = genius_API.search_song(title = track,
                                  artist = artist,
                                  get_full_info = False)
    if song is not None:
        return song.lyrics
    return ''

# Testing parallel and non-parallel methods and comparing results

In [12]:
# Initialize API object
genius=lg.Genius(access_token='Insert access_token',
                 skip_non_songs=True,
                 remove_section_headers=True,
                 sleep_time=0.1,
                 verbose=False)

# Initialize multiprocessing object to make multiple API calls at once
pool = mp.Pool(mp.cpu_count())

test_parallel = [pool.apply(single_song_lyric_generator, args=(genius,t,a)) for t, a in tqdm(zip(tracks[:5],artists[:5]))]

# Close object
pool.close()


0it [00:00, ?it/s][A
1it [00:01,  1.02s/it][A
2it [00:02,  1.04s/it][A
3it [00:03,  1.02s/it][A
4it [00:07,  2.00s/it][A
5it [00:08,  1.70s/it][A


In [11]:
test = lyric_generator(tracks[:5],artists[:5])


0it [00:00, ?it/s][A
1it [00:00,  1.16it/s][A
2it [00:01,  1.20it/s][A
3it [00:02,  1.24it/s][A
4it [00:03,  1.23it/s][A
5it [00:03,  1.26it/s][A


In [13]:
test == test_parallel

True

# Final Lyric Pull

In [15]:
# Initialize multiprocessing object to make multiple API calls at once
pool = mp.Pool(mp.cpu_count())

lyrics = [pool.apply(single_song_lyric_generator, args=(genius,t,a)) for t, a in tqdm(zip(tracks,artists))]

# Close object
pool.close()



0it [00:00, ?it/s][A[A

1it [00:00,  1.09it/s][A[A

2it [00:02,  1.03it/s][A[A

3it [00:03,  1.01it/s][A[A

4it [00:04,  1.01it/s][A[A

5it [00:05,  1.01s/it][A[A

6it [00:07,  1.33s/it][A[A

7it [00:10,  2.00s/it][A[A

8it [00:12,  2.03s/it][A[A

9it [00:14,  1.99s/it][A[A

10it [00:16,  1.94s/it][A[A

11it [00:18,  1.89s/it][A[A

12it [00:20,  1.96s/it][A[A

13it [00:22,  2.11s/it][A[A

14it [00:24,  2.10s/it][A[A

15it [00:27,  2.08s/it][A[A

16it [00:29,  2.11s/it][A[A

17it [00:31,  2.12s/it][A[A

18it [00:33,  2.07s/it][A[A

19it [00:35,  2.01s/it][A[A

20it [00:37,  2.05s/it][A[A

21it [00:39,  1.99s/it][A[A

22it [00:41,  2.12s/it][A[A

23it [00:43,  2.03s/it][A[A

24it [00:45,  1.92s/it][A[A

25it [00:47,  1.96s/it][A[A

26it [00:49,  2.04s/it][A[A

27it [00:51,  2.04s/it][A[A

28it [00:53,  1.97s/it][A[A

29it [00:58,  2.86s/it][A[A

30it [01:00,  2.61s/it][A[A

31it [01:03,  2.72s/it][A[A

32it [01:05,  2.48s/

259it [09:43,  1.97s/it][A[A

260it [09:45,  1.88s/it][A[A

261it [09:47,  1.83s/it][A[A

262it [09:50,  2.11s/it][A[A

263it [09:52,  2.14s/it][A[A

264it [09:55,  2.36s/it][A[A

265it [09:56,  2.18s/it][A[A

266it [09:58,  2.13s/it][A[A

267it [10:01,  2.17s/it][A[A

268it [10:03,  2.06s/it][A[A

269it [10:06,  2.33s/it][A[A

270it [10:09,  2.64s/it][A[A

271it [10:11,  2.39s/it][A[A

272it [10:14,  2.66s/it][A[A

273it [10:16,  2.40s/it][A[A

274it [10:18,  2.22s/it][A[A

275it [10:21,  2.46s/it][A[A

276it [10:23,  2.36s/it][A[A

277it [10:24,  2.09s/it][A[A

278it [10:27,  2.17s/it][A[A

279it [10:29,  2.21s/it][A[A

280it [10:31,  2.06s/it][A[A

281it [10:33,  2.08s/it][A[A

282it [10:35,  2.13s/it][A[A

283it [10:38,  2.33s/it][A[A

284it [10:40,  2.46s/it][A[A

285it [10:43,  2.35s/it][A[A

286it [10:45,  2.37s/it][A[A

287it [10:48,  2.54s/it][A[A

288it [10:50,  2.33s/it][A[A

289it [10:52,  2.15s/it][A[A

290it [1

515it [19:56,  2.08s/it][A[A

516it [19:58,  2.04s/it][A[A

517it [20:00,  2.03s/it][A[A

518it [20:03,  2.42s/it][A[A

519it [20:04,  1.91s/it][A[A

520it [20:06,  2.13s/it][A[A

521it [20:09,  2.20s/it][A[A

522it [20:12,  2.40s/it][A[A

523it [20:14,  2.46s/it][A[A

524it [20:16,  2.27s/it][A[A

525it [20:19,  2.44s/it][A[A

526it [20:21,  2.38s/it][A[A

527it [20:24,  2.47s/it][A[A

528it [20:26,  2.50s/it][A[A

529it [20:29,  2.50s/it][A[A

530it [20:31,  2.26s/it][A[A

531it [20:32,  1.94s/it][A[A

532it [20:33,  1.82s/it][A[A

533it [20:35,  1.92s/it][A[A

534it [20:37,  1.87s/it][A[A

535it [20:39,  1.83s/it][A[A

536it [20:41,  1.92s/it][A[A

537it [20:43,  1.87s/it][A[A

538it [20:46,  2.14s/it][A[A

539it [20:48,  2.27s/it][A[A

540it [20:50,  2.05s/it][A[A

541it [20:51,  1.98s/it][A[A

542it [20:53,  1.89s/it][A[A

543it [20:55,  1.85s/it][A[A

544it [20:58,  2.15s/it][A[A

545it [21:01,  2.39s/it][A[A

546it [2

Timeout: Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)