# Scraping lyrics.com

In [2]:
import requests
from bs4 import BeautifulSoup as soup
import os
import re

### Prepare functions for scraping lyrics of artists

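Functions that collect, from an artist page, the normalized title of every song and the URL suffix (the link's `href`) that leads to that song's lyrics page. Songs that appear more than once under the same normalized title are kept only once.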

In [15]:
def _normalize_title(text):
    """Turn the visible text of a song link into a normalized title.

    The text is lower-cased, everything from the first " (" or " [" onward is
    dropped, and the remaining spaces become underscores, e.g.
    "Dog Days Are Over (Live)" -> "dog_days_are_over".
    """
    return text.lower().replace('(', '[').split(' [', 1)[0].replace(' ', '_')


def _song_links(artist, timeout=30):
    """Fetch an artist page and return its unique songs as (title, href) pairs.

    Args:
        artist: Path of the artist page, appended to 'https://www.lyrics.com'.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of (normalized title, href) tuples in page order. Only the
        first link for each normalized title is kept.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx status.
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    artist_site = requests.get('https://www.lyrics.com' + artist, timeout=timeout)
    # Fail loudly instead of scraping an error page into an empty result.
    artist_site.raise_for_status()
    artist_html = soup(artist_site.text, 'html.parser')
    if artist_html.body is None:
        # A response without a <body> tag has no song links to offer.
        return []
    all_entries = artist_html.body.find_all('a', attrs={"href": re.compile("^/lyric")})

    seen = set()  # set membership keeps de-duplication linear in the number of links
    links = []
    for entry in all_entries:
        title = _normalize_title(entry.text)
        if title not in seen:
            seen.add(title)
            links.append((title, entry.get('href')))
    return links


def get_titles(artist, timeout=30):
    """Return the unique normalized song titles listed on an artist page.

    Args:
        artist: Path of the artist page, appended to 'https://www.lyrics.com'.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of titles in page order; element i corresponds to element i of
        get_suffixes(artist) for the same page content.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx status.
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    return [title for title, _ in _song_links(artist, timeout)]


def get_suffixes(artist, timeout=30):
    """Return the lyric-page hrefs of the unique songs listed on an artist page.

    Args:
        artist: Path of the artist page, appended to 'https://www.lyrics.com'.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of hrefs (each starting with '/lyric') in page order, one per
        unique normalized title; element i corresponds to element i of
        get_titles(artist) for the same page content.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx status.
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    return [href for _, href in _song_links(artist, timeout)]

Function that downloads the lyrics of each song in the suffixes list and collects them in a single list (one string per song).

In [16]:
def get_lyrics(artist, suffixes, timeout=30):
    """Download the lyrics text for every song suffix.

    Args:
        artist: Unused by this function; kept so existing calls keep working.
        suffixes: Sequence of lyric-page paths, each appended to
            'https://www.lyrics.com'.
        timeout: Seconds to wait for the server on each request.

    Returns:
        A list with one string per suffix, in the same order, with newlines
        replaced by spaces. A page that has no element with id
        'lyric-body-text' contributes an empty string (and a warning) so the
        result stays index-aligned with ``suffixes``.

    Raises:
        requests.exceptions.HTTPError: If any page answers with a 4xx/5xx status.
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    import warnings

    if len(suffixes) == 0:
        return []

    lyrics = []
    # One session for the whole batch so the connection to the site is reused
    # instead of being re-established for every song.
    with requests.Session() as session:
        for suffix in suffixes:
            lyrics_site = session.get('https://www.lyrics.com' + suffix, timeout=timeout)
            # Fail loudly instead of parsing an error page as if it were lyrics.
            lyrics_site.raise_for_status()
            lyrics_html = soup(lyrics_site.text, 'html.parser')
            lyric_body = lyrics_html.find(id='lyric-body-text')
            if lyric_body is None:
                # One page without lyrics must not throw away everything
                # already downloaded; keep the slot so indices still match.
                warnings.warn('No lyrics found at ' + suffix)
                lyrics.append('')
                continue
            lyrics.append(lyric_body.text.replace('\n', ' '))
    return lyrics

### Florence and the Machine lyrics

In [19]:
# Florence + the Machine
# Path of the artist page on lyrics.com (sorted A-Z, not grouped by album).
fatm = '/artist.php?name=Florence-%2B-the-Machine&aid=1062562&o=1'

# Scrape the song index (titles and lyric-page links), then download each song's lyrics.
fatm_titles, fatm_suffixes = get_titles(fatm), get_suffixes(fatm)
fatm_lyrics = get_lyrics(fatm, fatm_suffixes)

Count the five most frequent words across all Florence + the Machine lyrics

In [21]:
from collections import Counter

# Tally whitespace-separated words across all songs. Counting is case-sensitive
# and no punctuation is stripped, so "I" and "i" are separate entries.
c = Counter(word for song_text in fatm_lyrics for word in song_text.split())
print(c.most_common(5))

[('the', 1005), ('I', 869), ('you', 562), ('to', 561), ('me', 502)]


### Imagine Dragons lyrics

In [23]:
# Imagine Dragons
# Path of the artist page on lyrics.com (sorted A-Z, not grouped by album).
imdrag = '/artist.php?name=Imagine-Dragons&aid=1184089&o=1'

# Scrape the song index (titles and lyric-page links), then download each song's lyrics.
imdrag_titles, imdrag_suffixes = get_titles(imdrag), get_suffixes(imdrag)
imdrag_lyrics = get_lyrics(imdrag, imdrag_suffixes)