# Scraping lyrics.com

In [49]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as soup
import os
import re

## Scraping

### Prepare functions for scraping lyrics of artists

Functions to extract all titles and suffixes on the artist page that lead to lyrics of different songs.

In [50]:
def get_titles(artist):
    titles = []
    artist_site = requests.get('https://www.lyrics.com' + artist)
    artist_html = soup(artist_site.text, 'html.parser')
    all_entries = artist_html.body.find_all('a', attrs={"href":re.compile("^/lyric")})
    for entry in all_entries:
        title = entry.text.lower().replace('(', '[').split(' [', 1)[0]
        title = re.sub('[^A-Za-z0-9 ]+', '', title)
        if title not in titles:
            titles.append(title)
    return titles

def get_suffixes(artist):
    suffixes = []
    ttitles = []
    artist_site = requests.get('https://www.lyrics.com' + artist)
    artist_html = soup(artist_site.text, 'html.parser')
    all_entries = artist_html.body.find_all('a', attrs={"href":re.compile("^/lyric")})
    for entry in all_entries:
        ttitle = entry.text.lower().replace('(', '[').split(' [', 1)[0]
        ttitle = re.sub('[^A-Za-z0-9 ]+', '', ttitle)
        if ttitle not in ttitles:
            ttitles.append(ttitle)
            suffixes.append(entry.get('href'))
    return suffixes

Function to extract the lyrics of each song in the suffixes list and store it in one list

In [51]:
def get_lyrics(artist, suffixes):
    lyrics = []
    for i in range(len(suffixes)):
        lyrics_site = requests.get('https://www.lyrics.com' + suffixes[i])
        lyrics_html = soup(lyrics_site.text, 'html.parser')
        text = lyrics_html.body.find(id ='lyric-body-text').text
        text = re.sub('[^A-Za-z0-9 ]+', ' ', text)
        lyrics.append(text) 
    return lyrics


### Florence and the Machine lyrics

In [52]:
# Florence and the machine
fatm = '/artist.php?name=Florence-%2B-the-Machine&aid=1062562&o=1' # URL of the arstist page (sorted from A-Z, 
                                                                   # not grouped by album)
fatm_titles = get_titles(fatm)
fatm_suffixes = get_suffixes(fatm)
fatm_lyrics = get_lyrics(fatm, fatm_suffixes)

In [53]:
fatm_df = pd.DataFrame({'artist':['Florence and the Machine' for i in range(len(fatm_titles))], 'title':fatm_titles, 'lyrics':fatm_lyrics})
fatm_df.to_csv('fatm_df.csv', sep=';', index=False)

Count most frequent words

In [54]:
from collections import Counter

c = Counter(' '.join(fatm_lyrics).split())
print(c.most_common(5))

[('I', 1295), ('the', 998), ('you', 705), ('it', 560), ('to', 559)]


### Imagine Dragons lyrics

In [55]:
# Imagine Dragons
imdrag = '/artist.php?name=Imagine-Dragons&aid=1184089&o=1' # URL of the arstist page (sorted from A-Z, 
                                                                   # not grouped by album)
imdrag_titles = get_titles(imdrag)
imdrag_suffixes = get_suffixes(imdrag)
imdrag_lyrics = get_lyrics(imdrag, imdrag_suffixes)

In [56]:
imdrag_df = pd.DataFrame({'artist':['Imagine Dragons' for i in range(len(imdrag_titles))], 'title':imdrag_titles, 'lyrics':imdrag_lyrics})
imdrag_df.to_csv('imdrag_df.csv', sep=';', index=False)

### Queen

In [57]:
# Queen
queen = '/artist.php?name=Queen&aid=5205&o=1' # URL of the arstist page (sorted from A-Z, 
                                                                   # not grouped by album)
queen_titles = get_titles(queen)
queen_suffixes = get_suffixes(queen)
queen_lyrics = get_lyrics(queen, queen_suffixes)

In [58]:
queen_df = pd.DataFrame({'artist':['Queen' for i in range(len(queen_titles))], 'title':queen_titles, 'lyrics':queen_lyrics})
queen_df.to_csv('queen_df.csv', sep=';', index=False)

### ABBA

In [59]:
# ABBA
abba = '/artist.php?name=ABBA&aid=3492&o=1' # URL of the arstist page (sorted from A-Z, 
                                                                   # not grouped by album)
abba_titles = get_titles(abba)
abba_suffixes = get_suffixes(abba)
abba_lyrics = get_lyrics(abba, abba_suffixes)

In [60]:
abba_df = pd.DataFrame({'artist':['ABBA' for i in range(len(abba_titles))], 'title':abba_titles, 'lyrics':abba_lyrics})
abba_df.to_csv('abba_df.csv', sep=';', index=False)

## Create one dataframe and save it as .csv

In [61]:
# combine dataframes
lyrics_df = fatm_df.append(imdrag_df).append(queen_df).append(abba_df)

In [62]:
# save it
lyrics_df.to_csv('lyrics_df.csv', sep=';', index=False)

### Word clouds

In [38]:
import numpy as np
from matplotlib import pyplot as plt
import wordcloud

In [39]:
mask = np.array((500, 500, 3), 0)
mask[150:350,150:350,:] = 255  # masked out area

TypeError: data type not understood

In [None]:
cloud = wordcloud.WordCloud(background_color="white",
                max_words=50,
                mask=mask,
                collocations=____,  # calculates frequencies
                contour_color='steelblue').____(text)
                # stop words are removed!

plt.figure(figsize = (40,____))
plt.imshow(cloud, interpolation='bilinear')
#____.savefig('cloud.png')
plt.show()