# Scrape lyrics to MongoDB

In [1]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import requests

import time

from pymongo import MongoClient

## Connecting to Mongo Client

In [2]:
# Connect with MongoClient's default arguments and pick the "genius"
# database using subscript access.
client = MongoClient()
db = client["genius"]
# Bare trailing expression: the notebook displays the collections that
# already exist in the database.
db.list_collection_names()

['artist_ids', 'song_urls', 'song_lyrics']

## Get Song Info

In [4]:
# Pull every document from song_urls, excluding the '_id' and 'artist_id'
# fields, and load the result into a DataFrame.
cursor = db.song_urls.find(
    filter={},
    projection={'_id': 0, 'artist_id': 0},
)
df = pd.DataFrame(list(cursor))
# Preview the first rows.
df.head()

Unnamed: 0,id,title,url
0,987434,Berlin,https://genius.com/The-essex-green-berlin-lyrics
1,1009076,Big Green Tree,https://genius.com/The-essex-green-big-green-t...
2,1551187,By the Sea,https://genius.com/The-essex-green-by-the-sea-...
3,1745373,Carballo,https://genius.com/The-essex-green-carballo-ly...
4,1073422,Chartiers,https://genius.com/The-essex-green-chartiers-l...


In [4]:
def scrape_lyrics(df, *, timeout=30):
    """
    Download the lyrics page of every song in ``df`` and store the raw,
    unformatted lyrics text in the ``song_lyrics`` collection of the
    module-level ``db``.

    One document is inserted per row with the keys ``song_id``,
    ``song_title``, ``song_url`` and ``raw_lyrics``. When a page has no
    ``<div class="lyrics">`` element, ``raw_lyrics`` is stored as an empty
    string and the URL is printed. After each song the function sleeps for
    a random interval (``Poisson(100) / 50`` seconds, i.e. about two
    seconds on average) before requesting the next page.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns ``id``, ``title`` and ``url``.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so that a stalled connection cannot
        block the scrape indefinitely. Defaults to 30 seconds.

    Returns
    -------
    None

    Raises
    ------
    requests.exceptions.RequestException
        If a page cannot be fetched, including when ``timeout`` expires.
        Songs processed before the failure have already been inserted.
    """
    # A single pass over the rows avoids rebuilding a row Series with
    # ``df.iloc[i]`` for every field that is read.
    for row in df.itertuples(index=False):
        song_url = row.url
        song_info = {
            'song_id': int(row.id),
            'song_title': row.title,
            'song_url': song_url,
        }

        response = requests.get(song_url, timeout=timeout)
        soup = BeautifulSoup(response.text, "lxml")

        # ``find`` returns None when the element is absent; testing for that
        # explicitly avoids a bare ``except`` that would also swallow
        # KeyboardInterrupt and unrelated errors.
        lyrics_div = soup.find('div', {'class': 'lyrics'})
        if lyrics_div is None:
            raw_lyrics = ''
            print(song_url, "had no lyrics listed")
        else:
            raw_lyrics = lyrics_div.text

        song_info['raw_lyrics'] = raw_lyrics

        db.song_lyrics.insert_one(song_info)

        # Randomised pause between consecutive requests.
        time.sleep(np.random.poisson(100) / 50)

In [5]:
# Row and column counts of the song table that is about to be scraped.
df.shape

(53554, 3)

In [5]:
# Run the scraper over every row of df; each song is written to db.song_lyrics.
scrape_lyrics(df)