In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [7]:
# Base URL of the annual PraiseCharts blog post listing the top 40 songs;
# the year is appended to complete the address for each post
base_url = 'https://www.praisecharts.com/blog/top-40-worship-songs-of-'

# Years to append to the base URL (2004 through 2020 inclusive)
year_list = range(2004, 2021)

# Collects one dictionary of song info per scraped track
song_dicts = []

# Seconds to wait on any single HTTP request. requests waits indefinitely when
# no timeout is given, so one stalled connection would hang the whole scrape.
REQUEST_TIMEOUT = 30

# data-line values that mark a song-section label rather than a lyric line.
# Matching is exact, including trailing spaces and letter case.
SECTION_LABELS = frozenset([
    'Verse ', 'Verse 1', 'Verse 2', 'Verse 3',
    'Chorus', 'Chorus ', 'Chorus 1', 'Chorus 2', 'Chorus 3',
    'Bridge', 'Bridge ', 'Vamp ', 'tag',
])


def _fetch_soup(session, url):
    """Download ``url`` and return its HTML parsed with the lxml parser.

    Raises ``requests.HTTPError`` on a 4xx/5xx response so that an error page
    is never parsed as if it were real content, and ``requests.Timeout`` when
    the server does not answer within ``REQUEST_TIMEOUT`` seconds.
    """
    response = session.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'lxml')


def _extract_lyrics(track_soup):
    """Return the lowercased lyric lines of a track page.

    Returns ``None`` when the page has no ``section#sidebar`` or no
    ``div.lyrics`` inside it, i.e. when the page carries no lyrics at all.
    Only ``div`` elements that have a ``data-line`` attribute are considered,
    so a stray ``div`` without that attribute can no longer abort the
    collection half-way through a song.
    """
    sidebar = track_soup.find('section', {'id': 'sidebar'})
    if sidebar is None:
        return None

    lyrics_block = sidebar.find('div', {'class': 'lyrics'})
    if lyrics_block is None:
        return None

    return [
        line.text.lower()
        for line in lyrics_block.find_all('div', attrs={'data-line': True})
        if line['data-line'] not in SECTION_LABELS
    ]


def _extract_track_info(track_soup):
    """Return the label -> value pairs of the first table on a track page."""
    track_info = {}
    for row in track_soup.find('table').find_all('tr'):
        cells = row.find_all('td')

        # Rows without a label cell and a value cell carry no track info
        if len(cells) < 2:
            continue

        # Drops the label's final character
        # (presumably a trailing colon - confirm against the page markup)
        track_info[cells[0].text[:-1]] = cells[1].text.rstrip()
    return track_info


# A single session reuses the connection across the several hundred requests
with requests.Session() as session:

    # Iterates through each year that has a PraiseCharts top 40 blog post;
    # the tqdm progress bar shows which year was in progress if an error occurs
    for year in tqdm(year_list):

        # Fetches and parses that year's top 40 blog post
        songlist_soup = _fetch_soup(session, base_url + str(year))

        # Container holding the post's track list
        top_40 = songlist_soup.find('div', {'class': 'blog-entry'})

        # Each track name in the top 40 list is an <h4> heading
        for song in top_40.find_all('h4'):

            # Headings without a link are not tracks and are skipped
            link = song.find('a')
            if link is None or not link.get('href'):
                continue
            track_url = link['href']

            # Fetches and parses the track's info page
            track_soup = _fetch_soup(session, track_url)

            # Stores the lyrics, or an empty list (and names the song) when
            # the track info page doesn't contain any
            song_lyrics = _extract_lyrics(track_soup)
            if song_lyrics is None:
                print(f'No lyrics found at {track_url}')
                song_lyrics = []

            # Stores the table of track info in a dictionary
            track_info = _extract_track_info(track_soup)

            # Adds lyrics to dictionary
            track_info['lyrics'] = song_lyrics

            # Adds track title to dictionary
            track_info['song'] = track_soup.find('h1', {'id': 'product-detail-title'}).text.strip()

            # Adds the year the song was on PraiseCharts' annual top 40 list
            track_info['year'] = year

            # Stores dictionary in song_dicts list
            song_dicts.append(track_info)

  0%|          | 0/17 [00:00<?, ?it/s]

No lyrics found at https://www.praisecharts.com/songs/details/1808/make-it-glorious-sheet-music


 29%|██▉       | 5/17 [06:59<16:48, 84.07s/it]

No lyrics found at https://www.praisecharts.com/songs/details/9340/christmas-eve-service-guide-sheet-music


 53%|█████▎    | 9/17 [13:25<12:33, 94.15s/it]

No lyrics found at https://www.praisecharts.com/songs/details/23350/miraculum-sheet-music


100%|██████████| 17/17 [27:20<00:00, 96.53s/it] 


In [8]:
# Displays how many track dictionaries the scraping loop collected
len(song_dicts)

674

In [9]:
# Builds a DataFrame with one row per scraped track, then writes it with
# pandas' default CSV settings to a file named 'song_list' in the working
# directory. The 'lyrics' column holds Python lists, which the CSV writer
# stores as their string representation.
songs_frame = pd.DataFrame(song_dicts)
songs_frame.to_csv('song_list')