In [None]:
# Import dependencies 
from splinter import Browser
from bs4 import BeautifulSoup
import string
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import numpy as np

In [None]:
# Relative paths of the two input CSV song lists
file_new, file_old = (
    "music_list_data/new_school.csv",
    "music_list_data/old_school.csv",
)

In [None]:
# Load both song lists into pandas data frames (same order as the paths: new, then old)
new_df, old_df = (pd.read_csv(csv_path) for csv_path in (file_new, file_old))

In [None]:
# Derive hyphen-separated versions of the "new" artist and song names:
# whitespace runs are collapsed and replaced with a single '-'
new_df['artists'] = list(map(lambda name: "-".join(name.split()), new_df['artist']))
new_df['songs'] = list(map(lambda title: "-".join(title.split()), new_df['song']))

In [None]:
# Derive hyphen-separated versions of the "old" artist and song names:
# whitespace runs are collapsed and replaced with a single '-'
old_df['artists'] = list(map(lambda name: "-".join(name.split()), old_df['artist']))
old_df['songs'] = list(map(lambda title: "-".join(title.split()), old_df['song']))

In [None]:
# Collect nltk's English stop words into a set for fast membership tests
stop_words = {*stopwords.words('english')}

In [None]:
# small_stop_words lists the stop words that are at most 4 characters long
small_stop_words = list(filter(lambda word: len(word) <= 4, stop_words))

In [None]:
def _mean_length(words):
    """Return the average length of the strings in `words`, or NaN when there are none."""
    words = list(words)
    if not words:
        return float('nan')
    return sum(map(len, words)) / len(words)


def _lyrics_word_stats(raw_lyrics):
    """Compute word counts and average word lengths for a piece of lyrics text.

    Relies on the module-level `stop_words` and `small_stop_words` collections.

    Returns a 7-item list:
    [word count, unique word count, unique non-stop-word count,
     unique count excluding small stop words, average unique word length,
     average non-stop-word length, average length excluding small stop words].
    Averages are NaN when the corresponding word collection is empty.
    """
    # Drop bracketed or parenthesised spans. Raw string: the pattern is
    # unchanged, but no longer relies on invalid string escapes.
    without_brackets = re.sub(r"[\(\[].*?[\)\]]", " ", raw_lyrics)

    # Lower-case, split on whitespace, and strip ASCII punctuation from each token
    stripped_tokens = (
        ''.join(ch for ch in token if ch not in string.punctuation)
        for token in without_brackets.lower().split()
    )
    # A token made only of punctuation collapses to '' and is not a word; discard it
    words = [token for token in stripped_tokens if token]

    unique_words = set(words)
    filter_words = [w for w in unique_words if w not in stop_words]
    big_words = [w for w in unique_words if w not in small_stop_words]

    return [
        len(words),
        len(unique_words),
        len(filter_words),
        len(big_words),
        _mean_length(unique_words),
        _mean_length(filter_words),
        _mean_length(big_words),
    ]


def genius_lyrics_scrape(artist_name, song_name):
    """Scrape a song's lyrics page on genius.com and summarise its vocabulary.

    Parameters
    ----------
    artist_name : str
        Artist name as it appears in the page URL (hyphen-separated).
    song_name : str
        Song title as it appears in the page URL (hyphen-separated).

    Returns
    -------
    list
        [song_name, word count, unique word count, unique non-stop-word count,
         unique count excluding small stop words, average unique word length,
         average non-stop-word length, average length excluding small stop words].
        When no lyrics text is found on the page, a message is printed and all
        seven metrics are NaN, so one missing page does not abort a whole batch.

    Errors raised while opening the browser or visiting the page are not caught.
    """
    url = 'https://genius.com/' + artist_name + '-' + song_name + '-lyrics'

    # Only the page source is needed; the browser is closed as soon as we have it
    with Browser() as browser:
        browser.visit(url)
        genius_html = browser.html

    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(genius_html, 'html.parser')
    lyrics_divs = soup.find_all('div', class_='lyrics')

    # Read the text of the whole lyrics container. <br> is a void element, so
    # taking the text of the first <br> yields nothing (or only a
    # parser-dependent fragment) instead of the full lyrics.
    lyrics_text = ' '.join(div.get_text(separator=' ') for div in lyrics_divs)

    stats = _lyrics_word_stats(lyrics_text)
    if stats[0] == 0:
        # No container matched, or it held no words: report it explicitly
        # instead of failing later on undefined values.
        print("No lyrics found at " + url + "; metrics recorded as NaN")
        stats = [float('nan')] * len(stats)

    return [song_name] + stats

In [None]:
# Scrape every row of the "old" data frame; the result holds one list of metrics per row
old_song_results_series = old_df.apply(
    lambda row: genius_lyrics_scrape(artist_name=row['artists'], song_name=row['songs']),
    axis='columns',
)

In [None]:
# Scrape every row of the "new" data frame; the result holds one list of metrics per row
new_song_results_series = new_df.apply(
    lambda row: genius_lyrics_scrape(artist_name=row['artists'], song_name=row['songs']),
    axis='columns',
)

In [None]:
# Unpack the "old" scrape results into a plain Python list
old_song_results_list = [*old_song_results_series]

In [None]:
# Unpack the "new" scrape results into a plain Python list
new_song_results_list = [*new_song_results_series]

In [None]:
# Column names for the scraped results, in the order the scrape function returns them
headers = [
    'songs',
    '#_words',
    '#_unique_words',
    '#_filter_words',
    '#_big_words',
    'avg_word_len',
    'filter_avg_word_len',
    'big_avg_word_len',
]

In [None]:
# Build a data frame from the "old" scraped rows, labelled with the shared headers
old_song_results = pd.DataFrame(data=old_song_results_list, columns=headers)

In [None]:
# Build a data frame from the "new" scraped rows, labelled with the shared headers
new_song_results = pd.DataFrame(data=new_song_results_list, columns=headers)

In [None]:
# Attach the "old" scraped metrics to the original "old" data frame, and export to a csv.
# The scraped rows were produced one per row of old_df, in the same order, so they
# are attached by row position. Merging on the 'songs' text would cross-multiply
# rows whenever two entries share the same song slug.
# set_index raises a ValueError if the two frames do not have the same number of rows.
old_results_df = old_df.merge(
    old_song_results.drop(columns='songs').set_index(old_df.index),
    left_index=True,
    right_index=True,
)
old_results_df.to_csv('old_results.csv')

In [None]:
# Attach the "new" scraped metrics to the original "new" data frame, and export to a csv.
# The scraped rows were produced one per row of new_df, in the same order, so they
# are attached by row position. Merging on the 'songs' text would cross-multiply
# rows whenever two entries share the same song slug.
# set_index raises a ValueError if the two frames do not have the same number of rows.
new_results_df = new_df.merge(
    new_song_results.drop(columns='songs').set_index(new_df.index),
    left_index=True,
    right_index=True,
)
new_results_df.to_csv('new_results.csv')