# Data Collection  
  
In order to calculate reasonably stable embedding vectors using our NN, we'll need to collect a sufficient amount of data. For this exercise, we'll use only data that is free and publicly accessible. It is always worth checking the Terms and Conditions of any site before attempting to scrape data, and even if scraping is allowed, it is good practice to limit the rate of requests so that the site is not swamped.  


In [None]:
# Load packages that we'll need
import urllib.request
import re
import nltk
import time
import json
import unicodedata
import pandas as pd

from bs4 import BeautifulSoup
from collections import defaultdict

  
## 1) Classic Literature  

The text of many books in the public domain can be obtained from Gutenberg.org

In [None]:
def book_reader(urls, delay=2):
    """
    Download each book, extract its text with Beautiful Soup, and tokenize
    the combined, lower-cased text into individual words.

    Args:
        urls : A list containing urls for each book to process
        delay : Seconds to wait between two consecutive downloads, so that
            the host is not swamped with requests (0 disables the pause)

    Returns:
        (nltk.Text) Tokens of all the books in the list, in the order given
    """
    books = []
    for position, url in enumerate(urls):
        # Throttle every request after the first one
        if position and delay:
            time.sleep(delay)
        print(url)
        # Close the HTTP response as soon as the page has been parsed instead
        # of leaving one connection open per book
        with urllib.request.urlopen(url) as page:
            book = BeautifulSoup(page, 'html.parser')
        books.append(book.get_text().lower())
    # Join with a newline so that the last word of one book can never fuse
    # with the first word of the next one when the text is tokenized
    tokens = nltk.word_tokenize("\n".join(books))
    return nltk.Text(tokens)


# Urls for a selection of books by female authors that we want to train on.
# Every entry is a plain-text (.txt) file hosted on www.gutenberg.org; the list
# mixes http:// and https:// links as well as the /files/ and /cache/epub/
# path layouts.
# NOTE(review): each trailing author comment presumably labels the first url of
# a consecutive run of books by that author - confirm before relying on it.
urls_female = ['https://www.gutenberg.org/files/1342/1342-0.txt', # Jane Austen
        'https://www.gutenberg.org/files/158/158-0.txt', 
        'http://www.gutenberg.org/cache/epub/161/pg161.txt', 
        'http://www.gutenberg.org/cache/epub/105/pg105.txt', 
        'https://www.gutenberg.org/files/121/121-0.txt', 
        'http://www.gutenberg.org/cache/epub/946/pg946.txt', 
        'https://www.gutenberg.org/files/1212/1212-0.txt', 
        'http://www.gutenberg.org/cache/epub/5670/pg5670.txt', # Virginia Woolf
        'https://www.gutenberg.org/files/144/144-0.txt', 
        'https://www.gutenberg.org/files/1245/1245-0.txt', 
        'http://www.gutenberg.org/cache/epub/29220/pg29220.txt',
        'https://www.gutenberg.org/files/57050/57050-0.txt', 
        'http://www.gutenberg.org/cache/epub/1260/pg1260.txt', # Charlotte Bronte
        'http://www.gutenberg.org/cache/epub/9182/pg9182.txt',
        'https://www.gutenberg.org/files/30486/30486-0.txt', 
        'https://www.gutenberg.org/files/1028/1028-0.txt', 
        'http://www.gutenberg.org/cache/epub/54254/pg54254.txt',
        'http://www.gutenberg.org/cache/epub/768/pg768.txt', # Emily Bronte
        'http://www.gutenberg.org/cache/epub/145/pg145.txt', # George Eliot
        'http://www.gutenberg.org/cache/epub/550/pg550.txt',
        'https://www.gutenberg.org/files/6688/6688-0.txt',
        'https://www.gutenberg.org/files/28289/28289-0.txt',
        'https://www.gutenberg.org/files/507/507-0.txt',
        'http://www.gutenberg.org/cache/epub/7469/pg7469.txt',
        'http://www.gutenberg.org/cache/epub/2165/pg2165.txt',
        'https://www.gutenberg.org/files/24020/24020-0.txt',
        'https://www.gutenberg.org/files/40882/40882-0.txt',
        'http://www.gutenberg.org/cache/epub/2171/pg2171.txt',
        'http://www.gutenberg.org/cache/epub/17780/pg17780.txt']

# Urls for a selection of books by male authors that we want to train on.
# Every entry is a plain-text (.txt) file; the first part of the list is hosted
# on www.gutenberg.org, the remainder on gutenberg.net.au (http:// only).
# NOTE(review): each trailing author comment presumably labels the first url of
# a consecutive run of books by that author - confirm before relying on it.
urls_male = ['https://www.gutenberg.org/files/46/46-0.txt', # Charles Dickens
        'https://www.gutenberg.org/files/98/98-0.txt',
        'https://www.gutenberg.org/files/1400/1400-0.txt',
        'https://www.gutenberg.org/files/786/786-0.txt',
        'http://www.gutenberg.org/cache/epub/730/pg730.txt',
        'http://www.gutenberg.org/cache/epub/19337/pg19337.txt',
        'https://www.gutenberg.org/files/766/766-0.txt',
        'http://www.gutenberg.org/cache/epub/1023/pg1023.txt',
        'https://www.gutenberg.org/files/580/580-0.txt',
        'https://www.gutenberg.org/files/74/74-0.txt', # Mark Twain
        'https://www.gutenberg.org/files/76/76-0.txt',
        'https://www.gutenberg.org/files/86/86-0.txt',
        'https://www.gutenberg.org/files/1837/1837-0.txt',
        'https://www.gutenberg.org/files/3176/3176-0.txt',
        'https://www.gutenberg.org/files/245/245-0.txt',
        'https://www.gutenberg.org/files/102/102-0.txt',
        'https://www.gutenberg.org/files/3186/3186-0.txt',
        'https://www.gutenberg.org/files/3177/3177-0.txt',
        'https://www.gutenberg.org/files/8525/8525-0.txt',
        'https://www.gutenberg.org/files/2701/2701-0.txt', # Herman Melville
        'http://www.gutenberg.org/cache/epub/11231/pg11231.txt',
        'http://www.gutenberg.org/cache/epub/21816/pg21816.txt',
        'http://www.gutenberg.org/cache/epub/15859/pg15859.txt',
        'https://www.gutenberg.org/files/1900/1900-0.txt',
        'http://www.gutenberg.org/cache/epub/12384/pg12384.txt',
        'https://www.gutenberg.org/files/805/805-0.txt', # F. Scott Fitzgerald
        'http://gutenberg.net.au/ebooks02/0200041.txt',
        'http://gutenberg.net.au/ebooks03/0301261.txt',
        'http://gutenberg.net.au/ebooks01/0100021.txt', # George Orwell
        'http://gutenberg.net.au/ebooks03/0300011.txt',
        'http://gutenberg.net.au/ebooks02/0200151.txt',
        'http://gutenberg.net.au/ebooks01/0100011.txt',
        'http://gutenberg.net.au/ebooks02/0200031.txt',
        'http://gutenberg.net.au/ebooks02/0201111.txt',
        'http://gutenberg.net.au/ebooks02/0200391.txt',
        'http://gutenberg.net.au/ebooks02/0200141.txt',
        'http://gutenberg.net.au/ebooks02/0200021.txt',
        'http://gutenberg.net.au/ebooks02/0200011.txt',
        'http://gutenberg.net.au/ebooks02/0200051.txt',
        'http://gutenberg.net.au/ebooks01/0100171.txt']

# Save each corpus as a text file of space-separated tokens
for urls, filename in ((urls_female, "Female_Authors.txt"),
                       (urls_male, "Male_Authors.txt")):
    text = book_reader(urls)
    # Write UTF-8 explicitly: without an encoding, open() falls back to the
    # platform's locale encoding, which can fail on (or silently mangle) the
    # non-ASCII characters found in the book texts
    with open(filename, 'w', encoding='utf-8') as file:
        # Each token is followed by a single space, exactly as before
        file.writelines("{} ".format(item) for item in text)


## 2) Movie Scripts  
  
There are several sites that publish movie scripts. For this exercise, we'll use xxxxxxxxxx.com, which includes a two page index of all movie scripts hosted on the site. The first step is to collect that index so that we can create a list of urls, that we'll then loop through in step 2.

In [None]:
# The index of movies is split over two pages. We'll collect the name of the movie, the url for the script,
# and the year of release (in case we later want to examine how biases have changed over time)

source_page = ["http://www.xxxxxxxxxx.com/movie.html", "http://www.xxxxxxxxxx.com/movie_n-z.html"]
movies = []
for source in source_page:
    # Close the HTTP response as soon as the index page has been parsed
    with urllib.request.urlopen(source) as page:
        parsed = BeautifulSoup(page, 'html.parser')

    for a in parsed.find_all('a', href=True):
        # only grab links to scripts, not to the imdb entry
        if a['href'].startswith('scripts'):
            url = "http://www.xxxxxxxxxx.com/"+a['href']
            # NFKD normalisation replaces compatibility characters (such as
            # non-breaking spaces) with their plain equivalents
            name = unicodedata.normalize("NFKD", a.contents[0])
            # The element right after the link text ends with the 4-character year
            year = unicodedata.normalize("NFKD", a.contents[0].next_element[-4:])
            movies.append([name,url,year])

# This particular site publishes scripts in either html or as a pdf file. For simplicity, we'll only collect
# the html files and won't attempt to scrape the pdfs.

# Create a new list that excludes any item where the url ends in 'pdf'
movies = [x for x in movies if not x[1].lower().endswith('pdf')]

Now that we have the list of movies, we can scrape the site to obtain the text of the scripts. We'll build in a delay so that we only make one request every two seconds to avoid hitting their servers too hard. 
  
The scripts need some pre-processing. Character names are provided before every spoken line, and therefore will be high frequency but also semantically unrelated to the surrounding words, so we'll remove them for the purposes of this analysis. By convention, they are in capitals, and therefore easy to strip out using regex. This will also remove any normal words that are all in capitals (eg "I") but the benefit of removing this bias outweighs the loss of capitalized text (which are generally names, background descriptions, and instructions on cinematography)

In [None]:
# Create a dictionary to store the scripts, and a list to store any urls where scraping was unsuccessful
movie_dict = {}
failures = []

# Words written entirely in capitals (character names, camera directions, ...)
all_caps = re.compile(r'\b[A-Z]+\b')

# Collect the scripts
for name,url,year in movies:
    # delay in order to avoid swamping the website with requests; done up front
    # so that failed requests are throttled as well
    time.sleep(2)
    try:
        # Close the HTTP response as soon as the page has been parsed
        with urllib.request.urlopen(url) as page:
            parsed = BeautifulSoup(page, 'html.parser')
        # Remove words that are all caps
        script = all_caps.sub('', parsed.text)
        # Get rid of unnecessary white space and newlines
        script = " ".join(script.split())
        movie_dict[name] = [script,year]
        # hokey counter, just to keep track of where we are
        print(name[:1], end = "")
    except Exception:
        # Best effort: remember the url and carry on. Catching Exception rather
        # than using a bare except keeps Ctrl-C (KeyboardInterrupt) working
        # during this long-running loop.
        failures.append(url)

# Save the dictionary
with open('Movie_Script_dict.txt', 'w', encoding='utf-8') as file:
    json.dump(movie_dict, file)

# Save a text file containing all the scripts. UTF-8 is requested explicitly
# because the scripts may contain characters that the platform's default
# encoding cannot represent.
with open("Movie_Scripts.txt", 'w', encoding='utf-8') as file:
    file.write(" ".join(entry[0] for entry in movie_dict.values()))

## 3) Song lyrics  

For this exercise, we'll collect the lyrics of the billboard top 100 songs in the UK and the US over the last fifty years.

There are many sites that publish song lyrics, however, formats between them vary significantly, and there are several that discourage scraping. This means that even with a list of song titles and artists, there may be significant manual intervention required to reformat the name and find the correct lyrics. This quickly becomes tedious (as you will see below), so we'll only perform the exercise for the UK, and collect the US data from the very interesting repo of walkerkq (https://github.com/walkerkq/), who thankfully has already done it.

In [None]:
# First, create a list of webpages that contain the top 100 songs for each year
# (the chart published on 30 June of every year from 1965 to 2015 inclusive)
url_dict = {year : 'https://www.xxxxxxxxxx.com/charts/singles-chart/{}0630/7501'.format(year) for year in range(1965, 2016)}

# Run through the dictionary and check that a page exists for each year:url pair
for k, v in url_dict.items():
    try:
        # Open and immediately close the page: we only care whether it exists
        with urllib.request.urlopen(v):
            pass
    except Exception:
        # Exception (not a bare except) so that Ctrl-C still stops the check
        print(k, 'error - check format')
    else:
        print(k, 'ok')
    time.sleep(2) # delay in order to avoid swamping the website with requests

Now we know that the dictionary urls are correct, set up a function to scrape the artist names and song titles from each page

In [None]:
def get_artists_and_titles(parsed_page):
    """Scrape a specific page from www.xxxxxxxxxx.com to collect
       the artist and title of the top 100 songs in the UK for a given year

       The text of the first link inside every <div class="artist"> and
       <div class="title"> element is collected, and the two sequences are
       paired up in page order.

       Args:
           parsed_page: web page parsed with Beautiful Soup
       Returns:
           List of (title, artist) tuples"""
    # find_all is the current Beautiful Soup name (findAll is the legacy
    # spelling) and matches the call used for the movie index in this notebook
    artist_list = [div.find('a').contents[0]
                   for div in parsed_page.find_all("div", class_="artist")]
    title_list = [div.find('a').contents[0]
                  for div in parsed_page.find_all("div", class_="title")]
    # NOTE: zip stops at the shorter list, so this relies on the page having
    # exactly one artist block per title block
    return list(zip(title_list, artist_list))

The format of the artist names provided by this website differs from the one used by the site which contains the actual lyrics. The following code is therefore specific to the target lyrics site (ie, it would have to be changed for use with other sites that use different naming conventions)

In [None]:
def get_lyrics(combined_list):
    """Loop through the list of top 100 songs, format the names correctly
       so that the lyrics can be scraped from www.xxxxxxxxxx.com and then
       scrape them

       Args:
           List of (title, artist) tuples
       Returns:
           List where List[0] is a string containing successfully scraped lyrics, and
           List[1] contains the (title, artist) tuples that were not found"""

    # Symbols that appear in the chart's artist / song names, mapped to the
    # corresponding symbol (or to nothing) used in the lyrics site's urls
    replacements = str.maketrans(
        {' ': '-', '(': '', ')': '', '&': 'and', "'": '', '?': ''})

    break_s = "/"         # separates a song title from its subtitle
    break_a = "THE "      # leading article that the lyrics site leaves out
    collab_marker = '-FT' # start of the "featuring ..." part of a name

    target_list = []
    for title, artist in combined_list:
        # a) Where a song includes a subtitle (denoted by a forward slash) the
        # subtitle is ignored, so for example, 'Get over you/Move this mountain'
        # by Sophie Ellis Bextor is listed under 'Get over you'
        song = title.split(break_s, 1)[0].strip()

        # b) The word "The" is ignored when it precedes the artist name, so
        # "The Beatles" are listed under "Beatles", but "Hootie and the Blowfish"
        # remains unchanged. This has to happen before spaces are replaced,
        # because the marker contains a space.
        if artist.startswith(break_a):
            artist = artist[len(break_a):]

        target = '-lyrics-'.join((song, artist)).translate(replacements)

        # c) With artist collaborations, the convention on the lyrics site is
        # that anything after the substring 'ft' gets dropped
        target_list.append(target.split(collab_marker, 1)[0])

    # Now grab the lyrics for all songs in the year, but store any songs for which
    # the formatting is incorrect in a separate list, in case we find another source
    # or choose to add them manually

    failures = []
    lyrics = []
    for entry, target in zip(combined_list, target_list):
        lyrics_page = 'http://www.xxxxxxxxxx.com/{}.html'.format(target.lower())
        time.sleep(2) # put in a delay so that we don't hammer their website too badly
        try:
            # Close the HTTP response as soon as the page has been parsed
            with urllib.request.urlopen(lyrics_page) as page2:
                parsed2 = BeautifulSoup(page2, 'html.parser')
            lyrics.extend(x.text for x in parsed2.find_all("p", class_="verse"))
        except Exception:
            # Best effort: record the song and carry on. Exception rather than
            # a bare except, so that Ctrl-C still interrupts the scrape.
            failures.append(entry)

    all_lyrics = " ".join(lyrics).replace('\n','. ')
    return [all_lyrics, failures]

Now, run through the dictionary of urls and collect as many lyrics as possible. We'll recycle the dictionary, and keep the year as the key, but replace the URL that we no longer need with a list that contains the lyrics for that year, plus any title/artist combinations for the year that we were unable to obtain from the target lyrics site

In [None]:
# Scrape the target site to collect lyrics
for k, v in url_dict.items():
    # Close the HTTP response as soon as the chart page has been parsed
    with urllib.request.urlopen(v) as page:
        parsed = BeautifulSoup(page, 'html.parser')
    combined = get_artists_and_titles(parsed)
    lyrics = get_lyrics(combined)
    # replace the URL for the given year with the lyrics of the top 100 songs and a list of songs that we missed
    # (overwriting the value of an existing key is safe while iterating; no key is added or removed)
    url_dict[k] = lyrics
    # this process can take a while, so to keep track, print out when each year has been processed
    print(k, ' done')

# Save the dictionary (json turns the integer year keys into strings)
with open('UK_lyric_dict.txt', 'w', encoding='utf-8') as file:
    json.dump(url_dict, file)

# Save a file containing the lyrics for all years. UTF-8 is requested
# explicitly because lyrics may contain characters that the platform's
# default encoding cannot represent.
with open("UK_lyrics.txt", 'w', encoding='utf-8') as file:
    file.write(" ".join(entry[0] for entry in url_dict.values()))

Unfortunately, at this stage there are still several issues with the naming convention which means that we have incomplete data. We saved the (title, artist) tuple for anything that couldn't be found on the target site in the dictionary. Check the size of the problem. 

In [None]:
# For each key value pair in the dictionary key = year, value = [lyrics, [(title,artist) that failed]]
# Print out how many failures there are for each year and what the overall coverage is

songs_per_year = 100  # every chart page lists the top 100 songs

missing = 0
for year, entry in url_dict.items():
    failed = entry[1]
    print(year, len(failed))
    missing += len(failed)

# Derive the total from the number of years actually scraped: the dictionary
# covers 1965-2015 inclusive (51 years), so a fixed total of 5000 songs would
# misstate the coverage
total = songs_per_year * len(url_dict)
coverage = 100 - missing*100/total if total else 0.0
print("Coverage is now {}%".format(coverage))

### Try scraping a different site for the songs we are missing

Adjust the code so that it will be compatible with a different site, xxxxxxxxxx.com

In [None]:
def get_lyrics_2(combined_list):
    """Loop through the list of remaining songs, format the names correctly
       so that the lyrics can be scraped from www.xxxxxxxxxx.com and then
       scrape them

       Args:
           List of (title, artist) tuples
       Returns:
           List where List[0] is a string containing successfully scraped lyrics, and
           List[1] contains the (title, artist) tuples that were not found"""

    # Format data so that it works in the url: pages are named
    # "<artist>:<title>", with underscores instead of spaces
    replacements = str.maketrans(
        {' ': '_', '(': '', ')': '', '&': 'and', "'": '', '?': ''})
    target_list = [
        ''.join((artist, ":", title)).translate(replacements)
        for title, artist in combined_list
    ]

    # Now grab the lyrics for all songs in the year, but store any songs for which
    # the formatting is incorrect in a separate list

    failures = []
    lyrics = []
    for entry, target in zip(combined_list, target_list):
        lyrics_page = 'http://www.xxxxxxxxxx.com/wiki/{}'.format(target.lower())
        time.sleep(1) # put in a delay so that we don't hammer their website too badly
        try:
            # Close the HTTP response as soon as the page has been parsed
            with urllib.request.urlopen(lyrics_page) as page2:
                parsed2 = BeautifulSoup(page2, 'html.parser')
            # format needs a bit more processing than before: the lyrics are
            # the children of the first "lyricbox" div, separated by <br/> tags
            x = parsed2.find_all("div", class_="lyricbox")[0]
            y = ' '.join(str(child) for child in x.children)
            y = y.replace("<br/>", ".")
            # keep only what precedes the "lyricsbreak" marker div
            y = y.split('<div class="lyricsbreak"></div> \n')[0]
            lyrics.append(y)
        except Exception:
            # Best effort: record the song and carry on (this also covers pages
            # without a lyricbox). Exception rather than a bare except, so that
            # Ctrl-C still interrupts the scrape.
            failures.append(entry)

    all_lyrics = " ".join(lyrics).replace('\n','. ')
    return [all_lyrics, failures]

Repeat this process with different sites until substantially all the lyrics have been collected. At each pass, check the failures and search manually on the site to understand what the formatting issue was, then correct it and add the additional lyrics to the master dictionary. A common problem is collaborations, which were particularly popular in the 1990s and early 2000s. The following code may help:  
  
```
    song_delimiters = "FT | FEATURING | VS | WITH "
    artist_delimiters = "FT | FEATURING | VS | WITH | AND"
    for n, item in enumerate(combined_list):
        song = re.split(song_delimiters, item[0])[0]
        artist = re.split(artist_delimiters, item[1])[0]
        combined_list[n] = [song,artist]
```

Finally, combine the UK data with the US data, into a single corpus of song lyrics

In [None]:
# Get correct encoding of csv file in terminal, using the command 'file billboard_lyrics_1964-2015.csv'
df = pd.read_csv('billboard_lyrics_1964-2015.csv', encoding='ISO-8859-1').set_index("Year")
df = df.drop(["Rank", "Song", "Artist", "Source"], axis = 1)
# Songs without lyrics are read as NaN; drop them so that str() below does not
# inject the literal token "nan" into the corpus
df = df.dropna(subset=["Lyrics"])

# Get the data into a structure that is consistent with how the UK data is stored
# First, set up a dictionary where the key is year, and the value is a list of song
# lyrics for the year
us_dict = defaultdict(list)
for year, row in df.iterrows():
    us_dict[str(year)].append(str(row["Lyrics"]))

# Then collapse the list of lyrics into a single string for each year, and save
# the dictionary in case we want to do analysis across time periods
# (overwriting the value of an existing key is safe while iterating)
for k, v in us_dict.items():
    us_dict[k] = " ".join(v)
with open('US_lyric_dict_final.txt', 'w', encoding='utf-8') as file:
    json.dump(us_dict, file)

# Finally, create a single file of text, save it...
# (UTF-8 is requested explicitly so that the result does not depend on the
# platform's default encoding)
with open("US_lyrics_final.txt", 'w', encoding='utf-8') as file:
    file.write(" ".join(us_dict.values()))

# ...and merge it with the UK file
# NOTE(review): 'UK_lyrics_final.txt' is not written by the scraping cell above
# (that one saves 'UK_lyrics.txt'); presumably it is the result of the manual
# clean-up passes - confirm the file name and that it is stored as UTF-8.
with open('Combined_lyrics_final.txt', 'w', encoding='utf-8') as output:
    for part in ('US_lyrics_final.txt', 'UK_lyrics_final.txt'):
        with open(part, encoding='utf-8') as temp:
            for line in temp:
                output.write(line)
        # Separate the parts so that the last word of one file cannot fuse
        # with the first word of the next
        output.write(" ")
