# Collecting ASCAP Info + Storing in DB

**Description**: Pulling necessary song info from the ASCAP HTML files & storing in PostreSQL db

In [1]:
import json
import os
import pickle
import re

from bs4 import BeautifulSoup, element
from sqlalchemy import create_engine

import pandas as pd

## Loading in HTML File

I'll need to figure out how to retrieve all of the corresponding song details from each HTML file that I scraped from ASCAP>

In [10]:
with open('../data/ascap_songs/CHIEF KEEF.html', 'rb+') as f:
    test_html = BeautifulSoup(f, 'lxml')

### Retrieving Song Cards

My current thought is that I should divvy these results into a list of `dict`s, which should allow me to easily toss into a dataframe or db afterwards.

In [11]:
test_cards = test_html.find_all('div', {'class':'card hyphenate card--expanded'})

64 songs indexed for Chief Keef

In [12]:
len(test_cards)

64

##### The title and "Total Current ASCAP Share"

In [13]:
test_cards[0].find('div', {'class':'card__header__title'}).text.strip().split(sep='\n')

['ADAM & EVE',
 '                ',
 '                    ',
 '                    Total Current ASCAP Share:',
 '                    25%']

##### Just the song title

In [14]:
test_cards[0].find('div', {'class':'card__header__title'}).text.strip().split(sep='\n')[0]

'ADAM & EVE'

##### Writers

In [15]:
writers_row = test_cards[0].find_all('tr', {'class':'writers__row'})

In [16]:
writer_list = []

for writer in writers_row:
    if isinstance(writer.find('a'), element.Tag):
        writer_list.append((writer.find('a'), writer.find_all('td', {'class':'sup-col'})))
    elif isinstance(writer.find('span'), element.Tag):
        writer_list.append((writer.find('span'), writer.find_all('td', {'class':'sup-col'})))
    else:
        writer_list.append('cannot find writer')
        
writer_info = []

for writer, writer_ids in writer_list:
    b = writer.attrs
    b['pro'] = writer_ids[0].text.strip()
    b['ipi'] = writer_ids[1].text.strip()
    writer_info.append(b)

In [17]:
writer_list[0][0].attrs.pop('href')
writer_list[0][0].attrs.pop('class')
writer_list[0][0].attrs

{'data-writer': 'MCKINNEY REGINALD CHRISTOPHER',
 'data-id': '652264355',
 'pro': 'ASCAP',
 'ipi': '652264355'}

In [18]:
writer_info

[{'data-writer': 'MCKINNEY REGINALD CHRISTOPHER',
  'data-id': '652264355',
  'pro': 'ASCAP',
  'ipi': '652264355'},
 {'class': ['no-click'],
  'data-writer': 'WRITER UNKNOWN  ',
  'data-id': '0',
  'pro': 'NS',
  'ipi': '—'}]

Performers

In [19]:
performers = test_cards[0].find_all('div', {'class':'creditors__list'})

In [20]:
performers[0].text.strip('\n')

'CHIEF KEEF'

Retreive one or multiple performers per song

In [21]:
if performers[0].text.count('\n') > 2:
    performers[0].text.strip().split(sep='\n')
else:
    performers[0].text.strip('\n')

HTML Title

### Defining Functions to Sort Through HTML Files

Here's how I'll take the song information that I need and output it into a list of `dict`s

In [12]:
def bs_file(file):
    '''
    Opens file and returns beautiful soup object
    '''
    if 'html' in file: 
        with open('../data/ascap_songs/{}'.format(file), 'rb') as f:
            bs = BeautifulSoup(f, 'lxml')
    else:
        
    return bs

def song_sorter(bs):
    '''
    Iterates through songs in bs object and returns cards of all songs
    '''
    results = []
    if len(bs.find_all('div', {'class':'card hyphenate card--expanded'})) > 0:
        results.extend(bs.find_all('div', {'class':'card hyphenate card--expanded'}))
    if len(bs.find_all('div', {'class':'card card--expanded hyphenate'})) > 0:
        results.extend(bs.find_all('div', {'class':'card card--expanded hyphenate'}))
    return results
                                   
def get_song_credits(card):
    '''
    Retrieves song credits from an individual song card
    '''   
    title = card.find('div', {'class':'card__header__title'}).text.strip().split(sep='\n')[0]
    writers_row = card.find_all('tr', {'class':'writers__row'})
    writer_list = []
    
    # Creating writer list for further filtering
    for writer in writers_row:
        if isinstance(writer.find('a'), element.Tag):
            writer_list.append((writer.find('a'), writer.find_all('td', {'class':'sup-col'})))
        elif isinstance(writer.find('span'), element.Tag):
            writer_list.append((writer.find('span'), writer.find_all('td', {'class':'sup-col'})))
        else:
            writer_list.append('cannot find writer')
    writers = []
    for writer, writer_ids in writer_list:
        b = writer.attrs
        b['pro'] = writer_ids[0].text.strip()
        b['ipi'] = writer_ids[1].text.strip()
        writers.append(b)
    
    # Grabbing performers
    performers = card.find_all('div', {'class':'creditors__list'})
    if performers[0].text.count('\n') > 2:
        artists = performers[0].text.strip().split(sep='\n')
    else:
        artists = performers[0].text.strip('\n')
        
    # Grabbing alternate titles
    try:
        if performers[1].text.count('\n'):
            alt_titles = performers[1].text.strip().split(sep='\n')
        else:
            alt_titles = performers[1].text.strip('\n')
    except IndexError:
        alt_titles = None
        
    return title, writers, artists, alt_titles

In [54]:
ck = song_sorter(test_html)
t, w, a, at = get_song_credits(ck[12])

In [46]:
len(ck)

64

In [15]:
def get_html_songs(html_list):
    '''
    Retreives songs in html files and stores in a list of dicts
    '''
    song_list = []
    count = 0
    for file in html_list:
        bs = bs_file(file)
        songs = song_sorter(bs)
        for song in songs:
            count += 1
            t, w, p, at = get_song_credits(song)
            song_dict = {}
            song_dict['Title'] = t
            song_dict['Writers'] = w
            song_dict['Performers'] = p
            song_dict['Alt-Titles'] = at
            song_list.append(song_dict)
            if len(song_list) % 5000 == 0:
                print('{} songs completed'.format(count))
    return song_list

## Retrieving HTML File List

In [7]:
html_list = os.listdir('../data/ascap_songs/')

In [8]:
len(html_list)

13019

In [13]:
test = get_html_songs(html_list[0:5])

In [63]:
html_list[0:5]

['BONE THUGS N HARMONY FEAT. EAZY-E.html',
 'VESTAL AND LEE GREENWOOD.html',
 ' PETE SEEGER.html',
 'DARIUS RUCKER FEAT JILL SCOTT.html',
 'GARTH BROOKS.html']

In [14]:
len(test)

7

## Retrieving Song Results

In [27]:
html_list.index('.ipynb_checkpoints')

9917

In [29]:
html_list.pop(9917)

'.ipynb_checkpoints'

In [16]:
song_list = get_html_songs(html_list)

5000 songs completed
10000 songs completed
15000 songs completed
20000 songs completed
25000 songs completed
30000 songs completed
35000 songs completed
40000 songs completed
45000 songs completed
50000 songs completed
55000 songs completed
60000 songs completed
65000 songs completed
70000 songs completed
75000 songs completed
80000 songs completed
85000 songs completed
90000 songs completed
95000 songs completed
100000 songs completed
105000 songs completed
110000 songs completed
115000 songs completed
120000 songs completed
125000 songs completed
130000 songs completed
135000 songs completed
140000 songs completed
145000 songs completed
150000 songs completed
155000 songs completed
160000 songs completed
165000 songs completed
170000 songs completed
175000 songs completed
180000 songs completed
185000 songs completed
190000 songs completed
195000 songs completed
200000 songs completed
205000 songs completed
210000 songs completed
215000 songs completed
220000 songs completed
225000 s

IsADirectoryError: [Errno 21] Is a directory: '../data/ascap_songs/.ipynb_checkpoints'

:0(((