# Collecting ASCAP Info + Storing in DB

**Description**: Pulling the necessary song info from the ASCAP HTML files and storing it in a PostgreSQL db

In [1]:
import json
import pickle
import re

from bs4 import element
from bs4 import BeautifulSoup
from sqlalchemy import create_engine

import pandas as pd

## Test-Run: Loading in HTML File

In [3]:
# Read-only binary mode is enough here: the file is only parsed, never written back.
with open('../data/ascap_songs/CHIEF KEEF.html', 'rb') as f:
    test_html = BeautifulSoup(f, 'lxml')

### Retrieving Song Cards

My current thought is to divvy these results up into a list of `dict`s, which should make it easy to load them into a dataframe or db afterwards.

In [5]:
# Collect the <div> elements matching the expanded-card class string; each one is treated as a song card below.
test_cards = test_html.find_all('div', {'class':'card hyphenate card--expanded'})

64 songs indexed for Chief Keef

In [7]:
# Number of song cards found in the test file
len(test_cards)

64

##### The title and "Total Current ASCAP Share"

In [17]:
# Header text of the first card, split on newlines: the title comes first, the ASCAP share lines follow
test_cards[0].find('div', {'class':'card__header__title'}).text.strip().split(sep='\n')

['ADAM & EVE',
 '                ',
 '                    ',
 '                    Total Current ASCAP Share:',
 '                    25%']

##### Just the song title

In [18]:
# Keep only the first newline-separated piece of the header text, i.e. the song title
test_cards[0].find('div', {'class':'card__header__title'}).text.strip().split(sep='\n')[0]

'ADAM & EVE'

##### Writers

In [19]:
# All 'writers__row' table rows inside the first song card
writers_row = test_cards[0].find_all('tr', {'class':'writers__row'})

In [34]:
# Pair each writer's name tag with the 'sup-col' cells of the same row
writer_list = []

for writer in writers_row:
    # The writer name is held in an <a> tag when present, otherwise in a <span>
    name_tag = writer.find('a')
    if not isinstance(name_tag, element.Tag):
        name_tag = writer.find('span')
    if isinstance(name_tag, element.Tag):
        writer_list.append((name_tag, writer.find_all('td', {'class':'sup-col'})))
    # Rows with neither tag are skipped: appending a bare string here would
    # break the (tag, cells) unpacking in the loop below

# One dict per writer: the name tag's attributes plus the 'pro' and 'ipi' cell text
writer_info = []

for writer, writer_ids in writer_list:
    # NOTE: b is the tag's own attrs dict (not a copy), so 'pro' and 'ipi'
    # are also written onto the tag itself
    b = writer.attrs
    b['pro'] = writer_ids[0].text.strip()
    b['ipi'] = writer_ids[1].text.strip()
    writer_info.append(b)

In [50]:
# Drop the link-only attributes from the first writer's tag.
# pop() gets a default so re-running this cell does not raise KeyError once the keys are gone.
writer_list[0][0].attrs.pop('href', None)
writer_list[0][0].attrs.pop('class', None)
writer_list[0][0].attrs

{'data-writer': 'MCKINNEY REGINALD CHRISTOPHER',
 'data-id': '652264355',
 'pro': 'ASCAP',
 'ipi': '652264355'}

In [36]:
# Each dict in this list is the same object as the matching tag's .attrs (nothing was copied
# when the list was built), so any later change to those attrs also shows up here.
writer_info

[{'href': 'javascript:void(0)',
  'class': ['name'],
  'data-writer': 'MCKINNEY REGINALD CHRISTOPHER',
  'data-id': '652264355',
  'pro': 'ASCAP',
  'ipi': '652264355'},
 {'class': ['no-click'],
  'data-writer': 'WRITER UNKNOWN  ',
  'data-id': '0',
  'pro': 'NS',
  'ipi': 'â€”'}]

##### Performers

In [73]:
# All 'creditors__list' divs inside the first song card
performers = test_cards[0].find_all('div', {'class':'creditors__list'})

In [74]:
# Text of the first creditors list with the leading/trailing newlines removed
performers[0].text.strip('\n')

'CHIEF KEEF'

Retrieve one or multiple performers per song

In [78]:
# performers[1]

IndexError: list index out of range

In [76]:
# More than two newlines in the text is taken to mean several performers, one per line;
# otherwise the text is a single performer name.
# The result is stored and placed last in the cell: an expression inside an if/else
# branch is evaluated and thrown away, so the original cell displayed nothing.
performer_text = performers[0].text
if performer_text.count('\n') > 2:
    performer_names = performer_text.strip().split(sep='\n')
else:
    performer_names = performer_text.strip('\n')
performer_names

HTML Title

In [81]:
def bs_file(file):
    '''
    Parses an html file into a beautiful soup object

    file is a path relative to the parent of the current working directory;
    it is opened in binary mode and parsed with the lxml parser.
    '''
    path = '../{}'.format(file)
    with open(path, 'rb') as handle:
        return BeautifulSoup(handle, 'lxml')

def song_sorter(bs):
    '''
    Finds every song card in a bs object and returns them

    Parameters:
        bs: parsed page exposing find_all (e.g. the object returned by bs_file)

    Returns:
        The result of bs.find_all for the expanded song-card divs; pass each
        item to get_song_credits to extract that song's details.
    '''
    # Return the cards instead of parsing each one here and discarding the result,
    # so that callers can iterate over what this function gives back.
    return bs.find_all('div', {'class':'card hyphenate card--expanded'})

def get_song_credits(card):
    '''
    Retrieves song credits from an individual song card

    Parameters:
        card: a single song card (one item from song_sorter's result)

    Returns:
        A 4-tuple (title, writers, pers, alt_titles):
        title      -- first line of the card header text
        writers    -- list of dicts, one per writer row that has an <a> or
                      <span> name tag; each holds that tag's attributes (minus
                      'href' and 'class') plus 'pro' and 'ipi' read from the
                      row's 'sup-col' cells (None when a cell is missing)
        pers       -- text of the first 'creditors__list' div: a list of lines
                      when it contains more than two newlines, otherwise a
                      single string; None when the card has no such div
        alt_titles -- text of the second 'creditors__list' div, split into a
                      list of lines when it contains a newline; None when
                      there is no second div
    '''
    # Everything is read from the card passed in, not from a notebook-level test card
    header = card.find('div', {'class':'card__header__title'})
    title = header.text.strip().split(sep='\n')[0]

    writers = []
    for row in card.find_all('tr', {'class':'writers__row'}):
        # The writer name sits in an <a> when present, otherwise in a <span>
        name_tag = row.find('a')
        if name_tag is None:
            name_tag = row.find('span')
        if name_tag is None:
            # Nothing to record for this row; skip it
            continue

        # Copy the attributes so the parsed card itself is left untouched, and
        # filter out the extraneous fields instead of popping them: a <span>
        # name tag has no 'href', so an unconditional pop would raise KeyError
        writer = {key: value for key, value in name_tag.attrs.items()
                  if key not in ('href', 'class')}
        id_cells = row.find_all('td', {'class':'sup-col'})
        writer['pro'] = id_cells[0].text.strip() if len(id_cells) > 0 else None
        writer['ipi'] = id_cells[1].text.strip() if len(id_cells) > 1 else None
        writers.append(writer)

    credit_lists = card.find_all('div', {'class':'creditors__list'})

    # Grabbing performers
    if len(credit_lists) > 0:
        performer_text = credit_lists[0].text
        if performer_text.count('\n') > 2:
            pers = performer_text.strip().split(sep='\n')
        else:
            pers = performer_text.strip('\n')
    else:
        pers = None

    # Grabbing alternate titles
    if len(credit_lists) > 1:
        alt_text = credit_lists[1].text
        if alt_text.count('\n'):
            alt_titles = alt_text.strip().split(sep='\n')
        else:
            alt_titles = alt_text.strip('\n')
    else:
        alt_titles = None

    return title, writers, pers, alt_titles

In [82]:
def get_html_songs(html_list):
    '''
    Builds one dict per song found across the given html files

    Each file is parsed with bs_file, its songs are taken from song_sorter, and
    every song's credits (from get_song_credits) are stored under the keys
    'Title', 'Writers', 'Performers' and 'Alt-Titles'.
    Returns the list of those dicts.
    '''
    collected = []
    for html_path in html_list:
        soup = bs_file(html_path)
        for card in song_sorter(soup):
            title, writers, performers, alt_titles = get_song_credits(card)
            collected.append({
                'Title': title,
                'Writers': writers,
                'Performers': performers,
                'Alt-Titles': alt_titles,
            })
    return collected