In [22]:
import pandas as pd
import re
from datetime import datetime as dt

# Inclusive (first_year, last_year) window of rows to keep.
# Set to None (or any falsy value) to disable the year filter entirely.
YEAR_RANGE = (1900, 1950)

def remove(row):
    """Decide whether a spreadsheet row should be dropped from the dataset.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide the keys 'Date', 'Exclude from visualization',
        'Unsure whether drag artist', 'City', 'Performer' and 'Venue'.

    Returns
    -------
    bool
        True when the row should be removed, False when it should be kept.

    A row is removed when any of the following holds:
    * its 'Date' is not a string in ``YYYY-MM-DD`` format;
    * ``YEAR_RANGE`` is set and the year falls outside it (inclusive bounds);
    * either of the two exclusion flags is truthy;
    * city, performer and venue are all empty strings;
    * the performer contains the word "unnamed" (case-insensitive).
    """
    try:
        date = dt.strptime(row['Date'], '%Y-%m-%d')
    except (TypeError, ValueError):
        # Empty, malformed or non-string date: the row cannot be placed in time.
        return True

    # Only filter on year when a range is configured. Previously a falsy
    # YEAR_RANGE fell into the `else` branch and removed every single row.
    if YEAR_RANGE and not (YEAR_RANGE[0] <= date.year <= YEAR_RANGE[1]):
        return True

    if row['Exclude from visualization'] or row['Unsure whether drag artist']:
        return True

    performer = row['Performer']
    if row['City'] == '' and performer == '' and row['Venue'] == '':
        return True

    return 'unnamed' in performer.lower()

def extract_addresses_dict(normalized_df):
    """Build a ``{venue: {source: address}}`` lookup from rows that have an address.

    Parameters
    ----------
    normalized_df : pandas.DataFrame
        Needs the columns 'Source', 'Venue' and 'Address'; empty cells are
        expected to be empty strings.

    Returns
    -------
    dict
        Maps each non-empty venue to a dict of ``source -> address``. When the
        same venue/source pair occurs more than once, the first address wins.

    Rows that carry an address but no venue name cannot be keyed; they are
    reported on stdout as a bulleted warning list (one address per line).
    """
    with_address = normalized_df[normalized_df['Address'] != '']
    addresses = {}
    unnamed = []
    columns = (with_address['Source'], with_address['Venue'], with_address['Address'])
    for source, venue, address in zip(*columns):
        if venue == '':
            unnamed.append(address)
            continue
        # setdefault keeps the first address seen for a venue/source pair.
        addresses.setdefault(venue, {}).setdefault(source, address)

    if unnamed:
        print(f'Warning: {len(unnamed)} Venues with no names have addresses:')
        # One bullet per line (the previous '- ' separator ran them together).
        print('- ' + '\n- '.join(unnamed))

    return addresses

def reverse_comment_dict(comment_dict):
    """Invert the inner level of a ``{key: {source: comment}}`` mapping.

    Returns ``{key: {comment: [source, ...]}}`` so that sources sharing the
    same comment text are grouped together. Every key of the input appears in
    the result, even when it has no comments (its value is then ``{}``).
    """
    inverted = {}
    for key, by_source in comment_dict.items():
        by_comment = inverted.setdefault(key, {})
        for source, comment in by_source.items():
            by_comment.setdefault(comment, []).append(source)
    return inverted
        
def get_comments(df, comment_field='Comment on edge: revue', match_field='Revue', transform=None):
    """Gather comments from ``comment_field`` keyed by ``match_field`` and source.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'Date', 'Source', ``match_field`` and
        ``comment_field``; empty cells are expected to be empty strings.
    comment_field : str
        Column holding the comment text.
    match_field : str
        Column whose value becomes the outer key of the result.
    transform : callable, optional
        Applied to each stripped comment string before it is stored.

    Returns
    -------
    dict
        ``{match_value: {source: comment}}``; the first comment seen for a
        match/source pair is kept. Comments whose ``match_field`` is empty are
        not stored but listed (truncated to 40 characters) in a stdout warning.
    """
    commented = df[df[comment_field] != '']
    collected = {}
    orphaned = []

    columns = (
        commented['Date'],
        commented['Source'],
        commented[match_field],
        commented[comment_field],
    )
    for _date, source, match, raw_comment in zip(*columns):
        text = str(raw_comment).strip()
        if transform:
            text = transform(text)

        if match == '':
            orphaned.append(str(text)[:40] + '...')
            continue

        # First comment for a given match/source pair wins.
        collected.setdefault(match, {}).setdefault(source, text)

    if orphaned:
        print(f'Warning: {len(orphaned)} mentions in `{comment_field}` with no value have comments:')
        print('- ' + '\n- '.join(orphaned))

    return collected

def get_revue_comments_dict(df):
    """Collect the 'Comment on edge: revue' notes keyed by 'Revue', then by source."""
    return get_comments(
        df,
        comment_field='Comment on edge: revue',
        match_field='Revue',
    )

def get_performer_comments_dict(df):
    """Collect the 'Comment on node: performer' notes keyed by 'Performer', then by source."""
    return get_comments(
        df,
        comment_field='Comment on node: performer',
        match_field='Performer',
    )

def get_venue_comments_dict(df):
    """Collect the 'Comment on node: venue' notes keyed by 'Venue', then by source."""
    return get_comments(
        df,
        comment_field='Comment on node: venue',
        match_field='Venue',
    )

def get_city_comments_dict(df):
    """Collect the 'Comment on node: city' notes keyed by 'City', then by source."""
    return get_comments(
        df,
        comment_field='Comment on node: city',
        match_field='City',
    )

def get_true_value(row, type):
    """Return the preferred (normalized) value of one field of a row.

    Parameters
    ----------
    row : mapping-like (dict or pandas Series)
        Spreadsheet row; empty cells are expected to be empty strings.
    type : str
        One of 'source', 'performer', 'city', 'revue' or 'venue'.

    Returns
    -------
    The normalized column's value when it is non-empty, otherwise the raw
    column's value. For 'performer' there is an intermediate fallback: when
    both first and last name are filled in, they are joined with a space.

    Raises
    ------
    NotImplementedError
        If ``type`` is not one of the supported field names.
    """
    if type == 'performer':
        if row['Normalized performer'] != '':
            return row['Normalized performer']
        first, last = row['Performer first-name'], row['Performer last-name']
        if first != '' and last != '':
            # This branch used to return row['Normalized performer'], which is
            # known to be '' here, so the performer's name was silently lost.
            return f'{first} {last}'
        return row['Performer']

    # field type -> (normalized column, raw fallback column)
    columns = {
        'source': ('Source clean', 'Source'),
        'city': ('Normalized City', 'City'),
        'revue': ('Normalized Revue Name', 'Revue name'),
        'venue': ('Normalized Venue', 'Venue'),
    }
    pair = columns.get(type) if isinstance(type, str) else None
    if pair is None:
        raise NotImplementedError(f'type `{type}` is not yet implemented')

    normalized_column, raw_column = pair
    if row[normalized_column] != '':
        return row[normalized_column]
    return row[raw_column]

def find_ref(row, eima=True):
    """Extract 7-10 digit reference ids from a row's source-related columns.

    The text of 'Source', 'EIMA', 'Search (newspapers.com)' and 'Source clean'
    is concatenated. The row counts as an EIMA row when that text mentions
    "eima", "variety" or "billboard" (case-insensitive).

    Parameters
    ----------
    row : mapping-like (dict or pandas Series)
    eima : bool
        True to return ids only for EIMA rows, False to return ids only for
        rows that are *not* EIMA rows.

    Returns
    -------
    str
        The distinct ids joined with '|' in sorted order, or '' when there are
        no ids or the row does not belong to the requested group.
    """
    columns = ('Source', 'EIMA', 'Search (newspapers.com)', 'Source clean')
    # str() keeps numeric cells (e.g. a bare id parsed as a number) from
    # breaking the concatenation.
    source = ' '.join(str(row[column]) for column in columns)

    lowered = source.lower()
    is_eima = any(marker in lowered for marker in ('eima', 'variety', 'billboard'))

    # Sorted so the joined value is stable across runs; iteration order of a
    # set of strings changes with hash randomization.
    refs = sorted(set(re.findall(r'(\d{7,10})', source)))

    if refs and bool(eima) == is_eima:
        return '|'.join(refs)
    return ''
    

# Load the published sheet as CSV, turn missing cells into empty strings and
# blank out cells whose whole value is one of the placeholder markers.
df = (
    pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vT0E0Y7txIa2pfBuusA1cd8X5OVhQ_D0qZC8D40KhTU3xB7McsPR2kuB7GH6ncmNT3nfjEYGbscOPp0/pub?gid=0&single=true&output=csv')
    .fillna('')
    .replace('—', '')
    .replace('—*', '')
)


In [23]:
# Extract the reference ids first: find_ref reads the raw 'Source' column,
# which is overwritten with its normalized value further down.
df['EIMA'] = df.apply(find_ref, axis=1)
df['Newspapers.com'] = df.apply(find_ref, axis=1, args=(False,))

# Replace each display column with its preferred (normalized) value.
df['Source'] = df.apply(get_true_value, axis=1, args=('source',))
df['Venue'] = df.apply(get_true_value, axis=1, args=('venue',))
df['Performer'] = df.apply(get_true_value, axis=1, args=('performer',))
df['City'] = df.apply(get_true_value, axis=1, args=('city',))
df['Revue'] = df.apply(get_true_value, axis=1, args=('revue',))


In [24]:
# Flag every row with the `remove` predicate, then discard the flagged rows.
df['remove'] = df.apply(remove, axis=1)
df = df.drop(index=df.index[df['remove'] == True])

In [27]:
# Distinct source citations, and the subset that mentions Billboard.
sources = list(set(df['Source']))
billboards = [source for source in sources if 'billboard' in source.lower()]

In [1]:
# Load up the billboard and the variety

In [6]:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys

# Landing page of the EIMA database, reached through the library proxy.
EIMA_URL = 'https://www-proquest-com.proxy.library.nyu.edu/eima'

# Reuse the browser from an earlier run of this cell when possible. If `b` is
# not defined yet (NameError) or the previous session no longer responds, start
# a fresh Firefox instead. `Exception` rather than a bare `except` so that
# KeyboardInterrupt / SystemExit still stop the cell.
try:
    b.get(EIMA_URL)
except Exception:
    b = webdriver.Firefox()
    b.get(EIMA_URL)

In [7]:
# Block the notebook until Enter is pressed. The prompt text suggests this is
# where the operator completes the proxy login in the browser window
# (assumption — confirm).
input('login')

login


''

In [119]:
import time, random, json
from pathlib import Path

In [38]:
# Display the Billboard citations that were filtered out of `sources`.
billboards

['Billboard, October 11, 1947, 91',
 'Billboard, June 21, 1930, 16 (Vaudeville Reviews: The Palace, New York)',
 'Billboard, March 28, 1936, page unknown (Route Department)',
 'Billboard, July 5, 1947, 45',
 'Billboard, January 27, 1930, 49',
 'Billboard, August 1, 1936, 19 (Club Chatter)',
 'Billboard, May 27, 1944, page unknown',
 'Billboard, November 19, 1938, 18',
 'Billboard, September 10, 1932, 7 (Collier Returning to America)',
 'Billboard, January 17, 1948, 73',
 'Billboard, August 31, 1935, 14 (Round the Tables)',
 'Billboard, May 29, 1937, 37 (Club Chatter)',
 'Billboard, August 28, 1948, 48 (Uno)',
 'Billboard, May 14, 1938, 18',
 'Billboard, December 5, 1936, 14 (Club Chatter)',
 'Billboard, March 11, 1944, 25 (Advertisement)',
 'Billboard, January 27, 1940, page unknown',
 'Billboard, July 17, 1948, 91',
 "Billboard, June 7, 1930, 16 (VAUDEVILLE REVIEWS: Loew's State, New York)",
 'Billboard, January 1, 1938, 29',
 'Billboard, September 4, 1937, 14',
 'Billboard, March 17,

In [190]:
# Point the shared browser `b` at EIMA publication 686483 — presumably the
# Billboard publication page whose year/month/issue selectors the scraping
# loop drives (TODO confirm the publication id).
b.get('https://www-proquest-com.proxy.library.nyu.edu/eima/publication/686483')

In [204]:
def get_current_data():
    """Load the scraping progress stored in ``billboard-data.json``.

    Returns the decoded JSON content, or a fresh ``{'_investigate': []}``
    structure when the file does not exist yet. The path is relative to the
    current working directory.
    """
    store = Path('billboard-data.json')
    try:
        raw = store.read_text()
    except FileNotFoundError:
        return {'_investigate': []}
    return json.loads(raw)
    
def put_current_data(data):
    """Serialize ``data`` as JSON and overwrite ``billboard-data.json`` with it.

    The path is relative to the current working directory.
    """
    serialized = json.dumps(data)
    store = Path('billboard-data.json')
    store.write_text(serialized)
    
    
# For every Billboard citation that mentions a year from 1930 to 1940, drive
# the browser `b` through the publication's year / month / issue selectors,
# page through the issue's result list until the cited page is found, and store
# the resulting link in billboard-data.json under data[year][month][day][page].
# Citations that cannot be resolved are appended to data['_investigate'] so
# later runs skip them.
#
# NOTE(review): the year filter is a substring test on the whole citation, so
# it also matches a year that appears anywhere else in the string.
# NOTE(review): the `find_element_by_*` / `find_elements_by_*` helpers used
# below are the Selenium 3 locator API; newer Selenium releases expect
# `find_element(By.<KIND>, value)` instead — confirm the installed version.
for issue in [x for x in billboards if '1930' in x or '1931' in x or '1932' in x or '1933' in x or '1934' in x or '1935' in x or '1936' in x or '1937' in x or '1938' in x or '1939' in x or '1940' in x]:
    # Drop the trailing parenthetical (section / article title) from the citation.
    issue = issue.split('(')[0].strip()
    
    skip_ahead = False
    # Progress is re-read from disk on every iteration, so in-memory changes
    # that were not written with put_current_data() are discarded.
    data = get_current_data()
    
    print()
    print(issue)

    if issue in data['_investigate']:
        print('issue in investigation!')
        skip_ahead = True
        
    if not skip_ahead:
        print('moving ahead...')

        # Expected shape after the filter: ['<Month> <day>', '<year>', '<page>'].
        # Only an element that is exactly 'Billboard' is removed; a citation
        # starting with 'The Billboard' keeps four elements and falls into the
        # silent `else: continue` at the bottom.
        elements = [x.strip() for x in issue.split(',') if x != 'Billboard']
        if len(elements) == 3:
            try:
                month, day = elements[0].split(' ')
                year = elements[1]
                if int(year) < 1929 or int(year) > 1940:
                    print(f'---> skipping: date not within range ({year})')
                    skip_ahead = True
                    continue

                # Unreachable as written: every branch that sets skip_ahead
                # inside this try block also executes `continue`.
                if skip_ahead:
                    continue

                page = elements[2]
                # `issue` was already cut at its first '(' above, so this
                # branch cannot trigger for the values produced here.
                if '(' in page:
                    page = [x.strip() for x in page.split('(')][0]
                # Three-letter month prefix, used to match the issue option text.
                month_short = month[:3]

                if 'page unknown' in page:
                    # Without a page number there is nothing to search for;
                    # record the citation so later runs skip it.
                    data['_investigate'].append(issue)
                    print(f'---> skipping: unknown page')
                    put_current_data(data)
                    skip_ahead = True
                    continue

                if skip_ahead:
                    continue

                # Make sure the nested year -> month -> day -> page slots exist.
                if not year in data:
                    data[year] = {}

                if not month in data[year]:
                    data[year][month] = {}

                if not day in data[year][month]:
                    data[year][month][day] = {}

                # Placeholder until the link is found; it only reaches disk if
                # put_current_data() is called later in this iteration.
                if not page in data[year][month][day]:
                    data[year][month][day][page] = None

                print(f'looking for {year}, {month}, {day}')
                # Pick the year; on any error wait five seconds and retry once.
                # An error in the retry propagates to the outer handler.
                try:
                    select = Select(b.find_element_by_id('yearSelected'))
                    select.select_by_visible_text(year)
                    time.sleep(2)
                except:
                    time.sleep(5)
                    select = Select(b.find_element_by_id('yearSelected'))
                    select.select_by_visible_text(year)
                    time.sleep(2)

                # Pick the month, with the same single retry.
                try:
                    select = Select(b.find_element_by_id('monthSelected'))
                    select.select_by_visible_text(month)
                    time.sleep(2)
                except:
                    time.sleep(5)
                    select = Select(b.find_element_by_id('monthSelected'))
                    select.select_by_visible_text(month)
                    time.sleep(2)

                # Collect the issue options whose text contains '<Mon> <day>,'.
                select_issue = []
                try:
                    select = Select(b.find_element_by_id('issueSelected'))
                    select_issue = [x.text for x in select.options if f'{month_short} {day},' in x.text]
                except:
                    time.sleep(5)
                    select = Select(b.find_element_by_id('issueSelected'))
                    select_issue = [x.text for x in select.options if f'{month_short} {day},' in x.text]

                # NOTE(review): when more than one option matches, select_issue
                # stays a list and is passed as-is to select_by_visible_text
                # below — presumably that raises and lands in the outer
                # `except Exception`; confirm and handle explicitly.
                if len(select_issue) == 1:
                    select_issue = select_issue[0]
                elif len(select_issue) == 0:
                    data['_investigate'].append(issue)
                    print(f'---> skipping: unknown issue')
                    put_current_data(data)
                    skip_ahead = True
                    continue

                if skip_ahead:
                    continue
                select.select_by_visible_text(select_issue)
                time.sleep(1)
                b.find_element_by_css_selector('input[value="View issue"]').click()

                # Fixed wait for the issue's result list to load.
                time.sleep(10)

                # Jump to the first result page so the search below starts from
                # the beginning of the issue.
                try:
                    b.find_element_by_css_selector('a[title="First page"]').click()
                    time.sleep(10)
                except:
                    if len([x.text for x in b.find_elements_by_css_selector('li.disabled') if 'First' in x.text]):
                        pass # we're already on the first page
                    else:
                        continue # skipping...

                done = False

                print(f'browsing for {page}')
                # Advance through the result pages until some result item's
                # 'titleAuthorETC' text ends with ': <page>.'.
                # NOTE(review): there is no explicit stop at the last page; the
                # loop presumably ends through an exception from the
                # "Next page" lookup, which the outer handler reports as a
                # failure — confirm.
                while not done:
                    if len({x.find_element_by_class_name('titleAuthorETC').text: x for x in b.find_elements_by_class_name('resultItem') if x.find_element_by_class_name('titleAuthorETC').text.endswith(': ' + page + '.')}):
                        done = True
                    else:
                        b.find_element_by_css_selector('a[title="Next page"]').click()
                        time.sleep(10)

                # Same query again, this time keeping the matching elements.
                found = {x.find_element_by_class_name('titleAuthorETC').text: x for x in b.find_elements_by_class_name('resultItem') if x.find_element_by_class_name('titleAuthorETC').text.endswith(': ' + page + '.')}

                # Take the href of the fourth <a> inside the first matching
                # result item (position-based; which link that is depends on
                # the page markup — TODO confirm).
                link = found[list(found.keys())[0]].find_elements_by_tag_name('a')[3].get_attribute('href')

                data[year][month][day][page] = link

                put_current_data(data)
            except Exception as e:
                # Best effort: report the failure and move on. Nothing is
                # written for this citation, so a later run will try it again.
                print(issue, 'failed...', e)
                pass
        else:
            #print('-----> skipping because cannot interpret')
            #print(elements)
            continue # skipping


Billboard, June 21, 1930, 16
issue in investigation!

Billboard, March 28, 1936, page unknown
issue in investigation!

Billboard, January 27, 1930, 49
issue in investigation!

Billboard, August 1, 1936, 19
issue in investigation!

Billboard, November 19, 1938, 18
issue in investigation!

Billboard, September 10, 1932, 7
issue in investigation!

Billboard, August 31, 1935, 14
issue in investigation!

Billboard, May 29, 1937, 37
issue in investigation!

Billboard, May 14, 1938, 18
issue in investigation!

Billboard, December 5, 1936, 14
issue in investigation!

Billboard, January 27, 1940, page unknown
issue in investigation!

Billboard, June 7, 1930, 16
issue in investigation!

Billboard, January 1, 1938, 29
issue in investigation!

Billboard, September 4, 1937, 14
issue in investigation!

Billboard, March 17, 1937, 49
issue in investigation!

Billboard, January 22, 1938, 8
issue in investigation!

Billboard, March 8, 1930, 51
issue in investigation!

Billboard, January 19, 1935, 12
is

Billboard, March 6, 1937, 13 failed... Message: Element <input id="searchPubIssues_330e11b0fb1ed" class="btn btn-default" name="searchPubIssues" type="submit"> could not be scrolled into view


Billboard, April 6, 1935, 13
issue in investigation!

Billboard, July 3, 1937, 14
issue in investigation!

Billboard, May 1, 1937, 14
issue in investigation!

Billboard, October 8, 1932, 22
issue in investigation!

Billboard, October 17, 1936, 13
issue in investigation!

Billboard, July 27, 1940, 23
issue in investigation!

The Billboard, Apr 11, 1936, 27
moving ahead...

Billboard, November 6, 1937, 24
issue in investigation!

Billboard, November 22, 1930, 19
issue in investigation!

Billboard, October 2, 1937, 14
issue in investigation!

Billboard, April 4, 1936, page unknown
issue in investigation!

Billboard, March 30, 1935, 4
issue in investigation!

Billboard, October 10, 1936, 16
issue in investigation!

Billboard, September 11, 1937, 14
issue in investigation!

Billboard, March 20, 1937,