In [1]:
import pandas as pd
import re
from datetime import datetime as dt

# Inclusive (first_year, last_year) window of performance dates to keep.
# A falsy value (e.g. None) disables the year filter.
YEAR_RANGE = (1900, 1950)

def remove(row):
    """Decide whether a spreadsheet row should be dropped from the dataset.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide the keys 'Date', 'Exclude from visualization',
        'Unsure whether drag artist', 'City', 'Performer' and 'Venue'.

    Returns
    -------
    bool
        True when the row should be removed, which is the case when
        * 'Date' is missing or is not a 'YYYY-MM-DD' string,
        * YEAR_RANGE is set and the year falls outside it,
        * either exclusion flag is truthy,
        * city, performer and venue are all empty strings, or
        * the performer name contains 'unnamed' (case-insensitive).
        False otherwise.
    """
    try:
        date = dt.strptime(row['Date'], '%Y-%m-%d')
    except (KeyError, TypeError, ValueError):
        # Missing, non-string or malformed date: the row cannot be placed in time.
        return True

    # Only filter on the year when a range is actually configured; previously a
    # falsy YEAR_RANGE fell through to the `else` branch and removed every row.
    if YEAR_RANGE and not (YEAR_RANGE[0] <= date.year <= YEAR_RANGE[1]):
        return True

    if row['Exclude from visualization'] or row['Unsure whether drag artist']:
        return True

    performer = row['Performer']
    if row['City'] == '' and performer == '' and row['Venue'] == '':
        return True

    return 'unnamed' in performer.lower()

def extract_addresses_dict(normalized_df):
    """Collect venue addresses from the rows that have one.

    Parameters
    ----------
    normalized_df : pandas.DataFrame
        Must contain the columns 'Address', 'Venue' and 'Source'.

    Returns
    -------
    dict
        ``{venue: {source: address}}``. For each (venue, source) pair the first
        address encountered wins; later ones are ignored.

    Rows whose 'Address' is the empty string are skipped. Rows that have an
    address but an empty 'Venue' are not returned; they are reported on stdout,
    one address per line.
    """
    with_address = normalized_df[normalized_df['Address'] != '']
    addresses = {}
    unnamed = []
    for venue, source, address in zip(with_address['Venue'], with_address['Source'], with_address['Address']):
        if venue == '':
            # str() so that a non-string cell cannot break the join below
            unnamed.append(str(address))
            continue
        addresses.setdefault(venue, {}).setdefault(source, address)

    if unnamed:
        print(f'Warning: {len(unnamed)} Venues with no names have addresses:')
        # newline-separated bullets, matching the warning layout of get_comments
        print('- ' + '\n- '.join(unnamed))

    return addresses

def reverse_comment_dict(comment_dict):
    """Invert the inner level of a ``{key: {source: comment}}`` mapping.

    Returns ``{key: {comment: [source, ...]}}`` so that sources sharing the same
    comment text are grouped together. Sources appear in the order in which
    they are iterated; a key with no comments maps to an empty dict.
    """
    inverted = {}
    for performer, by_source in comment_dict.items():
        by_comment = inverted.setdefault(performer, {})
        for source, comment in by_source.items():
            by_comment.setdefault(comment, []).append(source)
    return inverted
        
def get_comments(df, comment_field='Comment on edge: revue', match_field='Revue', transform=None):
    """Gather the comments of one comment column, grouped by a second column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Date', 'Source', `match_field` and `comment_field`.
    comment_field : str
        Column holding the comment text; rows where it equals '' are ignored.
    match_field : str
        Column whose value becomes the outer key of the result.
    transform : callable, optional
        Applied to each stripped comment string before it is stored.

    Returns
    -------
    dict
        ``{match value: {source: comment}}``; the first comment seen for a
        (match value, source) pair is kept.

    Comments on rows whose `match_field` is '' are left out of the result and
    listed on stdout instead, truncated to 40 characters.
    """
    commented = df[df[comment_field] != '']
    columns = (
        commented['Date'],
        commented['Source'],
        commented[match_field],
        commented[comment_field],
    )

    collected = {}
    orphans = []
    for _date, source, match, raw_comment in zip(*columns):
        comment = str(raw_comment).strip()
        if transform:
            comment = transform(comment)

        if match == '':
            orphans.append(str(comment)[:40] + '...')
            continue

        collected.setdefault(match, {}).setdefault(source, comment)

    if orphans:
        print(f'Warning: {len(orphans)} mentions in `{comment_field}` with no value have comments:')
        print('- ' + '\n- '.join(orphans))

    return collected

def get_revue_comments_dict(df):
    """Return get_comments() for the 'Comment on edge: revue' column, keyed by 'Revue'."""
    return get_comments(df, comment_field='Comment on edge: revue', match_field='Revue')

def get_performer_comments_dict(df):
    """Return get_comments() for the 'Comment on node: performer' column, keyed by 'Performer'."""
    return get_comments(df, comment_field='Comment on node: performer', match_field='Performer')

def get_venue_comments_dict(df):
    """Return get_comments() for the 'Comment on node: venue' column, keyed by 'Venue'."""
    return get_comments(df, comment_field='Comment on node: venue', match_field='Venue')

def get_city_comments_dict(df):
    """Return get_comments() for the 'Comment on node: city' column, keyed by 'City'."""
    return get_comments(df, comment_field='Comment on node: city', match_field='City')

def get_true_value(row, type):
    """Return the preferred value of one kind for a row: normalized if present, raw otherwise.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Spreadsheet row; must contain the columns used for the requested kind.
    type : str
        One of 'source', 'performer', 'city', 'revue' or 'venue'.

    Returns
    -------
    The normalized column's value when it is not the empty string, otherwise
    the raw column's value. For 'performer' there is an intermediate step: when
    no normalized name exists but both 'Performer first-name' and
    'Performer last-name' are filled in, the name is built as "first last".

    Raises
    ------
    NotImplementedError
        If `type` is not one of the supported kinds.
    """
    # kind -> (normalized column, raw fallback column)
    columns = {
        'source': ('Source clean', 'Source'),
        'performer': ('Normalized performer', 'Performer'),
        'city': ('Normalized City', 'City'),
        'revue': ('Normalized Revue Name', 'Revue name'),
        'venue': ('Normalized Venue', 'Venue'),
    }
    if type not in columns:
        raise NotImplementedError(f'type `{type}` is not yet implemented')

    normalized_column, fallback_column = columns[type]
    if row[normalized_column] != '':
        return row[normalized_column]

    if type == 'performer':
        first, last = row['Performer first-name'], row['Performer last-name']
        if first != '' and last != '':
            # This branch used to return row['Normalized performer'], which is
            # always '' here, so rows with split names lost their performer.
            return f'{first} {last}'

    return row[fallback_column]

def find_ref(row, eima=True):
    """Extract the 7-10 digit reference numbers mentioned in a row's source columns.

    The 'Source', 'EIMA', 'Search (newspapers.com)' and 'Source clean' cells are
    joined into one text. That text counts as an EIMA citation when it contains
    'eima', 'variety' or 'billboard' (case-insensitive).

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must contain the four columns named above.
    eima : bool, default True
        When truthy, references are returned only for EIMA citations; when
        falsy, only for citations that are not EIMA.

    Returns
    -------
    str
        The distinct reference numbers joined with '|', in order of first
        appearance, or '' when there is none or the citation kind does not
        match `eima`.
    """
    # str() keeps the concatenation from failing on non-string cells
    source = ' '.join(
        str(row[column])
        for column in ('Source', 'EIMA', 'Search (newspapers.com)', 'Source clean')
    )
    lowered = source.lower()
    is_eima = any(marker in lowered for marker in ('eima', 'variety', 'billboard'))

    # dict.fromkeys de-duplicates while keeping first-seen order; the previous
    # list(set(...)) produced an order that could differ between runs.
    refs = list(dict.fromkeys(re.findall(r'\d{7,10}', source)))

    if refs and bool(eima) == is_eima:
        return '|'.join(refs)
    return ''
    

# Load the published Google Sheet as CSV, turn missing cells into '' and blank
# out cells that consist solely of the placeholder '—' or '—*'.
df = (
    pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vT0E0Y7txIa2pfBuusA1cd8X5OVhQ_D0qZC8D40KhTU3xB7McsPR2kuB7GH6ncmNT3nfjEYGbscOPp0/pub?gid=0&single=true&output=csv')
    .fillna('')
    .replace('—', '')
    .replace('—*', '')
)


In [2]:
# Reference numbers first: EIMA citations, then everything that is not EIMA.
df['EIMA'] = df.apply(find_ref, axis=1)
df['Newspapers.com'] = df.apply(find_ref, axis=1, args=(False,))

# Then overwrite / create the working columns with their preferred values
# (normalized where available, raw otherwise).
df['Source'] = df.apply(get_true_value, axis=1, args=('source',))
df['Venue'] = df.apply(get_true_value, axis=1, args=('venue',))
df['Performer'] = df.apply(get_true_value, axis=1, args=('performer',))
df['City'] = df.apply(get_true_value, axis=1, args=('city',))
df['Revue'] = df.apply(get_true_value, axis=1, args=('revue',))


In [3]:
# Flag every row with remove(), then drop the flagged rows by index label.
df['remove'] = df.apply(remove, axis=1)
df = df.drop(index=df.loc[df['remove'] == True].index)

In [4]:
# Distinct source citations (unordered).
sources = list(set(df['Source']))

In [5]:
# Patterns used to pick apart the free-text source citations.

# A run of 6 to 10 consecutive digits anywhere in the text.
# NOTE(review): find_ref() looks for 7-10 digits instead — confirm which lower bound is intended.
ID_NUMBERS = re.compile(r'\d{6,10}')
# "Month D, YYYY" with the English month name spelled out; groups are (month, day, year).
DATES = re.compile(r'(January|February|March|April|May|June|July|August|September|October|November|December) (\d{1,2}), (\d{4})')
# A citation ending in ", N" where N has one or two digits; group 1 is N.
# Three-digit trailing numbers do not match.
ENDS_WITH_PAGE = re.compile(r', (\d{1,2})$')


In [6]:
# Citations that carry no ID number ...
sources_without_id = [source for source in sources if ID_NUMBERS.search(source) is None]

# ... and, among those, the ones containing a spelled-out date, sorted alphabetically.
sources_with_date = sorted(source for source in sources_without_id if DATES.search(source))

# Text in front of the date, minus its last two characters.
papers = [DATES.split(source)[0][:-2] for source in sources_with_date]

# (year, month name, day) as strings, reordered from the pattern's (month, day, year).
dates = [
    (year, month, day)
    for month, day, year in (DATES.search(source).groups() for source in sources_with_date)
]

In [7]:

# One entry per dated source: the ENDS_WITH_PAGE match object, or None when the
# citation does not end in a page number.
clear_page_number = list(map(ENDS_WITH_PAGE.search, sources_with_date))

In [8]:
# NOTE(review): this name is not referenced by any other cell visible here — the
# scraping loop reads and writes 'data.json' instead. Confirm whether it is still needed.
source_data_file = 'sources-data.json'

In [9]:
from selenium.common.exceptions import NoSuchElementException

In [10]:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys

# Reuse the browser `b` left over from an earlier run of this cell when that
# works; if `b` does not exist yet or the call fails, start a new Firefox.
# `except Exception` (rather than a bare `except`) lets KeyboardInterrupt and
# SystemExit through, so interrupting the cell does not spawn another browser.
try:
    b.get('http://www.newspapers.com')
except Exception:
    b = webdriver.Firefox()
    b.get('http://www.newspapers.com')

In [11]:
import json, time
from pathlib import Path

In [None]:
# Upper bound on how many (newspaper, date, page) entries the loop in this cell iterates over.
LIMIT = 4000

def get_current_variables(path='data.json'):
    """Load the scraper's checkpoint from a JSON file.

    Parameters
    ----------
    path : str or Path, default 'data.json'
        Location of the checkpoint file.

    Returns
    -------
    dict
        Always contains the keys 'not_found', 'firstpage_urls',
        'newspaper_urls', 'page_urls', 'ambiguous' and 'no_link'. Keys missing
        from the file get an empty list/dict, so a checkpoint written before a
        key existed still loads. When the file does not exist, a notice is
        printed and the empty state is returned.
    """
    state = {
        'not_found': [],
        'firstpage_urls': {},
        'newspaper_urls': {},
        'page_urls': {},
        'ambiguous': [],
        'no_link': {}
    }
    try:
        state.update(json.loads(Path(path).read_text()))
    except FileNotFoundError:
        print('could not load JSON so returning empty')
    return state
    
def set_current_variables(not_found, firstpage_urls, newspaper_urls, page_urls, ambiguous, no_link, path='data.json'):
    """Write the scraper's checkpoint to a JSON file.

    Parameters
    ----------
    not_found, firstpage_urls, newspaper_urls, page_urls, ambiguous, no_link
        The six pieces of state; each is stored under the key of the same name
        and must be JSON-serializable.
    path : str or Path, default 'data.json'
        Location of the checkpoint file.

    The JSON is written to a sibling '<name>.tmp' file first and then moved
    over the target, so an interruption in the middle of the write does not
    leave a truncated checkpoint behind.
    """
    data = {
        'not_found': not_found,
        'firstpage_urls': firstpage_urls,
        'newspaper_urls': newspaper_urls,
        'page_urls': page_urls,
        'ambiguous': ambiguous,
        'no_link': no_link
    }
    target = Path(path)
    scratch = target.with_name(target.name + '.tmp')
    scratch.write_text(json.dumps(data))
    scratch.replace(target)


# Walk over the dated citations and, for each one, drive the browser `b` to the
# matching newspapers.com page, recording what was found in data.json:
#   newspaper_urls  paper name -> browse URL of the paper
#   firstpage_urls  paper name -> date -> URL reached after picking that date
#   page_urls       paper name -> date -> page number -> URL of that page
#   not_found       paper names the site search returned nothing for
#   ambiguous       paper names the site search could not pin to one result
#   no_link         paper name -> date -> True where the calendar day has no link
#
# NOTE(review): the find_element_by_* helpers used below were removed in
# Selenium 4.3; this cell needs an older Selenium or a port to find_element(By...).

# Inclusive range of years this pass is restricted to.
SCRAPE_YEAR_RANGE = (1930, 1939)

jobs = list(zip(papers, dates, clear_page_number))
count = len(jobs)
counter = 0

for newspaper, date, page in jobs[:LIMIT]:
    counter += 1
    if newspaper.startswith("*") or newspaper.startswith("-") or 'date unknown' in newspaper:
        # not a usable newspaper name
        continue

    date_as_string = f'{date[0]}-{date[1]}-{date[2]}'
    # drop a two-capital-letter token surrounded by spaces from the paper name
    newspaper_clean = re.sub(r' [A-Z][A-Z] ', ' ', newspaper)

    # Load the checkpoint once, BEFORE the skip checks. These names used to be
    # assigned only further down, so the checks below ran against the previous
    # iteration's newspaper and state (or against undefined names on a fresh
    # kernel), with the resulting errors hidden by bare `except` clauses.
    state = get_current_variables()
    not_found = state['not_found']
    firstpage_urls = state['firstpage_urls']
    newspaper_urls = state['newspaper_urls']
    page_urls = state['page_urls']
    ambiguous = state['ambiguous']
    no_link = state['no_link']

    if date_as_string in no_link.get(newspaper_clean, {}):
        print(no_link[newspaper_clean][date_as_string])
        print('it is true so skip!')
        continue

    if not page:
        print('--> skipping (due to missing page number)')
        continue

    # From here on `page` is the page number as a string, i.e. the form used as
    # key in page_urls. (The old "already done" lookup used the match object
    # itself as key and therefore never hit.)
    page = page.groups()[0]

    if page in page_urls.get(newspaper_clean, {}).get(date_as_string, {}):
        print('--> skipping (due to already done)')
        continue

    if not SCRAPE_YEAR_RANGE[0] <= int(date[0]) <= SCRAPE_YEAR_RANGE[1]:
        print('--> skipping (due to date out of range)')
        continue

    skip_ahead = False

    if newspaper_clean in not_found or newspaper_clean in ambiguous:
        continue

    print()
    print("PROCESSING", newspaper_clean, f' {date_as_string} ({counter}/{count})')

    # --- Step 1: find the paper's browse URL through the site search ----------
    if newspaper_clean not in newspaper_urls or newspaper_urls[newspaper_clean] == '':
        newspaper_urls[newspaper_clean] = ''
        print('searching for newspaper', newspaper_clean)
        b.get('https://www.newspapers.com/papers/')
        b.find_element_by_css_selector('[name="filtertext"]').send_keys(newspaper_clean)
        b.find_element_by_css_selector('[name="filtertext"]').send_keys(Keys.ENTER)
        time.sleep(3)

        # NOTE(review): find_element raises NoSuchElementException when the
        # element is absent, so this presumes #noresults is always in the page.
        if b.find_element_by_css_selector("#noresults"):
            if 'nothing matched your search' in b.find_element_by_css_selector("#noresults").text:
                print('could not find newspaper', newspaper_clean, '-- searched for', newspaper_clean)
                not_found.append(newspaper_clean)
                set_current_variables(not_found, firstpage_urls, newspaper_urls, page_urls, ambiguous, no_link)
                skip_ahead = True

        results = b.find_elements_by_css_selector('.record-result')
        if results:
            if len(results) == 1:
                candidates = results
            else:
                # several hits: accept only one whose heading equals the name exactly
                candidates = [
                    result for result in results
                    if result.find_element_by_tag_name('h2').text == newspaper_clean
                ]

            if len(candidates) == 1:
                candidates[0].find_elements_by_tag_name('a')[0].click()
                time.sleep(3)
                newspaper_url = b.current_url
                newspaper_urls[newspaper_clean] = newspaper_url
            else:
                ambiguous.append(newspaper_clean)
                set_current_variables(not_found, firstpage_urls, newspaper_urls, page_urls, ambiguous, no_link)
                print('--> skipping (due to ambivalent newspaper name)')
                skip_ahead = True

    if skip_ahead:
        continue

    set_current_variables(not_found, firstpage_urls, newspaper_urls, page_urls, ambiguous, no_link)

    # --- Step 2: pick the date in the paper's calendar ------------------------
    if newspaper_clean not in firstpage_urls:
        firstpage_urls[newspaper_clean] = {}

    if date_as_string not in firstpage_urls[newspaper_clean] or '/image/' not in firstpage_urls[newspaper_clean][date_as_string]:
        print(f'have browse page for {newspaper_clean} -- looking for first page for date')
        b.get(newspaper_urls[newspaper_clean])
        time.sleep(3)

        # choose year
        b.find_element_by_id('datepicker_year_combobox').send_keys(date[0])
        time.sleep(3)

        # choose month
        select = Select(b.find_element_by_id('select_month'))
        select.select_by_visible_text(date[1])
        time.sleep(3)

        # choose day (raises IndexError when no calendar cell shows this day)
        day_element = [x for x in b.find_elements_by_class_name('calendar_option') if x.text == date[2]][0]
        try:
            # a day without an <a> inside has nothing to open
            day_element.find_element_by_tag_name('a')
            day_element.click()
            time.sleep(3)

            if date_as_string not in firstpage_urls[newspaper_clean]:
                firstpage_urls[newspaper_clean][date_as_string] = b.current_url
        except NoSuchElementException:
            no_link.setdefault(newspaper_clean, {}).setdefault(date_as_string, True)
            # Persist the flag right away: it used to live in memory only and
            # was discarded the next time the state was reloaded from disk.
            set_current_variables(not_found, firstpage_urls, newspaper_urls, page_urls, ambiguous, no_link)
            print('--> skipping (due to missing date)')
            continue

    set_current_variables(not_found, firstpage_urls, newspaper_urls, page_urls, ambiguous, no_link)

    # --- Step 3: move to the requested page and record its URL ----------------
    page_urls.setdefault(newspaper_clean, {}).setdefault(date_as_string, {})

    if firstpage_urls[newspaper_clean][date_as_string].startswith('https://www.newspapers.com/paper'):
        # The stored URL is still a /paper... address rather than a page image:
        # forget it so that a later run picks the date again.
        del firstpage_urls[newspaper_clean][date_as_string]
        set_current_variables(not_found, firstpage_urls, newspaper_urls, page_urls, ambiguous, no_link)
        continue

    print(f'finding page {page} for date {date_as_string} in {newspaper_clean} -- browsing from first page')

    b.get(firstpage_urls[newspaper_clean][date_as_string])
    time.sleep(2)

    print(f'setting key to {int(page)-1}')
    b.find_element_by_id('filmstrip_pagenum_target_input').send_keys(int(page)-1)
    b.find_element_by_id('filmstrip_pagenum_target_input').send_keys(Keys.ENTER)

    # step forward until the page-number box shows the requested page
    current = b.find_element_by_id('filmstrip_pagenum_target_input').get_attribute('value')
    while not int(current) == int(page):
        b.find_element_by_css_selector('a[title="Next page"]').click()
        time.sleep(1)
        current = int(b.find_element_by_id('filmstrip_pagenum_target_input').get_attribute('value'))

    page_url = b.current_url
    page_urls[newspaper_clean][date_as_string][page] = page_url

    set_current_variables(not_found, firstpage_urls, newspaper_urls, page_urls, ambiguous, no_link)

    print("DONE", newspaper_clean, '-->', page_urls[newspaper_clean][date_as_string][page])
    print()
    


PROCESSING Aiken Standard  1934-January-24 (7/3796)
DONE Aiken Standard --> https://www.newspapers.com/image/14191747


PROCESSING Akron Beacon Journal  1937-April-13 (8/3796)
DONE Akron Beacon Journal --> https://www.newspapers.com/image/228680699


PROCESSING Akron Beacon Journal  1930-August-1 (9/3796)
DONE Akron Beacon Journal --> https://www.newspapers.com/image/228836625


PROCESSING Akron Beacon Journal  1937-August-31 (10/3796)
DONE Akron Beacon Journal --> https://www.newspapers.com/image/228732511


PROCESSING Akron Beacon Journal  1935-December-11 (11/3796)
DONE Akron Beacon Journal --> https://www.newspapers.com/image/228473631


PROCESSING Akron Beacon Journal  1935-December-12 (12/3796)
DONE Akron Beacon Journal --> https://www.newspapers.com/image/228474387


PROCESSING Akron Beacon Journal  1935-December-14 (13/3796)
DONE Akron Beacon Journal --> https://www.newspapers.com/image/228475598


PROCESSING Akron Beacon Journal  1935-December-16 (14/3796)
DONE Akron Beacon J


PROCESSING Alton Illinois Evening Telegraph  1938-February-10 (192/3796)
DONE Alton Illinois Evening Telegraph --> https://www.newspapers.com/image/26078630


PROCESSING Alton Illinois Evening Telegraph  1938-February-26 (193/3796)
DONE Alton Illinois Evening Telegraph --> https://www.newspapers.com/image/26082470


PROCESSING Alton Illinois Evening Telegraph  1938-February-9 (194/3796)
DONE Alton Illinois Evening Telegraph --> https://www.newspapers.com/image/26078309


PROCESSING Alton Illinois Evening Telegraph  1934-June-19 (195/3796)
DONE Alton Illinois Evening Telegraph --> https://www.newspapers.com/image/17499950/


PROCESSING Alton Illinois Evening Telegraph  1937-May-20 (196/3796)
DONE Alton Illinois Evening Telegraph --> https://www.newspapers.com/image/16326696


PROCESSING Alton Illinois Evening Telegraph  1937-May-21 (197/3796)
DONE Alton Illinois Evening Telegraph --> https://www.newspapers.com/image/16326714


PROCESSING Alton Illinois Evening Telegraph  1937-October-2

--> skipping (due to date out of range)
--> skipping (due to date out of range)
--> skipping (due to date out of range)
--> skipping (due to date out of range)
--> skipping (due to date out of range)
--> skipping (due to date out of range)
--> skipping (due to date out of range)
--> skipping (due to date out of range)
--> skipping (due to date out of range)
--> skipping (due to date out of range)
--> skipping (due to date out of range)
--> skipping (due to missing page number)
--> skipping (due to date out of range)
--> skipping (due to date out of range)
--> skipping (due to date out of range)
--> skipping (due to date out of range)

PROCESSING Brooklyn Daily Eagle  1931-April-20 (833/3796)
DONE Brooklyn Daily Eagle --> https://www.newspapers.com/image/57366056


PROCESSING Brooklyn Daily Eagle  1935-August-12 (834/3796)
DONE Brooklyn Daily Eagle --> https://www.newspapers.com/image/693805088

--> skipping (due to missing page number)

PROCESSING Brooklyn Daily Eagle  1930-December-1 

--> skipping (due to ambivalent newspaper name)

PROCESSING Buffalo Evening News  1935-December-18 (989/3796)
searching for newspaper Buffalo Evening News
--> skipping (due to ambivalent newspaper name)
--> skipping (due to date out of range)
--> skipping (due to date out of range)
--> skipping (due to date out of range)
--> skipping (due to date out of range)

PROCESSING Cache American  1935-April-2 (996/3796)
searching for newspaper Cache American
--> skipping (due to ambivalent newspaper name)

PROCESSING Camden Courier-Post  1938-August-4 (997/3796)
have browse page for Camden Courier-Post -- looking for first page for date
finding page 11 for date 1938-August-4 in Camden Courier-Post -- browsing from first page
setting key to 10
DONE Camden Courier-Post --> https://www.newspapers.com/image/480354466


PROCESSING Camden Courier-Post  1940-December-13 (998/3796)
have browse page for Camden Courier-Post -- looking for first page for date
finding page 26 for date 1940-December-13 in C

--> skipping (due to ambivalent newspaper name)

PROCESSING Coast Advertiser  1934-July-27 (1243/3796)
searching for newspaper Coast Advertiser
--> skipping (due to ambivalent newspaper name)

PROCESSING Cohoes American  1934-December-10 (1244/3796)
searching for newspaper Cohoes American
--> skipping (due to ambivalent newspaper name)
--> skipping (due to missing page number)
--> skipping (due to missing page number)
--> skipping (due to missing page number)
--> skipping (due to missing page number)

PROCESSING Columbus Telegram  1930-February-17 (1252/3796)
searching for newspaper Columbus Telegram
--> skipping (due to ambivalent newspaper name)

PROCESSING Corpus Christi Caller-Times  1936-August-28 (1253/3796)
searching for newspaper Corpus Christi Caller-Times
--> skipping (due to ambivalent newspaper name)
--> skipping (due to missing page number)

PROCESSING Cullman Tribune  1933-December-28 (1256/3796)
searching for newspaper Cullman Tribune
--> skipping (due to ambivalent news

In [321]:
# Display the page URLs currently stored in the checkpoint file.
get_current_variables()['page_urls']

{'Camden NJ Courier-Post': {'1943-July-19': {'8': 'https://www.newspapers.com/image/480816100'}},
 'Camden Courier-Post': {'1943-July-19': {'8': 'https://www.newspapers.com/image/480816100'}},
 'Minneapolis Star Tribune': {'1949-August-12': {'4': 'https://www.newspapers.com/image/180751297'}},
 'Hazleton Plain Speaker': {'1935-November-23': {'14': 'https://www.newspapers.com/image/98116292'}},
 'Indianapolis Star': {'1930-April-9': {'9': 'https://www.newspapers.com/image/104983031'}}}