In [1]:
# First make YAML from dataset

# Then maybe use jinja or something to render as HTML



In [2]:
import pandas as pd
from datetime import datetime as dt
import re

# Years of interest as (first_year, last_year). remove() keeps a row only when
# first_year <= row year <= last_year, i.e. both ends are treated as inclusive.
YEAR_RANGE = (1900, 1950)

In [3]:
def remove(row):
    """Decide whether a dataset row should be dropped before rendering.

    A row is dropped (``True``) when any of the following holds:

    * its ``Date`` cannot be parsed as ``YYYY-MM-DD``;
    * ``YEAR_RANGE`` is set and the year falls outside it (bounds inclusive);
    * ``Exclude from visualization`` or ``Unsure whether drag artist`` is truthy;
    * ``City``, ``Performer`` and ``Venue`` are all empty strings;
    * the performer name contains ``unnamed`` (case-insensitive).

    Args:
        row: A mapping-like row (e.g. a pandas Series) exposing ``Date`` as an
            attribute and the other columns by key.

    Returns:
        bool: ``True`` to drop the row, ``False`` to keep it.
    """
    try:
        date = dt.strptime(row.Date, '%Y-%m-%d')
    except (ValueError, TypeError):
        # Malformed or non-string dates cannot be placed in a year, so drop them.
        return True

    # Only filter on year when a range is configured; a falsy YEAR_RANGE
    # disables the year filter instead of dropping every row.
    if YEAR_RANGE and not (YEAR_RANGE[0] <= date.year <= YEAR_RANGE[1]):
        return True

    if row['Exclude from visualization'] or row['Unsure whether drag artist']:
        return True

    performer = row['Performer']

    # A row naming no city, no performer and no venue carries nothing to show.
    if row['City'] == '' and performer == '' and row['Venue'] == '':
        return True

    return 'unnamed' in performer.lower()

def extract_addresses_dict(normalized_df):
    """Build a ``{venue: {source: address}}`` lookup from rows that have an address.

    Args:
        normalized_df: DataFrame with ``Source``, ``Venue`` and ``Address``
            columns in which a missing value is the empty string.

    Returns:
        dict: venue name -> dict of source -> address. When the same
        venue/source pair occurs more than once, the first address seen wins.

    Side effects:
        Prints a warning listing, one per line, every address whose row has
        no venue name; those addresses are not included in the result.
    """
    addresses = {}
    rows_with_addresses = normalized_df[normalized_df['Address'] != '']
    orphaned = []
    for source, venue, address in zip(
        rows_with_addresses['Source'],
        rows_with_addresses['Venue'],
        rows_with_addresses['Address'],
    ):
        if venue == '':
            orphaned.append(address)
            continue
        # setdefault keeps the first address recorded for a venue/source pair.
        addresses.setdefault(venue, {}).setdefault(source, address)

    if orphaned:
        print(f'Warning: {len(orphaned)} Venues with no names have addresses:')
        # One bullet per line; str() guards against numeric cells.
        print('- ' + '\n- '.join(str(address) for address in orphaned))

    return addresses

def get_comments(df, comment_field='Comment on edge: revue', match_field='Revue', transform=None):
    """Collect the non-empty values of ``comment_field`` keyed by ``match_field`` and source.

    Args:
        df: DataFrame with ``Date`` and ``Source`` columns plus the two columns
            named by ``comment_field`` and ``match_field``; empty cells are ``''``.
        comment_field: Column holding the value to collect.
        match_field: Column whose value becomes the outer dictionary key.
        transform: Optional callable applied to the stripped string value.

    Returns:
        dict: ``{match value: {source: value}}``; the first value seen for a
        match/source pair is kept.

    Side effects:
        Prints a warning listing (truncated to 40 characters) every value whose
        row has an empty ``match_field``.
    """
    annotated = df[df[comment_field] != '']
    collected = {}
    orphans = []
    columns = (
        annotated['Date'],
        annotated['Source'],
        annotated[match_field],
        annotated[comment_field],
    )
    for _date, source, key, raw in zip(*columns):
        text = str(raw).strip()
        value = transform(text) if transform else text
        if key == '':
            # No key to file the value under: report it instead of storing it.
            orphans.append(str(value)[:40] + '...')
            continue
        collected.setdefault(key, {}).setdefault(source, value)

    if orphans:
        print(f'Warning: {len(orphans)} mentions in `{comment_field}` with no value have comments:')
        print('- ' + '\n- '.join(orphans))

    return collected

def get_revue_comments_dict(df):
    """Return ``get_comments`` output for the 'Comment on edge: revue' column, keyed by 'Revue'."""
    return get_comments(
        df,
        comment_field='Comment on edge: revue',
        match_field='Revue',
    )

def get_performer_comments_dict(df):
    """Return ``get_comments`` output for the 'Comment on node: performer' column, keyed by 'Performer'."""
    return get_comments(
        df,
        comment_field='Comment on node: performer',
        match_field='Performer',
    )

def get_venue_comments_dict(df):
    """Return ``get_comments`` output for the 'Comment on node: venue' column, keyed by 'Venue'."""
    return get_comments(
        df,
        comment_field='Comment on node: venue',
        match_field='Venue',
    )

def get_city_comments_dict(df):
    """Return ``get_comments`` output for the 'Comment on node: city' column, keyed by 'City'."""
    return get_comments(
        df,
        comment_field='Comment on node: city',
        match_field='City',
    )

def get_true_value(row, type):
    """Return the preferred value of a field, falling back to the raw column.

    For every supported ``type`` the curated column is used when it is not the
    empty string; otherwise the raw column is returned. For ``'performer'``
    there is an intermediate fallback: when both ``Performer first-name`` and
    ``Performer last-name`` are filled in, they are joined with a space.

    Args:
        row: Mapping-like row (dict or pandas Series) indexed by column name.
        type: One of ``'source'``, ``'performer'``, ``'city'``, ``'revue'``,
            ``'venue'``.

    Returns:
        The chosen cell value.

    Raises:
        NotImplementedError: If ``type`` is not one of the supported names.
    """
    # type -> (curated column, raw fallback column)
    columns = {
        'source': ('Source clean', 'Source'),
        'performer': ('Normalized performer', 'Performer'),
        'city': ('Normalized City', 'City'),
        'revue': ('Normalized Revue Name', 'Revue name'),
        'venue': ('Normalized Venue', 'Venue'),
    }
    if type not in columns:
        raise NotImplementedError(f'type `{type}` is not yet implemented')

    curated, raw = columns[type]
    if row[curated] != '':
        return row[curated]

    if type == 'performer':
        first = row['Performer first-name']
        last = row['Performer last-name']
        if first != '' and last != '':
            # The curated name is known to be empty here, so build the name
            # from its parts rather than returning that empty string.
            return f'{first} {last}'

    return row[raw]

def find_ref(row, eima=True):
    """Extract 7-10 digit reference numbers from a row's source columns.

    The ``Source``, ``EIMA``, ``Search (newspapers.com)`` and ``Source clean``
    cells are concatenated. The row counts as EIMA-type when that text
    contains ``eima``, ``variety`` or ``billboard`` (case-insensitive).

    Args:
        row: Mapping-like row (dict or pandas Series) indexed by column name.
        eima: When truthy, return references only for EIMA-type rows; when
            falsy, return references only for rows that are not EIMA-type.

    Returns:
        str: The distinct references joined with ``|`` in order of first
        appearance, or ``''`` when there are none or the row type does not
        match ``eima``.
    """
    fields = ('Source', 'EIMA', 'Search (newspapers.com)', 'Source clean')
    # str() so numeric cells (e.g. a bare document id) do not break the join.
    source = ' '.join(str(row[field]) for field in fields)

    lowered = source.lower()
    is_eima = any(marker in lowered for marker in ('eima', 'variety', 'billboard'))

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # joined string is identical from run to run (a set would not be).
    refs = list(dict.fromkeys(re.findall(r'\d{7,10}', source)))

    if refs and bool(eima) == is_eima:
        return '|'.join(refs)

    return ''
    

# Load the published Google Sheets CSV export (this needs network access).
df = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vT0E0Y7txIa2pfBuusA1cd8X5OVhQ_D0qZC8D40KhTU3xB7McsPR2kuB7GH6ncmNT3nfjEYGbscOPp0/pub?gid=0&single=true&output=csv')
# Missing cells become '' because the helper functions compare against ''.
df = df.fillna('')
# Cells whose whole value is the placeholder '—' or '—*' are blanked as well.
df = df.replace('—', '')
df = df.replace('—*', '')

# First, set up our references to EIMA and newspapers.com
# NOTE(review): find_ref() reads the 'EIMA' column, and the first line below
# overwrites that column before the second pass runs. The second pass therefore
# sees the extracted references rather than the original cell — confirm that
# this ordering is intended before reordering these two lines.
df['EIMA'] = df.apply(lambda row: find_ref(row), axis=1)
df['Newspapers.com'] = df.apply(lambda row: find_ref(row, False), axis=1)

# Normalize dataframe
# Each column is replaced by its curated value where one exists (see
# get_true_value); 'Revue' is a new column derived from the revue-name columns.
df['Source'] = df.apply(lambda row: get_true_value(row, 'source'), axis=1)
df['Venue'] = df.apply(lambda row: get_true_value(row, 'venue'), axis=1)
df['Performer'] = df.apply(lambda row: get_true_value(row, 'performer'), axis=1)
df['City'] = df.apply(lambda row: get_true_value(row, 'city'), axis=1)
df['Revue'] = df.apply(lambda row: get_true_value(row, 'revue'), axis=1)

# Extract "node" information
# These lookups are built before the filtering below, so they also contain
# entries that come from rows which are dropped afterwards.
addresses = extract_addresses_dict(df)
revue_comments = get_revue_comments_dict(df)
performer_comments = get_performer_comments_dict(df)
venue_comments = get_venue_comments_dict(df)
city_comments = get_city_comments_dict(df)
edge_comments = get_comments(df, 'Edge Comment', 'Source')
legal_names = get_comments(df, 'Legal name', 'Performer')
# int(float(x)) accepts values such as '23.0'; it raises ValueError on
# non-numeric text.
ages = get_comments(df, 'Alleged age', 'Performer', lambda x: int(float(x)))
birth_years = get_comments(df, 'Assumed birth year', 'Performer', lambda x: int(float(x)))
eima_links = get_comments(df, 'EIMA', 'Source')
newspaper_links = get_comments(df, 'Newspapers.com', 'Source')

# Edge booleans
# get_comments() passes the stripped cell text to the transform, so bool(x) is
# True for any cell that is not blank, whatever the text says.
blackface_performers = get_comments(df, 'Blackface', 'Performer', lambda x: bool(x))
sepia_performers = get_comments(df, 'Sepia', 'Performer', lambda x: bool(x))
fan_dance_performers = get_comments(df, 'Fan dancer/Sally Rand', 'Performer', lambda x: bool(x))
exotic_dancers = get_comments(df, 'Exotic/erotic/oriental dancer/Gypsy', 'Performer', lambda x: bool(x))
has_image = get_comments(df, 'Has image', 'Performer', lambda x: bool(x))

# Drop filtered data
# remove() returns True for rows to discard; those rows are dropped by index.
df['remove'] = df.apply(lambda row: remove(row), axis=1)
df = df.drop(df[df['remove']==True].index)

# Drop columns
# Helper and source columns whose content was consumed above, plus the
# temporary 'remove' flag.
df = df.drop(['Category', 'EIMA', 'Newspapers.com', 'Search (newspapers.com)', 'Blackface', 'Sepia', 'Fan dancer/Sally Rand', 'Exotic/erotic/oriental dancer/Gypsy', 'Has image', 'Legal name', 'Alleged age', 'Assumed birth year', 'Search (fulton)', 'Imported from former archive', 'Edge Comment', 'Comment on node: performer', 'Comment on node: venue', 'Comment on node: city', 'Comment on edge: revue', 'Exclude from visualization', 'Address', 'Unsure whether drag artist', 'Source clean', 'Normalized performer', 'Performer first-name', 'Performer last-name', 'Normalized Venue', 'Normalized City', 'Normalized Revue Name', 'Revue name', 'remove'], axis=1)

- 925 East Main Street
- "Gala Floor Show"...
- "Opening Wednesday Nite, July 3rd", "Pre...
- "Opening Wednesday Nite, July 3rd", "Pre...
- "Opening Wednesday Nite, July 3rd", "Pre...
- "Opening Wednesday Nite, July 3rd", "Pre...
- Direct to The Blue Room from Club Richma...
- Direct to The Blue Room from Club Richma...
- Direct to The Blue Room from Club Richma...
- Direct to The Blue Room from Club Richma...
- touring circus...
- touring circus...
- touring circus...
- "Where Boys Will Be Girls" "8 Big Acts"...
- "3 Complete Floor Shows Nightly"...
- "3 Complete Floor Shows Nightly"...
- "3 Complete Floor Shows Nightly"...
- "3 Complete Floor Shows Nightly"...
- "3 Complete Floor Shows Nightly"...
- "3 Complete Floor Shows Nightly"...
- "3 Complete Floor Shows Nightly"...
- "Sensational Floor Show"...
- "Sensational Floor Show"...
- "Big Floor Show", "Positively the Funnie...
- "Big Floor Show", "Positively the Funnie...
- "Big Floor Show", "Positively the Funnie...
- "Big Floor Show

In [11]:
# Time to render some files

from jinja2 import Environment, FileSystemLoader
from slugify import slugify
from pathlib import Path
import os

In [12]:
def keyshift(dictionary, key, diff):
    """Return the entry ``diff`` positions away from ``key`` in sorted key order.

    Args:
        dictionary: Mapping whose keys can be sorted.
        key: Key to start from.
        diff: Signed offset; ``+1`` is the next key, ``-1`` the previous one.

    Returns:
        A one-item dict ``{neighbour_key: dictionary[neighbour_key]}``, or
        ``None`` when ``key`` is absent or the offset runs past either end.
    """
    if key not in dictionary:
        return None

    ordered = sorted(dictionary)
    target = ordered.index(key) + diff
    if 0 <= target < len(ordered):
        neighbour = ordered[target]
        return {neighbour: dictionary[neighbour]}

    return None


def slugify_column(df, column='Performer'):
    """Map every distinct non-empty value of ``column`` to a unique slug.

    Values are processed in sorted order. When a slug is already taken, a
    warning is printed and ``-1``, ``-2``, ... is appended until a free slug is
    found. For every column other than ``'Performer'``, values starting with
    ``'—'`` are skipped.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the column to slugify.

    Returns:
        dict: ``{original value: slug}``.
    """
    if column == 'Performer':
        # we have to include the ones that start with — here
        candidates = {x for x in df[column] if x}
    else:
        candidates = {x for x in df[column] if x and not x.startswith('—')}

    slug_to_value = {}
    for value in sorted(candidates):
        base = slugify(value)
        slug = base
        attempt = 0
        while slug in slug_to_value:
            attempt += 1
            print('Warning: Multiple values with the same value. This should not happen:', value)
            slug = f'{base}-{attempt}'
        slug_to_value[slug] = value

    # Invert so that callers look slugs up by the original value.
    return {original: slug for slug, original in slug_to_value.items()}
    

In [29]:
def get_venue_slug(venue):
    """Look up the slug stored for ``venue`` in the module-level ``ALL_VENUES`` mapping.

    Raises ``KeyError`` when ``venue`` has no entry.
    """
    slug = ALL_VENUES[venue]
    return slug

def get_performer_slug(venue):
    """Look up the slug stored in the module-level ``ALL_PERFORMERS`` mapping.

    The parameter is named ``venue`` but is used as the key into
    ``ALL_PERFORMERS``. Raises ``KeyError`` when the key has no entry.
    """
    slug = ALL_PERFORMERS[venue]
    return slug


# NOTE(review): these are absolute, machine-specific paths; the notebook only
# renders on a machine with this directory layout.
TEMPLATE_DIR = '/Users/kallewesterling/Repositories/kallewesterling/dissertation/drag-data-browser/templates/'
OUTPUT_DIR = '/Users/kallewesterling/Repositories/kallewesterling/dissertation/drag-data-browser/docs/'


# Jinja environment that loads the page templates from TEMPLATE_DIR.
e = Environment(loader=FileSystemLoader(TEMPLATE_DIR))

# remove() keeps rows up to and including YEAR_RANGE[1], so the list of years
# has to include that final year as well (range() excludes its stop value).
ALL_YEARS = list(range(YEAR_RANGE[0], YEAR_RANGE[1] + 1))
# value -> slug lookups for every performer, venue and city left in df.
ALL_PERFORMERS = slugify_column(df, 'Performer')
ALL_VENUES = slugify_column(df, 'Unique venue')
ALL_CITIES = slugify_column(df, 'City')

# Expose the slug helpers and the lookups to every template.
e.globals['get_venue_slug'] = get_venue_slug
e.globals['get_performer_slug'] = get_performer_slug
e.globals['ALL_YEARS'] = ALL_YEARS
e.globals['ALL_PERFORMERS'] = ALL_PERFORMERS
e.globals['ALL_VENUES'] = ALL_VENUES
e.globals['ALL_CITIES'] = ALL_CITIES

In [31]:
artist_template = e.get_template('artist.html')


# performer name -> sorted list of the years in which the performer has rows
performers_active_dates_overview = {}

# Group by the column name rather than a one-element list: with a list,
# pandas >= 2.0 yields 1-tuples as group keys, which breaks the truthiness
# check and every dictionary lookup below.
# `row` is the sub-DataFrame holding all rows of one performer.
for performer, row in df.groupby('Performer'):
    if not performer:
        continue

    html_file = os.path.join(OUTPUT_DIR, 'performer', ALL_PERFORMERS[performer], 'index.html')
    # exist_ok avoids the separate check-then-create step.
    Path(html_file).parent.mkdir(parents=True, exist_ok=True)

    # Sorted so that the rendered page is identical from run to run; iterating
    # a set of strings has no stable order between interpreter runs.
    full_venues = sorted(set([x for x in row['Unique venue'] if x and not x.startswith('—')]))
    full_venues = {x: ALL_VENUES[x] for x in full_venues}

    cities = sorted(set([x for x in row['City'] if x]))
    years_active = sorted(set([x.year for x in pd.to_datetime(row['Date'])]))
    # NOTE(review): appears_at is computed but not passed to the template.
    appears_at = sorted(set([x for x in row['Venue'] if x]))

    performers_active_dates_overview[performer] = years_active

    text = artist_template.render(data={
        'name': performer,
        'years_active': years_active,
        'full_venues': full_venues,
        'cities': cities,
        'in_blackface': blackface_performers.get(performer, {}),
        'sepia_performer': sepia_performers.get(performer, {}),
        'fan_dancer': fan_dance_performers.get(performer, {}),
        'exotic_dancer': exotic_dancers.get(performer, {}),
        'images': has_image.get(performer, {}),
        'comments': performer_comments.get(performer, {}),
        'legal_name': legal_names.get(performer, {}),
        'age': ages.get(performer, {}),
        'birth_year': birth_years.get(performer, {}),
        'relative': {
            'next': keyshift(ALL_PERFORMERS, performer, +1),
            'prev': keyshift(ALL_PERFORMERS, performer, -1)
        }
    })

    # Explicit UTF-8 so non-ASCII names are written the same on every platform.
    with open(html_file, 'w', encoding='utf-8') as f:
        f.write(text)
        
        
        
########################


venue_template = e.get_template('venue.html')


# venue name -> sorted list of the years in which the venue has rows
venues_active_dates_overview = {}

# Group by the column name rather than a one-element list: with a list,
# pandas >= 2.0 yields 1-tuples as group keys, which breaks
# `venue.startswith` and every dictionary lookup below.
# `row` is the sub-DataFrame holding all rows of one venue.
for venue, row in df.groupby('Unique venue'):
    if not venue or venue.startswith('—'):
        continue

    html_file = os.path.join(OUTPUT_DIR, 'venue', ALL_VENUES[venue], 'index.html')
    # exist_ok avoids the separate check-then-create step.
    Path(html_file).parent.mkdir(parents=True, exist_ok=True)

    # Sorted so that the rendered page is identical from run to run; iterating
    # a set of strings has no stable order between interpreter runs.
    associated_performers = sorted(set([x for x in row['Performer'] if x]))
    associated_performers = {x: ALL_PERFORMERS[x] for x in associated_performers}
    years_active = sorted(set([x.year for x in pd.to_datetime(row['Date'])]))

    venues_active_dates_overview[venue] = years_active

    text = venue_template.render(data={
        'name': venue,
        'years_active': years_active,
        'associated_performers': associated_performers,
        'addresses': addresses.get(venue, {}),
        'comments': venue_comments.get(venue, {}),
        'relative': {
            'next': keyshift(ALL_VENUES, venue, +1),
            'prev': keyshift(ALL_VENUES, venue, -1)
        }
    })

    # Explicit UTF-8 so non-ASCII names are written the same on every platform.
    with open(html_file, 'w', encoding='utf-8') as f:
        f.write(text)
        


# Render the site landing page to OUTPUT_DIR/index.html.
home_template = e.get_template('home.html')

html_file = os.path.join(OUTPUT_DIR, 'index.html')
# exist_ok avoids the separate check-then-create step.
Path(html_file).parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): the template receives placeholder data only.
text = home_template.render(data={
    'xxx': 'xxx'
})

# Explicit UTF-8 so the page is written the same on every platform.
with open(html_file, 'w', encoding='utf-8') as f:
    f.write(text)







import itertools
# Earliest and latest year in which any venue is active.
MIN = min(itertools.chain.from_iterable(venues_active_dates_overview.values()))
MAX = max(itertools.chain.from_iterable(venues_active_dates_overview.values()))

venue_list_template = e.get_template('venue-list.html')

html_file = os.path.join(OUTPUT_DIR, 'venue', 'index.html')
# exist_ok avoids the separate check-then-create step.
Path(html_file).parent.mkdir(parents=True, exist_ok=True)

text = venue_list_template.render(data={
    'venues_active_dates_overview': venues_active_dates_overview,
    # MAX is a year with activity, so it must be part of the range;
    # range() excludes its stop value, hence the + 1.
    'venues_years_range': list(range(MIN, MAX + 1))
})

# Explicit UTF-8 so non-ASCII venue names are written the same on every platform.
with open(html_file, 'w', encoding='utf-8') as f:
    f.write(text)







import itertools
# Earliest and latest year in which any performer is active.
MIN = min(itertools.chain.from_iterable(performers_active_dates_overview.values()))
MAX = max(itertools.chain.from_iterable(performers_active_dates_overview.values()))

performer_list_template = e.get_template('performer-list.html')

html_file = os.path.join(OUTPUT_DIR, 'performer', 'index.html')
# exist_ok avoids the separate check-then-create step.
Path(html_file).parent.mkdir(parents=True, exist_ok=True)

text = performer_list_template.render(data={
    'performers_active_dates_overview': performers_active_dates_overview,
    # MAX is a year with activity, so it must be part of the range;
    # range() excludes its stop value, hence the + 1.
    'performer_years_range': list(range(MIN, MAX + 1))
})

# Explicit UTF-8 so non-ASCII performer names are written the same on every platform.
with open(html_file, 'w', encoding='utf-8') as f:
    f.write(text)
