In [11]:
import pandas as pd
import networkx as nx
import re
from datetime import datetime, timedelta
from collections import OrderedDict

def clean(string):
    string = string.strip()
    string = re.sub(' +', ' ', string)
    if string == '-' or string == '–' or string == '—' or string == '?':
        string = ''
    
    if string == '':
        string = None

    return string

In [2]:
df = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vT0E0Y7txIa2pfBuusA1cd8X5OVhQ_D0qZC8D40KhTU3xB7McsPR2kuB7GH6ncmNT3nfjEYGbscOPp0/pub?gid=0&single=true&output=csv')

In [5]:
df = df.fillna('')
df['Date']

0       1901-02-12
1       1904-04-30
2             1918
3             1920
4             1920
           ...    
7990              
7991              
7992              
7993              
7994              
Name: Date, Length: 7995, dtype: object

In [1]:
# df.columns

In [12]:
venue_data = {}

for ix, row in df.iterrows():
    if row['Exclude from visualization'] or row['Unsure whether drag artist']:
        continue
        
    if clean(row['Performer first-name']) and clean(row['Performer last-name']):
        performer = clean(row['Performer first-name']) + ' ' + clean(row['Performer last-name'])
    else:
        if row['Normalized performer']:
            performer = clean(row['Normalized performer'])
        else:
            performer = clean(row['Performer'])

    venue = clean(row['Venue'])
    
    if row['Normalized City']:
        city = clean(row['Normalized City'])
    else:
        city = clean(row['City'])
        
    date_precision = ''
    
    if row['Date']:
        row['Date'] = row['Date'].replace('?', '')
        try:
            date = datetime.strptime(row['Date'], '%Y-%m-%d')
            date_precision = 'full'
        except:
            try:
                date = datetime.strptime(row['Date'], '%Y-%m')
                date_precision = 'year-month'
            except:
                try:
                    date = datetime.strptime(row['Date'], '%Y')
                    date_precision = 'year'
                except:
                    print(row['Date'])
                    date = None
    else:
        date = None
        
    # We only want performers and venues here
    if not performer or not venue:
        continue
        
    source = row['Source clean']
    if not source:
        source = row['Source']
        
    if not venue in venue_data:
        venue_data[venue] = {'full_dates': OrderedDict(), 'years': OrderedDict(), 'year-months': OrderedDict()}

    if date_precision == 'full' and not date in venue_data[venue]['full_dates']:
        venue_data[venue]['full_dates'][date] = []

    if date_precision == 'year-month' and not date in venue_data[venue]['year-months']:
        venue_data[venue]['year-months'][date] = []

    if date_precision == 'year' and not date in venue_data[venue]['years']:
        venue_data[venue]['years'][date] = []

    if date_precision == 'full':
        venue_data[venue]['full_dates'][date].append({
            'performer': performer,
            'source': source
        })
        
    if date_precision == 'year-month':
        venue_data[venue]['year-months'][date].append({
            'performer': performer,
            'source': source
        })
    
    if date_precision == 'year':
        venue_data[venue]['years'][date].append({
            'performer': performer,
            'source': source
        })

In [13]:
G = nx.DiGraph()

process, i = 50, 0

for venue, d in venue_data.items():
    i+=1
    #if i > process:
    #    continue
        
    print(venue)
    
    for date, datapoints in d['full_dates'].items():
        start_date = date - timedelta(days=7)
        end_date = date + timedelta(days=7)
        dates = [start_date + timedelta(days=x) for x in range((end_date-start_date).days + 1)]
        for data in datapoints:
            print({x:y for x,y in venue_data[venue]['full_dates'].items() if x in dates and y != data})

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



{datetime.datetime(1939, 2, 4, 0, 0): [{'performer': 'Jean La Monte', 'source': 'The Miami News, February 4, 1939, 5'}, {'performer': 'Sonny La Ray', 'source': 'The Miami News, February 4, 1939, 5'}]}
Elk's Rendezvous
{datetime.datetime(1939, 2, 18, 0, 0): [{'performer': 'Phil Black', 'source': 'The Chicago Defender, February 18, 1939, 19'}]}
Pioneer Club
{datetime.datetime(1939, 4, 22, 0, 0): [{'performer': 'Max Lengel', 'source': 'Daily News, April 22, 1939, 22'}]}
{datetime.datetime(1939, 5, 13, 0, 0): [{'performer': 'Max Lengel', 'source': 'Daily News, May 13, 1939, 22'}]}
{datetime.datetime(1939, 6, 3, 0, 0): [{'performer': 'Max Lengel', 'source': 'Daily News, 3 June 1939, 22'}]}
{datetime.datetime(1939, 6, 30, 0, 0): [{'performer': 'Sammy Grand', 'source': 'The Post-Star, June 30, 1939, 22'}], datetime.datetime(1939, 7, 1, 0, 0): [{'performer': 'Sammy Grand', 'source': 'The Post-Star, July 1, 1939, 10'}]}
{datetime.datetime(1939, 6, 30, 0, 0): [{'performer': 'Sammy Grand', 'sourc

In [422]:
large_g = nx.DiGraph()

for venue, d in venue_data.items():
    small_g = None
    for date, datapoints in d['full_dates'].items():
        edge_data = {
            'venue': venue,
            'date': date.strftime('%Y-%m-%d')
        }

        performers = list(set([data['performer'] for data in datapoints]))
        attrs = {}
        if len(performers) > 1:
            small_g = nx.complete_graph(performers)
            for edge in small_g.edges:
                attrs[edge] = edge_data
            nx.set_edge_attributes(small_g, attrs)
    if small_g:
        attrs = {}
        for edge in small_g.edges:
            if large_g.has_edge(edge[0], edge[1]):
                large_g.edges[edge[0], edge[1]]['sources'].append(edge_data)
                large_g.edges[edge[0], edge[1]]['weight'] += 1
            else:
                large_g.add_edge(edge[0], edge[1])
                attrs[edge] = {'sources': [edge_data], 'weight': 1}
        nx.set_edge_attributes(large_g, attrs)

In [423]:
attrs = {}
export_g = nx.DiGraph()
for edge in large_g.edges:
    min_date = None
    max_date = None
    for source in large_g.edges[edge]['sources']:
        date = datetime.strptime(source['date'], '%Y-%m-%d')
        if not min_date:
            min_date = date
        if not max_date:
            max_date = date
        if date > max_date:
            max_date = date
        if date < min_date:
            min_date = date
    attrs[edge] = {
        'min_date': min_date.strftime('%Y-%m-%d'),
        'max_date': max_date.strftime('%Y-%m-%d'),
        'weight': large_g.edges[edge]['weight']
    }
    export_g.add_edge(edge[0], edge[1], **attrs[edge])

In [424]:
nx.write_gexf(export_g, 'relations.gexf')

In [425]:
## Task: create chains of dates through iterations of them

DAYS = 7

diff = timedelta(days=DAYS)

start_date = datetime(1931, 1, 15, 0, 0)
test_list = [
    datetime(1931, 1, 15, 0, 0),
    datetime(1930, 1, 15, 0, 0),
    datetime(1929, 1, 15, 0, 0),
    datetime(1930, 12, 15, 0, 0),
    datetime(1930, 12, 30, 0, 0),
    datetime(1931, 1, 17, 0, 0),
    datetime(1931, 1, 19, 0, 0),
    datetime(1931, 1, 21, 0, 0),
    datetime(1931, 1, 26, 0, 0),
    datetime(1931, 1, 10, 0, 0),
    datetime(1931, 1, 5, 0, 0),
    datetime(1931, 2, 1, 0, 0),
    datetime(1931, 3, 1, 0, 0),
    datetime(1931, 6, 1, 0, 0),
    datetime(1931, 7, 1, 0, 0)
]


def create_chain_of_dates(start_date=datetime.now(), datelist=[datetime.now()], diff=timedelta(days=7)):
    """ Returns a list of dates that are chained (3 iterations down) """
    captured_dates, more_dates = [], True

    range_start = start_date - diff
    range_end = start_date + diff

    # Will search for dates in the range from {range_start} to {range_end}

    while more_dates:
        dates_within_range = [date for date in datelist if date > range_start and date < range_end]
        captured_dates.extend(dates_within_range)

        for date in dates_within_range:
            start_date = date

            range_start = start_date + timedelta(days=-DAYS)
            range_end = start_date + timedelta(days=DAYS)

            # Will search for dates in the range from {range_start} to {range_end}

            dates_within_range = [date for date in datelist if date > range_start and date < range_end]

            for date in dates_within_range:
                start_date = date

                range_start = start_date + timedelta(days=-DAYS)
                range_end = start_date + timedelta(days=DAYS)

                # Will search for dates in the range from {range_start} to {range_end}

                dates_within_range = [date for date in datelist if date > range_start and date < range_end]

                captured_dates.extend(dates_within_range)

            captured_dates.extend(dates_within_range)
        before = len(captured_dates)
        captured_dates = list(set(captured_dates))
        more_dates = (len(captured_dates) - before) > 0
        # print(more_dates)

    return sorted(captured_dates)

create_chain_of_dates(start_date=start_date, datelist=test_list, diff=diff)

[datetime.datetime(1930, 12, 30, 0, 0),
 datetime.datetime(1931, 1, 5, 0, 0),
 datetime.datetime(1931, 1, 10, 0, 0),
 datetime.datetime(1931, 1, 15, 0, 0),
 datetime.datetime(1931, 1, 17, 0, 0),
 datetime.datetime(1931, 1, 19, 0, 0),
 datetime.datetime(1931, 1, 21, 0, 0),
 datetime.datetime(1931, 1, 26, 0, 0),
 datetime.datetime(1931, 2, 1, 0, 0)]

In [426]:
periods = {}
MIN_YEAR = 1930
MAX_YEAR = 1945

for venue in venue_data:
    periods[venue] = {}
    for date in sorted([x for x in venue_data[venue]['full_dates'].keys()]):
        if date.year > MAX_YEAR or date.year < MIN_YEAR:
            # print(f'sorting out {date.year}')
            continue
            
        # print(f'...ok {date.year}')
        if not date.year in periods[venue]:
            periods[venue][date.year] = {}
        if not date.month in periods[venue][date.year]:
            periods[venue][date.year][date.month] = {}

        for datapoint in venue_data[venue]['full_dates'][date]:
            if not datapoint['performer'] in periods[venue][date.year][date.month]:
                periods[venue][date.year][date.month][datapoint['performer']] = {'counter': 0}
            periods[venue][date.year][date.month][datapoint['performer']]['counter'] += 1

In [427]:
# create network

G = nx.DiGraph()

for venue, _ in periods.items():
    for year, _ in _.items():
        for month, _ in _.items():
            for edge in nx.complete_graph([x for x in _.keys()]).edges:
                # edge1
                G.add_edge(edge[0], edge[1], weight=_[edge[0]]['counter'])
                G.add_edge(edge[1], edge[0], weight=_[edge[1]]['counter'])

In [428]:
nx.write_gexf(G, 'relations-in-time.gexf')

In [460]:
comments = {}

for ix, row in df.iterrows():
    if row['Unsure whether drag artist'] or row['Exclude from visualization']:
        continue
        
    if row['Comment on node: performer']:
        if row['Performer first-name'] and row['Performer last-name']:
            performer = row['Performer first-name'] + ' ' + row['Performer last-name']
        elif row['Normalized performer']:
            performer = row['Normalized performer']
        elif row['Performer']:
            performer = row['Performer']
        else:
            print('ERROR')
            
        if not performer in comments:
            comments[performer] = []
        
        comment = row['Comment on node: performer'].strip()
        if not comment in comments[performer]:
            comments[performer].append(comment)

comments

{'Pepper Cortez': ["Opener for women's wrestling match!"],
 'Gene Dana': ["Opener for women's wrestling match!",
  'warbled Shanghai Flo and seemed to be well liked by the customers'],
 'Neil Dornay': ["Opener for women's wrestling match!",
  'credited as "exotic dancer"'],
 'Rhodie Kinsella': ['arrested'],
 '— Rocky': ['dressed as girls, are daintier than any of the back stars.',
  'unclear if performing in drag here'],
 'Billy Rohmer': ["Opener for women's wrestling match!"],
 'Jackie Collier': ['an imitator of Barbette!'],
 'Sepia Mae West —': ['"An important night club note that should have been recorded before now is that Mae West, the creole fashion plate, one of the foremost female impersonators of the present time, is headlining at the 1-0-1 Club in West 139th street. Probably one of the reason [sic] Impresario MacDonald will have to enlarge his emporium to accommodate his overflow patronage."',
  '"returned to the city last week and will be seen nightly . . . Big Ivy, owner of