In [805]:
import datetime
import re
from collections import Counter
from datetime import timedelta

import networkx as nx
import pandas as pd
from IPython.display import Markdown, display

def periodize_dates(dates:list=None, delta:datetime.timedelta=timedelta(days=14), dateformat='%Y-%m-%d'):
    """Group date strings into periods of dates that lie close to each other.

    Adapted from https://gist.github.com/kallewesterling/9a8d12ce073776ed52865bfb362ad073

    The dates are parsed and sorted. A date joins the current period when it is
    at most `delta` later than the date right before it; otherwise it opens a
    new period. A date without close neighbours forms a period of its own.

    Parameters
    ----------
    dates : list of str, optional
        Date strings that follow `dateformat`. `None` or an empty list yields `[]`.
    delta : datetime.timedelta or int
        Largest gap allowed between two neighbouring dates of the same period.
        An int is interpreted as a number of days.
    dateformat : str
        `strptime` format used to parse every entry of `dates`.

    Returns
    -------
    list of list of datetime.datetime
        The periods in chronological order, each holding its dates in
        chronological order.

    Raises
    ------
    ValueError
        If any entry of `dates` cannot be parsed with `dateformat`.
    """
    if dates is None:
        dates = []

    try:
        parsed_dates = sorted(datetime.datetime.strptime(x, dateformat) for x in dates)
    except ValueError as e:
        # Pull the offending value out of strptime's own message so the error names it.
        date = re.search(r'''['"](.*)['"] does not match format''', str(e))
        if date:
            date = date.groups()[0]
        raise ValueError(f'A date found in list that did not adhere to format (`{date}`). Needs to follow format `{dateformat}`.') from None

    if isinstance(delta, int):
        delta = timedelta(days=delta)

    periods = []
    previous_date = None

    for current_date in parsed_dates:
        # The list is sorted, so only the gap to the preceding date decides
        # whether the current period continues or a new one starts.
        if previous_date is not None and current_date - previous_date <= delta:
            periods[-1].append(current_date)
        else:
            periods.append([current_date])
        previous_date = current_date

    return periods


# Data clean up functions
def has_required_data(row):
    '''(internal) for use with DataFrame lambda function to ensure that any given row has the required data present

    A row qualifies when at least one of the performer columns is not an empty
    string and the `Venue` column is not an empty string.

    Returns True when the row qualifies, otherwise False.
    '''
    performer_columns = ('Performer', 'Normalized performer', 'Performer first-name', 'Performer last-name')
    # NOTE(review): a row with only a first or only a last name passes here, while
    # get_performer needs both name parts (or one of the other two columns) to
    # produce a name - confirm that this is intended.
    has_performer = any(row[column] != '' for column in performer_columns)
    # has_city = row['City'] or row['Normalized City']
    has_venue = row['Venue'] != ''
    return bool(has_performer and has_venue)

def has_correct_date(row):
    '''(internal) for use with DataFrame lambda function to ensure that any given row has a correct date present

    Returns True only when the whole `Date` value is a string of the form
    `YYYY-MM-DD`. The entire value has to match because the dates are parsed
    later with `strptime(..., '%Y-%m-%d')`, which rejects any surrounding text.
    '''
    date = row['Date']
    return isinstance(date, str) and re.fullmatch(r'\d{4}-\d{2}-\d{2}', date) is not None


def get_performer(row, null_value=''):
    '''(internal) row-wise helper for DataFrame.apply: pick the best available performer name.

    Priority: first name and last name joined by a space (only when both are
    set), then `Normalized performer`, then `Performer`, and finally `null_value`.
    '''
    first_key, last_key = 'Performer first-name', 'Performer last-name'

    if not (row[first_key] and row[last_key]):
        # No complete first/last pair: fall back to the first non-empty column.
        fallbacks = (row[key] for key in ('Normalized performer', 'Performer'))
        return next((value for value in fallbacks if value), null_value)

    return row[first_key] + ' ' + row[last_key]

    
def get_city(row, null_value=''):
    '''(internal) row-wise helper for DataFrame.apply: pick the best available city name.

    Priority: `Normalized City`, then `City`, and finally `null_value`.
    '''
    return row['Normalized City'] or row['City'] or null_value


def get_unique_venue(row, null_value=''):
    '''(internal) row-wise helper for DataFrame.apply: build a venue label that includes the city.

    Returns `Venue (City)` when both are set, otherwise whichever of the two is
    set (venue first), and `null_value` when neither is.
    '''
    venue = row['Venue']

    if not venue:
        # Without a venue name the city alone (if any) has to do.
        city_only = row['City']
        return city_only if city_only else null_value

    city = row['City']
    if city:
        return venue + ' (' + city + ')'
    return venue


def get_source(row, null_value=''):
    '''(internal) row-wise helper for DataFrame.apply: pick the best available source.

    Priority: `Source clean`, then `Source`, and finally `null_value`.
    '''
    preferred_columns = ('Source clean', 'Source')
    return next((row[column] for column in preferred_columns if row[column]), null_value)


def get_revue(row, null_value=''):
    '''(internal) row-wise helper for DataFrame.apply: pick the best available revue name.

    Priority: `Normalized Revue Name`, then `Revue name`, and finally `null_value`.
    '''
    normalized_name = row['Normalized Revue Name']
    if normalized_name:
        return normalized_name

    raw_name = row['Revue name']
    return raw_name if raw_name else null_value

In [806]:
# Load the published Google Sheet (CSV export) into the working DataFrame.
csv_export_url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vT0E0Y7txIa2pfBuusA1cd8X5OVhQ_D0qZC8D40KhTU3xB7McsPR2kuB7GH6ncmNT3nfjEYGbscOPp0/pub?gid=0&single=true&output=csv'
df = pd.read_csv(csv_export_url)
Markdown(f'**{df.shape[0]} rows imported.**')

**7995 rows imported.**

In [807]:
# Basic normalization: cells holding only a dash become empty strings,
# and so do missing values.
for dash_placeholder in ('—', '–'):
    df.replace(dash_placeholder, '', inplace=True)
df.fillna('', inplace=True)

In [808]:
# Filter: remove the rows that are flagged as excluded from the visualization.
flagged_for_exclusion = df['Exclude from visualization'] == True
df.drop(df.loc[flagged_for_exclusion].index, inplace=True)
Markdown(f'**{df.shape[0]} rows after filtering**: Exclusion from visulization.')

**7971 rows after filtering**: Exclusion from visulization.

In [809]:
# Filter: remove the rows whose 'Unsure whether drag artist' cell is the string 'TRUE'.
flagged_as_unsure = df['Unsure whether drag artist'] == 'TRUE'
df.drop(df.loc[flagged_as_unsure].index, inplace=True)
Markdown(f'**{df.shape[0]} rows after filtering**: Unsure whether drag artist.')

**7494 rows after filtering**: Unsure whether drag artist.

In [810]:
# Flag every row with has_required_data, then drop the rows that fail the check.
df['has_required_data'] = df.apply(has_required_data, axis=1)
rows_missing_data = df.loc[df['has_required_data'] == False].index
df.drop(rows_missing_data, inplace=True)
Markdown(f'**{df.shape[0]} rows after filtering**: Required data.')

**5518 rows after filtering**: Required data.

In [811]:
# Flag every row with has_correct_date, then drop the rows that fail the check.
df['has_correct_date'] = df.apply(has_correct_date, axis=1)
rows_without_date = df.loc[df['has_correct_date'] == False].index
df.drop(rows_without_date, inplace=True)
Markdown(f'**{df.shape[0]} rows after filtering**: Full date in `Date` column.')

**5456 rows after filtering**: Full date in `Date` column.

In [812]:
# Clean up names.
# The order matters: 'City' is cleaned before 'Unique venue' is derived,
# because get_unique_venue reads the 'City' column.
column_cleaners = (
    ('Performer', get_performer),
    ('City', get_city),
    ('Source', get_source),
    ('Revue', get_revue),
    ('Unique venue', get_unique_venue),
)
for column_name, cleaner in column_cleaners:
    df[column_name] = df.apply(cleaner, axis=1)
Markdown(f'_Cleaned up all names_.')

_Cleaned up all names_.

In [813]:
# Extract *node* information - i.e. per-performer attributes that are not relevant for edges.
# Result: node_information maps a performer name to a list of records, one per
# row that carries a legal name, an alleged age or an assumed birth year.
node_information = {}
# Template listing the fields (and their order) of every stored record.
node = {
    'Legal name': None,
    'Alleged age': None,
    'Assumed birth year': None,
    'Source': None
}
for ix, row in df.iterrows():
    if row['Legal name'] or row['Alleged age'] or row['Assumed birth year']:
        # Build a fresh dict for each row. Re-using the single `node` dict would
        # make every stored record the same object, holding only the values of
        # the last row processed.
        d = {cat: row[cat] for cat in node}
        node_information.setdefault(row['Performer'], []).append(d)
Markdown(f'_Extracted node information_.')

_Extracted node information_.

In [814]:
# Drop unnecessary information
cols = ['Revue name', 'Normalized Revue Name', 'Legal name', 'Alleged age', 'Assumed birth year', 'Source clean', 'Category', '2020-12-31 ID', 'Normalized City', 'Performer first-name', 'Performer last-name', 'Normalized performer', 'has_required_data', 'has_correct_date', 'Exclude from visualization', 'Unsure whether drag artist']
# errors='ignore' skips columns that are already gone, so the cell can be re-run.
df.drop(columns=cols, errors='ignore', inplace=True)
# Every entry gets the same '\n - ' prefix so that Markdown renders one bullet per column.
Markdown('_Removed all unneccesary columns_:' + ''.join(f'\n - {col}' for col in cols))

_Removed all unneccesary columns_:
 - Revue name
 -Normalized Revue Name
 -Legal name
 -Alleged age
 -Assumed birth year
 -Source clean
 -Category
 -2020-12-31 ID
 -Normalized City
 -Performer first-name
 -Performer last-name
 -Normalized performer
 -has_required_data
 -has_correct_date
 -Exclude from visualization
 -Unsure whether drag artist

In [815]:
# Build one entry per unique venue:
#   'df'               - the venue's rows, reduced to the columns needed below
#   'count'            - number of rows for the venue
#   'all_dates'        - sorted unique date strings for the venue
#   'periodized_dates' - those dates grouped into periods by periodize_dates (14-day delta)
wanted_columns = ['Date', 'Performer', 'Unique venue', 'City', 'Source']
# Keep the frame's own column order; selecting up front avoids deleting
# columns from a filtered slice of `df`.
kept_columns = [col for col in df.columns if col in wanted_columns]

dfs = {}
# A single groupby pass replaces one full-frame scan per venue; sort=True makes
# the venue order the same on every run.
for venue, venue_rows in df[kept_columns].groupby('Unique venue', sort=True):
    venue_dates = sorted(set(venue_rows['Date']))
    dfs[venue] = {
        'df': venue_rows.copy(),
        'count': len(venue_rows),
        'all_dates': venue_dates,
        'periodized_dates': periodize_dates(venue_dates, timedelta(days=14)),
    }

all_venues = list(dfs)

In [816]:
# df['venue-link'] = ''

In [817]:
# venue_connections[venue][period] = {'dates': [...], 'performers': [...]}
# Periods are numbered from 1 per venue. 'dates' holds the period's unique date
# strings and 'performers' the unique performers listed at the venue on any of
# those dates.
venue_connections = {}
for venue, venue_data in dfs.items():
    venue_rows = venue_data['df']
    venue_connections[venue] = {}
    for period, dates in enumerate(venue_data['periodized_dates'], start=1):
        period_dates = sorted({date.strftime('%Y-%m-%d') for date in dates})
        # One isin() lookup per period instead of one frame scan per date.
        in_period = venue_rows['Date'].isin(period_dates)
        venue_connections[venue][period] = {
            'dates': period_dates,
            # Sorted so that the performer order (and with it the order of the
            # edges built from it) does not change between runs.
            'performers': sorted(set(venue_rows.loc[in_period, 'Performer']), key=str),
        }

In [818]:
# performer_connections[performer] = list of
#   (other performer, weight, venue, first date, last date)
# with one entry for every other performer sharing a venue period; the weight
# is the number of dates in that period.
performer_connections = {}

for venue, periods in venue_connections.items():
    for period_data in periods.values():
        cast = period_data['performers']
        for performer_out in cast:
            weight = len(period_data['dates'])
            parsed_dates = [datetime.datetime.strptime(x, '%Y-%m-%d') for x in period_data['dates']]
            min_date, max_date = min(parsed_dates), max(parsed_dates)
            links = [
                (performer_in, weight, venue, min_date, max_date)
                for performer_in in cast
                if performer_in != performer_out
            ]
            # A performer who appeared alone in the period gets no entry.
            if links:
                performer_connections.setdefault(performer_out, []).extend(links)

In [819]:
# Build a directed multigraph: one edge per (performer, co-performer, venue period).
G = nx.MultiDiGraph()

for source, connections in performer_connections.items():
    for target, weight, venue, min_date, max_date in connections:
        # Pass `weight` as an edge attribute. In a 4-tuple given to
        # add_edges_from the third item is the edge *key*, so edges between the
        # same two performers with an equal weight would overwrite each other
        # and no weight attribute would be stored.
        G.add_edge(
            source,
            target,
            weight=weight,
            venue=venue,
            min_date=min_date.strftime('%Y-%m-%d'),
            max_date=max_date.strftime('%Y-%m-%d'),
        )

In [820]:
# Write the graph to a GEXF file in the current working directory.
nx.write_gexf(G, 'test.gexf')