In [None]:




































Note: This notebook now lives here:
    
    
https://colab.research.google.com/drive/1Id07FmLqLDLwDmThXrX45VCDsDmgj-tO
























































In [1]:
import pandas as pd
import re
from IPython.display import Markdown, display
from collections import Counter
import networkx as nx
from networkx.algorithms import community
from datetime import timedelta
import datetime

# Notebook-wide configuration.
#   DELTA    - maximum gap between two consecutive dates for them to be
#              treated as belonging to the same period (see group_dates).
#   MIN_DATE - exclusive lower bound applied to the `Date` column.
#   MAX_DATE - exclusive upper bound applied to the `Date` column.
SETTINGS = dict(
    DELTA=timedelta(days=14),
    MIN_DATE=datetime.datetime(1920, 1, 1),
    MAX_DATE=datetime.datetime(1950, 1, 1),
)

# Graph construction options.
#   FILTER_NODES - case-insensitive substrings; a performer whose name
#                  contains any of them is left out of the graph.
GRAPH_SETTINGS = dict(
    FILTER_NODES=['unknown', 'unnamed'],
)

    
def group_dates(dates:list=[], delta:datetime.timedelta=SETTINGS['DELTA'], dateformat='%Y-%m-%d'):
    """Group date strings into periods of dates that lie close to each other.

    https://gist.github.com/kallewesterling/9a8d12ce073776ed52865bfb362ad073

    The dates are parsed and sorted; a date joins the current period when it
    is at most `delta` after the previous (sorted) date, otherwise it opens a
    new period. Duplicate dates are kept.

    Parameters
    ----------
    dates : list of str
        Date strings following `dateformat`. The list is never mutated.
    delta : datetime.timedelta or int
        Largest allowed gap between two consecutive dates of one period.
        An int is interpreted as a number of days.
    dateformat : str
        `strptime` format every entry of `dates` must follow.

    Returns
    -------
    list of list of datetime.datetime
        The periods in chronological order, each one sorted ascending.

    Raises
    ------
    ValueError
        If an entry of `dates` cannot be parsed with `dateformat`; the
        message names the offending entry.
    """

    parsed = []
    for raw in dates:
        try:
            parsed.append(datetime.datetime.strptime(raw, dateformat))
        except ValueError:
            # Report the offending value itself: strptime has several error
            # messages (e.g. "unconverted data remains"), so parsing the
            # value back out of the message is unreliable.
            raise ValueError(f'A date found in list that did not adhere to format (`{raw}`). Needs to follow format `{dateformat}`.') from None
    parsed.sort()

    if isinstance(delta, int):
        delta = timedelta(days=delta)

    periods = []

    for date in parsed:
        # Because the dates are sorted, only the gap to the previous date
        # decides whether this one continues the current period.
        if periods and date - periods[-1][-1] <= delta:
            periods[-1].append(date)
        else:
            periods.append([date])

    return periods


# Data clean up functions
def has_required_data(row):
    '''(internal) for use with DataFrame lambda function to ensure that any given row has the required data present

    A row qualifies when at least one of the performer columns is non-empty
    and the `Venue` column is non-empty. City information is not required.
    Returns a bool.
    '''
    performer_columns = (
        'Performer',
        'Normalized performer',
        'Performer first-name',
        'Performer last-name',
    )
    # Each column is compared with '' on its own; the generator keeps the
    # left-to-right, stop-at-first-hit evaluation order.
    has_performer = any(row[column] != '' for column in performer_columns)
    has_venue = row['Venue'] != ''
    return bool(has_performer and has_venue)


def has_correct_date(row):
    '''(internal) row-wise predicate for DataFrame.apply: True when the `Date` value contains a YYYY-MM-DD pattern somewhere in it'''
    found = re.search(r'\d{4}\-\d{2}\-\d{2}', row['Date'])
    return found is not None


def get_performer(row, null_value=''):
    '''(internal) row-wise helper for DataFrame.apply: picks the best available performer name.

    Priority: "first-name last-name" (only when both are present), then
    `Normalized performer`, then `Performer`, and finally `null_value`.
    '''
    first_name = row['Performer first-name']
    if first_name:
        last_name = row['Performer last-name']
        if last_name:
            return first_name + ' ' + last_name

    return row['Normalized performer'] or row['Performer'] or null_value

    
def get_city(row, null_value=''):
    '''(internal) row-wise helper for DataFrame.apply: returns `Normalized City` if set, else `City`, else `null_value`'''
    candidates = (row[column] for column in ('Normalized City', 'City'))
    return next((value for value in candidates if value), null_value)


def get_unique_venue(row, null_value=''):
    '''(internal) row-wise helper for DataFrame.apply: builds a venue label that is unique across cities.

    Returns "Venue (City)" when both are present, otherwise whichever of
    `Venue` / `City` is set (venue first), otherwise `null_value`.
    '''
    venue = row['Venue']
    city = row['City']

    if not (venue and city):
        return venue or city or null_value

    return venue + ' (' + city + ')'


def get_source(row, null_value=''):
    '''(internal) row-wise helper for DataFrame.apply: returns `Source clean` if set, else `Source`, else `null_value`'''
    return row['Source clean'] or row['Source'] or null_value


def get_revue(row, null_value=''):
    '''(internal) row-wise helper for DataFrame.apply: returns `Normalized Revue Name` if set, else `Revue name`, else `null_value`'''
    lookup_order = ('Normalized Revue Name', 'Revue name')
    non_empty = filter(None, (row[column] for column in lookup_order))
    return next(non_empty, null_value)

In [464]:
# Read the published Google Sheet (CSV export) into the working dataframe.
df = pd.read_csv(
    'https://docs.google.com/spreadsheets/d/e/2PACX-1vT0E0Y7txIa2pfBuusA1cd8X5OVhQ_D0qZC8D40KhTU3xB7McsPR2kuB7GH6ncmNT3nfjEYGbscOPp0/pub?gid=0&single=true&output=csv'
)

# Fix basic stuff: cells holding only a dash placeholder (em or en dash)
# and missing cells all become empty strings.
df = df.replace(['—', '–'], '').fillna('')

Markdown(f'**{len(df)} rows imported.**')

**8001 rows imported.**

In [465]:
# Check for required data: flag every row, then drop the ones that lack
# a performer or a venue (see has_required_data).
df['has_required_data'] = df.apply(has_required_data, axis=1)
df.drop(df.index[df['has_required_data'] == False], inplace=True)

# Only a cell's last expression is rendered automatically, so the row count
# has to be displayed explicitly to show up above the preview table.
display(Markdown(f'**{df.shape[0]} rows after filtering**: Required data.'))

df.sort_values('Normalized performer').head()

Unnamed: 0,Date,Category,Performer,Normalized performer,Performer first-name,Performer last-name,Venue,City,Normalized City,Revue name,...,Comment on node: venue,Comment on node: city,Comment on edge: revue,Blackface,Sepia,Fan dancer/Sally Rand,Exotic/erotic/oriental dancer/Gypsy,Has image,Address,has_required_data
1939,1956-07-07,,,Adrian Ames,Adrian,Ames,Forbidden City,"Honolulu, HI?","Honolulu, HI",,...,,,,,,,,,,True
1876,1952-11-07,,,Adrian Ames,Adrian,Ames,Falcon Show Bar,,,,...,,,,,,,,,,True
1875,1952-11-04,,,Adrian Ames,Adrian,Ames,Falcon Show Bar,,,,...,,,,,,,,,,True
1874,1952-10-31,,,Adrian Ames,Adrian,Ames,Falcon Show Bar,,,,...,,,,,,,,,,True
1873,1952-10-30,,,Adrian Ames,Adrian,Ames,Falcon Show Bar,,,,...,,,,,,,,,,True


In [466]:
# Filter: drop rows that are flagged for exclusion from the visualization.
# Depending on how pandas parsed the column, the flag may arrive as the
# boolean True or as the text 'TRUE'; both are treated as "flagged" so the
# filter cannot silently match nothing.
exclude_flag = df['Exclude from visualization']
df.drop(
    df.index[(exclude_flag == True) | (exclude_flag.astype(str).str.strip().str.upper() == 'TRUE')],
    inplace=True,
)
Markdown(f'**{df.shape[0]} rows after filtering**: Exclusion from visulization.')

**5994 rows after filtering**: Exclusion from visulization.

In [432]:
# Filter: drop rows where it is unsure whether the performer was a drag artist.
# Depending on how pandas parsed the column, the flag may arrive as the text
# 'TRUE' or as the boolean True; comparing the upper-cased text form catches
# both, so the filter cannot silently match nothing.
unsure_flag = df['Unsure whether drag artist'].astype(str).str.strip().str.upper()
df.drop(df.index[unsure_flag == 'TRUE'], inplace=True)
Markdown(f'**{df.shape[0]} rows after filtering**: Unsure whether drag artist.')

**7497 rows after filtering**: Unsure whether drag artist.

In [434]:
# Keep only the rows whose `Date` contains a full YYYY-MM-DD date.
df['has_correct_date'] = df.apply(has_correct_date, axis=1)
df.drop(df.index[df['has_correct_date'] == False], inplace=True)
Markdown(f'**{len(df)} rows after filtering**: Full date in `Date` column.')

**5460 rows after filtering**: Full date in `Date` column.

In [435]:
def string_date(row):
    '''(internal) row-wise helper for DataFrame.apply: formats the row's `Date` (a datetime-like value) as a YYYY-MM-DD string'''
    date_value = row['Date']
    return date_value.strftime('%Y-%m-%d')

# Restrict the data to the (exclusive) MIN_DATE..MAX_DATE window.
# Either bound may be left unset in SETTINGS; only the bounds that are
# present are applied, so a single bound no longer leads to a comparison
# against None.
lower_bound = SETTINGS.get('MIN_DATE')
upper_bound = SETTINGS.get('MAX_DATE')

if lower_bound or upper_bound:
    parsed_dates = pd.to_datetime(df['Date'])

    in_range = pd.Series(True, index=df.index)
    if lower_bound:
        in_range &= parsed_dates > lower_bound
    if upper_bound:
        in_range &= parsed_dates < upper_bound

    # .copy() makes the filtered frame independent, so the column assignment
    # below does not write into a slice of the unfiltered frame.
    df = df[in_range].copy()
    # Store the dates back as normalised YYYY-MM-DD strings.
    df['Date'] = parsed_dates[in_range].dt.strftime('%Y-%m-%d')
    display(Markdown(f'**{df.shape[0]} rows after filtering**: Min and max date set.'))

**4881 rows after filtering**: Min and max date set.

In [436]:
# Clean up names: collapse the raw/normalised column pairs into one value each.
# `City` has to be resolved before `Unique venue`, which reads it.
df['Performer'] = df.apply(get_performer, axis=1)
df['City'] = df.apply(get_city, axis=1)
df['Source'] = df.apply(get_source, axis=1)
df['Revue'] = df.apply(get_revue, axis=1)
df['Unique venue'] = df.apply(get_unique_venue, axis=1)
Markdown('**Cleaned up all names**.')

**Cleaned up all names**.

In [437]:
# TODO: redo this part + include comment on node parts
# Drop *node* information - i.e. not relevant for edges

# Maps performer name -> list of dicts, one per row that carries any
# biographical detail (legal name, alleged age or assumed birth year).
node_information = {}

# Template listing the fields (and their order) stored for every entry.
node = {
    'Legal name': None,
    'Alleged age': None,
    'Assumed birth year': None,
    'Source': None
}
for ix, row in df.iterrows():
    if row['Legal name'] or row['Alleged age'] or row['Assumed birth year']:
        # Build a fresh dict per row. Reusing the template object itself
        # would make every stored entry point at the same dict, so all of
        # them would end up showing the values of the last row processed.
        d = {cat: row[cat] for cat in node}
        node_information.setdefault(row['Performer'], []).append(d)

Markdown(f'**Extracted node information**.')

**Extracted node information**.

In [438]:
# Drop unnecessary information: helper flags, raw/normalised source columns
# and node-only details that are not needed for the edge list.
cols = ['EIMA', 'Imported from former archive', 'Search (newspapers.com)', 'Search (fulton)', 'Venue', 'City', 'Revue name', 'Normalized Revue Name', 'Legal name', 'Alleged age', 'Assumed birth year', 'Source clean', 'Category', '2020-12-31 ID', 'Normalized City', 'Performer first-name', 'Performer last-name', 'Normalized performer', 'has_required_data', 'has_correct_date', 'Exclude from visualization', 'Unsure whether drag artist']

# errors='ignore' skips columns that are already gone (e.g. on a re-run).
df = df.drop(columns=cols, errors='ignore')

# The raw `Venue` column was dropped above; the city-qualified label takes its place.
df = df.rename(columns={'Unique venue': 'Venue'})

# Every bullet uses the same '\n - ' prefix so the list renders as a proper markdown list.
Markdown(f'**Fixed columns**: Renamed some columns and removed all unneccesary columns:' + '\n - ' + '\n - '.join(cols))

**Fixed columns**: Renamed some columns and removed all unneccesary columns:
 - EIMA
 -Imported from former archive
 -Search (newspapers.com)
 -Search (fulton)
 -Venue
 -City
 -Revue name
 -Normalized Revue Name
 -Legal name
 -Alleged age
 -Assumed birth year
 -Source clean
 -Category
 -2020-12-31 ID
 -Normalized City
 -Performer first-name
 -Performer last-name
 -Normalized performer
 -has_required_data
 -has_correct_date
 -Exclude from visualization
 -Unsure whether drag artist

In [439]:
# Renumber the rows 0..n-1 after all the filtering; the old index labels are discarded.
df = df.reset_index(
    drop=True,
)
Markdown('**Index has been reset**.')

**Index has been reset**.

In [440]:
# Build one summary row per venue. The rows are collected first and turned
# into a dataframe once; growing a dataframe row by row is quadratic and
# relies on DataFrame.append, which no longer exists in pandas 2.
venue_columns = [
    'Number of datapoints',
    'All dates',
    'Grouped dates',
    'Number of grouped dates',
    'First date in dataset',
    'Last date in dataset',
]
venue_names = []
venue_records = []

# Grouping by the plain column name (not a one-element list) keeps `venue`
# a plain string on every pandas version.
for venue, data in df.groupby('Venue'):
    all_dates = list(data['Date'])
    grouped_dates = group_dates(all_dates)  # computed once, reused below
    parsed_dates = pd.to_datetime(data['Date'])

    venue_names.append(venue)
    venue_records.append({
        'Number of datapoints': len(data),
        'All dates': list(set(all_dates)),
        'Grouped dates': grouped_dates,
        'Number of grouped dates': len(grouped_dates),
        'First date in dataset': parsed_dates.min(),
        'Last date in dataset': parsed_dates.max(),
    })

df_venues = pd.DataFrame(venue_records, index=venue_names, columns=venue_columns)

df_venues['Number of datapoints'] = df_venues['Number of datapoints'].astype(int)
df_venues['Number of grouped dates'] = df_venues['Number of grouped dates'].astype(int)

display(Markdown(f'**Grouped information about appearances at clubs**: In a separate dataframe (`df_venues`).'))
df_venues.sort_values('Number of grouped dates', ascending=False).head(5)

**Grouped information about appearances at clubs**: In a separate dataframe (`df_venues`).

Unnamed: 0,All dates,First date in dataset,Grouped dates,Last date in dataset,Number of datapoints,Number of grouped dates
"Club Richman (New York, NY)","[1935-01-03, 1934-07-12, 1936-02-15, 1934-01-2...",1931-12-13,"[[1931-12-13 00:00:00], [1932-02-06 00:00:00],...",1944-12-06,94,24
"Cabin Inn (Chicago, IL)","[1939-09-09, 1937-03-13, 1936-09-12, 1938-08-2...",1935-09-21,"[[1935-09-21 00:00:00], [1936-09-12 00:00:00],...",1939-10-28,58,14
"Finocchio's (San Francisco, CA)","[1941-05-23, 1942-01-30, 1945-03-01, 1946-08-2...",1938-12-22,"[[1938-12-22 00:00:00, 1938-12-22 00:00:00, 19...",1947-01-18,38,13
"Club Frontenac (Detroit, MI)","[1937-03-13, 1941-12-16, 1937-04-24, 1937-02-1...",1937-01-09,"[[1937-01-09 00:00:00, 1937-01-22 00:00:00, 19...",1942-06-04,110,13
"Joe’s Deluxe (Chicago, IL)","[1943-03-27, 1945-08-04, 1947-12-20, 1948-04-0...",1935-12-07,"[[1935-12-07 00:00:00], [1943-03-27 00:00:00, ...",1948-10-02,40,12


In [441]:
# Number the periods of every venue (1, 2, ...) and look the number up for
# each (venue, date) pair. A dictionary lookup replaces re-scanning the whole
# dataframe for every single date.
period_lookup = {}
for venue, row in df_venues.iterrows():
    for period, date_list in enumerate(row['Grouped dates'], start=1):
        for date in date_list:
            period_lookup[(venue, date.strftime('%Y-%m-%d'))] = period

# Stored as floats: a row without a matching period gets NaN.
df['Venue - Date grouping'] = pd.Series(
    [period_lookup.get(key) for key in zip(df['Venue'], df['Date'])],
    index=df.index,
    dtype='float64',
)

display(Markdown(f'**Added date groupings to main dataframe**.'))
df.sort_values('Venue - Date grouping', ascending=False).head()

**Added date groupings to main dataframe**.

Unnamed: 0,Date,Performer,Source,Edge Comment,Quote from source,Comment on node: performer,Comment on node: venue,Comment on node: city,Comment on edge: revue,Blackface,Sepia,Fan dancer/Sally Rand,Exotic/erotic/oriental dancer/Gypsy,Has image,Address,Revue,Venue,Venue - Date grouping
4394,1944-12-06,Tex Hendricks,"OBITUARIES, Variety, 156, no. 13, December 6, ...",,,"Appeared as ""sepia dame""",,,,,True,,,,,,"Club Richman (New York, NY)",24.0
4229,1936-05-09,Satch and Satchell,"Club Richman Charged With Objectionable Show, ...",,,,,,,,,,,,,,"Club Richman (New York, NY)",23.0
2423,1936-05-09,Jack Mason,"Club Richman Charged With Objectionable Show, ...",,,MC,,,,,,,,,,,"Club Richman (New York, NY)",23.0
3353,1936-03-21,Lester Lamont,"Route Department, The Billboard, March 21, 1936",,,,,,,,,,,,,,"Club Richman (New York, NY)",22.0
2421,1936-04-04,Jack Mason,"Route Department, The Billboard, April 4, 1936...",,,,,,,,,,,,,Jack Mason and His Playboy Revue,"Club Richman (New York, NY)",22.0


In [442]:
############### SETUP NETWORK VISUALIZATION #############################################

In [450]:
# Build the co-appearance graph: two performers are connected when they
# appear at the same venue within the same date grouping.
G = nx.MultiGraph()

# Lower-cased once; a performer whose name contains any of these is skipped.
filtered_names = [x.lower() for x in GRAPH_SETTINGS['FILTER_NODES']]

for groups, data in df.groupby(['Venue', 'Venue - Date grouping']):
    venue, period = groups

    # Sorted so that nodes and edges are inserted in the same order on every
    # run (iteration order of a set of strings differs between kernels).
    performers = sorted(
        name for name in set(data['Performer'])
        if not any(filtered_name in name.lower() for filtered_name in filtered_names)
    )
    if len(performers) < 2:
        continue

    in_date_grouping = len(data)
    dates = pd.to_datetime(data['Date'])
    min_date = dates.min().strftime('%Y-%m-%d')
    max_date = dates.max().strftime('%Y-%m-%d')

    # The graph is undirected, so each unordered pair is visited once.
    for position, source in enumerate(performers):
        for target in performers[position + 1:]:
            # Key 0 is always used: a pair shares a single edge whose
            # `venues` list grows with every further venue they share.
            edge = (source, target, 0)
            if edge in G.edges:
                if not venue in G.edges[edge]['venues']:
                    G.edges[edge]['venues'].append(venue)
            else:
                # The remaining attributes describe the first grouping in
                # which the pair was found; they are not updated afterwards.
                G.add_edges_from([(source, target, 0, {'in_date_grouping': in_date_grouping, 'venues': [venue], 'min_date': min_date, 'max_date': max_date})])

display(Markdown('**Generated network graph**.'))

**Generated network graph**.

In [451]:
display(Markdown('**Setting up community information**: using the Girvan-Newman algorithm.'))

# girvan_newman returns an iterator over successive partitions; only the
# first partition is used.
c = nx.community.girvan_newman(G)
first_girvan_newman_iteration = next(c)

# Number the communities from 1: {group number: [performer names]}.
girvan_newman_groups = dict(enumerate(map(list, first_girvan_newman_iteration), start=1))

**Setting up community information**: using the Girvan-Newman algorithm.

In [452]:
display(Markdown('**Setting up community information**: using the Clauset-Newman-Moore algorithm.'))

c = nx.community.greedy_modularity_communities(G)

# Number the communities from 1: {group number: [performer names]}.
clauset_newman_moore_groups = dict(enumerate((list(members) for members in c), start=1))

**Setting up community information**: using the Clauset-Newman-Moore algorithm.

In [453]:
def get_modularity(row, groups):
    '''(internal) row-wise helper for DataFrame.apply: returns the key of the first group in `groups` (a {group: names} mapping) that contains the row's `Performer`, or 0 when no group does'''
    matching_groups = (
        group
        for group, members in groups.items()
        if row['Performer'] in members
    )
    return next(matching_groups, 0)

# Record, for every row, the community number of its performer under each
# algorithm (0 when the performer is not part of any community).
df['Girvan Newman Modularity'] = df.apply(get_modularity, axis=1, args=(girvan_newman_groups,))
df['Clauset-Newman-Moore Modularity'] = df.apply(get_modularity, axis=1, args=(clauset_newman_moore_groups,))

display(Markdown('**Added community information to dataframe**.'))

**Added community information to dataframe**.

In [454]:
# NOTE(review): this import sits mid-notebook rather than in the import cell
# at the top; consider moving it there so dependencies are visible up front.
import spacy
# Load the spaCy pipeline named 'en_core_web_sm'. `nlp` is read as a global
# by get_keywords to tokenise and lemmatise quoted text.
nlp = spacy.load('en_core_web_sm')

In [455]:
# Substring -> replacement, applied (in this order) to lower-cased comments.
comment_replacer = dict([
    ('m.c.', 'mc'),
    ('m. c.', 'mc'),
    ('credited as', ''),
])

# A comment containing any of these substrings is skipped entirely.
comment_skipper = [
    'name inferred from', 'name corroborated by', 'corroborates that',
    'unclear who was in drag', 'corroborated',
]

# Passed to get_clean_comments.
keyword_searches = ['snake dancer', 'fan dance', '']


def get_clean_comments(comments, comment_replacer, comment_skipper, keyword_searches):
    '''Normalise free-text comments into a list of unique, cleaned fragments.

    Steps: lower-case every comment; skip comments containing any substring
    from `comment_skipper`; split the rest on '; '; apply the
    `comment_replacer` substitutions in order (stripping whitespace after
    each); remove duplicates.

    Parameters:
        comments: iterable of comment strings.
        comment_replacer: dict mapping substring -> replacement.
        comment_skipper: iterable of substrings that disqualify a comment.
        keyword_searches: accepted for interface compatibility; not used.

    Returns:
        list of str in order of first appearance, so the result is the same
        on every run.
    '''
    fragments = []
    for comment in [x.lower() for x in comments]:
        if any(skip in comment for skip in comment_skipper):
            continue
        # split() yields the whole comment when there is no '; ' in it.
        fragments.extend(comment.split('; '))

    cleaned = []
    for fragment in fragments:
        for old, new in comment_replacer.items():
            fragment = fragment.replace(old, new).strip()
        cleaned.append(fragment)

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(cleaned))


def get_keywords(clean_comments, exclude):
    '''Extract keyword lemmas from the double-quoted passages of comments.

    Every passage between double quotes is run through the global spaCy
    pipeline `nlp`; quote marks, punctuation and stop words are dropped and
    the lemmas of the remaining tokens are collected.

    Parameters:
        clean_comments: iterable of comment strings.
        exclude: collection of lemmas to leave out (the caller passes the
            lower-cased words of the performer's name).

    Returns:
        list of str, in order of appearance; duplicates are kept.
    '''
    keywords = []
    for comment in clean_comments:
        for quote in re.findall(r'(?:"(.*?)")', comment):
            doc = nlp(quote)
            lemmas = [
                token.lemma_
                for token in doc
                if not token.is_quote and not token.is_punct and not token.is_stop
            ]
            # Filter against the `exclude` argument itself rather than a
            # global variable that the caller happens to have set.
            keywords.extend(lemma for lemma in lemmas if lemma not in exclude)
    return keywords


# Copy per-performer information from the dataframe onto the graph nodes:
# both community numbers, keywords extracted from quoted comment passages,
# and the raw comments.
for node in G.nodes:
    # Filter the dataframe once per performer instead of once per attribute.
    performer_rows = df[df['Performer'] == node]

    for column, label in (
        ('Girvan Newman Modularity', 'Girvan-Newman'),
        ('Clauset-Newman-Moore Modularity', 'Clauset-Newman-Moore'),
    ):
        modularity = list(set(performer_rows[column]))
        if len(modularity) > 1:
            raise RuntimeError(f'A performer was found ({node}) that belonged to more than one {label} group. It should not be possible.')
        G.nodes[node][column] = modularity[0]

    # Sorted so that the joined strings written to the graph are identical
    # on every run (set iteration order is not stable between kernels).
    comments = sorted({x for x in performer_rows['Comment on node: performer'] if x})
    clean_comments = get_clean_comments(comments, comment_replacer, comment_skipper, keyword_searches)

    # The words of the performer's own name are excluded from the keywords.
    names = node.lower().split()
    keywords = get_keywords(clean_comments, names)

    G.nodes[node]['Keywords'] = ', '.join(sorted(set(keywords)))
    G.nodes[node]['Comments'] = ', '.join(comments)

display(Markdown('**Added/adjusted node information in graph**.'))

**Added/adjusted node information in graph**.

In [456]:
# Set each edge's weight to the number of venues the two performers shared
# and flatten the venue list into a comma-separated string.
# TODO: This might mean that we're doing/should do weighted edges
for edge in G.edges:
    edge_attributes = G.edges[edge]
    venues = edge_attributes['venues']
    if isinstance(venues, str):
        # Already flattened by an earlier run of this cell. Counting again
        # would measure the characters of the string, not the venues.
        continue
    edge_attributes['weight'] = len(venues)
    edge_attributes['venues'] = ', '.join(venues)

display(Markdown('**Added/adjusted edge information in graph**.'))

**Added/adjusted edge information in graph**.

In [457]:
# Compose the export name: graph[-<min year>][-<max year>] (exported <today>).gexf
filename = 'graph'
for bound in (SETTINGS.get('MIN_DATE'), SETTINGS.get('MAX_DATE')):
    if bound:
        filename = f'{filename}-{bound.year}'
filename = filename + datetime.datetime.now().strftime(' (exported %Y-%m-%d)') + '.gexf'

# Write the graph in GEXF format to the current working directory.
nx.write_gexf(G, filename)

display(Markdown(f'**File saved**: {filename}.'))

**File saved**: graph-1920-1950 (exported 2021-04-14).gexf.

In [405]:
# Preview a random selection of rows. The fixed seed makes the preview
# reproducible, and the size is capped so the cell also works when fewer
# than 50 rows are left after filtering.
df.sample(n=min(50, len(df)), random_state=0)

Unnamed: 0,Date,Performer,Source,Edge Comment,Quote from source,Comment on node: performer,Comment on node: venue,Comment on node: city,Comment on edge: revue,Revue,Venue,Venue - Date grouping,Girvan Newman Modularity,Clauset-Newman-Moore Modularity
1985,1935-08-14,Jean La Monte,"Jamestown Evening Journal, August 14, 1935, 14",,,"""In His Famous Fan Dance and the Continental""",,,,Fay Norman's Gay Boy Revue,"Terrace Gardens (Warren, PA)",1.0,3,1
2719,1936-12-11,Unnamed performer at The Paddock 14,"The Courier-Journal, December 11, 1936, 40",,,,,,"""14 Company""",Fay Norman's Gay Boy Revue,"The Paddock (Louisville, KY)",1.0,3,9
1487,1935-05-17,Jean Farrelly,"The Times Record, May 17, 1935, 10",,,,,,,Jack Mason's Play Boy Revue,"Echo Tavern (Albany, NY)",2.0,3,1
3791,1940-05-11,Nicki Gallucci,"Detroit Free Press, May 11, 1940, 18",,,,,,,Karyl Norman's All-Star Male Revue,"Club Frontenac (Detroit, MI)",5.0,3,1
418,1933-08-08,"Billy (""Senorita"") Herrera","Variety Bills, Variety, 111, no. 9, August 8, ...",,,,,,,,"K9 Club (Chicago, IL)",3.0,3,1
3955,1942-02-23,Francis Russell,"Miami Herald, 15",,,,,,,"Gals, Girls, Molls, Dolls","Boulevard Club (Miami, FL)",1.0,3,2
39,1925-11-29,Bert Errol,"Buffalo Courier, November 29, 1925, 54",,,,,,,,"Shea's Theatre (Buffalo, NY)",1.0,3,1
3371,1939-02-12,Patsy Keller,"The Miami Herald, February 12, 1939, 54",,,,,,,,"Kelly's Torch Club (Miami, FL)",1.0,3,2
307,1932-09-27,Unnamed performer at B.B.B.'s cellar 9,"HOLLYWOOD ADDS 2 NEW NIGHTERY SPOTS, Variety, ...",,,,,,"""B.B.B.'s cellar does not feature any one perf...",Unnamed revue,"B.B.B.'s (Los Angeles, CA)",1.0,3,16
2198,1935-11-30,Sepia Mae West,"The Chicago Defender, November 30, 1935, 8",,,"""Brown Mae West""",,,,,"Lincoln Theatre (Los Angeles, CA)",1.0,4,5
