In [47]:
# Lookup table: full name of each U.S. state, the District of Columbia and
# several territories -> two-letter postal abbreviation.
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}
# Inverse lookup: two-letter abbreviation -> full name.
abbrev_us_state = dict(map(reversed, us_state_abbrev.items()))

In [48]:
import pandas as pd
import re
from datetime import datetime as dt

# Inclusive (first_year, last_year) window of rows to keep.
# Set to None to disable the year filter.
YEAR_RANGE = (1930, 1940)

def remove(row):
    """Decide whether a spreadsheet row should be dropped from the dataset.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must expose ``Date`` as an attribute and the keys
        'Exclude from visualization', 'Unsure whether drag artist',
        'City', 'Performer' and 'Venue'.

    Returns
    -------
    bool
        True when the row must be removed, i.e. when any of these holds:
        the date is not a valid ``YYYY-MM-DD`` string; the year falls outside
        ``YEAR_RANGE`` (when a range is configured); one of the two exclusion
        flags is truthy; the performer is marked "unnamed"; or city,
        performer and venue are all empty. False otherwise.
    """
    try:
        date = dt.strptime(row.Date, '%Y-%m-%d')
    except (TypeError, ValueError):
        # Empty, malformed or non-string dates cannot be placed on the
        # timeline. Only parsing failures are caught so that unrelated
        # errors (e.g. a missing column) are not silently swallowed.
        return True

    # A falsy YEAR_RANGE means "no year filter" rather than "drop everything".
    if YEAR_RANGE and not (YEAR_RANGE[0] <= date.year <= YEAR_RANGE[1]):
        return True

    if row['Exclude from visualization'] or row['Unsure whether drag artist']:
        return True

    city, performer, venue = row['City'], row['Performer'], row['Venue']

    if 'unnamed' in performer.lower():
        return True

    # A row that names no city, performer or venue carries nothing to plot.
    if city == '' and performer == '' and venue == '':
        return True

    return False

def extract_addresses_dict(normalized_df):
    """Collect venue addresses from the normalised dataframe.

    Only rows whose 'Address' cell is not the empty string are considered.

    Parameters
    ----------
    normalized_df : pandas.DataFrame
        Needs the columns 'Source', 'Venue' and 'Address'.

    Returns
    -------
    dict
        ``{venue: {source: address}}``. The first address seen for a given
        (venue, source) pair wins.

    Side effects
    ------------
    Prints a warning listing, one per line, the addresses that belong to rows
    without a venue name.
    """
    addresses = {}
    unnamed = []
    with_address = normalized_df[normalized_df['Address'] != '']
    columns = zip(with_address['Source'], with_address['Venue'], with_address['Address'])
    for source, venue, address in columns:
        if venue == '':
            # str() keeps the join below safe for non-string cells.
            unnamed.append(str(address))
            continue
        addresses.setdefault(venue, {}).setdefault(source, address)

    if unnamed:
        print(f'Warning: {len(unnamed)} Venues with no names have addresses:')
        # One bullet per line, same layout as the warning in get_comments.
        print('- ' + '\n- '.join(unnamed))

    return addresses

def reverse_comment_dict(comment_dict):
    """Invert ``{key: {source: comment}}`` into ``{key: {comment: [sources]}}``.

    Every key of the input appears in the result, even when it has no
    comments. Sources that share the same comment text are grouped in the
    order in which they are encountered.
    """
    inverted = {}
    for name, by_source in comment_dict.items():
        by_comment = inverted.setdefault(name, {})
        for src, text in by_source.items():
            by_comment.setdefault(text, []).append(src)
    return inverted
        
def get_comments(df, comment_field='Comment on edge: revue', match_field='Revue', transform=None):
    """Gather the comments of one comment column, keyed by another column.

    Rows whose `comment_field` cell is the empty string are ignored. Each
    remaining comment is converted to ``str``, stripped and, when `transform`
    is given, passed through it.

    Returns ``{match_value: {source: comment}}``; the first comment found for
    a (match_value, source) pair is kept. Comments on rows whose `match_field`
    is empty are not returned; their first 40 characters are printed in a
    warning instead.
    """
    commented = df[df[comment_field] != '']
    collected = {}
    orphaned = []
    records = zip(commented['Date'], commented['Source'], commented[match_field], commented[comment_field])
    for _date, source, key, text in records:
        text = str(text).strip()
        if transform:
            text = transform(text)
        if key == '':
            orphaned.append(str(text)[:40]+'...')
            continue
        collected.setdefault(key, {}).setdefault(source, text)

    if orphaned:
        print(f'Warning: {len(orphaned)} mentions in `{comment_field}` with no value have comments:')
        print('- ' + '\n- '.join(orphaned))

    return collected

def get_revue_comments_dict(df):
    """Return the 'Comment on edge: revue' comments keyed by 'Revue'."""
    return get_comments(
        df,
        comment_field='Comment on edge: revue',
        match_field='Revue',
    )

def get_performer_comments_dict(df):
    """Return the 'Comment on node: performer' comments keyed by 'Performer'."""
    return get_comments(
        df,
        comment_field='Comment on node: performer',
        match_field='Performer',
    )

def get_venue_comments_dict(df):
    """Return the 'Comment on node: venue' comments keyed by 'Venue'."""
    return get_comments(
        df,
        comment_field='Comment on node: venue',
        match_field='Venue',
    )

def get_city_comments_dict(df):
    """Return the 'Comment on node: city' comments keyed by 'City'."""
    return get_comments(
        df,
        comment_field='Comment on node: city',
        match_field='City',
    )

def get_true_value(row, type):
    """Return the preferred (normalised) value of one logical field of a row.

    Parameters
    ----------
    row : mapping-like
        A spreadsheet row; only the columns of the requested field are read.
    type : str
        One of 'source', 'performer', 'city', 'revue' or 'venue'.

    Returns
    -------
    The normalised/clean column when it is not the empty string, otherwise
    the raw column. For 'performer' there is an intermediate fallback: when
    both 'Performer first-name' and 'Performer last-name' are filled in, the
    result is "<first> <last>".

    Raises
    ------
    NotImplementedError
        If `type` is not one of the supported field names.
    """
    if type == 'performer':
        if row['Normalized performer'] != '':
            return row['Normalized performer']
        first, last = row['Performer first-name'], row['Performer last-name']
        if first != '' and last != '':
            # 'Normalized performer' is known to be empty at this point, so
            # returning it (as the code previously did) always produced ''.
            return f'{first} {last}'
        return row['Performer']

    # field name -> (preferred column, fallback column)
    columns = {
        'source': ('Source clean', 'Source'),
        'city': ('Normalized City', 'City'),
        'revue': ('Normalized Revue Name', 'Revue name'),
        'venue': ('Normalized Venue', 'Venue'),
    }
    if type not in columns:
        raise NotImplementedError(f'type `{type}` is not yet implemented')

    preferred, fallback = columns[type]
    if row[preferred] != '':
        return row[preferred]
    return row[fallback]

def find_ref(row, eima=True):
    """Extract the numeric reference IDs attached to a row.

    The text of the columns 'Source', 'EIMA_ID', 'Newspapers_ID', 'EIMA',
    'Search (newspapers.com)' and 'Source clean' (missing columns count as
    empty) is scanned for runs of 7 to 10 digits. The row is classified as an
    EIMA row when that text mentions "eima", "variety" or "billboard"
    (case-insensitive).

    Parameters
    ----------
    row : mapping-like with a ``get`` method
    eima : bool
        True  -> return the IDs only if the row is an EIMA row.
        False -> return the IDs only if the row is NOT an EIMA row.

    Returns
    -------
    str
        The distinct IDs joined with '|', in order of first appearance, or ''
        when there is no ID or the row is of the other kind.
    """
    columns = ('Source', 'EIMA_ID', 'Newspapers_ID', 'EIMA',
               'Search (newspapers.com)', 'Source clean')
    # str() on every part so that numeric cells cannot break the join.
    haystack = ' '.join(str(row.get(column, '')) for column in columns)

    lowered = haystack.lower()
    is_eima = any(keyword in lowered for keyword in ('eima', 'variety', 'billboard'))

    # dict.fromkeys removes duplicates while keeping a stable order, so the
    # joined string is identical from one run to the next.
    refs = list(dict.fromkeys(re.findall(r'\d{7,10}', haystack)))

    if refs and bool(eima) == is_eima:
        return '|'.join(refs)
    return ''
    

# Load the published spreadsheet and blank out missing cells and dash placeholders.
df = (
    pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vT0E0Y7txIa2pfBuusA1cd8X5OVhQ_D0qZC8D40KhTU3xB7McsPR2kuB7GH6ncmNT3nfjEYGbscOPp0/pub?gid=254069133&single=true&output=csv')
    .fillna('')
    .replace('—', '')
    .replace('—*', '')
)

# Reference IDs are derived first, while 'Source' still holds the raw text.
df['EIMA'] = df.apply(find_ref, axis=1)
df['Newspapers.com'] = df.apply(find_ref, axis=1, args=(False,))

# Overwrite each display column with its normalised value (order preserved).
for column, kind in (
    ('Source', 'source'),
    ('Venue', 'venue'),
    ('Performer', 'performer'),
    ('City', 'city'),
    ('Revue', 'revue'),
):
    df[column] = df.apply(get_true_value, axis=1, args=(kind,))

# Flag the rows rejected by remove() and drop them by index label.
df['remove'] = df.apply(remove, axis=1)
df = df.drop(index=df.index[df['remove'] == True])

In [49]:
# Index in this list == month number; the empty first entry makes January 1.
months = ['', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
# "<Month name> <day>, <year>" -> groups (month name, day, year).
# Every fragment is a raw string: '\d' in a plain string literal is an
# invalid escape sequence, which newer Python versions warn about.
DATE = re.compile(r'(' + '|'.join(month for month in months if month) + r') (\d{1,2}), (\d{4})')
# Two capital letters preceded by a space and followed by a space or ']'.
STATE = re.compile(r' ([A-Z][A-Z])( |\])')

In [50]:
from collections import Counter

# Sanity-check the 'Source' references of every newspaper:
#  - incorrect_refs  : (paper, reference) pairs without a "Month D, YYYY" date
#  - papers_no_state : number of dated references per paper for which no
#                      state could be determined from the paper's name
incorrect_refs = []
papers_no_state = Counter()

for paper, paper_rows in df.groupby('Newspaper'):
    if not paper or paper.startswith('*'):
        continue

    state = None
    for reference in paper_rows.Source:
        if 'page unknown' in reference or 'date unclear' in reference:
            continue

        reference = reference.replace(paper, '').strip(',')
        if not DATE.search(reference):
            incorrect_refs.append((paper, reference))
            continue

        abbrev_match = STATE.search(paper)
        if abbrev_match:
            # .get(): a two-letter code that is not a known abbreviation is
            # reported through papers_no_state instead of raising KeyError.
            state = abbrev_us_state.get(abbrev_match.group(1))
        else:
            # Fall back to a full state name appearing in the paper's name.
            for state_ in abbrev_us_state.values():
                if state_ in paper:
                    state = state_

        if not state and 'Canada' not in paper:
            papers_no_state[paper] += 1

incorrect_refs = list(set(incorrect_refs))
if incorrect_refs:
    print('ERROR: INCORRECT REFERENCES:')
    for ref in incorrect_refs:
        # Print the individual entry, not the whole list once per entry.
        print(ref)

ERROR: INCORRECT REFERENCES:
[('Broadway Tattler [New York NY]', 'Broadway Tattler, March 1933, 12 and 14')]


In [51]:
# Papers for which no state could be determined, most frequent first.
papers_no_state.most_common()

[]

In [52]:
def getmax(list1, N=10):
    """Return the (up to) N largest values of `list1`, largest first.

    Parameters
    ----------
    list1 : iterable of mutually comparable values
        Left untouched (the result is a new list).
    N : int
        Maximum number of values to return; values <= 0 give an empty list.

    Returns
    -------
    list
        At most N values in descending order, duplicates included. When
        `list1` has fewer than N items, all of them are returned.
    """
    return sorted(list1, reverse=True)[:max(N, 0)]
    
from slugify import slugify

# Per-state, per-year tallies:
#   tagged_by_state[state][year]   -> number of rows
#   performer_by_year[state][year] -> distinct non-empty performer names
# Every state starts with all years 1930-1940 set to 0 / [].
# NOTE: this cell groups on df['year'] and df['state'], which are only created
# further down (where get_state is defined); on a fresh kernel it raises
# KeyError: 'year' until that cell has been run.
tagged_by_state = {}
performer_by_year = {}

for (year, state), rows in df.groupby(['year', 'state']):
    if not state:
        continue

    if state not in tagged_by_state:
        tagged_by_state[state] = {y: 0 for y in range(1930,1941)}
        performer_by_year[state] = {y: [] for y in range(1930,1941)}

    tagged_by_state[state][year] = len(rows)
    performer_by_year[state][year] = list(set(name for name in rows.Performer if name))

tagged_by_state

KeyError: 'year'

In [None]:
# Collect, for each year, the row counts of all states ...
all_values = {}
for state, counts_by_year in tagged_by_state.items():
    for year, nums in counts_by_year.items():
        all_values.setdefault(year, []).append(nums)

# ... then keep only the largest counts of each year (see getmax).
for year in all_values:
    all_values[year] = getmax(all_values[year])

# all_values now maps year -> top counts for that year

In [None]:
# List every state whose count for a year is among that year's top counts.
for year, max_values in all_values.items():
    for state, counts_by_year in tagged_by_state.items():
        if counts_by_year[year] in max_values:
            print(state, year, counts_by_year[year])

In [53]:
def sumup(performer_data):
    """Return the total number of items over all values of `performer_data`.

    `performer_data` is a mapping whose values are iterables (here: the list
    of years per state); the result is the sum of their lengths.
    """
    return sum(len(list(items)) for items in performer_data.values())

states = list(performer_by_year)
years = range(1930, 1941)

# Every performer seen in ANY state and year, sorted by name. The pool does
# not depend on the state being processed, so it is built once.
state_performers = sorted({
    name
    for year in years
    for x in states
    for name in performer_by_year[x][year]
})

# all_state_performers[performer][state] -> years in which the performer was
# recorded in that state (an empty list when never recorded there).
all_state_performers = {}
for state in states:
    for performer in state_performers:
        years_appeared = [
            year for year in years
            if performer in performer_by_year[state][year]
        ]
        all_state_performers.setdefault(performer, {})[state] = years_appeared

# Total number of (state, year) appearances per performer.
for performer in all_state_performers:
    print(performer, sumup(all_state_performers[performer]))

all_state_performers

NameError: name 'performer_by_year' is not defined

In [54]:
# One node record per (state, year) pair, carrying the row count.
# NOTE(review): 'name' receives the readable label and 'display' the slug;
# the sketch inside the tally cell above assigned them the other way round —
# confirm which one the consumer of `nodes` expects.
nodes = []
links = []
for state, counts_by_year in tagged_by_state.items():
    for year, count in counts_by_year.items():
        label = f'{state} {year}'
        nodes.append({
            'name': label,
            'display': slugify(label),
            'count': count
        })

NameError: name 'tagged_by_state' is not defined

In [55]:
# Display the node records built in the previous cell.
nodes

[]

In [56]:
#### Quick rethink: model the data as objects

In [57]:
# This cell used to hold a string-quoted (i.e. never executed) copy of the
# loading/cleaning cell near the top of the notebook: the same helper
# functions, the same pd.read_csv call and the same normalisation steps.
# The duplicate was removed so that there is a single definition to maintain;
# re-run the original cell to rebuild `df`.
pass

In [58]:
# Disabled prototype: the whole body below is a string literal, so none of it
# runs. It sketches a `Source` object built from the rows that share a
# 'Source' value.
# NOTE(review) before reviving it:
#  - __str__ reads `eima_id` / `newspaper_id` without `self.`;
#  - `edges = []` is a class attribute and would be shared by all instances;
#  - the loop reads `rows.Newspaper_ID`, whereas find_ref looks up
#    'Newspapers_ID' — confirm the actual column name.
'''
class Source():
    newspaper = None
    reference = None
    edges = []
    eima_id = None
    newspaper_id = None
    
    def __init__(self):
        pass
    
    def __str__(self):
        return f'{eima_id}, {newspaper_id}'
        
for source, rows in df.groupby('Source'):
    s = Source()
    edges = list(zip([x for x in rows.Performer], list(zip([x for x in rows.Venue], [x for x in rows.City]))))
    tagged_performers = list(set([x for x in rows.Performer]))
    tagged_venues = list(set(zip([x for x in rows.Venue], [x for x in rows.City])))
    dates = list(set(x for x in rows.Date))
    eima_id = list(set(x for x in rows.EIMA_ID if x))
    if eima_id:
        s.eima_id = eima_id[0]
    newspaper_id = list(set([x for x in rows.Newspaper_ID if x]))
    if newspaper_id:
        s.newspaper_id = newspaper_id[0]
    if len(dates) > 1:
        print(len(tagged_performers))
        print(len(tagged_venues))
        print(source)
    
    print(s)
'''
pass

In [59]:
def get_state(row):
    """Return the state name for a row's ``City`` value.

    ``row.City`` is split on ', '; the piece right after the first separator
    is taken as the region. Returns '' when there is no such piece or when it
    is one of 'Canada', 'Mexico', 'Cuba' or 'Switzerland'; otherwise returns
    ``abbrev_us_state[region]``.

    Raises KeyError when the region is not a key of ``abbrev_us_state``.
    """
    pieces = row.City.split(', ')
    if len(pieces) < 2:
        return ''

    region = pieces[1]
    if region in ('Canada', 'Mexico', 'Cuba', 'Switzerland'):
        return ''

    return abbrev_us_state[region]
    

# Derived columns used by the per-state / per-year groupings:
#   year  -> calendar year parsed from 'Date'
#   state -> state name resolved from 'City' (see get_state), '' if none
df['year'] = df.apply(lambda row: pd.to_datetime(row['Date']).year, axis=1)
df['state'] = df.apply(get_state, axis=1)
# (A trailing loop over df.groupby(['year', 'state', 'Performer']) that only
# unpacked each key and did nothing with it has been removed.)


In [60]:
# all_travels[performer][year] is either [] (nothing recorded) or a tuple
# (from_state, to_states): the state the performer was recorded in during
# `year`, and the states they were recorded in during the following year.
all_travels = {
    name: {y: [] for y in range(1930,1941)}
    for name in sorted({p for p in df.Performer if p})
}

for (year, state, performer), _rows in df.groupby(['year', 'state', 'Performer']):
    if not state or not performer:
        continue
    if year >= 1940:
        # The following year lies outside the studied range.
        continue

    # Boolean masks instead of a string-built df.query(): no quote escaping
    # of performer names, and the year is compared as a number, not as text.
    following = df[(df.Performer == performer) & (df.year == year + 1)]
    following = following.sort_values('Date', kind='stable')

    # Distinct destination states in chronological order of their LAST
    # appearance, so that to[-1] is always the state the performer was last
    # seen in that year (list(set(...)) made this order arbitrary).
    visited = [s for s in following.state if s]
    to = list(reversed(list(dict.fromkeys(reversed(visited)))))

    if to:
        all_travels[performer][year] = (state, to)

In [61]:
# We're going to create the sankey diagram, where we will assume that the performer stayed in the same place as they last arrived

def last_previous_trip(performer, year):
    """Find the most recent recorded trip of `performer` before `year`.

    Walks backwards from ``year - 1`` down to 1930 through
    ``all_travels[performer]`` and stops at the first year with a recorded
    trip.

    Returns ``(trip_year, last_destination)`` where `last_destination` is the
    final entry of that trip's destination list, or ``(None, None)`` when no
    earlier trip exists.
    """
    first_year = min(range(1930,1941))
    for candidate in range(year - 1, first_year - 1, -1):
        trip = all_travels[performer][candidate]
        if len(trip):
            return (candidate, trip[1][-1])
    return (None, None)

# padded_travels[performer][year] -> {'from', 'to', 'inferred'}.
# Years with a recorded trip are copied over (inferred=False); the other
# years are filled with the performer's last known destination
# (inferred=True, or None/[None] when nothing earlier is known).
# Performers without any recorded trip are removed altogether.
padded_travels = {
    name: {y: {'from': None, 'to': [None]} for y in range(1930,1941)}
    for name in sorted({p for p in df.Performer if p})
}

for performer, travels in all_travels.items():
    # A recorded trip is a (from, to) tuple; unrecorded years hold [].
    years_recorded = sum(1 for trip in travels.values() if len(trip) > 1)

    print()
    print('***********')
    if not years_recorded:
        print(performer, '(no trips)')
        del padded_travels[performer]
        print('***********')
        continue
    print(performer)
    print('***********')

    for year, trip in travels.items():
        if trip:
            _from, to = trip
            if len(to) == 1 and to[0] == _from:
                print(f'in {year}, {performer} was only found in {_from}')
            elif len(to) == 1:
                print(f'in {year}, {performer} traveled from {_from} to {to[0]}')
            elif len(to) > 0:
                print(f'in {year}, {performer} traveled from {_from} to')
                # Every destination gets its own bullet.
                print('- ' + '\n- '.join(to))
            padded_travels[performer][year] = {
                'from': _from,
                'to': to,
                'inferred': False
            }
        else:
            previous_trip_year, previous_trip_from = last_previous_trip(performer, year)
            if previous_trip_year:
                print(f'{performer} did not travel in {year} but we will assume that they stayed in their last location ({previous_trip_from}) since {previous_trip_year}. --> means we have to fill out {list(range(previous_trip_year+1, year+1))}')
            else:
                print(f'{performer} did not travel in {year} but we looked backwards and saw that we do not have a previously recorded trip')
            padded_travels[performer][year] = {
                'from': previous_trip_from,
                'to': [previous_trip_from],
                'inferred': True
            }



***********
Adrian Ames
***********
in 1930, Adrian Ames was only found in New York
Adrian Ames did not travel in 1931 but we will assume that they stayed in their last location (New York) since 1930. --> means we have to fill out [1931]
Adrian Ames did not travel in 1932 but we will assume that they stayed in their last location (New York) since 1930. --> means we have to fill out [1931, 1932]
Adrian Ames did not travel in 1933 but we will assume that they stayed in their last location (New York) since 1930. --> means we have to fill out [1931, 1932, 1933]
in 1934, Adrian Ames was only found in New York
Adrian Ames did not travel in 1935 but we will assume that they stayed in their last location (New York) since 1934. --> means we have to fill out [1935]
Adrian Ames did not travel in 1936 but we will assume that they stayed in their last location (New York) since 1934. --> means we have to fill out [1935, 1936]
Adrian Ames did not travel in 1937 but we will assume that they stayed in

Deanna Durbin (no trips)
***********

***********
Dee Liddell (no trips)
***********

***********
Del Dreer (no trips)
***********

***********
Del Le Roy
***********
Del Le Roy did not travel in 1930 but we looked backwards and saw that we do not have a previously recorded trip
Del Le Roy did not travel in 1931 but we looked backwards and saw that we do not have a previously recorded trip
Del Le Roy did not travel in 1932 but we looked backwards and saw that we do not have a previously recorded trip
Del Le Roy did not travel in 1933 but we looked backwards and saw that we do not have a previously recorded trip
Del Le Roy did not travel in 1934 but we looked backwards and saw that we do not have a previously recorded trip
in 1935, Del Le Roy traveled from New York to Ohio
in 1936, Del Le Roy traveled from Ohio to Michigan
Del Le Roy did not travel in 1937 but we will assume that they stayed in their last location (Michigan) since 1936. --> means we have to fill out [1937]
Del Le Roy di

Harry Brown (no trips)
***********

***********
Harry E. Brewster (no trips)
***********

***********
Harry Fink (no trips)
***********

***********
Harry Kelly (no trips)
***********

***********
Harry Pepper (no trips)
***********

***********
Harvey Lee
***********
Harvey Lee did not travel in 1930 but we looked backwards and saw that we do not have a previously recorded trip
Harvey Lee did not travel in 1931 but we looked backwards and saw that we do not have a previously recorded trip
Harvey Lee did not travel in 1932 but we looked backwards and saw that we do not have a previously recorded trip
Harvey Lee did not travel in 1933 but we looked backwards and saw that we do not have a previously recorded trip
Harvey Lee did not travel in 1934 but we looked backwards and saw that we do not have a previously recorded trip
in 1935, Harvey Lee traveled from New York to
- Ohio
Texas
New York
Harvey Lee did not travel in 1936 but we will assume that they stayed in their last location (New 

Lee Carroll did not travel in 1940 but we will assume that they stayed in their last location (Ohio) since 1936. --> means we have to fill out [1937, 1938, 1939, 1940]

***********
Lee Mason (no trips)
***********

***********
Lee Moore (no trips)
***********

***********
Lena Rivers
***********
Lena Rivers did not travel in 1930 but we looked backwards and saw that we do not have a previously recorded trip
Lena Rivers did not travel in 1931 but we looked backwards and saw that we do not have a previously recorded trip
Lena Rivers did not travel in 1932 but we looked backwards and saw that we do not have a previously recorded trip
Lena Rivers did not travel in 1933 but we looked backwards and saw that we do not have a previously recorded trip
Lena Rivers did not travel in 1934 but we looked backwards and saw that we do not have a previously recorded trip
in 1935, Lena Rivers traveled from Missouri to
- Missouri
Ohio
Texas
in 1936, Lena Rivers traveled from Texas to
- Missouri
Pennsylva

***********
Rocky twins
***********
Rocky twins did not travel in 1930 but we looked backwards and saw that we do not have a previously recorded trip
Rocky twins did not travel in 1931 but we looked backwards and saw that we do not have a previously recorded trip
in 1932, Rocky twins was only found in California
Rocky twins did not travel in 1933 but we will assume that they stayed in their last location (California) since 1932. --> means we have to fill out [1933]
Rocky twins did not travel in 1934 but we will assume that they stayed in their last location (California) since 1932. --> means we have to fill out [1933, 1934]
Rocky twins did not travel in 1935 but we will assume that they stayed in their last location (California) since 1932. --> means we have to fill out [1933, 1934, 1935]
Rocky twins did not travel in 1936 but we will assume that they stayed in their last location (California) since 1932. --> means we have to fill out [1933, 1934, 1935, 1936]
Rocky twins did not travel

In [62]:
# Reorganize into source - target:

# source_targets = {}
# Both tables have the shape {from_state: {to_state: {year: [performers]}}}:
#   padded_travels_performers -> every (from, to, year) link
#   inferred_travels          -> only the links that were inferred
padded_travels_performers = {}
inferred_travels = {}


def _record_trip(table, origin, destination, year, performer):
    """Append `performer` to table[origin][destination][year], once."""
    bucket = table.setdefault(origin, {}).setdefault(destination, {}).setdefault(year, [])
    # Keeps insertion order; rebuilding the list through set() after every
    # append made the order of performers arbitrary.
    if performer not in bucket:
        bucket.append(performer)


for performer, travels in padded_travels.items():
    for year, data in travels.items():
        # Drop the None placeholders of years without any known location.
        data['to'] = [x for x in data['to'] if x]
        inferred = data['inferred']
        origin = data['from']
        if not (origin and len(data['to'])):
            continue  # no recorded or inferred location for this year

        print(performer, year, data['to'], origin)
        # Only the last destination counts: moves inside a year are ignored,
        # what matters is where the performer started and ended up.
        last_to = data['to'][-1]

        if inferred:
            _record_trip(inferred_travels, origin, last_to, year, performer)

        # origin / last_to could be slugified here if needed.
        _record_trip(padded_travels_performers, origin, last_to, year, performer)

Adrian Ames 1930 ['New York'] New York
Adrian Ames 1931 ['New York'] New York
Adrian Ames 1932 ['New York'] New York
Adrian Ames 1933 ['New York'] New York
Adrian Ames 1934 ['New York'] New York
Adrian Ames 1935 ['New York'] New York
Adrian Ames 1936 ['New York'] New York
Adrian Ames 1937 ['New York'] New York
Adrian Ames 1938 ['New York'] New York
Adrian Ames 1939 ['New York'] New York
Adrian Ames 1940 ['New York'] New York
Al Benson 1933 ['Illinois'] Illinois
Al Benson 1934 ['Illinois'] Illinois
Al Benson 1935 ['Illinois'] Illinois
Al Benson 1936 ['Illinois'] Illinois
Al Benson 1937 ['Illinois'] Illinois
Al Benson 1938 ['Illinois'] Illinois
Al Benson 1939 ['Illinois'] Illinois
Al Benson 1940 ['Illinois'] Illinois
Al DeMarco 1934 ['New York'] New York
Al DeMarco 1935 ['New York'] New York
Al DeMarco 1936 ['Pennsylvania'] New York
Al DeMarco 1937 ['Pennsylvania'] Pennsylvania
Al DeMarco 1938 ['Pennsylvania'] Pennsylvania
Al DeMarco 1939 ['Pennsylvania'] Pennsylvania
Al DeMarco 1940 ['P

Carlo Levins 1936 ['Michigan', 'Kentucky'] Michigan
Carlo Levins 1937 ['Kentucky'] Kentucky
Carlo Levins 1938 ['Kentucky'] Kentucky
Carlo Levins 1939 ['Kentucky'] Kentucky
Carlo Levins 1940 ['Kentucky'] Kentucky
Carol Lee 1938 ['Illinois'] Illinois
Carol Lee 1939 ['Illinois'] Illinois
Carol Lee 1940 ['Illinois'] Illinois
Chickie Mears 1936 ['Kentucky'] Ohio
Chickie Mears 1937 ['Kentucky'] Kentucky
Chickie Mears 1938 ['Kentucky'] Kentucky
Chickie Mears 1939 ['Kentucky'] Kentucky
Chickie Mears 1940 ['Kentucky'] Kentucky
Clara Bow 1936 ['Pennsylvania'] Pennsylvania
Clara Bow 1937 ['Pennsylvania'] Pennsylvania
Clara Bow 1938 ['Pennsylvania'] Pennsylvania
Clara Bow 1939 ['Pennsylvania'] Pennsylvania
Clara Bow 1940 ['Pennsylvania'] Pennsylvania
Connie Walsh 1935 ['Michigan'] Pennsylvania
Connie Walsh 1936 ['Michigan', 'Kentucky'] Michigan
Connie Walsh 1937 ['Kentucky'] Kentucky
Connie Walsh 1938 ['Kentucky'] Kentucky
Connie Walsh 1939 ['Kentucky'] Kentucky
Connie Walsh 1940 ['Kentucky'] Kent

Gene Russell 1938 ['Florida'] Florida
Gene Russell 1939 ['Florida'] Florida
Gene Russell 1940 ['Florida'] Florida
George Hayes 1936 ['Ohio'] Ohio
George Hayes 1937 ['Ohio'] Ohio
George Hayes 1938 ['Ohio'] Ohio
George Hayes 1939 ['Ohio'] Ohio
George Hayes 1940 ['Ohio'] Ohio
George Kelly 1939 ['Florida', 'New York'] New York
George Kelly 1940 ['New York'] New York
George Oliver 1933 ['Illinois'] Illinois
George Oliver 1934 ['Illinois'] Illinois
George Oliver 1935 ['Illinois'] Illinois
George Oliver 1936 ['Illinois'] Illinois
George Oliver 1937 ['Illinois'] Illinois
George Oliver 1938 ['Illinois'] Illinois
George Oliver 1939 ['Illinois'] Illinois
George Oliver 1940 ['Illinois'] Illinois
Georgie Kaye 1934 ['Ohio', 'Florida', 'Texas', 'Louisiana'] Pennsylvania
Georgie Kaye 1935 ['Texas'] Texas
Georgie Kaye 1936 ['Texas'] Texas
Georgie Kaye 1937 ['Texas'] Texas
Georgie Kaye 1938 ['Texas'] Texas
Georgie Kaye 1939 ['Texas'] Texas
Georgie Kaye 1940 ['Texas'] Texas
Gita Gilmore 1936 ['Ohio'] Tex

Johnny Mangum 1937 ['California'] Missouri
Johnny Mangum 1938 ['California', 'Florida'] California
Johnny Mangum 1939 ['Michigan', 'Florida'] Florida
Johnny Mangum 1940 ['Florida'] Florida
Karyl Norman 1930 ['New York'] New York
Karyl Norman 1931 ['California', 'Illinois'] New York
Karyl Norman 1932 ['California'] Illinois
Karyl Norman 1933 ['Illinois', 'New York'] California
Karyl Norman 1934 ['New Jersey', 'Ohio', 'New York'] New York
Karyl Norman 1935 ['Ohio', 'Nebraska', 'Massachusetts'] Ohio
Karyl Norman 1936 ['Iowa', 'New York'] Ohio
Karyl Norman 1937 ['New York'] New York
Karyl Norman 1938 ['New York'] New York
Karyl Norman 1939 ['Michigan'] Ohio
Karyl Norman 1940 ['Michigan'] Michigan
La Belle Rose 1932 ['New York'] New York
La Belle Rose 1933 ['New York'] New York
La Belle Rose 1934 ['New York'] New York
La Belle Rose 1935 ['New York'] New York
La Belle Rose 1936 ['New York'] New York
La Belle Rose 1937 ['New York'] New York
La Belle Rose 1938 ['New York'] New York
La Belle Ro

Ralph Gilbert 1940 ['Illinois'] Illinois
Reeder Richards 1938 ['California'] California
Reeder Richards 1939 ['California'] California
Reeder Richards 1940 ['California'] California
Reggie "Dolly" Windsor 1935 ['Ohio'] New York
Reggie "Dolly" Windsor 1936 ['Ohio'] Ohio
Reggie "Dolly" Windsor 1937 ['Ohio'] Ohio
Reggie "Dolly" Windsor 1938 ['Ohio'] Ohio
Reggie "Dolly" Windsor 1939 ['Ohio'] Ohio
Reggie "Dolly" Windsor 1940 ['Ohio'] Ohio
Rocky twins 1932 ['California'] California
Rocky twins 1933 ['California'] California
Rocky twins 1934 ['California'] California
Rocky twins 1935 ['California'] California
Rocky twins 1936 ['California'] California
Rocky twins 1937 ['California'] California
Rocky twins 1938 ['California'] California
Rocky twins 1939 ['California'] California
Rocky twins 1940 ['California'] California
Roni Warren 1934 ['Maryland', 'New York'] New York
Roni Warren 1935 ['North Carolina'] New York
Roni Warren 1936 ['Connecticut'] North Carolina
Roni Warren 1937 ['Connecticut'

In [63]:
# Inspect the nested mapping built by the loop above:
# source state -> target state -> year -> list of performer names.
padded_travels_performers

{'New York': {'New York': {1930: ['Billy Reed',
    'Frances Dunn',
    'Jackie Maye',
    'Adrian Ames',
    'Jarahal',
    'Karyl Norman',
    'Frances Fay',
    'Jean Malin',
    'Arthur Budd',
    'Lester Lamont'],
   1931: ['Billy Reed',
    'Frank Doran',
    'Frances Dunn',
    'Jackie Maye',
    'Adrian Ames',
    'Jarahal',
    'Frances Fay',
    'Arthur Budd',
    'Arthur G. West',
    'G. Doran',
    'Jean Malin',
    'Niles Marsh',
    'William Harris',
    'Lester Lamont'],
   1932: ['Billy Reed',
    'Frank Doran',
    'Frances Dunn',
    'Jackie Maye',
    'Adrian Ames',
    'Jarahal',
    'La Belle Rose',
    'Frances Fay',
    'Niles Marsh',
    'Arthur G. West',
    'G. Doran',
    'Jean Malin',
    'Olyn Landick',
    'Arthur Budd',
    'William Harris',
    'Lester Lamont'],
   1933: ['Billy Reed',
    'Frank Doran',
    'Max Lengel',
    'Frances Dunn',
    'Jackie Maye',
    'Adrian Ames',
    'Jarahal',
    'La Belle Rose',
    'Frances Fay',
    'Niles Marsh',
 

In [64]:
# Give every source -> target pair an entry for each year from 1930 through
# 1940. A year with no recorded performers gets a fresh empty list; any key
# outside that span is dropped from the rebuilt mapping.
years = range(1930, 1941)
padded_travels_performers = {
    source: {
        target: {
            year: performers_by_year.get(year, [])
            for year in years
        }
        for target, performers_by_year in targets.items()
    }
    for source, targets in padded_travels_performers.items()
}

In [65]:
# Rebuild the source -> target -> year -> performers mapping so that every
# state keeps a continuous "stayed in place" lane:
#   * a non-empty performer list is carried over unchanged;
#   * an empty year on a self-loop (source == target) becomes ['filler'];
#   * an empty year on any other pair stays an empty list;
#   * a source with outgoing targets but no self-loop at all gets one that
#     is filled with ['filler'] for every year from 1930 through 1940.
#
# The loop variables are named explicitly instead of `_` / `__`: at cell level
# those names would be bound in the notebook namespace, where IPython uses
# them for the previous cell outputs.
new = {}
for source, targets in padded_travels_performers.items():
    new[source] = {}
    for target, performers_by_year in targets.items():
        new[source][target] = {}
        for year, performers in performers_by_year.items():
            if performers:
                new[source][target][year] = performers
            elif source == target:
                new[source][target][year] = ['filler']
            else:
                new[source][target][year] = []

    # Added after the real targets so the self-loop is appended last, exactly
    # where the former second pass over the data placed it. Sources without
    # any target are left without a self-loop, as before.
    if targets and source not in new[source]:
        new[source][source] = {year: ['filler'] for year in range(1930, 1941)}

padded_travels_performers = new.copy()

padded_travels_performers

{'New York': {'New York': {1930: ['Billy Reed',
    'Frances Dunn',
    'Jackie Maye',
    'Adrian Ames',
    'Jarahal',
    'Karyl Norman',
    'Frances Fay',
    'Jean Malin',
    'Arthur Budd',
    'Lester Lamont'],
   1931: ['Billy Reed',
    'Frank Doran',
    'Frances Dunn',
    'Jackie Maye',
    'Adrian Ames',
    'Jarahal',
    'Frances Fay',
    'Arthur Budd',
    'Arthur G. West',
    'G. Doran',
    'Jean Malin',
    'Niles Marsh',
    'William Harris',
    'Lester Lamont'],
   1932: ['Billy Reed',
    'Frank Doran',
    'Frances Dunn',
    'Jackie Maye',
    'Adrian Ames',
    'Jarahal',
    'La Belle Rose',
    'Frances Fay',
    'Niles Marsh',
    'Arthur G. West',
    'G. Doran',
    'Jean Malin',
    'Olyn Landick',
    'Arthur Budd',
    'William Harris',
    'Lester Lamont'],
   1933: ['Billy Reed',
    'Frank Doran',
    'Max Lengel',
    'Frances Dunn',
    'Jackie Maye',
    'Adrian Ames',
    'Jarahal',
    'La Belle Rose',
    'Frances Fay',
    'Niles Marsh',
 

In [66]:
import networkx as nx
import json, datetime


# Empty directed graph; the next cell adds one node per "<state> <year>" label
# and one edge from each state/year node to a state node in the following year.
G = nx.DiGraph()

In [70]:
# Translate each (source state, target state, year) bucket into a graph edge
# from "<source> <year>" to "<target> <year + 1>". The edge weight is the
# number of names in the bucket (filler entries included).
for source, targets in padded_travels_performers.items():
    for target, performers_by_year in targets.items():
        for year, performers in performers_by_year.items():
            value = len(performers)
            next_year = year + 1
            if next_year > 1940:
                # No following year to link to once past the end of the span.
                continue

            start_label = f'{source} {year}'
            end_label = f'{target} {next_year}'
            start_id = slugify(start_label)
            end_id = slugify(end_label)
            source_slug = slugify(source)
            target_slug = slugify(target)

            print(source, target, year, value)

            # Inferred names for this bucket; empty list when none are recorded.
            inferred = inferred_travels.get(source, {}).get(target, {}).get(year, [])

            G.add_node(start_id, **{'display': start_label, 'year': year, 'state': source_slug})
            G.add_node(end_id, **{'display': end_label, 'year': next_year, 'state': target_slug})

            edge_attributes = {
                'value': value,
                'startYear': year,
                'endYear': next_year,
                'startState': source_slug,
                'endState': target_slug,
                'performers': performers,
                'inferred': inferred,
            }
            G.add_edge(start_id, end_id, **edge_attributes)

New York New York 1930 10
New York New York 1931 14
New York New York 1932 16
New York New York 1933 18
New York New York 1934 42
New York New York 1935 36
New York New York 1936 33
New York New York 1937 37
New York New York 1938 39
New York New York 1939 38
New York Pennsylvania 1930 0
New York Pennsylvania 1931 0
New York Pennsylvania 1932 0
New York Pennsylvania 1933 0
New York Pennsylvania 1934 2
New York Pennsylvania 1935 3
New York Pennsylvania 1936 3
New York Pennsylvania 1937 0
New York Pennsylvania 1938 0
New York Pennsylvania 1939 0
New York New Jersey 1930 0
New York New Jersey 1931 0
New York New Jersey 1932 0
New York New Jersey 1933 1
New York New Jersey 1934 0
New York New Jersey 1935 2
New York New Jersey 1936 0
New York New Jersey 1937 0
New York New Jersey 1938 0
New York New Jersey 1939 0
New York Texas 1930 0
New York Texas 1931 0
New York Texas 1932 0
New York Texas 1933 0
New York Texas 1934 0
New York Texas 1935 2
New York Texas 1936 0
New York Texas 1937 0
New 

Pennsylvania Connecticut 1935 0
Pennsylvania Connecticut 1936 0
Pennsylvania Connecticut 1937 1
Pennsylvania Connecticut 1938 0
Pennsylvania Connecticut 1939 0
Michigan Michigan 1930 1
Michigan Michigan 1931 1
Michigan Michigan 1932 1
Michigan Michigan 1933 1
Michigan Michigan 1934 1
Michigan Michigan 1935 1
Michigan Michigan 1936 2
Michigan Michigan 1937 6
Michigan Michigan 1938 7
Michigan Michigan 1939 6
Michigan Minnesota 1930 0
Michigan Minnesota 1931 0
Michigan Minnesota 1932 0
Michigan Minnesota 1933 0
Michigan Minnesota 1934 0
Michigan Minnesota 1935 0
Michigan Minnesota 1936 0
Michigan Minnesota 1937 2
Michigan Minnesota 1938 0
Michigan Minnesota 1939 0
Michigan Ohio 1930 0
Michigan Ohio 1931 0
Michigan Ohio 1932 0
Michigan Ohio 1933 0
Michigan Ohio 1934 0
Michigan Ohio 1935 0
Michigan Ohio 1936 1
Michigan Ohio 1937 1
Michigan Ohio 1938 0
Michigan Ohio 1939 0
Michigan Kentucky 1930 0
Michigan Kentucky 1931 0
Michigan Kentucky 1932 0
Michigan Kentucky 1933 0
Michigan Kentucky 19

Ohio Maryland 1935 0
Ohio Maryland 1936 0
Ohio Maryland 1937 1
Ohio Maryland 1938 0
Ohio Maryland 1939 0
Florida Maryland 1930 0
Florida Maryland 1931 0
Florida Maryland 1932 0
Florida Maryland 1933 0
Florida Maryland 1934 2
Florida Maryland 1935 0
Florida Maryland 1936 0
Florida Maryland 1937 0
Florida Maryland 1938 0
Florida Maryland 1939 0
Florida Florida 1930 1
Florida Florida 1931 1
Florida Florida 1932 1
Florida Florida 1933 3
Florida Florida 1934 3
Florida Florida 1935 3
Florida Florida 1936 3
Florida Florida 1937 3
Florida Florida 1938 4
Florida Florida 1939 15
Florida Louisiana 1930 0
Florida Louisiana 1931 0
Florida Louisiana 1932 0
Florida Louisiana 1933 0
Florida Louisiana 1934 1
Florida Louisiana 1935 0
Florida Louisiana 1936 0
Florida Louisiana 1937 0
Florida Louisiana 1938 0
Florida Louisiana 1939 0
Florida California 1930 0
Florida California 1931 0
Florida California 1932 0
Florida California 1933 0
Florida California 1934 1
Florida California 1935 0
Florida California

Utah Utah 1935 1
Utah Utah 1936 1
Utah Utah 1937 1
Utah Utah 1938 1
Utah Utah 1939 1
North Carolina Connecticut 1930 0
North Carolina Connecticut 1931 0
North Carolina Connecticut 1932 0
North Carolina Connecticut 1933 0
North Carolina Connecticut 1934 0
North Carolina Connecticut 1935 0
North Carolina Connecticut 1936 1
North Carolina Connecticut 1937 0
North Carolina Connecticut 1938 0
North Carolina Connecticut 1939 0
North Carolina North Carolina 1930 1
North Carolina North Carolina 1931 1
North Carolina North Carolina 1932 1
North Carolina North Carolina 1933 1
North Carolina North Carolina 1934 1
North Carolina North Carolina 1935 1
North Carolina North Carolina 1936 1
North Carolina North Carolina 1937 1
North Carolina North Carolina 1938 1
North Carolina North Carolina 1939 1
Connecticut Connecticut 1930 1
Connecticut Connecticut 1931 1
Connecticut Connecticut 1932 1
Connecticut Connecticut 1933 1
Connecticut Connecticut 1934 1
Connecticut Connecticut 1935 1
Connecticut Connect

In [71]:
# Export the graph and the padded travel table as JSON for the Sankey views.
#
# The payload is serialised once and the same text is written to both
# destinations, so the two files cannot drift apart (the 'saved' timestamp in
# particular is now identical) and the node-link conversion runs only once.
graph_data = nx.node_link_data(G)

# NetworkX has announced that the default key for the edge list changes from
# 'links' to 'edges'; accept either so the export keeps working after an
# upgrade. The output key stays 'links'.
graph_links = graph_data['links'] if 'links' in graph_data else graph_data['edges']

payload = json.dumps({
    'nodes': graph_data['nodes'],
    'links': graph_links,
    'paddedTravels': {
        slugify(source): {
            slugify(target): performers_by_year
            for target, performers_by_year in targets.items()
        }
        for source, targets in padded_travels_performers.items()
    },
    'saved': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
})

for output_path in ('./sankey-test/sankey-names.json', '../docs/data/sankey-data.json'):
    with open(output_path, 'w') as f:
        f.write(payload)
    # Reported once the file has been closed, one line per destination.
    print('saved.')

saved.
saved.
