In [1]:
## Motivation: separation of concerns, and the script currently takes too long to run.

In [2]:
from dataset.meta import log, in_notebook
from dataset.getting import get_raw_data, get_clean_network_data
from dataset.cache import get_cached_data, test_same_df
from dataset.cleaning import clean_data

In [3]:
import datetime
import pandas as pd

if __name__ == "__main__":
    # Fetch the current raw dataset and the previously cached copy so the two
    # can be compared before the cleaning step is repeated.
    _df = get_raw_data()
    df_test = get_cached_data()

    if test_same_df(_df, df_test):
        print("Dataset is same. Exiting...")
        if not in_notebook():
            # `exit()` is a helper injected by the `site` module for interactive
            # sessions and is not guaranteed to exist; raising SystemExit is the
            # equivalent that is always available (exit status 0).
            raise SystemExit

    # Persist the raw frame; a later cell reloads it from this path.
    _df.to_pickle('network-app/data/_df.pickle')

    df = get_clean_network_data(_df)

<font color="green">[14:53:33] **9898 rows imported.**</font>

Dataset is same. Exiting...


<font color="green">[14:53:35] **7818 rows after filtering**: Required data.</font>

<font color="green">[14:53:35] **7781 rows after filtering**: Exclusion from visulization.</font>

<font color="green">[14:53:35] **7087 rows after filtering**: Unsure whether drag artist.</font>

<font color="green">[14:53:35] **7059 rows after filtering**: Full date in `Date` column.</font>

<font color="green">[14:53:35] **Cleaned up all names**.</font>

<font color="green">[14:53:35] **Fixed columns**: Renamed some columns and removed all unneccesary columns.</font>

<font color="green">[14:53:35] **Index has been reset**.</font>

In [4]:
# Re-run the cleaning on the raw frame, passing min_date=1930-01-01 and max_date=1940-12-31.
# NOTE(review): the recorded output of this cell is `KeyError: 'Venue'` — confirm
# whether the date-bounded path expects a `Venue` column that is missing here.
df = get_clean_network_data(_df, min_date=datetime.datetime(year=1930, month=1, day=1), max_date=datetime.datetime(year=1940, month=12, day=31))

KeyError: 'Venue'

In [None]:
import networkx as nx

def group_dates(dates:list=[], delta=datetime.timedelta(days=14), dateformat='%Y-%m-%d'):
    """
    Chain dates together into periods based on the gap between neighbouring dates.

    Source: https://gist.github.com/kallewesterling/9a8d12ce073776ed52865bfb362ad073

    The input strings are parsed with `dateformat`, de-duplicated and sorted.
    Walking the sorted dates, a date joins the current period when it lies no
    more than `delta` after the date immediately before it; otherwise it opens
    a new period. A period can therefore span more than `delta` in total, as
    long as every consecutive gap is within `delta`.

    Example:

    (A.) Provided that the delta is `days=14`,
         the left side will generate the right side:
            [                           [
                1935-01-13,               [1935-01-13, 1935-01-26,
                1935-01-26,                1935-02-05, 1935-02-11],
                1935-02-11,
                1935-02-05,
                1935-04-01,               [1935-04-01, 1935-04-06]
                1935-04-06
            ]                           ]

    (B.) Provided that the delta is `days=3`,
         the left side will generate the right side:
            [                           [
                1935-01-13,               [1935-01-13],
                1935-01-26,               [1935-01-26],
                1935-02-11,               [1935-02-05],
                1935-02-05,               [1935-02-11],
                1935-04-01,               [1935-04-01],
                1935-04-06                [1935-04-06]
            ]                           ]

    Parameters:
        dates: iterable of date strings following `dateformat`. It is only
            read, never mutated.
        delta: largest allowed gap between two consecutive dates of the same
            period, as a `datetime.timedelta` or an integer number of days.
        dateformat: `strptime` format used to parse the input strings.

    Returns:
        A list of periods in chronological order. Each period is a list of
        date strings, always formatted as `%Y-%m-%d` regardless of `dateformat`.

    Raises:
        ValueError: if any input string cannot be parsed with `dateformat`.
    """

    import re

    try:
        parsed = sorted({datetime.datetime.strptime(x, dateformat) for x in dates})
    except ValueError as e:
        # Pull the offending value out of strptime's message for a clearer error.
        date = re.search(r'''['"](.*)['"] does not match format''', str(e))
        if date:
            date = date.groups()[0]
        raise ValueError(f'A date found in list that did not adhere to format (`{date}`). Needs to follow format `{dateformat}`.') from None

    if isinstance(delta, int):
        # Only the `datetime` module is imported, so the class must be qualified.
        delta = datetime.timedelta(days=delta)

    periods = []
    previous = None

    for current in parsed:
        date_str = current.strftime('%Y-%m-%d')

        # The dates are unique and ascending, so "previous date within
        # [current - delta, current + delta]" reduces to a single gap check.
        if previous is not None and current - previous <= delta:
            periods[-1].append(date_str)
        else:
            periods.append([date_str])

        previous = current

    return periods


In [None]:
# Reload the raw frame from the pickle written by the first code cell.
# Note that this rebinds `df`, which earlier cells bound to the cleaned data.
df = pd.read_pickle('./network-app/data/_df.pickle')

In [None]:
def get_venue_data(df=None, delta=datetime.timedelta(days=14), filter_unnamed=True):
    """
    Group co-appearing performers per venue into date periods.

    Rows are grouped by (`Venue`, `Date`). A group is kept only when it has
    more than one unique performer. Venues left with a single such date are
    skipped. For the remaining venues the dates are chained into periods with
    `group_dates`, and the performers of all dates in a period are pooled.

    Parameters:
        df: DataFrame with `Venue`, `Date` and `Performer` columns.
        delta: passed through to `group_dates` as the maximum gap between
            consecutive dates of one period.
        filter_unnamed: when true, performers whose name contains 'unnamed'
            (case-insensitive) are ignored.

    Returns:
        dict mapping venue -> set of `(period, performers)` pairs, where
        `period` is a tuple of date strings and `performers` is a sorted tuple
        of names (sorted so the result does not depend on set iteration order).

    Raises:
        RuntimeError: if `df` is None.
    """
    if df is None:
        raise RuntimeError("Needs DataFrame to proceed.")

    # venue -> set of (date, sorted tuple of performers) for qualifying dates
    venue_data = {}

    for (venue, date), rows in df.sort_values('Date').groupby(['Venue', 'Date']):
        if filter_unnamed:
            unique_performers = {x for x in rows.Performer if 'unnamed' not in x.lower()}
        else:
            unique_performers = set(rows.Performer)

        # A lone performer produces no co-appearance.
        if len(unique_performers) <= 1:
            continue

        venue_data.setdefault(venue, set()).add((date, tuple(sorted(unique_performers))))

    venue_data_by_period = {}

    for venue, appearances in venue_data.items():
        if len(appearances) == 1:
            continue

        dates = [date for date, _ in appearances]
        period_entries = set()

        for period in group_dates(dates, delta=delta):
            performers = set()
            for date, names in appearances:
                if date in period:
                    performers.update(names)

            period_entries.add((tuple(period), tuple(sorted(performers))))

        venue_data_by_period[venue] = period_entries

    return venue_data_by_period


In [None]:
def get_network_from_venue_data(venue_data_by_period):
    """
    Build an undirected co-appearance graph from per-venue period data.

    `venue_data_by_period` maps venue -> iterable of `(period, performers)`
    pairs. Every pair of distinct performers within one entry is connected by
    an edge carrying three attributes:

      - `venues`: sorted list of venues the pair shares
      - `periods`: sorted list of periods, each itself a sorted list of dates
      - `coLocated`: dict of venue -> sorted list of dates

    Returns the populated `nx.Graph`.
    """
    graph = nx.Graph()

    for venue, period_entries in venue_data_by_period.items():
        for period, performers in period_entries:
            for node1 in performers:
                for node2 in performers:
                    if node2 == node1:
                        continue

                    if not graph.has_edge(node1, node2):
                        graph.add_edge(node1, node2, venues=set(), periods=set(), coLocated={})

                    attrs = graph.edges[node1, node2]
                    attrs['periods'].add(period)
                    attrs['venues'].add(venue)
                    attrs['coLocated'].setdefault(venue, set()).update(period)

    # The attributes were collected as sets; turn them into sorted lists.
    for edge in graph.edges:
        attrs = graph.edges[edge]

        attrs['periods'] = sorted(attrs['periods'])
        attrs['venues'] = sorted(attrs['venues'])

        co_located = attrs['coLocated']
        for venue in co_located:
            co_located[venue] = sorted(co_located[venue])

        attrs['periods'] = [sorted(period) for period in attrs['periods']]

    return graph


def unique_periods(G):
    ''' Control function: counts the distinct periods found in the `periods` attribute across all edges of `G`. '''
    distinct = {
        tuple(period)
        for edge in G.edges
        for period in G.edges[edge]['periods']
    }
    return len(distinct)

In [None]:
# Build one co-appearance network per grouping window (in days).
# The dict must be created here; with the initialisation commented out this
# cell raises NameError on a fresh kernel.
networks = {}

for days in [3,14,31,365]:
    networks[days] = get_network_from_venue_data(get_venue_data(df, delta=datetime.timedelta(days=days), filter_unnamed=True))

In [None]:
# Metadata is a separate track too

def get_meta_data(df, category=None):
    """
    Collect per-entity meta information from the rows of `df`.

    Parameters:
        df: DataFrame providing a `Source` column plus the columns named in
            `MAP` below for the categories being processed.
        category: optional category name ('performers', 'venues', 'cities' or
            'revues'). When given, only that category is processed; the other
            categories stay in the result as empty dicts.

    Returns:
        dict of category -> entity name -> key -> list of
        `{'source': ..., 'content': ...}` dicts. Falsy cell values are skipped,
        and the string 'true' (any casing) is stored as the boolean True.
    """
    # Per category: the column holding the entity name, and a mapping of
    # output key -> source column.
    MAP = {
        'performers': {
            'cleaned_row_name': 'Performer',
            'MAPPING': {
                'comments': 'Comment on node: performer',
                'legal_names': 'Legal name',
                'alleged_ages': 'Alleged age',
                'assumed_birth_years': 'Assumed birth year',
                'images': 'Has image',
                'exotic_dancer': 'Exotic/erotic/oriental dancer/Gypsy',
                'fan_dancer': 'Fan dancer/Sally Rand',
                'blackface': 'Blackface',
                'sepia': 'Sepia',
            }
        },
        'cities': {
            'cleaned_row_name': 'City',
            'MAPPING': {
                'comments': 'Comment on node: city'
            }
        },
        'venues': {
            'cleaned_row_name': 'Venue',
            'MAPPING': {
                'comments': 'Comment on node: venue'
            }
        },
        'revues': {
            'cleaned_row_name': 'Revue',
            'MAPPING': {
                'comments': 'Comment on edge: revue'
            }
        }
    }

    meta_data = {
        'performers': {},
        'venues': {},
        'cities': {},
        'revues': {}
    }

    ### No need to change anything below

    for meta_data_category, d in MAP.items():
        if category and meta_data_category != category:
            continue

        log(f'Fetching node meta information for {meta_data_category}...')

        name_column = d['cleaned_row_name']
        bucket = meta_data[meta_data_category]

        for _, row in df.iterrows():
            entity = bucket.setdefault(row[name_column], {})

            for key, column_name in d['MAPPING'].items():
                # Every key exists for every entity, even when nothing is recorded.
                records = entity.setdefault(key, [])

                content = row[column_name]
                if not content:
                    continue

                source = row['Source']
                if isinstance(content, str) and content.lower() == 'true':
                    content = True

                records.append({
                    'source': source,
                    'content': content
                })

    return meta_data

def get_meta(df=None, category=None):
    """
    Return meta information, building the cleaned data first when needed.

    Parameters:
        df: DataFrame to read from. Anything that is not a `pd.DataFrame`
            (including the default None) triggers a fresh raw -> filtered ->
            cleaned build.
        category: optional category name; when given, only that category's
            dict is returned.

    Returns:
        The full result of `get_meta_data` when `category` is falsy,
        otherwise `get_meta_data(...)[category]`.
    """
    if not isinstance(df, pd.DataFrame):
        log('Building new clean data for node meta information...')
        # NOTE(review): `filter_data` is not imported in any visible cell — confirm it is in scope.
        # NOTE(review): if `drop_cols=['Venue']` removes the `Venue` column, the
        # 'venues' category lookup in get_meta_data would fail — confirm.
        df = clean_data(
            filter_data(
                get_raw_data(verbose=False),
                max_date=None,
                min_date=None,
                verbose=False,
            ),
            drop_cols=['Venue'],
            verbose=False,
        )

    all_meta = get_meta_data(df, category=category)

    return all_meta[category] if category else all_meta

In [None]:
# No DataFrame is passed, so get_meta builds its own cleaned data, and all categories are returned.
metadata = get_meta()

In [None]:
# Attach the performer meta information as node attributes on every network.
for days, network in networks.items():
    nx.set_node_attributes(network, metadata['performers'])

In [None]:
# Cleaned data with min_date=1930-01-01 and max_date=1940-12-31, used for the temporal network below.
# NOTE(review): unlike the earlier calls, no raw DataFrame (`_df`) is passed here —
# confirm that get_clean_network_data has a default for its first argument.
t_df = get_clean_network_data(min_date=datetime.datetime(year=1930, month=1, day=1), max_date=datetime.datetime(year=1940, month=12, day=31))

In [None]:
import pathpy as pp

# Empty pathpy temporal network, filled by the cells below.
# Note: the name `G` is rebound to `networks[14]` further down the notebook.
G = pp.TemporalNetwork()

In [None]:
# Build (performer, venue, day offset) triples for the temporal network.
# All three columns must come from the same frame: mixing `df` with `t_df`
# pairs unrelated rows and silently truncates to the shorter one.
# NOTE(review): this assumes `t_df` still carries `Performer` and `Venue` columns — confirm.
_dates = [datetime.datetime.strptime(x, '%Y-%m-%d') for x in t_df.Date]
_min = min(_dates)

# Materialised as a list so the cell that consumes `edges` can be re-run.
edges = list(zip(
    t_df.Performer,
    t_df.Venue,
    [(d - _min).days for d in _dates],  # days elapsed since the earliest date
))

In [None]:
# Add every (performer, venue, day offset) triple to the temporal network,
# passing the day offset as a string. One 'added' line is printed per edge.
for edge in edges:
    G.add_edge(edge[0], edge[1], str(edge[2]))
    print('added')

In [None]:
# Scratch: two standalone pathpy nodes for trying out the edge API.
n1 = pp.Node('node1')
n2 = pp.Node('node2')

In [None]:
# Scratch: connect the two test nodes with an explicit uid.
G.add_edge(n1, n2, uid='2')

In [None]:
# Print every edge currently in G for inspection.
for edge in G.edges:
    print(edge)

In [None]:
# Nodes of the 14-day network whose clustering coefficient is exactly 1.
{x: y for x, y in nx.clustering(networks[14]).items() if y == 1}

In [None]:
# Connected components of the 14-day network, largest first;
# G0 is the subgraph induced by the largest component.
Gcc = sorted(nx.connected_components(networks[14]), key=len, reverse=True)
G0 = networks[14].subgraph(Gcc[0])

In [None]:
## Thinking of more measures

In [None]:
def get_top(func, count=10, **args):
    ''' Meta function on which many other functions rely.

        Calls `func(**args)`, which must return a mapping of item -> score,
        and returns the `count` highest-scoring `(item, score)` pairs as a
        list, highest first.
    '''
    from collections import Counter

    scores = Counter(dict(func(**args).items()))
    ranked = scores.most_common()

    return ranked[:count]


# In the following, we will use the following network:
# (this rebinds `G`, which earlier cells used for the pathpy temporal network)
G = networks[14]

In [None]:
def get_top_triangles(G=None, count=10):
    ''' Returns a list of `(node, triangle count)` pairs for the `count` nodes with the most triangles, highest first '''

    return get_top(nx.triangles, count, G=G)


# Show the ten nodes with the most triangles.
get_top_triangles(G, count=10)

In [None]:
def get_top_clustering(G=None, count=10):
    ''' Returns a list of `(node, clustering coefficient)` pairs for the `count` nodes with the highest clustering coefficient, highest first

        What is a clustering coefficient?

        It measures how strongly a node's neighbours tend to cluster together:
        the share of possible connections among the neighbours that are
        actually realised.

    '''

    return get_top(nx.clustering, count=count, G=G)

# Show the ten nodes with the highest clustering coefficient.
get_top_clustering(G, count=10)

In [None]:
# Average clustering coefficient of the network

nx.average_clustering(G)

In [None]:
# Node connectivity between all pairs of nodes; the result is shown as the cell output.
nx.all_pairs_node_connectivity(G)

In [None]:
from networkx.algorithms import approximation as approx

In [None]:
# Approximate local node connectivity for every unordered pair of nodes,
# keyed as 'node1;node2'. A pair is computed once: it is skipped when either
# orientation of the key is already stored.
# Note: `_` shadows IPython's "last output" variable; the next cell displays it.
_ = {}
for node1 in G.nodes:
    for node2 in [x for x in G.nodes if not x == node1]:
        if f'{node1};{node2}' not in _ and f'{node2};{node1}' not in _:
            _[f'{node1};{node2}'] = approx.local_node_connectivity(G, node1, node2)

In [None]:
# Display the pairwise connectivity dict built in the previous cell.
_