In [1]:
from selenium import webdriver
from slugify import slugify
from pathlib import Path
from time import sleep

In [2]:
try:
    b
except:
    b = webdriver.Firefox()

In [111]:
us_state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI",
}
    
# invert the dictionary
abbrev_to_us_state = dict(map(reversed, us_state_to_abbrev.items()))


In [155]:
#!/usr/bin/env python
# coding: utf-8

import community as community_louvain
import copy
from colorama import Fore, Style
import networkx as nx
from collections import Counter
import unicodedata
import re
import pandas as pd
import json
import jellyfish
from pathlib import Path
import datetime
from tqdm import tqdm

try:
    from IPython.display import display, HTML, Markdown, clear_output
except ModuleNotFoundError:
    print("No IPython found.")

settings = {"DAYSPANS": [3, 14, 31, 93, 186, 365]}
urls = [
    {
        "prefix": "v1",
        "url": "https://docs.google.com/spreadsheets/d/e/2PACX-1vT0E0Y7txIa2pfBuusA1cd8X5OVhQ_D0qZC8D40KhTU3xB7McsPR2kuB7GH6ncmNT3nfjEYGbscOPp0/pub?gid=254069133&single=true&output=csv",
    },
    {
        "prefix": "live",
        "url": "https://docs.google.com/spreadsheets/d/e/2PACX-1vT0E0Y7txIa2pfBuusA1cd8X5OVhQ_D0qZC8D40KhTU3xB7McsPR2kuB7GH6ncmNT3nfjEYGbscOPp0/pub?gid=2042982575&single=true&output=csv",
    },
]


def in_notebook():
    try:
        from IPython import get_ipython

        try:
            if "IPKernelApp" not in get_ipython().config:
                return False
        except AttributeError:
            return False
    except ImportError:
        return False
    return True


def log(msg, color="green", verbose=True):
    now = datetime.datetime.now().strftime("%H:%M%:%S")
    if verbose and in_notebook():
        return display(Markdown(f'<font color="{color}">[{now}] {msg}</font>'))
    elif verbose:
        return print(f"[{now}]:\n{msg}\n\n")
    return None


def slugify(value, allow_unicode=False, verbose=False):
    init_value = str(value)
    value = init_value
    value = (
        unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode("ascii")
    )
    value = re.sub(r"[^\w\s-]", "", value.lower())
    value = re.sub(r"^(\d+)", r"n\1", value)
    value = re.sub(r"[-\s]+", "_", value).strip("-_")
    if verbose:
        clear_output(wait=True)
        log(f"Making slug from {init_value}: {value}", verbose=verbose)
    return value


def get_raw_data(
    verbose=True,
    url="https://docs.google.com/spreadsheets/d/e/2PACX-1vT0E0Y7txIa2pfBuusA1cd8X5OVhQ_D0qZC8D40KhTU3xB7McsPR2kuB7GH6ncmNT3nfjEYGbscOPp0/pub?gid=254069133&single=true&output=csv",
):
    df = pd.read_csv(url)

    df.replace("—", "", inplace=True)
    df.replace("—*", "", inplace=True)
    df.replace("–", "", inplace=True)
    df.fillna("", inplace=True)

    log(f"**{df.shape[0]} rows imported.**", verbose=verbose)

    return df


def filter_data(df, min_date=None, max_date=None, verbose=True, skip_unsure=False):
    def has_required_data(row):
        """(internal) for use with DataFrame lambda function to ensure that any given row has the required data present"""
        has_performer = (
            row["Performer"] != ""
            or row["Normalized performer"] != ""
            or (row["Performer first-name"] != "" or row["Performer last-name"]) != ""
        )
        has_venue = row["Venue"] != ""
        if has_performer and has_venue:
            return True
        else:
            return False

    def has_correct_date(row):
        """(internal) for use with DataFrame lambda function to ensure that any given row has a correct date present"""
        return re.search(r"\d{4}\-\d{2}\-\d{2}", row["Date"]) != None

    def string_date(row):
        return row["Date"].strftime("%Y-%m-%d")

    df = df.copy()

    df["has_required_data"] = df.apply(lambda row: has_required_data(row), axis=1)
    df.drop(df[df["has_required_data"] == False].index, inplace=True)
    log(f"**{df.shape[0]} rows after filtering**: Required data.", verbose=verbose)

    df.drop(df[df["Exclude from visualization"] == True].index, inplace=True)
    df.drop(df[df["Exclude from visualization"] == "TRUE"].index, inplace=True)
    log(
        f"**{df.shape[0]} rows after filtering**: Exclusion from visulization.",
        verbose=verbose,
    )

    if skip_unsure == False:
        df.drop(df[df["Unsure whether drag artist"] == True].index, inplace=True)
        df.drop(df[df["Unsure whether drag artist"] == "TRUE"].index, inplace=True)
        log(
            f"**{df.shape[0]} rows after filtering**: Unsure whether drag artist.",
            verbose=verbose,
        )

    df["has_correct_date"] = df.apply(lambda row: has_correct_date(row), axis=1)
    df.drop(df[df["has_correct_date"] == False].index, inplace=True)
    log(
        f"**{df.shape[0]} rows after filtering**: Full date in `Date` column.",
        verbose=verbose,
    )

    if min_date or max_date:
        df["Date"] = pd.to_datetime(df["Date"])
        df = df[(df["Date"] > min_date) & (df["Date"] < max_date)]
        df["Date"] = df.apply(lambda row: string_date(row), axis=1)
        log(
            f"**{df.shape[0]} rows after filtering**: Min and max date set.",
            verbose=verbose,
        )

    return df


def clean_data(df, drop_cols=[], verbose=True, forbidden=["?", "[", "]"]):
    def get_performer(row, null_value=""):
        """(internal) for use with DataFrame lambda function to return the cleaned-up version of a performer's name (in an order of priority)"""

        first_name = row["Performer first-name"]
        last_name = row["Performer last-name"]

        returnVal = None

        if not returnVal and (last_name and not first_name):
            returnVal = last_name

        if not returnVal and (
            row["Normalized performer"]
            and not "—" in row["Normalized performer"]
            and not "–" in row["Normalized performer"]
        ):
            returnVal = row["Normalized performer"]

        if not returnVal and (first_name and last_name):
            if not "—" in first_name and not "—" in last_name:
                returnVal = f"{first_name} {last_name}"

            elif not "—" in last_name and "—" in first_name:
                returnVal = last_name

            elif not "—" in first_name and "—" in last_name:
                returnVal = first_name

        if not returnVal and row["Performer"]:
            returnVal = row["Performer"]

        if not returnVal:
            return null_value

        return "".join([x for x in returnVal if not x in forbidden])

    def get_city(row, null_value=""):
        """(internal) for use with DataFrame lambda function to return the cleaned-up version of a city's name (in an order of priority)"""
        for r in ["Normalized City", "City"]:
            if row[r]:
                return row[r]

        return null_value

    def get_unique_venue(row, null_value=""):
        """(internal) for use with DataFrame lambda function to return the cleaned-up version of a venue's name (in an order of priority)"""
        if row["Venue"] and row["City"]:
            return row["Venue"] + " (" + row["City"] + ")"

        for r in ["Venue", "City"]:
            if row[r]:
                return row[r]

        return null_value

    def get_source(row, null_value=""):
        """(internal) for use with DataFrame lambda function to return the cleaned-up version of a source (in an order of priority)"""
        for r in ["Source clean", "Source"]:
            if row[r]:
                g = re.search(
                    r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)", row[r]
                )
                if not g:
                    g = re.search(r"\d{4}-\d{2}-\d{2}", row[r])
                    if not g:
                        return f"{row[r]} ({datetime.datetime.strptime(row['Date'], '%Y-%m-%d').strftime('%B %d, %Y')})"
                return row[r]

        return (null_value,)

    def get_revue(row, null_value=""):
        """(internal) for use with DataFrame lambda function to return the cleaned-up version of a revue's name (in an order of priority)"""
        for r in ["Normalized Revue Name", "Revue name"]:
            if row[r]:
                return row[r]

        return null_value

    df["Performer"] = df.apply(lambda row: get_performer(row), axis=1)
    df["City"] = df.apply(lambda row: get_city(row), axis=1)
    df["Source"] = df.apply(lambda row: get_source(row), axis=1)
    df["Revue"] = df.apply(lambda row: get_revue(row), axis=1)
    df["Unique venue"] = df.apply(lambda row: get_unique_venue(row), axis=1)
    log(f"**Cleaned up all names**.", verbose=verbose)

    for col in drop_cols:
        try:
            del df[col]
        except KeyError:
            pass

    df = df.rename(columns={"Unique venue": "Venue"})

    log(
        f"**Fixed columns**: Renamed some columns and removed all unneccesary columns.",
        verbose=verbose,
    )

    return df


def get_clean_network_data(
    min_date=None,
    max_date=None,
    drop_cols=None,
    verbose=True,
    url="https://docs.google.com/spreadsheets/d/e/2PACX-1vT0E0Y7txIa2pfBuusA1cd8X5OVhQ_D0qZC8D40KhTU3xB7McsPR2kuB7GH6ncmNT3nfjEYGbscOPp0/pub?gid=254069133&single=true&output=csv",
):
    """A "collector" function that runs through `get_raw_data`, `filter_data` and `clean_data` in that order and then resets the index."""

    df = get_raw_data(verbose=verbose, url=url)
    df = filter_data(df, min_date=min_date, max_date=max_date, verbose=verbose)

    if not drop_cols:
        drop_cols = [
            "EIMA_Search",
            "EIMA_ID",
            "Newspaper",
            "Imported from former archive",
            "Search (newspapers.com)",
            "Search (fulton)",
            "Venue",
            "Revue name",
            "Normalized Revue Name",
            "Legal name",
            "Alleged age",
            "Assumed birth year",
            "Source clean",
            "Category",
            "2020-12-31 ID",
            "Normalized City",
            "Performer first-name",
            "Performer last-name",
            "Normalized performer",
            "has_required_data",
            "has_correct_date",
            "Exclude from visualization",
            "Blackface",
            "Sepia",
            "Fan dancer/Sally Rand",
            "Exotic/erotic/oriental dancer/Gypsy",
            "Has image",
            "Address",
            "Vaudeville Circuit/Circus",
            "Edge Comment",
            "Comment on node: performer",
            "Comment on node: venue",
            "Comment on node: city",
            "Comment on edge: revue",
            "Normalized Venue",
        ]

    df = clean_data(df, drop_cols, verbose=verbose)

    df = df.reset_index(drop=True)
    log(f"**Index has been reset**.", verbose=verbose)

    return df


def test_same_df(df1, df2):
    try:
        for cols in [[x for x in df1.columns], [x for x in df2.columns]]:
            for col in cols:
                for ix, row in (df1 == df2).iterrows():
                    if not all([row[col] for col in cols]):
                        return False
                if not [x for x in df1[col]] == [x for x in df2[col]]:
                    return False
    except ValueError:
        return False

    return True


def get_performers_who_were_there(df, where=None, when=[]):
    """Returns a list of all the performers from any list of dates and venue"""

    """
    How this function works:
    get_performers_who_were_there(df, 'Band Box (Syracuse, NY)', ['1935-03-29', '1935-04-05', '1935-04-12', '1935-04-19'])
    """
    if not isinstance(when, list):
        when = [when]

    all_values = []
    for when in when:
        if isinstance(when, datetime.datetime):
            when = when.strftime("%Y-%m-%d")

        selected_rows = df[(df["Date"] == when) & (df["Venue"] == where)]

        all_values.extend(selected_rows["Performer"])

    return sorted(list(set(all_values)))


def group_dates(
    dates: list = [], delta=datetime.timedelta(days=14), dateformat="%Y-%m-%d"
):
    """https://gist.github.com/kallewesterling/9a8d12ce073776ed52865bfb362ad073"""

    """
    Chains dates together by looking for the delta between any given dates in a list
    
    Example:
    
    (A.) Provided that the delta is `days=14`,
         the left side will generate the right side:
            [                           [
                1935-01-13,               [1935-01-13, 1935-01-26,
                1935-01-26,                1935-02-11, 1935-02-05],
                1935-02-11,
                1935-02-05,
                1935-04-01,               [1935-04-01, 1935-04-06]
                1935-04-06
            ]                           ]
            
    (B.) Provided that the delta is `days=3`,
         the left side will generate the right side:
            [                           [
                1935-01-13,               [1935-01-13],
                1935-01-26,               [1935-01-26],
                1935-02-11,               [1935-02-11],
                1935-02-05,               [1935-02-05],
                1935-04-01,               [1935-04-01],
                1935-04-06                [1935-04-06]
            ]                           ]

    """

    try:
        dates = sorted([datetime.datetime.strptime(x, dateformat) for x in dates])
    except ValueError as e:
        date = re.search(r"""['"](.*)['"] does not match format""", str(e))
        if date:
            date = date.groups()[0]
        raise ValueError(
            f"A date found in list that did not adhere to format (`{date}`). Needs to follow format `{dateformat}`."
        ) from None

    if isinstance(delta, int):
        delta = datetime.timedelta(days=delta)

    periods = []

    for ix, date in enumerate(dates):
        min_date = date - delta
        max_date = date + delta

        prev_date, next_date = None, None
        start_chain, end_chain, in_chain, solo_date = None, None, None, None
        prev_date_in_range, next_date_in_range = None, None

        try:
            if ix - 1 >= 0:
                prev_date = dates[ix - 1]
        except IndexError:
            prev_date = None

        try:
            next_date = dates[ix + 1]
        except IndexError:
            next_date = None

        if next_date:
            next_date_in_range = next_date >= min_date and next_date <= max_date

        if prev_date:
            prev_date_in_range = prev_date >= min_date and prev_date <= max_date

        if all([next_date, prev_date, prev_date_in_range, next_date_in_range]):
            in_chain = True
        elif all([next_date, prev_date, next_date_in_range]) and not prev_date_in_range:
            start_chain = True
        elif all([next_date, prev_date, prev_date_in_range]) and not next_date_in_range:
            end_chain = True
        elif all([next_date, prev_date]) and not all(
            [prev_date_in_range, next_date_in_range]
        ):
            solo_date = True
        elif next_date and next_date_in_range:
            start_chain = True
        elif next_date:
            solo_date = True
        elif prev_date and prev_date_in_range:
            end_chain = True
        elif prev_date:
            solo_date = True
        elif not next_date and not prev_date:
            solo_date = True
        else:
            raise RuntimeError("An unexpected error occurred.")

        date_str = date.strftime("%Y-%m-%d")

        if start_chain:
            periods.append([date_str])

        elif end_chain:
            periods[len(periods) - 1].append(date_str)

        elif solo_date:
            periods.append([date_str])

        elif in_chain:
            periods[len(periods) - 1].append(date_str)

    return periods


def get_group_data(df, days=settings["DAYSPANS"], verbose=False):
    data_dict = {}

    venue_count = len(df.groupby("Venue"))
    i = 1
    for venue, row in df.groupby("Venue"):
        i += 1
        for num_days in days:
            log(
                f'Generating group data for spans of {", ".join([str(x) for x in days])} days.',
                verbose=verbose,
            )
            log(
                f"   [{i}/{venue_count}] processing venue {venue} (date span {num_days} days)...",
                verbose=verbose,
            )
            clear_output(wait=True)
            all_dates = list(set(row.Date))
            grouped_dates = group_dates(
                all_dates, delta=datetime.timedelta(days=num_days)
            )
            for ix, date_group in enumerate(grouped_dates, start=1):
                if not venue in data_dict:
                    data_dict[venue] = {}
                if not f"grouped-by-{num_days}-days" in data_dict[venue]:
                    data_dict[venue][f"grouped-by-{num_days}-days"] = {}

                revues = list(set([x for x in row.Revue if x]))
                cities = list(set([x for x in row.City if x]))

                data_dict[venue][f"grouped-by-{num_days}-days"][f"date_group-{ix}"] = {
                    "dates": date_group,
                    "performers": get_performers_who_were_there(df, venue, date_group),
                    "revues": revues,
                    "cities": cities,
                }
    log(f"Generated group data for {venue_count} venues.", verbose=verbose)
    return data_dict


def drop_unnamed(n):
    return not "unnamed" in n.lower()


def get_meta_data(df, category=None, verbose=False):
    meta_data = {"performers": {}, "venues": {}, "cities": {}, "revues": {}}

    MAP = {
        "performers": {
            "cleaned_row_name": "Performer",
            "MAPPING": {
                "comments": "Comment on node: performer",
                "legal_names": "Legal name",
                "alleged_ages": "Alleged age",
                "assumed_birth_years": "Assumed birth year",
                "images": "Has image",
                "exotic_dancer": "Exotic/erotic/oriental dancer/Gypsy",
                "fan_dancer": "Fan dancer/Sally Rand",
                "blackface": "Blackface",
                "sepia": "Sepia",
            },
        },
        "cities": {
            "cleaned_row_name": "City",
            "MAPPING": {"comments": "Comment on node: city"},
        },
        "venues": {
            "cleaned_row_name": "Venue",
            "MAPPING": {"comments": "Comment on node: venue"},
        },
        "revues": {
            "cleaned_row_name": "Revue",
            "MAPPING": {"comments": "Comment on edge: revue"},
        },
    }

    for meta_data_category, d in MAP.items():
        if category and not meta_data_category == category:
            continue

        log(
            f"Fetching node meta information for {meta_data_category}...",
            verbose=verbose,
        )
        for ix, row in df.iterrows():
            if not row[d["cleaned_row_name"]] in meta_data[meta_data_category]:
                meta_data[meta_data_category][row[d["cleaned_row_name"]]] = {}

            for key, column_name in d["MAPPING"].items():
                if not key in meta_data[meta_data_category][row[d["cleaned_row_name"]]]:
                    meta_data[meta_data_category][row[d["cleaned_row_name"]]][key] = []

                if row[column_name]:
                    source = row["Source"]
                    content = row[column_name]
                    if isinstance(content, str) and content.lower() == "true":
                        content = True

                    meta_data[meta_data_category][row[d["cleaned_row_name"]]][
                        key
                    ].append({"source": source, "content": content})

    return meta_data


def get_meta(
    df=None,
    category=None,
    verbose=False,
    url="https://docs.google.com/spreadsheets/d/e/2PACX-1vT0E0Y7txIa2pfBuusA1cd8X5OVhQ_D0qZC8D40KhTU3xB7McsPR2kuB7GH6ncmNT3nfjEYGbscOPp0/pub?gid=254069133&single=true&output=csv",
):
    if not isinstance(df, pd.DataFrame):
        log("Building new clean data for node meta information...", verbose=verbose)
        df = get_raw_data(verbose=False, url=url)
        df = filter_data(df, max_date=None, min_date=None, verbose=False)
        df = clean_data(df, drop_cols=["Venue"], verbose=False)

    all_meta = get_meta_data(df, category=category)

    if not category:
        return all_meta

    return all_meta[category]


def get_connected_nodes_per_node(G):
    return {node: sorted(nx.bfs_tree(G, node, reverse=False).nodes) for node in G.nodes}


def get_unique_networks(connected_nodes_per_node):
    if isinstance(connected_nodes_per_node, dict):
        pass
    elif isinstance(connected_nodes_per_node, nx.classes.graph.Graph):
        connected_nodes_per_node = get_connected_nodes_per_node(
            connected_nodes_per_node
        )
    else:
        raise RuntimeError(
            "connected_nodes_per_node provided must be either a dictionary of nodes connected together or a networkx Graph object."
        )

    unique_networks = []
    for network in list(connected_nodes_per_node.values()):
        if not network in unique_networks:
            unique_networks.append(network)
    return unique_networks


def merge_community_dicts(*args):
    _ = {}
    for dictionary in args:
        for performer, data in dictionary.items():
            if not performer in _:
                _[performer] = {}
            for key, value in data.items():
                if not key in _[performer]:
                    if isinstance(value, dict):
                        _[performer][key] = {}
                    else:
                        raise NotImplemented("Nope")
                for key2, value2 in value.items():
                    if not key2 in _[performer][key]:
                        _[performer][key][key2] = value2
                    else:
                        raise NotImplemented("This should not happen")

    return _


def get_degrees(G, node):
    indegree = sum([1 for edge in G.edges if edge[0] == node])
    outdegree = sum([1 for edge in G.edges if edge[1] == node])
    degree = indegree + outdegree

    return {"indegree": indegree, "outdegree": outdegree, "degree": degree}


def get_state(row):
    if not row.City or 'Canada' in row.City or 'Cuba' in row.City or 'Mexico' in row.City:
        return ''
    if 'Washington DC' in row.City:
        return 'DC'
    if row.City == 'OH':
        return 'OH'
    split_city = row.City.split(', ')
    two_components = len(split_city) == 2
    if two_components:
        is_state = len(split_city[1]) == 2
    
        if is_state:
            return abbrev_to_us_state[split_city[1]]
        else:
            print('error:', split_city, len(split_city[1]))
    else:
        print('error:', split_city)


In [157]:
alternative_spellings = {
    'Billy': [
        'Billie',
        'Billi'
    ],
    'Billie': [
        'Billy',
        'Billi'
    ],
    'Jean': [
        'Gene',
    ],
    'Gene': [
        'Jean',
    ],
    'Bobbie': [
        "Bobby",
        "Bobbi"
    ],
    "Bobby": [
        'Bobbie',
        'Bobbi'
    ],
    'Lengel': [
        'Lengle'
    ],
    'Jackie': [
        'Jacky'
    ],
    'Johnny': [
        'Johnnie',
        'Johnni'
    ],
    'Johnnie': [
        'Johnny',
        'Johnni'
    ],
    'Max': [
        'Maxie'
    ],
    "Rae": [
        'Ray'
    ],
    'Merry': [
        'Murry'
    ],
    'Del': [
        'Delle'
    ],
    'Francis': [
        'Frances'
    ],
    'Frances': [
        'Francis'
    ],
    'Li': [
        'Lee'
    ],
    'Nicki': [
        'Nickie',
        'Nicky'
    ],
    'Lloyd': ['Loyd'],
    'Tangara': ['Tangarra'],
    'Lester': ['Lestra'],
    'Eddie': ['Eddy','Eddi'],
    'Chickie': ['Chicky', 'Chicki'],
    'Gray': ['Grey'],
    'Grey': ['Gray'],
    'Roni': ['Ronni', 'Ronnie', 'Ronny']
}

alternative_names = {
    'Billy "Senorita" Herrera': [
        'Billy Herrera'
    ]
}




def get_alternative_spellings(performer):
    alternatives = [performer]
    name_alternatives = [performer]
    if performer in alternative_names:
        for name in alternative_names[performer]:
            name_alternatives.append(name)
    for name in name_alternatives:
        print(name)
        name = name.replace('"', '')
        words = name.split(' ')
        for word in words:
            if word in alternative_spellings:
                for alt in alternative_spellings[word]:
                    alternatives.append(name.replace(word, alt))
    return list(set(alternatives))



{'Adrian Ames': ['488668361', '677021057'],
 'Al DeMarco': ['171133088', '543772371', '693861340'],
 'Alberta Vaughn': ['364902662',
  '107714322',
  '511239508',
  '260604709',
  '362837118',
  '362832400',
  '365614795',
  '511239574',
  '76214419',
  '362665149',
  '180331608',
  '41228006',
  '365619640',
  '2031592134',
  '76214902'],
 'Arica Wild': ['219421893'],
 'Art Bernard': ['228655028', '228732511'],
 'Art West': ['616997324',
  '616999452',
  '649739903',
  '617000964',
  '5136612',
  '616998178',
  '616997684',
  '294151264',
  '617001816',
  '617000848',
  '616997011',
  '616997863'],
 'Arthur G. West': ['253885638',
  '204078187',
  '314305822',
  '314304884',
  '204079537',
  '37492970'],
 'Arthur La Delle': ['78756942', '78757014'],
 'Babe Allen': ['616998078',
  '616999649',
  '616998936',
  '616998801',
  '617000964',
  '616999642',
  '616999983',
  '616999527',
  '617000848',
  '657013636',
  '616998665',
  '616999774',
  '374010811',
  '616998534',
  '616999989'],

In [159]:
def scroll_to_bottom(b):
    start_len = len(get_search_record_elements(b))
    has_more = True
    max_repetitions = 5
    repetition = 0
    while has_more:
        repetition += 1
        if repetition > max_repetitions:
            has_more = False
        for i in range(0,5):
            b.execute_script('window.scrollTo(0,document.body.scrollHeight);')
            sleep(0.5)
            b.execute_script('window.scrollTo(0,-document.body.scrollHeight);')
            sleep(0.5)
            end_len = len(get_search_record_elements(b))
            if start_len <= end_len:
                has_more = False

def get_search_record_elements(b):
    return b.find_elements_by_css_selector('[id*="search-record"]')

def get_dict_for_search_record(b):
    paper = record.find_element_by_css_selector('h2').text
    link = record.find_element_by_css_selector('a').get_attribute('href')
    doc_id = link.split('/')[4]
    location = record.find_element_by_css_selector('[title="Location"]').text
    date = record.find_element_by_css_selector('.ml-n1.mb-1.text-dark:not([title="Location"])').text
    date = pd.to_datetime(date)
    return {
        'paper': paper,
        'id': doc_id,
        'location': location,
        'date': date.strftime('%Y-%m-%d')
    }


In [168]:


df = get_clean_network_data(
    min_date=datetime.datetime(year=1930, month=1, day=1),
    max_date=datetime.datetime(year=1940, month=12, day=31),
    verbose=False,
    url=urls[0]['url'],
)

df['State'] = df.apply(lambda row: get_state(row), axis=1)




newspaper_id_per_performer = {performer: list(set([y for y in x.Newspaper_ID if y])) for performer, x in df.groupby('Performer') if list(set([y for y in x.Newspaper_ID if y]))}
newspaper_id_per_performer

{'Adrian Ames': ['488668361', '677021057'],
 'Al DeMarco': ['171133088', '543772371', '693861340'],
 'Alberta Vaughn': ['364902662',
  '107714322',
  '511239508',
  '260604709',
  '362837118',
  '362832400',
  '365614795',
  '511239574',
  '76214419',
  '362665149',
  '180331608',
  '41228006',
  '365619640',
  '2031592134',
  '76214902'],
 'Arica Wild': ['219421893'],
 'Art Bernard': ['228655028', '228732511'],
 'Art West': ['616997324',
  '616999452',
  '649739903',
  '617000964',
  '5136612',
  '616998178',
  '616997684',
  '294151264',
  '617001816',
  '617000848',
  '616997011',
  '616997863'],
 'Arthur G. West': ['253885638',
  '204078187',
  '314305822',
  '314304884',
  '204079537',
  '37492970'],
 'Arthur La Delle': ['78756942', '78757014'],
 'Babe Allen': ['616998078',
  '616999649',
  '616998936',
  '616998801',
  '617000964',
  '616999642',
  '616999983',
  '616999527',
  '617000848',
  '657013636',
  '616998665',
  '616999774',
  '374010811',
  '616998534',
  '616999989'],

In [174]:
MAX = 10000
i = 0

top_100_performers = [x for x in dict(df.Performer.value_counts())]
irrelevant_newspaper_ids = [x for x in Path('irrelevant_newspaper_ids.txt').read_text().splitlines() if x]

for groups, rows in df.groupby(['State', 'Performer']):
    if i >= MAX:
        continue
    state, performer = groups
    if not state:
        continue
    if 'unnamed' in performer.lower():
        continue
        
    if not performer in top_100_performers:
        continue
        
    dates = [x for x in rows.Date]
    
    dates_dict = {}
    for date in dates:
        date = pd.to_datetime(date)
        if not date.year in dates_dict:
            dates_dict[date.year] = {}
        if not date.month in dates_dict[date.year]:
            dates_dict[date.year][date.month] = []
        dates_dict[date.year][date.month].append(date.day)
        
    dates = dates_dict
    years = list(dates.keys())
    years.sort()
    performer_orig = performer
    for year in years:
        if year >= 1940:
            continue
        short = us_state_to_abbrev[state].lower()
        datafile = f'performer-searches-by-year/{slugify(performer_orig)}/{slugify(str(year))}-{slugify(state)}.json'
        if not Path(datafile).exists():
            i += 1 # Only performers who haven't been downloaded count
            
            if not Path(datafile).parent.exists():
                Path(datafile).parent.mkdir(parents=True)
            alt_names = get_alternative_spellings(performer)
            records = []
            for performer in alt_names:
                print(performer_orig, 'in', state, year, f'(spelling {performer})')
                querypath = f"%22{performer.replace(' ', '+')}%22&p_province=us-{short}&dr_year={year}-{year}"
                b.get('https://www.apple.com')
                b.get('https://www.newspapers.com/search/#query=' + querypath)

                scroll_to_bottom(b)
                
                for record in get_search_record_elements(b):
                    records.append(get_dict_for_search_record(record))
                    
            article_df = pd.DataFrame.from_dict(records)
            if len(article_df):
                article_df.set_index('id', inplace=True)
                article_df.drop_duplicates(inplace=True)

                Path(datafile).write_text(article_df.to_json())
            else:
                Path(datafile).write_text(json.dumps({'error': 'no records found'}))
            
        '''
        # Now we will check and open tabs
        if performer in newspaper_id_per_performer:
            newspaper_ids_in_dataset = newspaper_id_per_performer[performer]
            json_data = json.loads(Path(datafile).read_text())
            for newspaper_id in [x for x in json_data.get('paper', [])]:
                if newspaper_id in irrelevant_newspaper_ids:
                    continue

                if not newspaper_id in newspaper_ids_in_dataset:
                    print(performer)
                    print(newspaper_id, 'not in dataset yet', f'{json_data["date"][newspaper_id]}')
                    b.execute_script(f"window.open('http://www.newspapers.com/image/{newspaper_id}?terms={performer}');")
        '''


Leon Fredericks
Leon Fredericks in California 1933 (spelling Leon Fredericks)
Leon Fredericks
Leon Fredericks in California 1935 (spelling Leon Fredericks)
Leon Fredericks
Leon Fredericks in California 1936 (spelling Leon Fredericks)
Leon La Verde
Leon La Verde in California 1938 (spelling Leon La Verde)
Leon La Verde
Leon La Verde in California 1939 (spelling Leon La Verde)
Leonard Jans
Leonard Jans in California 1936 (spelling Leonard Jans)
Leonard Soules
Leonard Soules in California 1933 (spelling Leonard Soules)
Li Kar
Li Kar in California 1938 (spelling Li Kar)
Li Kar in California 1938 (spelling Lee Kar)
Linden
Linden in California 1932 (spelling Linden)
Loyce Trent
Loyce Trent in California 1932 (spelling Loyce Trent)
Merry Pickford
Merry Pickford in California 1936 (spelling Murry Pickford)
Merry Pickford in California 1936 (spelling Merry Pickford)
Milton
Milton in California 1932 (spelling Milton)
Neil Dornay
Neil Dornay in California 1932 (spelling Neil Dornay)
Neil Dornay
N

KeyError: 'DC'

In [118]:
# Compare dates in the datasets

In [151]:
v1_df = get_clean_network_data(
    min_date=datetime.datetime(year=1930, month=1, day=1),
    max_date=datetime.datetime(year=1940, month=12, day=31),
    verbose=False,
    url=urls[0]['url'],
)

live_df = get_clean_network_data(
    min_date=datetime.datetime(year=1930, month=1, day=1),
    max_date=datetime.datetime(year=1940, month=12, day=31),
    verbose=False,
    url=urls[1]['url'],
)

diff_dates = set([x for x in v1_df.sort_values('Date').Date]).difference(set([x for x in live_df.sort_values('Date').Date]))
diff_dates = list(diff_dates)
diff_dates.sort()

for date in diff_dates:
    if len(v1_df[v1_df.Date == date]) > len(live_df[live_df.Date == date]):
        print(date, 'exists in v1 but not in live')
    if len(v1_df[v1_df.Date == date]) < len(live_df[live_df.Date == date]):
        print(date, 'exists in live but not in v1')

1935-01-06 exists in v1 but not in live
1935-03-03 exists in v1 but not in live
1935-03-31 exists in v1 but not in live
1935-05-26 exists in v1 but not in live
1935-06-16 exists in v1 but not in live
1935-07-14 exists in v1 but not in live
1935-09-19 exists in v1 but not in live
1935-09-29 exists in v1 but not in live
1935-10-10 exists in v1 but not in live
1935-10-13 exists in v1 but not in live
1935-10-20 exists in v1 but not in live
1935-11-03 exists in v1 but not in live
1935-11-12 exists in v1 but not in live
1935-11-15 exists in v1 but not in live
1938-01-03 exists in v1 but not in live
1938-01-27 exists in v1 but not in live
1938-02-02 exists in v1 but not in live
1938-02-07 exists in v1 but not in live
1938-02-14 exists in v1 but not in live
1938-04-15 exists in v1 but not in live
1938-10-11 exists in v1 but not in live
