# Predict the number of members missing from the dataset

We've discovered that approximately 82 percent of the logbook subscription events survived. Since all members would have been listed in the logbooks, can we use that survival rate to estimate what proportion of the full membership is represented in the dataset?

In [1]:
import pandas as pd

In [2]:
# Estimated fraction of logbook subscription events that survived (~82%).
# Used further down as the divisor that scales the number of logbook-sourced
# members up to an estimate of the whole community.
# NOTE(review): the derivation of 0.8165 is not shown in this notebook — confirm
# against the analysis that produced it.
LOGBOOK_PROPORTION_ESTIMATE = 0.8165

In [3]:
# Download locations (dataspace.princeton.edu) of the SCoData v1.1 (2021-01)
# CSV exports, keyed by dataset name.
csv_urls = dict(
    members='https://dataspace.princeton.edu/bitstream/88435/dsp01b5644v608/2/SCoData_members_v1.1_2021-01.csv',
    books='https://dataspace.princeton.edu/bitstream/88435/dsp016d570067j/2/SCoData_books_v1.1_2021-01.csv',
    events='https://dataspace.princeton.edu/bitstream/88435/dsp012n49t475g/2/SCoData_events_v1.1_2021-01.csv',
)

# Read each CSV into its own DataFrame (members first, then books, then events).
members_df, books_df, events_df = (
    pd.read_csv(csv_urls[dataset])
    for dataset in ('members', 'books', 'events')
)

In [9]:
# How many members are sourced from the logbooks?

# Keep only the events whose source type mentions a logbook. `na=False` makes
# rows with a missing source_type count as non-matches; without it the mask
# would contain NaN and the boolean indexing below would raise.
logbooks = events_df[events_df['source_type'].str.contains('Logbook', na=False)]

# A member_uris cell may list several URIs separated by ';', so split every
# cell and collect the distinct URIs. Missing cells are skipped, and padding
# whitespace / empty fragments are discarded so that one member can never be
# counted twice (or a blank counted as a member).
logbook_members = {
    uri.strip()
    for member_uris in logbooks['member_uris'].dropna()
    for uri in member_uris.split(';')
    if uri.strip()
}

logbook_member_count = len(logbook_members)
logbook_member_count

5016

In [10]:
# If the surviving logbooks cover only LOGBOOK_PROPORTION_ESTIMATE of the
# original events, scale the logbook member count up by that fraction to
# estimate how large the whole community was.
unrounded_community_size = logbook_member_count / LOGBOOK_PROPORTION_ESTIMATE
community_estimate = round(unrounded_community_size)
community_estimate

6143

In [11]:
# Number of distinct member URIs present in the members dataset, i.e. how many
# people in the community the dataset actually identifies.
distinct_member_uris = members_df['uri'].unique()
dataset_community_count = len(distinct_member_uris)
dataset_community_count

5601

In [12]:
# Share of the estimated community that appears in the dataset, expressed as a
# percentage rounded to two decimal places.
coverage_fraction = dataset_community_count / community_estimate
round(coverage_fraction * 100, 2)

91.18