# Estimate the number of books missing from the dataset

We know that we have approximately 20 percent of the missing borrow events, but does that mean that we have 20 percent of the books? How do the distribution of books and the power law help us get a more accurate picture?

In [33]:
import pandas as pd
from datetime import datetime, date, timedelta
import warnings
import math
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Locations of the v1.1 (2021-01) dataset exports, keyed by table name.
csv_urls = dict(
    members='https://dataspace.princeton.edu/bitstream/88435/dsp01b5644v608/2/SCoData_members_v1.1_2021-01.csv',
    books='https://dataspace.princeton.edu/bitstream/88435/dsp016d570067j/2/SCoData_books_v1.1_2021-01.csv',
    events='https://dataspace.princeton.edu/bitstream/88435/dsp012n49t475g/2/SCoData_events_v1.1_2021-01.csv',
)

# Fetch each remote CSV into its own DataFrame (members, then books, then events).
members_df = pd.read_csv(csv_urls['members'])
books_df = pd.read_csv(csv_urls['books'])
events_df = pd.read_csv(csv_urls['events'])

In [29]:
# Number of distinct item URIs that appear in at least one 'Borrow' event.
events_df.loc[events_df['event_type'] == 'Borrow', 'item_uri'].unique().shape

(5681,)

In [48]:
def generate_n(event_count):
    """Build a frequency-of-frequencies table from per-book event counts.

    Parameters
    ----------
    event_count : pandas.Series
        One numeric entry per book giving how many events that book has.
        Missing values (NaN) are ignored.

    Returns
    -------
    dict[int, int]
        Maps every integer ``i`` from 1 through the largest observed count
        to the number of entries equal to ``i``. Counts that never occur
        are present with value 0. An empty (or all-NaN) input yields ``{}``.
    """
    observed = event_count.dropna()
    if observed.empty:
        # .max() of an empty Series is NaN, which range() cannot accept.
        return {}

    # int() guards against a float max, which a Series containing NaN produces.
    largest = int(observed.max())
    tally = {i: 0 for i in range(1, largest + 1)}

    # One pass over the data instead of re-scanning the Series for every i.
    for value, count in observed.value_counts().items():
        # Values outside 1..largest (e.g. 0 or non-integers) are not counted.
        if value in tally:
            tally[value] += int(count)
    return tally

def calculate_delta(n):
    """Return the sum of ``count * exp(-k)`` over every ``k -> count`` pair in ``n``.

    ``n`` is a mapping from a number ``k`` to how many items were seen
    exactly ``k`` times; an empty mapping gives 0.
    """
    return sum(
        how_many * math.exp(-times_seen)
        for times_seen, how_many in n.items()
    )

# Tally how many books have each event count (1 up to the maximum), then
# collapse that tally into a single number by weighting each tally entry
# by exp(-k), where k is the event count.
n = generate_n(books_df['event_count'])
delta = calculate_delta(n)
# Bare last expression so the notebook displays the computed value.
delta

1023.2873133295311

In [38]:
# Share of books we already have, as a percentage rounded to two decimals:
# known book rows divided by known book rows plus delta.
round(len(books_df) / (len(books_df) + delta) * 100, 2)

85.47