In [2]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings
sns.set_style("whitegrid")

First we load the data (data comes from cleaning the full dataset; see Clean_Full_Dataset.ipynb for details), drop any rows with NAs, and convert the 'CheckoutDate' column to a DateTime datatype. We also sort by CheckoutDate and reset the index; this step is useful later when attempting to find the first CheckoutDate of each book.

In [3]:
#Read in cleaned data
#df=pd.read_csv('data/Checkouts_2005_cleaned.csv')
#df=pd.read_csv('data/Checkouts_2005_cleaned_w_colon.csv')
df=pd.read_csv('../../data/FirstYearCheckouts/Checkouts_2005_no_whitespace.csv')
df = df.dropna()
df['CheckoutDate'] = pd.to_datetime(df['CheckoutDate'])
df = df.sort_values(by='CheckoutDate')
df =df.reset_index(drop=True)

df.head()

Unnamed: 0,UsageClass,MaterialType,Checkouts,Title,Creator,Subjects,Publisher,PublicationYear,CleanedTitle,CleanedCreator,CheckoutDate
0,Physical,BOOK,3,The rosary girls / Richard Montanari.,"Montanari, Richard","Balzano Jessica Fictitious character Fiction, ...","Ballantine Books,",2005,therosarygirls,montanaririchard,2005-04-01
1,Physical,BOOK,2,Secrets of the millionaire mind : mastering th...,"Eker, T. Harv","Money Psychological aspects, Millionaires Psyc...","HarperBusiness,",2005,secretsofthemillionairemind,ekerharvt,2005-04-01
2,Physical,BOOK,7,In the company of liars / David Ellis.,"Ellis, David, 1967-",Thrillers Fiction,"G.P. Putnam's Sons,",2005,inthecompanyofliars,davidellis,2005-04-01
3,Physical,BOOK,21,The motive / John Lescroart.,"Lescroart, John T.","Hardy Dismas Fictitious character Fiction, Soc...","Dutton,",2005,themotive,johnlescroartt,2005-04-01
4,Physical,BOOK,1,The rottweiler [text (large print)] / Ruth Ren...,"Rendell, Ruth, 1930-2015","Police England London Fiction, Lisson Grove Lo...","Thorndike Press,",2005,therottweiler,rendellruth,2005-04-01


## Aggregating Data: 

Our goal is to make a new dataset where each row corresponds to a unique set of
(CleanedTitle, CleanedCreator, UsageClass, MaterialType). The dataframe will have a column for each month of checkout data we have (April 2005 - April 2024), where these columns contain the number of checkouts in that month for each book respectively. Note that we have data up to August 2024; however, the months of May 2024 and June 2024 are empty. Thus, we take the last month of data as April 2024. 

We first create a list of the months we have checkout data from, where each month is a string of the form MM/DD/YYYY. This list of strings will be used to populate the columns of our dataframe later.

In [4]:
#Get a list of the months we have checkout data from 
months_of_interest=pd.date_range(start='2005-04-01', end='2024-04-01',freq='MS')


In [5]:
print(months_of_interest)

DatetimeIndex(['2005-04-01', '2005-05-01', '2005-06-01', '2005-07-01',
               '2005-08-01', '2005-09-01', '2005-10-01', '2005-11-01',
               '2005-12-01', '2006-01-01',
               ...
               '2023-07-01', '2023-08-01', '2023-09-01', '2023-10-01',
               '2023-11-01', '2023-12-01', '2024-01-01', '2024-02-01',
               '2024-03-01', '2024-04-01'],
              dtype='datetime64[ns]', length=229, freq='MS')


In [6]:
months_of_interest[-1]


Timestamp('2024-04-01 00:00:00')

In [7]:
#Get the list of all months as strings; will be used as column names later
month_columns = months_of_interest.strftime('%m/%d/%Y')
month_columns = month_columns.to_list()


In [8]:
# Remove rows of df with checkout dates after '04/01/2024'
df = df[df['CheckoutDate'] <= months_of_interest[-1]]

#Verify we have the months we want
check_months= df.CheckoutDate.value_counts().index.sort_values()
print('First month: ', check_months[0])
print('Last month: ', check_months[-1])
#for i in df.CheckoutDate.value_counts().index.sort_values():
 #   print(i)


First month:  2005-04-01 00:00:00
Last month:  2024-04-01 00:00:00


The dataset has a wide variety of Material Types: 

['BOOK' 'SOUNDDISC' 'SOUNDREC' 'SOUNDDISC, VIDEODISC' 'MUSIC' 'AUDIOBOOK'
 'EBOOK' 'VIDEODISC' 'REGPRINT' 'ER' 'VISUAL' 'ATLAS' 'MAP' 'SOUNDCASS'
 'VIDEO' 'ER, MAP' 'ER, SOUNDDISC' 'ER, SOUNDDISC, VIDEODISC'
 'MUSICSNDREC' 'FLASHCARD, SOUNDDISC' 'ER, PRINT' 'ER, NONPROJGRAPH'
 'ER, VIDEODISC' 'LARGEPRINT' 'NOTATEDMUSIC' 'MAP, VIEW'
 'REGPRINT, VIDEOREC' 'KIT' 'REGPRINT, SOUNDDISC' 'ER, REGPRINT'
 'FLASHCARD' 'UNSPECIFIED' 'MIXED' 'BOOK, ER' 'ER, SOUNDREC'
 'ER, SOUNDDISC, SOUNDREC' 'PHOTO']

 To simplify our analysis, we take the five categories with the most checkouts and categorize all other types as 'OTHER'. 

In [9]:
#Valid material types
material_types = ['BOOK','EBOOK','SOUNDDISC','VIDEODISC','AUDIOBOOK', 'OTHER']


In [10]:
#Print original list of MaterialTypes found in dataset
print(df.MaterialType.unique())
#Filter out the material types not in material_types and replace them with "OTHER"
df['MaterialType'] = df['MaterialType'].where(df['MaterialType'].isin(material_types), 'OTHER')
#Print the new list of MaterialTypes found in dataset
print(df.MaterialType.unique())

['BOOK' 'SOUNDDISC' 'SOUNDREC' 'SOUNDDISC, VIDEODISC' 'MUSIC' 'AUDIOBOOK'
 'EBOOK' 'VIDEODISC' 'REGPRINT' 'ER' 'VISUAL' 'ATLAS' 'MAP' 'SOUNDCASS'
 'VIDEO' 'ER, MAP' 'ER, SOUNDDISC' 'ER, SOUNDDISC, VIDEODISC'
 'MUSICSNDREC' 'FLASHCARD, SOUNDDISC' 'ER, PRINT' 'ER, NONPROJGRAPH'
 'ER, VIDEODISC' 'LARGEPRINT' 'NOTATEDMUSIC' 'MAP, VIEW'
 'REGPRINT, VIDEOREC' 'KIT' 'REGPRINT, SOUNDDISC' 'ER, REGPRINT'
 'FLASHCARD' 'UNSPECIFIED' 'MIXED' 'BOOK, ER' 'ER, SOUNDREC'
 'ER, SOUNDDISC, SOUNDREC' 'PHOTO']
['BOOK' 'SOUNDDISC' 'OTHER' 'AUDIOBOOK' 'EBOOK' 'VIDEODISC']


We get the unique sets from (CleanedTitle, CleanedCreator, UsageClass, MaterialType) using value_counts(). We see that we have 609183 different possible combinations of these features out of the ~25 million rows in the dataframe. Note that when we only used unique pairs of (CleanedTitle, CleanedCreator), we had around 440,000 different combinations. 

In [11]:
#Get the unique sets from (CleanedTitle, CleanedCreator, UsageClass, MaterialType)
pairs = df[['CleanedTitle', 'CleanedCreator','UsageClass', 'MaterialType']].value_counts().reset_index()
pairs = pairs.drop(columns='count')
    
pairs.shape

(609183, 4)

In [12]:
pairs.head()

Unnamed: 0,CleanedTitle,CleanedCreator,UsageClass,MaterialType
0,fancynancy,connorjaneo,Physical,BOOK
1,yugioh,kazukitakahashi,Physical,BOOK
2,thewalkingdead,kirkmanrobert,Physical,BOOK
3,nationalgeographicreaders,lauramarsh,Digital,EBOOK
4,avatar,geneluenyang,Digital,EBOOK


We create a numpy array to store the checkout data for each month with rows corresponding to the rows in pairs and columns corresponding to the month in month_columns. We will then convert this to a dataframe to be merged with pairs later, which is significantly faster than updating pairs along the way. 

In [13]:
#Create numpy array of zeros to store the number of checkouts in each month
#Rows: indicate corresponding row in pairs
#Columns: indicate corresponding month in month_columns
months_count = np.zeros((pairs.shape[0], len(month_columns)),dtype = int)

In [14]:
#Get the column names of the features we want to add back in
features = df.columns.to_list()

#Remove the column names we already have in pairs
features.remove('CleanedTitle')
features.remove('CleanedCreator')
features.remove('UsageClass')
features.remove('MaterialType')


print(features)

['Checkouts', 'Title', 'Creator', 'Subjects', 'Publisher', 'PublicationYear', 'CheckoutDate']


The main processing is done in the cell below: 
### Preprocessing: 
To speed up the runtime, we create a few columns with corresponding dictionaries that will enable us to grab row/column indices without searching through the dataframe. We use dictionaries because they are implemented via a hash map so that the average lookup time is O(1), compared with the O(N) we would need if we searched through the dataframe directly. The 'CheckoutDate_str' column converts the 'CheckoutDate' to a string that matches the month strings in month_columns; the dictionary date_to_idx will allow us to go from this string to the appropriate column index of the month we want to update in month_counts. The column 'index_str' contains the unique combination of the (CleanedTitle, CleanedCreator, UsageClass, MaterialType) converted to a single string separated by commas. This simplifies the processing so that we only need to consider one column from the row we have instead of four. We use this column and the indices in pairs to create a dictionary pairs_lookup which will allow us to go from the 'index_str' in df to the corresponding row index in pairs that it corresponds to. This index will be used to update month_counts and the other feature data for each row in pairs. Finally we initialize an array of zeros called filled, which signifies if the additional data corresponding to the column names found in features has been found for each row in pairs. Since df is sorted by CheckoutDate and .apply() is applied top down (first row to last row), we should grab the first CheckoutDate for the book and use its features information to update the row. Note that there is no guarentee that all books of type (CleanedTitle, CleanedCreator, UsageClass, MaterialType) have the same Publisher, Subject, PublicationYear, etc., but they should be similar so we choose to grab the first one. 

### Main Processing: 
updates_months_full is applied to each row of our dataframe df via df.apply(). As it traverses each row, it updates month_counts month index and pairs index it obtains from the date_to_idx and pairs_lookup dictionaries based on the 'CheckoutDate_str' and 'index_str' columns of the row. Additionally, it checks if we have found the additional data corresponding to the column names in features by checking the appropriate index in filled. If not, we return the features values from the row in addition to the row index of pairs. 

### Output: 
After applying updates_months_full to each row in df, the months_count array will contain checkout data from each month for each row in pairs. Additionally, for each row in pairs, updates will contain the corresponding features column information as a list of lists. 

In [15]:
# Convert 'CheckoutDate' to string in the desired format beforehand
df['CheckoutDate_str'] = df['CheckoutDate'].dt.strftime('%m/%d/%Y')

#Create columns for easier lookup in pairs and df; we treat this column as an index column 
pairs['index_str'] = pairs['CleanedTitle'] + " , " + pairs['CleanedCreator'] + " , " \
    + pairs['UsageClass'] + " , " + pairs['MaterialType'] 

df['index_str'] =  df['CleanedTitle'] + " , " + df['CleanedCreator'] + " , " \
                 + df['UsageClass'] + " , " + df['MaterialType'] 

#Create dictionary with keys given by index_str and values the corresponding index in pairs
vals = pairs['index_str'].values
indices = pairs.index
pairs_lookup = dict(zip(vals, indices))

#Dictionary to go from CheckoutDate_str to index in list of months
date_to_idx = {date_str: idx for idx, date_str in enumerate(month_columns)}


#Create array indicating which rows of pairs we have already filled in
# 1 indicates we already filled it; 0 indicates we have not
filled= [0] * len(indices)

#Function to aggregate all the needed info
def update_months_full(row):
    #Update months count with checkout information
    if row['CheckoutDate_str'] in date_to_idx:
        date_idx  = date_to_idx[row['CheckoutDate_str']]
    #Only take the months in the month_columns list; else drop row
    else:
        print(row['CheckoutDate_str'])
        return None
    pairs_idx = pairs_lookup[row['index_str']]
    months_count[pairs_idx, date_idx] +=  row['Checkouts']

    #Update additional information 
    title_creator_str = row['index_str']
    #Check if we have updated the row yet; if not, update it
    if filled[pairs_lookup[title_creator_str]]==0:
       
        #Change indicator to 1 to indicate we have visited row
        filled[pairs_lookup[title_creator_str]]=1
        #Return the index in pairs with the corresponding features from df
        return list(np.insert(row[features].values, 0, pairs_lookup[title_creator_str]))
    return None



# Apply the update_row function to each row in df
updates = df.apply(update_months_full, axis=1)
updates = updates.dropna()
assert(updates.shape[0]==pairs.shape[0])
updates.head()



0    [161204, 3, The rosary girls / Richard Montana...
1    [2697, 2, Secrets of the millionaire mind : ma...
2    [61033, 7, In the company of liars / David Ell...
3    [23674, 21, The motive / John Lescroart., Lesc...
4    [146945, 1, The rottweiler [text (large print)...
dtype: object

Now that we have all the information we need, we will create dataframes for updates and month_counts and merge them with pairs. We create dataframes via pd.DataFrame on month_counts and updates. updates already contains an index column which corresponds to the indices in pairs. To merge on these indices, we create corresponding 'index' columns in pairs and months_df via reset_index(). Note that the indicies of months_df already match those of pairs due to the construction of month_counts.  Finally, we merge updates_df with pairs and then the result of that with month_df on the index column to obtain the final dataframe merged_df. We drop some of the extra columns we created for processing and finally write the data to the file cleaned_months_with_types.csv in the data folder. 

In [16]:
# Features we obtained in updates 
# We added a column for the corresponding index of pairs, so we add the column name here
features_new = ['index', *features]
print(features_new)

['index', 'Checkouts', 'Title', 'Creator', 'Subjects', 'Publisher', 'PublicationYear', 'CheckoutDate']


In [17]:
#Remove the extra column we made for indexing convenience
pairs = pairs.drop(columns=['index_str'])

#Add column with index of each row in pairs for merging 
pairs = pairs.reset_index()
pairs.head()

Unnamed: 0,index,CleanedTitle,CleanedCreator,UsageClass,MaterialType
0,0,fancynancy,connorjaneo,Physical,BOOK
1,1,yugioh,kazukitakahashi,Physical,BOOK
2,2,thewalkingdead,kirkmanrobert,Physical,BOOK
3,3,nationalgeographicreaders,lauramarsh,Digital,EBOOK
4,4,avatar,geneluenyang,Digital,EBOOK


In [18]:
#Create updates dataframe to be joined with pairs
#The index column corresponds to the index column of pairs
updates_df =pd.DataFrame(updates.to_list(), columns=features_new)
updates_df.head()



Unnamed: 0,index,Checkouts,Title,Creator,Subjects,Publisher,PublicationYear,CheckoutDate
0,161204,3,The rosary girls / Richard Montanari.,"Montanari, Richard","Balzano Jessica Fictitious character Fiction, ...","Ballantine Books,",2005,2005-04-01
1,2697,2,Secrets of the millionaire mind : mastering th...,"Eker, T. Harv","Money Psychological aspects, Millionaires Psyc...","HarperBusiness,",2005,2005-04-01
2,61033,7,In the company of liars / David Ellis.,"Ellis, David, 1967-",Thrillers Fiction,"G.P. Putnam's Sons,",2005,2005-04-01
3,23674,21,The motive / John Lescroart.,"Lescroart, John T.","Hardy Dismas Fictitious character Fiction, Soc...","Dutton,",2005,2005-04-01
4,146945,1,The rottweiler [text (large print)] / Ruth Ren...,"Rendell, Ruth, 1930-2015","Police England London Fiction, Lisson Grove Lo...","Thorndike Press,",2005,2005-04-01


In [19]:
#Create months dataframe containing the counts for each month; to be joined with pairs
# Note arrays index with (0,0) as top left so the index column is aligned with the index
# columns of pairs
months_df = pd.DataFrame(months_count, columns=month_columns)
#Add index column to match with pairs
months_df = months_df.reset_index()
months_df.head()


Unnamed: 0,index,04/01/2005,05/01/2005,06/01/2005,07/01/2005,08/01/2005,09/01/2005,10/01/2005,11/01/2005,12/01/2005,...,07/01/2023,08/01/2023,09/01/2023,10/01/2023,11/01/2023,12/01/2023,01/01/2024,02/01/2024,03/01/2024,04/01/2024
0,0,0,0,0,0,0,0,0,0,0,...,86,90,80,86,82,52,71,69,54,68
1,1,0,0,0,0,0,0,0,0,0,...,4,15,4,7,5,3,1,2,1,1
2,2,0,0,0,0,0,0,0,0,0,...,4,4,6,7,6,6,7,14,15,1
3,3,0,0,0,0,0,0,0,0,0,...,2,5,2,6,2,3,4,4,2,5
4,4,0,0,0,0,0,0,0,0,0,...,210,151,176,169,137,166,190,246,382,348


In [20]:
merged1_df = pd.merge(updates_df, pairs, on='index')
merged_df = pd.merge(merged1_df, months_df, on='index')

#Sort by index so that when 
merged_df = merged_df.sort_values(by='index')
#Remove the index and Checkouts columns as they are unnecessary. 
merged_df = merged_df.drop(columns=['index', 'Checkouts'])
merged_df.head()


Unnamed: 0,Title,Creator,Subjects,Publisher,PublicationYear,CheckoutDate,CleanedTitle,CleanedCreator,UsageClass,MaterialType,...,07/01/2023,08/01/2023,09/01/2023,10/01/2023,11/01/2023,12/01/2023,01/01/2024,02/01/2024,03/01/2024,04/01/2024
8266,Fancy Nancy / by Jane O'Connor ; pictures by R...,"O'Connor, Jane",Fancy Nancy Fictitious character Juvenile fict...,"HarperCollins,",2006,2006-03-01,fancynancy,connorjaneo,Physical,BOOK,...,86,90,80,86,82,52,71,69,54,68
12978,"Yu-Gi-Oh! : duelist. Vol. 12, Magician vs. mag...","Takahashi, Kazuki, 1961-",Games Comic books strips etc Juvenile literatu...,"Viz Media,",2005,2006-08-01,yugioh,kazukitakahashi,Physical,BOOK,...,4,15,4,7,5,3,1,2,1,1
45510,"The walking dead . [Volume 2, Miles behind us]...","Kirkman, Robert","Zombies Comic books strips etc, Survivalism Co...","Image Comics,",2006,2008-10-01,thewalkingdead,kirkmanrobert,Physical,BOOK,...,4,4,6,7,6,6,7,14,15,1
285605,National Geographic Readers: Lizards,Laura Marsh,"Beginning Reader, Juvenile Nonfiction, Nature","Random House, Inc.",2015,2015-07-01,nationalgeographicreaders,lauramarsh,Digital,EBOOK,...,2,5,2,6,2,3,4,4,2,5
296436,"Avatar: The Last Airbender - Smoke and Shadow,...",Gene Luen Yang,"Comic and Graphic Books, Juvenile Fiction, Juv...","Random House, Inc.",2015,2015-10-01,avatar,geneluenyang,Digital,EBOOK,...,210,151,176,169,137,166,190,246,382,348


In [21]:
#Make sure data matches 
merged_df[['Title', 'Creator', 'CleanedTitle', 'CleanedCreator']].head(15)

Unnamed: 0,Title,Creator,CleanedTitle,CleanedCreator
8266,Fancy Nancy / by Jane O'Connor ; pictures by R...,"O'Connor, Jane",fancynancy,connorjaneo
12978,"Yu-Gi-Oh! : duelist. Vol. 12, Magician vs. mag...","Takahashi, Kazuki, 1961-",yugioh,kazukitakahashi
45510,"The walking dead . [Volume 2, Miles behind us]...","Kirkman, Robert",thewalkingdead,kirkmanrobert
285605,National Geographic Readers: Lizards,Laura Marsh,nationalgeographicreaders,lauramarsh
296436,"Avatar: The Last Airbender - Smoke and Shadow,...",Gene Luen Yang,avatar,geneluenyang
214882,The Year's Best Science Fiction: Fifth Annual ...,Gardner Dozois,theyearsbestsciencefiction,dozoisgardner
12113,Ranma 1/2 Vol. 34 / story & art by Rumiko Taka...,"Takahashi, Rumiko, 1957-",ranma1,rumikotakahashi
200986,Fly Guy presents : sharks / Tedd Arnold.,"Arnold, Tedd",flyguypresents,arnoldtedd
220080,Magi : the labyrinth of magic. 1 / story & art...,"Ōtaka, Shinobu",magi,shinobuōtaka
175438,Invincible : ultimate collection. Volume 2 / c...,"Kirkman, Robert",invincible,kirkmanrobert


In [22]:
merged_df.shape

(609183, 239)

In [23]:
#merged_df.to_csv('data/cleaned_months_with_type_2005.csv', index=False)
merged_df.to_csv('../../data/FirstYearCheckouts/cleaned_months_no_whitespace_2005.csv', index=False)

### Number of Publications per Month by Author:

We would also like to collect the number of publications per month by each author. We assume the first checkout date is the publication date and that the library keeps the item in its system after that. To do so, we go through each row of the file 'data/Checkouts_2005_no_whitespace.csv' and check if we have seen the book before; if not, we add 1 to each month in num_count including and after that first checkout date. Note that this assumes that df is sorted by CheckoutDate.

In [24]:
#Get the unique sets from (CleanedTitle, CleanedCreator, UsageClass, MaterialType)
authors = df[['CleanedCreator']].value_counts().reset_index()
authors = authors.drop(columns='count')

author_title = df[['CleanedTitle', 'CleanedCreator']].value_counts().reset_index()
author_title = author_title.drop(columns='count')


In [25]:
# Convert 'CheckoutDate' to string in the desired format beforehand
df['CheckoutDate_str'] = df['CheckoutDate'].dt.strftime('%m/%d/%Y')

#Create dictionary with keys given by index_str and values the corresponding index in pairs
authors_lookup = dict(zip(authors['CleanedCreator'].values, authors.index))

#Dictionary to go from CheckoutDate_str to index in list of months
date_to_idx = {date_str: idx for idx, date_str in enumerate(month_columns)}

#Create columns for easier lookup in pairs and df; we treat this column as an index column 
author_title['title_str'] = author_title['CleanedTitle'] + " , " + author_title['CleanedCreator'] 

df['title_str'] =  df['CleanedTitle'] + " , " + df['CleanedCreator'] 
#Create dictionary with keys given by index_str and values the corresponding index in pairs
vals = author_title['title_str'].values
indices = author_title.index
author_title_lookup = dict(zip(vals, indices))


# Create numpy array of zeros to store number of books published by author in each month
# For each new book, we find we add one to each month following and including the first
# checkout date
#Rows: correspecting row in authors 
#Column: corresponding month in month_columns
num_count = np.zeros((authors.shape[0], len(month_columns)),dtype = int)


#Create array indicating which rows of pairs we have already filled in
# 1 indicates we already filled it; 0 indicates we have not
filled= [0] * len(indices)

# Function to get number of published books per month by author
# First checkout month is taken as the publication date and we assume
# the book remains at the library after its first checkout date
def get_num_published_books(row):
    #Update months count with checkout information
    if row['CheckoutDate_str'] in date_to_idx:
        date_idx  = date_to_idx[row['CheckoutDate_str']]
    #Only take the months in the month_columns list; else drop row
    else:
        print(row['CheckoutDate_str'])
    #     return None
    author_idx = authors_lookup[row['CleanedCreator']]
    author_title_idx = author_title_lookup[row['title_str']]

    if filled[author_title_idx]==0:
        #Update all dates after first checkout to count # books by author per month
        num_count[author_idx, date_idx:] +=1
        #Change indicator to 1 to indicate we have visited row
        filled[author_title_idx]=1



# Apply the update_row function to each row in df
df.apply(get_num_published_books, axis=1)



0           None
1           None
2           None
3           None
4           None
            ... 
24830543    None
24830544    None
24830545    None
24830546    None
24830547    None
Length: 24830548, dtype: object

In [26]:
print(num_count)

[[  1   2   3 ... 417 418 421]
 [  0   0   0 ... 153 153 153]
 [  1   3   4 ... 209 209 209]
 ...
 [  0   0   0 ...   1   1   1]
 [  0   0   0 ...   1   1   1]
 [  0   0   0 ...   1   1   1]]


In [27]:
#Create months dataframe containing the counts for each month; to be joined with pairs
# Note arrays index with (0,0) as top left so the index column is aligned with the index
# columns of pairs
# months_df = pd.DataFrame(num_count, columns=month_columns)
# #Add index column to match with pairs
# months_df = months_df.reset_index()
# months_df.head()


Now we want to store the publications per month in a new dataframe df_pubs. For convenience, we read in the dataframe we just created and replace the months columns with the publication numbers. 

In [28]:
# Read in data in which to replace the months columns
df_pubs = pd.read_csv('../../data/FirstYearCheckouts/cleaned_months_no_whitespace_2005.csv')
df_pubs['CheckoutDate'] = pd.to_datetime(df_pubs['CheckoutDate'])


df_pubs['Index_col'] = df_pubs.index.values

df_pubs.head()

Unnamed: 0,Title,Creator,Subjects,Publisher,PublicationYear,CheckoutDate,CleanedTitle,CleanedCreator,UsageClass,MaterialType,...,08/01/2023,09/01/2023,10/01/2023,11/01/2023,12/01/2023,01/01/2024,02/01/2024,03/01/2024,04/01/2024,Index_col
0,Fancy Nancy / by Jane O'Connor ; pictures by R...,"O'Connor, Jane",Fancy Nancy Fictitious character Juvenile fict...,"HarperCollins,",2006,2006-03-01,fancynancy,connorjaneo,Physical,BOOK,...,90,80,86,82,52,71,69,54,68,0
1,"Yu-Gi-Oh! : duelist. Vol. 12, Magician vs. mag...","Takahashi, Kazuki, 1961-",Games Comic books strips etc Juvenile literatu...,"Viz Media,",2005,2006-08-01,yugioh,kazukitakahashi,Physical,BOOK,...,15,4,7,5,3,1,2,1,1,1
2,"The walking dead . [Volume 2, Miles behind us]...","Kirkman, Robert","Zombies Comic books strips etc, Survivalism Co...","Image Comics,",2006,2008-10-01,thewalkingdead,kirkmanrobert,Physical,BOOK,...,4,6,7,6,6,7,14,15,1,2
3,National Geographic Readers: Lizards,Laura Marsh,"Beginning Reader, Juvenile Nonfiction, Nature","Random House, Inc.",2015,2015-07-01,nationalgeographicreaders,lauramarsh,Digital,EBOOK,...,5,2,6,2,3,4,4,2,5,3
4,"Avatar: The Last Airbender - Smoke and Shadow,...",Gene Luen Yang,"Comic and Graphic Books, Juvenile Fiction, Juv...","Random House, Inc.",2015,2015-10-01,avatar,geneluenyang,Digital,EBOOK,...,151,176,169,137,166,190,246,382,348,4


In [29]:
# num_counts but now we account for each row of df; thus there will be duplicate rows for different 
# MaterialTypes of the same book. 
num_count_full = np.zeros((df_pubs.shape[0], len(month_columns)),dtype = int)


def replace_vals(row):
    # Get author name and corresponding index in for num_count
    author_idx = authors_lookup[row['CleanedCreator']]

    # Update the entire for with the corresponding authro row of num_count
    num_count_full[row['Index_col'], :] =num_count[author_idx, :]

df_pubs.apply(replace_vals, axis=1)
df_pubs = df_pubs.drop(columns=['Index_col'])

# Replace the months columns of df with the publication number data
df_pubs[month_columns].head()
print('Data we are inputting')
print(num_count_full)

df_pubs[month_columns] = num_count_full
df_pubs[month_columns].head()

Data we are inputting
[[  0   0   0 ...  94  94  94]
 [  0   0   0 ...  10  10  10]
 [  0   0   0 ... 100 101 101]
 ...
 [  0   0   0 ...   1   1   1]
 [  0   0   0 ...   1   1   1]
 [  0   0   0 ...  12  12  13]]


Unnamed: 0,04/01/2005,05/01/2005,06/01/2005,07/01/2005,08/01/2005,09/01/2005,10/01/2005,11/01/2005,12/01/2005,01/01/2006,...,07/01/2023,08/01/2023,09/01/2023,10/01/2023,11/01/2023,12/01/2023,01/01/2024,02/01/2024,03/01/2024,04/01/2024
0,0,0,0,0,0,0,0,0,0,0,...,94,94,94,94,94,94,94,94,94,94
1,0,0,0,0,0,0,0,0,0,0,...,10,10,10,10,10,10,10,10,10,10
2,0,0,0,0,0,0,0,0,0,0,...,99,99,99,99,99,99,99,100,101,101
3,0,0,0,0,0,0,0,0,0,0,...,28,28,28,28,28,29,29,29,29,29
4,0,0,0,0,0,0,0,0,0,0,...,49,50,50,50,50,50,51,51,51,51


## Some additional exploration: 

We note that of the materials we have, most of them are physical books, followed by ebooks, audiobooks, sounddiscs. A small proportion is composed of other with both physical and digital classes and finally there are few videodiscs.

In [30]:
pairs = merged_df[['UsageClass', 'MaterialType']].value_counts().reset_index()
pairs.head(pairs.shape[0])

Unnamed: 0,UsageClass,MaterialType,count
0,Physical,BOOK,295250
1,Digital,EBOOK,186458
2,Digital,AUDIOBOOK,73360
3,Physical,SOUNDDISC,44315
4,Digital,OTHER,5186
5,Physical,OTHER,4244
6,Physical,VIDEODISC,370


We verify that the date in CheckoutDate does contain the first checkout date. It should since we sorted our dataframe by CheckoutDate before processing, and here we verify this: 

In [31]:
#Format CheckoutDate column to match column name
merged_df['CheckoutDate_str'] = merged_df['CheckoutDate'].dt.strftime('%m/%d/%Y')

#Get indices of first and last months
first_index= merged_df.columns.get_loc('04/01/2005')
last_index =  merged_df.columns.get_loc('04/01/2024')

#Create new dataframe with just month data to process
checkouts = merged_df.iloc[:,first_index:last_index+1]

#Function to grab the first nonzero column in each row of checkouts
def find_first_nonzero_column(row):
    return row[row.ne(0)].index[0]

#Apply function to each row in df_sample
merged_df['first_nonzero_month'] = checkouts.apply(find_first_nonzero_column, axis=1)

#Check that the values match in each column
checking = merged_df['first_nonzero_month']== merged_df['CheckoutDate']
#This prints True if all match; false otherwise
print(all(checking))

#Drop the columns added
merged_df = merged_df.drop(columns=['CheckoutDate_str', 'first_nonzero_month'])


True


## Additional Post-Processing

We would like to clean the Subject column and assign specific genres to each book. We do so via the classify_genre, in which we lower the Subject column and find matches with our predefined genre list. Note that since we want just one genre per book, the function prioritizes genres in the order they are listed with very specific things first and more generic genres like "fiction" and "nonfiction" last.

In [32]:
#Read in data again or rename as df for convenience
df = pd.read_csv('../../data/FirstYearCheckouts/cleaned_months_no_whitespace_2005.csv')
df['CheckoutDate'] = pd.to_datetime(df['CheckoutDate'])

In [33]:
df.head()

Unnamed: 0,Title,Creator,Subjects,Publisher,PublicationYear,CheckoutDate,CleanedTitle,CleanedCreator,UsageClass,MaterialType,...,07/01/2023,08/01/2023,09/01/2023,10/01/2023,11/01/2023,12/01/2023,01/01/2024,02/01/2024,03/01/2024,04/01/2024
0,Fancy Nancy / by Jane O'Connor ; pictures by R...,"O'Connor, Jane",Fancy Nancy Fictitious character Juvenile fict...,"HarperCollins,",2006,2006-03-01,fancynancy,connorjaneo,Physical,BOOK,...,86,90,80,86,82,52,71,69,54,68
1,"Yu-Gi-Oh! : duelist. Vol. 12, Magician vs. mag...","Takahashi, Kazuki, 1961-",Games Comic books strips etc Juvenile literatu...,"Viz Media,",2005,2006-08-01,yugioh,kazukitakahashi,Physical,BOOK,...,4,15,4,7,5,3,1,2,1,1
2,"The walking dead . [Volume 2, Miles behind us]...","Kirkman, Robert","Zombies Comic books strips etc, Survivalism Co...","Image Comics,",2006,2008-10-01,thewalkingdead,kirkmanrobert,Physical,BOOK,...,4,4,6,7,6,6,7,14,15,1
3,National Geographic Readers: Lizards,Laura Marsh,"Beginning Reader, Juvenile Nonfiction, Nature","Random House, Inc.",2015,2015-07-01,nationalgeographicreaders,lauramarsh,Digital,EBOOK,...,2,5,2,6,2,3,4,4,2,5
4,"Avatar: The Last Airbender - Smoke and Shadow,...",Gene Luen Yang,"Comic and Graphic Books, Juvenile Fiction, Juv...","Random House, Inc.",2015,2015-10-01,avatar,geneluenyang,Digital,EBOOK,...,210,151,176,169,137,166,190,246,382,348


In [34]:
df['CleanedSubject'] = df['Subjects'].str.lower().str.strip()

# Define the list of genres in the desired hierarchy order
genres = ['juvenile', 'young adult', 'fantasy', 'romance', 'thriller',\
            'horror', 'mystery', 'science fiction', 'biography', \
                'history', 'novel', 'nonfiction', 'fiction']
# Make list of true genre names we want to consider for genre in genres
# This list aligns with the placement in genres
# For example, fantasy becomes fantasy/sci-fi as does science fiction
genres_match = ['juvenile', 'young adult', 'fantasy/sci-fi', 'romance', 'horror/thriller',\
            'horror/thriller', 'mystery', 'fantasy/sci-fi', 'biography', \
                'history', 'fiction', 'nonfiction', 'fiction']
#Make another list of the desired final genre list without duplicates
genres_final = ['juvenile', 'young adult', 'fantasy/sci-fi', 'romance', \
                'horror/thriller', 'mystery', 'biography', 'history', \
                 'nonfiction', 'fiction']

#Create dictionary combining genres found in Subject string
#  with genre names we want to use
genres_final_dict = dict(zip(genres, genres_match))

# Create a dictionary for priority of each genre
priority = {genre: i for i, genre in enumerate(genres_final)}

# Function to classify genre based on the CleanedSubject column
def classify_genre(cleaned_subject):
    if pd.isna(cleaned_subject):  # If the entry is NaN
        return 'other'

    # Convert the cleaned_subject to lower case for case-insensitive matching
    cleaned_subject = cleaned_subject.lower()

    # Create a dictionary for priority of each genre
    #priority = {genre: i for i, genre in enumerate(genres)} 

    # Find all matching genres in the cleaned_subject string
    #found_genres = [genre for genre in genres if genre in cleaned_subject]
    found_genres = [genres_final_dict[genre] for genre in genres if genre in cleaned_subject]


    if not found_genres:  # If no genre is found
        return 'other'
    

    # Sort genres based on their priority
    found_genres.sort(key=lambda g: priority[g])

    # Return the genre with the highest priority
    return found_genres[0]

# Apply the function to the dataframe
df['Genre'] = df['CleanedSubject'].apply(classify_genre)

We consider the distribution of the genres, and see that out of the ~610000 possibilities, we have around 122000 'other' options. Thus, most of the books can be characterized into one of the possible genres. 

In [35]:
genre_counts = df['Genre'].value_counts().reset_index()

genre_counts.head(20)

Unnamed: 0,Genre,count
0,juvenile,123928
1,other,122386
2,fiction,82675
3,nonfiction,53144
4,biography,42914
5,fantasy/sci-fi,37867
6,mystery,35584
7,history,32934
8,romance,31169
9,horror/thriller,26350


We also would like to classify the publisher column into a select number of publishers. To do so, we choose the most popular publishers by value_count and categorize the ones not in this list as 'other pubs' 

In [36]:
def clean_publisher(text): 
    #Remove non-digit or alphabetical characters; leave whitespace
    text = re.sub(r'[^\w]','',text)
    # Strip any leading/trailing whitespace and make lowercase
    return text.strip().lower()

In [37]:
df['CleanedPublisher'] = df['Publisher'].apply(clean_publisher)

In [38]:
# List of publishers keywords we search for in Publisher column
publishers = ['tantor', 'penguin', 'randomhouse', 'harpercollins', 'harper', \
              'booksontape', 'listeninglibrary', 'schuster', 'blackstone', 
              'hachette', 'scholastic', 'harlequin', 'macmillan',\
                'mifflin', 'brilliance', 'lightningsource', 'recordedbooks' ]

# List of publishers mapped to the corresponding publishers in publishers_final
publishers_mapped = ['recorded books', 'penguin random house', 'penguin random house',\
                     'harpercollins', 'harpercollins', 'penguin random house', 'penguin random house', \
                      'simon & schuster', 'blackstone', 'hachette', \
                      'scholastic', 'harlequin', 'macmillan', 'harpercollins', \
                        'brilliance', 'lightning source', 'recorded books']

# Final list of publishers we 
publishers_final =['recorded books', 'penguin random house', 'harpercollins', 'simon & schuster', \
                   'blackstone' , 'scholastic', 'macmillan', 'hachette', 'harlequin',\
                    'brilliance', 'lightning source', 'other publisher']

In [39]:
#Create dictionary combining genres found in Subject string
#  with genre names we want to use
pubs_final_dict = dict(zip(publishers, publishers_mapped))

# Create a dictionary for priority of each genre
priority = {pubs: i for i, pubs in enumerate(publishers_final)}

# Function to classify genre based on the CleanedSubject column
def classify_publishers(cleaned_pubs):
    if pd.isna(cleaned_pubs):  # If the entry is NaN
        return 'other publisher'


    # Find all matching publishers in the cleaned_pubs string
    found_pubs = [pubs_final_dict[pubs] for pubs in publishers if pubs in cleaned_pubs]


    if not found_pubs:  # If no publisher is found
        return 'other publisher'
    

    # Sort genres based on their priority
    found_pubs.sort(key=lambda g: priority[g])

    # Return the genre with the highest priority
    return found_pubs[0]

# Apply the function to the dataframe
df['CleanedPublisher'] = df['CleanedPublisher'].apply(classify_publishers)

We now want to aggregate the number of checkouts in the first year, with the first checkout date denoting the publication date. We do so via the sum_checkouts_first_year function defined below, which is applied to each row in our dataframe. This function grabs the first checkoutdate of each item, located in the CheckoutDate column, and calculates the date that is a year out. From the columns_dict dictionary, which has keys the column names of df and values the corresponding indices, we can them grab the appropriate columns indices to sum over to get the total number of checkouts in the first year.  Note that if the first checkout date is after 09/01/2023, we only sum the partial year up to 08/01/2024. We store the result into a new column "FirstYearCheckout"

In [40]:
#Create dictionary of column, index pairs to 
indices = list(range(len(df.columns)))
columns = df.columns
columns_dict = dict(zip(columns, indices))


#Function to sum checkouts from first checkout date to one year later
def sum_checkouts_first_year(row):
    first_checkout = row['CheckoutDate'] #Get first checkout date
    first_index= columns_dict[first_checkout.strftime('%m/%d/%Y')] #Convert to column name form and get index of column
    end_year  = (first_checkout.replace(year=first_checkout.year +1)).strftime('%m/%d/%Y') #Add one to year to get end date
    if end_year in columns_dict: #Get index of end year column
        end_index= columns_dict[end_year]
    else:
        #Get last possible month of data and include it; which gives the +1
        end_index =columns_dict['04/01/2024'] +1 
    checkout_sum = 0
    for i in range(first_index, end_index): #Sum over all checkouts in range
        checkout_sum += row.iloc[i]
    return checkout_sum

df['FirstYearCheckouts'] = df.apply(sum_checkouts_first_year, axis = 1)
    

We would like to qunatify the popularity of an author using checkout data; we consider a few options below to be used in our models later. 

### Author Popularity Quantified Option #1: sum of checkouts in prior year

Our next task it get the sum of checkouts per month per author in the year prior to the first checkout. Note that if the start of the previous year is outside the scope of the data we have, we do a partial sum of whatever months we are able; e.g. if the checkout date was 01/01/2006, then we would sum the authors checkouts from 04/01/2005 to 12/01/2005 since our first checkout is April 2005. To create the previous checkout column, we first get the sum of all book checkouts per month per author, which is stored in author_summed. Then for each row in our dataframe, we get the first checkout date and sum the appriopriate columns in author_summed to get total checkouts in the previous year for author corresponding to that row. We store the result into a column named PreviousYearCheckouts. 

In [41]:
indices = list(range(len(df.columns)))
columns =df.columns
columns_dict = dict(zip(columns, indices))

#Get index range for columns of interest
first_index= columns_dict['04/01/2005']
last_index = columns_dict['04/01/2024']

#Get dataframe with CleanedCreator column plus all corresponding months column
#months_df = df.iloc[:, first_index:last_index+1 ]
months_df = df.iloc[:, first_index:last_index+1 ]
months_df.insert(loc = 0,column='CleanedCreator', value=df.CleanedCreator)

#Sum the checkouts for each author in each month
# For each author, we have all months columns filled with all checkouts in that month
author_summed = months_df.groupby('CleanedCreator').sum()
#author_summed= months_df.replace(0, np.nan).groupby('CleanedCreator').mean()

author_summed = author_summed.reset_index()
author_summed = author_summed.replace(np.nan, 0)

author_summed.head()


Unnamed: 0,CleanedCreator,04/01/2005,05/01/2005,06/01/2005,07/01/2005,08/01/2005,09/01/2005,10/01/2005,11/01/2005,12/01/2005,...,07/01/2023,08/01/2023,09/01/2023,10/01/2023,11/01/2023,12/01/2023,01/01/2024,02/01/2024,03/01/2024,04/01/2024
0,aa,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,aaaguirre,0,0,0,0,0,0,0,0,0,...,0,2,0,0,1,0,0,1,3,0
2,aaakerdavid,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,aaanuel,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
4,aabalaskovits,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,1,0,1,0


In [42]:
#Create dictionary of column, index pairs for author_summed
indices = list(range(len(author_summed.columns)))
columns = author_summed.columns
author_columns_dict = dict(zip(columns, indices))

#Create dictionary to get row of author_summed by CleanedCreator 
author_dict = dict(zip(author_summed.CleanedCreator, author_summed.index))

# Function to calcuate number of checkouts of the author in the year before first checkout
def previous_year_sum(row):
    first_checkout = row['CheckoutDate']
    author = row['CleanedCreator']
    idx = author_dict[author]
    end_index= author_columns_dict[first_checkout.strftime('%m/%d/%Y')] #Convert to column name form and get index of column
    previous_year  = (first_checkout.replace(year=first_checkout.year -1)).strftime('%m/%d/%Y') #Add one to year to get end date
    if previous_year in author_columns_dict: #Get index of end year column
        first_index= author_columns_dict[previous_year]
    else:
        # Get first possible month of data
        first_index =author_columns_dict['04/01/2005']
    checkout_sum = 0
    for i in range(first_index+1, end_index): #Sum over all checkouts in range
        checkout_sum += author_summed.iloc[idx, i]
    return checkout_sum


df['PreviousYearCheckoutsSum'] = df.apply(previous_year_sum, axis=1)
    

### Author Popularity Quantified Option #2: mean of checkouts in prior year

Similar to PreviousYearCheckoutsSum, but instead of storing the sum of checkouts, we consider the mean of the nonzero checkouts by the author in each month and then take the mean of these in the first checkout year. The result is stored in PreviousYearCheckoutsMean column

In [43]:
#Get dataframe with CleanedCreator column plus all corresponding months column from df_pubs
#months_df = df.iloc[:, first_index:last_index+1 ]
months_df = df.iloc[:, first_index:last_index+1 ]
months_df.insert(loc = 0,column='CleanedCreator', value=df.CleanedCreator)

#Mean of the checkouts for each author in each month
# For each author, we have all months columns filled with all checkouts in that month
# We take the mean of the nonzero elements in each column
author_mean= months_df.replace(0, np.nan).groupby('CleanedCreator').mean()

author_mean = author_mean.reset_index()
author_mean = author_mean.replace(np.nan, 0)

author_mean.head()


Unnamed: 0,CleanedCreator,04/01/2005,05/01/2005,06/01/2005,07/01/2005,08/01/2005,09/01/2005,10/01/2005,11/01/2005,12/01/2005,...,07/01/2023,08/01/2023,09/01/2023,10/01/2023,11/01/2023,12/01/2023,01/01/2024,02/01/2024,03/01/2024,04/01/2024
0,aa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,aaaguirre,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,1.5,0.0
2,aaakerdavid,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,aaanuel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,aabalaskovits,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0


In [44]:
#Create dictionary of column, index pairs for author_summed
indices = list(range(len(author_summed.columns)))
columns = author_summed.columns
author_columns_dict = dict(zip(columns, indices))

#Create dictionary to get row of author_summed by CleanedCreator 
author_dict = dict(zip(author_summed.CleanedCreator, author_summed.index))

# Function to calcuate number of checkouts of the author in the year before first checkout
def previous_year_mean(row):
    first_checkout = row['CheckoutDate']
    author = row['CleanedCreator']
    idx = author_dict[author]
    end_index= author_columns_dict[first_checkout.strftime('%m/%d/%Y')] #Convert to column name form and get index of column
    previous_year  = (first_checkout.replace(year=first_checkout.year -1)).strftime('%m/%d/%Y') #Add one to year to get end date
    if previous_year in author_columns_dict: #Get index of end year column
        first_index= author_columns_dict[previous_year]
    else:
        # Get first possible month of data
        first_index =author_columns_dict['04/01/2005']

    checkout_sum=0
    for i in range(first_index, end_index): #Mean over all checkouts in range 
       # checkout_min = min(checkout_min, author_summed.iloc[idx, i])
        checkout_sum += author_mean.iloc[idx, i]
    
    if (end_index - first_index >0):
        return checkout_sum *1.0 / (end_index - first_index)
    else:
        return checkout_sum
    



df['PreviousYearCheckoutsMean'] = df.apply(previous_year_mean, axis=1)
    

### Author Popularity Quantified Option #3: number of publications in prior year

Using data in df_pubs, we can find the number of publications by the author in the last year. The result is stored in PreviousYearPublished.

In [45]:
# Function to calcuate number of checkouts of the author in the year before first checkout
def previous_year_books(row):
    first_checkout = row['CheckoutDate']
    previous_year  = (first_checkout.replace(year=first_checkout.year -1)).strftime('%m/%d/%Y')
    if previous_year in month_columns:
        return row[first_checkout.strftime('%m/%d/%Y')] - row[previous_year]
    else:
        return row[first_checkout.strftime('%m/%d/%Y')] - row['04/01/2005']
    


df['PreviousYearPublished'] = df_pubs.apply(previous_year_books, axis=1) 

### Author Popularity Quantified Option #4: mean checkout per month of all books published in last year

For each row in df, we find the books that were published by the same author in the previous year, take the mean of all the checkout data for each month in that year and then sum the resulting means. In PreviousYearNewMean, we take the average including the months with zero checkouts, but in PreviousYearNewMeanNonzero, we ignore the zero months. In the final dataframe, we only take PreviousYearNewMeanNonzero. 

In [46]:
#Sort by CleanedCreator
df = df.sort_values(by='CleanedCreator')

In [47]:
authors = df['CleanedCreator'].value_counts().reset_index()
authors = authors.sort_values(by='CleanedCreator').reset_index()
authors = authors.drop(columns = 'index')
authors.head()

Unnamed: 0,CleanedCreator,count
0,aa,1
1,aaaguirre,4
2,aaakerdavid,1
3,aaanuel,3
4,aabalaskovits,4


In [48]:
count = authors['count']
# Create index for the start of each author name corresponding to the authors list in authors
author_indicies = [0] * len(authors)
for i in range(len(authors)-1):
    author_indicies[i+1] = author_indicies[i] + count[i]

In [49]:
# Create dictionary for author count values
vals = authors['CleanedCreator'].values
authors_lookup = dict(zip(vals, authors.index))

# Create dictionary for df columns 
indices = list(range(len(df.columns)))
columns = df.columns
df_columns_dict = dict(zip(columns, indices))

In [50]:
def mean_checkouts_year_prior(row):
    first_checkout = row['CheckoutDate']
    previous_year  = (first_checkout.replace(year=first_checkout.year -1)).strftime('%m/%d/%Y') 
    author = row['CleanedCreator']
    idx = authors_lookup[author]
    #print(idx)
    if len(author_indicies) >= idx:
        author_df = df.iloc[author_indicies[idx]:len(author_indicies)]
    else:
        author_df = df.iloc[author_indicies[idx]:author_indicies[idx+1]]
    #Get books first checkout in the prior year
    author_df = author_df[(author_df['CheckoutDate']< first_checkout) & (author_df['CheckoutDate'] >= previous_year )]

    if len(author_df) == 0:
        return 0

    # Get index for previous year or the first possible month
    first_index = df_columns_dict.get(previous_year, df_columns_dict['04/01/2005'])
    end_index=df_columns_dict[first_checkout.strftime('%m/%d/%Y')]
    
    months_data= author_df.iloc[:, first_index:end_index].values
    mean_data = months_data.mean(axis=0)
    return sum(mean_data)
 


df['PreviousYearPublishedMean'] = df.apply(mean_checkouts_year_prior, axis=1)


In [51]:
def mean_checkouts_year_prior_nonzero(row):
    first_checkout = row['CheckoutDate']
    previous_year  = (first_checkout.replace(year=first_checkout.year -1)).strftime('%m/%d/%Y') 
    author = row['CleanedCreator']
    idx = authors_lookup[author]
    #print(idx)
    if len(author_indicies) >= idx:
        author_df = df.iloc[author_indicies[idx]:len(author_indicies)]
    else:
        author_df = df.iloc[author_indicies[idx]:author_indicies[idx+1]]
    #Get books first checkout in the prior year
    author_df = author_df[(author_df['CheckoutDate']< first_checkout) & (author_df['CheckoutDate'] >= previous_year )]

    if len(author_df) == 0:
        return 0

    # Get index for previous year or the first possible month
    first_index = df_columns_dict.get(previous_year, df_columns_dict['04/01/2005'])
    end_index=df_columns_dict[first_checkout.strftime('%m/%d/%Y')]
    
    months_data= author_df.iloc[:, first_index:end_index].replace(0,np.nan).values
    mean_data = np.nanmean(months_data, axis=0)
    return np.nansum(mean_data)


df['PreviousYearPublishedMeanNonzero'] = df.apply(mean_checkouts_year_prior_nonzero, axis=1)

  mean_data = np.nanmean(months_data, axis=0)


In [52]:
df.columns

Index(['Title', 'Creator', 'Subjects', 'Publisher', 'PublicationYear',
       'CheckoutDate', 'CleanedTitle', 'CleanedCreator', 'UsageClass',
       'MaterialType',
       ...
       '04/01/2024', 'CleanedSubject', 'Genre', 'CleanedPublisher',
       'FirstYearCheckouts', 'PreviousYearCheckoutsSum',
       'PreviousYearCheckoutsMean', 'PreviousYearPublished',
       'PreviousYearPublishedMean', 'PreviousYearPublishedMeanNonzero'],
      dtype='object', length=248)

In [53]:
df = df.drop(columns='PreviousYearPublishedMean')
df  = df.rename(columns={"PreviousYearPublishedMeanNonzero": "PreviousYearPublishedMean"})
df.head()

Unnamed: 0,Title,Creator,Subjects,Publisher,PublicationYear,CheckoutDate,CleanedTitle,CleanedCreator,UsageClass,MaterialType,...,03/01/2024,04/01/2024,CleanedSubject,Genre,CleanedPublisher,FirstYearCheckouts,PreviousYearCheckoutsSum,PreviousYearCheckoutsMean,PreviousYearPublished,PreviousYearPublishedMean
260665,Alcoholics Anonymous: Big Book (unabridged) (U...,AA,"Nonfiction, Recovery",BN Publishing,2009,2009-11-01,alcoholicsanonymous,aa,Digital,AUDIOBOOK,...,0,0,"nonfiction, recovery",nonfiction,other publisher,9,0,0.0,1,61.543439
148601,"Silver Mirrors: Apparatus Infernum Series, Book 2",A. A. Aguirre,"Fantasy, Fiction, Romance, Science Fiction","Penguin Group (USA), Inc.",2014,2014-05-01,silvermirrors,aaaguirre,Digital,EBOOK,...,2,0,"fantasy, fiction, romance, science fiction",fantasy/sci-fi,penguin random house,34,53,3.375,2,54.281051
51784,Bronze Gods,A. A. Aguirre,"Fantasy, Fiction","Penguin Group (USA), Inc.",2013,2013-10-01,bronzegods,aaaguirre,Digital,EBOOK,...,1,0,"fantasy, fiction",fantasy/sci-fi,penguin random house,31,26,2.166667,1,58.704558
505157,Silver mirrors / A.A. Aguirre.,"Aguirre, A. A.","Magic Fiction, Fantasy fiction, Detective and ...","Ace Books,",2014,2014-04-01,silvermirrors,aaaguirre,Physical,BOOK,...,0,0,"magic fiction, fantasy fiction, detective and ...",fantasy/sci-fi,other publisher,13,51,3.208333,2,53.69212
345126,Bronze Gods / A.A. Aguirre.,"Aguirre, A. A.","Fantasy fiction, Detective and mystery fiction","Ace Books,",2013,2013-06-01,bronzegods,aaaguirre,Physical,BOOK,...,0,0,"fantasy fiction, detective and mystery fiction",fantasy/sci-fi,other publisher,40,0,0.0,1,53.645467


With our clean data, we want to create a train test split. We first drop data before April 2006 and after 2023 so that our FirstYearCheckouts and PreviousYearCheckouts have a full year of data for every year. We also create and drop some columns to make processing easier later and then make a train test split.

In [54]:
# Drop data before April 2006 and after May 2023
earliest = datetime.datetime(2006, 4, 1, 0, 0, 0)
latest = datetime.datetime(2023, 5, 1, 0, 0, 0)

df = df[(df['CheckoutDate'] >= earliest) & (df['CheckoutDate'] <= latest)]

In [55]:
# Verify that the months we have left are the correct months
check_months= df.CheckoutDate.value_counts().index.sort_values()
print(check_months[0])
print(check_months[-1])

2006-04-01 00:00:00
2023-05-01 00:00:00


In [56]:
# Create columns for month and year to be used as possible features later
df['CheckoutMonth'] = df['CheckoutDate'].dt.month
df['CheckoutYear'] = df['CheckoutDate'].dt.year

In [57]:
#Remove the columns of all the months of the form MM/DD/YYYY
indices = list(range(len(df.columns)))
columns = df.columns
columns_dict = dict(zip(columns, indices))

#Find the indices of the first and last columns
first_index= columns_dict['04/01/2005']
last_index = columns_dict['04/01/2024']

dropped_months = columns[first_index:last_index+1].to_list()

dropped_columns = [*dropped_months, 'Subjects', 'CleanedSubject', 'Publisher', \
                   'CleanedTitle', 'CleanedCreator', 'PublicationYear', 'CheckoutDate']

df = df.drop(columns=dropped_columns)

In [58]:
df.head()

Unnamed: 0,Title,Creator,UsageClass,MaterialType,Genre,CleanedPublisher,FirstYearCheckouts,PreviousYearCheckoutsSum,PreviousYearCheckoutsMean,PreviousYearPublished,PreviousYearPublishedMean,CheckoutMonth,CheckoutYear
260665,Alcoholics Anonymous: Big Book (unabridged) (U...,AA,Digital,AUDIOBOOK,nonfiction,other publisher,9,0,0.0,1,61.543439,11,2009
148601,"Silver Mirrors: Apparatus Infernum Series, Book 2",A. A. Aguirre,Digital,EBOOK,fantasy/sci-fi,penguin random house,34,53,3.375,2,54.281051,5,2014
51784,Bronze Gods,A. A. Aguirre,Digital,EBOOK,fantasy/sci-fi,penguin random house,31,26,2.166667,1,58.704558,10,2013
505157,Silver mirrors / A.A. Aguirre.,"Aguirre, A. A.",Physical,BOOK,fantasy/sci-fi,other publisher,13,51,3.208333,2,53.69212,4,2014
345126,Bronze Gods / A.A. Aguirre.,"Aguirre, A. A.",Physical,BOOK,fantasy/sci-fi,other publisher,40,0,0.0,1,53.645467,6,2013


In [59]:
# Split data into 80% training data and 20% testing data
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.2,\
                                      shuffle=True, random_state=216)

In [60]:
df_train.to_csv('../../data/FirstYearCheckouts/cleaned_train_full_2005.csv', index=False)
df_test.to_csv('../../data/FirstYearCheckouts/cleaned_test_full_2005.csv', index=False)

In [61]:
print(df_train.shape)
print(df_test.shape)

(452808, 13)
(113202, 13)
