In [33]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings
import swifter
#!pip install swifter

# from langdetect import detect
# from langdetect.lang_detect_exception import LangDetectException
# Apply seaborn's "whitegrid" style to any figures drawn later in the notebook.
sns.set_style("whitegrid")

First we load the data and convert the 'CheckoutDate' column to a datetime dtype. Note that we drop NaN values even though there shouldn't be any: there were none when the file was written. However, a few appear when the file is read back in, so we drop them.

In [34]:
# Load the cleaned checkout records and discard every row that contains a NaN.
df = pd.read_csv('data/Checkouts_2005_cleaned_TC.csv').dropna()

# Parse the CheckoutDate strings into pandas datetimes so they sort chronologically.
df['CheckoutDate'] = pd.to_datetime(df['CheckoutDate'])

df.head()

Unnamed: 0,UsageClass,MaterialType,Checkouts,Title,Creator,Subjects,Publisher,PublicationYear,CleanedTitle,CheckoutDate,CleanedCreator
0,Physical,BOOK,3,The hole / Hiroko Oyamada ; translated from th...,"Oyamada, Hiroko, 1983-","Man woman relationships Fiction, Country life ...","New Directions Books,",2020,the hole,2023-04-01,hiroko oyamada
1,Digital,AUDIOBOOK,2,Anything But Fine (unabridged),Tobias Madden,"Romance, Young Adult Fiction, Young Adult Lite...","Recorded Books, LLC",2022,anything but fine,2023-04-01,madden tobias
2,Digital,EBOOK,2,True,Erin McCarthy,"Fiction, Literature, Romance","Penguin Group (USA), Inc.",2015,true,2023-04-01,erin mccarthy
3,Physical,BOOK,1,The ninth metal / Benjamin Percy.,"Percy, Benjamin","Comets Fiction, Mines and mineral resources Fi...","Mariner Books, Houghton Mifflin Harcourt,",2021,the ninth metal,2023-04-01,benjamin percy
4,Digital,EBOOK,2,Middlemarch (World Digital Library Edition),George Eliot,"Fiction, Literature",Barnes & Noble World Digital Library,2005,middlemarch,2023-04-01,eliot george


In [36]:
# First-of-month timestamps ('MS' = month start) from April 2005 through August 2024.
months_of_interest = pd.date_range(
    start='2005-04-01',
    end='2024-08-01',
    freq='MS',
)
print(months_of_interest)


DatetimeIndex(['2005-04-01', '2005-05-01', '2005-06-01', '2005-07-01',
               '2005-08-01', '2005-09-01', '2005-10-01', '2005-11-01',
               '2005-12-01', '2006-01-01',
               ...
               '2023-11-01', '2023-12-01', '2024-01-01', '2024-02-01',
               '2024-03-01', '2024-04-01', '2024-05-01', '2024-06-01',
               '2024-07-01', '2024-08-01'],
              dtype='datetime64[ns]', length=233, freq='MS')


In [37]:
# Column labels for the monthly checkout counts: each month start formatted
# as MM/DD/YYYY, collected into a plain Python list.
month_columns = months_of_interest.strftime('%m/%d/%Y').to_list()

We read in the partial pairs data we generated beforehand. It has the columns CleanedTitle and CleanedCreator, plus one column per month holding the corresponding checkout count.

In [38]:
# Load the previously generated table with one row per (CleanedTitle, CleanedCreator)
# pair and one checkout-count column per month.
pairs = pd.read_csv('data/cleaned_months.csv')
# Preview the first rows.
pairs.head()

Unnamed: 0,CleanedTitle,CleanedCreator,04/01/2005,05/01/2005,06/01/2005,07/01/2005,08/01/2005,09/01/2005,10/01/2005,11/01/2005,...,11/01/2023,12/01/2023,01/01/2024,02/01/2024,03/01/2024,04/01/2024,05/01/2024,06/01/2024,07/01/2024,08/01/2024
0,fancy nancy,jane o'connor,0,0,0,0,0,0,0,0,...,116,78,103,93,92,104,0,0,39,78
1,pride and prejudice,austen jane,0,0,0,0,0,2,11,13,...,413,476,473,451,473,424,0,0,364,391
2,the wind in the willows,grahame kenneth,0,0,0,0,0,2,0,1,...,35,36,32,33,35,29,0,0,27,39
3,yu-gi-oh!,kazuki takahashi,0,0,0,0,0,0,0,0,...,5,3,1,2,1,1,0,0,4,2
4,the walking dead.,kirkman robert,0,0,0,0,0,0,0,0,...,5,5,6,12,15,1,0,0,4,2


Our goal is to merge back in the other feature data we deleted (Title, Creator, Subjects, etc.). Since multiple rows of df correspond to each row in pairs, we select the data from the first checkout of each book. To do so, we sort the full dataframe df by CheckoutDate, so that when we apply our function add_full_data to each row working top down, we encounter the earliest CheckoutDate for each book first and use that row to populate the full dataframe.

In [39]:
# Order the checkout rows chronologically and renumber them 0..n-1, so that a
# top-down pass over df (via apply) meets the row with the earliest
# CheckoutDate for each (title, creator) pair in pairs first.
df = df.sort_values(by='CheckoutDate').reset_index(drop=True)
df.head()


Unnamed: 0,UsageClass,MaterialType,Checkouts,Title,Creator,Subjects,Publisher,PublicationYear,CleanedTitle,CheckoutDate,CleanedCreator
0,Physical,BOOK,3,The rosary girls / Richard Montanari.,"Montanari, Richard","Balzano Jessica Fictitious character Fiction, ...","Ballantine Books,",2005,the rosary girls,2005-04-01,montanari richard
1,Physical,BOOK,2,Secrets of the millionaire mind : mastering th...,"Eker, T. Harv","Money Psychological aspects, Millionaires Psyc...","HarperBusiness,",2005,secrets of the millionaire mind,2005-04-01,eker t
2,Physical,BOOK,7,In the company of liars / David Ellis.,"Ellis, David, 1967-",Thrillers Fiction,"G.P. Putnam's Sons,",2005,in the company of liars,2005-04-01,david ellis
3,Physical,BOOK,21,The motive / John Lescroart.,"Lescroart, John T.","Hardy Dismas Fictitious character Fiction, Soc...","Dutton,",2005,the motive,2005-04-01,john lescroart
4,Physical,BOOK,1,The rottweiler [text (large print)] / Ruth Ren...,"Rendell, Ruth, 1930-2015","Police England London Fiction, Lisson Grove Lo...","Thorndike Press,",2005,the rottweiler,2005-04-01,rendell ruth


In [40]:
# Start from every column name in df; these are the candidate features to add back in.
features = list(df.columns)
print(features)

['UsageClass', 'MaterialType', 'Checkouts', 'Title', 'Creator', 'Subjects', 'Publisher', 'PublicationYear', 'CleanedTitle', 'CheckoutDate', 'CleanedCreator']


In [41]:
# Remove the column names we already have in pairs.
# The list is rebuilt by filtering instead of calling list.remove, because
# list.remove raises ValueError once the names are gone; filtering keeps this
# cell safe to re-run.
already_in_pairs = {'CleanedTitle', 'CleanedCreator'}
features = [name for name in features if name not in already_in_pairs]

print(features)

['UsageClass', 'MaterialType', 'Checkouts', 'Title', 'Creator', 'Subjects', 'Publisher', 'PublicationYear', 'CheckoutDate']


To reduce our lookup cost, we create a column named TitleCreator_str holding the joined CleanedTitle and CleanedCreator strings in both the pairs and df dataframes. We then create a dictionary whose keys are the values in TitleCreator_str and whose values are the corresponding indices in pairs. This lets us quickly find which index in pairs corresponds to the (CleanedTitle, CleanedCreator) pair in each row of df. We use a dictionary because dictionaries are implemented as hash maps, which have O(1) average lookup time.

In [44]:
# Give both frames a composite "<CleanedTitle> , <CleanedCreator>" key column so
# a row of df can be matched to its row of pairs with one string lookup.
for frame in (pairs, df):
    frame['TitleCreator_str'] = frame['CleanedTitle'] + " , " + frame['CleanedCreator']


# Map every composite key to the index label of its row in pairs.
indices = pairs.index
vals = pairs['TitleCreator_str'].values
pairs_lookup_full = {key: position for key, position in zip(vals, indices)}

# One flag per row of pairs, addressed by that row's index label:
# 0 = not filled in yet, 1 = already filled in.
filled = [0 for _ in indices]

def add_full_data(row):
    """Return the feature values of ``row`` the first time its pair is seen.

    Intended to be applied to every row of df (``df.apply(..., axis=1)``).
    The row's ``TitleCreator_str`` is looked up in the module-level
    ``pairs_lookup_full`` dict to find the matching index in pairs, and the
    module-level ``filled`` list records which indices were already handled.

    The "first" row is the earliest checkout only if df was sorted by
    CheckoutDate beforehand; otherwise an arbitrary row of the pair is used.

    Parameters
    ----------
    row : row of df
        Must provide 'TitleCreator_str' and every column named in ``features``.

    Returns
    -------
    list or None
        ``[index_in_pairs, *row[features].values]`` on the first visit of a
        pair (its ``filled`` flag is set to 1 as a side effect); ``None`` when
        the pair was already visited.

    Raises
    ------
    KeyError
        If the row's 'TitleCreator_str' is not a key of ``pairs_lookup_full``.
    """
    # Resolve the position in pairs once and reuse it below.
    pairs_position = pairs_lookup_full[row['TitleCreator_str']]

    # Already handled by an earlier row: nothing to contribute.
    if filled[pairs_position] != 0:
        return None

    # Mark the pair as visited, then emit its index followed by the feature values.
    filled[pairs_position] = 1
    return [pairs_position, *row[features].values]
    

# Run add_full_data over every row of df. Rows whose pair was already handled
# yield None, and dropna() discards those, leaving one list per pair.
updates = df.apply(add_full_data, axis=1).dropna()
updates.head()

0    [20296, Physical, BOOK, 3, The rosary girls / ...
1    [18344, Physical, BOOK, 2, Secrets of the mill...
2    [19957, Physical, BOOK, 7, In the company of l...
3    [7830, Physical, BOOK, 21, The motive / John L...
4    [29397, Physical, BOOK, 1, The rottweiler [tex...
dtype: object

In [49]:
# Make sure pairs and updates have the same number of rows.
# The shapes are still printed for the record, but the check is now enforced:
# the cell fails loudly instead of relying on a visual comparison of the output.
print(pairs.shape)
print(updates.shape)
assert len(pairs) == len(updates), (
    f"pairs has {len(pairs)} rows but updates has {len(updates)} rows"
)

(440731, 236)
(440731,)


Now that we have the additional data in updates, we want to add it to pairs. We create a dataframe from updates with the first column labeled 'index'; this column tells us what row in pairs the data corresponds to. We use reset_index on pairs to generate a column called 'index' with the corresponding indices in pairs. We then merge on those indices via pd.merge to obtain the full dataset.

In [45]:
# Column names for the lists stored in updates: the index in pairs comes
# first, followed by the feature columns taken from df.
features_new = ['index'] + features
print(features_new)

['index', 'UsageClass', 'MaterialType', 'Checkouts', 'Title', 'Creator', 'Subjects', 'Publisher', 'PublicationYear', 'CheckoutDate']


In [50]:
# Discard the helper key column that was only needed for the lookup, then turn
# the row index of pairs into an 'index' column to merge on.
pairs = pairs.drop(columns=['TitleCreator_str']).reset_index()
pairs.head()

Unnamed: 0,index,CleanedTitle,CleanedCreator,04/01/2005,05/01/2005,06/01/2005,07/01/2005,08/01/2005,09/01/2005,10/01/2005,...,11/01/2023,12/01/2023,01/01/2024,02/01/2024,03/01/2024,04/01/2024,05/01/2024,06/01/2024,07/01/2024,08/01/2024
0,0,fancy nancy,jane o'connor,0,0,0,0,0,0,0,...,116,78,103,93,92,104,0,0,39,78
1,1,pride and prejudice,austen jane,0,0,0,0,0,2,11,...,413,476,473,451,473,424,0,0,364,391
2,2,the wind in the willows,grahame kenneth,0,0,0,0,0,2,0,...,35,36,32,33,35,29,0,0,27,39
3,3,yu-gi-oh!,kazuki takahashi,0,0,0,0,0,0,0,...,5,3,1,2,1,1,0,0,4,2
4,4,the walking dead.,kirkman robert,0,0,0,0,0,0,0,...,5,5,6,12,15,1,0,0,4,2


In [52]:
# Expand the Series of per-pair lists into a dataframe with one column per
# entry of features_new, ready to be joined with pairs.
updates_df = pd.DataFrame(data=updates.to_list(), columns=features_new)
updates_df.head()

Unnamed: 0,index,UsageClass,MaterialType,Checkouts,Title,Creator,Subjects,Publisher,PublicationYear,CheckoutDate
0,20296,Physical,BOOK,3,The rosary girls / Richard Montanari.,"Montanari, Richard","Balzano Jessica Fictitious character Fiction, ...","Ballantine Books,",2005,2005-04-01
1,18344,Physical,BOOK,2,Secrets of the millionaire mind : mastering th...,"Eker, T. Harv","Money Psychological aspects, Millionaires Psyc...","HarperBusiness,",2005,2005-04-01
2,19957,Physical,BOOK,7,In the company of liars / David Ellis.,"Ellis, David, 1967-",Thrillers Fiction,"G.P. Putnam's Sons,",2005,2005-04-01
3,7830,Physical,BOOK,21,The motive / John Lescroart.,"Lescroart, John T.","Hardy Dismas Fictitious character Fiction, Soc...","Dutton,",2005,2005-04-01
4,29397,Physical,BOOK,1,The rottweiler [text (large print)] / Ruth Ren...,"Rendell, Ruth, 1930-2015","Police England London Fiction, Lisson Grove Lo...","Thorndike Press,",2005,2005-04-01


In [53]:

#Merge the dataframes on the index column.
# The join type is spelled out (inner is pandas' default), and
# validate='one_to_one' makes pandas raise a MergeError if 'index' is
# duplicated on either side, rather than silently multiplying rows.
merged_df = pd.merge(updates_df, pairs, on='index', how='inner', validate='one_to_one')
merged_df[['Title', 'Creator', 'CleanedTitle', 'CleanedCreator']].head(50)


Unnamed: 0,Title,Creator,CleanedTitle,CleanedCreator
0,The rosary girls / Richard Montanari.,"Montanari, Richard",the rosary girls,montanari richard
1,Secrets of the millionaire mind : mastering th...,"Eker, T. Harv",secrets of the millionaire mind,eker t
2,In the company of liars / David Ellis.,"Ellis, David, 1967-",in the company of liars,david ellis
3,The motive / John Lescroart.,"Lescroart, John T.",the motive,john lescroart
4,The rottweiler [text (large print)] / Ruth Ren...,"Rendell, Ruth, 1930-2015",the rottweiler,rendell ruth
5,Shadow life : a portrait of Anne Frank and her...,"Denenberg, Barry",shadow life,barry denenberg
6,Over under / Marthe Jocelyn ; [illustrated by]...,"Jocelyn, Marthe",over under,jocelyn marthe
7,"Jimi Hendrix : the man, the magic, the truth /...","Lawrence, Sharon, 1948-",jimi hendrix,lawrence sharon
8,Cold service / Robert B. Parker.,"Parker, Robert B., 1932-2010",cold service,parker robert
9,Eleanor Rigby : a novel / Douglas Coupland.,"Coupland, Douglas",eleanor rigby,coupland douglas


In [60]:
# The merge key ('index') and the Checkouts value carried over from df are not
# needed in the final table, so both columns are dropped.
merged_df = merged_df.drop(['index', 'Checkouts'], axis='columns')
merged_df.head()

Unnamed: 0,UsageClass,MaterialType,Title,Creator,Subjects,Publisher,PublicationYear,CheckoutDate,CleanedTitle,CleanedCreator,...,11/01/2023,12/01/2023,01/01/2024,02/01/2024,03/01/2024,04/01/2024,05/01/2024,06/01/2024,07/01/2024,08/01/2024
0,Physical,BOOK,The rosary girls / Richard Montanari.,"Montanari, Richard","Balzano Jessica Fictitious character Fiction, ...","Ballantine Books,",2005,2005-04-01,the rosary girls,montanari richard,...,0,0,1,0,1,0,0,0,0,1
1,Physical,BOOK,Secrets of the millionaire mind : mastering th...,"Eker, T. Harv","Money Psychological aspects, Millionaires Psyc...","HarperBusiness,",2005,2005-04-01,secrets of the millionaire mind,eker t,...,2,0,1,1,2,1,0,0,0,2
2,Physical,BOOK,In the company of liars / David Ellis.,"Ellis, David, 1967-",Thrillers Fiction,"G.P. Putnam's Sons,",2005,2005-04-01,in the company of liars,david ellis,...,0,0,2,1,1,0,0,0,0,1
3,Physical,BOOK,The motive / John Lescroart.,"Lescroart, John T.","Hardy Dismas Fictitious character Fiction, Soc...","Dutton,",2005,2005-04-01,the motive,john lescroart,...,1,1,3,2,0,0,0,0,1,0
4,Physical,BOOK,The rottweiler [text (large print)] / Ruth Ren...,"Rendell, Ruth, 1930-2015","Police England London Fiction, Lisson Grove Lo...","Thorndike Press,",2005,2005-04-01,the rottweiler,rendell ruth,...,0,1,0,0,1,0,0,0,0,1


In [61]:
# Persist the merged table. to_csv is called with its defaults, so the
# dataframe's row index is written as an unnamed first column.
# NOTE(review): pass index=False if readers of this file should not get an
# extra "Unnamed: 0" column — confirm against downstream consumers first.
merged_df.to_csv('data/cleaned_months_full.csv')