In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings
sns.set_style("whitegrid")

In [76]:
# Load the cleaned training data and preview its first five rows
df_train = pd.read_csv(filepath_or_buffer='data/cleaned_train_2005.csv')
df_train.head(n=5)

Unnamed: 0,Title,Creator,Publisher,UsageClass,MaterialType,Genre,FirstYearCheckouts,PreviousYearCheckouts,CheckoutMonth,CheckoutYear
0,Whose poop is that? / Darrin Lunde ; illustrat...,"Lunde, Darrin P.","Charlesbridge,",Physical,BOOK,juvenile,59,124,2,2017
1,The Power of Language: How the Codes We Use to...,Viorica Marian,"Penguin Group (USA), Inc.",Digital,EBOOK,nonfiction,57,0,4,2023
2,If I Were Your Woman,Donna Hill,"Harlequin Enterprises, Ltd.",Digital,EBOOK,romance,3,47,3,2008
3,Where are the Great Pyramids? / by Dorothy and...,"Hoobler, Dorothy","Grosset & Dunlap, an imprint of Penguin Random...",Physical,BOOK,juvenile,24,19,11,2015
4,Donde cantan las ballenas / Sara Jaramillo Kli...,"Jaramillo Klinkert, Sara","Lumen,",Physical,BOOK,fiction,1,0,9,2021


## Baseline Model: 

For our baseline model, we predict the number of first year checkouts for a library item as the average of all the first year checkout data. 

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error

In [9]:
## 5-fold splitter for the training dataset; shuffling uses a fixed seed so the folds are repeatable
kfold = KFold(n_splits=5, shuffle=True, random_state=216)

In [None]:
## Baseline cross-validation: each holdout row is predicted as the mean
## FirstYearCheckouts of the matching training fold.
rmses = np.zeros(5)

for i, (train_index, test_index) in enumerate(kfold.split(df_train, df_train['FirstYearCheckouts'])):
    # Rows used to estimate the mean, and rows held out for scoring
    X_tt = df_train.iloc[train_index]
    X_ho = df_train.iloc[test_index]

    # Constant prediction vector with one entry per holdout row
    baseline_pred = np.full(len(X_ho), X_tt['FirstYearCheckouts'].mean())

    # RMSE of the constant prediction on this fold
    rmses[i] = root_mean_squared_error(X_ho['FirstYearCheckouts'], baseline_pred)

In [16]:
# Per-fold RMSE values followed by their mean, one per line
print(rmses, rmses.mean(), sep='\n')

[120.72933696 120.74623903 114.23789139 115.63958754 111.41788744]
116.55418847177882


In [None]:
# Mean and range of the target over the whole training frame, for comparison with the RMSE values
first_year_checkouts = df_train['FirstYearCheckouts']
print('The average number of first year checkouts for the whole dataset is', first_year_checkouts.mean())
print('The minimum and maximum first year checkouts in our dataset are', first_year_checkouts.min(),
      'and', first_year_checkouts.max(), 'respectively')

The average number of first year checkouts for the whole dataset is 44.82706356778149
The minimum and maximum first year checkouts in our dataset are 1 and 9633 respectively


## First Modeling Attempt: Linear Regression

For our first model, we will consider linear regression. We will train on most of the features, including Publisher, which we first clean and then one-hot encode. 

In [77]:
def clean_publisher(text):
    """Normalize a raw publisher string into a compact lowercase key.

    Every character that is not a letter, digit or underscore is removed.
    This includes whitespace and punctuation, so 'Random House, Inc.'
    becomes 'randomhouseinc'. The result is then lowercased. The keyword
    matching in classify_publishers relies on whitespace being removed
    (e.g. 'booksontape', 'recordedbooks').

    Parameters
    ----------
    text : str or missing value
        Raw publisher name.

    Returns
    -------
    str, or the input unchanged when it is not a string
        Non-string input (such as NaN for a missing publisher) is passed
        through untouched instead of raising a TypeError, so that
        classify_publishers can handle it with its own missing-value check.
    """
    if not isinstance(text, str):
        return text
    # No strip() is needed afterwards: whitespace is never matched by \w,
    # so it has already been removed by the substitution.
    return re.sub(r'[^\w]', '', text).lower()

In [78]:
# Normalize every publisher name, then count how often each normalized name occurs
df_train['CleanedPublisher'] = df_train['Publisher'].apply(clean_publisher)
pubs = (
    df_train['CleanedPublisher']
    .value_counts()
    .reset_index()
)
pubs.head()

Unnamed: 0,CleanedPublisher,count
0,randomhouseinc,24592
1,harpercollinspublishersinc,18052
2,penguingroupusainc,18031
3,booksontape,11592
4,macmillanpublishers,11230


In [79]:
# Each pair is (substring searched for in the cleaned publisher column,
#               publisher category that substring is mapped to).
_publisher_keyword_pairs = [
    ('tantor', 'recorded books'),
    ('penguin', 'penguin random house'),
    ('randomhouse', 'penguin random house'),
    ('harpercollins', 'harpercollins'),
    ('harper', 'harpercollins'),
    ('booksontape', 'penguin random house'),
    ('listeninglibrary', 'penguin random house'),
    ('schuster', 'simon & schuster'),
    ('blackstone', 'blackstone'),
    ('hachette', 'hachette'),
    ('scholastic', 'scholastic'),
    ('harlequin', 'harlequin'),
    ('macmillan', 'macmillan'),
    ('mifflin', 'harpercollins'),
    ('brilliance', 'brilliance'),
    ('lightningsource', 'lightning source'),
    ('recordedbooks', 'recorded books'),
]

# Substrings we look for in the cleaned publisher column
publishers = [keyword for keyword, _ in _publisher_keyword_pairs]

# Publisher category for each entry of `publishers`, in the same order
publishers_mapped = [category for _, category in _publisher_keyword_pairs]

# The final list of publisher categories used
publishers_final = [
    'recorded books',
    'penguin random house',
    'harpercollins',
    'simon & schuster',
    'blackstone',
    'scholastic',
    'macmillan',
    'hachette',
    'harlequin',
    'brilliance',
    'lightning source',
    'other publisher',
]

In [80]:
# Lookup from each search substring in `publishers` to the publisher
# category it stands for
pubs_final_dict = {keyword: category for keyword, category in zip(publishers, publishers_mapped)}

# Rank of each publisher category, given by its position in publishers_final
priority = {category: rank for rank, category in enumerate(publishers_final)}

# Function to classify a publisher based on the CleanedPublisher column
def classify_publishers(cleaned_pubs):
    """Map a cleaned publisher string to a publisher category.

    Each substring in `publishers` that occurs in `cleaned_pubs` contributes
    its category from `pubs_final_dict`. When several categories match, the
    one with the smallest rank in `priority` is returned.

    Parameters
    ----------
    cleaned_pubs : str or missing value
        Normalized publisher name.

    Returns
    -------
    str
        The matched publisher category, or 'other publisher' when the value
        is missing or no known substring matches.
    """
    # A missing publisher gets the same catch-all label as an unmatched one.
    # Returning 'other' here would create a publisher category whose name
    # collides with the 'other' genre column after one-hot encoding.
    if pd.isna(cleaned_pubs):
        return 'other publisher'

    # Categories of all known substrings found in the cleaned name
    found_pubs = [pubs_final_dict[pubs] for pubs in publishers if pubs in cleaned_pubs]

    if not found_pubs:  # If no publisher is found
        return 'other publisher'

    # Smallest rank wins
    return min(found_pubs, key=priority.__getitem__)

# Replace each cleaned publisher name by its publisher category, then
# discard the raw 'Publisher' column
publisher_category = df_train['CleanedPublisher'].apply(classify_publishers)
df_train['CleanedPublisher'] = publisher_category
df_train = df_train.drop(columns='Publisher')

In [36]:
from sklearn.linear_model import LinearRegression

In [81]:
# One-hot encode 'Genre' and 'MaterialType'.
#
# The indicator columns keep the names that get_dummies gives them.
# Assigning `df_train[names_from_unique] = pd.get_dummies(...)` matches the
# columns by position: .unique() lists categories in order of first
# appearance while get_dummies sorts them, so indicators would be stored
# under the wrong category names.
genre_dummies = pd.get_dummies(df_train['Genre'])
material_dummies = pd.get_dummies(df_train['MaterialType'])

# Category names, in the same order as the indicator columns
genre_list = genre_dummies.columns.tolist()
material_list = material_dummies.columns.tolist()

# One hot encode UsageClass into a single column with 1 indicating 'Physical' and 0 indicating 'Digital'
df_train['UsageClass'] = pd.get_dummies(df_train['UsageClass'])['Physical']

# Swap the raw categorical columns for their indicator columns
df_train = pd.concat(
    [df_train.drop(columns=['Genre', 'MaterialType']), genre_dummies, material_dummies],
    axis=1,
)

# A genre or material name equal to an existing column would create duplicates
assert df_train.columns.is_unique, 'one-hot column name collides with an existing column'

In [82]:
# Inspect the column labels of df_train
df_train.columns

Index(['Title', 'Creator', 'UsageClass', 'FirstYearCheckouts',
       'PreviousYearCheckouts', 'CheckoutMonth', 'CheckoutYear',
       'CleanedPublisher', 'juvenile', 'nonfiction', 'romance', 'fiction',
       'mystery', 'other', 'horror/thriller', 'history', 'biography',
       'fantasy/sci-fi', 'young adult', 'BOOK', 'EBOOK', 'AUDIOBOOK',
       'SOUNDDISC', 'OTHER', 'VIDEODISC'],
      dtype='object')

In [83]:
# Predictor columns for the linear regression without publisher information
features = (
    # usage class and numeric checkout columns
    ['UsageClass', 'PreviousYearCheckouts', 'CheckoutMonth', 'CheckoutYear']
    # genre columns
    + ['juvenile', 'nonfiction', 'romance', 'fiction', 'mystery', 'other',
       'horror/thriller', 'history', 'biography', 'fantasy/sci-fi', 'young adult']
    # material type columns
    + ['BOOK', 'EBOOK', 'AUDIOBOOK', 'SOUNDDISC', 'OTHER', 'VIDEODISC']
)

In [84]:
# 5-fold splitter with seeded shuffling (same settings as the baseline split)
kfold = KFold(n_splits=5, shuffle=True, random_state=216)

In [85]:
# Array to store one RMSE per fold.
# The loop below writes into `rmses`, so that is the array allocated here.
# Allocating `rmse` instead left the loop dependent on a `rmses` array
# created by an earlier cell.
rmses = np.zeros(kfold.get_n_splits())

# Initialize LinearRegression Model (refit from scratch on every fold)
lr = LinearRegression()

for i, (train_index, test_index) in enumerate(kfold.split(df_train, df_train['FirstYearCheckouts'])):

    ## get the kfold training and holdout data
    X_tt = df_train.iloc[train_index]
    X_ho = df_train.iloc[test_index]

    ## Fit model on the training fold only
    lr.fit(X_tt[features], X_tt['FirstYearCheckouts'])

    ## Generate predictions on the holdout set
    lr_preds = lr.predict(X_ho[features])

    ## Record the holdout RMSE for this fold
    rmses[i] = root_mean_squared_error(X_ho['FirstYearCheckouts'], lr_preds)

In [86]:
# Per-fold RMSE values currently stored in rmses
print(rmses)

[118.70998867 118.79886248 112.47832067 113.86938046 109.57269062]


In [87]:
# Mean of the per-fold RMSE values
print(rmses.mean())

114.68584857752083


The analysis above did not use Publisher as a feature; we now repeat the same modeling with the one-hot encoded publisher categories added as features.

In [88]:
# One-hot encode 'CleanedPublisher'.
#
# The indicator columns keep the names that get_dummies gives them.
# Assigning `df_train[names_from_unique] = pd.get_dummies(...)` matches the
# columns by position: .unique() lists categories in order of first
# appearance while get_dummies sorts them, so indicators would be stored
# under the wrong publisher names.
publisher_dummies = pd.get_dummies(df_train['CleanedPublisher'])

# Publisher category names, in the same order as the indicator columns
publisher_list = publisher_dummies.columns.tolist()

# Swap the categorical column for its indicator columns
df_train = pd.concat(
    [df_train.drop(columns=['CleanedPublisher']), publisher_dummies],
    axis=1,
)

# A publisher category equal to an existing column name would create duplicates
assert df_train.columns.is_unique, 'publisher column name collides with an existing column'

In [90]:
# Predictor columns for the linear regression that also uses publisher categories
new_features = (
    # usage class and numeric checkout columns
    ['UsageClass', 'PreviousYearCheckouts', 'CheckoutMonth', 'CheckoutYear']
    # genre columns
    + ['juvenile', 'nonfiction', 'romance', 'fiction', 'mystery', 'other',
       'horror/thriller', 'history', 'biography', 'fantasy/sci-fi', 'young adult']
    # material type columns
    + ['BOOK', 'EBOOK', 'AUDIOBOOK', 'SOUNDDISC', 'OTHER', 'VIDEODISC']
    # publisher category columns
    + ['other publisher', 'penguin random house', 'harlequin', 'blackstone',
       'harpercollins', 'brilliance', 'macmillan', 'simon & schuster',
       'recorded books', 'hachette', 'lightning source', 'scholastic']
)

In [93]:
# Kfold split (same settings as before, so the folds are comparable)
kfold = KFold(n_splits = 5,
              shuffle = True,
              random_state = 216)

# Array to store one RMSE per fold.
# The loop below writes into `rmses`, so that is the array allocated here.
# Allocating `rmse` instead left the loop dependent on a `rmses` array
# created by an earlier cell.
rmses = np.zeros(kfold.get_n_splits())

# Initialize LinearRegression Model (refit from scratch on every fold)
lr = LinearRegression()

for i, (train_index, test_index) in enumerate(kfold.split(df_train, df_train['FirstYearCheckouts'])):

    ## get the kfold training and holdout data
    X_tt = df_train.iloc[train_index]
    X_ho = df_train.iloc[test_index]

    ## Fit model on the training fold only
    lr.fit(X_tt[new_features], X_tt['FirstYearCheckouts'])

    ## Generate predictions on the holdout set
    lr_preds = lr.predict(X_ho[new_features])

    ## Record the holdout RMSE for this fold
    rmses[i] = root_mean_squared_error(X_ho['FirstYearCheckouts'], lr_preds)


In [94]:
# Per-fold RMSE values followed by their mean, one per line
print(rmses, rmses.mean(), sep='\n')

[118.38257533 118.47005855 112.17974951 113.56904321 109.25002301]
114.37028992054891
