In [1]:
# Import libraries and read in the three book listings used in this notebook
from pathlib import Path

import pandas as pd
from IPython.display import display, HTML

# Every input file is read from below this directory; edit here to relocate the data
DATA_DIR = Path('data')

# BookCorpus, from directory downloaded from https://yknzhu.wixsite.com/mbweb
bc_books = pd.read_csv(DATA_DIR / 'BookCorpus' / 'books_in_bookcorpus.csv')

# BookCorpusOpen, downloaded from https://huggingface.co/datasets/bookcorpusopen
# Read as a headerless file into a single column named 'EpubLink'
bcopen_books = pd.read_csv(
    DATA_DIR / 'BookCorpusOpen' / '2020-08-27-epub_urls.txt',
    header=None,
    names=['EpubLink'],
)

# Smashwords, scraped from smashwords.com in April 2021
sw_thru14 = pd.read_csv(DATA_DIR / 'Smashwords' / 'smashwords_thru_2014.csv')
sw_after14 = pd.read_csv(DATA_DIR / 'Smashwords' / 'smashwords_after_2014.csv')

# ignore_index gives the stacked frame fresh 0..n-1 row labels; without it the
# labels of both files are kept and repeat, which makes label-based lookups ambiguous
sw_books = pd.concat([sw_thru14, sw_after14], ignore_index=True)

In [2]:
# Merge the BookCorpusOpen URL list into the Smashwords table for clearer processing
def _sixth_url_segment(urls):
    """Return the sixth '/'-separated piece of every string in `urls`."""
    return urls.str.split('/', expand=True)[5]


# Both tables are keyed on the same URL segment (stored as 'smashwords_id');
# every row of the BookCorpusOpen list is flagged before merging
bcopen_books['smashwords_id'] = _sixth_url_segment(bcopen_books.EpubLink)
bcopen_books['in_bcopen'] = True
sw_books['smashwords_id'] = _sixth_url_segment(sw_books.Link)

# The outer join keeps rows present in only one of the two tables; the gaps this
# leaves (including 'in_bcopen' for unmatched Smashwords rows) become empty strings
sw_books = pd.merge(sw_books, bcopen_books, on='smashwords_id', how='outer').fillna(value='')

## General Composition

In [3]:
# Total number of instances (book listings) in the BookCorpus file list
print(bc_books.shape[0])

11040


In [4]:
# To align with the 11,038 reported by Zhu and Kiros et al., remove "adventure-all.txt" and "romance-all.txt"
# The mask is built once and reused; regex=False makes '.' match a literal dot
# instead of acting as the regex "any character" wildcard.
is_extra = (
    bc_books.fname.str.contains('adventure-all.txt', regex=False)
    | bc_books.fname.str.contains('romance-all.txt', regex=False)
)

# NOTE: bc_books is overwritten below, so re-running this cell without first
# re-running the load cell leaves extra_books empty.
extra_books = bc_books[is_extra]
bc_books = bc_books[~is_extra]

In [5]:
# Number of listings left after setting the two "-all.txt" files aside
bc_books.shape[0]

11038

In [6]:
# Count the unique books, using filename + word count + disk usage
# as the fingerprint that identifies an entry
fingerprint_cols = ['fname', 'word_count', 'disk_usage']

# True for every listing whose fingerprint already appeared in an earlier row
is_repeat = bc_books.duplicated(subset=fingerprint_cols)
bc_books_unique = bc_books[~is_repeat]
bc_books_unique.shape[0]

7189

In [7]:
# Total of the word_count column over all remaining listings
bc_books['word_count'].sum()

811601031

In [8]:
# Zhu and Kiros et al. report 984,846,357 words, which the total above does not match.
# Working hypothesis: the gap comes from empty text files,
# e.g. 'All_I_Want_for_Christmas_Is_a_Vampire.txt' is empty in the download from https://yknzhu.wixsite.com/mbweb

# Adding the words of the "extra books" back in overshoots the originally reported figure
sum(frame.word_count.sum() for frame in (extra_books, bc_books))

1042409939

## Duplicate Listings

In [9]:
# Number of listings that may be duplicates: rows dropped when de-duplicating
# on the filename + word count + disk usage fingerprint
bc_books.shape[0] - bc_books_unique.shape[0]

3849

In [10]:
# Tally how often each filename occurs and keep the ones listed more than once
fname_counts = bc_books.fname.value_counts()
potential_dups = fname_counts[fname_counts > 1].to_frame('n')

# Number of distinct filenames occurring multiple times
print(potential_dups.shape[0])

2930


In [11]:
# For now, set aside filenames with an unknown naming scheme (e.g. 'u3041.txt', 'et4358.txt', 'b5096.txt').
# The pattern requires prefix + digits + '.txt', so ordinary titles that merely
# begin with 'u', 'et' or 'b' are kept.
unknown_scheme = potential_dups.index.str.match(r'(?:u|et|b)\d+\.txt$')
named_dups = potential_dups[~unknown_scheme]

# Inspect up to 5 random filenames occurring multiple times, together with their listings.
# random_state is fixed so that Restart & Run All shows the same examples each time;
# the min() guard avoids an error when fewer than 5 filenames remain.
n_examples = min(5, len(named_dups))
for fname in named_dups.sample(n_examples, random_state=0).index:
    listings = bc_books[bc_books.fname == fname]
    print("File name: {}".format(fname))
    print("{} Occurrences:".format(len(listings)))
    display(HTML(listings.to_html()))

File name: 347487.txt
2 Occurrences:


Unnamed: 0,location,category,fname,word_count,disk_usage
2828,./Humor/347487.txt,Humor,347487.txt,42903,236K
2829,./Teen/347487.txt,Teen,347487.txt,42903,236K


File name: 20201.txt
2 Occurrences:


Unnamed: 0,location,category,fname,word_count,disk_usage
9904,./Horror/20201.txt,Horror,20201.txt,120324,668K
9905,./Mystery/20201.txt,Mystery,20201.txt,120324,668K


File name: Ruby_Red_Trilogy-1.txt
3 Occurrences:


Unnamed: 0,location,category,fname,word_count,disk_usage
5779,./Fantasy/Ruby_Red_Trilogy-1.txt,Fantasy,Ruby_Red_Trilogy-1.txt,73422,400K
5780,./Romance/Ruby_Red_Trilogy-1.txt,Romance,Ruby_Red_Trilogy-1.txt,73422,400K
5781,./Young_Adult/Ruby_Red_Trilogy-1.txt,Young_Adult,Ruby_Red_Trilogy-1.txt,73422,400K


File name: 486890.txt
2 Occurrences:


Unnamed: 0,location,category,fname,word_count,disk_usage
2413,./Fantasy/486890.txt,Fantasy,486890.txt,37493,192K
2414,./Teen/486890.txt,Teen,486890.txt,37493,192K


File name: 488068.txt
2 Occurrences:


Unnamed: 0,location,category,fname,word_count,disk_usage
9860,./Fantasy/488068.txt,Fantasy,488068.txt,118992,632K
9861,./Teen/488068.txt,Teen,488068.txt,118992,632K


## Sample Comparisons

#### Genre Distribution

In [12]:
def count_category_matches(books, genres):
    """Count, for each genre, the rows of `books` whose Categories text mentions it.

    Underscores in a genre name are replaced by spaces before matching. The match
    is a case-insensitive, literal (non-regex) substring test, so a single row
    can count toward several genres. Missing Categories values count as no match.

    Returns an integer Series indexed by the genre names as given.
    """
    categories = books.Categories
    return pd.Series(
        {
            genre: int(
                categories.str.contains(
                    genre.replace('_', ' '), case=False, regex=False, na=False
                ).sum()
            )
            for genre in genres
        },
        dtype=int,
    )


def format_share(counts, total):
    """Return (percent strings, 'percent (count)' strings) for `counts` out of `total`."""
    percent = (counts / total * 100).round(1).astype(str) + '%'
    return percent, percent + ' (' + counts.astype(str) + ')'


# Genre distribution in original BookCorpus (one row per category, most frequent first)
genre_df = bc_books.category.value_counts().to_frame('BookCorpusN')
bc_percent, bc_label = format_share(genre_df.BookCorpusN, genre_df.BookCorpusN.sum())
genre_df['BookCorpusP'] = bc_percent
genre_df['BookCorpus'] = bc_label


# BookCorpusOpen: restrict to rows flagged as belonging to BookCorpusOpen
bcopen_only = sw_books[sw_books.in_bcopen == True]
total_bcopen = len(bcopen_only)
genre_df['BookCorpusOpenN'] = count_category_matches(bcopen_only, genre_df.index)
bcopen_percent, bcopen_label = format_share(genre_df.BookCorpusOpenN, total_bcopen)
genre_df['BookCorpusOpenP'] = bcopen_percent
genre_df['BookCorpusOpen'] = bcopen_label


# Smashwords21
# NOTE(review): sw_books comes from an outer merge with the BookCorpusOpen list, so
# this total may include rows that exist only in BookCorpusOpen -- confirm intended.
total_smashwords = len(sw_books)
genre_df['Smashwords21N'] = count_category_matches(sw_books, genre_df.index)
sw_percent, sw_label = format_share(genre_df.Smashwords21N, total_smashwords)
genre_df['Smashwords21P'] = sw_percent
genre_df['Smashwords21'] = sw_label


# Show only the formatted "percent (count)" columns
to_print = genre_df[['BookCorpus','BookCorpusOpen','Smashwords21']]
display(HTML(to_print.to_html()))

Unnamed: 0,BookCorpus,BookCorpusOpen,Smashwords21
Romance,26.1% (2880),18.1% (3266),16.2% (62775)
Fantasy,13.6% (1502),17.2% (3107),10.7% (41460)
Science_fiction,7.5% (823),13.2% (2385),7.8% (30421)
New_Adult,6.9% (766),1.0% (178),0.7% (2745)
Young_Adult,6.8% (748),9.4% (1693),4.6% (17953)
Thriller,5.9% (646),7.1% (1283),5.7% (22078)
Mystery,5.6% (621),5.2% (935),4.7% (18258)
Vampires,5.4% (600),0.0% (0),0.0% (0)
Horror,4.1% (448),3.8% (686),3.9% (15070)
Teen,3.9% (430),9.4% (1697),4.7% (18083)


#### Religious Viewpoint Distribution

In [13]:
# Religion terms follow the recommendations by Dhamala and Sun et al. https://dl.acm.org/doi/abs/10.1145/3442188.3445924
religion_list = [
    'Sikhism',
    'Judaism',
    'Islam',
    'Hinduism',
    'Christianity',
    'Buddhism',
    'Atheism',
]

# Empty table with one row per religion and one column per dataset
corpus_names = ['BookCorpus', 'BookCorpusOpen', 'Smashwords21']
religion_df = pd.DataFrame(index=religion_list, columns=corpus_names)

In [14]:
# Rows flagged as BookCorpusOpen; the same for every religion, so computed once
from_bcopen = sw_books.in_bcopen == True

for religion in religion_list:
    # Case-insensitive test of whether the Categories text mentions this religion
    mentions = sw_books.Categories.str.contains(religion, case=False)
    religion_df.at[religion, 'BookCorpusOpen'] = sw_books.loc[from_bcopen & mentions].shape[0]
    religion_df.at[religion, 'Smashwords21'] = sw_books.loc[mentions].shape[0]

# The 'BookCorpus' column is not filled in by this cell
religion_df

Unnamed: 0,BookCorpus,BookCorpusOpen,Smashwords21
Sikhism,,0,15
Judaism,,18,348
Islam,,236,1263
Hinduism,,11,244
Christianity,,162,2544
Buddhism,,32,485
Atheism,,18,163
