In [23]:
# import libraries and read in data
import pandas as pd
from IPython.display import display, HTML

# BookCorpus metadata, from files downloaded directly from https://yknzhu.wixsite.com/mbweb
bc_books = pd.read_csv('data/BookCorpus/books_in_bookcorpus.csv')

# BookCorpusOpen url list, downloaded from https://huggingface.co/datasets/bookcorpusopen
bcopen_books = pd.read_csv('data/BookCorpusOpen/2020-08-27-epub_urls.txt',header=None,names=['EpubLink'])

# Smashwords21, scraped from smashwords.com in April 2021
sw_books = pd.read_csv('data/Smashwords21/smashwords_april_2021.csv')

In [24]:
# merge data for clearer processing
bcopen_books['smashwords_id'] = bcopen_books.EpubLink.str.split('/',expand=True)[5]
bcopen_books['in_bcopen'] = True

sw_books['smashwords_id'] = sw_books.Link.str.split('/',expand=True)[5]
sw_books = sw_books.merge(bcopen_books, how='outer',on='smashwords_id')
sw_books.fillna(value='',inplace=True)

## General Composition

In [25]:
# How many instances (books) are there in total?
print(len(bc_books))

11040


In [26]:
# To align with the 11,038 reported by Zhu and Kiros et al., remove "adventure-all.txt" and "romance-all.txt"
extra_books = bc_books[(bc_books.fname.str.contains('adventure-all.txt')|bc_books.fname.str.contains('romance-all.txt'))]
bc_books = bc_books[~(bc_books.fname.str.contains('adventure-all.txt')|bc_books.fname.str.contains('romance-all.txt'))]

In [27]:
len(bc_books)

11038

In [28]:
# How many words in the files?
bc_books.word_count.sum()

811601031

In [29]:
# Zhu and Kiros et al. report 984,846,357, so this does not align
# We hypothesize that this is mainly due to empty text files
# e.g. 'All_I_Want_for_Christmas_Is_a_Vampire.txt' is empty in the download from https://yknzhu.wixsite.com/mbweb

# Also, if we add the words from the "extra books," the word count exceeds the original reported
extra_books.word_count.sum() + bc_books.word_count.sum()

1042409939

## Books with Duplicate Copies

In [30]:
# How many unique file names are there?
file_name_counts = pd.DataFrame(bc_books.fname.value_counts()).reset_index()
file_name_counts.columns = ['fname','n']
print(len(file_name_counts))

7185


*We confirmed each unique filename corresponded to one unique book, even in cases where word counts and disk usage differed.*

*See the datasheet for details.*

In [31]:
# How many file names occur multiple times?
print(len(file_name_counts[file_name_counts.n>1]))

2930


In [32]:
# How many books do these file names represent?
# That is, how many books are duplicates?
len(bc_books) - len(file_name_counts)

3853

In [33]:
# How many books occur once, twice, thrice, etc.?
print(file_name_counts.n.value_counts())

1    4255
2    2101
3     741
4      82
5       6
Name: n, dtype: int64


In [34]:
# What are some of the duplicate books?

# for now, remove filenames with unknown name scheme (e.g. 'u3041.txt', 'et4358.txt', 'b5096.txt')
named_dups=file_name_counts[(~file_name_counts.fname.str.match(r'(u|et|b)'))]
named_dups = named_dups[(named_dups.n>1)]

# inspect 5 random filenames occuring multiple times, inspect listings
for index, row in named_dups.sample(5).iterrows():
    print("File name: {}".format(index))
    print("{} Occurrences:".format(len(bc_books[bc_books.fname==row.fname])))
    display(HTML(bc_books[bc_books.fname==row.fname].to_html()))

File name: 2059
2 Occurrences:


Unnamed: 0,location,category,fname,word_count,disk_usage
8008,./Fantasy/Changeless.txt,Fantasy,Changeless.txt,92799,532K
8009,./Vampires/Changeless.txt,Vampires,Changeless.txt,92799,532K


File name: 2463
2 Occurrences:


Unnamed: 0,location,category,fname,word_count,disk_usage
1951,./Mystery/207086.txt,Mystery,207086.txt,31407,172K
1952,./Thriller/207086.txt,Thriller,207086.txt,31407,172K


File name: 1645
2 Occurrences:


Unnamed: 0,location,category,fname,word_count,disk_usage
6583,./Adventure/371011.txt,Adventure,371011.txt,80424,440K
6584,./Thriller/371011.txt,Thriller,371011.txt,80424,440K


File name: 1981
2 Occurrences:


Unnamed: 0,location,category,fname,word_count,disk_usage
6223,./Horror/322557.txt,Horror,322557.txt,77476,420K
6226,./Science_fiction/322557.txt,Science_fiction,322557.txt,77476,420K


File name: 889
2 Occurrences:


Unnamed: 0,location,category,fname,word_count,disk_usage
3561,./Romance/Spellbinder.txt,Romance,Spellbinder.txt,51335,284K
3562,./Vampires/Spellbinder.txt,Vampires,Spellbinder.txt,51335,284K


## Sample Comparisons

#### Genre Distribution

In [35]:
# Genre distribution in original BookCorpus
genre_df = pd.DataFrame(bc_books.category.value_counts())
genre_df.columns = ['BookCorpusN']
genre_df['BookCorpusP'] = (genre_df.BookCorpusN/genre_df.BookCorpusN.sum()*100).round(1).astype(str)+'%'
genre_df['BookCorpus'] = genre_df.BookCorpusP + ' (' + genre_df.BookCorpusN.astype(str) + ')'


# BookCorpusOpen
genre_df['BookCorpusOpenN'] = ''
for genre in genre_df.index.tolist():
    g=genre.replace('_',' ')
    genre_df.at[genre,'BookCorpusOpenN'] = len(sw_books[(sw_books.in_bcopen==True)&(sw_books.Categories.str.contains(g,case=False))])

total_bcopen = len(sw_books[(sw_books.in_bcopen==True)])
genre_df.BookCorpusOpenN = genre_df.BookCorpusOpenN.astype(int)
genre_df['BookCorpusOpenP'] = (genre_df.BookCorpusOpenN/total_bcopen*100).round(1).astype(str)+'%'
genre_df['BookCorpusOpen'] = genre_df.BookCorpusOpenP + ' (' + genre_df.BookCorpusOpenN.astype(str) + ')'



# Smashwords21
genre_df['Smashwords21'] = ''
for genre in genre_df.index.tolist():
    g=genre.replace('_',' ')
    genre_df.at[genre,'Smashwords21N'] = len(sw_books[sw_books.Categories.str.contains(g,case=False)])

    
total_smashwords = len(sw_books)
genre_df.Smashwords21N = genre_df.Smashwords21N.astype(int)
genre_df['Smashwords21P'] = (genre_df.Smashwords21N/total_smashwords*100).round(1).astype(str)+'%'
genre_df['Smashwords21'] = genre_df.Smashwords21P + ' (' + genre_df.Smashwords21N.astype(str) + ')'



to_print = genre_df[['BookCorpus','BookCorpusOpen','Smashwords21']]
display(HTML(to_print.to_html()))

Unnamed: 0,BookCorpus,BookCorpusOpen,Smashwords21
Romance,26.1% (2880),17.7% (3265),15.9% (69963)
Fantasy,13.6% (1502),16.9% (3108),10.6% (46565)
Science_fiction,7.5% (823),13.0% (2396),7.7% (33908)
New_Adult,6.9% (766),1.0% (176),0.7% (3072)
Young_Adult,6.8% (748),9.1% (1684),4.6% (20015)
Thriller,5.9% (646),6.9% (1279),5.7% (24939)
Mystery,5.6% (621),5.1% (940),4.7% (20512)
Vampires,5.4% (600),0.0% (0),0.0% (0)
Horror,4.1% (448),3.7% (687),3.8% (16904)
Teen,3.9% (430),9.2% (1688),4.6% (20165)


#### Religious Viewpoint Distribution

In [36]:
# based on recommendations by Dhamala and Sun et al. https://dl.acm.org/doi/abs/10.1145/3442188.3445924
religion_list = ['Sikhism', 'Judaism', 'Islam', 'Hinduism', 'Christianity', 'Buddhism', 'Atheism']

religion_df = pd.DataFrame(columns=['BookCorpus','BookCorpusOpen','Smashwords21'],index=religion_list)

In [37]:
for religion in religion_list:
    religion_df.at[religion,'BookCorpusOpen'] = len(sw_books[(sw_books.in_bcopen==True)&(sw_books.Categories.str.contains(religion,case=False))])
    religion_df.at[religion,'Smashwords21'] = len(sw_books[sw_books.Categories.str.contains(religion,case=False)])
    
religion_df

Unnamed: 0,BookCorpus,BookCorpusOpen,Smashwords21
Sikhism,,0,16
Judaism,,19,400
Islam,,238,1372
Hinduism,,12,273
Christianity,,161,2851
Buddhism,,33,541
Atheism,,18,183
