In [19]:
# import libraries and read in data
import pandas as pd
from IPython.display import display, HTML

# BookCorpus metadata, from files downloaded directly from https://yknzhu.wixsite.com/mbweb
bc_books = pd.read_csv('data/BookCorpus/books_in_bookcorpus.csv')

# BookCorpusOpen url list, downloaded from https://huggingface.co/datasets/bookcorpusopen
bcopen_books = pd.read_csv('data/BookCorpusOpen/2020-08-27-epub_urls.txt',header=None,names=['EpubLink'])

# Smashwords21, scraped from smashwords.com in April 2021
sw_books = pd.read_csv('data/Smashwords21/smashwords_april_2021.csv')

In [20]:
# merge Smashwords21 with BookCorpusOpen
bcopen_books['smashwords_id'] = bcopen_books.EpubLink.str.split('/',expand=True)[5]
bcopen_books['in_bcopen'] = True

sw_books['smashwords_id'] = sw_books.Link.str.split('/',expand=True)[5]
sw_books = sw_books.merge(bcopen_books, how='outer',on='smashwords_id')
sw_books.fillna(value='',inplace=True)

In [21]:
# partially merge Smashwords21 with original BookCorpus
bc_books['smashwords_id'] = bc_books.fname.str.replace('.txt','')
bc_books['in_bc_books'] = True
sw_books = sw_books.merge(bc_books, how='left',on='smashwords_id')

In [22]:
# How many books from BookCorpus were found in Smashwords21 using only the smashwords_id?
len(sw_books[sw_books.in_bc_books==True])

2743

In [23]:
# Check if everything looks okay
sw_books[sw_books.in_bc_books==True].sample(5)

Unnamed: 0,Link,Title,Author,Price,Words,Language,Published,Categories,smashwords_id,EpubLink,in_bcopen,location,category,fname,word_count,disk_usage,in_bc_books
409442,https://www.smashwords.com/books/view/367930,The Two Schillings,Anthony O'Brian,$4.99 USD,183680,English,"October 15, 2013","Fiction » Romance » Historical , Fiction » His...",367930,,,./Historical/367930.txt,Historical,367930.txt,183688.0,980K,True
2361,https://www.smashwords.com/books/view/333133,AMP Messenger,Stephen Arseneault,$0.00 USD,77330,English,"July 4, 2013","Fiction » Science fiction » Adventure , Fictio...",333133,https://www.smashwords.com/books/download/3331...,True,./Science_fiction/333133.txt,Science_fiction,333133.txt,76839.0,412K,True
35240,https://www.smashwords.com/books/view/478928,Dead Echo,C.G. Banks,$0.00 USD,154110,English,"September 22, 2014",Fiction » Horror » Occult,478928,https://www.smashwords.com/books/download/4789...,True,./Horror/478928.txt,Horror,478928.txt,154113.0,836K,True
37520,https://www.smashwords.com/books/view/445348,Dreamweavers: Awakening,P J G Robbins,$0.00 USD,102130,English,"June 5, 2014",Fiction » Young adult or teen » Adventure,445348,https://www.smashwords.com/books/download/4453...,True,./Teen/445348.txt,Teen,445348.txt,102135.0,568K,True
21462,https://www.smashwords.com/books/view/346708,Dead Ringers: Volumes 1-3,Darlene Gardner,$0.00 USD,78650,English,"August 12, 2013","Fiction » Young adult or teen » Paranormal , F...",346708,https://www.smashwords.com/books/download/3467...,True,./Romance/346708.txt,Romance,346708.txt,78824.0,440K,True


# Stolen Books?

In [24]:
# How many books are in BookCorpus that now cost money?
# (Lower bound since metadata is missing for lots of BookCorpus books)
len(sw_books[(sw_books.in_bc_books==True)&(sw_books.Price.str.contains('USD'))&(sw_books.Price!='$0.00 USD')])

438

In [25]:
# What are some books that are in BookCorpus that now cost money?
sw_books[(sw_books.in_bc_books==True)&(sw_books.Price.str.contains('USD'))&(sw_books.Price!='$0.00 USD')].sample(5)

Unnamed: 0,Link,Title,Author,Price,Words,Language,Published,Categories,smashwords_id,EpubLink,in_bcopen,location,category,fname,word_count,disk_usage,in_bc_books
122891,https://www.smashwords.com/books/view/32286,Drums: a Novel,Brad Henderson,$4.95 USD,58140,English,"December 4, 2010","Fiction » Literature » Literary , Fiction » Co...",32286,,,./Literature/32286.txt,Literature,32286.txt,58145.0,328K,True
251917,https://www.smashwords.com/books/view/419395,"Deviation, Breaking the Pattern #1",P.D. Workman,$4.99 USD,57680,English,"March 15, 2014",Fiction » Young adult or teen » Social Issues ...,419395,,,./Teen/419395.txt,Teen,419395.txt,57680.0,332K,True
243179,https://www.smashwords.com/books/view/513563,"The Hunter, The Dragon And The Smokey Mountain...",M. Modak,$0.99 USD,95360,English,"January 23, 2015","Fiction » Science fiction » Adventure , Fictio...",513563,,,./Science_fiction/513563.txt,Science_fiction,513563.txt,95348.0,504K,True
140917,https://www.smashwords.com/books/view/316177,Genetic Cleansing,John Pearce,$2.99 USD,22240,English,"May 14, 2013","Fiction » Science fiction » Adventure , Fictio...",316177,,,./Adventure/316177.txt,Adventure,316177.txt,22245.0,132K,True
193606,https://www.smashwords.com/books/view/517771,Unnatural Selection,Bill Langford,$2.99 USD,90730,English,"February 7, 2015","Fiction » Thriller & suspense » General , Fict...",517771,,,./Thriller/517771.txt,Thriller,517771.txt,92292.0,544K,True


## General Composition

In [26]:
# How many instances (books) are there in total?
print(len(bc_books))

11040


In [27]:
# To align with the 11,038 reported by Zhu and Kiros et al., remove "adventure-all.txt" and "romance-all.txt"
extra_books = bc_books[(bc_books.fname.str.contains('adventure-all.txt')|bc_books.fname.str.contains('romance-all.txt'))]
bc_books = bc_books[~(bc_books.fname.str.contains('adventure-all.txt')|bc_books.fname.str.contains('romance-all.txt'))]

In [28]:
len(bc_books)

11038

In [29]:
# How many words in the files?
bc_books.word_count.sum()

811601031

In [30]:
# Zhu and Kiros et al. report 984,846,357, so this does not align
# We hypothesize that this is mainly due to empty text files
# e.g. 'All_I_Want_for_Christmas_Is_a_Vampire.txt' is empty in the download from https://yknzhu.wixsite.com/mbweb

# Also, if we add the words from the "extra books," the word count exceeds the original reported
extra_books.word_count.sum() + bc_books.word_count.sum()

1042409939

## Missing or Truncated Files

In [31]:
# How many book files have no disk usage?
len(bc_books[bc_books.disk_usage=='0B'])

98

In [32]:
# What are some of the empty book files?
bc_books[bc_books.disk_usage=='0B'].sample(5)

Unnamed: 0,location,category,fname,word_count,disk_usage,smashwords_id,in_bc_books
68,./Science_fiction/Asimov42.txt,Science_fiction,Asimov42.txt,0,0B,Asimov42,True
17,./Fantasy/u4112.txt,Fantasy,u4112.txt,0,0B,u4112,True
64,./Science_fiction/Asimov38.txt,Science_fiction,Asimov38.txt,0,0B,Asimov38,True
59,./Science_fiction/Asimov32.txt,Science_fiction,Asimov32.txt,0,0B,Asimov32,True
47,./Romance/VAMPS_AND_THE_CITY.txt,Romance,VAMPS_AND_THE_CITY.txt,0,0B,VAMPS_AND_THE_CITY,True


In [33]:
# How many book files have fewer than 20,000 words (the cutoff specified in the paper)?
len(bc_books[bc_books.word_count<20000])

655

In [34]:
# How many book files have fewer than 10,000 words?
len(bc_books[bc_books.word_count<10000])

291

## Books with Duplicate Copies

In [35]:
# How many unique file names are there?
file_name_counts = pd.DataFrame(bc_books.fname.value_counts()).reset_index()
file_name_counts.columns = ['fname','n']
print(len(file_name_counts))

7185


*We confirmed each unique filename corresponded to one unique book, even in cases where word counts and disk usage differed.*

*See the datasheet for details.*

In [36]:
# How many file names occur multiple times?
print(len(file_name_counts[file_name_counts.n>1]))

2930


In [37]:
# How many books do these file names represent?
# That is, how many books are duplicates?
len(bc_books) - len(file_name_counts)

3853

In [38]:
# How many books occur once, twice, thrice, etc.?
print(file_name_counts.n.value_counts())

1    4255
2    2101
3     741
4      82
5       6
Name: n, dtype: int64


In [39]:
# What are some of the duplicate books?

# for now, remove filenames with unknown name scheme (e.g. 'u3041.txt', 'et4358.txt', 'b5096.txt')
named_dups=file_name_counts[(~file_name_counts.fname.str.match(r'(u|et|b)'))]
named_dups = named_dups[(named_dups.n>1)]

# inspect 5 random filenames occuring multiple times, inspect listings
for index, row in named_dups.sample(5).iterrows():
    print("File name: {}".format(index))
    print("{} Occurrences:".format(len(bc_books[bc_books.fname==row.fname])))
    display(HTML(bc_books[bc_books.fname==row.fname].to_html()))

File name: 2415
2 Occurrences:


Unnamed: 0,location,category,fname,word_count,disk_usage,smashwords_id,in_bc_books
1471,./Romance/346942.txt,Romance,346942.txt,26366,148K,346942,True
1472,./Teen/346942.txt,Teen,346942.txt,26366,148K,346942,True


File name: 1118
2 Occurrences:


Unnamed: 0,location,category,fname,word_count,disk_usage,smashwords_id,in_bc_books
818,./Horror/124292.txt,Horror,124292.txt,21001,112K,124292,True
819,./Thriller/124292.txt,Thriller,124292.txt,21001,112K,124292,True


File name: 617
3 Occurrences:


Unnamed: 0,location,category,fname,word_count,disk_usage,smashwords_id,in_bc_books
8619,./New_Adult/Fenbrook_Academy-2.txt,New_Adult,Fenbrook_Academy-2.txt,98607,524K,Fenbrook_Academy-2,True
8620,./Romance/Fenbrook_Academy-2.txt,Romance,Fenbrook_Academy-2.txt,98607,524K,Fenbrook_Academy-2,True
8621,./Young_Adult/Fenbrook_Academy-2.txt,Young_Adult,Fenbrook_Academy-2.txt,98607,524K,Fenbrook_Academy-2,True


File name: 1067
2 Occurrences:


Unnamed: 0,location,category,fname,word_count,disk_usage,smashwords_id,in_bc_books
9939,./Historical/296579.txt,Historical,296579.txt,121541,708K,296579,True
9940,./Literature/296579.txt,Literature,296579.txt,121541,708K,296579,True


File name: 1963
2 Occurrences:


Unnamed: 0,location,category,fname,word_count,disk_usage,smashwords_id,in_bc_books
9860,./Fantasy/488068.txt,Fantasy,488068.txt,118992,632K,488068,True
9861,./Teen/488068.txt,Teen,488068.txt,118992,632K,488068,True


## Sample Comparisons

#### Genre Distribution

In [40]:
# Genre distribution in original BookCorpus
genre_df = pd.DataFrame(bc_books.category.value_counts())
genre_df.columns = ['BookCorpusN']
genre_df['BookCorpusP'] = (genre_df.BookCorpusN/genre_df.BookCorpusN.sum()*100).round(1).astype(str)+'%'
genre_df['BookCorpus'] = genre_df.BookCorpusP + ' (' + genre_df.BookCorpusN.astype(str) + ')'


# BookCorpusOpen
genre_df['BookCorpusOpenN'] = ''
for genre in genre_df.index.tolist():
    g=genre.replace('_',' ')
    genre_df.at[genre,'BookCorpusOpenN'] = len(sw_books[(sw_books.in_bcopen==True)&(sw_books.Categories.str.contains(g,case=False))])

total_bcopen = len(sw_books[(sw_books.in_bcopen==True)])
genre_df.BookCorpusOpenN = genre_df.BookCorpusOpenN.astype(int)
genre_df['BookCorpusOpenP'] = (genre_df.BookCorpusOpenN/total_bcopen*100).round(1).astype(str)+'%'
genre_df['BookCorpusOpen'] = genre_df.BookCorpusOpenP + ' (' + genre_df.BookCorpusOpenN.astype(str) + ')'



# Smashwords21
genre_df['Smashwords21'] = ''
for genre in genre_df.index.tolist():
    g=genre.replace('_',' ')
    genre_df.at[genre,'Smashwords21N'] = len(sw_books[sw_books.Categories.str.contains(g,case=False)])

    
total_smashwords = len(sw_books)
genre_df.Smashwords21N = genre_df.Smashwords21N.astype(int)
genre_df['Smashwords21P'] = (genre_df.Smashwords21N/total_smashwords*100).round(1).astype(str)+'%'
genre_df['Smashwords21'] = genre_df.Smashwords21P + ' (' + genre_df.Smashwords21N.astype(str) + ')'



to_print = genre_df[['BookCorpus','BookCorpusOpen','Smashwords21']]
display(HTML(to_print.to_html()))

Unnamed: 0,BookCorpus,BookCorpusOpen,Smashwords21
Romance,26.1% (2880),17.7% (3333),15.9% (70049)
Fantasy,13.6% (1502),17.1% (3223),10.6% (46712)
Science_fiction,7.5% (823),13.3% (2496),7.7% (34038)
New_Adult,6.9% (766),0.9% (176),0.7% (3072)
Young_Adult,6.8% (748),9.3% (1755),4.6% (20100)
Thriller,5.9% (646),7.4% (1383),5.7% (25074)
Mystery,5.6% (621),5.3% (994),4.7% (20581)
Vampires,5.4% (600),0.0% (0),0.0% (0)
Horror,4.1% (448),3.9% (737),3.9% (16969)
Teen,3.9% (430),9.3% (1759),4.6% (20250)


#### Religious Viewpoint Distribution

In [41]:
# based on recommendations by Dhamala and Sun et al. https://dl.acm.org/doi/abs/10.1145/3442188.3445924
religion_list = ['Sikhism', 'Judaism', 'Islam', 'Hinduism', 'Christianity', 'Buddhism', 'Atheism']

religion_df = pd.DataFrame(columns=['BookCorpus','BookCorpusOpen','Smashwords21'],index=religion_list)

In [42]:
for religion in religion_list:
    religion_df.at[religion,'BookCorpusOpen'] = len(sw_books[(sw_books.in_bcopen==True)&(sw_books.Categories.str.contains(religion,case=False))])
    religion_df.at[religion,'Smashwords21'] = len(sw_books[sw_books.Categories.str.contains(religion,case=False)])
    religion_df.at[religion,'BookCorpus'] = len(sw_books[(sw_books.in_bc_books==True)&(sw_books.Categories.str.contains(religion,case=False))])

religion_df

Unnamed: 0,BookCorpus,BookCorpusOpen,Smashwords21
Sikhism,0,0,16
Judaism,0,19,400
Islam,0,238,1372
Hinduism,0,12,273
Christianity,0,161,2851
Buddhism,0,33,541
Atheism,0,18,183


In [43]:
# sample of BookCorpusOpen books about Islam
sw_books[(sw_books.in_bcopen==True)&(sw_books.Categories.str.contains('Islam',case=False))].sample(5)[['Link','Title']]

Unnamed: 0,Link,Title
56651,https://www.smashwords.com/books/view/793396,Allah's Artistry in Colour
71452,https://www.smashwords.com/books/view/839408,A Chain of Miracles
3270,https://www.smashwords.com/books/view/66020,Some Secrets of the Qur'an
104079,https://www.smashwords.com/books/view/803694,Solution the Values of the Qur'an
5234,https://www.smashwords.com/books/view/21831,The Exemplar Beyond Compare Muhammad Mustafa


In [44]:
# sample of BookCorpusOpen books about Christianity
sw_books[(sw_books.in_bcopen==True)&(sw_books.Categories.str.contains('Christianity',case=False))].sample(5)[['Link','Title']]

Unnamed: 0,Link,Title
19276,https://www.smashwords.com/books/view/11001,How To Defend The Baptism In The Holy Spirit
34208,https://www.smashwords.com/books/view/654879,Living in the Will of God
69523,https://www.smashwords.com/books/view/82244,"Jesus Christ, Who He Is And What He Says"
66428,https://www.smashwords.com/books/view/469246,Enlightenment for Christian Controversies
97915,https://www.smashwords.com/books/view/493831,Freedom Of The Heart
