In [50]:
# import libraries and read in data
import pandas as pd
from IPython.display import display, HTML

# BookCorpus metadata, from files downloaded directly from https://yknzhu.wixsite.com/mbweb
bc_books = pd.read_csv('data/BookCorpus/books_in_bookcorpus.csv')

# BookCorpusOpen url list, downloaded from https://huggingface.co/datasets/bookcorpusopen
bcopen_books = pd.read_csv('data/BookCorpusOpen/2020-08-27-epub_urls.txt',header=None,names=['EpubLink'])

# Smashwords21, scraped from smashwords.com in April 2021
sw_books = pd.read_csv('data/Smashwords21/smashwords_april_2021_dedup.csv')
sw_books['in_smashwords21'] = True

In [51]:
# merge Smashwords21 with BookCorpusOpen

# get the "smashwords id" and add a field for bcopen
bcopen_books['smashwords_id'] = bcopen_books.EpubLink.str.split('/',expand=True)[5]
bcopen_books['in_bcopen'] = True

# add "smashwords id" for smashwords21 and merge
sw_books['smashwords_id'] = sw_books.Link.str.split('/',expand=True)[5]
sw_books = sw_books.merge(bcopen_books, how='outer',on='smashwords_id')
sw_books.fillna(value='',inplace=True)

In [52]:
# partially merge Smashwords21 with original BookCorpus
bc_books['smashwords_id'] = bc_books.fname.str.replace('.txt','')
bc_books['in_bc_books'] = True
sw_books = sw_books.merge(bc_books, how='left',on='smashwords_id')

In [53]:
# How many books from BookCorpus were found in Smashwords21 using only the smashwords_id?
len(sw_books[sw_books.in_bc_books==True])

2680

In [54]:
# Check if everything looks okay
sw_books[sw_books.in_bc_books==True].sample(5)

Unnamed: 0,Link,Title,Author,Price,Words,Language,Published,Categories,in_smashwords21,smashwords_id,EpubLink,in_bcopen,location,category,fname,word_count,disk_usage,in_bc_books
24495,https://www.smashwords.com/books/view/432689,Scarlet Runner,Lily Ennis,$0.00 USD,81640,English,"April 26, 2014",Fiction » Historical » Australia & New Zealand...,True,432689,https://www.smashwords.com/books/download/4326...,True,./Historical/432689.txt,Historical,432689.txt,81641.0,464K,True
85588,https://www.smashwords.com/books/view/344755,The Village Cafe,Eraine Rivera,$0.00 USD,23280,English,"August 6, 2013",Fiction » Horror » Undead,True,344755,https://www.smashwords.com/books/download/3447...,True,./Horror/344755.txt,Horror,344755.txt,23288.0,124K,True
14685,https://www.smashwords.com/books/view/346868,Under Mary's Oak,Tanya Carlysle,$0.00 USD,49800,English,"August 13, 2013",Fiction » Mystery & detective » Women Sleuths ...,True,346868,https://www.smashwords.com/books/download/3468...,True,./Mystery/346868.txt,Mystery,346868.txt,49801.0,280K,True
27235,https://www.smashwords.com/books/view/296973,The War Journals: Resistance,Cory Mccoy,$0.00 USD,89650,English,"March 18, 2013",Fiction » Thriller & suspense » Action & suspense,True,296973,https://www.smashwords.com/books/download/2969...,True,./Adventure/296973.txt,Adventure,296973.txt,89647.0,484K,True
12306,https://www.smashwords.com/books/view/415743,Just One Night,Rachel Lenna,$0.00 USD,21180,English,"March 5, 2014","Fiction » Young adult or teen » Romance , Fict...",True,415743,https://www.smashwords.com/books/download/4157...,True,./Romance/415743.txt,Romance,415743.txt,21185.0,116K,True


# Stolen Books?

In [55]:
# How many books are in BookCorpus that now cost money?
# (Lower bound since metadata is missing for lots of BookCorpus books)
stolen_books = sw_books[(sw_books.in_bc_books==True)&(sw_books.Price.str.contains('USD'))&(sw_books.Price!='$0.00 USD')]
print(len(stolen_books))

406


In [56]:
# What are some examples of stolen books?
stolen_books.sample(5)

Unnamed: 0,Link,Title,Author,Price,Words,Language,Published,Categories,in_smashwords21,smashwords_id,EpubLink,in_bcopen,location,category,fname,word_count,disk_usage,in_bc_books
346836,https://www.smashwords.com/books/view/471530,Wicked Bartender PG-13 Version,KuroKoneko Kamen,$2.99 USD,138600,English,"August 29, 2014","Fiction » Romance » Paranormal , Fiction » Rom...",True,471530,,,./Romance/471530.txt,Romance,471530.txt,138559.0,784K,True
229952,https://www.smashwords.com/books/view/41120,Prometheus Fit To Be Tied,Paul Hawkins,$2.99 USD,89790,English,"February 7, 2011","Fiction » Literature » Literary , Fiction » Hu...",True,41120,,,./Humor/41120.txt,Humor,41120.txt,89834.0,476K,True
350722,https://www.smashwords.com/books/view/484482,Shrapnel's Kiss,Amy Rachiele,$2.99 USD,33510,English,"November 4, 2014","Fiction » Romance » Action/adventure , Fiction...",True,484482,,,./Romance/484482.txt,Romance,484482.txt,33517.0,180K,True
229953,https://www.smashwords.com/books/view/41120,Prometheus Fit To Be Tied,Paul Hawkins,$2.99 USD,89790,English,"February 7, 2011","Fiction » Literature » Literary , Fiction » Hu...",True,41120,,,./Literature/41120.txt,Literature,41120.txt,89834.0,476K,True
282728,https://www.smashwords.com/books/view/505347,The Rainbow Maker Project,Katherine Gae T. Yamar,$0.99 USD,31660,English,"December 28, 2014","Fiction » Fantasy » General , Fiction » Advent...",True,505347,,,./Fantasy/505347.txt,Fantasy,505347.txt,31669.0,184K,True


In [57]:
# How much would these stolen books cost?
stolen_books.loc[:,'dollar_cost'] = stolen_books.Price.str.split(expand=True)[0]
stolen_books.loc[:,'dollar_cost'] = stolen_books.dollar_cost.str.replace('$','')
stolen_books.loc[:,'dollar_cost'] = stolen_books.dollar_cost.str.replace(',','')
stolen_books.loc[:,'dollar_cost'] = pd.to_numeric(stolen_books.dollar_cost)

print(stolen_books.dollar_cost.sum())

1182.21


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


## General Composition

In [58]:
# How many instances (books) are there in total?
print(len(bc_books))

11040


In [59]:
# To align with the 11,038 reported by Zhu and Kiros et al., remove "adventure-all.txt" and "romance-all.txt"
extra_books = bc_books[(bc_books.fname.str.contains('adventure-all.txt')|bc_books.fname.str.contains('romance-all.txt'))]
bc_books = bc_books[~(bc_books.fname.str.contains('adventure-all.txt')|bc_books.fname.str.contains('romance-all.txt'))]

In [60]:
len(bc_books)

11038

In [61]:
# How many words in the files?
bc_books.word_count.sum()

811601031

In [62]:
# Zhu and Kiros et al. report 984,846,357, so this does not align
# We hypothesize that this is mainly due to empty text files
# e.g. 'All_I_Want_for_Christmas_Is_a_Vampire.txt' is empty in the download from https://yknzhu.wixsite.com/mbweb

# Also, if we add the words from the "extra books," the word count exceeds the original reported
extra_books.word_count.sum() + bc_books.word_count.sum()

1042409939

## Missing or Truncated Files

In [63]:
# How many book files have no disk usage?
len(bc_books[bc_books.disk_usage=='0B'])

98

In [64]:
# What are some of the empty book files?
bc_books[bc_books.disk_usage=='0B'].sample(5)

Unnamed: 0,location,category,fname,word_count,disk_usage,smashwords_id,in_bc_books
29,./Horror/Paranoid-A-Chant.txt,Horror,Paranoid-A-Chant.txt,0,0B,Paranoid-A-Chant,True
7,./Fantasy/One_Eyed_Jacks.txt,Fantasy,One_Eyed_Jacks.txt,0,0B,One_Eyed_Jacks,True
80,./Science_fiction/Shock.txt,Science_fiction,Shock.txt,0,0B,Shock,True
50,./Science_fiction/Asimov14.txt,Science_fiction,Asimov14.txt,0,0B,Asimov14,True
63,./Science_fiction/Asimov37.txt,Science_fiction,Asimov37.txt,0,0B,Asimov37,True


In [65]:
# How many book files have fewer than 20,000 words (the cutoff specified in the paper)?
len(bc_books[bc_books.word_count<20000])

655

In [66]:
# How many book files have fewer than 10,000 words?
len(bc_books[bc_books.word_count<10000])

291

## Books with Duplicate Copies

In [67]:
# How many unique file names are there?
file_name_counts = pd.DataFrame(bc_books.fname.value_counts()).reset_index()
file_name_counts.columns = ['fname','n']
print(len(file_name_counts))

7185


*We confirmed each unique filename corresponded to one unique book, even in cases where word counts and disk usage differed.*

*See the datasheet for details.*

In [68]:
# How many file names occur multiple times?
print(len(file_name_counts[file_name_counts.n>1]))

2930


In [69]:
# How many books do these file names represent?
# That is, how many books are duplicates?
len(bc_books) - len(file_name_counts)

3853

In [70]:
# How many books occur once, twice, thrice, etc.?
print(file_name_counts.n.value_counts())

1    4255
2    2101
3     741
4      82
5       6
Name: n, dtype: int64


In [71]:
# What are some of the duplicate books?

# for now, remove filenames with unknown name scheme (e.g. 'u3041.txt', 'et4358.txt', 'b5096.txt')
named_dups=file_name_counts[(~file_name_counts.fname.str.match(r'(u|et|b)'))]
named_dups = named_dups[(named_dups.n>1)]

# inspect 5 random filenames occuring multiple times, inspect listings
for index, row in named_dups.sample(5).iterrows():
    print("File name: {}".format(index))
    print("{} Occurrences:".format(len(bc_books[bc_books.fname==row.fname])))
    display(HTML(bc_books[bc_books.fname==row.fname].to_html()))

File name: 2274
2 Occurrences:


Unnamed: 0,location,category,fname,word_count,disk_usage,smashwords_id,in_bc_books
10225,./Historical/180662.txt,Historical,180662.txt,132195,760K,180662,True
10226,./Literature/180662.txt,Literature,180662.txt,132195,760K,180662,True


File name: 2302
2 Occurrences:


Unnamed: 0,location,category,fname,word_count,disk_usage,smashwords_id,in_bc_books
1767,./Adventure/321752.txt,Adventure,321752.txt,29614,172K,321752,True
1768,./Science_fiction/321752.txt,Science_fiction,321752.txt,29614,172K,321752,True


File name: 2859
2 Occurrences:


Unnamed: 0,location,category,fname,word_count,disk_usage,smashwords_id,in_bc_books
4260,./Fantasy/506088.txt,Fantasy,506088.txt,58601,324K,506088,True
4261,./Teen/506088.txt,Teen,506088.txt,58601,324K,506088,True


File name: 1717
2 Occurrences:


Unnamed: 0,location,category,fname,word_count,disk_usage,smashwords_id,in_bc_books
2767,./Fantasy/368712.txt,Fantasy,368712.txt,42090,232K,368712,True
2768,./Science_fiction/368712.txt,Science_fiction,368712.txt,42090,232K,368712,True


File name: 1670
2 Occurrences:


Unnamed: 0,location,category,fname,word_count,disk_usage,smashwords_id,in_bc_books
3322,./Science_fiction/364494.txt,Science_fiction,364494.txt,48863,268K,364494,True
3323,./Teen/364494.txt,Teen,364494.txt,48863,268K,364494,True


## Sample Comparisons

#### Genre Distribution

In [72]:
# Genre distribution in original BookCorpus
genre_df = pd.DataFrame(bc_books.category.value_counts())
genre_df.columns = ['BookCorpusN']
genre_df['BookCorpusP'] = (genre_df.BookCorpusN/genre_df.BookCorpusN.sum()*100).round(1).astype(str)+'%'
genre_df['BookCorpus'] = genre_df.BookCorpusP + ' (' + genre_df.BookCorpusN.astype(str) + ')'


# BookCorpusOpen
genre_df['BookCorpusOpenN'] = ''
for genre in genre_df.index.tolist():
    g=genre.replace('_',' ')
    genre_df.at[genre,'BookCorpusOpenN'] = len(sw_books[(sw_books.in_bcopen==True)&(sw_books.Categories.str.contains(g,case=False))])

total_bcopen = len(sw_books[(sw_books.in_bcopen==True)])
genre_df.BookCorpusOpenN = genre_df.BookCorpusOpenN.astype(int)
genre_df['BookCorpusOpenP'] = (genre_df.BookCorpusOpenN/total_bcopen*100).round(1).astype(str)+'%'
genre_df['BookCorpusOpen'] = genre_df.BookCorpusOpenP + ' (' + genre_df.BookCorpusOpenN.astype(str) + ')'



# Smashwords21
genre_df['Smashwords21'] = ''
for genre in genre_df.index.tolist():
    g=genre.replace('_',' ')
    genre_df.at[genre,'Smashwords21N'] = len(sw_books[sw_books.Categories.str.contains(g,case=False)])

    
total_smashwords = len(sw_books)
genre_df.Smashwords21N = genre_df.Smashwords21N.astype(int)
genre_df['Smashwords21P'] = (genre_df.Smashwords21N/total_smashwords*100).round(1).astype(str)+'%'
genre_df['Smashwords21'] = genre_df.Smashwords21P + ' (' + genre_df.Smashwords21N.astype(str) + ')'



to_print = genre_df[['BookCorpus','BookCorpusOpen','Smashwords21']]
display(HTML(to_print.to_html()))

Unnamed: 0,BookCorpus,BookCorpusOpen,Smashwords21
Romance,26.1% (2880),18.0% (3314),16.0% (66083)
Fantasy,13.6% (1502),17.2% (3171),10.6% (44032)
Science_fiction,7.5% (823),13.3% (2453),7.8% (32063)
New_Adult,6.9% (766),0.9% (175),0.7% (2902)
Young_Adult,6.8% (748),9.5% (1748),4.6% (19015)
Thriller,5.9% (646),7.4% (1368),5.7% (23587)
Mystery,5.6% (621),5.3% (987),4.7% (19351)
Vampires,5.4% (600),0.0% (0),0.0% (0)
Horror,4.1% (448),3.9% (727),3.9% (15944)
Teen,3.9% (430),9.5% (1752),4.6% (19154)


#### Religious Viewpoint Distribution

In [73]:
# based on recommendations by Dhamala and Sun et al. https://dl.acm.org/doi/abs/10.1145/3442188.3445924
religion_list = ['Sikhism', 'Judaism', 'Islam', 'Hinduism', 'Christianity', 'Buddhism', 'Atheism']

religion_df = pd.DataFrame(columns=['BookCorpus','BookCorpusOpen','Smashwords21'],index=religion_list)

In [74]:
for religion in religion_list:
    religion_df.at[religion,'BookCorpusOpen'] = len(sw_books[(sw_books.in_bcopen==True)&(sw_books.Categories.str.contains(religion,case=False))])
    religion_df.at[religion,'Smashwords21'] = len(sw_books[sw_books.Categories.str.contains(religion,case=False)])
    religion_df.at[religion,'BookCorpus'] = len(sw_books[(sw_books.in_bc_books==True)&(sw_books.Categories.str.contains(religion,case=False))])

religion_df

Unnamed: 0,BookCorpus,BookCorpusOpen,Smashwords21
Sikhism,0,0,15
Judaism,0,18,371
Islam,0,229,1305
Hinduism,0,12,261
Christianity,0,154,2671
Buddhism,0,32,512
Atheism,0,18,175


In [75]:
# sample of BookCorpusOpen books about Islam
sw_books[(sw_books.in_bcopen==True)&(sw_books.Categories.str.contains('Islam',case=False))].sample(5)[['Link','Title']]

Unnamed: 0,Link,Title
23940,https://www.smashwords.com/books/view/648949,Has the Bible Been Changed?: The Reliability o...
2620,https://www.smashwords.com/books/view/115812,Could Not Answer
62865,https://www.smashwords.com/books/view/128298,Muslim Christian Dialogue
1441,https://www.smashwords.com/books/view/116105,Ethics of Islam
109184,https://www.smashwords.com/books/view/835050,Allah'ın Detay Sanatı


In [76]:
# sample of BookCorpusOpen books about Christianity
sw_books[(sw_books.in_bcopen==True)&(sw_books.Categories.str.contains('Christianity',case=False))].sample(5)[['Link','Title']]

Unnamed: 0,Link,Title
67931,https://www.smashwords.com/books/view/65356,Stories from the Forest -- Stories by a Counse...
87647,https://www.smashwords.com/books/view/117560,Hell: A Biblical and Evangelical Response to T...
18835,https://www.smashwords.com/books/view/99241,"I Am Coming, Volume 4"
102601,https://www.smashwords.com/books/view/36677,Fashioned by Culture
70218,https://www.smashwords.com/books/view/208575,"Ich Komme, Band 5"


In [77]:
# how many books in BookCorpusOpen?
len(sw_books[sw_books.in_bcopen==True])

18451

In [78]:
# how many unique books in Smashwords21?
len(sw_books[sw_books.in_smashwords21==True])

412581