In [33]:
# Library
import nltk
import random
import re
import csv
import urllib.request
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from collections import Counter
from nltk.tokenize import word_tokenize
# Fetch the NLTK resources this notebook relies on. Each call returns True on
# success, so the value of the last call is what the cell displays.
# 'gutenberg' backs the nltk.corpus.gutenberg lookup used further down.
nltk.download('gutenberg')
# NOTE(review): 'punkt' is presumably the model data needed by
# nltk.word_tokenize -- confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/yuting/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package punkt to /Users/yuting/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
# List the file IDs of the texts bundled with the NLTK Gutenberg corpus.
# Kept as a bare last expression so the list is rendered as the cell output.
nltk.corpus.gutenberg.fileids() 

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [44]:
# (source URL, local filename) pairs for every Project Gutenberg text used here
book_sources = [
    ("https://www.gutenberg.org/files/16/16-0.txt", "Peter_Pen.txt"),
    ("https://www.gutenberg.org/files/74/74-0.txt", "The_Adventures_of_Tom_Sawyer.txt"),
    ("https://www.gutenberg.org/cache/epub/120/pg120.txt", "Treasure_Island.txt"),
    ("https://www.gutenberg.org/files/289/289-0.txt", "The_Wind_in_the_Willows.txt"),
    ("https://www.gutenberg.org/cache/epub/17396/pg17396.txt", "The_Secret_Garden.txt"),
    ("https://www.gutenberg.org/cache/epub/1342/pg1342.txt", "Pride_and_Prejudice.txt"),
    ("https://www.gutenberg.org/files/1661/1661-0.txt", "The_Adventures_of_Sherlock_Holmes.txt"),
]

# Same records in dictionary form, one {"url", "filename"} entry per book
books = [
    {"url": source_url, "filename": target_name}
    for source_url, target_name in book_sources
]

# Fetch every text and store it in the working directory under its filename
for book in books:
    url, filename = book["url"], book["filename"]
    urllib.request.urlretrieve(url, filename)
    print(f"{filename} has been saved.")


Peter_Pen.txt has been saved.
The_Adventures_of_Tom_Sawyer.txt has been saved.
Treasure_Island.txt has been saved.
The_Wind_in_the_Willows.txt has been saved.
The_Secret_Garden.txt has been saved.


# Choose five books that share the same genre and are semantically similar
## Download: https://www.gutenberg.org
# Children's literature
"Peter Pan" by J. M. Barrie
https://www.gutenberg.org/files/16/16-0.txt

"The Adventures of Tom Sawyer" by Mark Twain
https://www.gutenberg.org/files/74/74-0.txt

"The Wind in the Willows" by Kenneth Grahame
https://www.gutenberg.org/files/289/289-0.txt

"The Secret Garden" by Frances Hodgson Burnett
https://www.gutenberg.org/cache/epub/17396/pg17396.txt

"Treasure Island" by Robert Louis Stevenson
https://www.gutenberg.org/cache/epub/120/pg120.txt

# Additional books from different genres
# romance fiction
"Pride and Prejudice" by Jane Austen
https://www.gutenberg.org/cache/epub/1342/pg1342.txt

# mystery fiction
"The Adventures of Sherlock Holmes" by Sir Arthur Conan Doyle
https://www.gutenberg.org/files/1661/1661-0.txt

In [45]:
def sample_digital_book(book_name, num_partitions, size_partition):
    """Split a plain-text book into word partitions and save them as a CSV file.

    The text in ``book_name`` is tokenized with ``nltk.word_tokenize`` and cut
    into consecutive chunks of ``size_partition`` tokens. Only the first
    ``num_partitions`` chunks are kept. Every non-word, non-whitespace
    character is then removed from each token, and tokens left empty by that
    step are dropped. The cleaned partitions and their labels are written to
    ``book_name + '.csv'`` (columns ``partition`` and ``label``).

    Parameters
    ----------
    book_name : str
        Path of the text file to read. Its first character is used as the
        label of every partition.
    num_partitions : int
        Maximum number of partitions to keep. A negative value, or a value
        larger than the number of available partitions, keeps all of them.
    size_partition : int
        Number of tokens per partition before cleaning.

    Returns
    -------
    list[list[str]]
        The cleaned partitions, in book order.

    Raises
    ------
    ValueError
        If ``size_partition`` is 0 (``range`` rejects a zero step).
    """
    # Read the whole book from disk. The encoding is stated explicitly so the
    # result does not depend on the platform default; "utf-8-sig" also strips a
    # leading byte-order mark when present. The context manager closes the
    # file even if reading fails.
    with open(book_name, "r", encoding="utf-8-sig") as book_file:
        book = book_file.read()

    # Tokenize the book, then slice the token stream into fixed-size chunks.
    tokens = nltk.word_tokenize(book)
    partitions = [
        tokens[start:start + size_partition]
        for start in range(0, len(tokens), size_partition)
    ]

    # Fall back to "all partitions" when the requested count is out of range.
    if num_partitions > len(partitions) or num_partitions < 0:
        num_partitions = len(partitions)
    partitions = partitions[:num_partitions]

    # One label per kept partition: the first character of the given name.
    # NOTE(review): names sharing a first character get identical labels.
    label_list = [book_name[:1]] * num_partitions

    # Strip every character that is neither a word character nor whitespace,
    # then drop tokens that became empty. Because this happens after chunking,
    # a partition can end up with fewer than size_partition entries.
    non_word = re.compile(r'[^\w\s]')
    partitions = [
        [cleaned for cleaned in (non_word.sub('', word) for word in partition) if cleaned]
        for partition in partitions
    ]

    # Store partitions and labels side by side and serialize them to CSV.
    df = pd.DataFrame({'partition': partitions, 'label': label_list})
    df.to_csv(book_name + '.csv', index=False)

    return partitions

In [46]:
# Run the sampler over several books and merge the results into one labelled CSV
book_name1 = "Peter_Pen.txt"
book_name2 = "The_Adventures_of_Tom_Sawyer.txt"
book_name3 = "Treasure_Island.txt"
book_name4 = "The_Wind_in_the_Willows.txt"
book_name5 = "The_Secret_Garden.txt"
book_name6 = "Pride_and_Prejudice.txt"
book_name7 = "The_Adventures_of_Sherlock_Holmes.txt"

# NOTE(review): book_name7 is defined above but is not part of book_list.
book_list = [
    book_name1,
    book_name2,
    book_name3,
    book_name4,
    book_name5,
    book_name6,
]

# One single-letter label per book, matched by position; add more letters
# here if book_list ever grows beyond the labels available.
label_list = list("abcdefg")

# Up to 200 partitions of 100 tokens per book, each tagged with the book's label
df_list = [
    pd.DataFrame({
        'partition': sample_digital_book(title, 200, 100),
        'label': label_list[position],
    })
    for position, title in enumerate(book_list)
]

df = pd.concat(df_list)
df.to_csv("Children_Literature_books_data.csv", index=False)

