# Part 1 — Import books

### Read text files

In [255]:
#from urllib import request  # This will only be needed if/when downloading the books from internet
import pandas as pd

In [256]:
book_files_dict = { 'Hume - Enquiry Concerning Human Understanding': 'Hume_ECHU.txt',\
                    'James - Pluralist Universe' : 'James_Pluralist_Universe.txt'
                    'Whitehead - Process and Reality': 'Whitehead_process_and_reality.txt',\
}
path = 'text_sources/'

### Basic pre-processing

Convert faux hard-returns into continuous string

In [259]:
def remove_pseudo_soft_returns(s, break_style):
    if break_style == '\r\n':
        s = s.replace('\r\n'*3, '^S ^p ').replace('\r\n'*2, '^p ').replace('\r\n', ' ')
    elif break_style == '\\n\\n':
        s = s.replace('\\n'*4, '^S ^p ').replace('\\n'*2, '^p ').replace('\\n', ' ')
    elif break_style == '\n\n':
        s = s.replace('\n\n'*2, '^S ^p ').replace(' \n\n', '^p ').replace('\n\n', '^p').replace('- \n', '').replace(' \n', ' ').replace('\'', '')
    return s


In [260]:
book_file = open(path + 'James_Pluralist_Universe.txt', 'rt')
book_string = book_file.read()
book_string[-3018900:-20500]



In [261]:

book_row = {'author': book.split(' - ')[0], \
            'title': book.split(' - ')[1],\
            'text': ' '.join(remove_pseudo_soft_returns(book_string,'\n\n').split())}
print(book_row['author'], book_row['title'])
books_df = books_df.append(book_row, ignore_index=True)
print(books_df.loc[books_df['title']==book_row['title'],['text']].to_string()[-3018900:-20500]  )


James Pluralist Universe
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

#### Make dataframe to hold all books

In [262]:
books_list = []
books_df = pd.DataFrame(columns = ['author', 'title', 'text'])

for book in book_files_dict:
    book_file = open(path + book_files_dict[book], 'rt')
    book_string = book_file.read()
    book_row = {'author': book.split(' - ')[0], \
                'title': book.split(' - ')[1],\
                'text': ' '.join(remove_pseudo_soft_returns(book_string,'\n\n').split())}
    print(book_row['author'], book_row['title'])
    books_df = books_df.append(book_row, ignore_index=True)
    print(books_df.loc[books_df['title']==book_row['title'],['text']].to_string()[-30500:-20500]  )
    print('\n')

Hume Enquiry Concerning Human Understanding
s never amounted to proof, 92; the passion for the wonderful in human nature, 93; prevalence of miracles in savage and early periods and their diminution with civilization, 94; the evidence for miracles in matters of religion opposed by the almost infinite number of witnesses for rival religions, 95; value of human testimony diminished by temptation to pose as a prophet or apostle, 97; no testimony for a miracle has ever amounted to a probability, much less to a proof, and if it did amount to a proof it would be opposed by another perfect proof, 98; so a miracle can never be proved so as to be the foundation of a system of religion, 99; a conclusion which confounds those who base the Christian religion on reason, not on faith, 100; the Christian religion cannot be believed without a miracle which will subvert the principle of a mans understanding and give him a determination to believe what is most contrary to custom and experience, 101.^pMor

#### Customized pre-processing to determine where the main text lies within each book

In [263]:
def custom_processing_setup_per_book():
    start_text_string_A = 'SECTION I' # Hume
    start_text_string_B = 'CHAPTER I' # Whitehead
    #start_text_string_C = 'LECTURE I' # James
    start_text_string_C = 'INDEX 401' # James
    
    books_df.loc[books_df['title']=='Enquiry Concerning Human Understanding', 'break_style']=start_text_string_A
    books_df.loc[books_df['title']=='Process and Reality', 'break_style']=start_text_string_B
    books_df.loc[books_df['title']=='Pluralist Universe', 'break_style']=start_text_string_C
    
    books_df.loc[books_df['title']=='Enquiry Concerning Human Understanding', 'start_row_text']=start_text_string_A
    books_df.loc[books_df['title']=='Process and Reality', 'start_row_text']=start_text_string_B
    books_df.loc[books_df['title']=='Pluralist Universe', 'start_row_text']=start_text_string_C
    
    end_text_string_W = 'INDEX' # Hume and James and 
    #end_text_string_X = '  Index' # Whitehead
    end_text_string_X = 'Index' # Whitehead
    end_text_string_Y = 'INDEX TO THE LECTURES'
    
    books_df.loc[books_df['title']=='Enquiry Concerning Human Understanding', 'end_material_text']=end_text_string_W
    books_df.loc[books_df['title']=='Process and Reality', 'end_material_text']=end_text_string_X
    books_df.loc[books_df['title']=='Pluralist Universe', 'end_material_text']=end_text_string_Y
    return
    

In [264]:
custom_processing_setup_per_book()


In [265]:
books_df

Unnamed: 0,author,title,text,break_style,start_row_text,end_material_text
0,Hume,Enquiry Concerning Human Understanding,﻿The Project Gutenberg EBook of An Enquiry Con...,SECTION I,SECTION I,INDEX
1,Whitehead,Process and Reality,Skip to main content Search UPLOAD SIGN UP | L...,CHAPTER I,CHAPTER I,Index
2,James,Pluralist Universe,﻿The Project Gutenberg EBook of A Pluralistic ...,INDEX 401,INDEX 401,INDEX TO THE LECTURES


Assign columns for start and end strings

In [266]:

books_df

Unnamed: 0,author,title,text,break_style,start_row_text,end_material_text
0,Hume,Enquiry Concerning Human Understanding,﻿The Project Gutenberg EBook of An Enquiry Con...,SECTION I,SECTION I,INDEX
1,Whitehead,Process and Reality,Skip to main content Search UPLOAD SIGN UP | L...,CHAPTER I,CHAPTER I,Index
2,James,Pluralist Universe,﻿The Project Gutenberg EBook of A Pluralistic ...,INDEX 401,INDEX 401,INDEX TO THE LECTURES


### Pickle the dataframe containing `main_text` of each book

#### Cycle through the paragraphs we can isolate the main text from the front material and end material

In [269]:
def paragraph_list(title, df):
    text_string = df.iloc[books_df.loc[books_df['title']==title].index[0], :]['text']
    book_par_list = text_string.split('^p')
    return book_par_list

#### Cycling through each book to include just the main text in the dataframe's `main_text` column

In [270]:
import pickle

def process_books():
    for book in book_files_dict:

    #book_title = 'Hume - Enquiry Concerning Human Understanding'

        book_title = book.split(' - ')[1]

        book_df = pd.DataFrame(paragraph_list(book_title, books_df))
        book_df.rename(columns = {0:'par_text'}, inplace=True)
        #book_df

        starting_text_string = books_df[books_df['title']==book_title][  'start_row_text'].iloc[0]
        starting_text_string

        start_maintext_pos = book_df[book_df['par_text'].str.contains(starting_text_string)].index.values.astype(int)[0]
        start_maintext_pos

        end_text_string = books_df[books_df['title']==book_title][  'end_material_text'].iloc[0]
        end_text_string
        end_material_row = book_df.iloc[start_maintext_pos: ][(book_df.iloc[start_maintext_pos: ]['par_text'].str.contains(end_text_string)) ].index.values.astype(int)[0]
        end_material_row

        main_text_df = book_df.iloc[start_maintext_pos:end_material_row:]
        main_text_df

        main_text_string = ' '.join(main_text_df['par_text'].to_list())
        main_text_string

        # change this so it puts underscores between words in the book title
        with open("data/" + book_title + "_df.pickle", 'wb') as to_write:
            pickle.dump(main_text_df, to_write)
            
        books_df.loc[books_df['title']==book_title, 'main_text']=main_text_string

    with open("data/books_df.pickle", 'wb') as to_write:
        pickle.dump(books_df, to_write)

    return    
        

### Testing/exploring that the above worked

In [271]:
process_books()

In [272]:
books_df

Unnamed: 0,author,title,text,break_style,start_row_text,end_material_text,main_text
0,Hume,Enquiry Concerning Human Understanding,﻿The Project Gutenberg EBook of An Enquiry Con...,SECTION I,SECTION I,INDEX,SECTION I. OF THE DIFFERENT SPECIES OF PHILOS...
1,Whitehead,Process and Reality,Skip to main content Search UPLOAD SIGN UP | L...,CHAPTER I,CHAPTER I,Index,CHAPTER I SPECULATIVE PHILOSOPHY SECTION I ...
2,James,Pluralist Universe,﻿The Project Gutenberg EBook of A Pluralistic ...,INDEX 401,INDEX 401,INDEX TO THE LECTURES,INDEX 401^S LECTURE I THE TYPES OF PHILOSO...


In [273]:
try_title = 'Process and Reality'

In [274]:
with open("data/" + try_title + "_df.pickle", 'rb') as to_read:
        try_book_df = pickle.load(to_read)

In [275]:
try_book_df.iloc[2,:].iloc[0]

' [4] This course of lectures is designed as an essay in Speculative Philosophy. Its first task must be to define speculative philosophy/ and to defend it as a method productive of important knowledge.'

In [276]:
try_book_df

Unnamed: 0,par_text
317,CHAPTER I SPECULATIVE PHILOSOPHY
318,SECTION I
319,[4] This course of lectures is designed as an...
320,Speculative Philosophy is the endeavour to fr...
321,"[5] Coherence, as here employed, means that t..."
...,...
2566,SECTION VII
2567,Thus the consequent nature of God is composed...
2568,Each actuality in the temporal world has its ...
2569,But the principle of universal relativity is ...


In [74]:
#book_title = 'Enquiry Concerning Human Understanding'



Unnamed: 0,par_text
0,﻿The Project Gutenberg EBook of An Enquiry Con...
1,This eBook is for the use of anyone anywhere a...
2,Title: An Enquiry Concerning Human Understanding
3,Author: David Hume L. A. Selby-Bigge
4,"Posting Date: November 15, 2011 [EBook #9662] ..."
...,...
621,Professor Michael S. Hart was the originator o...
622,Project Gutenberg-tm eBooks are often created...
623,Most people start at our Web site which has t...
624,https://www.gutenberg.org


Unnamed: 0,author,title,text,start_row_text,end_material_text
0,Hume,Enquiry Concerning Human Understanding,﻿The Project Gutenberg EBook of An Enquiry Con...,SECTION I,INDEX
1,Whitehead,Process and Reality,Skip to main content Search UPLOAD SIGN UP | L...,CHAPTER I,Index
2,James,Pluralist Universe,﻿The Project Gutenberg EBook of A Pluralistic ...,LECTURE I,INDEX


In [98]:
#starting_text_string = start_text_string_B
#starting_text_string = books_df.loc[books_df['title']==book_title,  'start_row_text'].astype(str)


'SECTION I'

19

'INDEX'

In [104]:
book_df.iloc[start_maintext_pos: ]

Unnamed: 0,par_text
19,SECTION I.
20,OF THE DIFFERENT SPECIES OF PHILOSOPHY.
21,"1. Moral philosophy, or the science of human ..."
22,2. The other species of philosophers consider ...
23,3. It is certain that the easy and obvious phi...
...,...
621,Professor Michael S. Hart was the originator o...
622,Project Gutenberg-tm eBooks are often created...
623,Most people start at our Web site which has t...
624,https://www.gutenberg.org


374

Unnamed: 0,par_text
19,SECTION I.
20,OF THE DIFFERENT SPECIES OF PHILOSOPHY.
21,"1. Moral philosophy, or the science of human ..."
22,2. The other species of philosophers consider ...
23,3. It is certain that the easy and obvious phi...
...,...
369,Moral reasonings are either concerning particu...
370,"The sciences, which treat of general facts, ar..."
371,"Divinity or Theology, as it proves the existen..."
372,Morals and criticism are not so properly objec...


In [None]:
# pickle the dataframe

' SECTION I. OF THE DIFFERENT SPECIES OF PHILOSOPHY.  1. Moral philosophy, or the science of human nature, may be treated after two different manners; each of which has its peculiar merit, and may contribute to the entertainment, instruction, and reformation of mankind. The one considers man chiefly as born for action; and as influenced in his measures by taste and sentiment; pursuing one object, and avoiding another, according to the value which these objects seem to possess, and according to the light in which they present themselves. As virtue, of all objects, is allowed to be the most valuable, this species of philosophers paint her in the most amiable colours; borrowing all helps from poetry and eloquence, and treating their subject in an easy and obvious manner, and such as is best fitted to please the imagination, and engage the affections. They select the most striking observations and instances from common life; place opposite characters in a proper contrast; and alluring us i

In [116]:
books_df

Unnamed: 0,author,title,text,start_row_text,end_material_text,main_text
0,Hume,Enquiry Concerning Human Understanding,﻿The Project Gutenberg EBook of An Enquiry Con...,SECTION I,INDEX,SECTION I. OF THE DIFFERENT SPECIES OF PHILOS...
1,Whitehead,Process and Reality,Skip to main content Search UPLOAD SIGN UP | L...,CHAPTER I,Index,
2,James,Pluralist Universe,﻿The Project Gutenberg EBook of A Pluralistic ...,LECTURE I,INDEX,


In [22]:
book_df

Unnamed: 0,author,title,text,break_style,start_row_text,end_material_text,df
0,Hume,Enquiry Concerning Human Understanding,﻿The Project Gutenberg EBook of An Enquiry Con...,SECTION I,SECTION I,INDEX,
1,Whitehead,Process and Reality,Skip to main content Search UPLOAD SIGN UP | L...,CHAPTER I,CHAPTER I,Index,
2,James,Pluralist Universe,﻿The Project Gutenberg EBook of A Pluralistic ...,LECTURE I,LECTURE I,INDEX,


## Comments and code below were made obsolete by other jupyter notebooks 2a, 2b, and 3

#### Make paragraph-by-paragraph dataframe for each book. 
Use the Hume notebook as a model  (use the `start_row_text` and `end_material_text` to locate flanking rows

#### Use the above to locate the main text and reconstruct it to back in the dataframe of all the books.

#### Do topic modeling of the whole corpus of book

#### SpaCy scatterword

Determine start row of main text paragraphs

NameError: name 'start_text_string' is not defined

Determine row that starts end material and populate main_text column of df

In [298]:
end_text_string = end_text_string_X
end_material_row = books_df[(bookS_df['par_text'].str.contains(end_text_string)) ].index.values.astype(int)[0]


NameError: name 'book_df' is not defined

In [276]:
books_df.loc[books_df['title']=='Enquiry Concerning Human Understanding', 'text'].to_string()

'0    \ufeffThe Project Gutenberg EBook of An Enquiry Con...'

In [313]:
books_df.iloc[0, :]['text'].find(start_text_string_A + '^p' )

-1

In [306]:
books_df.iloc[0, :]['text'][1620:1640]

'sophy    INDEX  SECT'

In [304]:
books_df.iloc[0, :]['text'].find(end_text_string_W)

1629

In [15]:
books_df.iloc[0, :]['text']

'\ufeffThe Project Gutenberg EBook of An Enquiry Concerning Human Understanding, by David Hume and L. A. Selby-Bigge This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org Title: An Enquiry Concerning Human Understanding Author: David Hume L. A. Selby-Bigge Posting Date: November 15, 2011 [EBook #9662] Release Date: January, 2006 First Posted: October 14, 2003 Language: English *** START OF THIS PROJECT GUTENBERG EBOOK ENQUIRY CONCERNING HUMAN UNDERSTANDING *** Produced by Jonathan Ingram and Project Gutenberg Distributed Proofreaders^S ^p ^S ^p AN ENQUIRY CONCERNING HUMAN UNDERSTANDING. BY DAVID HUME Extracted from: Enquiries Concerning the Human Understanding, and Concerning the Principles of Morals, By David Hume. Reprinted from The Posthumous Edition of 1777, and Edited with Introduction, Com

In [32]:
books_df.loc[books_df['title']=='Enquiry Concerning Human Understanding'].index[0]

0

In [33]:
books_df.iloc[books_df.loc[books_df['title']=='Enquiry Concerning Human Understanding'].index[0], :]['text']

'\ufeffThe Project Gutenberg EBook of An Enquiry Concerning Human Understanding, by David Hume and L. A. Selby-Bigge This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org Title: An Enquiry Concerning Human Understanding Author: David Hume L. A. Selby-Bigge Posting Date: November 15, 2011 [EBook #9662] Release Date: January, 2006 First Posted: October 14, 2003 Language: English *** START OF THIS PROJECT GUTENBERG EBOOK ENQUIRY CONCERNING HUMAN UNDERSTANDING *** Produced by Jonathan Ingram and Project Gutenberg Distributed Proofreaders^S ^p ^S ^p AN ENQUIRY CONCERNING HUMAN UNDERSTANDING. BY DAVID HUME Extracted from: Enquiries Concerning the Human Understanding, and Concerning the Principles of Morals, By David Hume. Reprinted from The Posthumous Edition of 1777, and Edited with Introduction, Com

In [279]:
books_df.iloc[0, :]['text'][ books_df.iloc[0, :]['text'].find(start_text_string_A), books_df.iloc[0, :]['text'].find(end_text_string_W)]

TypeError: string indices must be integers

pickle dataframe for use in another notebook

In [245]:
import pickle