# Document Cleaning

In [105]:
# Import modules
# RegEx
import re

# Lemmatization Module
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Tokenizer
from nltk.tokenize import word_tokenize 

## "Alciphron", George Berkeley

In [106]:
with open("CorpusComplete/Berkeley_Alciphron.txt", "r", encoding="utf-8") as file:
    B_Alc = file.read()

In [107]:
# Remove header/TOC information
B_AlcPage = re.sub(r'Alciphron George Berkeley First Dialogue'," ",B_Alc)
B_AlcPage = re.sub(r'Alciphron George Berkeley Second dialogue'," ",B_AlcPage)
B_AlcPage = re.sub(r'Alciphron George Berkeley Third dialogue'," ",B_AlcPage)
B_AlcPage = re.sub(r'Alciphron George Berkeley Fourth dialogue'," ",B_AlcPage)
B_AlcPage = re.sub(r'Alciphron George Berkeley Fifth dialogue'," ",B_AlcPage)
B_AlcPage = re.sub(r'Alciphron George Berkeley Sixth dialogue'," ",B_AlcPage)
B_AlcPage = re.sub(r'Alciphron George Berkeley Seventh dialogue'," ",B_AlcPage)
B_AlcPage = re.sub(r'Alciphron George Berkeley'," ",B_AlcPage)

In [108]:
# Remove dialogue names
B_AlcName = re.sub(r'Alciphron:'," ",B_AlcPage)
B_AlcName = re.sub(r'Lysicles:'," ",B_AlcName)
B_AlcName = re.sub(r'Euphranor:'," ",B_AlcName)
B_AlcName = re.sub(r'Crito:'," ",B_AlcName)

In [109]:
# Remove Roman numerals
B_AlcRom = re.sub(r'\bM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\b\.'," ",B_AlcName)

# Remove all punctuation
B_AlcRom = re.sub(r'[^\w\s]'," ",B_AlcRom)

# Replace "ideas" with "idea", lemmatizer does not pick this up
B_AlcRom = re.sub(r'[^\w\s]'," ",B_AlcRom)

In [110]:
# Remove white space
B_AlcClean = re.sub(r'\n'," ",B_AlcRom)

# Remove all numbers
B_AlcClean = re.sub(r'[0-9].*?'," ",B_AlcClean)

# Remove single letters
B_AlcClean = re.sub(r'\b[a-zA-Z]\b'," ",B_AlcClean)

# Remove double spaces +
B_AlcClean = re.sub(r'\s\s+'," ",B_AlcClean)

# Lowercase
B_AlcClean = B_AlcClean.lower()

print(B_AlcClean[:2000])

alciphron or the minute philosopher defence of the christian religion against the so called free thinkers george berkeley contents first dialogue free thinking and the general good second dialogue the ethics of mandeville third dialogue the ethics of shaftesbury fourth dialogue the truth of theism fifth dialogue the utility of christianity sixth dialogue the truth of christianity seventh dialogue the truth of christianity want to consider the various things that free thinker can be atheist libertine fanatic scorner critic metaphysician fatalist sceptic but you shouldn think that according to me every individual free thinker is all of these all am saying is that each item on that list characterizes some freethinkers you may think that no free thinker is an atheist it has often been said that although there are admittedly some atheists who claim to be philosophical theorists no one is really an atheist as matter of philosophical theory know these things are said but am well assured that 

In [111]:
# Lemmatization function
def Lemmatize(text):
    # Tokenize text
    text = nltk.word_tokenize(text)
    
    # Empty variable
    textLemma = []
    
    # Lemmatize words and append to list
    for word in text:
        wordLemma = wordnet_lemmatizer.lemmatize(word, 'v')
        textLemma.append(wordLemma)
    
    # Convert back to string for saving
    text = " ".join(textLemma)
    
    return text;

# Save the lemmatization in a variable
B_AlcClean = Lemmatize(B_AlcClean)

In [112]:
# Test lemmatized output
print(B_AlcClean[:2000])

alciphron or the minute philosopher defence of the christian religion against the so call free thinkers george berkeley content first dialogue free think and the general good second dialogue the ethics of mandeville third dialogue the ethics of shaftesbury fourth dialogue the truth of theism fifth dialogue the utility of christianity sixth dialogue the truth of christianity seventh dialogue the truth of christianity want to consider the various things that free thinker can be atheist libertine fanatic scorner critic metaphysician fatalist sceptic but you shouldn think that accord to me every individual free thinker be all of these all be say be that each item on that list characterize some freethinkers you may think that no free thinker be an atheist it have often be say that although there be admittedly some atheists who claim to be philosophical theorists no one be really an atheist as matter of philosophical theory know these things be say but be well assure that one of the most not

In [113]:
# Write out the clean version of the text file
file = open("Clean/Berkeley_AlciphronCLEAN.txt", "w", encoding="utf-8") 
file.write(B_AlcClean) 
file.close() 

## "A Treatise Concerning the Principles of Human Knowledge", George Berkeley

In [114]:
with open("CorpusComplete/Berkeley_HumanKnowledge.txt", "r", encoding="utf-8") as file:
    B_HumKnow = file.read()

In [115]:
# Remove page numbers
B_HumKnowPage = re.sub(r'[0-9]|999\n',"",B_HumKnow)

In [116]:
# Remove Roman numerals
B_HumKnowRom = re.sub(r'\bM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\b\.',"",B_HumKnowPage)

# Remove all punctuation
B_HumKnowRom = re.sub(r'[^\w\s]',"",B_HumKnowRom)

In [117]:
# Remove white space
B_HumKnowClean = re.sub(r'\n'," ",B_HumKnowRom)

# Remove double spaces +
B_HumKnowClean = re.sub(r'\s\s+'," ",B_HumKnowClean)

# Lowercase
B_HumKnowClean = B_HumKnowClean.lower()

print(B_HumKnowClean[:2000])

a treatise concerning the principles of human knowledge wherein the chief causes of error and difficulty in the sciences with the grounds of scepticism atheism and irreligion are inquired into first printed in the year to which are added three dialogues between hylas and philonous in opposition to scepticks and atheists first printed in the year both written by george berkeley a fellow of trinitycollege dublin london printed for jacob tonson introduction philosophy being nothing else but the study of wisdom and truth it may with reason be expected that those who have spent most time and pains in it should enjoy a greater calm and serenity of mind a greater clearness and evidence of knowledge and be less disturbed with doubts and difficulties than other men yet so it is we see the illiterate bulk of mankind that walk the highroad of plain common sense and are governed by the dictates of nature for the most part easy and undisturbed to them nothing thats familiar appears unaccountable or

In [118]:
# Call the lemmatization function
B_HumKnowClean = Lemmatize(B_HumKnowClean)

In [119]:
# Test lemmatized output
print(B_HumKnowClean[:2000])

a treatise concern the principles of human knowledge wherein the chief cause of error and difficulty in the sciences with the ground of scepticism atheism and irreligion be inquire into first print in the year to which be add three dialogues between hylas and philonous in opposition to scepticks and atheists first print in the year both write by george berkeley a fellow of trinitycollege dublin london print for jacob tonson introduction philosophy be nothing else but the study of wisdom and truth it may with reason be expect that those who have spend most time and pain in it should enjoy a greater calm and serenity of mind a greater clearness and evidence of knowledge and be less disturb with doubt and difficulties than other men yet so it be we see the illiterate bulk of mankind that walk the highroad of plain common sense and be govern by the dictate of nature for the most part easy and undisturbed to them nothing thats familiar appear unaccountable or difficult to comprehend they co

In [120]:
# Write out the clean version of the text file
file = open("Clean/Berkeley_HumanKnowledgeCLEAN.txt", "w", encoding="utf-8") 
file.write(B_HumKnowClean) 
file.close() 

## "An Essay Towards a New Theory of Vision", George Berkeley

In [121]:
with open("CorpusComplete/Berkeley_TheoryOfVision.txt", "r", encoding="utf-8") as file:
    B_TheoryOf = file.read()

In [122]:
# Remove page numbers
B_TheoryOfPage = re.sub(r'[0-9]|999\n',"",B_TheoryOf)

In [123]:
# Remove all punctuation
B_TheoryOfRom = re.sub(r'[^\w\s]',"",B_TheoryOfPage)

In [124]:
# Remove white space
B_TheoryOfClean = re.sub(r'\n'," ",B_TheoryOfRom)

# Remove double spaces +
B_TheoryOfClean = re.sub(r'\s\s+'," ",B_TheoryOfClean)

# Lowercase
B_TheoryOfClean = B_TheoryOfClean.lower()

print(B_TheoryOfClean[:2000])

an essay towards a new theory of vision by george berkeley contents sect design distance of itself invisible remote distance perceived rather by experience than by sense near distance thought to be perceived by the angle of the optic axes difference between this and the former manner of perceiving distance also by diverging rays this depends not on experience these the common accounts but not satisfactory some ideas perceived by the mediation of others no idea which is not itself perceived can be the means of perceiving another distance perceived by means of some other idea those lines and angles mentioned in optics are not themselves perceived hence the mind does not perceive distance by lines and angles also because they have no real existence and because they are insufficient to explain the phenomena the ideas that suggest distance are st the sensation arising from the turn of the eyes betwixt which and distance there is no necessary connection scarce room for mistake in this matter

In [125]:
# Call the lemmatization function
B_TheoryOfClean = Lemmatize(B_TheoryOfClean)

In [126]:
# Test lemmatized output
print(B_TheoryOfClean[:2000])

an essay towards a new theory of vision by george berkeley content sect design distance of itself invisible remote distance perceive rather by experience than by sense near distance think to be perceive by the angle of the optic ax difference between this and the former manner of perceive distance also by diverge ray this depend not on experience these the common account but not satisfactory some ideas perceive by the mediation of others no idea which be not itself perceive can be the mean of perceive another distance perceive by mean of some other idea those line and angle mention in optics be not themselves perceive hence the mind do not perceive distance by line and angle also because they have no real existence and because they be insufficient to explain the phenomena the ideas that suggest distance be st the sensation arise from the turn of the eye betwixt which and distance there be no necessary connection scarce room for mistake in this matter no regard have to the angle of the 

In [127]:
# Write out the clean version of the text file
file = open("Clean/Berkeley_TheoryOfVisionCLEAN.txt", "w", encoding="utf-8") 
file.write(B_TheoryOfClean) 
file.close() 

## "Three Dialogues between Hylas and Philonous in opposition to Sceptics and Atheists",  George Berkeley

In [128]:
with open("CorpusComplete/Berkeley_ThreeDialogues.txt", "r", encoding="utf-8") as file:
    B_ThreeDia = file.read()

In [129]:
# Remove page numbers
B_ThreeDiaPage = re.sub(r'[0-9]|999\n',"",B_ThreeDia)

In [130]:
# Remove header information
B_ThreeDiaHead = re.sub(r'Three Dialogues George Berkeley First Dialogue',"",B_ThreeDiaPage)
B_ThreeDiaHead = re.sub(r'Three Dialogues George Berkeley Second Dialogue',"",B_ThreeDiaHead)
B_ThreeDiaHead = re.sub(r'Three Dialogues George Berkeley Third Dialogue',"",B_ThreeDiaHead)

In [131]:
# Remove dialogue names
B_ThreeDiaName = re.sub(r'Hyl:',"",B_ThreeDiaHead)
B_ThreeDiaName = re.sub(r'Phil:',"",B_ThreeDiaName)

In [132]:
# Remove weird dots
B_ThreeDiaDot = re.sub(r'•',"",B_ThreeDiaName)
B_ThreeDiaDot = re.sub(r'·',"",B_ThreeDiaDot)

# Remove all punctuation
B_ThreeDiaDot = re.sub(r'[^\w\s]',"",B_ThreeDiaDot)

In [133]:
# Remove white space
B_ThreeDiaClean = re.sub(r'\n'," ",B_ThreeDiaDot)

# Remove double spaces +
B_ThreeDiaClean = re.sub(r'\s\s+'," ",B_ThreeDiaClean)

# Lowercase
B_ThreeDiaClean = B_ThreeDiaClean.lower()

print(B_ThreeDiaClean[:2000])

three dialogues between hylas and philonous in opposition to sceptics and atheists george berkeley contents the first dialogue the second dialogue the third dialogue the first dialogue philonous good morning hylas i didnt expect to find you out and about so early hylas it is indeed somewhat unusual but my thoughts were so taken up with a subject i was talking about last night that i couldnt sleep so i decided to get up and walk in the garden thats good it gives you a chance to see what innocent and agreeable pleasures you lose every morning can there be a pleasanter time of the day or a more delightful season of the year that purple sky those wild but sweet notes of birds the fragrant bloom on the trees and flowers the gentle influence of the rising sun these and a thousand nameless beauties of nature inspire the soul with secret raptures but im afraid i am interrupting your thoughts for you seemed very intent on something yes i was and id be grateful if you would allow me to carry on 

In [134]:
# Call the lemmatization function
B_ThreeDiaClean = Lemmatize(B_ThreeDiaClean)

In [135]:
# Test lemmatized output
print(B_ThreeDiaClean[:2000])

three dialogues between hylas and philonous in opposition to sceptics and atheists george berkeley content the first dialogue the second dialogue the third dialogue the first dialogue philonous good morning hylas i didnt expect to find you out and about so early hylas it be indeed somewhat unusual but my thoughts be so take up with a subject i be talk about last night that i couldnt sleep so i decide to get up and walk in the garden thats good it give you a chance to see what innocent and agreeable pleasures you lose every morning can there be a pleasanter time of the day or a more delightful season of the year that purple sky those wild but sweet note of bird the fragrant bloom on the tree and flower the gentle influence of the rise sun these and a thousand nameless beauties of nature inspire the soul with secret raptures but im afraid i be interrupt your thoughts for you seem very intent on something yes i be and id be grateful if you would allow me to carry on with it but i dont in 

In [136]:
# Write out the clean version of the text file
file = open("Clean/Berkeley_ThreeDialoguesCLEAN.txt", "w", encoding="utf-8") 
file.write(B_ThreeDiaClean) 
file.close() 

## "Essays Moral, Political, Literary", David Hume

In [137]:
with open("CorpusComplete/Hume_EssaysMoralPoliticalLiterary.txt", "r", encoding="utf-8") as file:
    H_MPL = file.read()

In [138]:
# Remove header information
H_MPLPage = re.sub(r'Online Library of Liberty: Essays Moral, Political, Literary \(LF ed.\)',"",H_MPL)
H_MPLPage = re.sub(r'PLL v6.0 \(generated September, 2011\) .*? http://oll.libertyfund.org/title/704',"",H_MPLPage)

# Remove TOC link
H_MPLPage = re.sub(r'\[Back to Table of Contents\]',"",H_MPLPage)

In [139]:
# Remove all numbers
H_MPLNum = re.sub(r'[0-9].*?',"",H_MPLPage)

In [140]:
# Remove weird dots
H_MPLDot = re.sub(r'°',"",H_MPLNum)

# Remove all punctuation
H_MPLDot = re.sub(r'[^\w\s]',"",H_MPLDot)

In [141]:
# Remove white space
H_MPLClean = re.sub(r'\n'," ",H_MPLDot)

# Remove Roman numerals
H_MPLClean = re.sub(r'\b[IVXLCDM]+\b',"",H_MPLClean)

# Remove double spaces +
H_MPLClean = re.sub(r'\s\s+'," ",H_MPLClean)

# Lowercase
H_MPLClean = H_MPLClean.lower()

print(H_MPLClean[:2000])

david hume essays moral political literary lf ed part essays moral political and literary essay of the delicacy of taste and passion essay of the liberty of the press essay that politics may be reduced to a science essay of the first principles of government essay of the origin of government essay of the independency of parliament essay whether the british government inclines more to absolute monarchy or to a republic essay of parties in general essay of the parties of great britain essay of superstition and enthusiasm essay of the dignity or meanness of human nature essay of civil liberty essay of eloquence essay of the rise and progress of the arts and sciences essay the epicurean essay the stoic essay the platonist essay the sceptic essay of polygamy and divorces essay of simplicity and refinement in writing essay of national characters essay of tragedy essay of the standard of taste part essays moral political and literary essay of commerce essay of refinement in the arts essay of 

In [142]:
# Call the lemmatization function
H_MPLClean = Lemmatize(H_MPLClean)

In [143]:
# Test lemmatized output
print(H_MPLClean[:2000])

david hume essay moral political literary lf ed part essay moral political and literary essay of the delicacy of taste and passion essay of the liberty of the press essay that politics may be reduce to a science essay of the first principles of government essay of the origin of government essay of the independency of parliament essay whether the british government incline more to absolute monarchy or to a republic essay of party in general essay of the party of great britain essay of superstition and enthusiasm essay of the dignity or meanness of human nature essay of civil liberty essay of eloquence essay of the rise and progress of the arts and sciences essay the epicurean essay the stoic essay the platonist essay the sceptic essay of polygamy and divorce essay of simplicity and refinement in write essay of national character essay of tragedy essay of the standard of taste part essay moral political and literary essay of commerce essay of refinement in the arts essay of money essay o

In [144]:
# Write out the clean version of the text file
file = open("Clean/Hume_EssaysMoralPoliticalLiteraryCLEAN.txt", "w", encoding="utf-8") 
file.write(H_MPLClean) 
file.close() 

## "An Enquiry Concerning Human Understanding", David Hume

In [145]:
with open("CorpusComplete/Hume_HumanUnderstanding.txt", "r", encoding="utf-8") as file:
    H_HumUnd = file.read()

In [146]:
# Remove header information
H_HumUndPage = re.sub(r'[0-9]/David Hume',"",H_HumUnd)
H_HumUndPage = re.sub(r'Enquiry Concerning Human Understanding/[0-9]',"",H_HumUndPage)

In [147]:
# Remove all numbers
H_HumUndNum = re.sub(r'[0-9].*?',"",H_HumUndPage)

In [148]:
# Remove all punctuation
H_HumUndDot = re.sub(r'[^\w\s]',"",H_HumUndNum)

In [149]:
# Remove white space
H_HumUndClean = re.sub(r'\n'," ",H_HumUndDot)

# Remove Roman numerals
H_HumUndClean = re.sub(r'\b[IVXLCDM]+\b',"",H_HumUndClean)

# Remove double spaces +
H_HumUndClean = re.sub(r'\s\s+'," ",H_HumUndClean)

# Lowercase
H_HumUndClean = H_HumUndClean.lower()

print(H_HumUndClean[:2000])

an enquiry concerning human understanding david hume contents sect of the different species of philosophy sect of the origin of ideas sect of the association of ideas sect sceptical doubts concerning the operations of the understanding sect sceptical solution of these doubts sect of probability sect of the idea of necessary connexion sect of liberty and necessity sect of the reason of animals sect of miracles sect of a particular providence and of a future state sect of the academical or sceptical philosophy notes sect of the different species of philosophy moral philosophy or the science of human nature may be treated after two different manners each of which has its peculiar merit and may contribute to the entertainment instruction and reformation of mankind the one considers man chiefly as born for action and as influenced in his measures by taste and sentiment pursuing one object and avoiding another according to the value which these objects seem to possess and according to the li

In [150]:
# Call the lemmatization function
H_HumUndClean = Lemmatize(H_HumUndClean)

In [151]:
# Test lemmatized output
print(H_HumUndClean[:2000])

an enquiry concern human understand david hume content sect of the different species of philosophy sect of the origin of ideas sect of the association of ideas sect sceptical doubt concern the operations of the understand sect sceptical solution of these doubt sect of probability sect of the idea of necessary connexion sect of liberty and necessity sect of the reason of animals sect of miracles sect of a particular providence and of a future state sect of the academical or sceptical philosophy note sect of the different species of philosophy moral philosophy or the science of human nature may be treat after two different manners each of which have its peculiar merit and may contribute to the entertainment instruction and reformation of mankind the one consider man chiefly as bear for action and as influence in his measure by taste and sentiment pursue one object and avoid another accord to the value which these object seem to possess and accord to the light in which they present themse

In [152]:
# Write out the clean version of the text file
file = open("Clean/Hume_HumanUnderstandingCLEAN.txt", "w", encoding="utf-8") 
file.write(H_HumUndClean) 
file.close() 

## "Dialogues Concerning Natural Religion", David Hume

In [153]:
with open("CorpusComplete/Hume_NaturalReligion.txt", "r", encoding="utf-8") as file:
    H_NatRel = file.read()

In [154]:
# Remove header information
H_NatRelPage = re.sub(r'Dialogues concerning Natural Religion David Hume Part [0-9]',"",H_NatRel)

In [155]:
# Remove "Part" information
H_NatRelNum = re.sub(r'Part [0-9].*?',"",H_NatRelPage)

# Remove all numbers
H_NatRelNum = re.sub(r'[0-9].*?',"",H_NatRelNum)

In [156]:
# Remove weird dots
H_NatRelDot = re.sub(r'•',"",H_NatRelNum)
H_NatRelDot = re.sub(r'·',"",H_NatRelDot)

# Remove all punctuation
H_NatRelDot = re.sub(r'[^\w\s]',"",H_NatRelDot)

In [157]:
# Remove white space
H_NatRelClean = re.sub(r'\n'," ",H_NatRelDot)

# Remove double spaces +
H_NatRelClean = re.sub(r'\s\s+'," ",H_NatRelClean)

# Lowercase
H_NatRelClean = H_NatRelClean.lower()

print(H_NatRelClean[:2000])

dialogues concerning natural religion david hume contents letter from pamphilus to hermippus dialogues concerning natural religion david hume dialogues concerning natural religion david hume pamphilus to hermippus letter from pamphilus to hermippus it has been remarked that though the ancient philosophers mostly taught through dialogues the dialogue form hasnt been much used in recent times and has seldom succeeded when people have tried it there is a good reason for this philosophical enquirers these days are expected to produce precise and orderly arguments and someone aiming at those will naturally proceed with a methodical exposition in which he can right at the outset explain the point he wants to establish and then proceed without interruption to present his proofs of it it hardly seems natural to present a system in conversation and there is also another disadvantage of the dialogue form by departing from the direct style of composition the dialoguewriter hopes to give a freer a

In [158]:
# Call the lemmatization function
H_NatRelClean = Lemmatize(H_NatRelClean)

In [159]:
# Test lemmatized output
print(H_NatRelClean[:2000])

dialogues concern natural religion david hume content letter from pamphilus to hermippus dialogues concern natural religion david hume dialogues concern natural religion david hume pamphilus to hermippus letter from pamphilus to hermippus it have be remark that though the ancient philosophers mostly teach through dialogues the dialogue form hasnt be much use in recent time and have seldom succeed when people have try it there be a good reason for this philosophical enquirers these days be expect to produce precise and orderly arguments and someone aim at those will naturally proceed with a methodical exposition in which he can right at the outset explain the point he want to establish and then proceed without interruption to present his proof of it it hardly seem natural to present a system in conversation and there be also another disadvantage of the dialogue form by depart from the direct style of composition the dialoguewriter hop to give a freer air to his performance and to avoid 

In [160]:
# Write out the clean version of the text file
file = open("Clean/Hume_NaturalReligionCLEAN.txt", "w", encoding="utf-8") 
file.write(H_NatRelClean) 
file.close() 

## "An Enquiry into the Sources of Morals", David Hume

In [161]:
with open("CorpusComplete/Hume_SourcesofMorals.txt", "r", encoding="utf-8") as file:
    H_SourceOf = file.read()

In [162]:
# Remove header information
# Was able to locate this information from the TOC
H_SourceOfPage = re.sub(r'Sources of Morals David Hume 1: General sources of morals',"",H_SourceOf)
H_SourceOfPage = re.sub(r'Sources of Morals David Hume 2: Benevolence',"",H_SourceOfPage)
H_SourceOfPage = re.sub(r'Sources of Morals David Hume 3: Justice',"",H_SourceOfPage)
H_SourceOfPage = re.sub(r'Sources of Morals David Hume 4: Political society',"",H_SourceOfPage)
H_SourceOfPage = re.sub(r'Sources of Morals David Hume 5: Why utility pleases',"",H_SourceOfPage)
H_SourceOfPage = re.sub(r'Sources of Morals David Hume 6: Qualities useful to ourselves',"",H_SourceOfPage)
H_SourceOfPage = re.sub(r'Sources of Morals David Hume 7: Qualities immediately agreeable to ourselves',"",H_SourceOfPage)
H_SourceOfPage = re.sub(r'Sources of Morals David Hume 8. Qualities immediately agreeable to others',"",H_SourceOfPage)
H_SourceOfPage = re.sub(r'Sources of Morals David Hume 9: Conclusion',"",H_SourceOfPage)
H_SourceOfPage = re.sub(r'Sources of Morals David Hume Appendix 1. Moral sentiment',"",H_SourceOfPage)
H_SourceOfPage = re.sub(r'Sources of Morals David Hume Appendix 2. Self-love',"",H_SourceOfPage)
H_SourceOfPage = re.sub(r'Sources of Morals David Hume Appendix 3. More about justice',"",H_SourceOfPage)
H_SourceOfPage = re.sub(r'Sources of Morals David Hume Appendix 4. Verbal disputes',"",H_SourceOfPage)

In [163]:
# Remove "Section" + "Part" information
H_SourceOfNum = re.sub(r'Section [0-9].*?',"",H_SourceOfPage)
H_SourceOfNum = re.sub(r'Part [0-9].*?',"",H_SourceOfNum)

# Remove all numbers
H_SourceOfNum = re.sub(r'[0-9].*?',"",H_SourceOfNum)

In [164]:
# Remove weird dots
H_SourceOfDot = re.sub(r'•',"",H_SourceOfNum)
H_SourceOfDot = re.sub(r'·',"",H_SourceOfDot)

# Remove all punctuation
H_SourceOfDot = re.sub(r'[^\w\s]',"",H_SourceOfDot)

In [165]:
# Remove white space
H_SourceOfClean = re.sub(r'\n'," ",H_SourceOfDot)

# Remove double spaces +
H_SourceOfClean = re.sub(r'\s\s+'," ",H_SourceOfClean)

# Lowercase
H_SourceOfClean = H_SourceOfClean.lower()

print(H_SourceOfClean[:2000])

an enquiry into the sources of morals david hume sources of morals david hume contents the general sources of morals benevolence justice political society why utility pleases qualities useful to ourselves qualities immediately agreeable to ourselves qualities immediately agreeable to others conclusion appendix moral sentiment or feeling appendix selflove appendix further points about justice appendix some verbal disputes most of the principles and reasonings contained in this volume were published in a work in three volumes called a treatise of human nature a work which the author had projected before he left college and which he wrote and published soon after it wasnt a success and he came to realize that he had gone to the press too early so he reworked the whole thing in the following pieces in which he hopes to have corrected some faults in his earlier reasoning and more in his writing the enquiry concerning human understanding the dissertation on the passions and the present work 

In [166]:
# Call the lemmatization function
H_SourceOfClean = Lemmatize(H_SourceOfClean)

In [167]:
# Test lemmatized output
print(H_SourceOfClean[:2000])

an enquiry into the source of morals david hume source of morals david hume content the general source of morals benevolence justice political society why utility please qualities useful to ourselves qualities immediately agreeable to ourselves qualities immediately agreeable to others conclusion appendix moral sentiment or feel appendix selflove appendix further point about justice appendix some verbal dispute most of the principles and reason contain in this volume be publish in a work in three volumes call a treatise of human nature a work which the author have project before he leave college and which he write and publish soon after it wasnt a success and he come to realize that he have go to the press too early so he rework the whole thing in the follow piece in which he hop to have correct some fault in his earlier reason and more in his write the enquiry concern human understand the dissertation on the passions and the present work be publish in one volume yet several writers wh

In [168]:
# Write out the clean version of the text file
file = open("Clean/Hume_SourcesofMoralsCLEAN.txt", "w", encoding="utf-8") 
file.write(H_SourceOfClean) 
file.close() 

## "An Essay Concerning Human Understanding", John Locke 

In [169]:
with open("CorpusComplete/Locke_HumanUnderstanding.txt", "r", encoding="utf-8") as file:
    L_HumUnd = file.read()

In [170]:
# Remove header information
L_HumUndPage = re.sub(r'[0-9]\nJohn Locke',"",L_HumUnd)
L_HumUndPage = re.sub(r'[0-9]\nHuman Understanding',"",L_HumUndPage)

In [171]:
# Remove TOC/Chapter Information
L_HumUndTOC = re.sub(r'INTRODUCTION',"",L_HumUndPage)
L_HumUndTOC = re.sub(r'BOOK (M{1,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})|M{0,4}(CM|C?D|D?C{1,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})|M{0,4}(CM|CD|D?C{0,3})(XC|X?L|L?X{1,3})(IX|IV|V?I{0,3})|M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|I?V|V?I{1,3}))',"",L_HumUndTOC)
L_HumUndTOC = re.sub(r'Chapter (M{1,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})|M{0,4}(CM|C?D|D?C{1,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})|M{0,4}(CM|CD|D?C{0,3})(XC|X?L|L?X{1,3})(IX|IV|V?I{0,3})|M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|I?V|V?I{1,3}))',"",L_HumUndTOC)

In [172]:
# Remove all numbers
L_HumUndNum = re.sub(r'[0-9].*?',"",L_HumUndTOC)

In [173]:
# Remove all punctuation
L_HumUndDot = re.sub(r'[^\w\s]',"",L_HumUndNum)

# Remove white space
L_HumUndClean = re.sub(r'\n'," ",L_HumUndDot)

# Remove double spaces +
L_HumUndClean = re.sub(r'\s\s+'," ",L_HumUndClean)

# Lowercase
L_HumUndClean = L_HumUndClean.lower()

print(L_HumUndClean[:2000])

an essay concerning human understanding by john locke contents an essay concerning human understanding epistle to the reader neither principles nor ideas are innate no innate speculative principles no innate practical principles other considerations concerning innate principles both speculative and practical of ideas of ideas in general and their original of simple ideas of simple ideas of sense idea of solidity of simple ideas of divers senses i of simple ideas of reflection ii of simple ideas of both sensation and reflection iii some further considerations concerning our simple ideas of sensation of perception of retention of discerning and other operations of the mind of complex ideas complex ideas of simple modesand first of the simple modes of the idea of space idea of duration and its simple modes ideas of duration and expansion considered together idea of number of infinity other simple modes of the modes of thinking of modes of pleasure and pain of power of mixed modes of our c

In [174]:
# Call the lemmatization function
L_HumUndClean = Lemmatize(L_HumUndClean)

In [175]:
# Test lemmatized output
print(L_HumUndClean[:2000])

an essay concern human understand by john locke content an essay concern human understand epistle to the reader neither principles nor ideas be innate no innate speculative principles no innate practical principles other considerations concern innate principles both speculative and practical of ideas of ideas in general and their original of simple ideas of simple ideas of sense idea of solidity of simple ideas of divers sense i of simple ideas of reflection ii of simple ideas of both sensation and reflection iii some further considerations concern our simple ideas of sensation of perception of retention of discern and other operations of the mind of complex ideas complex ideas of simple modesand first of the simple modes of the idea of space idea of duration and its simple modes ideas of duration and expansion consider together idea of number of infinity other simple modes of the modes of think of modes of pleasure and pain of power of mix modes of our complex ideas of substances of c

In [176]:
# Write out the clean version of the text file
file = open("Clean/Locke_HumanUnderstandingCLEAN.txt", "w", encoding="utf-8") 
file.write(L_HumUndClean) 
file.close() 

## "Two Treatises of Government", John Locke

In [177]:
with open("CorpusComplete/Locke_TwoTreatises.txt", "r", encoding="utf-8") as file:
    L_TwoTrea = file.read()

In [178]:
# Remove header information
L_HumTwoTreaPage = re.sub(r'[0-9]/John Locke',"",L_TwoTrea)
L_HumTwoTreaPage = re.sub(r'Two Treatises of Government/[0-9]',"",L_HumTwoTreaPage)

In [179]:
# Remove Chapter Information
L_HumTwoTreaCh = re.sub(r'§[0-9].',"",L_HumTwoTreaPage)
L_HumTwoTreaCh = re.sub(r'Chapter (M{1,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})|M{0,4}(CM|C?D|D?C{1,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})|M{0,4}(CM|CD|D?C{0,3})(XC|X?L|L?X{1,3})(IX|IV|V?I{0,3})|M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|I?V|V?I{1,3}))',"",L_HumTwoTreaCh)

In [180]:
# Remove all numbers
L_HumTwoTreaNum = re.sub(r'[0-9].*?',"",L_HumTwoTreaCh)

In [181]:
# Remove all punctuation
L_HumTwoTreaDot = re.sub(r'[^\w\s]',"",L_HumTwoTreaNum)

# Remove white space
L_HumTwoTreaClean = re.sub(r'\n'," ",L_HumTwoTreaDot)

# Remove double spaces +
L_HumTwoTreaClean = re.sub(r'\s\s+'," ",L_HumTwoTreaClean)

# Lowercase
L_HumTwoTreaClean = L_HumTwoTreaClean.lower()

print(L_HumTwoTreaClean[:2000])

two treatises of government in the former the false principles and foundation of sir robert filmer and his followers are detected and overthrown the latter is an essay concerning the original extent and end of civil government john locke the preface reader thou hast here the beginning and end of a discourse concerning government what fate has otherwise disposed of the papers that should have filled up the middle and were more than all the rest it is not worth while to tell thee these which remain i hope are sufficient to establish the throne of our great restorer our present king william to make good his title in else consent of the people which being the only one of all lawful governments he has more fully and clearly than any prince in christendom and to justify to the world the people of england whose love of their just and natural rights with their resolution to preserve them saved the nation when it war on the very brink of slavery and ruin if these papers have that evidence i fla

In [182]:
# Call the lemmatization function
L_HumTwoTreaClean = Lemmatize(L_HumTwoTreaClean)

In [183]:
# Test lemmatized output
print(L_HumTwoTreaClean[:2000])

two treatises of government in the former the false principles and foundation of sir robert filmer and his followers be detect and overthrow the latter be an essay concern the original extent and end of civil government john locke the preface reader thou hast here the begin and end of a discourse concern government what fate have otherwise dispose of the paper that should have fill up the middle and be more than all the rest it be not worth while to tell thee these which remain i hope be sufficient to establish the throne of our great restorer our present king william to make good his title in else consent of the people which be the only one of all lawful governments he have more fully and clearly than any prince in christendom and to justify to the world the people of england whose love of their just and natural right with their resolution to preserve them save the nation when it war on the very brink of slavery and ruin if these paper have that evidence i flatter myself be to be find

In [184]:
# Write out the clean version of the text file
file = open("Clean/Locke_TwoTreatisesCLEAN.txt", "w", encoding="utf-8") 
file.write(L_HumTwoTreaClean) 
file.close() 

## Total Character Count and Concatenation

In [185]:
print("Berkeley: ")
len(B_HumKnowClean + B_TheoryOfClean + B_ThreeDiaClean + B_AlcClean)

Berkeley: 


986707

In [186]:
# Concatenate all text files for analysis
Berkeley = B_HumKnowClean + B_TheoryOfClean + B_ThreeDiaClean + B_AlcClean

file = open("FullText/BerkeleyComplete.txt", "w", encoding="utf-8") 
file.write(Berkeley) 
file.close() 

In [187]:
print("Hume: ")
len(H_MPLClean + H_HumUndClean + H_NatRelClean + H_SourceOfClean)

Hume: 


2094204

In [188]:
# Concatenate all text files for analysis
Hume = H_MPLClean + H_HumUndClean + H_NatRelClean + H_SourceOfClean

file = open("FullText/HumeComplete.txt", "w", encoding="utf-8") 
file.write(Hume) 
file.close() 

In [189]:
print("Locke: ")
len(L_HumUndClean + L_HumTwoTreaClean)

Locke: 


2040978

In [190]:
# Concatenate all text files for analysis
Locke = L_HumUndClean + L_HumTwoTreaClean

file = open("FullText/LockeComplete.txt", "w", encoding="utf-8") 
file.write(Locke) 
file.close() 