<a href="https://colab.research.google.com/github/kcalizadeh/PDP_data_processing/blob/master/new_text_introduction_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports and Mounting Drive

In [1]:
# this cell mounts drive, sets the correct directory, then imports all functions
# and relevant libraries via the functions.py file
from google.colab import drive
import sys

drive.mount('/gdrive',force_remount=True)

drive_path = '/gdrive/MyDrive/Colab_Projects/philosophy_data_project'

sys.path.append(drive_path)

Mounted at /gdrive


In [2]:
from import_functions import *

In [3]:
import spacy.cli
spacy.cli.download("en_core_web_lg")
import en_core_web_lg
nlp = en_core_web_lg.load()

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


###Load the Text and Clip Front and End Matter

In [5]:
# if you are deleting an old text that was added here, make sure it is in the 
# primary database construction notebook
hobbes_leviathan = get_text(drive_path + '/phil_txts/hobbes_leviathan.txt')

In [12]:
hobbes_leviathan = hobbes_leviathan[198:]

### Clean the Text



In [13]:
def baseline_clean(to_correct, 
                   capitals=True, 
                   bracketed_fn=False, 
                   odd_words_dict={}):
  # remove utf8 encoding characters and some punctuations
  result = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff\xad\x0c6§\\\£\Â*_<>""⎫•{}Γ~]', ' ', to_correct)
  result = re.sub(r'[\u2014\u2013\u2012-]', ' ', result)

  # replace whitespace characters with actual whitespace
  result = re.sub(r'\s', ' ', result)

  # replace odd quotation marks with a standard
  result = re.sub(r'[‘’“”]', "'", result)

  # replace the ﬀ, ﬃ and ﬁ with the appropriate counterparts
  result = re.sub(r'ﬀ', 'ff', result)
  result = re.sub(r'ﬁ', 'fi', result)
  result = re.sub(r'ﬃ', 'ffi', result)

  # replace some accented characters for ease of searching
  result = re.sub(r'é', 'e', result)

  # remove or standardize some recurring common and meaninless words/phrases
  result = re.sub(r'\s*This\s*page\s*intentionally\s*left\s*blank\s*', ' ', result)
  result = re.sub(r'(?i)Aufgabe\s+', ' ', result)
  result = re.sub(r',*\s+cf\.', ' ', result)

  # some texts have footnotes conveniently in brackets - this removes them all, 
  # with a safety measure for unpaired brackets, and deletes all brackets afterwards
  if bracketed_fn:
    result = re.sub(r'\[.{0,300}\]|{.{0,300}}|{.{0,300}\]|\[.{0,300}}', ' ', result)
  result = re.sub(r'[\[\]{}]', ' ', result)

  # unify some abbreviations
  result = re.sub(r'&', 'and', result)
  result = re.sub(r'\se\.g\.\s', ' eg ', result)
  result = re.sub(r'\si\.e\.\s', ' ie ', result)
  result = re.sub('coroll\.', 'coroll', result)
  result = re.sub('pt\.', 'pt', result)

  # remove roman numerals, first capitalized ones
  result = re.sub(r'\s((I{2,}V*X*\.*)|(IV\.*)|(IX\.*)|(V\.*)|(V+I*\.*)|(X+L*V*I*]\.*))\s', ' ', result)
  # then lowercase
  result = re.sub(r'\s((i{2,}v*x*\.*)|(iv\.*)|(ix\.*)|(v\.*)|(v+i*\.*)|(x+l*v*i*\.*))\s', ' ', result)

  # remove periods and commas flanked by numbers
  result = re.sub(r'\d\.\d', ' ', result)
  result = re.sub(r'\d,\d', ' ', result)

  # remove the number-letter-number pattern used for many citations
  result = re.sub(r'\d*\w{,2}\d', ' ', result)

  # remove numerical characters
  result = re.sub(r'\d+', ' ', result)

  # remove words of 2+ characters that are entirely capitalized 
  # (these are almost always titles, headings, or speakers in a dialogue)
  # remove capital I's that follow capital words - these almost always roman numerals
  # some texts do use these capitalizations meaningfully, so we make this optional
  if capitals:
    result = re.sub(r'[A-Z]{2,}\s+I', ' ', result)
    result = re.sub(r'[A-Z]{2,}', ' ', result)

  # remove isolated colons and semicolons that result from removal of titles
  result = re.sub(r'\s+:\s*', ' ', result)
  result = re.sub(r'\s+;\s*', ' ', result)

  # remove isolated letters (do it several times because strings of isolated letters do not get captured properly)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)

  # remove isolated letters at the end of sentences or before commas
  result = re.sub(r'\s[^aI]\.', '.', result)
  result = re.sub(r'\s[^aI],', ',', result)

  # deal with spaces around periods and commas
  result = re.sub(r'\s+,\s+', ', ', result)
  result = re.sub(r'\s+\.\s+', '. ', result)

  # remove empty parantheses
  result = re.sub(r'(\(\s*\.*\s*\))|(\(\s*,*\s*)\)', ' ', result)
  result = re.sub(r'\.\)\.', '.', result)
  result = re.sub(r'\.\(\.', '.', result)

  # reduce multiple periods, commas, or whitespaces into a single one
  result = re.sub(r'\.+', '.', result)
  result = re.sub(r',+', ',', result)
  result = re.sub(r'\s+', ' ', result)

  # deal with isolated problem cases discovered in the data:
  for key in odd_words_dict.keys():
    result = re.sub(r''+key+'', odd_words_dict[key], result)

  return result

In [None]:
# note extras like bracketed footnotes or specific words to remove


In [14]:
# build a dictionary for the book
hobbes_leviathan_dict = {
    'author': 'Hobbes',
    'title': 'Leviathan',
    'text': hobbes_leviathan,
    'school': 'Hobbes',
    'words to remove': [],
    'remove capitals': True,
    'bracketed fn': False,
    'original date': 1651,
    'corpus date': 1651
}

In [15]:
#@title Oddities Dictionary for Cleaning
# a dictionary of oddities to clean up
odd_words_dict = {'\sderstanding': 'derstanding',
                  '\sditference\s': ' difference ',
                  '\sforthe\s': ' for the ',
                  '\sject': 'ject',
                  '\sSure ly\s': ' Surely ',
                  '\spiness': 'piness',
                  '\sjects': 'jects', 
                  '\sness': 'ness',
                  '\schil dren\s': ' children ',
                  '\sper\scent\s': ' percent ',
                  '\sper\scent\.': ' percent.',
                  '\sper\scent,': ' percent,',
                  '\wi\son': 'ion',
                  '\spri\sori': ' priori',
                  '\stences\s': 'tences ',
                  '\sprincipleb': ' principle',
                  '\ssciousness': 'sciousness',
                  '\stion': 'tion',
                  '\spri\s': ' pri',
                  '\scluding': 'cluding',
                  '\sdom': 'dom',
                  '\sers': 'ers',
                  '\scritiq\s': ' critique ',
                  '\ssensati\s': ' sensation ',
                  '(?i)\syou\sll': " you'll",
                  '\sI\sll': " I'll",
                  '(?i)\swe\sll': " we'll",
                  '(?i)he\sll': " he'll",
                  '(?i)who\sll': "who'll",
                  '(?i)\sthere\sll\s': " there'll ",
                  '\seduca\s': ' education ',
                  '\slity\s': 'lity ',
                  '\smultaneously\s': 'multaneously ',
                  '\stically\s': 'tically ',
                  '\sDa\ssein\s': ' Dasein ',
                  '(?i)\sthey\sll\s': " they'll ",
                  '(?i)\sin\tum\s': ' in turn ',
                  '\scon~\s': ' con',
                  '\sà\s': ' a ',
                  '\sjor\s': ' for ',
                  '\sluminating\s': 'luminating ',
                  '\sselj\s': ' self ',
                  '\stial\s': 'tial ',
                  '\sversal\s': 'versal ',
                  '\sexis\st': ' exist',
                  '\splauded\s': 'plauded ',
                  '\suiry\s': 'uiry ',
                  '\svithin\s': ' within ',
                  '\soj\s': ' of ',
                  '\sposi\st': ' posit',
                  '\sra\sther\s': ' rather ',
                  '(?i)\sthat\sll\s': " that'll ",
                  '(?i)\sa\sll\s': ' all ',
                  '\so\sther\s': ' other ',
                  '\sra\sther\s': ' rather ',
                  '\snei\sther\s': ' neither ',
                  '\sei\sther\s': ' either ',
                  '\sfur\sther\s': ' further ',
                  '\sano\sther': ' another ',
                  '\sneces\s': ' neces',
                  'u\slar\s': 'ular ',
                  '\sference\s': 'ference ',
                  '(?i)it\sll\s': "it'll ",
                  '\stoge\sther': ' together ',
                  '\sknowledgeb\s': ' knowledge ',
                  'r\stain\s': 'rtain ',
                  'on\stain\s': 'ontain',
                  '(?i)j\sect\s': 'ject',
                  '\sob\sect\s': ' object ',
                  '\sbtle\s': 'btle ',
                  '\snition\s': 'nition ',
                  '\sdering\s': 'dering ', 
                  '\sized\s': 'ized ',
                  '\sther\shand': ' other hand',
                  '\ture\s': 'ture ',
                  '\sabso\sl': ' absol',
                  '\stly\s': 'tly ',
                  '\serty\s': 'erty ',
                  '\sobj\se': ' obj',
                  '\sffiir\s': ' for ',
                  '\sndeed\s': ' indeed ',
                  '\sfonn\s': ' form ',
                  '\snally\s': 'nally ',
                  'ain\sty\s': 'ainty ',
                  'ici\sty\s': 'icity ',
                  '\scog\sni': ' cogni',
                  '\sacc\s': ' acc',
                  '\sindi\svid\sual': ' individual', 
                  '\sintu\sit': ' intuit',
                  'r\sance\s': 'rance ',
                  '\ssions\s': 'sions ',
                  '\sances\s': 'ances ',
                  '\sper\sception\s': ' perception ',
                  '\sse\sries\s': ' series ',
                  '\sque\sries\s': ' queries ',
                  '\sessary\s': 'essary ',
                  '\sofa\s': ' of a ',
                  '\scer\stainty\s': ' certainty ',
                  'ec\stivity\s': 'ectivity ',
                  '\stivity\s': 'tivity ',
                  '\slation\s': 'lation ',
                  '\sir\sr': ' irr',
                  '\ssub\sstance\s': ' substance ',
                  'sec\sond\s': 'second ',
                  '\s\.rv': '',
                  '\story\s': 'tory ',
                  '\sture\s': 'ture ',
                  '\sminate\s': 'minate ',
                  '\sing\s': 'ing ',
                  '\splicity\s': 'plicity ',
                  '\ssimi\slar\s': ' similar ',
                  '\scom\smunity\s': ' community ',
                  '\sitselfa\s': ' itself a ',
                  '\ssimp\s': ' simply ',
                  '\scon\stex': ' contex',
                  '\scon\sseq': ' conseq',
                  '\scon\stai': ' contai',
                  '\sofwhat\s': ' of what ',
                  '\sui\s': 'ui',
                  '\sofan\s': ' of an ',
                  '\saccor\sdance\s': ' accordance ',
                  '\stranscen\sdental\s': ' transcendental ',
                  '\sap\spearances\s': ' appearances ',
                  'e\squences\s': 'equences ',
                  '\sorits\s': ' or its ',
                  '\simma\sn': ' imman',
                  '\seq\sua': ' equa',
                  '\simpl\sied\s': ' implied ',
                  '\sbuta\s': ' but a ',
                  '\sa\snd\s': ' and ',
                  '\sence\s': 'ence ',
                  '\stain\s': 'tain ',
                  '\sunder\sstanding\s': ' understanding ',
                  'i\sence\s': 'ience ',
                  'r\sence\s': 'rence ',
                  '\stical\s': 'tical ',
                  '\sobjectsb\s': ' objects ',
                  '\stbe\s': ' the ',
                  '\smul\st': ' mult',
                  '\sgen\seral\s': ' general ',
                  '\suniver\ssal\s': ' universal ',
                  '\scon\stent\s': ' content ',
                  '\spar\sticular\s': ' particular ',
                  'ver\ssity\s': 'versity ',
                  '\sCritiq\s': ' Critique ',
                  '\sphilo\ssophy\s': ' philosophy ',
                  '\seq\s': ' eq'}

In [16]:
# a function that takes the dictionary and returns a dataframe of sentences
def from_raw_to_df(text_dict):
  nlp.max_length = 9000000
  text = text_dict['text']
  text = remove_words(text, text_dict['words to remove'])
  text = baseline_clean(text, capitals=text_dict['remove capitals'],
                        bracketed_fn=text_dict['bracketed fn'],
                        odd_words_dict=odd_words_dict)
  text_nlp = nlp(text, disable=['ner'])
  text_df = pd.DataFrame(columns=['title', 'author', 'school', 'sentence_spacy'])
  text_df['sentence_spacy'] = list(text_nlp.sents)
  text_df['author'] = text_dict['author']
  text_df['title'] = text_dict['title']
  text_df['school'] = text_dict['school']
  text_df['original_publication_date'] = text_dict['original date']
  text_df['corpus_edition_date'] = text_dict['corpus date']
  text_df['sentence_str'] = text_df['sentence_spacy'].apply(lambda x: ''.join(list(str(x))))
  return text_df

In [19]:
# use the function
f_t_df = from_raw_to_df(hobbes_leviathan_dict)
df = f_t_df

In [20]:
# checking the result
pd.options.display.max_colwidth = 200
df.sample(10)

Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str
5574,Leviathan,Hobbes,Hobbes,"(Therefore, the, safest, way, is, to, believe, that, by, the, descending, of, the, dove, upon, the, Apostles, ,, and, by, Christ, 's, breathing, on, them, when, he, gave, them, the, Holy, Ghost, ,...",1651,1651,"Therefore the safest way is to believe that by the descending of the dove upon the Apostles, and by Christ's breathing on them when he gave them the Holy Ghost, and by the giving of it by impositi..."
4068,Leviathan,Hobbes,Hobbes,"(In, this, Trinity, on, earth, ,, the, unity, is, not, of, the, thing, ;, for, the, spirit, ,, the, water, ,, and, the, blood, are, not, the, same, substance, ,, though, they, give, the, same, tes...",1651,1651,"In this Trinity on earth, the unity is not of the thing; for the spirit, the water, and the blood are not the same substance, though they give the same testimony: but in the Trinity of heaven, the..."
774,Leviathan,Hobbes,Hobbes,"(For, duels, also, are, many, times, effects, of, courage, ,, and, the, ground, of, courage, is, always, strength, or, skill, ,, which, are, power, ;, though, for, the, most, part, they, be, effec...",1651,1651,"For duels also are many times effects of courage, and the ground of courage is always strength or skill, which are power; though for the most part they be effects of rash speaking, and of the fear..."
2697,Leviathan,Hobbes,Hobbes,"(And, seeing, the, end, of, punishing, is, not, revenge, and, discharge, of, choler, ,, but, correction, either, of, the, offender, or, of, others, by, his, example, ,, the, severest, punishments,...",1651,1651,"And seeing the end of punishing is not revenge and discharge of choler, but correction either of the offender or of others by his example, the severest punishments are to be inflicted for those cr..."
2637,Leviathan,Hobbes,Hobbes,"(Lastly, ,, they, are, to, be, taught, that, not, only, the, unjust, facts, ,, but, the, designs, and, intentions, to, do, them, ,, though, by, accident, hindered, ,, are, injustice, ;, which, con...",1651,1651,"Lastly, they are to be taught that not only the unjust facts, but the designs and intentions to do them, though by accident hindered, are injustice; which consisteth in the pravity of the will, as..."
5365,Leviathan,Hobbes,Hobbes,"(In, the, other, places, which, he, allegeth, out, of, the, Old, Testament, ,, there, is, not, so, much, as, any, show, or, colour, of, proof, .)",1651,1651,"In the other places which he allegeth out of the Old Testament, there is not so much as any show or colour of proof."
2954,Leviathan,Hobbes,Hobbes,"(Joshua, had, set, up, twelve, stones, in, the, midst, of, Jordan, ,, for, a, monument, of, their, passage, ;, of, which, the, writer, saith, thus, ,, They, are, there, unto, this, day, for, unto,...",1651,1651,"Joshua had set up twelve stones in the midst of Jordan, for a monument of their passage; of which the writer saith thus, They are there unto this day for unto this day is a phrase that signifieth ..."
1469,Leviathan,Hobbes,Hobbes,"(Besides, ,, there, is, no, favourite, of, a, monarch, which, can, not, as, well, succour, his, friends, as, hurt, his, enemies, :, but, orators, ,, that, is, to, say, ,, favourites, of, sovereign...",1651,1651,"Besides, there is no favourite of a monarch which cannot as well succour his friends as hurt his enemies: but orators, that is to say, favourites of sovereign assemblies, though they have great po..."
2860,Leviathan,Hobbes,Hobbes,"(there, dependeth, much, upon, supernatural, revelations, of, the, will, of, God, ,, the, ground, of, my, discourse, must, be, not, only, the, natural, word, of, God, ,, but, also, the, prophetica...",1651,1651,"there dependeth much upon supernatural revelations of the will of God, the ground of my discourse must be not only the natural word of God, but also the prophetical."
1427,Leviathan,Hobbes,Hobbes,"(For, in, the, sovereignty, is, the, fountain, of, honour, .)",1651,1651,For in the sovereignty is the fountain of honour.


In [21]:
len(df)

6029

#### Remove Short Sentences

In [22]:
df['sentence_length'] = df['sentence_str'].map(lambda x: len(x))
num_of_short_entries = len(df[df['sentence_length'] < 20])
print(f"there are {num_of_short_entries} so-called sentences with fewer than 20 characters")
df[df['sentence_length'] < 20].sample(5)

there are 518 so-called sentences with fewer than 20 characters


Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length
2648,Leviathan,Hobbes,Hobbes,"(Hard, questions, .)",1651,1651,Hard questions.,15
4329,Leviathan,Hobbes,Hobbes,"(Exodus, ,, .)",1651,1651,"Exodus,.",8
4204,Leviathan,Hobbes,Hobbes,"(Acts, ,, .)",1651,1651,"Acts,.",6
4562,Leviathan,Hobbes,Hobbes,"(Ibid, ., ,, .)",1651,1651,"Ibid.,.",7
4588,Leviathan,Hobbes,Hobbes,"(Matthew, ,, .)",1651,1651,"Matthew,.",9


In [23]:
df = df.drop(df[df['sentence_length'] < 20].index)
len(df)

5511

#### Remove Cases of Self-Mention

In [24]:
# change the author name in this cell 

self_mentions = df[df['sentence_str'].str.contains('\s'+'Kierkegaard'.lower())]
print(len(self_mentions))
self_mentions

0


Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length


In [25]:
df = df.drop(df[df['sentence_str'].str.contains('\s'+'Augustine'.lower())].index)

len(df)

5511

#### Deal with Duplicates

In [26]:
# find the total number of duplicates
len(df['sentence_str'])-len(df['sentence_str'].drop_duplicates())

0

In [27]:
doubles_df = pd.concat(g for _, g in df.groupby("sentence_str") if len(g) > 1)
doubles_df

ValueError: ignored

In [28]:
df = df.drop(df[df['sentence_str'].duplicated(keep='first')].index)

In [29]:
len(df)

5511

#### Check for Foreign Languages

In [30]:
# checking for 'der', a common article in German
len((df[df['sentence_str'].str.contains('\sder\s')]))

0

In [31]:
# checking for 'il', a common article in French
len(df[df['sentence_str'].str.contains('\sil\s')])

0

#### Some Ad Hoc Cleaning

In [32]:
# miscellaneous nonsense sentences
df = df.drop(df[df['sentence_str'].str.contains('\spp\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\stotam\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\srree\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sflir\s')].index)
df = df.drop(df[(df['sentence_str'].str.contains('\smodis\s')) & (df['author'] != 'Kant')].index)

len(df)

5511

In [33]:
# markers of french and notes
df = df.drop(df[df['sentence_str'].str.contains('\schapitre')].index)
df = df.drop(df[df['sentence_str'].str.contains('\salisme')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sHahn')].index)

len(df)

5511

In [34]:
# some notes in Kant
df = df.drop(df[df['sentence_str'].str.contains('\sVorl\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sberschwenglich')].index)

len(df)

5511

In [35]:
# a common phrase in Plato / Aristotle footnotes
df = df.drop(df[(df['author']=='Plato') & (df['sentence_str'].str.contains('(?i)reading')) & (df['sentence_length'] < 40)].index)
df = df.drop(df[(df['author']=='Aristotle') & (df['sentence_str'].str.contains('(?i)reading')) & (df['sentence_length'] < 40)].index)

len(df)

5511

In [36]:
# mentions of Aristotle in Plato
df = df.drop(df[(df['author']=='Plato') & df['sentence_str'].str.contains('Aristotle')].index)

len(df)

5511

### Lemmatize and Tokenize

In [37]:
from gensim.utils import simple_preprocess

# use gensim to tokenize sentences
df['tokenized_txt'] = df['sentence_str'].map(lambda x: simple_preprocess(x.lower(),deacc=True,
                                                        max_len=200))

# use spacey to get intelligent lemmatization
def lemmatize_sentence(sentence):
  lemmatized_txt = ''
  for word in sentence:
    lemmatized_txt += ' ' + str(word.lemma_)
  return lemmatized_txt

In [38]:
df['lemmatized_str'] = df['sentence_spacy'].apply(lemmatize_sentence)

In [39]:
df.sample(5)

Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length,tokenized_txt,lemmatized_str
5258,Leviathan,Hobbes,Hobbes,"(And, after, Noah, came, out, of, the, ark, ,, God, saith, ,, He, will, no, more, smite, omnem, animam, viventem, ,, that, is, ,, every, living, creature, .)",1651,1651,"And after Noah came out of the ark, God saith, He will no more smite omnem animam viventem, that is, every living creature.",123,"[and, after, noah, came, out, of, the, ark, god, saith, he, will, no, more, smite, omnem, animam, viventem, that, is, every, living, creature]","and after Noah come out of the ark , God saith , -PRON- will no more smite omnem animam viventem , that is , every live creature ."
4739,Leviathan,Hobbes,Hobbes,"(But, this, also, maketh, only, for, the, legislative, power, of, civil, sovereigns, :, for, the, Scribes, and, Pharisees, sat, in, Moses, ', chair, ,, but, Moses, next, under, God, was, sovereign...",1651,1651,"But this also maketh only for the legislative power of civil sovereigns: for the Scribes and Pharisees sat in Moses' chair, but Moses next under God was sovereign of the people of Israel: and ther...",352,"[but, this, also, maketh, only, for, the, legislative, power, of, civil, sovereigns, for, the, scribes, and, pharisees, sat, in, moses, chair, but, moses, next, under, god, was, sovereign, of, the...","but this also maketh only for the legislative power of civil sovereign : for the Scribes and Pharisees sit in Moses ' chair , but Moses next under God be sovereign of the people of Israel : and t..."
2440,Leviathan,Hobbes,Hobbes,"(the, representative, will, :, for, in, denying, subjection, ,, he, denies, such, punishment, as, by, the, law, hath, been, ordained, ,, and, therefore, suffers, as, an, enemy, of, the, Commonweal...",1651,1651,"the representative will: for in denying subjection, he denies such punishment as by the law hath been ordained, and therefore suffers as an enemy of the Commonwealth;",166,"[the, representative, will, for, in, denying, subjection, he, denies, such, punishment, as, by, the, law, hath, been, ordained, and, therefore, suffers, as, an, enemy, of, the, commonwealth]","the representative will : for in deny subjection , -PRON- deny such punishment as by the law hath be ordain , and therefore suffer as an enemy of the Commonwealth ;"
2603,Leviathan,Hobbes,Hobbes,"(Wherein, they, argue, as, ill, ,, as, if, the, savage, people, of, America, should, deny, there, were, any, grounds, or, principles, of, reason, so, to, build, a, house, as, to, last, as, long, a...",1651,1651,"Wherein they argue as ill, as if the savage people of America should deny there were any grounds or principles of reason so to build a house as to last as long as the materials, because they never...",223,"[wherein, they, argue, as, ill, as, if, the, savage, people, of, america, should, deny, there, were, any, grounds, or, principles, of, reason, so, to, build, house, as, to, last, as, long, as, the...","wherein -PRON- argue as ill , as if the savage people of America should deny there be any ground or principle of reason so to build a house as to last as long as the material , because -PRON- nev..."
2405,Leviathan,Hobbes,Hobbes,"(And, to, rob, a, poor, man, is, a, greater, crime, than, to, rob, a, rich, man, ,, because, it, is, to, the, poor, a, more, sensible, damage, .)",1651,1651,"And to rob a poor man is a greater crime than to rob a rich man, because it is to the poor a more sensible damage.",114,"[and, to, rob, poor, man, is, greater, crime, than, to, rob, rich, man, because, it, is, to, the, poor, more, sensible, damage]","and to rob a poor man be a great crime than to rob a rich man , because -PRON- be to the poor a more sensible damage ."


### Combine with the Old Dataframe & Export to CSV

In [40]:
# load the old version and check it out
og_df = pd.read_csv('/gdrive/MyDrive/Colab_Projects/philosophy_data_project/philosophy_data.csv')
og_df.sample(5)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,original_publication_date,corpus_edition_date,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
227172,The Crisis Of The European Sciences And Phenomenology,Husserl,phenomenology,"So it is understandable how, as a consequence of the awakened striving for philosophical knowledge, knowledge which determines the true, the objective being of the world, the empirical art of meas...","So it is understandable how, as a consequence of the awakened striving for philosophical knowledge, knowledge which determines the true, the objective being of the world, the empirical art of meas...",1936,1970,398,"so it is understandable how, as a consequence of the awakened striving for philosophical knowledge, knowledge which determines the true, the objective being of the world, the empirical art of meas...","['so', 'it', 'is', 'understandable', 'how', 'as', 'consequence', 'of', 'the', 'awakened', 'striving', 'for', 'philosophical', 'knowledge', 'knowledge', 'which', 'determines', 'the', 'true', 'the',...","so -PRON- be understandable how , as a consequence of the awaken strive for philosophical knowledge , knowledge which determine the true , the objective being of the world , the empirical art of ..."
52543,Aristotle - Complete Works,Aristotle,aristotle,"A rainbow is the reflection of a segment of the sun or of the moon, seen, as in a mirror, in a cloud which is moist, hollow, and continuous in appearance, and taking a circular form.","A rainbow is the reflection of a segment of the sun or of the moon, seen, as in a mirror, in a cloud which is moist, hollow, and continuous in appearance, and taking a circular form.",-320,1991,182,"a rainbow is the reflection of a segment of the sun or of the moon, seen, as in a mirror, in a cloud which is moist, hollow, and continuous in appearance, and taking a circular form.","['rainbow', 'is', 'the', 'reflection', 'of', 'segment', 'of', 'the', 'sun', 'or', 'of', 'the', 'moon', 'seen', 'as', 'in', 'mirror', 'in', 'cloud', 'which', 'is', 'moist', 'hollow', 'and', 'contin...","a rainbow be the reflection of a segment of the sun or of the moon , see , as in a mirror , in a cloud which be moist , hollow , and continuous in appearance , and take a circular form ."
165649,Quintessence,Quine,analytic,"So our renunciation must extend to all de re belief, and similarly, doubt, for the other propositional attitudes.","So our renunciation must extend to all de re belief, and similarly, doubt, for the other propositional attitudes.",1950,2004,113,"so our renunciation must extend to all de re belief, and similarly, doubt, for the other propositional attitudes.","['so', 'our', 'renunciation', 'must', 'extend', 'to', 'all', 'de', 're', 'belief', 'and', 'similarly', 'doubt', 'for', 'the', 'other', 'propositional', 'attitudes']","so -PRON- renunciation must extend to all de re belief , and similarly , doubt , for the other propositional attitude ."
190464,History Of Madness,Foucault,continental,"Only it did not offer itself then as madness but rather as a recognisable type, as the madman. '","Only it did not offer itself then as madness but rather as a recognisable type, as the madman. '",1961,2006,96,"only it did not offer itself then as madness but rather as a recognisable type, as the madman. '","['only', 'it', 'did', 'not', 'offer', 'itself', 'then', 'as', 'madness', 'but', 'rather', 'as', 'recognisable', 'type', 'as', 'the', 'madman']","only -PRON- do not offer -PRON- then as madness but rather as a recognisable type , as the madman . '"
219646,The Phenomenology Of Perception,Merleau-Ponty,phenomenology,A being capable of sense experience (sentir) in the sense of coinciding absolutely with an impression or a quality could have no other mode of knowing.,A being capable of sense experience (sentir) in the sense of coinciding absolutely with an impression or a quality could have no other mode of knowing.,1945,2002,151,a being capable of sense experience (sentir) in the sense of coinciding absolutely with an impression or a quality could have no other mode of knowing.,"['being', 'capable', 'of', 'sense', 'experience', 'sentir', 'in', 'the', 'sense', 'of', 'coinciding', 'absolutely', 'with', 'an', 'impression', 'or', 'quality', 'could', 'have', 'no', 'other', 'mo...",a be capable of sense experience ( sentir ) in the sense of coincide absolutely with an impression or a quality could have no other mode of know .


In [41]:
og_df['author'].value_counts(normalize=True)

Aristotle          0.131354
Plato              0.103314
Hegel              0.061128
Anselm             0.043293
Foucault           0.041039
Heidegger          0.041036
Kant               0.038034
Marx               0.036324
Lewis              0.035330
Malebranche        0.034999
Deleuze            0.033768
Kripke             0.033604
Smith              0.031487
Wittgenstein       0.024327
Locke              0.023926
Hume               0.022383
Merleau-Ponty      0.020444
Quine              0.019854
Nietzsche          0.018349
Derrida            0.016154
Davis              0.015931
Husserl            0.015462
Fichte             0.014294
Russell            0.013661
Leibniz            0.013537
Seneca             0.013515
Popper             0.012597
Lenin              0.012034
Spinoza            0.010214
Moore              0.009877
Keynes             0.009185
Ricardo            0.008321
Beauvoir           0.008246
Berkeley           0.007362
Augustine          0.006891
Marcus Aurelius    0

In [None]:
len(og_df)

369590

In [42]:
# append the new data
new_df = og_df.append(df)
new_df['author'].value_counts(normalize=True)

Aristotle          0.129434
Plato              0.101803
Hegel              0.060234
Anselm             0.042660
Foucault           0.040439
Heidegger          0.040436
Kant               0.037478
Marx               0.035793
Lewis              0.034814
Malebranche        0.034487
Deleuze            0.033275
Kripke             0.033113
Smith              0.031027
Wittgenstein       0.023971
Locke              0.023576
Hume               0.022056
Merleau-Ponty      0.020145
Quine              0.019564
Nietzsche          0.018081
Derrida            0.015918
Davis              0.015698
Husserl            0.015236
Hobbes             0.014623
Fichte             0.014085
Russell            0.013461
Leibniz            0.013339
Seneca             0.013318
Popper             0.012413
Lenin              0.011858
Spinoza            0.010065
Moore              0.009733
Keynes             0.009051
Ricardo            0.008199
Beauvoir           0.008125
Berkeley           0.007255
Augustine          0

In [44]:
new_df[new_df['author']=='Hobbes'].sample(5)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,original_publication_date,corpus_edition_date,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
1153,Leviathan,Hobbes,Hobbes,"(For, the, question, is, not, of, promises, mutual, ,, where, there, is, no, security, of, performance, on, either, side, ,, as, when, there, is, no, civil, power, erected, over, the, parties, pro...","For the question is not of promises mutual, where there is no security of performance on either side, as when there is no civil power erected over the parties promising; for such promises are no c...",1651,1651,425,,"[for, the, question, is, not, of, promises, mutual, where, there, is, no, security, of, performance, on, either, side, as, when, there, is, no, civil, power, erected, over, the, parties, promising...","for the question be not of promise mutual , where there be no security of performance on either side , as when there be no civil power erect over the party promise ; for such promise be no covena..."
2884,Leviathan,Hobbes,Hobbes,"(The, prophet, that, was, sent, to, prophesy, against, the, altar, set, up, by, Jeroboam, ,, though, a, true, prophet, ,, and, that, by, two, miracles, done, in, his, presence, appears, to, be, a,...","The prophet that was sent to prophesy against the altar set up by Jeroboam, though a true prophet, and that by two miracles done in his presence appears to be a prophet sent from God, was yet dece...",1651,1651,296,,"[the, prophet, that, was, sent, to, prophesy, against, the, altar, set, up, by, jeroboam, though, true, prophet, and, that, by, two, miracles, done, in, his, presence, appears, to, be, prophet, se...","the prophet that be send to prophesy against the altar set up by Jeroboam , though a true prophet , and that by two miracle do in -PRON- presence appear to be a prophet send from God , be yet dec..."
1328,Leviathan,Hobbes,Hobbes,"(For, the, laws, of, nature, ,, as, justice, ,, equity, ,, modesty, ,, mercy, ,, and, ,, in, sum, ,, doing, to, others, as, we, would, be, done, to, ,, of, themselves, ,, without, the, terror, of,...","For the laws of nature, as justice, equity, modesty, mercy, and, in sum, doing to others as we would be done to, of themselves, without the terror of some power to cause them to be observed, are c...",1651,1651,287,,"[for, the, laws, of, nature, as, justice, equity, modesty, mercy, and, in, sum, doing, to, others, as, we, would, be, done, to, of, themselves, without, the, terror, of, some, power, to, cause, th...","for the law of nature , as justice , equity , modesty , mercy , and , in sum , do to other as -PRON- would be do to , of -PRON- , without the terror of some power to cause -PRON- to be observe , ..."
2678,Leviathan,Hobbes,Hobbes,"(But, what, is, a, good, law, ?)",But what is a good law?,1651,1651,23,,"[but, what, is, good, law]",but what be a good law ?
954,Leviathan,Hobbes,Hobbes,"(That, the, clergy, ,, and, regulars, ,, in, what, country, soever, ,, shall, be, exempt, from, the, jurisdiction, of, their, king, in, cases, criminal, ?)","That the clergy, and regulars, in what country soever, shall be exempt from the jurisdiction of their king in cases criminal?",1651,1651,125,,"[that, the, clergy, and, regulars, in, what, country, soever, shall, be, exempt, from, the, jurisdiction, of, their, king, in, cases, criminal]","that the clergy , and regular , in what country soever , shall be exempt from the jurisdiction of -PRON- king in case criminal ?"


In [45]:
# export as csv
from google.colab import files
new_df.to_csv('phil_nlp.csv', index=False) 
files.download('phil_nlp.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

###Upload Data to the SQL Server

In [46]:
# prepare to upload to the PostgreSQL database

# note which dataframe you set this to - new_df for the whole dataset, df for 
# just the new text

for_db = new_df
for_db['date'] = for_db['original_publication_date']
for_db['date'] = for_db['date'].apply(lambda x: str(x)[1:]+' BC' if x < 0 else str(x))
for_db['sentence'] = for_db['sentence_str']
for_db['school'] = for_db['school'].apply(lambda x: x.replace('_', ' ').title())
for_db = for_db.drop(['sentence_spacy', 
                      'sentence_length',
                      'sentence_lowered', 
                      'sentence_str', 
                      'tokenized_txt', 
                      'lemmatized_str',
                      'corpus_edition_date',
                      'original_publication_date'], axis=1)
for_db.columns = [i.upper() for i in for_db.columns]

for_db.sample(5)

Unnamed: 0,TITLE,AUTHOR,SCHOOL,DATE,SENTENCE
127519,The Search After Truth,Malebranche,Rationalism,1674,But we must not adorn a phantom without body or reality; we must not excite useless impulses.
76581,Aristotle - Complete Works,Aristotle,Aristotle,320 BC,Now parents know their offspring better than their children
322153,On The Principles Of Political Economy And Taxation,Ricardo,Capitalism,1817,"Let the two nations, between which the commercial treaty is made, be the mother country and her colony, and Adam Smith, it is evident, admits, that a mother country may be benefited by oppressing ..."
287639,Elements Of The Philosophy Of Right,Hegel,German Idealism,1820,"The sun, moon, mountains, rivers, and all natural objects"
29615,Plato - Complete Works,Plato,Plato,350 BC,"And, as for the land rings that separated the rings of sea, they pierced them at the point of the bridges, and thus joined them by water."


In [47]:
len(for_db)

376865

In [49]:
#importing sql library 
from sqlalchemy import create_engine 
  
# create a reference  
# for sql library 
engine = create_engine('postgresql://nrolc7ed7joo',
                       echo=False)
  
# attach the data frame to the sql server 
for_db.to_sql('phil_nlp',          
              con = engine,
              if_exists='replace',
              index=False,
              method='multi') 
  
# show the completed data as a test
print(engine.execute("""SELECT * FROM phil_nlp WHERE "AUTHOR" = 'Hobbes'""").fetchone()) 

  """)


('Leviathan', 'Hobbes', 'Hobbes', '1651', 'All which qualities called sensible are in the object that causeth them')


Remember to add to the clipping and other elements to the notebook that creates the database as a whole. Then you're done!

In [50]:
print(engine.execute("""SELECT * FROM phil_nlp where "AUTHOR" = 'Hobbes'""").fetchone()) 

('Leviathan', 'Hobbes', 'Hobbes', '1651', 'All which qualities called sensible are in the object that causeth them')
