<a href="https://colab.research.google.com/github/kcalizadeh/PDP_data_processing/blob/master/new_text_introduction_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports and Mounting Drive

In [1]:
# this cell mounts drive, sets the correct directory, then imports all functions
# and relevant libraries via the functions.py file
from google.colab import drive
import sys

drive.mount('/gdrive',force_remount=True)

drive_path = '/gdrive/MyDrive/Colab_Projects/philosophy_data_project'

sys.path.append(drive_path)

Mounted at /gdrive


In [2]:
from import_functions import *

In [3]:
import spacy.cli
spacy.cli.download("en_core_web_lg")
import en_core_web_lg
nlp = en_core_web_lg.load()

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


###Load the Text and Clip Front and End Matter

In [50]:
# if you are deleting an old text that was added here, make sure it is in the 
# primary database construction notebook
anselm_proslogion = get_text(drive_path + '/phil_txts/anselm_proslogion.txt')
anselm_de_veritate = get_text(drive_path + '/phil_txts/anselm_de_veritate.txt')

In [51]:
anselm_proslogion = anselm_proslogion.split('PREFACE 1')[1].split('John 16:24.   Isaiah 9:6.')[0][:-75]
anselm_de_veritate = anselm_de_veritate.split('Preface [to the Three Dialogues')[1][45:]

### Clean the Text



In [103]:
def baseline_clean(to_correct, 
                   capitals=True, 
                   bracketed_fn=False, 
                   odd_words_dict={}):
  # remove utf8 encoding characters and some punctuations
  result = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff\xad\x0c6§\\\£\Â*_<>""⎫•{}Γ~]', ' ', to_correct)
  result = re.sub(r'[\u2014\u2013\u2012-]', ' ', result)

  # replace whitespace characters with actual whitespace
  result = re.sub(r'\s', ' ', result)

  # replace odd quotation marks with a standard
  result = re.sub(r'[‘’“”]', "'", result)

  # replace the ﬀ, ﬃ and ﬁ with the appropriate counterparts
  result = re.sub(r'ﬀ', 'ff', result)
  result = re.sub(r'ﬁ', 'fi', result)
  result = re.sub(r'ﬃ', 'ffi', result)

  # replace some accented characters for ease of searching
  result = re.sub(r'é', 'e', result)

  # remove or standardize some recurring common and meaninless words/phrases
  result = re.sub(r'\s*This\s*page\s*intentionally\s*left\s*blank\s*', ' ', result)
  result = re.sub(r'(?i)Aufgabe\s+', ' ', result)
  result = re.sub(r',*\s+cf\.', ' ', result)

  # some texts have footnotes conveniently in brackets - this removes them all, 
  # with a safety measure for unpaired brackets, and deletes all brackets afterwards
  if bracketed_fn:
    result = re.sub(r'\[.{0,300}\]|{.{0,300}}|{.{0,300}\]|\[.{0,300}}', ' ', result)
  result = re.sub(r'[\[\]{}]', ' ', result)

  # unify some abbreviations
  result = re.sub(r'&', 'and', result)
  result = re.sub(r'\se\.g\.\s', ' eg ', result)
  result = re.sub(r'\si\.e\.\s', ' ie ', result)
  result = re.sub('coroll\.', 'coroll', result)
  result = re.sub('pt\.', 'pt', result)

  # remove roman numerals, first capitalized ones
  result = re.sub(r'\s((I{2,}V*X*\.*)|(IV\.*)|(IX\.*)|(V\.*)|(V+I*\.*)|(X+L*V*I*]\.*))\s', ' ', result)
  # then lowercase
  result = re.sub(r'\s((i{2,}v*x*\.*)|(iv\.*)|(ix\.*)|(v\.*)|(v+i*\.*)|(x+l*v*i*\.*))\s', ' ', result)

  # remove periods and commas flanked by numbers
  result = re.sub(r'\d\.\d', ' ', result)
  result = re.sub(r'\d,\d', ' ', result)

  # remove the number-letter-number pattern used for many citations
  result = re.sub(r'\d*\w{,2}\d', ' ', result)

  # remove numerical characters
  result = re.sub(r'\d+', ' ', result)

  # remove words of 2+ characters that are entirely capitalized 
  # (these are almost always titles, headings, or speakers in a dialogue)
  # remove capital I's that follow capital words - these almost always roman numerals
  # some texts do use these capitalizations meaningfully, so we make this optional
  if capitals:
    result = re.sub(r'[A-Z]{2,}\s+I', ' ', result)
    result = re.sub(r'[A-Z]{2,}', ' ', result)

  # remove isolated colons and semicolons that result from removal of titles
  result = re.sub(r'\s+:\s*', ' ', result)
  result = re.sub(r'\s+;\s*', ' ', result)

  # remove isolated letters (do it several times because strings of isolated letters do not get captured properly)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)

  # remove isolated letters at the end of sentences or before commas
  result = re.sub(r'\s[^aI]\.', '.', result)
  result = re.sub(r'\s[^aI],', ',', result)

  # deal with spaces around periods and commas
  result = re.sub(r'\s+,\s+', ', ', result)
  result = re.sub(r'\s+\.\s+', '. ', result)

  # remove empty parantheses
  result = re.sub(r'(\(\s*\.*\s*\))|(\(\s*,*\s*)\)', ' ', result)
  result = re.sub(r'\.\)\.', '.', result)
  result = re.sub(r'\.\(\.', '.', result)

  # reduce multiple periods, commas, or whitespaces into a single one
  result = re.sub(r'\.+', '.', result)
  result = re.sub(r',+', ',', result)
  result = re.sub(r'\s+', ' ', result)

  # deal with isolated problem cases discovered in the data:
  for key in odd_words_dict.keys():
    result = re.sub(r''+key+'', odd_words_dict[key], result)

  return result

In [104]:
# note extras like bracketed footnotes or specific words to remove

anselm_de_veritate_to_rm = ['On Truth', 'rectitudo', 'S\.', 'T\.']
anselm_proslogion_to_rm = ['Proslogion', 'Preface', '&']


In [105]:
# build a dictionary for the book
anselm_proslogion_dict = {
    'author': 'Anselm',
    'title': 'Proslogion',
    'text': anselm_proslogion,
    'school': 'Scholasticism',
    'words to remove': anselm_proslogion_to_rm,
    'remove capitals': True,
    'bracketed fn': False,
    'original date': 1077,
    'corpus date': 2000
}

anselm_de_veritate_dict = {
    'author': 'Anselm',
    'title': 'De Veritate',
    'text': anselm_de_veritate,
    'school': 'Scholasticism',
    'words to remove': [],
    'remove capitals': True,
    'bracketed fn': False,
    'original date': 1086,
    'corpus date': 2000
}

In [136]:
#@title Oddities Dictionary for Cleaning
# a dictionary of oddities to clean up
odd_words_dict = {'\sderstanding': 'derstanding',
                  '\sditference\s': ' difference ',
                  '\sforthe\s': ' for the ',
                  '\sject': 'ject',
                  '\sSure ly\s': ' Surely ',
                  '\spiness': 'piness',
                  '\sjects': 'jects', 
                  '\sness': 'ness',
                  '\schil dren\s': ' children ',
                  '\sper\scent\s': ' percent ',
                  '\sper\scent\.': ' percent.',
                  '\sper\scent,': ' percent,',
                  '\wi\son': 'ion',
                  '\spri\sori': ' priori',
                  '\stences\s': 'tences ',
                  '\sprincipleb': ' principle',
                  '\ssciousness': 'sciousness',
                  '\stion': 'tion',
                  '\spri\s': ' pri',
                  '\scluding': 'cluding',
                  '\sdom': 'dom',
                  '\sers': 'ers',
                  '\scritiq\s': ' critique ',
                  '\ssensati\s': ' sensation ',
                  '(?i)\syou\sll': " you'll",
                  '\sI\sll': " I'll",
                  '(?i)\swe\sll': " we'll",
                  '(?i)he\sll': " he'll",
                  '(?i)who\sll': "who'll",
                  '(?i)\sthere\sll\s': " there'll ",
                  '\seduca\s': ' education ',
                  '\slity\s': 'lity ',
                  '\smultaneously\s': 'multaneously ',
                  '\stically\s': 'tically ',
                  '\sDa\ssein\s': ' Dasein ',
                  '(?i)\sthey\sll\s': " they'll ",
                  '(?i)\sin\tum\s': ' in turn ',
                  '\scon~\s': ' con',
                  '\sà\s': ' a ',
                  '\sjor\s': ' for ',
                  '\sluminating\s': 'luminating ',
                  '\sselj\s': ' self ',
                  '\stial\s': 'tial ',
                  '\sversal\s': 'versal ',
                  '\sexis\st': ' exist',
                  '\splauded\s': 'plauded ',
                  '\suiry\s': 'uiry ',
                  '\svithin\s': ' within ',
                  '\soj\s': ' of ',
                  '\sposi\st': ' posit',
                  '\sra\sther\s': ' rather ',
                  '(?i)\sthat\sll\s': " that'll ",
                  '(?i)\sa\sll\s': ' all ',
                  '\so\sther\s': ' other ',
                  '\sra\sther\s': ' rather ',
                  '\snei\sther\s': ' neither ',
                  '\sei\sther\s': ' either ',
                  '\sfur\sther\s': ' further ',
                  '\sano\sther': ' another ',
                  '\sneces\s': ' neces',
                  'u\slar\s': 'ular ',
                  '\sference\s': 'ference ',
                  '(?i)it\sll\s': "it'll ",
                  '\stoge\sther': ' together ',
                  '\sknowledgeb\s': ' knowledge ',
                  'r\stain\s': 'rtain ',
                  'on\stain\s': 'ontain',
                  '(?i)j\sect\s': 'ject',
                  '\sob\sect\s': ' object ',
                  '\sbtle\s': 'btle ',
                  '\snition\s': 'nition ',
                  '\sdering\s': 'dering ', 
                  '\sized\s': 'ized ',
                  '\sther\shand': ' other hand',
                  '\ture\s': 'ture ',
                  '\sabso\sl': ' absol',
                  '\stly\s': 'tly ',
                  '\serty\s': 'erty ',
                  '\sobj\se': ' obj',
                  '\sffiir\s': ' for ',
                  '\sndeed\s': ' indeed ',
                  '\sfonn\s': ' form ',
                  '\snally\s': 'nally ',
                  'ain\sty\s': 'ainty ',
                  'ici\sty\s': 'icity ',
                  '\scog\sni': ' cogni',
                  '\sacc\s': ' acc',
                  '\sindi\svid\sual': ' individual', 
                  '\sintu\sit': ' intuit',
                  'r\sance\s': 'rance ',
                  '\ssions\s': 'sions ',
                  '\sances\s': 'ances ',
                  '\sper\sception\s': ' perception ',
                  '\sse\sries\s': ' series ',
                  '\sque\sries\s': ' queries ',
                  '\sessary\s': 'essary ',
                  '\sofa\s': ' of a ',
                  '\scer\stainty\s': ' certainty ',
                  'ec\stivity\s': 'ectivity ',
                  '\stivity\s': 'tivity ',
                  '\slation\s': 'lation ',
                  '\sir\sr': ' irr',
                  '\ssub\sstance\s': ' substance ',
                  'sec\sond\s': 'second ',
                  '\s\.rv': '',
                  '\story\s': 'tory ',
                  '\sture\s': 'ture ',
                  '\sminate\s': 'minate ',
                  '\sing\s': 'ing ',
                  '\splicity\s': 'plicity ',
                  '\ssimi\slar\s': ' similar ',
                  '\scom\smunity\s': ' community ',
                  '\sitselfa\s': ' itself a ',
                  '\ssimp\s': ' simply ',
                  '\scon\stex': ' contex',
                  '\scon\sseq': ' conseq',
                  '\scon\stai': ' contai',
                  '\sofwhat\s': ' of what ',
                  '\sui\s': 'ui',
                  '\sofan\s': ' of an ',
                  '\saccor\sdance\s': ' accordance ',
                  '\stranscen\sdental\s': ' transcendental ',
                  '\sap\spearances\s': ' appearances ',
                  'e\squences\s': 'equences ',
                  '\sorits\s': ' or its ',
                  '\simma\sn': ' imman',
                  '\seq\sua': ' equa',
                  '\simpl\sied\s': ' implied ',
                  '\sbuta\s': ' but a ',
                  '\sa\snd\s': ' and ',
                  '\sence\s': 'ence ',
                  '\stain\s': 'tain ',
                  '\sunder\sstanding\s': ' understanding ',
                  'i\sence\s': 'ience ',
                  'r\sence\s': 'rence ',
                  '\stical\s': 'tical ',
                  '\sobjectsb\s': ' objects ',
                  '\stbe\s': ' the ',
                  '\smul\st': ' mult',
                  '\sgen\seral\s': ' general ',
                  '\suniver\ssal\s': ' universal ',
                  '\scon\stent\s': ' content ',
                  '\spar\sticular\s': ' particular ',
                  'ver\ssity\s': 'versity ',
                  '\sCritiq\s': ' Critique ',
                  '\sphilo\ssophy\s': ' philosophy ',
                  '\seq\s': ' eq'}

In [137]:
# a function that takes the dictionary and returns a dataframe of sentences
def from_raw_to_df(text_dict):
  nlp.max_length = 9000000
  text = text_dict['text']
  text = remove_words(text, text_dict['words to remove'])
  text = baseline_clean(text, capitals=text_dict['remove capitals'],
                        bracketed_fn=text_dict['bracketed fn'],
                        odd_words_dict=odd_words_dict)
  text_nlp = nlp(text, disable=['ner'])
  text_df = pd.DataFrame(columns=['title', 'author', 'school', 'sentence_spacy'])
  text_df['sentence_spacy'] = list(text_nlp.sents)
  text_df['author'] = text_dict['author']
  text_df['title'] = text_dict['title']
  text_df['school'] = text_dict['school']
  text_df['original_publication_date'] = text_dict['original date']
  text_df['corpus_edition_date'] = text_dict['corpus date']
  text_df['sentence_str'] = text_df['sentence_spacy'].apply(lambda x: ''.join(list(str(x))))
  return text_df

In [138]:
# use the function
proslogion_df = from_raw_to_df(anselm_proslogion_dict)
de_veritate_df = from_raw_to_df(anselm_de_veritate_dict)

df = proslogion_df.append(de_veritate_df, ignore_index=True)

In [139]:
# checking the result
pd.options.display.max_colwidth = 200
df.sample(10)

Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str
396,Proslogion,Anselm,Scholasticism,"(My, soul, hoped, for, fullness, ;, and, ,, lo, ,, once, again, it, is, overwhelmed, with, need, .)",1077,2000,"My soul hoped for fullness; and, lo, once again it is overwhelmed with need."
664,De Veritate,Anselm,Scholasticism,"(No, doubt, about, it, .)",1086,2000,No doubt about it.
595,De Veritate,Anselm,Scholasticism,"(For, if, God, had, given, him, perseverance, ,, he, would, have, had, it, just, as, the, good, angels, had, it, because, God, gave, it, to, them, .)",1086,2000,"For if God had given him perseverance, he would have had it just as the good angels had it because God gave it to them."
210,Proslogion,Anselm,Scholasticism,"(But, if, perceiving, is, only, knowing, or, only, for, the, sake, of, knowing, (, for, anyone, who, perceives, knows, in, accordance, with, the, characteristic, capabilities, of, the, respective,...",1077,2000,But if perceiving is only knowing or only for the sake of knowing (for anyone who perceives knows in accordance with the characteristic capabilities of the respective senses.g.
235,Proslogion,Anselm,Scholasticism,"(How, He, who, is, completely, and, supremely, just, spares, those, who, are, evil, .)",1077,2000,How He who is completely and supremely just spares those who are evil.
366,Proslogion,Anselm,Scholasticism,"(For, since, there, can, be, thought, to, exist, something, of, this, kind, ,, if, You, were, not, this, Being, then, something, greater, than, You, could, be, thought, a, consequence, which, is, ...",1077,2000,"For since there can be thought to exist something of this kind, if You were not this Being then something greater than You could be thought a consequence which is impossible."
128,Proslogion,Anselm,Scholasticism,"(Permit, me, ,, at, least, from, afar, or, from, the, deep, ,, to, look, upwards, toward, Your, light, .)",1077,2000,"Permit me, at least from afar or from the deep, to look upwards toward Your light."
1028,De Veritate,Anselm,Scholasticism,"(See, whether, something, in, this, definition, ought, perhaps, to, be, amended, .)",1086,2000,See whether something in this definition ought perhaps to be amended.
758,De Veritate,Anselm,Scholasticism,"(So, unless, I, am, wrong, ,, we, can, also, number, among, right, actions)",1086,2000,"So unless I am wrong, we can also number among right actions"
1105,De Veritate,Anselm,Scholasticism,"(Clearly, not, .)",1086,2000,Clearly not.


In [140]:
len(df)

1141

#### Remove Short Sentences

In [141]:
df['sentence_length'] = df['sentence_str'].map(lambda x: len(x))
num_of_short_entries = len(df[df['sentence_length'] < 20])
print(f"there are {num_of_short_entries} so-called sentences with fewer than 20 characters")
df[df['sentence_length'] < 20].sample(5)

there are 120 so-called sentences with fewer than 20 characters


Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length
815,De Veritate,Anselm,Scholasticism,"(No, .)",1086,2000,No.,3
355,Proslogion,Anselm,Scholasticism,"(Why, is, this, ?)",1077,2000,Why is this?,12
1105,De Veritate,Anselm,Scholasticism,"(Clearly, not, .)",1086,2000,Clearly not.,12
525,Proslogion,Anselm,Scholasticism,"(Psalms, .)",1077,2000,Psalms.,7
140,Proslogion,Anselm,Scholasticism,"(Psalms, .)",1077,2000,Psalms.,7


In [142]:
df = df.drop(df[df['sentence_length'] < 20].index)
len(df)

1021

#### Remove Cases of Self-Mention

In [143]:
# change the author name in this cell 

self_mentions = df[df['sentence_str'].str.contains('\s'+'Anselm'.lower())]
print(len(self_mentions))
self_mentions

0


Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length


In [144]:
df = df.drop(df[df['sentence_str'].str.contains('\s'+'Augustine'.lower())].index)

len(df)

1021

#### Deal with Duplicates

In [145]:
# find the total number of duplicates
len(df['sentence_str'])-len(df['sentence_str'].drop_duplicates())

36

In [146]:
doubles_df = pd.concat(g for _, g in df.groupby("sentence_str") if len(g) > 1)
doubles_df

Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length
47,Proslogion,Anselm,Scholasticism,"(A, conjecture, about, what, kind, of, good, this, is, and, about, how, great, it, is, .)",1077,2000,A conjecture about what kind of good this is and about how great it is.,71
490,Proslogion,Anselm,Scholasticism,"(A, conjecture, about, what, kind, of, good, this, is, and, about, how, great, it, is, .)",1077,2000,A conjecture about what kind of good this is and about how great it is.,71
25,Proslogion,Anselm,Scholasticism,"(Alone, existing, through, Himself, ,, He, makes, all, other, things, from, nothing, .)",1077,2000,"Alone existing through Himself, He makes all other things from nothing.",71
198,Proslogion,Anselm,Scholasticism,"(Alone, existing, through, Himself, ,, He, makes, all, other, things, from, nothing, .)",1077,2000,"Alone existing through Himself, He makes all other things from nothing.",71
19,Proslogion,Anselm,Scholasticism,"(Arousal, of, the, mind, for, contemplating, God, .)",1077,2000,Arousal of the mind for contemplating God.,42
...,...,...,...,...,...,...,...,...
460,Proslogion,Anselm,Scholasticism,"(Whether, this, eternity, is, one, aeon, or, more, than, one, .)",1077,2000,Whether this eternity is one aeon or more than one.,51
49,Proslogion,Anselm,Scholasticism,"(Whether, this, is, the, full, joy, which, the, Lord, promises, .)",1077,2000,Whether this is the full joy which the Lord promises.,53
559,Proslogion,Anselm,Scholasticism,"(Whether, this, is, the, full, joy, which, the, Lord, promises, .)",1077,2000,Whether this is the full joy which the Lord promises.,53
46,Proslogion,Anselm,Scholasticism,"(necessary, Being, which, is, every, good, ,, complete, good, ,, and, the, only, good, .)",1077,2000,"necessary Being which is every good, complete good, and the only good.",70


In [147]:
df = df.drop(df[df['sentence_str'].duplicated(keep='first')].index)

In [148]:
len(df)

985

#### Check for Foreign Languages

In [149]:
# checking for 'der', a common article in German
len((df[df['sentence_str'].str.contains('\sder\s')]))

0

In [150]:
# checking for 'il', a common article in French
len(df[df['sentence_str'].str.contains('\sil\s')])

0

#### Some Ad Hoc Cleaning

In [151]:
# miscellaneous nonsense sentences
df = df.drop(df[df['sentence_str'].str.contains('\spp\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\stotam\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\srree\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sflir\s')].index)
df = df.drop(df[(df['sentence_str'].str.contains('\smodis\s')) & (df['author'] != 'Kant')].index)

len(df)

985

In [152]:
# markers of french and notes
df = df.drop(df[df['sentence_str'].str.contains('\schapitre')].index)
df = df.drop(df[df['sentence_str'].str.contains('\salisme')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sHahn')].index)

len(df)

985

In [153]:
# some notes in Kant
df = df.drop(df[df['sentence_str'].str.contains('\sVorl\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sberschwenglich')].index)

len(df)

985

In [154]:
# a common phrase in Plato / Aristotle footnotes
df = df.drop(df[(df['author']=='Plato') & (df['sentence_str'].str.contains('(?i)reading')) & (df['sentence_length'] < 40)].index)
df = df.drop(df[(df['author']=='Aristotle') & (df['sentence_str'].str.contains('(?i)reading')) & (df['sentence_length'] < 40)].index)

len(df)

985

In [155]:
# mentions of Aristotle in Plato
df = df.drop(df[(df['author']=='Plato') & df['sentence_str'].str.contains('Aristotle')].index)

len(df)

985

### Lemmatize and Tokenize

In [156]:
from gensim.utils import simple_preprocess

# use gensim to tokenize sentences
df['tokenized_txt'] = df['sentence_str'].map(lambda x: simple_preprocess(x.lower(),deacc=True,
                                                        max_len=200))

# use spacey to get intelligent lemmatization
def lemmatize_sentence(sentence):
  lemmatized_txt = ''
  for word in sentence:
    lemmatized_txt += ' ' + str(word.lemma_)
  return lemmatized_txt

In [157]:
df['lemmatized_str'] = df['sentence_spacy'].apply(lemmatize_sentence)

In [158]:
df.sample(5)

Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length,tokenized_txt,lemmatized_str
338,Proslogion,Anselm,Scholasticism,"(And, you, have, found, that, this, Being, is, life, itself, ,, light, ,, wisdom, ,, goodness, ,, eter, nal, blessedness, ,, and, blessed, eternity, and, that, this, Being, exists, everywhere, and...",1077,2000,"And you have found that this Being is life itself, light, wisdom, goodness, eter nal blessedness, and blessed eternity and that this Being exists everywhere and always.",168,"[and, you, have, found, that, this, being, is, life, itself, light, wisdom, goodness, eter, nal, blessedness, and, blessed, eternity, and, that, this, being, exists, everywhere, and, always]","and -PRON- have find that this being be life -PRON- , light , wisdom , goodness , eter nal blessedness , and bless eternity and that this be exist everywhere and always ."
1135,De Veritate,Anselm,Scholasticism,"(For, truth, does, not, have, its, being, in, or, from, or, through, the, things, in, which, it, is, said, to, be, .)",1086,2000,For truth does not have its being in or from or through the things in which it is said to be.,93,"[for, truth, does, not, have, its, being, in, or, from, or, through, the, things, in, which, it, is, said, to, be]",for truth do not have -PRON- being in or from or through the thing in which -PRON- be say to be .
534,Proslogion,Anselm,Scholasticism,"(For, just, as, God, is, able, to, do, through, Himself, that, which, He, wills, ,, so, they, shall, be, able, to, do, through, Him, that, which, they, shall, will, .)",1077,2000,"For just as God is able to do through Himself that which He wills, so they shall be able to do through Him that which they shall will.",134,"[for, just, as, god, is, able, to, do, through, himself, that, which, he, wills, so, they, shall, be, able, to, do, through, him, that, which, they, shall, will]","for just as God be able to do through -PRON- that which -PRON- will , so -PRON- shall be able to do through -PRON- that which -PRON- shall will ."
81,Proslogion,Anselm,Scholasticism,"(Alas, ,, what, he, lost, and, what, he, found, ,, what, vanished, and, what, remained, !)",1077,2000,"Alas, what he lost and what he found, what vanished and what remained!",70,"[alas, what, he, lost, and, what, he, found, what, vanished, and, what, remained]","alas , what -PRON- lose and what -PRON- find , what vanish and what remain !"
165,Proslogion,Anselm,Scholasticism,"(But, surely, that, than, which, a, greater, can, not, be, thought, can, not, be, only, in, the, understanding, .)",1077,2000,But surely that than which a greater cannot be thought cannot be only in the understanding.,91,"[but, surely, that, than, which, greater, cannot, be, thought, cannot, be, only, in, the, understanding]",but surely that than which a great can not be think can not be only in the understanding .


### Combine with the Old Dataframe & Export to CSV

In [159]:
# load the old version and check it out
og_df = pd.read_csv('/gdrive/MyDrive/Colab_Projects/philosophy_data_project/philosophy_data.csv')
og_df.sample(5)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,original_publication_date,corpus_edition_date,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
107115,Ethics,Spinoza,rationalism,"It is impossible that there should be in the universe two substances with an identical attribute, ie which have anything common to them both","It is impossible that there should be in the universe two substances with an identical attribute, ie which have anything common to them both",1677,2003,140,"it is impossible that there should be in the universe two substances with an identical attribute, ie which have anything common to them both","['it', 'is', 'impossible', 'that', 'there', 'should', 'be', 'in', 'the', 'universe', 'two', 'substances', 'with', 'an', 'identical', 'attribute', 'ie', 'which', 'have', 'anything', 'common', 'to',...","-PRON- be impossible that there should be in the universe two substance with an identical attribute , ie which have anything common to -PRON- both"
45566,Aristotle - Complete Works,Aristotle,aristotle,It is clear then that chance is an accidental cause in the sphere of those actions for the sake of something which involve choice.,It is clear then that chance is an accidental cause in the sphere of those actions for the sake of something which involve choice.,-320,1991,130,it is clear then that chance is an accidental cause in the sphere of those actions for the sake of something which involve choice.,"['it', 'is', 'clear', 'then', 'that', 'chance', 'is', 'an', 'accidental', 'cause', 'in', 'the', 'sphere', 'of', 'those', 'actions', 'for', 'the', 'sake', 'of', 'something', 'which', 'involve', 'ch...",-PRON- be clear then that chance be an accidental cause in the sphere of those action for the sake of something which involve choice .
98023,A Treatise Of Human Nature,Hume,empiricism,"For as like causes always produce like effects, when in any instance we find our expectation to be disappointed, we must conclude that this irregularity proceeds from some difference in the causes.","For as like causes always produce like effects, when in any instance we find our expectation to be disappointed, we must conclude that this irregularity proceeds from some difference in the causes.",1739,2003,197,"for as like causes always produce like effects, when in any instance we find our expectation to be disappointed, we must conclude that this irregularity proceeds from some difference in the causes.","['for', 'as', 'like', 'causes', 'always', 'produce', 'like', 'effects', 'when', 'in', 'any', 'instance', 'we', 'find', 'our', 'expectation', 'to', 'be', 'disappointed', 'we', 'must', 'conclude', '...","for as like cause always produce like effect , when in any instance -PRON- find -PRON- expectation to be disappoint , -PRON- must conclude that this irregularity proceed from some difference in t..."
282011,The Phenomenology Of Spirit,Hegel,german_idealism,"In general, she maintains that itis the power of youth that really counts; the worth of the son lies in his being the lord and master of the mother who bore him, that ofthehrotherasheingone in who...","In general, she maintains that itis the power of youth that really counts; the worth of the son lies in his being the lord and master of the mother who bore him, that ofthehrotherasheingone in who...",1807,1977,386,"in general, she maintains that itis the power of youth that really counts; the worth of the son lies in his being the lord and master of the mother who bore him, that ofthehrotherasheingone in who...","['in', 'general', 'she', 'maintains', 'that', 'itis', 'the', 'power', 'of', 'youth', 'that', 'really', 'counts', 'the', 'worth', 'of', 'the', 'son', 'lies', 'in', 'his', 'being', 'the', 'lord', 'a...","in general , -PRON- maintain that itis the power of youth that really count ; the worth of the son lie in -PRON- be the lord and master of the mother who bear -PRON- , that ofthehrotherasheingone..."
118366,The Search After Truth,Malebranche,rationalism,an: indifferent to it.,an: indifferent to it.,1674,1997,22,an: indifferent to it.,"['an', 'indifferent', 'to', 'it']",an : indifferent to -PRON- .


In [160]:
og_df['author'].value_counts(normalize=True)

Aristotle          0.133687
Plato              0.105148
Hegel              0.062213
Foucault           0.041768
Heidegger          0.041765
Kant               0.038720
Nietzsche          0.037130
Marx               0.036969
Lewis              0.035957
Beauvoir           0.035675
Malebranche        0.035620
Deleuze            0.034368
Kripke             0.034201
Smith              0.032047
Wittgenstein       0.024759
Locke              0.024351
Hume               0.022780
Merleau-Ponty      0.020807
Quine              0.020207
Derrida            0.016441
Husserl            0.015737
Fichte             0.014547
Russell            0.013903
Leibniz            0.013777
Popper             0.012821
Lenin              0.012248
Augustine          0.011149
Spinoza            0.010395
Moore              0.010053
Keynes             0.009348
Ricardo            0.008469
Davis              0.008384
Berkeley           0.007493
Wollstonecraft     0.007013
Marcus Aurelius    0.006062
Descartes          0

In [161]:
# append the new data
new_df = og_df.append(df)
new_df['author'].value_counts(normalize=True)

Aristotle          0.133327
Plato              0.104865
Hegel              0.062045
Foucault           0.041655
Heidegger          0.041652
Kant               0.038616
Nietzsche          0.037030
Marx               0.036869
Lewis              0.035861
Beauvoir           0.035579
Malebranche        0.035524
Deleuze            0.034275
Kripke             0.034109
Smith              0.031960
Wittgenstein       0.024692
Locke              0.024285
Hume               0.022719
Merleau-Ponty      0.020751
Quine              0.020152
Derrida            0.016397
Husserl            0.015694
Fichte             0.014508
Russell            0.013866
Leibniz            0.013740
Popper             0.012786
Lenin              0.012215
Augustine          0.011119
Spinoza            0.010367
Moore              0.010026
Keynes             0.009323
Ricardo            0.008446
Davis              0.008361
Berkeley           0.007473
Wollstonecraft     0.006994
Marcus Aurelius    0.006046
Descartes          0

In [162]:
new_df[new_df['author']=='Anselm'].sample(5)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,original_publication_date,corpus_edition_date,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
941,De Veritate,Anselm,Scholasticism,"(Tell, me, ,, then, ,, whether, you, think, that, there, is, still, another, rightness, in, addition, to, those, rightnesses, we, have, examined, .)","Tell me, then, whether you think that there is still another rightness in addition to those rightnesses we have examined.",1086,2000,121,,"[tell, me, then, whether, you, think, that, there, is, still, another, rightness, in, addition, to, those, rightnesses, we, have, examined]","tell -PRON- , then , whether -PRON- think that there be still another rightness in addition to those rightness -PRON- have examine ."
802,De Veritate,Anselm,Scholasticism,"(Your, answer, has, satisfied, me, .)",Your answer has satisfied me.,1086,2000,29,,"[your, answer, has, satisfied, me]",-PRON- answer have satisfy -PRON- .
648,De Veritate,Anselm,Scholasticism,"(I, do, not, think, it, is, .)",I do not think it is.,1086,2000,21,,"[do, not, think, it, is]",-PRON- do not think -PRON- be .
266,Proslogion,Anselm,Scholasticism,"(Those, who, are, just, You, save, through, the, aid, of, their, merits, ;, those, who, are, evil)",Those who are just You save through the aid of their merits; those who are evil,1077,2000,79,,"[those, who, are, just, you, save, through, the, aid, of, their, merits, those, who, are, evil]",those who be just -PRON- save through the aid of -PRON- merit ; those who be evil
1,Proslogion,Anselm,Scholasticism,"(Afterwards, ,, considering, this, work, to, be, composed, of, a, chain, of, many, arguments, ,, I, began, to, ask, myself, whether, perhaps, a, single, consideration, could, be, found, which, wou...","Afterwards, considering this work to be composed of a chain of many arguments, I began to ask myself whether perhaps a single consideration could be found which would require nothing other than it...",1077,2000,277,,"[afterwards, considering, this, work, to, be, composed, of, chain, of, many, arguments, began, to, ask, myself, whether, perhaps, single, consideration, could, be, found, which, would, require, no...","afterwards , consider this work to be compose of a chain of many argument , -PRON- begin to ask -PRON- whether perhaps a single consideration could be find which would require nothing other than ..."


In [163]:
# export as csv
from google.colab import files
new_df.to_csv('phil_nlp.csv', index=False) 
files.download('phil_nlp.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

###Upload Data to the SQL Server

In [166]:
# prepare to upload to the PostgreSQL database

# note which dataframe you set this to - new_df for the whole dataset, df for 
# just the new text

for_db = new_df
for_db['date'] = for_db['original_publication_date']
for_db['date'] = for_db['date'].apply(lambda x: str(x)[1:]+' BC' if x < 0 else str(x))
for_db['sentence'] = for_db['sentence_str']
for_db['school'] = for_db['school'].apply(lambda x: x.replace('_', ' ').title())
for_db = for_db.drop(['sentence_spacy', 
                      'sentence_length',
                      'sentence_lowered', 
                      'sentence_str', 
                      'tokenized_txt', 
                      'lemmatized_str',
                      'corpus_edition_date',
                      'original_publication_date'], axis=1)
for_db.columns = [i.upper() for i in for_db.columns]

for_db.sample(5)

Unnamed: 0,TITLE,AUTHOR,SCHOOL,DATE,SENTENCE
7294,Plato - Complete Works,Plato,Plato,350 BC,"The rest of it, visitor, seems to have been said in due measure; but that ideal rule may exist even without laws was something harder for a hearer to accept."
72796,Aristotle - Complete Works,Aristotle,Aristotle,320 BC,"Yet all the parts must exist only potentially, when they are one and continuous by nature, not by force or even by growing together, for"
57750,Aristotle - Complete Works,Aristotle,Aristotle,320 BC,"Some assert that ring doves and turtle doves pair and procreate when only three months old, and instance their superabundant numbers by way of proof of the assertion."
174618,Philosophical Troubles,Kripke,Analytic,1975,What premise did we use?
83429,Aristotle - Complete Works,Aristotle,Aristotle,320 BC,"Athletic excellence of the body consists in size and strength; for the swift man is strong he who can fling forward his legs in a certain way, and move them fast and far, is good at running; he wh..."


In [177]:
len(for_db)

365861

In [175]:
#importing sql library 
from sqlalchemy import create_engine 
  
# create a reference  
# for sql library 
engine = create_engine('postgo',
                       echo=False)
  
# attach the data frame to the sql server 
for_db.to_sql('phil_nlp', 
               con = engine,
              if_exists='replace',
              index=False,
              method='multi') 
  
# show the completed data as a test
print(engine.execute("""SELECT * FROM phil_nlp WHERE "AUTHOR" = 'Anselm'""").fetchone()) 

('Proslogion', 'Anselm', 'Scholasticism', '1077', 'How is it, then, Lord, that You are all these things?')


Remember to add to the clipping and other elements to the notebook that creates the database as a whole. Then you're done!

In [176]:
print(engine.execute("""SELECT * FROM phil_nlp where "AUTHOR" = 'Anselm'""").fetchone()) 

('Proslogion', 'Anselm', 'Scholasticism', '1077', 'How is it, then, Lord, that You are all these things?')
