<a href="https://colab.research.google.com/github/kcalizadeh/PDP_data_processing/blob/master/new_text_introduction_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports and Mounting Drive

In [1]:
# this cell mounts drive, sets the correct directory, then imports all functions
# and relevant libraries via the functions.py file
from google.colab import drive
import sys

drive.mount('/gdrive',force_remount=True)

drive_path = '/gdrive/MyDrive/Colab_Projects/philosophy_data_project'

sys.path.append(drive_path)

Mounted at /gdrive


In [2]:
from import_functions import *

In [3]:
import spacy.cli
spacy.cli.download("en_core_web_lg")
import en_core_web_lg
nlp = en_core_web_lg.load()

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


###Load the Text and Clip Front and End Matter

In [4]:
# if you are deleting an old text that was added here, make sure it is in the 
# primary database construction notebook
seneca_all_texts = get_guten('http://www.gutenberg.org/files/56075/56075-0.txt')

In [30]:
seneca_on_benefits = seneca_all_texts.split('SENECA OF BENEFITS')[1][54:].split('SENECA OF ANGER')[0]
seneca_on_anger = seneca_all_texts.split('SENECA OF ANGER')[1][95:].split('SENECA OF CLEMENCY')[0]
seneca_on_clemency = seneca_all_texts.split('SENECA OF CLEMENCY')[1][7:].split('Obvious typographical errors have been corre')[0][:-80]


### Clean the Text



In [31]:
def baseline_clean(to_correct, 
                   capitals=True, 
                   bracketed_fn=False, 
                   odd_words_dict={}):
  # remove utf8 encoding characters and some punctuations
  result = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff\xad\x0c6§\\\£\Â*_<>""⎫•{}Γ~]', ' ', to_correct)
  result = re.sub(r'[\u2014\u2013\u2012-]', ' ', result)

  # replace whitespace characters with actual whitespace
  result = re.sub(r'\s', ' ', result)

  # replace odd quotation marks with a standard
  result = re.sub(r'[‘’“”]', "'", result)

  # replace the ﬀ, ﬃ and ﬁ with the appropriate counterparts
  result = re.sub(r'ﬀ', 'ff', result)
  result = re.sub(r'ﬁ', 'fi', result)
  result = re.sub(r'ﬃ', 'ffi', result)

  # replace some accented characters for ease of searching
  result = re.sub(r'é', 'e', result)

  # remove or standardize some recurring common and meaninless words/phrases
  result = re.sub(r'\s*This\s*page\s*intentionally\s*left\s*blank\s*', ' ', result)
  result = re.sub(r'(?i)Aufgabe\s+', ' ', result)
  result = re.sub(r',*\s+cf\.', ' ', result)

  # some texts have footnotes conveniently in brackets - this removes them all, 
  # with a safety measure for unpaired brackets, and deletes all brackets afterwards
  if bracketed_fn:
    result = re.sub(r'\[.{0,300}\]|{.{0,300}}|{.{0,300}\]|\[.{0,300}}', ' ', result)
  result = re.sub(r'[\[\]{}]', ' ', result)

  # unify some abbreviations
  result = re.sub(r'&', 'and', result)
  result = re.sub(r'\se\.g\.\s', ' eg ', result)
  result = re.sub(r'\si\.e\.\s', ' ie ', result)
  result = re.sub('coroll\.', 'coroll', result)
  result = re.sub('pt\.', 'pt', result)

  # remove roman numerals, first capitalized ones
  result = re.sub(r'\s((I{2,}V*X*\.*)|(IV\.*)|(IX\.*)|(V\.*)|(V+I*\.*)|(X+L*V*I*]\.*))\s', ' ', result)
  # then lowercase
  result = re.sub(r'\s((i{2,}v*x*\.*)|(iv\.*)|(ix\.*)|(v\.*)|(v+i*\.*)|(x+l*v*i*\.*))\s', ' ', result)

  # remove periods and commas flanked by numbers
  result = re.sub(r'\d\.\d', ' ', result)
  result = re.sub(r'\d,\d', ' ', result)

  # remove the number-letter-number pattern used for many citations
  result = re.sub(r'\d*\w{,2}\d', ' ', result)

  # remove numerical characters
  result = re.sub(r'\d+', ' ', result)

  # remove words of 2+ characters that are entirely capitalized 
  # (these are almost always titles, headings, or speakers in a dialogue)
  # remove capital I's that follow capital words - these almost always roman numerals
  # some texts do use these capitalizations meaningfully, so we make this optional
  if capitals:
    result = re.sub(r'[A-Z]{2,}\s+I', ' ', result)
    result = re.sub(r'[A-Z]{2,}', ' ', result)

  # remove isolated colons and semicolons that result from removal of titles
  result = re.sub(r'\s+:\s*', ' ', result)
  result = re.sub(r'\s+;\s*', ' ', result)

  # remove isolated letters (do it several times because strings of isolated letters do not get captured properly)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)

  # remove isolated letters at the end of sentences or before commas
  result = re.sub(r'\s[^aI]\.', '.', result)
  result = re.sub(r'\s[^aI],', ',', result)

  # deal with spaces around periods and commas
  result = re.sub(r'\s+,\s+', ', ', result)
  result = re.sub(r'\s+\.\s+', '. ', result)

  # remove empty parantheses
  result = re.sub(r'(\(\s*\.*\s*\))|(\(\s*,*\s*)\)', ' ', result)
  result = re.sub(r'\.\)\.', '.', result)
  result = re.sub(r'\.\(\.', '.', result)

  # reduce multiple periods, commas, or whitespaces into a single one
  result = re.sub(r'\.+', '.', result)
  result = re.sub(r',+', ',', result)
  result = re.sub(r'\s+', ' ', result)

  # deal with isolated problem cases discovered in the data:
  for key in odd_words_dict.keys():
    result = re.sub(r''+key+'', odd_words_dict[key], result)

  return result

In [32]:
# note extras like bracketed footnotes or specific words to remove


In [33]:
# build a dictionary for the book
seneca_on_benefits_dict = {
    'author': 'Seneca',
    'title': 'On Benefits',
    'text': seneca_on_benefits,
    'school': 'Stoicism',
    'words to remove': [],
    'remove capitals': True,
    'bracketed fn': False,
    'original date': 59,
    'corpus date': 2017
}

seneca_on_anger_dict = {
    'author': 'Seneca',
    'title': 'On Benefits',
    'text': seneca_on_anger,
    'school': 'Stoicism',
    'words to remove': [],
    'remove capitals': True,
    'bracketed fn': False,
    'original date': 45,
    'corpus date': 2017
}

seneca_on_clemency_dict = {
    'author': 'Seneca',
    'title': 'On Benefits',
    'text': seneca_on_clemency,
    'school': 'Stoicism',
    'words to remove': [],
    'remove capitals': True,
    'bracketed fn': False,
    'original date': 55,
    'corpus date': 2017
}

In [34]:
#@title Oddities Dictionary for Cleaning
# a dictionary of oddities to clean up
odd_words_dict = {'\sderstanding': 'derstanding',
                  '\sditference\s': ' difference ',
                  '\sforthe\s': ' for the ',
                  '\sject': 'ject',
                  '\sSure ly\s': ' Surely ',
                  '\spiness': 'piness',
                  '\sjects': 'jects', 
                  '\sness': 'ness',
                  '\schil dren\s': ' children ',
                  '\sper\scent\s': ' percent ',
                  '\sper\scent\.': ' percent.',
                  '\sper\scent,': ' percent,',
                  '\wi\son': 'ion',
                  '\spri\sori': ' priori',
                  '\stences\s': 'tences ',
                  '\sprincipleb': ' principle',
                  '\ssciousness': 'sciousness',
                  '\stion': 'tion',
                  '\spri\s': ' pri',
                  '\scluding': 'cluding',
                  '\sdom': 'dom',
                  '\sers': 'ers',
                  '\scritiq\s': ' critique ',
                  '\ssensati\s': ' sensation ',
                  '(?i)\syou\sll': " you'll",
                  '\sI\sll': " I'll",
                  '(?i)\swe\sll': " we'll",
                  '(?i)he\sll': " he'll",
                  '(?i)who\sll': "who'll",
                  '(?i)\sthere\sll\s': " there'll ",
                  '\seduca\s': ' education ',
                  '\slity\s': 'lity ',
                  '\smultaneously\s': 'multaneously ',
                  '\stically\s': 'tically ',
                  '\sDa\ssein\s': ' Dasein ',
                  '(?i)\sthey\sll\s': " they'll ",
                  '(?i)\sin\tum\s': ' in turn ',
                  '\scon~\s': ' con',
                  '\sà\s': ' a ',
                  '\sjor\s': ' for ',
                  '\sluminating\s': 'luminating ',
                  '\sselj\s': ' self ',
                  '\stial\s': 'tial ',
                  '\sversal\s': 'versal ',
                  '\sexis\st': ' exist',
                  '\splauded\s': 'plauded ',
                  '\suiry\s': 'uiry ',
                  '\svithin\s': ' within ',
                  '\soj\s': ' of ',
                  '\sposi\st': ' posit',
                  '\sra\sther\s': ' rather ',
                  '(?i)\sthat\sll\s': " that'll ",
                  '(?i)\sa\sll\s': ' all ',
                  '\so\sther\s': ' other ',
                  '\sra\sther\s': ' rather ',
                  '\snei\sther\s': ' neither ',
                  '\sei\sther\s': ' either ',
                  '\sfur\sther\s': ' further ',
                  '\sano\sther': ' another ',
                  '\sneces\s': ' neces',
                  'u\slar\s': 'ular ',
                  '\sference\s': 'ference ',
                  '(?i)it\sll\s': "it'll ",
                  '\stoge\sther': ' together ',
                  '\sknowledgeb\s': ' knowledge ',
                  'r\stain\s': 'rtain ',
                  'on\stain\s': 'ontain',
                  '(?i)j\sect\s': 'ject',
                  '\sob\sect\s': ' object ',
                  '\sbtle\s': 'btle ',
                  '\snition\s': 'nition ',
                  '\sdering\s': 'dering ', 
                  '\sized\s': 'ized ',
                  '\sther\shand': ' other hand',
                  '\ture\s': 'ture ',
                  '\sabso\sl': ' absol',
                  '\stly\s': 'tly ',
                  '\serty\s': 'erty ',
                  '\sobj\se': ' obj',
                  '\sffiir\s': ' for ',
                  '\sndeed\s': ' indeed ',
                  '\sfonn\s': ' form ',
                  '\snally\s': 'nally ',
                  'ain\sty\s': 'ainty ',
                  'ici\sty\s': 'icity ',
                  '\scog\sni': ' cogni',
                  '\sacc\s': ' acc',
                  '\sindi\svid\sual': ' individual', 
                  '\sintu\sit': ' intuit',
                  'r\sance\s': 'rance ',
                  '\ssions\s': 'sions ',
                  '\sances\s': 'ances ',
                  '\sper\sception\s': ' perception ',
                  '\sse\sries\s': ' series ',
                  '\sque\sries\s': ' queries ',
                  '\sessary\s': 'essary ',
                  '\sofa\s': ' of a ',
                  '\scer\stainty\s': ' certainty ',
                  'ec\stivity\s': 'ectivity ',
                  '\stivity\s': 'tivity ',
                  '\slation\s': 'lation ',
                  '\sir\sr': ' irr',
                  '\ssub\sstance\s': ' substance ',
                  'sec\sond\s': 'second ',
                  '\s\.rv': '',
                  '\story\s': 'tory ',
                  '\sture\s': 'ture ',
                  '\sminate\s': 'minate ',
                  '\sing\s': 'ing ',
                  '\splicity\s': 'plicity ',
                  '\ssimi\slar\s': ' similar ',
                  '\scom\smunity\s': ' community ',
                  '\sitselfa\s': ' itself a ',
                  '\ssimp\s': ' simply ',
                  '\scon\stex': ' contex',
                  '\scon\sseq': ' conseq',
                  '\scon\stai': ' contai',
                  '\sofwhat\s': ' of what ',
                  '\sui\s': 'ui',
                  '\sofan\s': ' of an ',
                  '\saccor\sdance\s': ' accordance ',
                  '\stranscen\sdental\s': ' transcendental ',
                  '\sap\spearances\s': ' appearances ',
                  'e\squences\s': 'equences ',
                  '\sorits\s': ' or its ',
                  '\simma\sn': ' imman',
                  '\seq\sua': ' equa',
                  '\simpl\sied\s': ' implied ',
                  '\sbuta\s': ' but a ',
                  '\sa\snd\s': ' and ',
                  '\sence\s': 'ence ',
                  '\stain\s': 'tain ',
                  '\sunder\sstanding\s': ' understanding ',
                  'i\sence\s': 'ience ',
                  'r\sence\s': 'rence ',
                  '\stical\s': 'tical ',
                  '\sobjectsb\s': ' objects ',
                  '\stbe\s': ' the ',
                  '\smul\st': ' mult',
                  '\sgen\seral\s': ' general ',
                  '\suniver\ssal\s': ' universal ',
                  '\scon\stent\s': ' content ',
                  '\spar\sticular\s': ' particular ',
                  'ver\ssity\s': 'versity ',
                  '\sCritiq\s': ' Critique ',
                  '\sphilo\ssophy\s': ' philosophy ',
                  '\seq\s': ' eq'}

In [35]:
# a function that takes the dictionary and returns a dataframe of sentences
def from_raw_to_df(text_dict):
  nlp.max_length = 9000000
  text = text_dict['text']
  text = remove_words(text, text_dict['words to remove'])
  text = baseline_clean(text, capitals=text_dict['remove capitals'],
                        bracketed_fn=text_dict['bracketed fn'],
                        odd_words_dict=odd_words_dict)
  text_nlp = nlp(text, disable=['ner'])
  text_df = pd.DataFrame(columns=['title', 'author', 'school', 'sentence_spacy'])
  text_df['sentence_spacy'] = list(text_nlp.sents)
  text_df['author'] = text_dict['author']
  text_df['title'] = text_dict['title']
  text_df['school'] = text_dict['school']
  text_df['original_publication_date'] = text_dict['original date']
  text_df['corpus_edition_date'] = text_dict['corpus date']
  text_df['sentence_str'] = text_df['sentence_spacy'].apply(lambda x: ''.join(list(str(x))))
  return text_df

In [36]:
# use the function
on_benefits_df = from_raw_to_df(seneca_on_benefits_dict)
on_anger_df = from_raw_to_df(seneca_on_anger_dict)
on_clemency_df = from_raw_to_df(seneca_on_clemency_dict)

df = on_benefits_df.append(on_anger_df, ignore_index=True).append(on_clemency_df, ignore_index=True)

In [37]:
# checking the result
pd.options.display.max_colwidth = 200
df.sample(10)

Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str
1986,On Benefits,Seneca,Stoicism,"(That, which, we, take, to, be, very, high, at, a, distance)",59,2017,That which we take to be very high at a distance
3210,On Benefits,Seneca,Stoicism,"(To, take, a, farther, view, ,, now, ,, of, the, miserable, consequences, and, sanguinary, effects, of, this, hideous, distemper, ;, from, hence, come, slaughters, and, poisons, ,, wars, ,, and, d...",45,2017,"To take a farther view, now, of the miserable consequences and sanguinary effects of this hideous distemper; from hence come slaughters and poisons, wars, and desolations, the razing and burning o..."
3596,On Benefits,Seneca,Stoicism,"(And, then, it, has, this, misfortune, ,, that, a, man, must, be, wicked, upon, necessity, ;, for, there, is, no, going, back, ;, so, that, he, must, betake, himself, to, arms, ,, and)",45,2017,"And then it has this misfortune, that a man must be wicked upon necessity; for there is no going back; so that he must betake himself to arms, and"
3239,On Benefits,Seneca,Stoicism,"(So, that, at, last, he, ordered, a, retreat, ,, wanting, no, delicates, all, this, while, for, himself, ,, while, his, soldiers, were, taking, their, chance, who, should, die, miserably, ,, or, l...",45,2017,"So that at last he ordered a retreat, wanting no delicates all this while for himself, while his soldiers were taking their chance who should die miserably, or live worse."
2431,On Benefits,Seneca,Stoicism,"(He, that, is, a, slave, to, business, is, the, most, wretched, of, slaves, ., ')",59,2017,He that is a slave to business is the most wretched of slaves. '
3276,On Benefits,Seneca,Stoicism,"(The, sight, of, a, red, coat, enrages, a, bull, ;, a, shadow, provokes, the, asp, ;, nay, ,, so, unreasonable, are, some, men, ,, that, they, take, moderate, benefits, for, injuries, ,, and, squa...",45,2017,"The sight of a red coat enrages a bull; a shadow provokes the asp; nay, so unreasonable are some men, that they take moderate benefits for injuries, and squabble about it with their nearest relati..."
940,On Benefits,Seneca,Stoicism,"(And, ,, if, a, man, may, judge, of, the, conscience, by, the, countenance, the, ungrateful, man, is, never, without, a, canker, at, his, heart, ;, his, mind, an, aspect, is, sad, and, solicitous,...",59,2017,"And, if a man may judge of the conscience by the countenance the ungrateful man is never without a canker at his heart; his mind an aspect is sad and solicitous; whereas the other is always cheerf..."
2582,On Benefits,Seneca,Stoicism,"(Why, was, such, a, one, taken, away, in, the, prime, of, his, years, ?, ')",59,2017,Why was such a one taken away in the prime of his years?'
2950,On Benefits,Seneca,Stoicism,"(Thrasippus, in, his, drink, fell, foul, upon, the, cruelties, of, Pisistratus, ;, who, ,, when, he, was, urged, by, several, about, him, to, make, an, example, of, him, ,, returned, this, answer,...",45,2017,"Thrasippus in his drink fell foul upon the cruelties of Pisistratus; who, when he was urged by several about him to make an example of him, returned this answer, 'Why should I be angry with a man ..."
1114,On Benefits,Seneca,Stoicism,"(If, there, were, any, difference, ,, I, should, prefer, the, virtues, of, patience, before, those, of, pleasure, ;, for, it, is, braver, to, break, through, difficulties, than, to, temper, our, d...",59,2017,"If there were any difference, I should prefer the virtues of patience before those of pleasure; for it is braver to break through difficulties than to temper our delights."


In [38]:
len(df)

3810

#### Remove Short Sentences

In [39]:
df['sentence_length'] = df['sentence_str'].map(lambda x: len(x))
num_of_short_entries = len(df[df['sentence_length'] < 20])
print(f"there are {num_of_short_entries} so-called sentences with fewer than 20 characters")
df[df['sentence_length'] < 20].sample(5)

there are 72 so-called sentences with fewer than 20 characters


Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length
701,On Benefits,Seneca,Stoicism,"(What, do, I, know)",59,2017,What do I know,14
2764,On Benefits,Seneca,Stoicism,"(we, are, sure, of, .)",59,2017,we are sure of.,15
2510,On Benefits,Seneca,Stoicism,('),59,2017,',1
521,On Benefits,Seneca,Stoicism,('),59,2017,',1
359,On Benefits,Seneca,Stoicism,(A.),59,2017,A.,2


In [40]:
df = df.drop(df[df['sentence_length'] < 20].index)
len(df)

3738

#### Remove Cases of Self-Mention

In [41]:
# change the author name in this cell 

self_mentions = df[df['sentence_str'].str.contains('\s'+'Anselm'.lower())]
print(len(self_mentions))
self_mentions

0


Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length


In [42]:
df = df.drop(df[df['sentence_str'].str.contains('\s'+'Augustine'.lower())].index)

len(df)

3738

#### Deal with Duplicates

In [43]:
# find the total number of duplicates
len(df['sentence_str'])-len(df['sentence_str'].drop_duplicates())

0

In [44]:
doubles_df = pd.concat(g for _, g in df.groupby("sentence_str") if len(g) > 1)
doubles_df

ValueError: ignored

In [46]:
df = df.drop(df[df['sentence_str'].duplicated(keep='first')].index)

In [47]:
len(df)

3738

#### Check for Foreign Languages

In [48]:
# checking for 'der', a common article in German
len((df[df['sentence_str'].str.contains('\sder\s')]))

0

In [49]:
# checking for 'il', a common article in French
len(df[df['sentence_str'].str.contains('\sil\s')])

0

#### Some Ad Hoc Cleaning

In [50]:
# miscellaneous nonsense sentences
df = df.drop(df[df['sentence_str'].str.contains('\spp\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\stotam\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\srree\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sflir\s')].index)
df = df.drop(df[(df['sentence_str'].str.contains('\smodis\s')) & (df['author'] != 'Kant')].index)

len(df)

3738

In [51]:
# markers of french and notes
df = df.drop(df[df['sentence_str'].str.contains('\schapitre')].index)
df = df.drop(df[df['sentence_str'].str.contains('\salisme')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sHahn')].index)

len(df)

3738

In [52]:
# some notes in Kant
df = df.drop(df[df['sentence_str'].str.contains('\sVorl\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sberschwenglich')].index)

len(df)

3738

In [53]:
# a common phrase in Plato / Aristotle footnotes
df = df.drop(df[(df['author']=='Plato') & (df['sentence_str'].str.contains('(?i)reading')) & (df['sentence_length'] < 40)].index)
df = df.drop(df[(df['author']=='Aristotle') & (df['sentence_str'].str.contains('(?i)reading')) & (df['sentence_length'] < 40)].index)

len(df)

3738

In [54]:
# mentions of Aristotle in Plato
df = df.drop(df[(df['author']=='Plato') & df['sentence_str'].str.contains('Aristotle')].index)

len(df)

3738

### Lemmatize and Tokenize

In [55]:
from gensim.utils import simple_preprocess

# use gensim to tokenize sentences
df['tokenized_txt'] = df['sentence_str'].map(lambda x: simple_preprocess(x.lower(),deacc=True,
                                                        max_len=200))

# use spacey to get intelligent lemmatization
def lemmatize_sentence(sentence):
  lemmatized_txt = ''
  for word in sentence:
    lemmatized_txt += ' ' + str(word.lemma_)
  return lemmatized_txt

In [56]:
df['lemmatized_str'] = df['sentence_spacy'].apply(lemmatize_sentence)

In [57]:
df.sample(5)

Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length,tokenized_txt,lemmatized_str
1992,On Benefits,Seneca,Stoicism,"(a, little, bread, and, water, is, sufficient, ,, and, all, the, rest, is, but, superfluous, .)",59,2017,"a little bread and water is sufficient, and all the rest is but superfluous.",76,"[little, bread, and, water, is, sufficient, and, all, the, rest, is, but, superfluous]","a little bread and water be sufficient , and all the rest be but superfluous ."
3105,On Benefits,Seneca,Stoicism,"(He, shall, never, stir, out, of, his, house, but, he, shall, meet, with, criminals, of, all, sorts, ;, prodigal, ,, impudent, ,, covetous, ,, perfidious, ,, contentious, ,, children, persecuting,...",45,2017,"He shall never stir out of his house but he shall meet with criminals of all sorts; prodigal, impudent, covetous, perfidious, contentious, children persecuting their parents, parents cursing their...",333,"[he, shall, never, stir, out, of, his, house, but, he, shall, meet, with, criminals, of, all, sorts, prodigal, impudent, covetous, perfidious, contentious, children, persecuting, their, parents, p...","-PRON- shall never stir out of -PRON- house but -PRON- shall meet with criminal of all sort ; prodigal , impudent , covetous , perfidious , contentious , child persecute -PRON- parent , parent cu..."
3276,On Benefits,Seneca,Stoicism,"(The, sight, of, a, red, coat, enrages, a, bull, ;, a, shadow, provokes, the, asp, ;, nay, ,, so, unreasonable, are, some, men, ,, that, they, take, moderate, benefits, for, injuries, ,, and, squa...",45,2017,"The sight of a red coat enrages a bull; a shadow provokes the asp; nay, so unreasonable are some men, that they take moderate benefits for injuries, and squabble about it with their nearest relati...",317,"[the, sight, of, red, coat, enrages, bull, shadow, provokes, the, asp, nay, so, unreasonable, are, some, men, that, they, take, moderate, benefits, for, injuries, and, squabble, about, it, with, t...","the sight of a red coat enrage a bull ; a shadow provoke the asp ; nay , so unreasonable be some man , that -PRON- take moderate benefit for injury , and squabble about -PRON- with -PRON- near re..."
3054,On Benefits,Seneca,Stoicism,"(I, will, do, my, duty, without, fear, or, confusion, ,, I, will, not, rage, ,, I, will, not, weep, ;, but, discharge, the, office, of, a, good, man, without, forfeiting, the, dignity, of, a, man, .)",45,2017,"I will do my duty without fear or confusion, I will not rage, I will not weep; but discharge the office of a good man without forfeiting the dignity of a man.",158,"[will, do, my, duty, without, fear, or, confusion, will, not, rage, will, not, weep, but, discharge, the, office, of, good, man, without, forfeiting, the, dignity, of, man]","-PRON- will do -PRON- duty without fear or confusion , -PRON- will not rage , -PRON- will not weep ; but discharge the office of a good man without forfeit the dignity of a man ."
1700,On Benefits,Seneca,Stoicism,"(It, is, a, great, matter, for, a, man, to, advance, his, mind, above, her, threats, or, flatteries, ;, for, he, that, has, once, gotten, the, better, of, her, is, safe, forever, .)",59,2017,It is a great matter for a man to advance his mind above her threats or flatteries; for he that has once gotten the better of her is safe forever.,146,"[it, is, great, matter, for, man, to, advance, his, mind, above, her, threats, or, flatteries, for, he, that, has, once, gotten, the, better, of, her, is, safe, forever]",-PRON- be a great matter for a man to advance -PRON- mind above -PRON- threat or flattery ; for -PRON- that have once get the well of -PRON- be safe forever .


### Combine with the Old Dataframe & Export to CSV

In [58]:
# load the old version and check it out
og_df = pd.read_csv('/gdrive/MyDrive/Colab_Projects/philosophy_data_project/philosophy_data.csv')
og_df.sample(5)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,original_publication_date,corpus_edition_date,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
176125,Philosophical Troubles,Kripke,analytic,"If we restricted the admissible total extensions to those defining maximal consistent sets of sentences, in the usual sense, not only but even Sent :T neg will come out true in the minimal fixed p...","If we restricted the admissible total extensions to those defining maximal consistent sets of sentences, in the usual sense, not only but even Sent :T neg will come out true in the minimal fixed p...",1975,2011,201,"if we restricted the admissible total extensions to those defining maximal consistent sets of sentences, in the usual sense, not only but even sent :t neg will come out true in the minimal fixed p...","['if', 'we', 'restricted', 'the', 'admissible', 'total', 'extensions', 'to', 'those', 'defining', 'maximal', 'consistent', 'sets', 'of', 'sentences', 'in', 'the', 'usual', 'sense', 'not', 'only', ...","if -PRON- restrict the admissible total extension to those define maximal consistent set of sentence , in the usual sense , not only but even send : T neg will come out true in the minimal fix po..."
109898,Ethics,Spinoza,rationalism,"This may be answered by pointing out that, if reason persuaded him to act thus, it would persuade all men to act in a similar manner, in which case reason would persuade men not to agree in good f...","This may be answered by pointing out that, if reason persuaded him to act thus, it would persuade all men to act in a similar manner, in which case reason would persuade men not to agree in good f...",1677,2003,306,"this may be answered by pointing out that, if reason persuaded him to act thus, it would persuade all men to act in a similar manner, in which case reason would persuade men not to agree in good f...","['this', 'may', 'be', 'answered', 'by', 'pointing', 'out', 'that', 'if', 'reason', 'persuaded', 'him', 'to', 'act', 'thus', 'it', 'would', 'persuade', 'all', 'men', 'to', 'act', 'in', 'similar', '...","this may be answer by point out that , if reason persuade -PRON- to act thus , -PRON- would persuade all man to act in a similar manner , in which case reason would persuade man not to agree in g..."
335057,Twilight Of The Idols,Nietzsche,nietzsche,I have given my reply to the problem in advance.,I have given my reply to the problem in advance.,1888,2016,48,i have given my reply to the problem in advance.,"['have', 'given', 'my', 'reply', 'to', 'the', 'problem', 'in', 'advance']",-PRON- have give -PRON- reply to the problem in advance .
320909,On The Principles Of Political Economy And Taxation,Ricardo,capitalism,In no respect would such a tax differ from a tax on rent.,In no respect would such a tax differ from a tax on rent.,1817,2010,57,in no respect would such a tax differ from a tax on rent.,"['in', 'no', 'respect', 'would', 'such', 'tax', 'differ', 'from', 'tax', 'on', 'rent']",in no respect would such a tax differ from a tax on rent .
77401,Aristotle - Complete Works,Aristotle,aristotle,"For why, they say, when we are ill or ugly, does no one blame us for things of this sort?","For why, they say, when we are ill or ugly, does no one blame us for things of this sort?",-320,1991,89,"for why, they say, when we are ill or ugly, does no one blame us for things of this sort?","['for', 'why', 'they', 'say', 'when', 'we', 'are', 'ill', 'or', 'ugly', 'does', 'no', 'one', 'blame', 'us', 'for', 'things', 'of', 'this', 'sort']","for why , -PRON- say , when -PRON- be ill or ugly , do no one blame -PRON- for thing of this sort ?"


In [59]:
og_df['author'].value_counts(normalize=True)

Aristotle          0.133327
Plato              0.104865
Hegel              0.062045
Foucault           0.041655
Heidegger          0.041652
Kant               0.038616
Nietzsche          0.037030
Marx               0.036869
Lewis              0.035861
Beauvoir           0.035579
Malebranche        0.035524
Deleuze            0.034275
Kripke             0.034109
Smith              0.031960
Wittgenstein       0.024692
Locke              0.024285
Hume               0.022719
Merleau-Ponty      0.020751
Quine              0.020152
Derrida            0.016397
Husserl            0.015694
Fichte             0.014508
Russell            0.013866
Leibniz            0.013740
Popper             0.012786
Lenin              0.012215
Augustine          0.011119
Spinoza            0.010367
Moore              0.010026
Keynes             0.009323
Ricardo            0.008446
Davis              0.008361
Berkeley           0.007473
Wollstonecraft     0.006994
Marcus Aurelius    0.006046
Descartes          0

In [65]:
len(og_df)

365861

In [60]:
# append the new data
new_df = og_df.append(df)
new_df['author'].value_counts(normalize=True)

Aristotle          0.131978
Plato              0.103804
Hegel              0.061418
Foucault           0.041234
Heidegger          0.041231
Kant               0.038225
Nietzsche          0.036656
Marx               0.036496
Lewis              0.035498
Beauvoir           0.035219
Malebranche        0.035165
Deleuze            0.033929
Kripke             0.033764
Smith              0.031637
Wittgenstein       0.024443
Locke              0.024040
Hume               0.022489
Merleau-Ponty      0.020541
Quine              0.019949
Derrida            0.016231
Husserl            0.015536
Fichte             0.014362
Russell            0.013726
Leibniz            0.013601
Popper             0.012657
Lenin              0.012091
Augustine          0.011007
Spinoza            0.010262
Seneca             0.010114
Moore              0.009924
Keynes             0.009229
Ricardo            0.008360
Davis              0.008277
Berkeley           0.007397
Wollstonecraft     0.006924
Marcus Aurelius    0

In [61]:
new_df[new_df['author']=='Seneca'].sample(5)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,original_publication_date,corpus_edition_date,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
3440,On Benefits,Seneca,Stoicism,"(But, this, must, be, a, person, then, that, has, some, authority, over, him, .)",But this must be a person then that has some authority over him.,45,2017,64,,"[but, this, must, be, person, then, that, has, some, authority, over, him]",but this must be a person then that have some authority over -PRON- .
976,On Benefits,Seneca,Stoicism,"(In, answer, to, this, ;, men, will, be, careful, enough, when, they, oblige, without, a, law, :, nor, is, it, possible, for, a, judge, ever, to, set, us, right, in, it, ;, or, indeed, ,, anything...","In answer to this; men will be careful enough when they oblige without a law: nor is it possible for a judge ever to set us right in it; or indeed, anything else, but the faith of the receiver.",59,2017,193,,"[in, answer, to, this, men, will, be, careful, enough, when, they, oblige, without, law, nor, is, it, possible, for, judge, ever, to, set, us, right, in, it, or, indeed, anything, else, but, the, ...","in answer to this ; man will be careful enough when -PRON- oblige without a law : nor be -PRON- possible for a judge ever to set -PRON- right in -PRON- ; or indeed , anything else , but the faith..."
1088,On Benefits,Seneca,Stoicism,"(They, are, great, blessings, to, have, tender, parents, ,, dutiful, children, ,, and, to, live, under, a, just, and, well, ordered, government, .)","They are great blessings to have tender parents, dutiful children, and to live under a just and well ordered government.",59,2017,120,,"[they, are, great, blessings, to, have, tender, parents, dutiful, children, and, to, live, under, just, and, well, ordered, government]","-PRON- be great blessing to have tender parent , dutiful child , and to live under a just and well order government ."
3668,On Benefits,Seneca,Stoicism,"(He, that, uses, his, power, as, he, should, ,, takes, as, much, delight, in, making, it, comfortable, to, his, people, as, glorious, to, himself, .)","He that uses his power as he should, takes as much delight in making it comfortable to his people as glorious to himself.",55,2017,121,,"[he, that, uses, his, power, as, he, should, takes, as, much, delight, in, making, it, comfortable, to, his, people, as, glorious, to, himself]","-PRON- that use -PRON- power as -PRON- should , take as much delight in make -PRON- comfortable to -PRON- people as glorious to -PRON- ."
1041,On Benefits,Seneca,Stoicism,"(There, is, no, condition, of, life, that, excludes, a, wise, man, from, discharging, his, duty, .)",There is no condition of life that excludes a wise man from discharging his duty.,59,2017,81,,"[there, is, no, condition, of, life, that, excludes, wise, man, from, discharging, his, duty]",there be no condition of life that exclude a wise man from discharge -PRON- duty .


In [62]:
# export as csv
from google.colab import files
new_df.to_csv('phil_nlp.csv', index=False) 
files.download('phil_nlp.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

###Upload Data to the SQL Server

In [63]:
# prepare to upload to the PostgreSQL database

# note which dataframe you set this to - new_df for the whole dataset, df for 
# just the new text

for_db = new_df
for_db['date'] = for_db['original_publication_date']
for_db['date'] = for_db['date'].apply(lambda x: str(x)[1:]+' BC' if x < 0 else str(x))
for_db['sentence'] = for_db['sentence_str']
for_db['school'] = for_db['school'].apply(lambda x: x.replace('_', ' ').title())
for_db = for_db.drop(['sentence_spacy', 
                      'sentence_length',
                      'sentence_lowered', 
                      'sentence_str', 
                      'tokenized_txt', 
                      'lemmatized_str',
                      'corpus_edition_date',
                      'original_publication_date'], axis=1)
for_db.columns = [i.upper() for i in for_db.columns]

for_db.sample(5)

Unnamed: 0,TITLE,AUTHOR,SCHOOL,DATE,SENTENCE
133670,The Problems Of Philosophy,Russell,Analytic,1912,"It might seem as though we were quite sure of being the same person to day as we were yesterday, and this is no doubt true in some sense."
292248,Capital,Marx,Communism,1883,"A third owner of commodities comes to him now as seller, who in this capacity also enjoys the privilege of selling his commodities too dear."
3769,On Benefits,Seneca,Stoicism,55,"He is discovered, and can do no hurt to your person; and it will yet advantage you in your reputation.'"
51708,Aristotle - Complete Works,Aristotle,Aristotle,320 BC,They compare it to what happens when you strike the sea with a rod by night and the water is seen to shine.
287,On Benefits,Seneca,Stoicism,59,"must the sun stand still in the middle of his course, and heaven and earth drop into confusion?"


In [64]:
len(for_db)

369599

In [70]:
#importing sql library 
from sqlalchemy import create_engine 
  
# create a reference  
# for sql library 
engine = create_engine('postgresql://wgamztqqflivsb:6cdd7254bde0dec3a9b3fcbd1f24429f49e2106d1cf782c44dadfb07fb9f365d@ec2-54-146-73-98.compute-1.amazonaws.com:5432/d8nrolc7ed7joo',
                       echo=False)
  
# attach the data frame to the sql server 
for_db[for_db['AUTHOR']=='Seneca'].to_sql('phil_nlp', 
               con = engine,
              if_exists='append',
              index=False,
              method='multi') 
  
# show the completed data as a test
print(engine.execute("""SELECT * FROM phil_nlp WHERE "AUTHOR" = 'Seneca'""").fetchone()) 

  """)


('On Benefits', 'Seneca', 'Stoicism', '59', 'And this arises from a mistake, partly in the person that we would oblige, and partly in the thing itself.')


Remember to add to the clipping and other elements to the notebook that creates the database as a whole. Then you're done!

In [71]:
print(engine.execute("""SELECT * FROM phil_nlp where "AUTHOR" = 'Anselm'""").fetchone()) 

('Proslogion', 'Anselm', 'Scholasticism', '1077', 'How is it, then, Lord, that You are all these things?')
