<a href="https://colab.research.google.com/github/kcalizadeh/PDP_data_processing/blob/master/new_text_introduction_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports and Mounting Drive

In [1]:
# this cell mounts drive, sets the correct directory, then imports all functions
# and relevant libraries via the functions.py file
from google.colab import drive
import sys

drive.mount('/gdrive',force_remount=True)

drive_path = '/gdrive/MyDrive/Colab_Projects/philosophy_data_project'

sys.path.append(drive_path)

Mounted at /gdrive


In [2]:
from import_functions import *

In [3]:
import spacy.cli
spacy.cli.download("en_core_web_lg")
import en_core_web_lg
nlp = en_core_web_lg.load()

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


###Load the Text and Clip Front and End Matter

In [4]:
# if you are deleting an old text that was added here, make sure it is in the 
# primary database construction notebook
seneca_all_texts = get_guten('http://www.gutenberg.org/files/56075/56075-0.txt')

In [30]:
seneca_on_benefits = seneca_all_texts.split('SENECA OF BENEFITS')[1][54:].split('SENECA OF ANGER')[0]
seneca_on_anger = seneca_all_texts.split('SENECA OF ANGER')[1][95:].split('SENECA OF CLEMENCY')[0]
seneca_on_clemency = seneca_all_texts.split('SENECA OF CLEMENCY')[1][7:].split('Obvious typographical errors have been corre')[0][:-80]


### Clean the Text



In [31]:
def baseline_clean(to_correct, 
                   capitals=True, 
                   bracketed_fn=False, 
                   odd_words_dict={}):
  # remove utf8 encoding characters and some punctuations
  result = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff\xad\x0c6§\\\£\Â*_<>""⎫•{}Γ~]', ' ', to_correct)
  result = re.sub(r'[\u2014\u2013\u2012-]', ' ', result)

  # replace whitespace characters with actual whitespace
  result = re.sub(r'\s', ' ', result)

  # replace odd quotation marks with a standard
  result = re.sub(r'[‘’“”]', "'", result)

  # replace the ﬀ, ﬃ and ﬁ with the appropriate counterparts
  result = re.sub(r'ﬀ', 'ff', result)
  result = re.sub(r'ﬁ', 'fi', result)
  result = re.sub(r'ﬃ', 'ffi', result)

  # replace some accented characters for ease of searching
  result = re.sub(r'é', 'e', result)

  # remove or standardize some recurring common and meaninless words/phrases
  result = re.sub(r'\s*This\s*page\s*intentionally\s*left\s*blank\s*', ' ', result)
  result = re.sub(r'(?i)Aufgabe\s+', ' ', result)
  result = re.sub(r',*\s+cf\.', ' ', result)

  # some texts have footnotes conveniently in brackets - this removes them all, 
  # with a safety measure for unpaired brackets, and deletes all brackets afterwards
  if bracketed_fn:
    result = re.sub(r'\[.{0,300}\]|{.{0,300}}|{.{0,300}\]|\[.{0,300}}', ' ', result)
  result = re.sub(r'[\[\]{}]', ' ', result)

  # unify some abbreviations
  result = re.sub(r'&', 'and', result)
  result = re.sub(r'\se\.g\.\s', ' eg ', result)
  result = re.sub(r'\si\.e\.\s', ' ie ', result)
  result = re.sub('coroll\.', 'coroll', result)
  result = re.sub('pt\.', 'pt', result)

  # remove roman numerals, first capitalized ones
  result = re.sub(r'\s((I{2,}V*X*\.*)|(IV\.*)|(IX\.*)|(V\.*)|(V+I*\.*)|(X+L*V*I*]\.*))\s', ' ', result)
  # then lowercase
  result = re.sub(r'\s((i{2,}v*x*\.*)|(iv\.*)|(ix\.*)|(v\.*)|(v+i*\.*)|(x+l*v*i*\.*))\s', ' ', result)

  # remove periods and commas flanked by numbers
  result = re.sub(r'\d\.\d', ' ', result)
  result = re.sub(r'\d,\d', ' ', result)

  # remove the number-letter-number pattern used for many citations
  result = re.sub(r'\d*\w{,2}\d', ' ', result)

  # remove numerical characters
  result = re.sub(r'\d+', ' ', result)

  # remove words of 2+ characters that are entirely capitalized 
  # (these are almost always titles, headings, or speakers in a dialogue)
  # remove capital I's that follow capital words - these almost always roman numerals
  # some texts do use these capitalizations meaningfully, so we make this optional
  if capitals:
    result = re.sub(r'[A-Z]{2,}\s+I', ' ', result)
    result = re.sub(r'[A-Z]{2,}', ' ', result)

  # remove isolated colons and semicolons that result from removal of titles
  result = re.sub(r'\s+:\s*', ' ', result)
  result = re.sub(r'\s+;\s*', ' ', result)

  # remove isolated letters (do it several times because strings of isolated letters do not get captured properly)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)

  # remove isolated letters at the end of sentences or before commas
  result = re.sub(r'\s[^aI]\.', '.', result)
  result = re.sub(r'\s[^aI],', ',', result)

  # deal with spaces around periods and commas
  result = re.sub(r'\s+,\s+', ', ', result)
  result = re.sub(r'\s+\.\s+', '. ', result)

  # remove empty parantheses
  result = re.sub(r'(\(\s*\.*\s*\))|(\(\s*,*\s*)\)', ' ', result)
  result = re.sub(r'\.\)\.', '.', result)
  result = re.sub(r'\.\(\.', '.', result)

  # reduce multiple periods, commas, or whitespaces into a single one
  result = re.sub(r'\.+', '.', result)
  result = re.sub(r',+', ',', result)
  result = re.sub(r'\s+', ' ', result)

  # deal with isolated problem cases discovered in the data:
  for key in odd_words_dict.keys():
    result = re.sub(r''+key+'', odd_words_dict[key], result)

  return result

In [32]:
# note extras like bracketed footnotes or specific words to remove


In [101]:
# build a dictionary for the book
seneca_on_benefits_dict = {
    'author': 'Seneca',
    'title': 'On Benefits',
    'text': seneca_on_benefits,
    'school': 'Stoicism',
    'words to remove': [],
    'remove capitals': True,
    'bracketed fn': False,
    'original date': 59,
    'corpus date': 2017
}

seneca_on_anger_dict = {
    'author': 'Seneca',
    'title': 'On Anger',
    'text': seneca_on_anger,
    'school': 'Stoicism',
    'words to remove': [],
    'remove capitals': True,
    'bracketed fn': False,
    'original date': 45,
    'corpus date': 2017
}

seneca_on_clemency_dict = {
    'author': 'Seneca',
    'title': 'On Clemency',
    'text': seneca_on_clemency,
    'school': 'Stoicism',
    'words to remove': [],
    'remove capitals': True,
    'bracketed fn': False,
    'original date': 55,
    'corpus date': 2017
}

In [102]:
#@title Oddities Dictionary for Cleaning
# a dictionary of oddities to clean up
odd_words_dict = {'\sderstanding': 'derstanding',
                  '\sditference\s': ' difference ',
                  '\sforthe\s': ' for the ',
                  '\sject': 'ject',
                  '\sSure ly\s': ' Surely ',
                  '\spiness': 'piness',
                  '\sjects': 'jects', 
                  '\sness': 'ness',
                  '\schil dren\s': ' children ',
                  '\sper\scent\s': ' percent ',
                  '\sper\scent\.': ' percent.',
                  '\sper\scent,': ' percent,',
                  '\wi\son': 'ion',
                  '\spri\sori': ' priori',
                  '\stences\s': 'tences ',
                  '\sprincipleb': ' principle',
                  '\ssciousness': 'sciousness',
                  '\stion': 'tion',
                  '\spri\s': ' pri',
                  '\scluding': 'cluding',
                  '\sdom': 'dom',
                  '\sers': 'ers',
                  '\scritiq\s': ' critique ',
                  '\ssensati\s': ' sensation ',
                  '(?i)\syou\sll': " you'll",
                  '\sI\sll': " I'll",
                  '(?i)\swe\sll': " we'll",
                  '(?i)he\sll': " he'll",
                  '(?i)who\sll': "who'll",
                  '(?i)\sthere\sll\s': " there'll ",
                  '\seduca\s': ' education ',
                  '\slity\s': 'lity ',
                  '\smultaneously\s': 'multaneously ',
                  '\stically\s': 'tically ',
                  '\sDa\ssein\s': ' Dasein ',
                  '(?i)\sthey\sll\s': " they'll ",
                  '(?i)\sin\tum\s': ' in turn ',
                  '\scon~\s': ' con',
                  '\sà\s': ' a ',
                  '\sjor\s': ' for ',
                  '\sluminating\s': 'luminating ',
                  '\sselj\s': ' self ',
                  '\stial\s': 'tial ',
                  '\sversal\s': 'versal ',
                  '\sexis\st': ' exist',
                  '\splauded\s': 'plauded ',
                  '\suiry\s': 'uiry ',
                  '\svithin\s': ' within ',
                  '\soj\s': ' of ',
                  '\sposi\st': ' posit',
                  '\sra\sther\s': ' rather ',
                  '(?i)\sthat\sll\s': " that'll ",
                  '(?i)\sa\sll\s': ' all ',
                  '\so\sther\s': ' other ',
                  '\sra\sther\s': ' rather ',
                  '\snei\sther\s': ' neither ',
                  '\sei\sther\s': ' either ',
                  '\sfur\sther\s': ' further ',
                  '\sano\sther': ' another ',
                  '\sneces\s': ' neces',
                  'u\slar\s': 'ular ',
                  '\sference\s': 'ference ',
                  '(?i)it\sll\s': "it'll ",
                  '\stoge\sther': ' together ',
                  '\sknowledgeb\s': ' knowledge ',
                  'r\stain\s': 'rtain ',
                  'on\stain\s': 'ontain',
                  '(?i)j\sect\s': 'ject',
                  '\sob\sect\s': ' object ',
                  '\sbtle\s': 'btle ',
                  '\snition\s': 'nition ',
                  '\sdering\s': 'dering ', 
                  '\sized\s': 'ized ',
                  '\sther\shand': ' other hand',
                  '\ture\s': 'ture ',
                  '\sabso\sl': ' absol',
                  '\stly\s': 'tly ',
                  '\serty\s': 'erty ',
                  '\sobj\se': ' obj',
                  '\sffiir\s': ' for ',
                  '\sndeed\s': ' indeed ',
                  '\sfonn\s': ' form ',
                  '\snally\s': 'nally ',
                  'ain\sty\s': 'ainty ',
                  'ici\sty\s': 'icity ',
                  '\scog\sni': ' cogni',
                  '\sacc\s': ' acc',
                  '\sindi\svid\sual': ' individual', 
                  '\sintu\sit': ' intuit',
                  'r\sance\s': 'rance ',
                  '\ssions\s': 'sions ',
                  '\sances\s': 'ances ',
                  '\sper\sception\s': ' perception ',
                  '\sse\sries\s': ' series ',
                  '\sque\sries\s': ' queries ',
                  '\sessary\s': 'essary ',
                  '\sofa\s': ' of a ',
                  '\scer\stainty\s': ' certainty ',
                  'ec\stivity\s': 'ectivity ',
                  '\stivity\s': 'tivity ',
                  '\slation\s': 'lation ',
                  '\sir\sr': ' irr',
                  '\ssub\sstance\s': ' substance ',
                  'sec\sond\s': 'second ',
                  '\s\.rv': '',
                  '\story\s': 'tory ',
                  '\sture\s': 'ture ',
                  '\sminate\s': 'minate ',
                  '\sing\s': 'ing ',
                  '\splicity\s': 'plicity ',
                  '\ssimi\slar\s': ' similar ',
                  '\scom\smunity\s': ' community ',
                  '\sitselfa\s': ' itself a ',
                  '\ssimp\s': ' simply ',
                  '\scon\stex': ' contex',
                  '\scon\sseq': ' conseq',
                  '\scon\stai': ' contai',
                  '\sofwhat\s': ' of what ',
                  '\sui\s': 'ui',
                  '\sofan\s': ' of an ',
                  '\saccor\sdance\s': ' accordance ',
                  '\stranscen\sdental\s': ' transcendental ',
                  '\sap\spearances\s': ' appearances ',
                  'e\squences\s': 'equences ',
                  '\sorits\s': ' or its ',
                  '\simma\sn': ' imman',
                  '\seq\sua': ' equa',
                  '\simpl\sied\s': ' implied ',
                  '\sbuta\s': ' but a ',
                  '\sa\snd\s': ' and ',
                  '\sence\s': 'ence ',
                  '\stain\s': 'tain ',
                  '\sunder\sstanding\s': ' understanding ',
                  'i\sence\s': 'ience ',
                  'r\sence\s': 'rence ',
                  '\stical\s': 'tical ',
                  '\sobjectsb\s': ' objects ',
                  '\stbe\s': ' the ',
                  '\smul\st': ' mult',
                  '\sgen\seral\s': ' general ',
                  '\suniver\ssal\s': ' universal ',
                  '\scon\stent\s': ' content ',
                  '\spar\sticular\s': ' particular ',
                  'ver\ssity\s': 'versity ',
                  '\sCritiq\s': ' Critique ',
                  '\sphilo\ssophy\s': ' philosophy ',
                  '\seq\s': ' eq'}

In [103]:
# a function that takes the dictionary and returns a dataframe of sentences
def from_raw_to_df(text_dict):
  nlp.max_length = 9000000
  text = text_dict['text']
  text = remove_words(text, text_dict['words to remove'])
  text = baseline_clean(text, capitals=text_dict['remove capitals'],
                        bracketed_fn=text_dict['bracketed fn'],
                        odd_words_dict=odd_words_dict)
  text_nlp = nlp(text, disable=['ner'])
  text_df = pd.DataFrame(columns=['title', 'author', 'school', 'sentence_spacy'])
  text_df['sentence_spacy'] = list(text_nlp.sents)
  text_df['author'] = text_dict['author']
  text_df['title'] = text_dict['title']
  text_df['school'] = text_dict['school']
  text_df['original_publication_date'] = text_dict['original date']
  text_df['corpus_edition_date'] = text_dict['corpus date']
  text_df['sentence_str'] = text_df['sentence_spacy'].apply(lambda x: ''.join(list(str(x))))
  return text_df

In [104]:
# use the function
on_benefits_df = from_raw_to_df(seneca_on_benefits_dict)
on_anger_df = from_raw_to_df(seneca_on_anger_dict)
on_clemency_df = from_raw_to_df(seneca_on_clemency_dict)

df = on_benefits_df.append(on_anger_df, ignore_index=True).append(on_clemency_df, ignore_index=True)

In [105]:
# checking the result
pd.options.display.max_colwidth = 200
df.sample(10)

Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str
463,On Benefits,Seneca,Stoicism,"(they, couch, their, meaning)",59,2017,they couch their meaning
1158,On Benefits,Seneca,Stoicism,"(I, speak, of, those, that, fortune, has, made, famous, for, their, persecutions, :, and, there, are, others, also, that, the, world, never, took, notice, of, until, they, were, dead, ;, as, Epicu...",59,2017,"I speak of those that fortune has made famous for their persecutions: and there are others also that the world never took notice of until they were dead; as Epicurus and Metrodorus, that were almo..."
1988,On Benefits,Seneca,Stoicism,"(And, the, business, is, ,, we, do, not, understand, the, true, state, of, things, :, we, are, deceived, by, rumors, ;, when, we, have, gained, the, thing, we, aimed, at, ,, we, find, it, to, be, ...",59,2017,"And the business is, we do not understand the true state of things: we are deceived by rumors; when we have gained the thing we aimed at, we find it to be either ill or empty; or perchance less th..."
2872,On Benefits,Seneca,Stoicism,"(All, I, desire, is, that, my, property, may, not, be, a, burden, to, myself, ,, or, make, me, so, to, others, ;, and, that, is, the, best, state, of, fortune, that, is, neither, directly, necessi...",59,2017,"All I desire is that my property may not be a burden to myself, or make me so to others; and that is the best state of fortune that is neither directly necessitous, nor far from it."
1400,On Benefits,Seneca,Stoicism,"(It, is, dangerous, for, a, man, too, suddenly, ,, or, too, easily, ,, to, believe, himself, .)",59,2017,"It is dangerous for a man too suddenly, or too easily, to believe himself."
1796,On Benefits,Seneca,Stoicism,"(How, long, shall, we, covet, and, oppress, ,, enlarge, our, possessions, ,, and, account, that, too, little, for, one, man, which, was, formerly, enough, for, a, nation, ?)",59,2017,"How long shall we covet and oppress, enlarge our possessions, and account that too little for one man which was formerly enough for a nation?"
570,On Benefits,Seneca,Stoicism,"(The, benefits, that, we, receive, from, our, superiors, are, then, welcome, when, they, come, with, an, open, hand, ,, and, a, clear, brow, ;, without, either, contumely, or, state, ;, and, so, a...",59,2017,"The benefits that we receive from our superiors are then welcome when they come with an open hand, and a clear brow; without either contumely or state; and so as to prevent our necessities."
1922,On Benefits,Seneca,Stoicism,"(As, in, the, symptoms, of, an, approaching, disease, ,, a, man, shall, find, himself, lazy, and, listless, :, a, weariness, in, his, limbs, ,, with, a, yawning, and, shuddering, all, over, him, ;...",59,2017,"As in the symptoms of an approaching disease, a man shall find himself lazy and listless: a weariness in his limbs, with a yawning and shuddering all over him; so it is in the case of a weak mind,..."
359,On Benefits,Seneca,Stoicism,(A.),59,2017,A.
3021,On Anger,Seneca,Stoicism,"(There, is, nothing, great, but, what, is, virtuous, ,, nor, indeed, truly, great, ,, but, what, is, also, composed, and, quiet, .)",45,2017,"There is nothing great but what is virtuous, nor indeed truly great, but what is also composed and quiet."


In [106]:
len(df)

3810

#### Remove Short Sentences

In [107]:
df['sentence_length'] = df['sentence_str'].map(lambda x: len(x))
num_of_short_entries = len(df[df['sentence_length'] < 20])
print(f"there are {num_of_short_entries} so-called sentences with fewer than 20 characters")
df[df['sentence_length'] < 20].sample(5)

there are 72 so-called sentences with fewer than 20 characters


Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length
3022,On Anger,Seneca,Stoicism,"(Anger, ,, alas, !)",45,2017,"Anger, alas!",12
2592,On Benefits,Seneca,Stoicism,"(how, well, .)",59,2017,how well.,9
1654,On Benefits,Seneca,Stoicism,"(I, know, :)",59,2017,I know:,7
366,On Benefits,Seneca,Stoicism,"(the, brother, ?)",59,2017,the brother?,12
1773,On Benefits,Seneca,Stoicism,"(Is, it, day, ?, ')",59,2017,Is it day? ',12


In [108]:
df = df.drop(df[df['sentence_length'] < 20].index)
len(df)

3738

#### Remove Cases of Self-Mention

In [109]:
# change the author name in this cell 

self_mentions = df[df['sentence_str'].str.contains('\s'+'Anselm'.lower())]
print(len(self_mentions))
self_mentions

0


Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length


In [110]:
df = df.drop(df[df['sentence_str'].str.contains('\s'+'Augustine'.lower())].index)

len(df)

3738

#### Deal with Duplicates

In [111]:
# find the total number of duplicates
len(df['sentence_str'])-len(df['sentence_str'].drop_duplicates())

0

In [112]:
doubles_df = pd.concat(g for _, g in df.groupby("sentence_str") if len(g) > 1)
doubles_df

ValueError: ignored

In [113]:
df = df.drop(df[df['sentence_str'].duplicated(keep='first')].index)

In [114]:
len(df)

3738

#### Check for Foreign Languages

In [115]:
# checking for 'der', a common article in German
len((df[df['sentence_str'].str.contains('\sder\s')]))

0

In [116]:
# checking for 'il', a common article in French
len(df[df['sentence_str'].str.contains('\sil\s')])

0

#### Some Ad Hoc Cleaning

In [117]:
# miscellaneous nonsense sentences
df = df.drop(df[df['sentence_str'].str.contains('\spp\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\stotam\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\srree\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sflir\s')].index)
df = df.drop(df[(df['sentence_str'].str.contains('\smodis\s')) & (df['author'] != 'Kant')].index)

len(df)

3738

In [118]:
# markers of french and notes
df = df.drop(df[df['sentence_str'].str.contains('\schapitre')].index)
df = df.drop(df[df['sentence_str'].str.contains('\salisme')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sHahn')].index)

len(df)

3738

In [119]:
# some notes in Kant
df = df.drop(df[df['sentence_str'].str.contains('\sVorl\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sberschwenglich')].index)

len(df)

3738

In [120]:
# a common phrase in Plato / Aristotle footnotes
df = df.drop(df[(df['author']=='Plato') & (df['sentence_str'].str.contains('(?i)reading')) & (df['sentence_length'] < 40)].index)
df = df.drop(df[(df['author']=='Aristotle') & (df['sentence_str'].str.contains('(?i)reading')) & (df['sentence_length'] < 40)].index)

len(df)

3738

In [121]:
# mentions of Aristotle in Plato
df = df.drop(df[(df['author']=='Plato') & df['sentence_str'].str.contains('Aristotle')].index)

len(df)

3738

### Lemmatize and Tokenize

In [122]:
from gensim.utils import simple_preprocess

# use gensim to tokenize sentences
df['tokenized_txt'] = df['sentence_str'].map(lambda x: simple_preprocess(x.lower(),deacc=True,
                                                        max_len=200))

# use spacey to get intelligent lemmatization
def lemmatize_sentence(sentence):
  lemmatized_txt = ''
  for word in sentence:
    lemmatized_txt += ' ' + str(word.lemma_)
  return lemmatized_txt

In [123]:
df['lemmatized_str'] = df['sentence_spacy'].apply(lemmatize_sentence)

In [124]:
df.sample(5)

Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length,tokenized_txt,lemmatized_str
2276,On Benefits,Seneca,Stoicism,"(and, ,, in, effect, ,, a, friend, is, an, eye, ,, a, heart, ,, a, tongue, ,, a, hand, ,, at, all, distances, .)",59,2017,"and, in effect, a friend is an eye, a heart, a tongue, a hand, at all distances.",80,"[and, in, effect, friend, is, an, eye, heart, tongue, hand, at, all, distances]","and , in effect , a friend be an eye , a heart , a tongue , a hand , at all distance ."
3406,On Anger,Seneca,Stoicism,"(The, one, ,, it, is, true, ,, is, wholly, void, of, reason, ,, but, it, is, also, an, equivalent, darkness, of, mind, that, possesses, the, other, .)",45,2017,"The one, it is true, is wholly void of reason, but it is also an equivalent darkness of mind that possesses the other.",118,"[the, one, it, is, true, is, wholly, void, of, reason, but, it, is, also, an, equivalent, darkness, of, mind, that, possesses, the, other]","the one , -PRON- be true , be wholly void of reason , but -PRON- be also an equivalent darkness of mind that possess the other ."
2475,On Benefits,Seneca,Stoicism,"(He, is, restless, in, his, thoughts, ,, unsteady, in, his, counsels, ,, dissatisfied, with, the, present, ,, solicitous, for, the, future, ;, whereas, he, that, prudently, computes, his, hours, a...",59,2017,"He is restless in his thoughts, unsteady in his counsels, dissatisfied with the present, solicitous for the future; whereas he that prudently computes his hours and his business, does not only for...",377,"[he, is, restless, in, his, thoughts, unsteady, in, his, counsels, dissatisfied, with, the, present, solicitous, for, the, future, whereas, he, that, prudently, computes, his, hours, and, his, bus...","-PRON- be restless in -PRON- thought , unsteady in -PRON- counsel , dissatisfied with the present , solicitous for the future ; whereas -PRON- that prudently compute -PRON- hour and -PRON- busine..."
83,On Benefits,Seneca,Stoicism,"(Some, there, are, ,, I, know, ,, that, take, the, matter, for, the, benefit, ,, and, tax, the, obligation, by, weight, and, measure, .)",59,2017,"Some there are, I know, that take the matter for the benefit, and tax the obligation by weight and measure.",107,"[some, there, are, know, that, take, the, matter, for, the, benefit, and, tax, the, obligation, by, weight, and, measure]","some there be , -PRON- know , that take the matter for the benefit , and tax the obligation by weight and measure ."
3138,On Anger,Seneca,Stoicism,"(It, is, so, potent, a, passion, that, Socrates, durst, not, trust, himself, with, it, ., ')",45,2017,It is so potent a passion that Socrates durst not trust himself with it. ',74,"[it, is, so, potent, passion, that, socrates, durst, not, trust, himself, with, it]",-PRON- be so potent a passion that Socrates durst not trust -PRON- with -PRON- . '


### Combine with the Old Dataframe & Export to CSV

In [125]:
# load the old version and check it out
og_df = pd.read_csv('/gdrive/MyDrive/Colab_Projects/philosophy_data_project/philosophy_data.csv')
og_df.sample(5)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,original_publication_date,corpus_edition_date,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
77107,Aristotle - Complete Works,Aristotle,aristotle,"not but what some particular detail may perhaps be well looked after by an unscientific person, if he has studied accurately in the light of experience what happens in each case, just as some peop...","not but what some particular detail may perhaps be well looked after by an unscientific person, if he has studied accurately in the light of experience what happens in each case, just as some peop...",-320,1991,281,"not but what some particular detail may perhaps be well looked after by an unscientific person, if he has studied accurately in the light of experience what happens in each case, just as some peop...","['not', 'but', 'what', 'some', 'particular', 'detail', 'may', 'perhaps', 'be', 'well', 'looked', 'after', 'by', 'an', 'unscientific', 'person', 'if', 'he', 'has', 'studied', 'accurately', 'in', 't...","not but what some particular detail may perhaps be well look after by an unscientific person , if -PRON- have study accurately in the light of experience what happen in each case , just as some p..."
76881,Aristotle - Complete Works,Aristotle,aristotle,It is something of this sort that we are looking for.,It is something of this sort that we are looking for.,-320,1991,53,it is something of this sort that we are looking for.,"['it', 'is', 'something', 'of', 'this', 'sort', 'that', 'we', 'are', 'looking', 'for']",-PRON- be something of this sort that -PRON- be look for .
350267,The Second Sex,Beauvoir,feminism,I detested him because of his ways.,I detested him because of his ways.,1949,2009,35,i detested him because of his ways.,"['detested', 'him', 'because', 'of', 'his', 'ways']",-PRON- detest -PRON- because of -PRON- way .
191162,History Of Madness,Foucault,continental,"Madness begins when a subject states as an affirmation that he is dead, and gives truth value to the still neutral content of the image 'I am dead.'","Madness begins when a subject states as an affirmation that he is dead, and gives truth value to the still neutral content of the image 'I am dead.'",1961,2006,148,"madness begins when a subject states as an affirmation that he is dead, and gives truth value to the still neutral content of the image 'i am dead.'","['madness', 'begins', 'when', 'subject', 'states', 'as', 'an', 'affirmation', 'that', 'he', 'is', 'dead', 'and', 'gives', 'truth', 'value', 'to', 'the', 'still', 'neutral', 'content', 'of', 'the',...","madness begin when a subject state as an affirmation that -PRON- be dead , and give truth value to the still neutral content of the image ' -PRON- be dead . '"
83537,Aristotle - Complete Works,Aristotle,aristotle,"For what is often useful surpasses what is seldom useful, whence the saying The best of things is water.","For what is often useful surpasses what is seldom useful, whence the saying The best of things is water.",-320,1991,104,"for what is often useful surpasses what is seldom useful, whence the saying the best of things is water.","['for', 'what', 'is', 'often', 'useful', 'surpasses', 'what', 'is', 'seldom', 'useful', 'whence', 'the', 'saying', 'the', 'best', 'of', 'things', 'is', 'water']","for what be often useful surpass what be seldom useful , whence the say the good of thing be water ."


In [126]:
og_df['author'].value_counts(normalize=True)

Aristotle          0.131978
Plato              0.103804
Hegel              0.061418
Foucault           0.041234
Heidegger          0.041231
Kant               0.038225
Nietzsche          0.036656
Marx               0.036496
Lewis              0.035498
Beauvoir           0.035219
Malebranche        0.035165
Deleuze            0.033929
Kripke             0.033764
Smith              0.031637
Wittgenstein       0.024443
Locke              0.024040
Hume               0.022489
Merleau-Ponty      0.020541
Quine              0.019949
Derrida            0.016231
Husserl            0.015536
Fichte             0.014362
Russell            0.013726
Leibniz            0.013601
Popper             0.012657
Lenin              0.012091
Augustine          0.011007
Spinoza            0.010262
Seneca             0.010114
Moore              0.009924
Keynes             0.009229
Ricardo            0.008360
Davis              0.008277
Berkeley           0.007397
Wollstonecraft     0.006924
Marcus Aurelius    0

In [135]:
len(og_df)

369599

In [128]:
# append the new data
new_df = og_df.append(df)
new_df['author'].value_counts(normalize=True)

Aristotle          0.130657
Plato              0.102765
Hegel              0.060803
Foucault           0.040821
Heidegger          0.040818
Kant               0.037842
Nietzsche          0.036289
Marx               0.036131
Lewis              0.035143
Beauvoir           0.034867
Malebranche        0.034813
Deleuze            0.033589
Kripke             0.033426
Smith              0.031320
Wittgenstein       0.024198
Locke              0.023799
Hume               0.022264
Merleau-Ponty      0.020336
Seneca             0.020025
Quine              0.019749
Derrida            0.016069
Husserl            0.015380
Fichte             0.014218
Russell            0.013588
Leibniz            0.013465
Popper             0.012530
Lenin              0.011970
Augustine          0.010896
Spinoza            0.010160
Moore              0.009825
Keynes             0.009137
Ricardo            0.008277
Davis              0.008194
Berkeley           0.007323
Wollstonecraft     0.006854
Marcus Aurelius    0

In [129]:
new_df[new_df['author']=='Seneca'].sample(5)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,original_publication_date,corpus_edition_date,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
368381,On Benefits,Seneca,Stoicism,"for they are beset on all hands, and every minute in dread of a surprise.","for they are beset on all hands, and every minute in dread of a surprise.",59,2017,73,,"['for', 'they', 'are', 'beset', 'on', 'all', 'hands', 'and', 'every', 'minute', 'in', 'dread', 'of', 'surprise']","for -PRON- be beset on all hand , and every minute in dread of a surprise ."
1277,On Benefits,Seneca,Stoicism,"(There, are, some, dispositions, that, embrace, good, things, as, soon, as, they, hear, them, ;, but, they, will, still, need, quickening, by, admonition, and, precept)",There are some dispositions that embrace good things as soon as they hear them; but they will still need quickening by admonition and precept,59,2017,141,,"[there, are, some, dispositions, that, embrace, good, things, as, soon, as, they, hear, them, but, they, will, still, need, quickening, by, admonition, and, precept]",there be some disposition that embrace good thing as soon as -PRON- hear -PRON- ; but -PRON- will still need quicken by admonition and precept
2422,On Benefits,Seneca,Stoicism,"(I, speak, this, of, Cato, 's, last, part, ;, for, in, his, former, time, the, commonwealth, was, made, unfit, for, a, wise, man, 's, administration, .)",I speak this of Cato's last part; for in his former time the commonwealth was made unfit for a wise man's administration.,59,2017,121,,"[speak, this, of, cato, last, part, for, in, his, former, time, the, commonwealth, was, made, unfit, for, wise, man, administration]",-PRON- speak this of Cato 's last part ; for in -PRON- former time the commonwealth be make unfit for a wise man 's administration .
366049,On Benefits,Seneca,Stoicism,"It was well said of him that called a good office, that was done harshly, and with an ill will, a stony piece of bread it is necessary for him that is hungry to receive it, but it almost chokes a ...","It was well said of him that called a good office, that was done harshly, and with an ill will, a stony piece of bread it is necessary for him that is hungry to receive it, but it almost chokes a ...",59,2017,218,,"['it', 'was', 'well', 'said', 'of', 'him', 'that', 'called', 'good', 'office', 'that', 'was', 'done', 'harshly', 'and', 'with', 'an', 'ill', 'will', 'stony', 'piece', 'of', 'bread', 'it', 'is', 'n...","-PRON- be well say of -PRON- that call a good office , that be do harshly , and with an ill will , a stony piece of bread -PRON- be necessary for -PRON- that be hungry to receive -PRON- , but -PR..."
2972,On Anger,Seneca,Stoicism,"(Nothing, but, a, predominant, fear, could, ever, have, mastered, his, choleric, and, sanguinary, disposition, .)",Nothing but a predominant fear could ever have mastered his choleric and sanguinary disposition.,45,2017,96,,"[nothing, but, predominant, fear, could, ever, have, mastered, his, choleric, and, sanguinary, disposition]",nothing but a predominant fear could ever have master -PRON- choleric and sanguinary disposition .


In [130]:
# export as csv
from google.colab import files
new_df.to_csv('phil_nlp.csv', index=False) 
files.download('phil_nlp.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

###Upload Data to the SQL Server

In [131]:
# prepare to upload to the PostgreSQL database

# note which dataframe you set this to - new_df for the whole dataset, df for 
# just the new text

for_db = new_df
for_db['date'] = for_db['original_publication_date']
for_db['date'] = for_db['date'].apply(lambda x: str(x)[1:]+' BC' if x < 0 else str(x))
for_db['sentence'] = for_db['sentence_str']
for_db['school'] = for_db['school'].apply(lambda x: x.replace('_', ' ').title())
for_db = for_db.drop(['sentence_spacy', 
                      'sentence_length',
                      'sentence_lowered', 
                      'sentence_str', 
                      'tokenized_txt', 
                      'lemmatized_str',
                      'corpus_edition_date',
                      'original_publication_date'], axis=1)
for_db.columns = [i.upper() for i in for_db.columns]

for_db.sample(5)

Unnamed: 0,TITLE,AUTHOR,SCHOOL,DATE,SENTENCE
96492,A Treatise Of Human Nature,Hume,Empiricism,1739,"Now such as the parts are, such is the whole."
14892,Plato - Complete Works,Plato,Plato,350 BC,"It is because of this that Nicias and Socrates agree (Laches is slow to accept the point, but it is clearly implied in what he has already said about courage's involving 'wisdom')"
263229,The System Of Ethics,Fichte,German Idealism,1798,General overview of the issue
49467,Aristotle - Complete Works,Aristotle,Aristotle,320 BC,"The only difference is that in the last case, viz."
342640,Vindication Of The Rights Of Woman,Wollstonecraft,Feminism,1792,"Yet thus to give a sex to mind was not very consistent with the principles of a man who argued so warmly, and so well, for the immortality of the soul."


In [132]:
len(for_db)

373337

In [133]:
#importing sql library 
from sqlalchemy import create_engine 
  
# create a reference  
# for sql library 
engine = create_engine('pos',
                       echo=False)
  
# attach the data frame to the sql server 
for_db.to_sql('phil_nlp', 
               con = engine,
              if_exists='replace',
              index=False,
              method='multi') 
  
# show the completed data as a test
print(engine.execute("""SELECT * FROM phil_nlp WHERE "AUTHOR" = 'Seneca'""").fetchone()) 

('On Benefits', 'Seneca', 'Stoicism', '59', 'and so in infinitum.')


Remember to add to the clipping and other elements to the notebook that creates the database as a whole. Then you're done!

In [71]:
print(engine.execute("""SELECT * FROM phil_nlp where "AUTHOR" = 'Anselm'""").fetchone()) 

('Proslogion', 'Anselm', 'Scholasticism', '1077', 'How is it, then, Lord, that You are all these things?')
