<a href="https://colab.research.google.com/github/kcalizadeh/PDP_data_processing/blob/master/new_text_introduction_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports and Mounting Drive

In [1]:
# this cell mounts drive, sets the correct directory, then imports all functions
# and relevant libraries via the functions.py file
from google.colab import drive
import sys

drive.mount('/gdrive',force_remount=True)

drive_path = '/gdrive/MyDrive/Colab_Projects/philosophy_data_project'

sys.path.append(drive_path)

Mounted at /gdrive


In [2]:
from import_functions import *

In [3]:
import spacy.cli
spacy.cli.download("en_core_web_lg")
import en_core_web_lg
nlp = en_core_web_lg.load()

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


###Load the Text and Clip Front and End Matter

In [4]:
# if you are deleting an old text that was added here, make sure it is in the 
# primary database construction notebook
augustine_confessions = get_guten('http://www.gutenberg.org/cache/epub/3296/pg3296.txt')

In [5]:
augustine_confessions = augustine_confessions.split('AD 401')[1][22:].split('GRATIAS TIBI DOMINE')[0]

### Clean the Text



In [6]:
def baseline_clean(to_correct, 
                   capitals=True, 
                   bracketed_fn=False, 
                   odd_words_dict={}):
  # remove utf8 encoding characters and some punctuations
  result = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff\xad\x0c6§\\\£\Â*_<>""⎫•{}Γ~]', ' ', to_correct)
  result = re.sub(r'[\u2014\u2013\u2012-]', ' ', result)

  # replace whitespace characters with actual whitespace
  result = re.sub(r'\s', ' ', result)

  # replace odd quotation marks with a standard
  result = re.sub(r'[‘’“”]', "'", result)

  # replace the ﬀ, ﬃ and ﬁ with the appropriate counterparts
  result = re.sub(r'ﬀ', 'ff', result)
  result = re.sub(r'ﬁ', 'fi', result)
  result = re.sub(r'ﬃ', 'ffi', result)

  # replace some accented characters for ease of searching
  result = re.sub(r'é', 'e', result)

  # remove or standardize some recurring common and meaninless words/phrases
  result = re.sub(r'\s*This\s*page\s*intentionally\s*left\s*blank\s*', ' ', result)
  result = re.sub(r'(?i)Aufgabe\s+', ' ', result)
  result = re.sub(r',*\s+cf\.', ' ', result)

  # some texts have footnotes conveniently in brackets - this removes them all, 
  # with a safety measure for unpaired brackets, and deletes all brackets afterwards
  if bracketed_fn:
    result = re.sub(r'\[.{0,300}\]|{.{0,300}}|{.{0,300}\]|\[.{0,300}}', ' ', result)
  result = re.sub(r'[\[\]{}]', ' ', result)

  # unify some abbreviations
  result = re.sub(r'&', 'and', result)
  result = re.sub(r'\se\.g\.\s', ' eg ', result)
  result = re.sub(r'\si\.e\.\s', ' ie ', result)
  result = re.sub('coroll\.', 'coroll', result)
  result = re.sub('pt\.', 'pt', result)

  # remove roman numerals, first capitalized ones
  result = re.sub(r'\s((I{2,}V*X*\.*)|(IV\.*)|(IX\.*)|(V\.*)|(V+I*\.*)|(X+L*V*I*]\.*))\s', ' ', result)
  # then lowercase
  result = re.sub(r'\s((i{2,}v*x*\.*)|(iv\.*)|(ix\.*)|(v\.*)|(v+i*\.*)|(x+l*v*i*\.*))\s', ' ', result)

  # remove periods and commas flanked by numbers
  result = re.sub(r'\d\.\d', ' ', result)
  result = re.sub(r'\d,\d', ' ', result)

  # remove the number-letter-number pattern used for many citations
  result = re.sub(r'\d*\w{,2}\d', ' ', result)

  # remove numerical characters
  result = re.sub(r'\d+', ' ', result)

  # remove words of 2+ characters that are entirely capitalized 
  # (these are almost always titles, headings, or speakers in a dialogue)
  # remove capital I's that follow capital words - these almost always roman numerals
  # some texts do use these capitalizations meaningfully, so we make this optional
  if capitals:
    result = re.sub(r'[A-Z]{2,}\s+I', ' ', result)
    result = re.sub(r'[A-Z]{2,}', ' ', result)

  # remove isolated colons and semicolons that result from removal of titles
  result = re.sub(r'\s+:\s*', ' ', result)
  result = re.sub(r'\s+;\s*', ' ', result)

  # remove isolated letters (do it several times because strings of isolated letters do not get captured properly)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)

  # remove isolated letters at the end of sentences or before commas
  result = re.sub(r'\s[^aI]\.', '.', result)
  result = re.sub(r'\s[^aI],', ',', result)

  # deal with spaces around periods and commas
  result = re.sub(r'\s+,\s+', ', ', result)
  result = re.sub(r'\s+\.\s+', '. ', result)

  # remove empty parantheses
  result = re.sub(r'(\(\s*\.*\s*\))|(\(\s*,*\s*)\)', ' ', result)
  result = re.sub(r'\.\)\.', '.', result)
  result = re.sub(r'\.\(\.', '.', result)

  # reduce multiple periods, commas, or whitespaces into a single one
  result = re.sub(r'\.+', '.', result)
  result = re.sub(r',+', ',', result)
  result = re.sub(r'\s+', ' ', result)

  # deal with isolated problem cases discovered in the data:
  for key in odd_words_dict.keys():
    result = re.sub(r''+key+'', odd_words_dict[key], result)

  return result

In [7]:
# note extras like bracketed footnotes or specific words to remove



In [8]:
# build a dictionary for the book
book_dict = {
    'author': 'Augustine',
    'title': 'Confessions Of St. Augustine',
    'text': augustine_confessions,
    'school': 'Scholasticism',
    'words to remove': [],
    'remove capitals': False,
    'bracketed fn': False,
    'original date': 398,
    'corpus date': 2002
}


In [9]:
#@title Oddities Dictionary for Cleaning
# a dictionary of oddities to clean up
odd_words_dict = {'\sderstanding': 'derstanding',
                  '\sditference\s': ' difference ',
                  '\sforthe\s': ' for the ',
                  '\sject': 'ject',
                  '\sjects': 'jects', 
                  '\sness': 'ness',
                  '\sper\scent\s': ' percent ',
                  '\sper\scent\.': ' percent.',
                  '\sper\scent,': ' percent,',
                  '\wi\son': 'ion',
                  '\spri\sori': ' priori',
                  '\stences\s': 'tences ',
                  '\sprincipleb': ' principle',
                  '\ssciousness': 'sciousness',
                  '\stion': 'tion',
                  '\spri\s': ' pri',
                  '\scluding': 'cluding',
                  '\sdom': 'dom',
                  '\sers': 'ers',
                  '\scritiq\s': ' critique ',
                  '\ssensati\s': ' sensation ',
                  '(?i)\syou\sll': " you'll",
                  '\sI\sll': " I'll",
                  '(?i)\swe\sll': " we'll",
                  '(?i)he\sll': " he'll",
                  '(?i)who\sll': "who'll",
                  '(?i)\sthere\sll\s': " there'll ",
                  '\seduca\s': ' education ',
                  '\slity\s': 'lity ',
                  '\smultaneously\s': 'multaneously ',
                  '\stically\s': 'tically ',
                  '\sDa\ssein\s': ' Dasein ',
                  '(?i)\sthey\sll\s': " they'll ",
                  '(?i)\sin\tum\s': ' in turn ',
                  '\scon~\s': ' con',
                  '\sà\s': ' a ',
                  '\sjor\s': ' for ',
                  '\sluminating\s': 'luminating ',
                  '\sselj\s': ' self ',
                  '\stial\s': 'tial ',
                  '\sversal\s': 'versal ',
                  '\sexis\st': ' exist',
                  '\splauded\s': 'plauded ',
                  '\suiry\s': 'uiry ',
                  '\svithin\s': ' within ',
                  '\soj\s': ' of ',
                  '\sposi\st': ' posit',
                  '\sra\sther\s': ' rather ',
                  '(?i)\sthat\sll\s': " that'll ",
                  '(?i)\sa\sll\s': ' all ',
                  '\so\sther\s': ' other ',
                  '\sra\sther\s': ' rather ',
                  '\snei\sther\s': ' neither ',
                  '\sei\sther\s': ' either ',
                  '\sfur\sther\s': ' further ',
                  '\sano\sther': ' another ',
                  '\sneces\s': ' neces',
                  'u\slar\s': 'ular ',
                  '\sference\s': 'ference ',
                  '(?i)it\sll\s': "it'll ",
                  '\stoge\sther': ' together ',
                  '\sknowledgeb\s': ' knowledge ',
                  'r\stain\s': 'rtain ',
                  'on\stain\s': 'ontain',
                  '(?i)j\sect\s': 'ject',
                  '\sob\sect\s': ' object ',
                  '\sbtle\s': 'btle ',
                  '\snition\s': 'nition ',
                  '\sdering\s': 'dering ', 
                  '\sized\s': 'ized ',
                  '\sther\shand': ' other hand',
                  '\ture\s': 'ture ',
                  '\sabso\sl': ' absol',
                  '\stly\s': 'tly ',
                  '\serty\s': 'erty ',
                  '\sobj\se': ' obj',
                  '\sffiir\s': ' for ',
                  '\sndeed\s': ' indeed ',
                  '\sfonn\s': ' form ',
                  '\snally\s': 'nally ',
                  'ain\sty\s': 'ainty ',
                  'ici\sty\s': 'icity ',
                  '\scog\sni': ' cogni',
                  '\sacc\s': ' acc',
                  '\sindi\svid\sual': ' individual', 
                  '\sintu\sit': ' intuit',
                  'r\sance\s': 'rance ',
                  '\ssions\s': 'sions ',
                  '\sances\s': 'ances ',
                  '\sper\sception\s': ' perception ',
                  '\sse\sries\s': ' series ',
                  '\sque\sries\s': ' queries ',
                  '\sessary\s': 'essary ',
                  '\sofa\s': ' of a ',
                  '\scer\stainty\s': ' certainty ',
                  'ec\stivity\s': 'ectivity ',
                  '\stivity\s': 'tivity ',
                  '\slation\s': 'lation ',
                  '\sir\sr': ' irr',
                  '\ssub\sstance\s': ' substance ',
                  'sec\sond\s': 'second ',
                  '\s\.rv': '',
                  '\story\s': 'tory ',
                  '\sture\s': 'ture ',
                  '\sminate\s': 'minate ',
                  '\sing\s': 'ing ',
                  '\splicity\s': 'plicity ',
                  '\ssimi\slar\s': ' similar ',
                  '\scom\smunity\s': ' community ',
                  '\sitselfa\s': ' itself a ',
                  '\ssimp\s': ' simply ',
                  '\scon\stex': ' contex',
                  '\scon\sseq': ' conseq',
                  '\scon\stai': ' contai',
                  '\sofwhat\s': ' of what ',
                  '\sui\s': 'ui',
                  '\sofan\s': ' of an ',
                  '\saccor\sdance\s': ' accordance ',
                  '\stranscen\sdental\s': ' transcendental ',
                  '\sap\spearances\s': ' appearances ',
                  'e\squences\s': 'equences ',
                  '\sorits\s': ' or its ',
                  '\simma\sn': ' imman',
                  '\seq\sua': ' equa',
                  '\simpl\sied\s': ' implied ',
                  '\sbuta\s': ' but a ',
                  '\sa\snd\s': ' and ',
                  '\sence\s': 'ence ',
                  '\stain\s': 'tain ',
                  '\sunder\sstanding\s': ' understanding ',
                  'i\sence\s': 'ience ',
                  'r\sence\s': 'rence ',
                  '\stical\s': 'tical ',
                  '\sobjectsb\s': ' objects ',
                  '\stbe\s': ' the ',
                  '\smul\st': ' mult',
                  '\sgen\seral\s': ' general ',
                  '\suniver\ssal\s': ' universal ',
                  '\scon\stent\s': ' content ',
                  '\spar\sticular\s': ' particular ',
                  'ver\ssity\s': 'versity ',
                  '\sCritiq\s': ' Critique ',
                  '\sphilo\ssophy\s': ' philosophy ',
                  '\seq\s': ' eq'}

In [10]:
# a function that takes the dictionary and returns a dataframe of sentences
def from_raw_to_df(text_dict):
  nlp.max_length = 9000000
  text = text_dict['text']
  text = remove_words(text, text_dict['words to remove'])
  text = baseline_clean(text, capitals=text_dict['remove capitals'],
                        bracketed_fn=text_dict['bracketed fn'],
                        odd_words_dict=odd_words_dict)
  text_nlp = nlp(text, disable=['ner'])
  text_df = pd.DataFrame(columns=['title', 'author', 'school', 'sentence_spacy'])
  text_df['sentence_spacy'] = list(text_nlp.sents)
  text_df['author'] = text_dict['author']
  text_df['title'] = text_dict['title']
  text_df['school'] = text_dict['school']
  text_df['original_publication_date'] = text_dict['original date']
  text_df['corpus_edition_date'] = text_dict['corpus date']
  text_df['sentence_str'] = text_df['sentence_spacy'].apply(lambda x: ''.join(list(str(x))))
  return text_df

In [11]:
# use the function
df = from_raw_to_df(book_dict)

In [12]:
# checking the result
pd.options.display.max_colwidth = 200
df.sample(10)

Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str
630,Confessions Of St. Augustine,Augustine,Scholasticism,"(As, if, in, an, armory, ,, one, ignorant, what, were, adapted, to, each, part, should, cover, his, head, with, greaves, ,, or, seek, to, be, shod, with, a, helmet, ,, and, complain, that, they, f...",398,2002,"As if in an armory, one ignorant what were adapted to each part should cover his head with greaves, or seek to be shod with a helmet, and complain that they fitted not: or as if on a day when busi..."
3892,Confessions Of St. Augustine,Augustine,Scholasticism,"(First, then, was, that, to, be, spoken, of, ,, over, which, He, might, be, borne, ;, and, then, He, ,, whom, it, was, meet, not, otherwise, to, be, spoken, of, than, as, being, borne, .)",398,2002,"First then was that to be spoken of, over which He might be borne; and then He, whom it was meet not otherwise to be spoken of than as being borne."
321,Confessions Of St. Augustine,Augustine,Scholasticism,"(Before, them, what, more, foul, than, I, was, already, ,, displeasing, even, such, as, myself, ?)",398,2002,"Before them what more foul than I was already, displeasing even such as myself?"
3083,Confessions Of St. Augustine,Augustine,Scholasticism,"(How, hast, Thou, loved, us, ,, good, Father, ,, who, sparedst, not, Thine, only, Son, ,, but, deliveredst, Him, up, for, us, ungodly, !)",398,2002,"How hast Thou loved us, good Father, who sparedst not Thine only Son, but deliveredst Him up for us ungodly!"
2755,Confessions Of St. Augustine,Augustine,Scholasticism,"(Whence, she, shall, so, repay, them, ,, that, they, who, would, not, be, made, manifest, by, her, ,, she, both, against, their, will, makes, manifest, ,, and, herself, becometh, not, manifest, un...",398,2002,"Whence she shall so repay them, that they who would not be made manifest by her, she both against their will makes manifest, and herself becometh not manifest unto them."
1150,Confessions Of St. Augustine,Augustine,Scholasticism,"(Furthermore, ,, what, the, Manichees, had, criticised, in, Thy, Scriptures, ,, I, thought, could, not, be, defended, ;, yet, at, times, verily, I, had, a, wish, to, confer, upon, these, several, ...",398,2002,"Furthermore, what the Manichees had criticised in Thy Scriptures, I thought could not be defended; yet at times verily I had a wish to confer upon these several points with some one very well skil..."
1948,Confessions Of St. Augustine,Augustine,Scholasticism,"(Whence, is, this, monstrousness, ?)",398,2002,Whence is this monstrousness?
232,Confessions Of St. Augustine,Augustine,Scholasticism,(echoed),398,2002,echoed
3126,Confessions Of St. Augustine,Augustine,Scholasticism,"(Behold, ,, Lord, my, God, ,, wherein, is, my, desire, .)",398,2002,"Behold, Lord my God, wherein is my desire."
3071,Confessions Of St. Augustine,Augustine,Scholasticism,"(For, they, were, mortal, ,, and, sinners, ;, but, thou, ,, Lord, ,, to, whom, they, proudly, sought, to, be, reconciled, ,, art, immortal, ,, and, without, sin, .)",398,2002,"For they were mortal, and sinners; but thou, Lord, to whom they proudly sought to be reconciled, art immortal, and without sin."


In [13]:
len(df)

4261

#### Remove Short Sentences

In [14]:
df['sentence_length'] = df['sentence_str'].map(lambda x: len(x))
num_of_short_entries = len(df[df['sentence_length'] < 20])
print(f"there are {num_of_short_entries} so-called sentences with fewer than 20 characters")
df[df['sentence_length'] < 20].sample(5)

there are 186 so-called sentences with fewer than 20 characters


Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length
3766,Confessions Of St. Augustine,Augustine,Scholasticism,"(No, .)",398,2002,No.,3
2542,Confessions Of St. Augustine,Augustine,Scholasticism,"(what, kind, it, is, ?)",398,2002,what kind it is?,16
2589,Confessions Of St. Augustine,Augustine,Scholasticism,"(Who, will, say, so, ?)",398,2002,Who will say so?,16
21,Confessions Of St. Augustine,Augustine,Scholasticism,"(Why, ?)",398,2002,Why?,4
2151,Confessions Of St. Augustine,Augustine,Scholasticism,"(Know, this, :)",398,2002,Know this:,10


In [15]:
df = df.drop(df[df['sentence_length'] < 20].index)
len(df)

4075

#### Remove Cases of Self-Mention

In [16]:
# change the author name in this cell 

self_mentions = df[df['sentence_str'].str.contains('\s'+'Augustine'.lower())]
print(len(self_mentions))
self_mentions

0


Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length


In [17]:
df = df.drop(df[df['sentence_str'].str.contains('\s'+'Augustine'.lower())].index)

len(df)

4075

#### Deal with Duplicates

In [18]:
# find the total number of duplicates
len(df['sentence_str'])-len(df['sentence_str'].drop_duplicates())

7

In [19]:
doubles_df = pd.concat(g for _, g in df.groupby("sentence_str") if len(g) > 1)
doubles_df

Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length
3738,Confessions Of St. Augustine,Augustine,Scholasticism,"(Another, he, who, says, ,, The, earth, was, invisible, and, without, form, ,, and, darkness, was, upon, the, deep, ;)",398,2002,"Another he who says, The earth was invisible and without form, and darkness was upon the deep;",94
3740,Confessions Of St. Augustine,Augustine,Scholasticism,"(Another, he, who, says, ,, The, earth, was, invisible, and, without, form, ,, and, darkness, was, upon, the, deep, ;)",398,2002,"Another he who says, The earth was invisible and without form, and darkness was upon the deep;",94
2822,Confessions Of St. Augustine,Augustine,Scholasticism,"(Give, what, Thou, enjoinest, ,, and, enjoin, what, Thou, wilt, .)",398,2002,"Give what Thou enjoinest, and enjoin what Thou wilt.",52
2884,Confessions Of St. Augustine,Augustine,Scholasticism,"(Give, what, Thou, enjoinest, ,, and, enjoin, what, Thou, wilt, .)",398,2002,"Give what Thou enjoinest, and enjoin what Thou wilt.",52
3003,Confessions Of St. Augustine,Augustine,Scholasticism,"(Give, what, Thou, enjoinest, ,, and, enjoin, what, Thou, wilt, .)",398,2002,"Give what Thou enjoinest, and enjoin what Thou wilt.",52
3731,Confessions Of St. Augustine,Augustine,Scholasticism,"(He, another, ,, that, saith, ,, In, the, Beginning, God, made, heaven, and, earth)",398,2002,"He another, that saith, In the Beginning God made heaven and earth",66
3733,Confessions Of St. Augustine,Augustine,Scholasticism,"(He, another, ,, that, saith, ,, In, the, Beginning, God, made, heaven, and, earth)",398,2002,"He another, that saith, In the Beginning God made heaven and earth",66
2801,Confessions Of St. Augustine,Augustine,Scholasticism,"(Lord, ,, have, pity, on, me, .)",398,2002,"Lord, have pity on me.",22
2804,Confessions Of St. Augustine,Augustine,Scholasticism,"(Lord, ,, have, pity, on, me, .)",398,2002,"Lord, have pity on me.",22
1948,Confessions Of St. Augustine,Augustine,Scholasticism,"(Whence, is, this, monstrousness, ?)",398,2002,Whence is this monstrousness?,29


In [20]:
df = df.drop(df[df['sentence_str'].duplicated(keep='first')].index)

In [21]:
len(df)

4068

#### Check for Foreign Languages

In [22]:
# checking for 'der', a common article in German
len((df[df['sentence_str'].str.contains('\sder\s')]))

0

In [23]:
# checking for 'il', a common article in French
len(df[df['sentence_str'].str.contains('\sil\s')])

0

#### Some Ad Hoc Cleaning

In [24]:
# miscellaneous nonsense sentences
df = df.drop(df[df['sentence_str'].str.contains('\spp\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\stotam\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\srree\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sflir\s')].index)
df = df.drop(df[(df['sentence_str'].str.contains('\smodis\s')) & (df['author'] != 'Kant')].index)

len(df)

4068

In [25]:
# markers of french and notes
df = df.drop(df[df['sentence_str'].str.contains('\schapitre')].index)
df = df.drop(df[df['sentence_str'].str.contains('\salisme')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sHahn')].index)

len(df)

4068

In [26]:
# some notes in Kant
df = df.drop(df[df['sentence_str'].str.contains('\sVorl\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sberschwenglich')].index)

len(df)

4068

In [27]:
# a common phrase in Plato / Aristotle footnotes
df = df.drop(df[(df['author']=='Plato') & (df['sentence_str'].str.contains('(?i)reading')) & (df['sentence_length'] < 40)].index)
df = df.drop(df[(df['author']=='Aristotle') & (df['sentence_str'].str.contains('(?i)reading')) & (df['sentence_length'] < 40)].index)

len(df)

4068

In [28]:
# mentions of Aristotle in Plato
df = df.drop(df[(df['author']=='Plato') & df['sentence_str'].str.contains('Aristotle')].index)

len(df)

4068

### Lemmatize and Tokenize

In [29]:
from gensim.utils import simple_preprocess

# use gensim to tokenize sentences
df['tokenized_txt'] = df['sentence_str'].map(lambda x: simple_preprocess(x.lower(),deacc=True,
                                                        max_len=200))

# use spacey to get intelligent lemmatization
def lemmatize_sentence(sentence):
  lemmatized_txt = ''
  for word in sentence:
    lemmatized_txt += ' ' + str(word.lemma_)
  return lemmatized_txt

In [30]:
df['lemmatized_str'] = df['sentence_spacy'].apply(lemmatize_sentence)

In [31]:
df.sample(5)

Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length,tokenized_txt,lemmatized_str
3673,Confessions Of St. Augustine,Augustine,Scholasticism,"(For, this, ,, though, created, ,, is, also, called, wisdom, .)",398,2002,"For this, though created, is also called wisdom.",48,"[for, this, though, created, is, also, called, wisdom]","for this , though create , be also call wisdom ."
1887,Confessions Of St. Augustine,Augustine,Scholasticism,"(and, by, how, many, perils, arrive, we, at, a, greater, peril, ?)",398,2002,and by how many perils arrive we at a greater peril?,52,"[and, by, how, many, perils, arrive, we, at, greater, peril]",and by how many peril arrive -PRON- at a great peril ?
3570,Confessions Of St. Augustine,Augustine,Scholasticism,"(not, that, it, wanted, all, form, ,, but, because, it, had, such, as, my, mind, would, ,, if, presented, to, it, ,, turn, from, ,, as, unwonted, and, jarring, ,, and, human, frailness, would, be,...",398,2002,"not that it wanted all form, but because it had such as my mind would, if presented to it, turn from, as unwonted and jarring, and human frailness would be troubled at.",168,"[not, that, it, wanted, all, form, but, because, it, had, such, as, my, mind, would, if, presented, to, it, turn, from, as, unwonted, and, jarring, and, human, frailness, would, be, troubled, at]","not that -PRON- want all form , but because -PRON- have such as -PRON- mind would , if present to -PRON- , turn from , as unwonted and jarring , and human frailness would be trouble at ."
1886,Confessions Of St. Augustine,Augustine,Scholasticism,"(and, in, this, ,, what, is, there, not, brittle, ,, and, full, of, perils, ?)",398,2002,"and in this, what is there not brittle, and full of perils?",59,"[and, in, this, what, is, there, not, brittle, and, full, of, perils]","and in this , what be there not brittle , and full of peril ?"
3941,Confessions Of St. Augustine,Augustine,Scholasticism,"(And, they, contend, and, strive, ,, yet, ,, without, peace, ,, no, man, sees, that, vision, .)",398,2002,"And they contend and strive, yet, without peace, no man sees that vision.",73,"[and, they, contend, and, strive, yet, without, peace, no, man, sees, that, vision]","and -PRON- contend and strive , yet , without peace , no man see that vision ."


### Combine with the Old Dataframe & Export to CSV

In [6]:
# load the old version and check it out
og_df = pd.read_csv('/gdrive/MyDrive/Colab_Projects/philosophy_data_project/philosophy_data.csv')
og_df.sample(5)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,original_publication_date,corpus_edition_date,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
84319,Aristotle - Complete Works,Aristotle,aristotle,"Also those who have parents living, or childre...","Also those who have parents living, or childre...",-320,1991,135,"also those who have parents living, or childre...","['also', 'those', 'who', 'have', 'parents', 'l...","also those who have parent live , or child , ..."
105038,Three Dialogues,Berkeley,empiricism,"Besides spirits, all that we know or conceive ...","Besides spirits, all that we know or conceive ...",1713,2009,64,"besides spirits, all that we know or conceive ...","['besides', 'spirits', 'all', 'that', 'we', 'k...","besides spirit , all that -PRON- know or conc..."
194500,History Of Madness,Foucault,continental,His health was only restored in the social val...,His health was only restored in the social val...,1961,2006,100,his health was only restored in the social val...,"['his', 'health', 'was', 'only', 'restored', '...",-PRON- health be only restore in the social v...
143706,Philosophical Investigations,Wittgenstein,analytic,I suddenly see the solution of a puzzle picture.,I suddenly see the solution of a puzzle picture.,1953,1986,48,i suddenly see the solution of a puzzle picture.,"['suddenly', 'see', 'the', 'solution', 'of', '...",-PRON- suddenly see the solution of a puzzle ...
80701,Aristotle - Complete Works,Aristotle,aristotle,"The doubt in these cases is, not who is, but w...","The doubt in these cases is, not who is, but w...",-320,1991,227,"the doubt in these cases is, not who is, but w...","['the', 'doubt', 'in', 'these', 'cases', 'is',...","the doubt in these case be , not who be , but..."


In [7]:
og_df['author'].value_counts(normalize=True)

Aristotle          0.133687
Plato              0.105148
Hegel              0.062213
Foucault           0.041768
Heidegger          0.041765
Kant               0.038720
Nietzsche          0.037130
Marx               0.036969
Lewis              0.035957
Beauvoir           0.035675
Malebranche        0.035620
Deleuze            0.034368
Kripke             0.034201
Smith              0.032047
Wittgenstein       0.024759
Locke              0.024351
Hume               0.022780
Merleau-Ponty      0.020807
Quine              0.020207
Derrida            0.016441
Husserl            0.015737
Fichte             0.014547
Russell            0.013903
Leibniz            0.013777
Popper             0.012821
Lenin              0.012248
Augustine          0.011149
Spinoza            0.010395
Moore              0.010053
Keynes             0.009348
Ricardo            0.008469
Davis              0.008384
Berkeley           0.007493
Wollstonecraft     0.007013
Marcus Aurelius    0.006062
Descartes          0

In [34]:
# append the new data
new_df = og_df.append(df)
new_df['author'].value_counts(normalize=True)

Aristotle          0.133687
Plato              0.105148
Hegel              0.062213
Foucault           0.041768
Heidegger          0.041765
Kant               0.038720
Nietzsche          0.037130
Marx               0.036969
Lewis              0.035957
Beauvoir           0.035675
Malebranche        0.035620
Deleuze            0.034368
Kripke             0.034201
Smith              0.032047
Wittgenstein       0.024759
Locke              0.024351
Hume               0.022780
Merleau-Ponty      0.020807
Quine              0.020207
Derrida            0.016441
Husserl            0.015737
Fichte             0.014547
Russell            0.013903
Leibniz            0.013777
Popper             0.012821
Lenin              0.012248
Augustine          0.011149
Spinoza            0.010395
Moore              0.010053
Keynes             0.009348
Ricardo            0.008469
Davis              0.008384
Berkeley           0.007493
Wollstonecraft     0.007013
Marcus Aurelius    0.006062
Descartes          0

In [35]:
new_df[new_df['author']=='Augustine'].sample(5)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,original_publication_date,corpus_edition_date,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
358,Confessions Of St. Augustine,Augustine,Scholasticism,"(And, ,, he, that, is, unmarried, thinketh, of, the, things, of, the, Lord, ,, how, he, may, please, the, Lord, ;, but)","And, he that is unmarried thinketh of the things of the Lord, how he may please the Lord; but",398,2002,93,,"[and, he, that, is, unmarried, thinketh, of, the, things, of, the, lord, how, he, may, please, the, lord, but]","and , -PRON- that be unmarried thinketh of the thing of the Lord , how -PRON- may please the Lord ; but"
2062,Confessions Of St. Augustine,Augustine,Scholasticism,"(BOOK, Lord, ,, I, am, Thy, servant, ;, I, am, Thy, servant, ,, and, the, son, of, Thy, handmaid, :)","BOOK Lord, I am Thy servant; I am Thy servant, and the son of Thy handmaid:",398,2002,75,,"[book, lord, am, thy, servant, am, thy, servant, and, the, son, of, thy, handmaid]","BOOK Lord , -PRON- be thy servant ; -PRON- be thy servant , and the son of Thy handmaid :"
1056,Confessions Of St. Augustine,Augustine,Scholasticism,"(For, after, it, was, clear, that, he, was, ignorant, of, those, arts, in, which, I, thought, he, excelled, ,, I, began, to, despair, of, his, opening, and, solving, the, difficulties, which, perp...","For after it was clear that he was ignorant of those arts in which I thought he excelled, I began to despair of his opening and solving the difficulties which perplexed me (of which indeed however...",398,2002,275,,"[for, after, it, was, clear, that, he, was, ignorant, of, those, arts, in, which, thought, he, excelled, began, to, despair, of, his, opening, and, solving, the, difficulties, which, perplexed, me...","for after -PRON- be clear that -PRON- be ignorant of those art in which -PRON- think -PRON- excel , -PRON- begin to despair of -PRON- opening and solve the difficulty which perplex -PRON- ( of wh..."
1562,Confessions Of St. Augustine,Augustine,Scholasticism,"(Let, him, not, so, say, ,, for, he, is, man, .)","Let him not so say, for he is man.",398,2002,34,,"[let, him, not, so, say, for, he, is, man]","let -PRON- not so say , for -PRON- be man ."
750,Confessions Of St. Augustine,Augustine,Scholasticism,"(But, he, so, shrunk, from, me, ,, as, from, an, enemy, ;, and, with, a, wonderful, and, sudden, freedom, bade, me, ,, as, I, would, continue, his, friend, ,, forbear, such, language, to, him, .)","But he so shrunk from me, as from an enemy; and with a wonderful and sudden freedom bade me, as I would continue his friend, forbear such language to him.",398,2002,154,,"[but, he, so, shrunk, from, me, as, from, an, enemy, and, with, wonderful, and, sudden, freedom, bade, me, as, would, continue, his, friend, forbear, such, language, to, him]","but -PRON- so shrink from -PRON- , as from an enemy ; and with a wonderful and sudden freedom bid -PRON- , as -PRON- would continue -PRON- friend , forbear such language to -PRON- ."


In [36]:
# export as csv
from google.colab import files
new_df.to_csv('phil_nlp.csv', index=False) 
files.download('phil_nlp.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

###Upload Data to the SQL Server

In [8]:
# prepare to upload to the PostgreSQL database

# note which dataframe you set this to - new_df for the whole dataset, df for 
# just the new text

for_db = og_df
for_db['date'] = for_db['original_publication_date']
for_db['date'] = for_db['date'].apply(lambda x: str(x)[1:]+' BC' if x < 0 else str(x))
for_db['sentence'] = for_db['sentence_str']
for_db['school'] = for_db['school'].apply(lambda x: x.replace('_', ' ').title())
for_db = for_db.drop(['sentence_spacy', 
                      'sentence_length',
                      'sentence_lowered', 
                      'sentence_str', 
                      'tokenized_txt', 
                      'lemmatized_str',
                      'corpus_edition_date',
                      'original_publication_date'], axis=1)
for_db.columns = [i.upper() for i in for_db.columns]

for_db.sample(5)

Unnamed: 0,TITLE,AUTHOR,SCHOOL,DATE,SENTENCE
11129,Plato - Complete Works,Plato,Plato,350 BC,if I gave you half a chance.
27074,Plato - Complete Works,Plato,Plato,350 BC,And let's not be dazzled by looking at one man...
330561,Beyond Good And Evil,Nietzsche,Nietzsche,1886,"I am affected, not because you have deceived m..."
307188,Essential Works Of Lenin,Lenin,Communism,1862,But no one is so deaf as he who will not hear.
163663,Quintessence,Quine,Analytic,1950,This sort of indirect conformity to the theory...


In [9]:
len(for_db)

364876

In [16]:
#importing sql library 
from sqlalchemy import create_engine 
  
# create a reference  
# for sql library 
engine = create_engine('post7ed7joo',
                       echo=False)
  
# attach the data frame to the sql server 
for_db.to_sql('phil_nlp', 
               con = engine,
              if_exists='replace',
              index=False,
              method='multi') 
  
# show the completed data as a test
print(engine.execute("""SELECT * FROM phil_nlp WHERE "AUTHOR" = 'Augustine'""").fetchone()) 

('Confessions Of St. Augustine', 'Augustine', 'Scholasticism', '398', 'But were it utterly blotted out of the mind')


Remember to add to the clipping and other elements to the notebook that creates the database as a whole. Then you're done!

In [19]:
print(engine.execute("""SELECT * FROM phil_nlp where 'AUTHOR' = 'Nietzsche' """).fetchone()) 

None
