<a href="https://colab.research.google.com/github/kcalizadeh/PDP_data_processing/blob/master/new_text_introduction_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports and Mounting Drive

In [6]:
# this cell mounts drive, sets the correct directory, then imports all functions
# and relevant libraries via the functions.py file
from google.colab import drive
import sys

drive.mount('/gdrive',force_remount=True)

drive_path = '/gdrive/MyDrive/Colab_Projects/philosophy_data_project'

sys.path.append(drive_path)

Mounted at /gdrive


In [7]:
from import_functions import *

In [8]:
import spacy.cli
spacy.cli.download("en_core_web_lg")
import en_core_web_lg
nlp = en_core_web_lg.load()

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


###Load the Text and Clip Front and End Matter

In [93]:
# if you are deleting an old text that was added here, make sure it is in the 
# primary database construction notebook
kierkegaard_fear_trembling = get_text(drive_path + '/phil_txts/kierkegaard_fear_and_trembling.txt', encoding='utf-16')

In [94]:
kierkegaard_fear_trembling = kierkegaard_fear_trembling[348:]

### Clean the Text



In [95]:
def baseline_clean(to_correct, 
                   capitals=True, 
                   bracketed_fn=False, 
                   odd_words_dict={}):
  # remove utf8 encoding characters and some punctuations
  result = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff\xad\x0c6§\\\£\Â*_<>""⎫•{}Γ~]', ' ', to_correct)
  result = re.sub(r'[\u2014\u2013\u2012-]', ' ', result)

  # replace whitespace characters with actual whitespace
  result = re.sub(r'\s', ' ', result)

  # replace odd quotation marks with a standard
  result = re.sub(r'[‘’“”]', "'", result)

  # replace the ﬀ, ﬃ and ﬁ with the appropriate counterparts
  result = re.sub(r'ﬀ', 'ff', result)
  result = re.sub(r'ﬁ', 'fi', result)
  result = re.sub(r'ﬃ', 'ffi', result)

  # replace some accented characters for ease of searching
  result = re.sub(r'é', 'e', result)

  # remove or standardize some recurring common and meaninless words/phrases
  result = re.sub(r'\s*This\s*page\s*intentionally\s*left\s*blank\s*', ' ', result)
  result = re.sub(r'(?i)Aufgabe\s+', ' ', result)
  result = re.sub(r',*\s+cf\.', ' ', result)

  # some texts have footnotes conveniently in brackets - this removes them all, 
  # with a safety measure for unpaired brackets, and deletes all brackets afterwards
  if bracketed_fn:
    result = re.sub(r'\[.{0,300}\]|{.{0,300}}|{.{0,300}\]|\[.{0,300}}', ' ', result)
  result = re.sub(r'[\[\]{}]', ' ', result)

  # unify some abbreviations
  result = re.sub(r'&', 'and', result)
  result = re.sub(r'\se\.g\.\s', ' eg ', result)
  result = re.sub(r'\si\.e\.\s', ' ie ', result)
  result = re.sub('coroll\.', 'coroll', result)
  result = re.sub('pt\.', 'pt', result)

  # remove roman numerals, first capitalized ones
  result = re.sub(r'\s((I{2,}V*X*\.*)|(IV\.*)|(IX\.*)|(V\.*)|(V+I*\.*)|(X+L*V*I*]\.*))\s', ' ', result)
  # then lowercase
  result = re.sub(r'\s((i{2,}v*x*\.*)|(iv\.*)|(ix\.*)|(v\.*)|(v+i*\.*)|(x+l*v*i*\.*))\s', ' ', result)

  # remove periods and commas flanked by numbers
  result = re.sub(r'\d\.\d', ' ', result)
  result = re.sub(r'\d,\d', ' ', result)

  # remove the number-letter-number pattern used for many citations
  result = re.sub(r'\d*\w{,2}\d', ' ', result)

  # remove numerical characters
  result = re.sub(r'\d+', ' ', result)

  # remove words of 2+ characters that are entirely capitalized 
  # (these are almost always titles, headings, or speakers in a dialogue)
  # remove capital I's that follow capital words - these almost always roman numerals
  # some texts do use these capitalizations meaningfully, so we make this optional
  if capitals:
    result = re.sub(r'[A-Z]{2,}\s+I', ' ', result)
    result = re.sub(r'[A-Z]{2,}', ' ', result)

  # remove isolated colons and semicolons that result from removal of titles
  result = re.sub(r'\s+:\s*', ' ', result)
  result = re.sub(r'\s+;\s*', ' ', result)

  # remove isolated letters (do it several times because strings of isolated letters do not get captured properly)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)

  # remove isolated letters at the end of sentences or before commas
  result = re.sub(r'\s[^aI]\.', '.', result)
  result = re.sub(r'\s[^aI],', ',', result)

  # deal with spaces around periods and commas
  result = re.sub(r'\s+,\s+', ', ', result)
  result = re.sub(r'\s+\.\s+', '. ', result)

  # remove empty parantheses
  result = re.sub(r'(\(\s*\.*\s*\))|(\(\s*,*\s*)\)', ' ', result)
  result = re.sub(r'\.\)\.', '.', result)
  result = re.sub(r'\.\(\.', '.', result)

  # reduce multiple periods, commas, or whitespaces into a single one
  result = re.sub(r'\.+', '.', result)
  result = re.sub(r',+', ',', result)
  result = re.sub(r'\s+', ' ', result)

  # deal with isolated problem cases discovered in the data:
  for key in odd_words_dict.keys():
    result = re.sub(r''+key+'', odd_words_dict[key], result)

  return result

In [96]:
# note extras like bracketed footnotes or specific words to remove


In [97]:
# build a dictionary for the book
kierkegaard_fear_trembling_dict = {
    'author': 'Kierkegaard',
    'title': 'Fear and Trembling',
    'text': kierkegaard_fear_trembling,
    'school': 'Kierkegaard',
    'words to remove': [],
    'remove capitals': True,
    'bracketed fn': False,
    'original date': 1843,
    'corpus date': 1941
}

In [98]:
#@title Oddities Dictionary for Cleaning
# a dictionary of oddities to clean up
odd_words_dict = {'\sderstanding': 'derstanding',
                  '\sditference\s': ' difference ',
                  '\sforthe\s': ' for the ',
                  '\sject': 'ject',
                  '\sSure ly\s': ' Surely ',
                  '\spiness': 'piness',
                  '\sjects': 'jects', 
                  '\sness': 'ness',
                  '\schil dren\s': ' children ',
                  '\sper\scent\s': ' percent ',
                  '\sper\scent\.': ' percent.',
                  '\sper\scent,': ' percent,',
                  '\wi\son': 'ion',
                  '\spri\sori': ' priori',
                  '\stences\s': 'tences ',
                  '\sprincipleb': ' principle',
                  '\ssciousness': 'sciousness',
                  '\stion': 'tion',
                  '\spri\s': ' pri',
                  '\scluding': 'cluding',
                  '\sdom': 'dom',
                  '\sers': 'ers',
                  '\scritiq\s': ' critique ',
                  '\ssensati\s': ' sensation ',
                  '(?i)\syou\sll': " you'll",
                  '\sI\sll': " I'll",
                  '(?i)\swe\sll': " we'll",
                  '(?i)he\sll': " he'll",
                  '(?i)who\sll': "who'll",
                  '(?i)\sthere\sll\s': " there'll ",
                  '\seduca\s': ' education ',
                  '\slity\s': 'lity ',
                  '\smultaneously\s': 'multaneously ',
                  '\stically\s': 'tically ',
                  '\sDa\ssein\s': ' Dasein ',
                  '(?i)\sthey\sll\s': " they'll ",
                  '(?i)\sin\tum\s': ' in turn ',
                  '\scon~\s': ' con',
                  '\sà\s': ' a ',
                  '\sjor\s': ' for ',
                  '\sluminating\s': 'luminating ',
                  '\sselj\s': ' self ',
                  '\stial\s': 'tial ',
                  '\sversal\s': 'versal ',
                  '\sexis\st': ' exist',
                  '\splauded\s': 'plauded ',
                  '\suiry\s': 'uiry ',
                  '\svithin\s': ' within ',
                  '\soj\s': ' of ',
                  '\sposi\st': ' posit',
                  '\sra\sther\s': ' rather ',
                  '(?i)\sthat\sll\s': " that'll ",
                  '(?i)\sa\sll\s': ' all ',
                  '\so\sther\s': ' other ',
                  '\sra\sther\s': ' rather ',
                  '\snei\sther\s': ' neither ',
                  '\sei\sther\s': ' either ',
                  '\sfur\sther\s': ' further ',
                  '\sano\sther': ' another ',
                  '\sneces\s': ' neces',
                  'u\slar\s': 'ular ',
                  '\sference\s': 'ference ',
                  '(?i)it\sll\s': "it'll ",
                  '\stoge\sther': ' together ',
                  '\sknowledgeb\s': ' knowledge ',
                  'r\stain\s': 'rtain ',
                  'on\stain\s': 'ontain',
                  '(?i)j\sect\s': 'ject',
                  '\sob\sect\s': ' object ',
                  '\sbtle\s': 'btle ',
                  '\snition\s': 'nition ',
                  '\sdering\s': 'dering ', 
                  '\sized\s': 'ized ',
                  '\sther\shand': ' other hand',
                  '\ture\s': 'ture ',
                  '\sabso\sl': ' absol',
                  '\stly\s': 'tly ',
                  '\serty\s': 'erty ',
                  '\sobj\se': ' obj',
                  '\sffiir\s': ' for ',
                  '\sndeed\s': ' indeed ',
                  '\sfonn\s': ' form ',
                  '\snally\s': 'nally ',
                  'ain\sty\s': 'ainty ',
                  'ici\sty\s': 'icity ',
                  '\scog\sni': ' cogni',
                  '\sacc\s': ' acc',
                  '\sindi\svid\sual': ' individual', 
                  '\sintu\sit': ' intuit',
                  'r\sance\s': 'rance ',
                  '\ssions\s': 'sions ',
                  '\sances\s': 'ances ',
                  '\sper\sception\s': ' perception ',
                  '\sse\sries\s': ' series ',
                  '\sque\sries\s': ' queries ',
                  '\sessary\s': 'essary ',
                  '\sofa\s': ' of a ',
                  '\scer\stainty\s': ' certainty ',
                  'ec\stivity\s': 'ectivity ',
                  '\stivity\s': 'tivity ',
                  '\slation\s': 'lation ',
                  '\sir\sr': ' irr',
                  '\ssub\sstance\s': ' substance ',
                  'sec\sond\s': 'second ',
                  '\s\.rv': '',
                  '\story\s': 'tory ',
                  '\sture\s': 'ture ',
                  '\sminate\s': 'minate ',
                  '\sing\s': 'ing ',
                  '\splicity\s': 'plicity ',
                  '\ssimi\slar\s': ' similar ',
                  '\scom\smunity\s': ' community ',
                  '\sitselfa\s': ' itself a ',
                  '\ssimp\s': ' simply ',
                  '\scon\stex': ' contex',
                  '\scon\sseq': ' conseq',
                  '\scon\stai': ' contai',
                  '\sofwhat\s': ' of what ',
                  '\sui\s': 'ui',
                  '\sofan\s': ' of an ',
                  '\saccor\sdance\s': ' accordance ',
                  '\stranscen\sdental\s': ' transcendental ',
                  '\sap\spearances\s': ' appearances ',
                  'e\squences\s': 'equences ',
                  '\sorits\s': ' or its ',
                  '\simma\sn': ' imman',
                  '\seq\sua': ' equa',
                  '\simpl\sied\s': ' implied ',
                  '\sbuta\s': ' but a ',
                  '\sa\snd\s': ' and ',
                  '\sence\s': 'ence ',
                  '\stain\s': 'tain ',
                  '\sunder\sstanding\s': ' understanding ',
                  'i\sence\s': 'ience ',
                  'r\sence\s': 'rence ',
                  '\stical\s': 'tical ',
                  '\sobjectsb\s': ' objects ',
                  '\stbe\s': ' the ',
                  '\smul\st': ' mult',
                  '\sgen\seral\s': ' general ',
                  '\suniver\ssal\s': ' universal ',
                  '\scon\stent\s': ' content ',
                  '\spar\sticular\s': ' particular ',
                  'ver\ssity\s': 'versity ',
                  '\sCritiq\s': ' Critique ',
                  '\sphilo\ssophy\s': ' philosophy ',
                  '\seq\s': ' eq'}

In [99]:
# a function that takes the dictionary and returns a dataframe of sentences
def from_raw_to_df(text_dict):
  nlp.max_length = 9000000
  text = text_dict['text']
  text = remove_words(text, text_dict['words to remove'])
  text = baseline_clean(text, capitals=text_dict['remove capitals'],
                        bracketed_fn=text_dict['bracketed fn'],
                        odd_words_dict=odd_words_dict)
  text_nlp = nlp(text, disable=['ner'])
  text_df = pd.DataFrame(columns=['title', 'author', 'school', 'sentence_spacy'])
  text_df['sentence_spacy'] = list(text_nlp.sents)
  text_df['author'] = text_dict['author']
  text_df['title'] = text_dict['title']
  text_df['school'] = text_dict['school']
  text_df['original_publication_date'] = text_dict['original date']
  text_df['corpus_edition_date'] = text_dict['corpus date']
  text_df['sentence_str'] = text_df['sentence_spacy'].apply(lambda x: ''.join(list(str(x))))
  return text_df

In [100]:
# use the function
f_t_df = from_raw_to_df(kierkegaard_fear_trembling_dict)
df = f_t_df

In [101]:
# checking the result
pd.options.display.max_colwidth = 200
df.sample(10)

Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str
796,Fear and Trembling,Kierkegaard,Kierkegaard,"(But, what, then, is, duty, Duty, is, precisely, the, expression, for, God, 's, will, .)",1843,1941,But what then is duty Duty is precisely the expression for God's will.
941,Fear and Trembling,Kierkegaard,Kierkegaard,"(Faith, ,, on, the, contrary, ,, is, the, paradox, that, inwardness, is, higher, than, outwardness, or, ,, to, recall, an, expression, used, above, ,, the, uneven, number, is, higher, than, the, e...",1843,1941,"Faith, on the contrary, is the paradox that inwardness is higher than outwardness or, to recall an expression used above, the uneven number is higher than the even."
557,Fear and Trembling,Kierkegaard,Kierkegaard,"(To, become, another, man, he, feels, no, inclination, ,, nor, does, he, by, any, means, regard, this, as, greatness, .)",1843,1941,"To become another man he feels no inclination, nor does he by any means regard this as greatness."
908,Fear and Trembling,Kierkegaard,Kierkegaard,"(If, such, is, not, the, position, of, Abraham, ,, then, he, is, not, even, a, tragic, hero, but, a, murderer, .)",1843,1941,"If such is not the position of Abraham, then he is not even a tragic hero but a murderer."
1464,Fear and Trembling,Kierkegaard,Kierkegaard,"(that, faith, as, a, matter, of, course, is, the, immediate, .)",1843,1941,that faith as a matter of course is the immediate.
1302,Fear and Trembling,Kierkegaard,Kierkegaard,"(In, this, case, he, must, embroil, himself, in, a, mystifictition, by, which, he, reduces, himself, to, naught, in, relation, to, her, .)",1843,1941,In this case he must embroil himself in a mystifictition by which he reduces himself to naught in relation to her.
0,Fear and Trembling,Kierkegaard,Kierkegaard,"(Not, merely, in, the, realm, of, commerce, but, in, the, world, of, ideas, as, well, our, age, is, organizing, a, regular, clearance, sale, .)",1843,1941,Not merely in the realm of commerce but in the world of ideas as well our age is organizing a regular clearance sale.
260,Fear and Trembling,Kierkegaard,Kierkegaard,"(make, him, blessed, in, thy, bosom, here)",1843,1941,make him blessed in thy bosom here
677,Fear and Trembling,Kierkegaard,Kierkegaard,"(If, it, is, really, true, that, all, the, Philistinism, I, behold, in, life, (, which, I, do, not, permit, my, word, but, my, actions, to, condemn, ), is, not, what, it, seems, to, be, is, it, th...",1843,1941,"If it is really true that all the Philistinism I behold in life (which I do not permit my word but my actions to condemn) is not what it seems to be is it the miracle That is conceivable, for the ..."
1052,Fear and Trembling,Kierkegaard,Kierkegaard,"(Humanly, speaking, ,, he, is, crazy, and, can, not, make, himself, intelligible, to, anyone, .)",1843,1941,"Humanly speaking, he is crazy and cannot make himself intelligible to anyone."


In [102]:
len(df)

1824

#### Remove Short Sentences

In [103]:
df['sentence_length'] = df['sentence_str'].map(lambda x: len(x))
num_of_short_entries = len(df[df['sentence_length'] < 20])
print(f"there are {num_of_short_entries} so-called sentences with fewer than 20 characters")
df[df['sentence_length'] < 20].sample(5)

there are 59 so-called sentences with fewer than 20 characters


Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length
255,Fear and Trembling,Kierkegaard,Kierkegaard,"(Was, it, not, so)",1843,1941,Was it not so,13
1819,Fear and Trembling,Kierkegaard,Kierkegaard,"(Cf, .)",1843,1941,Cf.,3
1427,Fear and Trembling,Kierkegaard,Kierkegaard,"(That, 's, easy, enough, .)",1843,1941,That's easy enough.,19
425,Fear and Trembling,Kierkegaard,Kierkegaard,"(Let, us, go, further, .)",1843,1941,Let us go further.,18
1080,Fear and Trembling,Kierkegaard,Kierkegaard,"(So, is, he, not, mad)",1843,1941,So is he not mad,16


In [104]:
df = df.drop(df[df['sentence_length'] < 20].index)
len(df)

1765

#### Remove Cases of Self-Mention

In [105]:
# change the author name in this cell 

self_mentions = df[df['sentence_str'].str.contains('\s'+'Kierkegaard'.lower())]
print(len(self_mentions))
self_mentions

0


Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length


In [106]:
df = df.drop(df[df['sentence_str'].str.contains('\s'+'Augustine'.lower())].index)

len(df)

1765

#### Deal with Duplicates

In [107]:
# find the total number of duplicates
len(df['sentence_str'])-len(df['sentence_str'].drop_duplicates())

1

In [108]:
doubles_df = pd.concat(g for _, g in df.groupby("sentence_str") if len(g) > 1)
doubles_df

Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length
154,Fear and Trembling,Kierkegaard,Kierkegaard,"(There, is, no, song, of, Lamentations, by, Abraham, .)",1843,1941,There is no song of Lamentations by Abraham.,44
161,Fear and Trembling,Kierkegaard,Kierkegaard,"(There, is, no, song, of, Lamentations, by, Abraham, .)",1843,1941,There is no song of Lamentations by Abraham.,44


In [109]:
df = df.drop(df[df['sentence_str'].duplicated(keep='first')].index)

In [110]:
len(df)

1764

#### Check for Foreign Languages

In [111]:
# checking for 'der', a common article in German
len((df[df['sentence_str'].str.contains('\sder\s')]))

1

In [112]:
# checking for 'il', a common article in French
len(df[df['sentence_str'].str.contains('\sil\s')])

0

#### Some Ad Hoc Cleaning

In [113]:
# miscellaneous nonsense sentences
df = df.drop(df[df['sentence_str'].str.contains('\spp\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\stotam\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\srree\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sflir\s')].index)
df = df.drop(df[(df['sentence_str'].str.contains('\smodis\s')) & (df['author'] != 'Kant')].index)

len(df)

1764

In [114]:
# markers of french and notes
df = df.drop(df[df['sentence_str'].str.contains('\schapitre')].index)
df = df.drop(df[df['sentence_str'].str.contains('\salisme')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sHahn')].index)

len(df)

1764

In [115]:
# some notes in Kant
df = df.drop(df[df['sentence_str'].str.contains('\sVorl\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sberschwenglich')].index)

len(df)

1764

In [116]:
# a common phrase in Plato / Aristotle footnotes
df = df.drop(df[(df['author']=='Plato') & (df['sentence_str'].str.contains('(?i)reading')) & (df['sentence_length'] < 40)].index)
df = df.drop(df[(df['author']=='Aristotle') & (df['sentence_str'].str.contains('(?i)reading')) & (df['sentence_length'] < 40)].index)

len(df)

1764

In [117]:
# mentions of Aristotle in Plato
df = df.drop(df[(df['author']=='Plato') & df['sentence_str'].str.contains('Aristotle')].index)

len(df)

1764

### Lemmatize and Tokenize

In [118]:
from gensim.utils import simple_preprocess

# use gensim to tokenize sentences
df['tokenized_txt'] = df['sentence_str'].map(lambda x: simple_preprocess(x.lower(),deacc=True,
                                                        max_len=200))

# use spacey to get intelligent lemmatization
def lemmatize_sentence(sentence):
  lemmatized_txt = ''
  for word in sentence:
    lemmatized_txt += ' ' + str(word.lemma_)
  return lemmatized_txt

In [119]:
df['lemmatized_str'] = df['sentence_spacy'].apply(lemmatize_sentence)

In [120]:
df.sample(5)

Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length,tokenized_txt,lemmatized_str
1089,Fear and Trembling,Kierkegaard,Kierkegaard,"(He, can, derive, evidence, from, no, man, which, it, is, ,, for, with, that, query, he, is, outside, the, paradox, .)",1843,1941,"He can derive evidence from no man which it is, for with that query he is outside the paradox.",94,"[he, can, derive, evidence, from, no, man, which, it, is, for, with, that, query, he, is, outside, the, paradox]","-PRON- can derive evidence from no man which -PRON- be , for with that query -PRON- be outside the paradox ."
696,Fear and Trembling,Kierkegaard,Kierkegaard,"(before, he, has, to, preach, ,, the, auditor, can, well, take, a, nap, during, the, discourse, ,, for, all, goes, smoothly, ,, without, the, least, trouble, from, any, quarter, .)",1843,1941,"before he has to preach, the auditor can well take a nap during the discourse, for all goes smoothly, without the least trouble from any quarter.",145,"[before, he, has, to, preach, the, auditor, can, well, take, nap, during, the, discourse, for, all, goes, smoothly, without, the, least, trouble, from, any, quarter]","before -PRON- have to preach , the auditor can well take a nap during the discourse , for all go smoothly , without the least trouble from any quarter ."
406,Fear and Trembling,Kierkegaard,Kierkegaard,"(and, I, can, not, talk, together)",1843,1941,and I cannot talk together,26,"[and, cannot, talk, together]",and -PRON- can not talk together
1484,Fear and Trembling,Kierkegaard,Kierkegaard,"(This, mere, thought, of, taking, time, upon, one, 's, conscience, ,, of, giving, it, time, to, explore, with, its, sleepless, vigilance, every, secret, thought, ,, with, such, effect, that, ,, if...",1843,1941,"This mere thought of taking time upon one's conscience, of giving it time to explore with its sleepless vigilance every secret thought, with such effect that, if even, instant one does not make th...",243,"[this, mere, thought, of, taking, time, upon, one, conscience, of, giving, it, time, to, explore, with, its, sleepless, vigilance, every, secret, thought, with, such, effect, that, if, even, insta...","this mere thought of take time upon one 's conscience , of give -PRON- time to explore with -PRON- sleepless vigilance every secret thought , with such effect that , if even , instant one do not ..."
179,Fear and Trembling,Kierkegaard,Kierkegaard,"(But, Abraham, believed, ,, therefore, he, was, young, for, he, who, always, hopes, for, the, best, becomes, old, ,, and, he, who, is, always, prepared, for, the, worst, grows, old, early, ,, but,...",1843,1941,"But Abraham believed, therefore he was young for he who always hopes for the best becomes old, and he who is always prepared for the worst grows old early, but he who believes preserves an eternal...",203,"[but, abraham, believed, therefore, he, was, young, for, he, who, always, hopes, for, the, best, becomes, old, and, he, who, is, always, prepared, for, the, worst, grows, old, early, but, he, who,...","but Abraham believe , therefore -PRON- be young for -PRON- who always hope for the good become old , and -PRON- who be always prepared for the bad grow old early , but -PRON- who believe preserve..."


### Combine with the Old Dataframe & Export to CSV

In [121]:
# load the old version and check it out
og_df = pd.read_csv('/gdrive/MyDrive/Colab_Projects/philosophy_data_project/philosophy_data.csv')
og_df.sample(5)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,original_publication_date,corpus_edition_date,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
211853,Difference And Repetition,Deleuze,continental,"As for the negative, this is only the shadow cast upon the affirmations produced by a problem: negation appears alongside affirmation like a powerless double, albeit one which testifies to the exi...","As for the negative, this is only the shadow cast upon the affirmations produced by a problem: negation appears alongside affirmation like a powerless double, albeit one which testifies to the exi...",1968,1994,266,"as for the negative, this is only the shadow cast upon the affirmations produced by a problem: negation appears alongside affirmation like a powerless double, albeit one which testifies to the exi...","['as', 'for', 'the', 'negative', 'this', 'is', 'only', 'the', 'shadow', 'cast', 'upon', 'the', 'affirmations', 'produced', 'by', 'problem', 'negation', 'appears', 'alongside', 'affirmation', 'like...","as for the negative , this be only the shadow cast upon the affirmation produce by a problem : negation appear alongside affirmation like a powerless double , albeit one which testify to the exis..."
311385,The Wealth Of Nations,Smith,capitalism,"The gross revenue of the society, the annual produce of their land and labour, is increased by the whole value which the labour of those workmen adds to the materials upon which they are employed,...","The gross revenue of the society, the annual produce of their land and labour, is increased by the whole value which the labour of those workmen adds to the materials upon which they are employed,...",1776,2009,341,"the gross revenue of the society, the annual produce of their land and labour, is increased by the whole value which the labour of those workmen adds to the materials upon which they are employed,...","['the', 'gross', 'revenue', 'of', 'the', 'society', 'the', 'annual', 'produce', 'of', 'their', 'land', 'and', 'labour', 'is', 'increased', 'by', 'the', 'whole', 'value', 'which', 'the', 'labour', ...","the gross revenue of the society , the annual produce of -PRON- land and labour , be increase by the whole value which the labour of those workman add to the material upon which -PRON- be employ ..."
160406,Quintessence,Quine,analytic,And it is hard to see what whole collection of general terms of nat ural science might not qualify likewise.,And it is hard to see what whole collection of general terms of nat ural science might not qualify likewise.,1950,2004,108,and it is hard to see what whole collection of general terms of nat ural science might not qualify likewise.,"['and', 'it', 'is', 'hard', 'to', 'see', 'what', 'whole', 'collection', 'of', 'general', 'terms', 'of', 'nat', 'ural', 'science', 'might', 'not', 'qualify', 'likewise']",and -PRON- be hard to see what whole collection of general term of nat ural science may not qualify likewise .
155580,Lewis - Papers,Lewis,analytic,"In speaking of our actual world without knowing just which world is ours, I am in effect generalizing over all worlds.","In speaking of our actual world without knowing just which world is ours, I am in effect generalizing over all worlds.",1985,2008,118,"in speaking of our actual world without knowing just which world is ours, i am in effect generalizing over all worlds.","['in', 'speaking', 'of', 'our', 'actual', 'world', 'without', 'knowing', 'just', 'which', 'world', 'is', 'ours', 'am', 'in', 'effect', 'generalizing', 'over', 'all', 'worlds']","in speak of -PRON- actual world without know just which world be ours , -PRON- be in effect generalize over all world ."
317653,The Wealth Of Nations,Smith,capitalism,"The proper performance of every service seems to require, that its pay or recompence should be, as exactly as possible, proportioned to the nature of the service.","The proper performance of every service seems to require, that its pay or recompence should be, as exactly as possible, proportioned to the nature of the service.",1776,2009,162,"the proper performance of every service seems to require, that its pay or recompence should be, as exactly as possible, proportioned to the nature of the service.","['the', 'proper', 'performance', 'of', 'every', 'service', 'seems', 'to', 'require', 'that', 'its', 'pay', 'or', 'recompence', 'should', 'be', 'as', 'exactly', 'as', 'possible', 'proportioned', 't...","the proper performance of every service seem to require , that -PRON- pay or recompence should be , as exactly as possible , proportioned to the nature of the service ."


In [122]:
og_df['author'].value_counts(normalize=True)

Aristotle          0.131981
Plato              0.103807
Hegel              0.061419
Anselm             0.043500
Foucault           0.041235
Heidegger          0.041232
Kant               0.038215
Marx               0.036497
Lewis              0.035499
Malebranche        0.035166
Deleuze            0.033929
Kripke             0.033764
Smith              0.031638
Wittgenstein       0.024443
Locke              0.024040
Hume               0.022490
Merleau-Ponty      0.020542
Quine              0.019949
Nietzsche          0.018437
Derrida            0.016231
Davis              0.016007
Husserl            0.015536
Fichte             0.014362
Russell            0.013726
Leibniz            0.013602
Seneca             0.013580
Popper             0.012657
Lenin              0.012092
Spinoza            0.010263
Moore              0.009925
Keynes             0.009229
Ricardo            0.008361
Beauvoir           0.008285
Berkeley           0.007397
Augustine          0.006924
Marcus Aurelius    0

In [123]:
len(og_df)

369590

In [124]:
# append the new data
new_df = og_df.append(df)
new_df['author'].value_counts(normalize=True)

Aristotle          0.131354
Plato              0.103314
Hegel              0.061128
Anselm             0.043293
Foucault           0.041039
Heidegger          0.041036
Kant               0.038034
Marx               0.036324
Lewis              0.035330
Malebranche        0.034999
Deleuze            0.033768
Kripke             0.033604
Smith              0.031487
Wittgenstein       0.024327
Locke              0.023926
Hume               0.022383
Merleau-Ponty      0.020444
Quine              0.019854
Nietzsche          0.018349
Derrida            0.016154
Davis              0.015931
Husserl            0.015462
Fichte             0.014294
Russell            0.013661
Leibniz            0.013537
Seneca             0.013515
Popper             0.012597
Lenin              0.012034
Spinoza            0.010214
Moore              0.009877
Keynes             0.009185
Ricardo            0.008321
Beauvoir           0.008246
Berkeley           0.007362
Augustine          0.006891
Marcus Aurelius    0

In [125]:
new_df[new_df['author']=='Kierkegaard'].sample(5)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,original_publication_date,corpus_edition_date,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
1353,Fear and Trembling,Kierkegaard,Kierkegaard,"(He, had, in, fact, sent, it, ,, as, we, know, ,, but, it, was, kept, back, by, the, malice, of, a, lady, of, the, court, .)","He had in fact sent it, as we know, but it was kept back by the malice of a lady of the court.",1843,1941,94,,"[he, had, in, fact, sent, it, as, we, know, but, it, was, kept, back, by, the, malice, of, lady, of, the, court]","-PRON- have in fact send -PRON- , as -PRON- know , but -PRON- be keep back by the malice of a lady of the court ."
1744,Fear and Trembling,Kierkegaard,Kierkegaard,"(In, case, no, last, rejoinder, of, Socrates, had, existed, ,, I, should, have, been, able, to, think, myself, into, him, and, formulate, such, a, word, if, I, were, unable, to, do, it, ,, a, poet...","In case no last rejoinder of Socrates had existed, I should have been able to think myself into him and formulate such a word if I were unable to do it, a poet could, but no poet can catch up with...",1843,1941,205,,"[in, case, no, last, rejoinder, of, socrates, had, existed, should, have, been, able, to, think, myself, into, him, and, formulate, such, word, if, were, unable, to, do, it, poet, could, but, no, ...","in case no last rejoinder of Socrates have exist , -PRON- should have be able to think -PRON- into -PRON- and formulate such a word if -PRON- be unable to do -PRON- , a poet could , but no poet c..."
1638,Fear and Trembling,Kierkegaard,Kierkegaard,"(This, movement, ,, however, ,, is, as, properly, a, movement, of, irony, as, is, any, other, which, has, its, ground, in, the, fact, that, subjectivity, is, higher, than, reality, .)","This movement, however, is as properly a movement of irony as is any other which has its ground in the fact that subjectivity is higher than reality.",1843,1941,149,,"[this, movement, however, is, as, properly, movement, of, irony, as, is, any, other, which, has, its, ground, in, the, fact, that, subjectivity, is, higher, than, reality]","this movement , however , be as properly a movement of irony as be any other which have -PRON- ground in the fact that subjectivity be high than reality ."
882,Fear and Trembling,Kierkegaard,Kierkegaard,"(What, befalls, her, is, the, extraordinary, .)",What befalls her is the extraordinary.,1843,1941,38,,"[what, befalls, her, is, the, extraordinary]",what befall -PRON- be the extraordinary .
73,Fear and Trembling,Kierkegaard,Kierkegaard,"(On, the, morning, of, the, fourth, day, Abraham, said, never, a, word, ,, but, he, lifted, up, his, eyes, and, saw, Mount, Moriah, afar, off, .)","On the morning of the fourth day Abraham said never a word, but he lifted up his eyes and saw Mount Moriah afar off.",1843,1941,116,,"[on, the, morning, of, the, fourth, day, abraham, said, never, word, but, he, lifted, up, his, eyes, and, saw, mount, moriah, afar, off]","on the morning of the fourth day Abraham say never a word , but -PRON- lift up -PRON- eye and see Mount Moriah afar off ."


In [126]:
# export as csv
from google.colab import files
new_df.to_csv('phil_nlp.csv', index=False) 
files.download('phil_nlp.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

###Upload Data to the SQL Server

In [127]:
# prepare to upload to the PostgreSQL database

# note which dataframe you set this to - new_df for the whole dataset, df for 
# just the new text

for_db = new_df
for_db['date'] = for_db['original_publication_date']
for_db['date'] = for_db['date'].apply(lambda x: str(x)[1:]+' BC' if x < 0 else str(x))
for_db['sentence'] = for_db['sentence_str']
for_db['school'] = for_db['school'].apply(lambda x: x.replace('_', ' ').title())
for_db = for_db.drop(['sentence_spacy', 
                      'sentence_length',
                      'sentence_lowered', 
                      'sentence_str', 
                      'tokenized_txt', 
                      'lemmatized_str',
                      'corpus_edition_date',
                      'original_publication_date'], axis=1)
for_db.columns = [i.upper() for i in for_db.columns]

for_db.sample(5)

Unnamed: 0,TITLE,AUTHOR,SCHOOL,DATE,SENTENCE
205474,Writing And Difference,Derrida,Continental,1967,"but in Europe and even in the Occident as a whole: Occidental takes precedence over the kind of performance improperly called theater recognizes as language, assigns the faculties and powers of a ..."
72252,Aristotle - Complete Works,Aristotle,Aristotle,320 BC,Being in something' has similar and corresponding meanings to 'having'.
33578,Plato - Complete Works,Plato,Plato,350 BC,"However, the political system which we are now establishing by law has avoided both of them."
352578,Proslogion,Anselm,Scholasticism,1077,And Rodrigo: 'For what is it to die but to stop being necessary? '
326501,Meditations,Marcus Aurelius,Stoicism,170,"The third, is thy ruling part; and here consider; Thou art an old man; suffer not that excellent part to be brought in subjection, and to become slavish: suffer it not to be drawn up and down with..."


In [128]:
len(for_db)

371354

In [129]:
#importing sql library 
from sqlalchemy import create_engine 
  
# create a reference  
# for sql library 
engine = create_engine('postgres,
                       echo=False)
  
# attach the data frame to the sql server 
for_db.to_sql('phil_nlp',          
              con = engine,
              if_exists='replace',
              index=False,
              method='multi') 
  
# show the completed data as a test
print(engine.execute("""SELECT * FROM phil_nlp WHERE "AUTHOR" = 'Kierkegaard'""").fetchone()) 

('Fear and Trembling', 'Kierkegaard', 'Kierkegaard', '1843', 'Not merely in the realm of commerce but in the world of ideas as well our age is organizing a regular clearance sale.')


Remember to add to the clipping and other elements to the notebook that creates the database as a whole. Then you're done!

In [65]:
print(engine.execute("""SELECT * FROM phil_nlp where "AUTHOR" = 'Kierkegaard'""").fetchone()) 

('Fear and Trembling', 'Kierkegaard', 'Kierkegaard', '1843', 'S ren Kierkegaard (Johannes De Silentio) Fear and Trembling')
