<a href="https://colab.research.google.com/github/kcalizadeh/PDP_data_processing/blob/master/new_text_introduction_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports and Mounting Drive

In [1]:
# this cell mounts drive, sets the correct directory, then imports all functions
# and relevant libraries via the functions.py file
from google.colab import drive
import sys

drive.mount('/gdrive',force_remount=True)

drive_path = '/gdrive/MyDrive/Colab_Projects/philosophy_data_project'

sys.path.append(drive_path)

Mounted at /gdrive


In [2]:
from import_functions import *

In [3]:
import spacy.cli
spacy.cli.download("en_core_web_lg")
import en_core_web_lg
nlp = en_core_web_lg.load()

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


###Load the Text and Clip Front and End Matter

In [11]:
# if you are deleting an old text that was added here, make sure it is in the 
# primary database construction notebook
sartre_being = get_text(drive_path + '/phil_txts/sartre_being_and_nothingness.txt')

In [23]:
sartre_being = sartre_being.split('MODERN')[1].split('1Paul')[0]

In [24]:
len(sartre_being)

1934411

### Clean the Text



In [65]:
def baseline_clean(to_correct, 
                   capitals=True, 
                   bracketed_fn=False, 
                   odd_words_dict={}):
  # remove utf8 encoding characters and some punctuations
  result = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff\xad\x0c6§\\\£\Â*_<>""⎫•{}Γ~]', ' ', to_correct)
  result = re.sub(r'[\u2014\u2013\u2012-]', ' ', result)

  # replace whitespace characters with actual whitespace
  result = re.sub(r'\s', ' ', result)

  # replace odd quotation marks with a standard
  result = re.sub(r'[‘’“”]', "'", result)

  # replace the ﬀ, ﬃ and ﬁ with the appropriate counterparts
  result = re.sub(r'ﬀ', 'ff', result)
  result = re.sub(r'ﬁ', 'fi', result)
  result = re.sub(r'ﬃ', 'ffi', result)

  # replace some accented characters for ease of searching
  result = re.sub(r'é', 'e', result)
  result = re.sub(r'è', 'e', result)

  # remove or standardize some recurring common and meaninless words/phrases
  result = re.sub(r'\s*This\s*page\s*intentionally\s*left\s*blank\s*', ' ', result)
  result = re.sub(r'(?i)Aufgabe\s+', ' ', result)
  result = re.sub(r',*\s+cf\.', ' ', result)

  # some texts have footnotes conveniently in brackets - this removes them all, 
  # with a safety measure for unpaired brackets, and deletes all brackets afterwards
  if bracketed_fn:
    result = re.sub(r'\[.{0,300}\]|{.{0,300}}|{.{0,300}\]|\[.{0,300}}', ' ', result)
  result = re.sub(r'[\[\]{}]', ' ', result)

  # unify some abbreviations
  result = re.sub(r'&', 'and', result)
  result = re.sub(r'\se\.g\.\s', ' eg ', result)
  result = re.sub(r'\si\.e\.\s', ' ie ', result)
  result = re.sub('coroll\.', 'coroll', result)
  result = re.sub('pt\.', 'pt', result)

  # remove roman numerals, first capitalized ones
  result = re.sub(r'\s((I{2,}V*X*\.*)|(IV\.*)|(IX\.*)|(V\.*)|(V+I*\.*)|(X+L*V*I*]\.*))\s', ' ', result)
  # then lowercase
  result = re.sub(r'\s((i{2,}v*x*\.*)|(iv\.*)|(ix\.*)|(v\.*)|(v+i*\.*)|(x+l*v*i*\.*))\s', ' ', result)

  # remove periods and commas flanked by numbers
  result = re.sub(r'\d\.\d', ' ', result)
  result = re.sub(r'\d,\d', ' ', result)

  # remove the number-letter-number pattern used for many citations
  result = re.sub(r'\d*\w{,2}\d', ' ', result)

  # remove numerical characters
  result = re.sub(r'\d+', ' ', result)

  # remove words of 2+ characters that are entirely capitalized 
  # (these are almost always titles, headings, or speakers in a dialogue)
  # remove capital I's that follow capital words - these almost always roman numerals
  # some texts do use these capitalizations meaningfully, so we make this optional
  if capitals:
    result = re.sub(r'[A-Z]{2,}\s+I', ' ', result)
    result = re.sub(r'[A-Z]{2,}', ' ', result)

  # remove isolated colons and semicolons that result from removal of titles
  result = re.sub(r'\s+:\s*', ' ', result)
  result = re.sub(r'\s+;\s*', ' ', result)

  # remove isolated letters (do it several times because strings of isolated letters do not get captured properly)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)

  # remove isolated letters at the end of sentences or before commas
  result = re.sub(r'\s[^aI]\.', '.', result)
  result = re.sub(r'\s[^aI],', ',', result)

  # deal with spaces around periods and commas
  result = re.sub(r'\s+,\s+', ', ', result)
  result = re.sub(r'\s+\.\s+', '. ', result)

  # remove empty parantheses
  result = re.sub(r'(\(\s*\.*\s*\))|(\(\s*,*\s*)\)', ' ', result)
  result = re.sub(r'\.\)\.', '.', result)
  result = re.sub(r'\.\(\.', '.', result)

  # reduce multiple periods, commas, or whitespaces into a single one
  result = re.sub(r'\.+', '.', result)
  result = re.sub(r',+', ',', result)
  result = re.sub(r'\s+', ' ', result)

  # deal with isolated problem cases discovered in the data:
  for key in odd_words_dict.keys():
    result = re.sub(r''+key+'', odd_words_dict[key], result)

  return result

In [66]:
# note extras like bracketed footnotes or specific words to remove


In [67]:
# build a dictionary for the book
sartre_being_dict = {
    'author': 'Sartre',
    'title': 'Being and Nothingness',
    'text': sartre_being,
    'school': 'Existentialism',
    'words to remove': [],
    'remove capitals': True,
    'bracketed fn': False,
    'original date': 1943,
    'corpus date': 2011
}

In [68]:
#@title Oddities Dictionary for Cleaning
# a dictionary of oddities to clean up
odd_words_dict = {'\sderstanding': 'derstanding',
                  '\sditference\s': ' difference ',
                  '\sforthe\s': ' for the ',
                  '\sject': 'ject',
                  '\sSure ly\s': ' Surely ',
                  '\spiness': 'piness',
                  '\sjects': 'jects', 
                  '\sness': 'ness',
                  '\schil dren\s': ' children ',
                  '\sper\scent\s': ' percent ',
                  '\sper\scent\.': ' percent.',
                  '\sper\scent,': ' percent,',
                  '\wi\son': 'ion',
                  '\spri\sori': ' priori',
                  '\stences\s': 'tences ',
                  '\sprincipleb': ' principle',
                  '\ssciousness': 'sciousness',
                  '\stion': 'tion',
                  '\spri\s': ' pri',
                  '\scluding': 'cluding',
                  '\sdom': 'dom',
                  '\sers': 'ers',
                  '\scritiq\s': ' critique ',
                  '\ssensati\s': ' sensation ',
                  '(?i)\syou\sll': " you'll",
                  '\sI\sll': " I'll",
                  '(?i)\swe\sll': " we'll",
                  '(?i)he\sll': " he'll",
                  '(?i)who\sll': "who'll",
                  '(?i)\sthere\sll\s': " there'll ",
                  '\seduca\s': ' education ',
                  '\slity\s': 'lity ',
                  '\smultaneously\s': 'multaneously ',
                  '\stically\s': 'tically ',
                  '\sDa\ssein\s': ' Dasein ',
                  '(?i)\sthey\sll\s': " they'll ",
                  '(?i)\sin\tum\s': ' in turn ',
                  '\scon~\s': ' con',
                  '\sà\s': ' a ',
                  '\sjor\s': ' for ',
                  '\sluminating\s': 'luminating ',
                  '\sselj\s': ' self ',
                  '\stial\s': 'tial ',
                  '\sversal\s': 'versal ',
                  '\sexis\st': ' exist',
                  '\splauded\s': 'plauded ',
                  '\suiry\s': 'uiry ',
                  '\svithin\s': ' within ',
                  '\soj\s': ' of ',
                  '\sposi\st': ' posit',
                  '\sra\sther\s': ' rather ',
                  '(?i)\sthat\sll\s': " that'll ",
                  '(?i)\sa\sll\s': ' all ',
                  '\so\sther\s': ' other ',
                  '\sra\sther\s': ' rather ',
                  '\snei\sther\s': ' neither ',
                  '\sei\sther\s': ' either ',
                  '\sfur\sther\s': ' further ',
                  '\sano\sther': ' another ',
                  '\sneces\s': ' neces',
                  'u\slar\s': 'ular ',
                  '\sference\s': 'ference ',
                  '(?i)it\sll\s': "it'll ",
                  '\stoge\sther': ' together ',
                  '\sknowledgeb\s': ' knowledge ',
                  'r\stain\s': 'rtain ',
                  'on\stain\s': 'ontain',
                  '(?i)j\sect\s': 'ject',
                  '\sob\sect\s': ' object ',
                  '\sbtle\s': 'btle ',
                  '\snition\s': 'nition ',
                  '\sdering\s': 'dering ', 
                  '\sized\s': 'ized ',
                  '\sther\shand': ' other hand',
                  '\ture\s': 'ture ',
                  '\sabso\sl': ' absol',
                  '\stly\s': 'tly ',
                  '\serty\s': 'erty ',
                  '\sobj\se': ' obj',
                  '\sffiir\s': ' for ',
                  '\sndeed\s': ' indeed ',
                  '\sfonn\s': ' form ',
                  '\snally\s': 'nally ',
                  'ain\sty\s': 'ainty ',
                  'ici\sty\s': 'icity ',
                  '\scog\sni': ' cogni',
                  '\sacc\s': ' acc',
                  '\sindi\svid\sual': ' individual', 
                  '\sintu\sit': ' intuit',
                  'r\sance\s': 'rance ',
                  '\ssions\s': 'sions ',
                  '\sances\s': 'ances ',
                  '\sper\sception\s': ' perception ',
                  '\sse\sries\s': ' series ',
                  '\sque\sries\s': ' queries ',
                  '\sessary\s': 'essary ',
                  '\sofa\s': ' of a ',
                  '\scer\stainty\s': ' certainty ',
                  'ec\stivity\s': 'ectivity ',
                  '\stivity\s': 'tivity ',
                  '\slation\s': 'lation ',
                  '\sir\sr': ' irr',
                  '\ssub\sstance\s': ' substance ',
                  'sec\sond\s': 'second ',
                  '\s\.rv': '',
                  '\story\s': 'tory ',
                  '\sture\s': 'ture ',
                  '\sminate\s': 'minate ',
                  '\sing\s': 'ing ',
                  '\splicity\s': 'plicity ',
                  '\ssimi\slar\s': ' similar ',
                  '\scom\smunity\s': ' community ',
                  '\sitselfa\s': ' itself a ',
                  '\ssimp\s': ' simply ',
                  '\scon\stex': ' contex',
                  '\scon\sseq': ' conseq',
                  '\scon\stai': ' contai',
                  '\sofwhat\s': ' of what ',
                  '\sui\s': 'ui',
                  '\sofan\s': ' of an ',
                  '\saccor\sdance\s': ' accordance ',
                  '\stranscen\sdental\s': ' transcendental ',
                  '\sap\spearances\s': ' appearances ',
                  'e\squences\s': 'equences ',
                  '\sorits\s': ' or its ',
                  '\simma\sn': ' imman',
                  '\seq\sua': ' equa',
                  '\simpl\sied\s': ' implied ',
                  '\sbuta\s': ' but a ',
                  '\sa\snd\s': ' and ',
                  '\sence\s': 'ence ',
                  '\stain\s': 'tain ',
                  '\sunder\sstanding\s': ' understanding ',
                  'i\sence\s': 'ience ',
                  'r\sence\s': 'rence ',
                  '\stical\s': 'tical ',
                  '\sobjectsb\s': ' objects ',
                  '\stbe\s': ' the ',
                  '\smul\st': ' mult',
                  '\sgen\seral\s': ' general ',
                  '\suniver\ssal\s': ' universal ',
                  '\scon\stent\s': ' content ',
                  '\spar\sticular\s': ' particular ',
                  'ver\ssity\s': 'versity ',
                  '\sCritiq\s': ' Critique ',
                  '\sphilo\ssophy\s': ' philosophy ',
                  '\seq\s': ' eq'}

In [69]:
# a function that takes the dictionary and returns a dataframe of sentences
def from_raw_to_df(text_dict):
  nlp.max_length = 9000000
  text = text_dict['text']
  text = remove_words(text, text_dict['words to remove'])
  text = baseline_clean(text, capitals=text_dict['remove capitals'],
                        bracketed_fn=text_dict['bracketed fn'],
                        odd_words_dict=odd_words_dict)
  text_nlp = nlp(text, disable=['ner'])
  text_df = pd.DataFrame(columns=['title', 'author', 'school', 'sentence_spacy'])
  text_df['sentence_spacy'] = list(text_nlp.sents)
  text_df['author'] = text_dict['author']
  text_df['title'] = text_dict['title']
  text_df['school'] = text_dict['school']
  text_df['original_publication_date'] = text_dict['original date']
  text_df['corpus_edition_date'] = text_dict['corpus date']
  text_df['sentence_str'] = text_df['sentence_spacy'].apply(lambda x: ''.join(list(str(x))))
  return text_df

In [70]:
# use the function
f_t_df = from_raw_to_df(sartre_being_dict)
df = f_t_df

In [71]:
# checking the result
pd.options.display.max_colwidth = 200
df.sample(10)

Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str
9923,Being and Nothingness,Sartre,Existentialism,"(and, I, shall, figure, as, equivalent, structures, in, solidarity, with, each, other, .)",1943,2011,and I shall figure as equivalent structures in solidarity with each other.
13242,Being and Nothingness,Sartre,Existentialism,"(The, unknown, object, is, given, as, immaculate, ,, as, virgin, ,, comparable, to, a, whiteness, .)",1943,2011,"The unknown object is given as immaculate, as virgin, comparable to a whiteness."
5567,Being and Nothingness,Sartre,Existentialism,"(This, being, was, not, in, me, potentially, before, the, appearance, of, the, Other, ,, for, it, could, not, have, found, any, place, in, the, For, itself, .)",1943,2011,"This being was not in me potentially before the appearance of the Other, for it could not have found any place in the For itself."
4615,Being and Nothingness,Sartre,Existentialism,"(For, example, ,, as, we, shall, see, later, ,, the, revelation, of, the, spatiality, of, being, is, one, with, the, non, positional, apprehension, by, the, for, itself, of, itself, as, unextended...",1943,2011,"For example, as we shall see later, the revelation of the spatiality of being is one with the non positional apprehension by the for itself of itself as unextended."
2747,Being and Nothingness,Sartre,Existentialism,"(The, possibility, of, being, stopped, by, a, fold, in, the, cloth, belongs, neither, to, the, billiard, ball, which, rolls, nor, to, the, cloth, ;, it, can, arise, only, in, the, organization, in...",1943,2011,The possibility of being stopped by a fold in the cloth belongs neither to the billiard ball which rolls nor to the cloth; it can arise only in the organization into a system of the ball and the c...
455,Being and Nothingness,Sartre,Existentialism,"(In, a, word, ,, man, is, active, and, the, means, which, he, employs, are, called, passive, .)",1943,2011,"In a word, man is active and the means which he employs are called passive."
9757,Being and Nothingness,Sartre,Existentialism,"(This, is, why, one, hates, right, through, the, revealed, psychic, but, not, the, psychic, itself, ;, this, is, why, also, it, is, indifferent, whether, we, hate, the, Other, 's, transcendence, t...",1943,2011,This is why one hates right through the revealed psychic but not the psychic itself; this is why also it is indifferent whether we hate the Other's transcendence through what we empirically call h...
3729,Being and Nothingness,Sartre,Existentialism,"(Etc, .)",1943,2011,Etc.
8607,Being and Nothingness,Sartre,Existentialism,"(Considered, at, its, proper, place, and, time, and, in, its, original, contingency, ,, this, appearance, of, the, body, does, not, seem, to, be, capable, of, giving, rise, to, new, problems, .)",1943,2011,"Considered at its proper place and time and in its original contingency, this appearance of the body does not seem to be capable of giving rise to new problems."
3403,Being and Nothingness,Sartre,Existentialism,"(Because, Presence, is, outside, of, itself, toward, something, lacking, which, is, beyond, the, world, ,, it, can, be, outside, itself, as, presence, to, an, in, itself, which, it, is, not, .)",1943,2011,"Because Presence is outside of itself toward something lacking which is beyond the world, it can be outside itself as presence to an in itself which it is not."


In [72]:
len(df)

14066

#### Remove Short Sentences

In [73]:
df['sentence_length'] = df['sentence_str'].map(lambda x: len(x))
num_of_short_entries = len(df[df['sentence_length'] < 20])
print(f"there are {num_of_short_entries} so-called sentences with fewer than 20 characters")
df[df['sentence_length'] < 20].sample(5)

there are 355 so-called sentences with fewer than 20 characters


Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length
8634,Being and Nothingness,Sartre,Existentialism,"(Tr, .)",1943,2011,Tr.,3
4414,Being and Nothingness,Sartre,Existentialism,"(Tr, .)",1943,2011,Tr.,3
9792,Being and Nothingness,Sartre,Existentialism,"(But, no, .)",1943,2011,But no.,7
10888,Being and Nothingness,Sartre,Existentialism,"(but, at, what, price, ?)",1943,2011,but at what price?,18
12762,Being and Nothingness,Sartre,Existentialism,"(Tr, .)",1943,2011,Tr.,3


In [74]:
df = df.drop(df[df['sentence_length'] < 20].index)
len(df)

13711

#### Remove Cases of Self-Mention

In [75]:
# change the author name in this cell 

self_mentions = df[df['sentence_str'].str.contains('\s'+'Kierkegaard'.lower())]
print(len(self_mentions))
self_mentions

0


Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length


In [76]:
df = df.drop(df[df['sentence_str'].str.contains('\s'+'Augustine'.lower())].index)

len(df)

13711

#### Deal with Duplicates

In [77]:
# find the total number of duplicates
len(df['sentence_str'])-len(df['sentence_str'].drop_duplicates())

18

In [78]:
doubles_df = pd.concat(g for _, g in df.groupby("sentence_str") if len(g) > 1)
doubles_df

Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length
521,Being and Nothingness,Sartre,Existentialism,"(Being, is, what, it, is, .)",1943,2011,Being is what it is.,20
5245,Being and Nothingness,Sartre,Existentialism,"(Being, is, what, it, is, .)",1943,2011,Being is what it is.,20
686,Being and Nothingness,Sartre,Existentialism,"(But, this, is, not, all, .)",1943,2011,But this is not all.,20
5976,Being and Nothingness,Sartre,Existentialism,"(But, this, is, not, all, .)",1943,2011,But this is not all.,20
13830,Being and Nothingness,Sartre,Existentialism,"(But, this, is, not, all, .)",1943,2011,But this is not all.,20
2471,Being and Nothingness,Sartre,Existentialism,"(Desire, is, a, lack, of, being, .)",1943,2011,Desire is a lack of being.,26
13173,Being and Nothingness,Sartre,Existentialism,"(Desire, is, a, lack, of, being, .)",1943,2011,Desire is a lack of being.,26
3847,Being and Nothingness,Sartre,Existentialism,"(Let, us, look, more, closely, .)",1943,2011,Let us look more closely.,25
10675,Being and Nothingness,Sartre,Existentialism,"(Let, us, look, more, closely, .)",1943,2011,Let us look more closely.,25
11370,Being and Nothingness,Sartre,Existentialism,"(Let, us, look, more, closely, .)",1943,2011,Let us look more closely.,25


In [79]:
df = df.drop(df[df['sentence_str'].duplicated(keep='first')].index)

In [80]:
len(df)

13693

#### Check for Foreign Languages

In [81]:
# checking for 'der', a common article in German
len((df[df['sentence_str'].str.contains('\sder\s')]))

1

In [82]:
df = df.drop(df[df['sentence_str'].str.contains('\sder\s')].index)

In [83]:
# checking for 'il', a common article in French
len(df[df['sentence_str'].str.contains('\sil\s')])

0

#### Some Ad Hoc Cleaning

In [84]:
# miscellaneous nonsense sentences
df = df.drop(df[df['sentence_str'].str.contains('\spp\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\stotam\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\srree\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sflir\s')].index)
df = df.drop(df[(df['sentence_str'].str.contains('\smodis\s')) & (df['author'] != 'Kant')].index)

len(df)

13692

In [85]:
# markers of french and notes
df = df.drop(df[df['sentence_str'].str.contains('\schapitre')].index)
df = df.drop(df[df['sentence_str'].str.contains('\salisme')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sHahn')].index)

len(df)

13692

In [86]:
# some notes in Kant
df = df.drop(df[df['sentence_str'].str.contains('\sVorl\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sberschwenglich')].index)

len(df)

13692

In [87]:
# a common phrase in Plato / Aristotle footnotes
df = df.drop(df[(df['author']=='Plato') & (df['sentence_str'].str.contains('(?i)reading')) & (df['sentence_length'] < 40)].index)
df = df.drop(df[(df['author']=='Aristotle') & (df['sentence_str'].str.contains('(?i)reading')) & (df['sentence_length'] < 40)].index)

len(df)

13692

In [88]:
# mentions of Aristotle in Plato
df = df.drop(df[(df['author']=='Plato') & df['sentence_str'].str.contains('Aristotle')].index)

len(df)

13692

### Lemmatize and Tokenize

In [89]:
from gensim.utils import simple_preprocess

# use gensim to tokenize sentences
df['tokenized_txt'] = df['sentence_str'].map(lambda x: simple_preprocess(x.lower(),deacc=True,
                                                        max_len=200))

# use spacey to get intelligent lemmatization
def lemmatize_sentence(sentence):
  lemmatized_txt = ''
  for word in sentence:
    lemmatized_txt += ' ' + str(word.lemma_)
  return lemmatized_txt

In [90]:
df['lemmatized_str'] = df['sentence_spacy'].apply(lemmatize_sentence)

In [91]:
df.sample(5)

Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length,tokenized_txt,lemmatized_str
7277,Being and Nothingness,Sartre,Existentialism,"(Thus, our, description, has, enabled, us, to, satisfy, the, preliminary, conditions, which, we, have, posited, for, any, theory, about, the, existence, of, the, Other, .)",1943,2011,Thus our description has enabled us to satisfy the preliminary conditions which we have posited for any theory about the existence of the Other.,144,"[thus, our, description, has, enabled, us, to, satisfy, the, preliminary, conditions, which, we, have, posited, for, any, theory, about, the, existence, of, the, other]",thus -PRON- description have enable -PRON- to satisfy the preliminary condition which -PRON- have posit for any theory about the existence of the other .
5414,Being and Nothingness,Sartre,Existentialism,"(There, are, certain, futures, in, the, world, which, are, defined, by, chance, and, become, autonomous, probables, ,, which, are, not, probabilized, but, which, are, as, probables, ,, as, fully, ...",1943,2011,"There are certain futures in the world which are defined by chance and become autonomous probables, which are not probabilized but which are as probables, as fully constituted nows, with their con...",238,"[there, are, certain, futures, in, the, world, which, are, defined, by, chance, and, become, autonomous, probables, which, are, not, probabilized, but, which, are, as, probables, as, fully, consti...","there be certain future in the world which be define by chance and become autonomous probable , which be not probabilize but which be as probable , as fully constitute now , with -PRON- content w..."
12300,Being and Nothingness,Sartre,Existentialism,"(,, then, anybody, at, all, can, die, in, my, place, as, in, the, song, in, which, lots, are, drawn, to, see, who, is, to, be, eaten, .)",1943,2011,", then anybody at all can die in my place as in the song in which lots are drawn to see who is to be eaten.",107,"[then, anybody, at, all, can, die, in, my, place, as, in, the, song, in, which, lots, are, drawn, to, see, who, is, to, be, eaten]",", then anybody at all can die in -PRON- place as in the song in which lot be draw to see who be to be eat ."
10645,Being and Nothingness,Sartre,Existentialism,"(The, for, itself, which, exists, in, the, voluntary, mode, wishes, to, recover, itself, in, so, far, as, it, decides, and, acts, .)",1943,2011,The for itself which exists in the voluntary mode wishes to recover itself in so far as it decides and acts.,108,"[the, for, itself, which, exists, in, the, voluntary, mode, wishes, to, recover, itself, in, so, far, as, it, decides, and, acts]",the for -PRON- which exist in the voluntary mode wish to recover -PRON- in so far as -PRON- decide and act .
4034,Being and Nothingness,Sartre,Existentialism,"(Reflection, is, a, second, effort, by, the, for, itself, to, found, itself, ;, that, is, ,, to, be, for, itself, what, it, is, .)",1943,2011,"Reflection is a second effort by the for itself to found itself; that is, to be for itself what it is.",102,"[reflection, is, second, effort, by, the, for, itself, to, found, itself, that, is, to, be, for, itself, what, it, is]","reflection be a second effort by the for -PRON- to find -PRON- ; that is , to be for -PRON- what -PRON- be ."


### Combine with the Old Dataframe & Export to CSV

In [96]:
# load the old version and check it out
og_df = pd.read_csv('/gdrive/MyDrive/Colab_Projects/philosophy_data_project/philosophy_data.csv')
og_df.sample(5)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,original_publication_date,corpus_edition_date,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
360179,Proslogion,Anselm,scholasticism,"this is what gives the impression of poverty found in 'secret diaries' or 'feminine autobiographies'; so occupied in flattering herself, the woman who does nothing becomes nothing and flatters a n...","this is what gives the impression of poverty found in 'secret diaries' or 'feminine autobiographies'; so occupied in flattering herself, the woman who does nothing becomes nothing and flatters a n...",1077,2000,203,"this is what gives the impression of poverty found in 'secret diaries' or 'feminine autobiographies'; so occupied in flattering herself, the woman who does nothing becomes nothing and flatters a n...","['this', 'is', 'what', 'gives', 'the', 'impression', 'of', 'poverty', 'found', 'in', 'secret', 'diaries', 'or', 'feminine', 'autobiographies', 'so', 'occupied', 'in', 'flattering', 'herself', 'the...","this be what give the impression of poverty find in ' secret diary ' or ' feminine autobiography ' ; so occupy in flatter -PRON- , the woman who do nothing become nothing and flatter a nothing ."
90708,Essay Concerning Human Understanding,Locke,empiricism,"As it is in the motions of the body, so it is in the thoughts of our minds: where any one is such, that we have power to take it up, or lay it by, according to the preference of the mind, there we...","As it is in the motions of the body, so it is in the thoughts of our minds: where any one is such, that we have power to take it up, or lay it by, according to the preference of the mind, there we...",1689,2004,212,"as it is in the motions of the body, so it is in the thoughts of our minds: where any one is such, that we have power to take it up, or lay it by, according to the preference of the mind, there we...","['as', 'it', 'is', 'in', 'the', 'motions', 'of', 'the', 'body', 'so', 'it', 'is', 'in', 'the', 'thoughts', 'of', 'our', 'minds', 'where', 'any', 'one', 'is', 'such', 'that', 'we', 'have', 'power',...","as -PRON- be in the motion of the body , so -PRON- be in the thought of -PRON- mind : where any one be such , that -PRON- have power to take -PRON- up , or lie -PRON- by , accord to the preferenc..."
266072,The System Of Ethics,Fichte,german_idealism,"In short, absolute sincerity and truthfulness is something I simply owe everyone; I am not permitted to say anything that contradicts the truth.","In short, absolute sincerity and truthfulness is something I simply owe everyone; I am not permitted to say anything that contradicts the truth.",1798,2005,144,"in short, absolute sincerity and truthfulness is something i simply owe everyone; i am not permitted to say anything that contradicts the truth.","['in', 'short', 'absolute', 'sincerity', 'and', 'truthfulness', 'is', 'something', 'simply', 'owe', 'everyone', 'am', 'not', 'permitted', 'to', 'say', 'anything', 'that', 'contradicts', 'the', 'tr...","in short , absolute sincerity and truthfulness be something -PRON- simply owe everyone ; -PRON- be not permit to say anything that contradict the truth ."
200591,The Order Of Things,Foucault,continental,It is no doubt difficult to maintain the human sciences that ethnology has a fundamental relation with historicity since it is traditionally the knowledge we have of peoples without histories; in ...,It is no doubt difficult to maintain the human sciences that ethnology has a fundamental relation with historicity since it is traditionally the knowledge we have of peoples without histories; in ...,1966,2002,358,it is no doubt difficult to maintain the human sciences that ethnology has a fundamental relation with historicity since it is traditionally the knowledge we have of peoples without histories; in ...,"['it', 'is', 'no', 'doubt', 'difficult', 'to', 'maintain', 'the', 'human', 'sciences', 'that', 'ethnology', 'has', 'fundamental', 'relation', 'with', 'historicity', 'since', 'it', 'is', 'tradition...",-PRON- be no doubt difficult to maintain the human science that ethnology have a fundamental relation with historicity since -PRON- be traditionally the knowledge -PRON- have of people without hi...
235114,Being And Time,Heidegger,phenomenology,"To say that it is illuminated means that it is cleared in itself as being in the world, not by another being, but in such a way that it is itself the clearing.","To say that it is illuminated means that it is cleared in itself as being in the world, not by another being, but in such a way that it is itself the clearing.",1927,1996,159,"to say that it is illuminated means that it is cleared in itself as being in the world, not by another being, but in such a way that it is itself the clearing.","['to', 'say', 'that', 'it', 'is', 'illuminated', 'means', 'that', 'it', 'is', 'cleared', 'in', 'itself', 'as', 'being', 'in', 'the', 'world', 'not', 'by', 'another', 'being', 'but', 'in', 'such', ...","to say that -PRON- be illuminate mean that -PRON- be clear in -PRON- as be in the world , not by another being , but in such a way that -PRON- be -PRON- the clearing ."


In [97]:
og_df['author'].value_counts(normalize=True)

Aristotle          0.129434
Plato              0.101803
Hegel              0.060234
Anselm             0.042660
Foucault           0.040439
Heidegger          0.040436
Kant               0.037478
Marx               0.035793
Lewis              0.034814
Malebranche        0.034487
Deleuze            0.033275
Kripke             0.033113
Smith              0.031027
Wittgenstein       0.023971
Locke              0.023576
Hume               0.022056
Merleau-Ponty      0.020145
Quine              0.019564
Nietzsche          0.018081
Derrida            0.015918
Davis              0.015698
Husserl            0.015236
Hobbes             0.014623
Fichte             0.014085
Russell            0.013461
Leibniz            0.013339
Seneca             0.013318
Popper             0.012413
Lenin              0.011858
Spinoza            0.010065
Moore              0.009733
Keynes             0.009051
Ricardo            0.008199
Beauvoir           0.008125
Berkeley           0.007255
Augustine          0

In [98]:
len(og_df)

376865

In [99]:
# append the new data
new_df = og_df.append(df)
new_df['author'].value_counts(normalize=True)

Aristotle          0.124896
Plato              0.098234
Hegel              0.058122
Anselm             0.041164
Foucault           0.039021
Heidegger          0.039019
Kant               0.036164
Sartre             0.035058
Marx               0.034538
Lewis              0.033593
Malebranche        0.033278
Deleuze            0.032108
Kripke             0.031952
Smith              0.029939
Wittgenstein       0.023131
Locke              0.022750
Hume               0.021282
Merleau-Ponty      0.019439
Quine              0.018878
Nietzsche          0.017447
Derrida            0.015360
Davis              0.015148
Husserl            0.014702
Hobbes             0.014111
Fichte             0.013591
Russell            0.012989
Leibniz            0.012871
Seneca             0.012851
Popper             0.011978
Lenin              0.011443
Spinoza            0.009712
Moore              0.009392
Keynes             0.008734
Ricardo            0.007912
Beauvoir           0.007840
Berkeley           0

In [100]:
new_df[new_df['author']=='Sartre'].sample(5)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,original_publication_date,corpus_edition_date,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
1207,Being and Nothingness,Sartre,Existentialism,"(The, earlier, resolution, of, ', not, playing, anymore, ', is, always, there, ,, and, in, the, majority, of, cases, the, gambler, when, in, the, presence, of, the, gaming, table, ,, turns, toward...","The earlier resolution of 'not playing anymore' is always there, and in the majority of cases the gambler when in the presence of the gaming table, turns toward it as if to ask it for help; for he...",1943,2011,385,,"[the, earlier, resolution, of, not, playing, anymore, is, always, there, and, in, the, majority, of, cases, the, gambler, when, in, the, presence, of, the, gaming, table, turns, toward, it, as, if...","the early resolution of ' not play anymore ' be always there , and in the majority of case the gambler when in the presence of the gaming table , turn toward -PRON- as if to ask -PRON- for help ;..."
10719,Being and Nothingness,Sartre,Existentialism,"(We, shall, see, in, the, next, chapter, what, is, the, meaning, of, the, word, having, and, to, what, extent, doing, is, a, method, of, appropriating, .)",We shall see in the next chapter what is the meaning of the word having and to what extent doing is a method of appropriating.,1943,2011,126,,"[we, shall, see, in, the, next, chapter, what, is, the, meaning, of, the, word, having, and, to, what, extent, doing, is, method, of, appropriating]",-PRON- shall see in the next chapter what be the meaning of the word have and to what extent do be a method of appropriate .
4006,Being and Nothingness,Sartre,Existentialism,"(that, the, reflected, on, is, an, appearance, for, the, reflective, ,, and, the, reflective, can, be, witness, only, in, so, far, as, it, is, consciousness, (, of, ), being, so, ;, that, is, ,, t...","that the reflected on is an appearance for the reflective, and the reflective can be witness only in so far as it is consciousness (of) being so; that is, to the exact extent that this witness, wh...",1943,2011,266,,"[that, the, reflected, on, is, an, appearance, for, the, reflective, and, the, reflective, can, be, witness, only, in, so, far, as, it, is, consciousness, of, being, so, that, is, to, the, exact, ...","that the reflect on be an appearance for the reflective , and the reflective can be witness only in so far as -PRON- be consciousness ( of ) be so ; that is , to the exact extent that this witnes..."
7364,Being and Nothingness,Sartre,Existentialism,"(But, it, can, not, be, both, at, the, same, time, .)",But it can not be both at the same time.,1943,2011,40,,"[but, it, can, not, be, both, at, the, same, time]",but -PRON- can not be both at the same time .
8857,Being and Nothingness,Sartre,Existentialism,"(The, lover, is, irritated, and, feels, himself, cheapened, when, he, thinks, that, the, beloved, has, chosen, him, from, among, others, ., ')",The lover is irritated and feels himself cheapened when he thinks that the beloved has chosen him from among others. ',1943,2011,118,,"[the, lover, is, irritated, and, feels, himself, cheapened, when, he, thinks, that, the, beloved, has, chosen, him, from, among, others]",the lover be irritated and feel -PRON- cheapen when -PRON- think that the beloved have choose -PRON- from among other . '


In [101]:
len(new_df)

390557

In [102]:
# export as csv
from google.colab import files
new_df.to_csv('phil_nlp.csv', index=False) 
files.download('phil_nlp.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

###Upload Data to the SQL Server

In [103]:
# prepare to upload to the PostgreSQL database

# note which dataframe you set this to - new_df for the whole dataset, df for 
# just the new text

for_db = new_df
for_db['date'] = for_db['original_publication_date']
for_db['date'] = for_db['date'].apply(lambda x: str(x)[1:]+' BC' if x < 0 else str(x))
for_db['sentence'] = for_db['sentence_str']
for_db['school'] = for_db['school'].apply(lambda x: x.replace('_', ' ').title())
for_db = for_db.drop(['sentence_spacy', 
                      'sentence_length',
                      'sentence_lowered', 
                      'sentence_str', 
                      'tokenized_txt', 
                      'lemmatized_str',
                      'corpus_edition_date',
                      'original_publication_date'], axis=1)
for_db.columns = [i.upper() for i in for_db.columns]

for_db.sample(5)

Unnamed: 0,TITLE,AUTHOR,SCHOOL,DATE,SENTENCE
373091,Leviathan,Hobbes,Hobbes,1651,"For though in the institution or acquisition of a Commonwealth, which is independent, there needs no writing, because the power of the representative has there no other bounds but such as are set ..."
272467,Science Of Logic,Hegel,German Idealism,1817,it is only one ordinate with only one determination and law for it.
226318,The Phenomenology Of Perception,Merleau-Ponty,Phenomenology,1945,so etwas wie Selbstbewusstsein kann.'
339383,The Second Sex,Beauvoir,Feminism,1949,"But I have already replied to this: Everything that proceeds from weakness, envy and revenge."
220410,The Phenomenology Of Perception,Merleau-Ponty,Phenomenology,1945,A psychology is always brought face to face with the problem of the constitution of the world.


In [104]:
len(for_db)

390557

In [105]:
#importing sql library 
from sqlalchemy import create_engine 
  
# create a reference  
# for sql library 
engine = create_engine('po',
                       echo=False)
  
# attach the data frame to the sql server 
for_db.to_sql('phil_nlp',          
              con = engine,
              if_exists='replace',
              index=False,
              method='multi') 
  
# show the completed data as a test
print(engine.execute("""SELECT * FROM phil_nlp WHERE "AUTHOR" = 'Hobbes'""").fetchone()) 

  """)


('Leviathan', 'Hobbes', 'Hobbes', '1651', 'All which qualities called sensible are in the object that causeth them')


Remember to add to the clipping and other elements to the notebook that creates the database as a whole. Then you're done!

In [106]:
print(engine.execute("""SELECT * FROM phil_nlp where "AUTHOR" = 'Sartre'""").fetchone()) 

('Being and Nothingness', 'Sartre', 'Existentialism', '1943', 'In vain do I fulfill the functions of a caf waiter.')
