<a href="https://colab.research.google.com/github/kcalizadeh/PDP_data_processing/blob/master/new_text_introduction_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports and Mounting Drive

In [1]:
# this cell mounts drive, sets the correct directory, then imports all functions
# and relevant libraries via the functions.py file
from google.colab import drive
import sys

drive.mount('/gdrive',force_remount=True)

drive_path = '/gdrive/MyDrive/Colab_Projects/philosophy_data_project'

sys.path.append(drive_path)

Mounted at /gdrive


In [2]:
from import_functions import *

In [3]:
import spacy.cli
spacy.cli.download("en_core_web_lg")
import en_core_web_lg
nlp = en_core_web_lg.load()

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


###Load the Text and Clip Front and End Matter

In [4]:
# if you are deleting an old text that was added here, make sure it is in the 
# primary database construction notebook
chuang_tzu_book = get_text(drive_path + '/phil_txts/chuang_tzu_book.txt')

In [15]:
chuang_tzu_book = chuang_tzu_book.split('thinkers the world has ever')[1][61:].split('1. The Illus')[0][:-2]

In [16]:
len(chuang_tzu_book)

530357

### Clean the Text



In [17]:
def baseline_clean(to_correct, 
                   capitals=True, 
                   bracketed_fn=False, 
                   odd_words_dict={}):
  # remove utf8 encoding characters and some punctuations
  result = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff\xad\x0c6§\\\£\Â*_<>""⎫•{}Γ~]', ' ', to_correct)
  result = re.sub(r'[\u2014\u2013\u2012-]', ' ', result)

  # replace whitespace characters with actual whitespace
  result = re.sub(r'\s', ' ', result)

  # replace odd quotation marks with a standard
  result = re.sub(r'[‘’“”]', "'", result)

  # replace the ﬀ, ﬃ and ﬁ with the appropriate counterparts
  result = re.sub(r'ﬀ', 'ff', result)
  result = re.sub(r'ﬁ', 'fi', result)
  result = re.sub(r'ﬃ', 'ffi', result)

  # replace some accented characters for ease of searching
  result = re.sub(r'é', 'e', result)
  result = re.sub(r'è', 'e', result)

  # remove or standardize some recurring common and meaninless words/phrases
  result = re.sub(r'\s*This\s*page\s*intentionally\s*left\s*blank\s*', ' ', result)
  result = re.sub(r'(?i)Aufgabe\s+', ' ', result)
  result = re.sub(r',*\s+cf\.', ' ', result)

  # some texts have footnotes conveniently in brackets - this removes them all, 
  # with a safety measure for unpaired brackets, and deletes all brackets afterwards
  if bracketed_fn:
    result = re.sub(r'\[.{0,300}\]|{.{0,300}}|{.{0,300}\]|\[.{0,300}}', ' ', result)
  result = re.sub(r'[\[\]{}]', ' ', result)

  # unify some abbreviations
  result = re.sub(r'&', 'and', result)
  result = re.sub(r'\se\.g\.\s', ' eg ', result)
  result = re.sub(r'\si\.e\.\s', ' ie ', result)
  result = re.sub('coroll\.', 'coroll', result)
  result = re.sub('pt\.', 'pt', result)

  # remove roman numerals, first capitalized ones
  result = re.sub(r'\s((I{2,}V*X*\.*)|(IV\.*)|(IX\.*)|(V\.*)|(V+I*\.*)|(X+L*V*I*]\.*))\s', ' ', result)
  # then lowercase
  result = re.sub(r'\s((i{2,}v*x*\.*)|(iv\.*)|(ix\.*)|(v\.*)|(v+i*\.*)|(x+l*v*i*\.*))\s', ' ', result)

  # remove periods and commas flanked by numbers
  result = re.sub(r'\d\.\d', ' ', result)
  result = re.sub(r'\d,\d', ' ', result)

  # remove the number-letter-number pattern used for many citations
  result = re.sub(r'\d*\w{,2}\d', ' ', result)

  # remove numerical characters
  result = re.sub(r'\d+', ' ', result)

  # remove words of 2+ characters that are entirely capitalized 
  # (these are almost always titles, headings, or speakers in a dialogue)
  # remove capital I's that follow capital words - these almost always roman numerals
  # some texts do use these capitalizations meaningfully, so we make this optional
  if capitals:
    result = re.sub(r'[A-Z]{2,}\s+I', ' ', result)
    result = re.sub(r'[A-Z]{2,}', ' ', result)

  # remove isolated colons and semicolons that result from removal of titles
  result = re.sub(r'\s+:\s*', ' ', result)
  result = re.sub(r'\s+;\s*', ' ', result)

  # remove isolated letters (do it several times because strings of isolated letters do not get captured properly)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)

  # remove isolated letters at the end of sentences or before commas
  result = re.sub(r'\s[^aI]\.', '.', result)
  result = re.sub(r'\s[^aI],', ',', result)

  # deal with spaces around periods and commas
  result = re.sub(r'\s+,\s+', ', ', result)
  result = re.sub(r'\s+\.\s+', '. ', result)

  # remove empty parantheses
  result = re.sub(r'(\(\s*\.*\s*\))|(\(\s*,*\s*)\)', ' ', result)
  result = re.sub(r'\.\)\.', '.', result)
  result = re.sub(r'\.\(\.', '.', result)

  # reduce multiple periods, commas, or whitespaces into a single one
  result = re.sub(r'\.+', '.', result)
  result = re.sub(r',+', ',', result)
  result = re.sub(r'\s+', ' ', result)

  # deal with isolated problem cases discovered in the data:
  for key in odd_words_dict.keys():
    result = re.sub(r''+key+'', odd_words_dict[key], result)

  return result

In [None]:
# note extras like bracketed footnotes or specific words to remove


In [18]:
# build a dictionary for the book
chuang_tzu_book_dict = {
    'author': 'Zhuangzi',
    'title': 'The Book of Zhuangzi ',
    'text': chuang_tzu_book,
    'school': 'Daoism',
    'words to remove': [],
    'remove capitals': True,
    'bracketed fn': False,
    'original date': -300,
    'corpus date': 2006
}

In [22]:
#@title Oddities Dictionary for Cleaning
# a dictionary of oddities to clean up
odd_words_dict = {'\sderstanding': 'derstanding',
                  '\sditference\s': ' difference ',
                  '\sforthe\s': ' for the ',
                  '\sject': 'ject',
                  '\sSure ly\s': ' Surely ',
                  '\spiness': 'piness',
                  '\sjects': 'jects', 
                  '\sness': 'ness',
                  '\schil dren\s': ' children ',
                  '\sper\scent\s': ' percent ',
                  '\sper\scent\.': ' percent.',
                  '\sper\scent,': ' percent,',
                  '\wi\son': 'ion',
                  '\spri\sori': ' priori',
                  '\stences\s': 'tences ',
                  '\sprincipleb': ' principle',
                  '\ssciousness': 'sciousness',
                  '\stion': 'tion',
                  '\spri\s': ' pri',
                  '\scluding': 'cluding',
                  '\sdom': 'dom',
                  '\sers': 'ers',
                  '\scritiq\s': ' critique ',
                  '\ssensati\s': ' sensation ',
                  '(?i)\syou\sll': " you'll",
                  '\sI\sll': " I'll",
                  '(?i)\swe\sll': " we'll",
                  '(?i)he\sll': " he'll",
                  '(?i)who\sll': "who'll",
                  '(?i)\sthere\sll\s': " there'll ",
                  '\seduca\s': ' education ',
                  '\slity\s': 'lity ',
                  '\smultaneously\s': 'multaneously ',
                  '\stically\s': 'tically ',
                  '\sDa\ssein\s': ' Dasein ',
                  '(?i)\sthey\sll\s': " they'll ",
                  '(?i)\sin\tum\s': ' in turn ',
                  '\scon~\s': ' con',
                  '\sà\s': ' a ',
                  '\sjor\s': ' for ',
                  '\sluminating\s': 'luminating ',
                  '\sselj\s': ' self ',
                  '\stial\s': 'tial ',
                  '\sversal\s': 'versal ',
                  '\sexis\st': ' exist',
                  '\splauded\s': 'plauded ',
                  '\suiry\s': 'uiry ',
                  '\svithin\s': ' within ',
                  '\soj\s': ' of ',
                  '\sposi\st': ' posit',
                  '\sra\sther\s': ' rather ',
                  '(?i)\sthat\sll\s': " that'll ",
                  '(?i)\sa\sll\s': ' all ',
                  '\so\sther\s': ' other ',
                  '\sra\sther\s': ' rather ',
                  '\snei\sther\s': ' neither ',
                  '\sei\sther\s': ' either ',
                  '\sfur\sther\s': ' further ',
                  '\sano\sther': ' another ',
                  '\sneces\s': ' neces',
                  'u\slar\s': 'ular ',
                  '\sference\s': 'ference ',
                  '(?i)it\sll\s': "it'll ",
                  '\stoge\sther': ' together ',
                  '\sknowledgeb\s': ' knowledge ',
                  'r\stain\s': 'rtain ',
                  'on\stain\s': 'ontain',
                  '(?i)j\sect\s': 'ject',
                  '\sob\sect\s': ' object ',
                  '\sbtle\s': 'btle ',
                  '\snition\s': 'nition ',
                  '\sdering\s': 'dering ', 
                  '\sized\s': 'ized ',
                  '\sther\shand': ' other hand',
                  '\ture\s': 'ture ',
                  '\sabso\sl': ' absol',
                  '\stly\s': 'tly ',
                  '\serty\s': 'erty ',
                  '\sobj\se': ' obj',
                  '\sffiir\s': ' for ',
                  '\sndeed\s': ' indeed ',
                  '\sfonn\s': ' form ',
                  '\snally\s': 'nally ',
                  'ain\sty\s': 'ainty ',
                  'ici\sty\s': 'icity ',
                  '\scog\sni': ' cogni',
                  '\sacc\s': ' acc',
                  '\sindi\svid\sual': ' individual', 
                  '\sintu\sit': ' intuit',
                  'r\sance\s': 'rance ',
                  '\ssions\s': 'sions ',
                  '\sances\s': 'ances ',
                  '\sper\sception\s': ' perception ',
                  '\sse\sries\s': ' series ',
                  '\sque\sries\s': ' queries ',
                  '\sessary\s': 'essary ',
                  '\sofa\s': ' of a ',
                  '\scer\stainty\s': ' certainty ',
                  'ec\stivity\s': 'ectivity ',
                  '\stivity\s': 'tivity ',
                  '\slation\s': 'lation ',
                  '\sir\sr': ' irr',
                  '\ssub\sstance\s': ' substance ',
                  'sec\sond\s': 'second ',
                  '\s\.rv': '',
                  '\story\s': 'tory ',
                  '\sture\s': 'ture ',
                  '\sminate\s': 'minate ',
                  '\sing\s': 'ing ',
                  '\splicity\s': 'plicity ',
                  '\ssimi\slar\s': ' similar ',
                  '\scom\smunity\s': ' community ',
                  '\sitselfa\s': ' itself a ',
                  '\ssimp\s': ' simply ',
                  '\scon\stex': ' contex',
                  '\scon\sseq': ' conseq',
                  '\scon\stai': ' contai',
                  '\sofwhat\s': ' of what ',
                  '\sui\s': 'ui',
                  '\sofan\s': ' of an ',
                  '\saccor\sdance\s': ' accordance ',
                  '\stranscen\sdental\s': ' transcendental ',
                  '\sap\spearances\s': ' appearances ',
                  'e\squences\s': 'equences ',
                  '\sorits\s': ' or its ',
                  '\simma\sn': ' imman',
                  '\seq\sua': ' equa',
                  '\simpl\sied\s': ' implied ',
                  '\sbuta\s': ' but a ',
                  '\sa\snd\s': ' and ',
                  '\sence\s': 'ence ',
                  '\stain\s': 'tain ',
                  '\sunder\sstanding\s': ' understanding ',
                  'i\sence\s': 'ience ',
                  'r\sence\s': 'rence ',
                  '\stical\s': 'tical ',
                  '\sobjectsb\s': ' objects ',
                  '\stbe\s': ' the ',
                  '\smul\st': ' mult',
                  '\sgen\seral\s': ' general ',
                  '\suniver\ssal\s': ' universal ',
                  '\scon\stent\s': ' content ',
                  '\spar\sticular\s': ' particular ',
                  'ver\ssity\s': 'versity ',
                  '\sCritiq\s': ' Critique ',
                  '\sphilo\ssophy\s': ' philosophy ',
                  '\seq\s': ' eq'}

In [23]:
# a function that takes the dictionary and returns a dataframe of sentences
def from_raw_to_df(text_dict):
  nlp.max_length = 9000000
  text = text_dict['text']
  text = remove_words(text, text_dict['words to remove'])
  text = baseline_clean(text, capitals=text_dict['remove capitals'],
                        bracketed_fn=text_dict['bracketed fn'],
                        odd_words_dict=odd_words_dict)
  text_nlp = nlp(text, disable=['ner'])
  text_df = pd.DataFrame(columns=['title', 'author', 'school', 'sentence_spacy'])
  text_df['sentence_spacy'] = list(text_nlp.sents)
  text_df['author'] = text_dict['author']
  text_df['title'] = text_dict['title']
  text_df['school'] = text_dict['school']
  text_df['original_publication_date'] = text_dict['original date']
  text_df['corpus_edition_date'] = text_dict['corpus date']
  text_df['sentence_str'] = text_df['sentence_spacy'].apply(lambda x: ''.join(list(str(x))))
  return text_df

In [24]:
# use the function
f_t_df = from_raw_to_df(chuang_tzu_book_dict)
df = f_t_df

In [25]:
# checking the result
pd.options.display.max_colwidth = 200
df.sample(10)

Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str
1406,The Book of Zhuangzi,Zhuangzi,Daoism,"(My, description, of, being, wise, has, nothing, to, do, with, benevolence, and, righteousness, ,, it, is, that, one, should, be, led, by, one, 's, innate, nature, ,, nothing, more, .)",-300,2006,"My description of being wise has nothing to do with benevolence and righteousness, it is that one should be led by one's innate nature, nothing more."
2857,The Book of Zhuangzi,Zhuangzi,Daoism,"(Or, would, he, rather, have, lived, and, continued, to, crawl, about, in, the, mud, ?, ')",-300,2006,Or would he rather have lived and continued to crawl about in the mud?'
2207,The Book of Zhuangzi,Zhuangzi,Daoism,"(If, the, nobleman, is, without, benevolence, ,, he, has, no, purpose, ;, if, without, righteousness, ,, he, has, no, life, .)",-300,2006,"If the nobleman is without benevolence, he has no purpose; if without righteousness, he has no life."
2389,The Book of Zhuangzi,Zhuangzi,Daoism,"(The, result, was, that, the, tree, was, chopped, down, in, Sung, ;, he, was, forced, out, of, Wei, ;, he, got, into, considerable, problems, in, Shang, and, Chou, .)",-300,2006,The result was that the tree was chopped down in Sung; he was forced out of Wei; he got into considerable problems in Shang and Chou.
454,The Book of Zhuangzi,Zhuangzi,Daoism,"(The, great, Way, does, n't, get, involved, like, this, ,, mixing, many, things, together, .)",-300,2006,"The great Way doesn't get involved like this, mixing many things together."
5417,The Book of Zhuangzi,Zhuangzi,Daoism,"(In, the, ways, in, which, they, affect, their, true, form, ,, and, change, their, innate, natures, ,, they, are, different, .)",-300,2006,"In the ways in which they affect their true form, and change their innate natures, they are different."
4813,The Book of Zhuangzi,Zhuangzi,Daoism,"(His, heart, was, troubled, ,, so, he, asked, the, diviner, ,, who, said, ,, ', Kill, the, turtle, and, use, it, to, make, divinations, and, receive, an, oracle, ., ')",-300,2006,"His heart was troubled, so he asked the diviner, who said, 'Kill the turtle and use it to make divinations and receive an oracle.'"
2414,The Book of Zhuangzi,Zhuangzi,Daoism,"(As, soon, as, they, saw, her, ,, the, wealthy, people, in, the, area, slammed, their, gates, shut, and, refused, to, venture, out, !)",-300,2006,"As soon as they saw her, the wealthy people in the area slammed their gates shut and refused to venture out!"
1256,The Book of Zhuangzi,Zhuangzi,Daoism,"(How, can, you, think, you, have, grasped, my, Tao, ?)",-300,2006,How can you think you have grasped my Tao?
248,The Book of Zhuangzi,Zhuangzi,Daoism,"(I, have, just, made, a, statement, ,, yet, I, do, not, know, whether, what, I, said, has, been, real, in, what, I, said, or, not, really, said, .)",-300,2006,"I have just made a statement, yet I do not know whether what I said has been real in what I said or not really said."


In [26]:
len(df)

6105

#### Remove Short Sentences

In [27]:
df['sentence_length'] = df['sentence_str'].map(lambda x: len(x))
num_of_short_entries = len(df[df['sentence_length'] < 20])
print(f"there are {num_of_short_entries} so-called sentences with fewer than 20 characters")
df[df['sentence_length'] < 20].sample(5)

there are 225 so-called sentences with fewer than 20 characters


Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length
3455,The Book of Zhuangzi,Zhuangzi,Daoism,"(Why, ?, ', ')",-300,2006,Why?' ',7
1149,The Book of Zhuangzi,Zhuangzi,Daoism,('),-300,2006,',1
1712,The Book of Zhuangzi,Zhuangzi,Daoism,('),-300,2006,',1
3078,The Book of Zhuangzi,Zhuangzi,Daoism,('),-300,2006,',1
5176,The Book of Zhuangzi,Zhuangzi,Daoism,"(I, do, n't, know, ., ')",-300,2006,I don't know.',14


In [28]:
df = df.drop(df[df['sentence_length'] < 20].index)
len(df)

5880

#### Remove Cases of Self-Mention

In [30]:
# change the author name in this cell 

self_mentions = df[df['sentence_str'].str.contains('\s'+'Chuang Tzu'.lower())]
print(len(self_mentions))
self_mentions

0


Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length


In [31]:
df = df.drop(df[df['sentence_str'].str.contains('\s'+'Augustine'.lower())].index)

len(df)

5880

#### Deal with Duplicates

In [32]:
# find the total number of duplicates
len(df['sentence_str'])-len(df['sentence_str'].drop_duplicates())

10

In [33]:
doubles_df = pd.concat(g for _, g in df.groupby("sentence_str") if len(g) > 1)
doubles_df

Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length
1272,The Book of Zhuangzi,Zhuangzi,Daoism,"(Bring, him, again, if, you, can, ., ')",-300,2006,Bring him again if you can.',28
1282,The Book of Zhuangzi,Zhuangzi,Daoism,"(Bring, him, again, if, you, can, ., ')",-300,2006,Bring him again if you can.',28
2206,The Book of Zhuangzi,Zhuangzi,Daoism,"(Certainly, ,, ', said, Confucius, ., ')",-300,2006,"Certainly,' said Confucius. '",29
3289,The Book of Zhuangzi,Zhuangzi,Daoism,"(Certainly, ,, ', said, Confucius, ., ')",-300,2006,"Certainly,' said Confucius. '",29
488,The Book of Zhuangzi,Zhuangzi,Daoism,"(Confucius, said, ,, ', Is, that, it, ?)",-300,2006,"Confucius said, 'Is that it?",28
512,The Book of Zhuangzi,Zhuangzi,Daoism,"(Confucius, said, ,, ', Is, that, it, ?)",-300,2006,"Confucius said, 'Is that it?",28
1167,The Book of Zhuangzi,Zhuangzi,Daoism,"(Confucius, said, ,, ', What, do, you, mean, ?, ', ')",-300,2006,"Confucius said, 'What do you mean?' '",37
1171,The Book of Zhuangzi,Zhuangzi,Daoism,"(Confucius, said, ,, ', What, do, you, mean, ?, ', ')",-300,2006,"Confucius said, 'What do you mean?' '",37
1472,The Book of Zhuangzi,Zhuangzi,Daoism,"(How, do, I, know, all, this, ?)",-300,2006,How do I know all this?,23
1488,The Book of Zhuangzi,Zhuangzi,Daoism,"(How, do, I, know, all, this, ?)",-300,2006,How do I know all this?,23


In [34]:
df = df.drop(df[df['sentence_str'].duplicated(keep='first')].index)

In [35]:
len(df)

5870

#### Check for Foreign Languages

In [36]:
# checking for 'der', a common article in German
len((df[df['sentence_str'].str.contains('\sder\s')]))

0

In [37]:
df = df.drop(df[df['sentence_str'].str.contains('\sder\s')].index)

In [38]:
# checking for 'il', a common article in French
len(df[df['sentence_str'].str.contains('\sil\s')])

0

#### Some Ad Hoc Cleaning

In [39]:
# miscellaneous nonsense sentences
df = df.drop(df[df['sentence_str'].str.contains('\spp\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\stotam\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\srree\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sflir\s')].index)
df = df.drop(df[(df['sentence_str'].str.contains('\smodis\s')) & (df['author'] != 'Kant')].index)

len(df)

5870

In [40]:
# markers of french and notes
df = df.drop(df[df['sentence_str'].str.contains('\schapitre')].index)
df = df.drop(df[df['sentence_str'].str.contains('\salisme')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sHahn')].index)

len(df)

5870

In [41]:
# some notes in Kant
df = df.drop(df[df['sentence_str'].str.contains('\sVorl\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sberschwenglich')].index)

len(df)

5870

In [42]:
# a common phrase in Plato / Aristotle footnotes
df = df.drop(df[(df['author']=='Plato') & (df['sentence_str'].str.contains('(?i)reading')) & (df['sentence_length'] < 40)].index)
df = df.drop(df[(df['author']=='Aristotle') & (df['sentence_str'].str.contains('(?i)reading')) & (df['sentence_length'] < 40)].index)

len(df)

5870

In [43]:
# mentions of Aristotle in Plato
df = df.drop(df[(df['author']=='Plato') & df['sentence_str'].str.contains('Aristotle')].index)

len(df)

5870

### Lemmatize and Tokenize

In [44]:
from gensim.utils import simple_preprocess

# use gensim to tokenize sentences
df['tokenized_txt'] = df['sentence_str'].map(lambda x: simple_preprocess(x.lower(),deacc=True,
                                                        max_len=200))

# use spacey to get intelligent lemmatization
def lemmatize_sentence(sentence):
  lemmatized_txt = ''
  for word in sentence:
    lemmatized_txt += ' ' + str(word.lemma_)
  return lemmatized_txt

In [45]:
df['lemmatized_str'] = df['sentence_spacy'].apply(lemmatize_sentence)

In [57]:
df.sample(5)

Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length,tokenized_txt,lemmatized_str
547,The Book of Zhuangzi,Zhuangzi,Daoism,"(But, if, you, can, not, remain, still, ,, then, your, mind, goes, racing, off, ,, even, though, physically, you, remain, sitting, .)",-300,2006,"But if you cannot remain still, then your mind goes racing off, even though physically you remain sitting.",106,"[but, if, you, cannot, remain, still, then, your, mind, goes, racing, off, even, though, physically, you, remain, sitting]","but if -PRON- can not remain still , then -PRON- mind go race off , even though physically -PRON- remain sit ."
6060,The Book of Zhuangzi,Zhuangzi,Daoism,"(His, vision, of, the, origin, is, vast, and, penetrating, ,, ever, expanding, and, open, minded, ,, unshackled, by, anything, or, anybody, .)",-300,2006,"His vision of the origin is vast and penetrating, ever expanding and open minded, unshackled by anything or anybody.",116,"[his, vision, of, the, origin, is, vast, and, penetrating, ever, expanding, and, open, minded, unshackled, by, anything, or, anybody]","-PRON- vision of the origin be vast and penetrating , ever expand and open minded , unshackle by anything or anybody ."
3844,The Book of Zhuangzi,Zhuangzi,Daoism,"(The, Tao, has, no, name, ., ')",-300,2006,The Tao has no name.',21,"[the, tao, has, no, name]",the Tao have no name . '
5581,The Book of Zhuangzi,Zhuangzi,Daoism,"(In, the, middle, lands, it, restores, harmony, to, the, people, and, is, in, balance, with, the, four, directions, .)",-300,2006,In the middle lands it restores harmony to the people and is in balance with the four directions.,97,"[in, the, middle, lands, it, restores, harmony, to, the, people, and, is, in, balance, with, the, four, directions]",in the middle land -PRON- restore harmony to the people and be in balance with the four direction .
2677,The Book of Zhuangzi,Zhuangzi,Daoism,"(The, time, since, they, were, born, is, nothing, in, comparison, to, the, time, before, they, were, born, .)",-300,2006,The time since they were born is nothing in comparison to the time before they were born.,89,"[the, time, since, they, were, born, is, nothing, in, comparison, to, the, time, before, they, were, born]",the time since -PRON- be bear be nothing in comparison to the time before -PRON- be bear .


### Combine with the Old Dataframe & Export to CSV

In [47]:
# load the old version and check it out
og_df = pd.read_csv('/gdrive/MyDrive/Colab_Projects/philosophy_data_project/philosophy_data.csv')
og_df.sample(5)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,original_publication_date,corpus_edition_date,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
232130,The Idea Of Phenomenology,Husserl,phenomenology,"Here we are not speaking of the acts of abstraction that occur in the psychological subject, and the psychological conditions under which they are performed.","Here we are not speaking of the acts of abstraction that occur in the psychological subject, and the psychological conditions under which they are performed.",1907,1999,157,"here we are not speaking of the acts of abstraction that occur in the psychological subject, and the psychological conditions under which they are performed.","['here', 'we', 'are', 'not', 'speaking', 'of', 'the', 'acts', 'of', 'abstraction', 'that', 'occur', 'in', 'the', 'psychological', 'subject', 'and', 'the', 'psychological', 'conditions', 'under', '...","here -PRON- be not speak of the act of abstraction that occur in the psychological subject , and the psychological condition under which -PRON- be perform ."
174170,Philosophical Troubles,Kripke,analytic,"Now, this is what I think in fact is true for those cases of naming where the reference is fixed by description.","Now, this is what I think in fact is true for those cases of naming where the reference is fixed by description.",1975,2011,112,"now, this is what i think in fact is true for those cases of naming where the reference is fixed by description.","['now', 'this', 'is', 'what', 'think', 'in', 'fact', 'is', 'true', 'for', 'those', 'cases', 'of', 'naming', 'where', 'the', 'reference', 'is', 'fixed', 'by', 'description']","now , this be what -PRON- think in fact be true for those case of name where the reference be fix by description ."
257017,Critique Of Pure Reason,Kant,german_idealism,"But the case is the same here as with other pure a priori rep resentations (e.g., space and time) that we can extract as clear concepts from experience only because we have put them into experienc...","But the case is the same here as with other pure a priori rep resentations (e.g., space and time) that we can extract as clear concepts from experience only because we have put them into experienc...",1781,1998,223,"but the case is the same here as with other pure a priori rep resentations (e.g., space and time) that we can extract as clear concepts from experience only because we have put them into experienc...","['but', 'the', 'case', 'is', 'the', 'same', 'here', 'as', 'with', 'other', 'pure', 'priori', 'rep', 'resentations', 'space', 'and', 'time', 'that', 'we', 'can', 'extract', 'as', 'clear', 'concepts...","but the case be the same here as with other pure a priori rep resentation ( e.g. , space and time ) that -PRON- can extract as clear concept from experience only because -PRON- have put -PRON- in..."
316673,The Wealth Of Nations,Smith,capitalism,"Under the local or provincial administration of the justices of the peace in Great Britain, the six days labour which the country people are obliged to give to the reparation of the highways, is n...","Under the local or provincial administration of the justices of the peace in Great Britain, the six days labour which the country people are obliged to give to the reparation of the highways, is n...",1776,2009,319,"under the local or provincial administration of the justices of the peace in great britain, the six days labour which the country people are obliged to give to the reparation of the highways, is n...","['under', 'the', 'local', 'or', 'provincial', 'administration', 'of', 'the', 'justices', 'of', 'the', 'peace', 'in', 'great', 'britain', 'the', 'six', 'days', 'labour', 'which', 'the', 'country', ...","under the local or provincial administration of the justice of the peace in Great Britain , the six day labour which the country people be oblige to give to the reparation of the highway , be not..."
70185,Aristotle - Complete Works,Aristotle,aristotle,"Why is it that, though in the rest of the body the left side is weaker than the right, this is not true of the eyes, but the sight of both eyes is equally acute?","Why is it that, though in the rest of the body the left side is weaker than the right, this is not true of the eyes, but the sight of both eyes is equally acute?",-320,1991,161,"why is it that, though in the rest of the body the left side is weaker than the right, this is not true of the eyes, but the sight of both eyes is equally acute?","['why', 'is', 'it', 'that', 'though', 'in', 'the', 'rest', 'of', 'the', 'body', 'the', 'left', 'side', 'is', 'weaker', 'than', 'the', 'right', 'this', 'is', 'not', 'true', 'of', 'the', 'eyes', 'bu...","why be -PRON- that , though in the rest of the body the left side be weak than the right , this be not true of the eye , but the sight of both eye be equally acute ?"


In [48]:
og_df['author'].value_counts(normalize=True)

Aristotle          0.124896
Plato              0.098234
Hegel              0.058122
Anselm             0.041164
Foucault           0.039021
Heidegger          0.039019
Kant               0.036164
Sartre             0.035058
Marx               0.034538
Lewis              0.033593
Malebranche        0.033278
Deleuze            0.032108
Kripke             0.031952
Smith              0.029939
Wittgenstein       0.023131
Locke              0.022750
Hume               0.021282
Merleau-Ponty      0.019439
Quine              0.018878
Nietzsche          0.017447
Derrida            0.015360
Davis              0.015148
Husserl            0.014702
Hobbes             0.014111
Fichte             0.013591
Russell            0.012989
Leibniz            0.012871
Seneca             0.012851
Popper             0.011978
Lenin              0.011443
Spinoza            0.009712
Moore              0.009392
Keynes             0.008734
Ricardo            0.007912
Beauvoir           0.007840
Berkeley           0

In [None]:
len(og_df)

376865

In [49]:
# append the new data
new_df = og_df.append(df)
new_df['author'].value_counts(normalize=True)

Aristotle          0.123047
Plato              0.096779
Hegel              0.057261
Anselm             0.040555
Foucault           0.038443
Heidegger          0.038441
Kant               0.035628
Sartre             0.034539
Marx               0.034026
Lewis              0.033096
Malebranche        0.032785
Deleuze            0.031633
Kripke             0.031479
Smith              0.029496
Wittgenstein       0.022789
Locke              0.022413
Hume               0.020967
Merleau-Ponty      0.019151
Quine              0.018599
Nietzsche          0.017189
Derrida            0.015133
Davis              0.014923
Zhuangzi           0.014807
Husserl            0.014484
Hobbes             0.013902
Fichte             0.013390
Russell            0.012797
Leibniz            0.012681
Seneca             0.012661
Popper             0.011800
Lenin              0.011273
Spinoza            0.009568
Moore              0.009253
Keynes             0.008604
Ricardo            0.007795
Beauvoir           0

In [50]:
new_df[new_df['author']=='Zhuangzi'].sample(5)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,original_publication_date,corpus_edition_date,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
6103,The Book of Zhuangzi,Zhuangzi,Daoism,"(Racing, after, the, multitude, of, things, in, this, world, and, never, returning, ,, he, was, indeed, like, someone, who, tries, to, have, the, last, word, with, an, echo, ,, or, who, tries, to,...","Racing after the multitude of things in this world and never returning, he was indeed like someone who tries to have the last word with an echo, or who tries to show that you can outrun your shadow.",-300,2006,198,,"[racing, after, the, multitude, of, things, in, this, world, and, never, returning, he, was, indeed, like, someone, who, tries, to, have, the, last, word, with, an, echo, or, who, tries, to, show,...","racing after the multitude of thing in this world and never return , -PRON- be indeed like someone who try to have the last word with an echo , or who try to show that -PRON- can outrun -PRON- sh..."
1541,The Book of Zhuangzi,Zhuangzi,Daoism,"(A, great, deal, of, knowledge, is, needed, to, make, fishing, lines, ,, traps, ,, baits, and, hooks, ,, but, the, result, is, that, the, fish, disperse, in, distress, in, the, water, .)","A great deal of knowledge is needed to make fishing lines, traps, baits and hooks, but the result is that the fish disperse in distress in the water.",-300,2006,149,,"[great, deal, of, knowledge, is, needed, to, make, fishing, lines, traps, baits, and, hooks, but, the, result, is, that, the, fish, disperse, in, distress, in, the, water]","a great deal of knowledge be need to make fishing line , trap , bait and hook , but the result be that the fish disperse in distress in the water ."
3094,The Book of Zhuangzi,Zhuangzi,Daoism,"(Surely, you, 're, with, that, ,, are, n't, you, ?, ')","Surely you're with that, aren't you?'",-300,2006,37,,"[surely, you, re, with, that, aren, you]","surely -PRON- be with that , be not -PRON- ? '"
440,The Book of Zhuangzi,Zhuangzi,Daoism,"(The, ancient, ones, considered, this, the, work, of, the, gods, who, free, us, from, bondage, ., ')",The ancient ones considered this the work of the gods who free us from bondage. ',-300,2006,81,,"[the, ancient, ones, considered, this, the, work, of, the, gods, who, free, us, from, bondage]",the ancient one consider this the work of the god who free -PRON- from bondage . '
2519,The Book of Zhuangzi,Zhuangzi,Daoism,"(They, 've, done, nothing, !)",They've done nothing!,-300,2006,21,,"[they, ve, done, nothing]",-PRON- have do nothing !


In [51]:
len(new_df)

396427

In [52]:
# export as csv
from google.colab import files
new_df.to_csv('phil_nlp.csv', index=False) 
files.download('phil_nlp.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

###Upload Data to the SQL Server

In [53]:
# prepare to upload to the PostgreSQL database

# note which dataframe you set this to - new_df for the whole dataset, df for 
# just the new text

for_db = new_df
for_db['date'] = for_db['original_publication_date']
for_db['date'] = for_db['date'].apply(lambda x: str(x)[1:]+' BC' if x < 0 else str(x))
for_db['sentence'] = for_db['sentence_str']
for_db['school'] = for_db['school'].apply(lambda x: x.replace('_', ' ').title())
for_db = for_db.drop(['sentence_spacy', 
                      'sentence_length',
                      'sentence_lowered', 
                      'sentence_str', 
                      'tokenized_txt', 
                      'lemmatized_str',
                      'corpus_edition_date',
                      'original_publication_date'], axis=1)
for_db.columns = [i.upper() for i in for_db.columns]

for_db.sample(5)

Unnamed: 0,TITLE,AUTHOR,SCHOOL,DATE,SENTENCE
345604,"Women, Race, And Class",Davis,Feminism,1981,"Weep ye no more, Pallid Dudu!"
117033,The Search After Truth,Malebranche,Rationalism,1674,"I am not surprised that ordinary men or pagan philosophen consider only the soul's relation and union with the body, without recognizing lbe relation and union"
68361,Aristotle - Complete Works,Aristotle,Aristotle,320 BC,"The ray from, the line will fall outside the line and the shadow is formed when the sun is higher in the heavens, and when it is lower, and it will be shortest wh the sun is at its highest and ove..."
124442,The Search After Truth,Malebranche,Rationalism,1674,"The lines and figures of geometry, then, are well suited for representing relations between magnitudes to the imagination, or between things that vary quantitatively, such as space, time, weight, ..."
387813,Being and Nothingness,Sartre,Existentialism,1943,This argument has never greatly troubled the partisans of human freedom.


In [54]:
len(for_db)

396427

In [55]:
#importing sql library 
from sqlalchemy import create_engine 
  
# create a reference  
# for sql library 
engine = create_engine('p7joo',
                       echo=False)
  
# attach the data frame to the sql server 
for_db.to_sql('phil_nlp',          
              con = engine,
              if_exists='replace',
              index=False,
              method='multi') 
  
# show the completed data as a test
print(engine.execute("""SELECT * FROM phil_nlp WHERE "AUTHOR" = 'Zhuangzi'""").fetchone()) 

  """)


('The Book of Zhuangzi ', 'Zhuangzi', 'Daoism', '300 BC', 'Can you be a little baby?')


Remember to add to the clipping and other elements to the notebook that creates the database as a whole. Then you're done!

In [None]:
print(engine.execute("""SELECT * FROM phil_nlp where "AUTHOR" = 'Sartre'""").fetchone()) 

('Being and Nothingness', 'Sartre', 'Existentialism', '1943', 'In vain do I fulfill the functions of a caf waiter.')
