<a href="https://colab.research.google.com/github/kcalizadeh/PDP_data_processing/blob/master/new_text_introduction_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports and Mounting Drive

In [1]:
# this cell mounts drive, sets the correct directory, then imports all functions
# and relevant libraries via the functions.py file
from google.colab import drive
import sys

drive.mount('/gdrive',force_remount=True)

drive_path = '/gdrive/MyDrive/Colab_Projects/philosophy_data_project'

sys.path.append(drive_path)

Mounted at /gdrive


In [2]:
from import_functions import *

In [3]:
import spacy.cli
spacy.cli.download("en_core_web_lg")
import en_core_web_lg
nlp = en_core_web_lg.load()

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


###Load the Text and Clip Front and End Matter

In [4]:
# if you are deleting an old text that was added here, make sure it is in the 
# primary database construction notebook
seneca_all_texts = get_guten('http://www.gutenberg.org/files/56075/56075-0.txt')

In [30]:
seneca_on_benefits = seneca_all_texts.split('SENECA OF BENEFITS')[1][54:].split('SENECA OF ANGER')[0]
seneca_on_anger = seneca_all_texts.split('SENECA OF ANGER')[1][95:].split('SENECA OF CLEMENCY')[0]
seneca_on_clemency = seneca_all_texts.split('SENECA OF CLEMENCY')[1][7:].split('Obvious typographical errors have been corre')[0][:-80]


### Clean the Text



In [31]:
def baseline_clean(to_correct, 
                   capitals=True, 
                   bracketed_fn=False, 
                   odd_words_dict={}):
  # remove utf8 encoding characters and some punctuations
  result = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff\xad\x0c6§\\\£\Â*_<>""⎫•{}Γ~]', ' ', to_correct)
  result = re.sub(r'[\u2014\u2013\u2012-]', ' ', result)

  # replace whitespace characters with actual whitespace
  result = re.sub(r'\s', ' ', result)

  # replace odd quotation marks with a standard
  result = re.sub(r'[‘’“”]', "'", result)

  # replace the ﬀ, ﬃ and ﬁ with the appropriate counterparts
  result = re.sub(r'ﬀ', 'ff', result)
  result = re.sub(r'ﬁ', 'fi', result)
  result = re.sub(r'ﬃ', 'ffi', result)

  # replace some accented characters for ease of searching
  result = re.sub(r'é', 'e', result)

  # remove or standardize some recurring common and meaninless words/phrases
  result = re.sub(r'\s*This\s*page\s*intentionally\s*left\s*blank\s*', ' ', result)
  result = re.sub(r'(?i)Aufgabe\s+', ' ', result)
  result = re.sub(r',*\s+cf\.', ' ', result)

  # some texts have footnotes conveniently in brackets - this removes them all, 
  # with a safety measure for unpaired brackets, and deletes all brackets afterwards
  if bracketed_fn:
    result = re.sub(r'\[.{0,300}\]|{.{0,300}}|{.{0,300}\]|\[.{0,300}}', ' ', result)
  result = re.sub(r'[\[\]{}]', ' ', result)

  # unify some abbreviations
  result = re.sub(r'&', 'and', result)
  result = re.sub(r'\se\.g\.\s', ' eg ', result)
  result = re.sub(r'\si\.e\.\s', ' ie ', result)
  result = re.sub('coroll\.', 'coroll', result)
  result = re.sub('pt\.', 'pt', result)

  # remove roman numerals, first capitalized ones
  result = re.sub(r'\s((I{2,}V*X*\.*)|(IV\.*)|(IX\.*)|(V\.*)|(V+I*\.*)|(X+L*V*I*]\.*))\s', ' ', result)
  # then lowercase
  result = re.sub(r'\s((i{2,}v*x*\.*)|(iv\.*)|(ix\.*)|(v\.*)|(v+i*\.*)|(x+l*v*i*\.*))\s', ' ', result)

  # remove periods and commas flanked by numbers
  result = re.sub(r'\d\.\d', ' ', result)
  result = re.sub(r'\d,\d', ' ', result)

  # remove the number-letter-number pattern used for many citations
  result = re.sub(r'\d*\w{,2}\d', ' ', result)

  # remove numerical characters
  result = re.sub(r'\d+', ' ', result)

  # remove words of 2+ characters that are entirely capitalized 
  # (these are almost always titles, headings, or speakers in a dialogue)
  # remove capital I's that follow capital words - these almost always roman numerals
  # some texts do use these capitalizations meaningfully, so we make this optional
  if capitals:
    result = re.sub(r'[A-Z]{2,}\s+I', ' ', result)
    result = re.sub(r'[A-Z]{2,}', ' ', result)

  # remove isolated colons and semicolons that result from removal of titles
  result = re.sub(r'\s+:\s*', ' ', result)
  result = re.sub(r'\s+;\s*', ' ', result)

  # remove isolated letters (do it several times because strings of isolated letters do not get captured properly)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)

  # remove isolated letters at the end of sentences or before commas
  result = re.sub(r'\s[^aI]\.', '.', result)
  result = re.sub(r'\s[^aI],', ',', result)

  # deal with spaces around periods and commas
  result = re.sub(r'\s+,\s+', ', ', result)
  result = re.sub(r'\s+\.\s+', '. ', result)

  # remove empty parantheses
  result = re.sub(r'(\(\s*\.*\s*\))|(\(\s*,*\s*)\)', ' ', result)
  result = re.sub(r'\.\)\.', '.', result)
  result = re.sub(r'\.\(\.', '.', result)

  # reduce multiple periods, commas, or whitespaces into a single one
  result = re.sub(r'\.+', '.', result)
  result = re.sub(r',+', ',', result)
  result = re.sub(r'\s+', ' ', result)

  # deal with isolated problem cases discovered in the data:
  for key in odd_words_dict.keys():
    result = re.sub(r''+key+'', odd_words_dict[key], result)

  return result

In [32]:
# note extras like bracketed footnotes or specific words to remove


In [72]:
# build a dictionary for the book
seneca_on_benefits_dict = {
    'author': 'Seneca',
    'title': 'On Benefits',
    'text': seneca_on_benefits,
    'school': 'Stoicism',
    'words to remove': [],
    'remove capitals': True,
    'bracketed fn': False,
    'original date': 59,
    'corpus date': 2017
}

seneca_on_anger_dict = {
    'author': 'Seneca',
    'title': 'On Anger',
    'text': seneca_on_anger,
    'school': 'Stoicism',
    'words to remove': [],
    'remove capitals': True,
    'bracketed fn': False,
    'original date': 45,
    'corpus date': 2017
}

seneca_on_clemency_dict = {
    'author': 'Seneca',
    'title': 'On Clemency',
    'text': seneca_on_clemency,
    'school': 'Stoicism',
    'words to remove': [],
    'remove capitals': True,
    'bracketed fn': False,
    'original date': 55,
    'corpus date': 2017
}

In [73]:
#@title Oddities Dictionary for Cleaning
# a dictionary of oddities to clean up
odd_words_dict = {'\sderstanding': 'derstanding',
                  '\sditference\s': ' difference ',
                  '\sforthe\s': ' for the ',
                  '\sject': 'ject',
                  '\sSure ly\s': ' Surely ',
                  '\spiness': 'piness',
                  '\sjects': 'jects', 
                  '\sness': 'ness',
                  '\schil dren\s': ' children ',
                  '\sper\scent\s': ' percent ',
                  '\sper\scent\.': ' percent.',
                  '\sper\scent,': ' percent,',
                  '\wi\son': 'ion',
                  '\spri\sori': ' priori',
                  '\stences\s': 'tences ',
                  '\sprincipleb': ' principle',
                  '\ssciousness': 'sciousness',
                  '\stion': 'tion',
                  '\spri\s': ' pri',
                  '\scluding': 'cluding',
                  '\sdom': 'dom',
                  '\sers': 'ers',
                  '\scritiq\s': ' critique ',
                  '\ssensati\s': ' sensation ',
                  '(?i)\syou\sll': " you'll",
                  '\sI\sll': " I'll",
                  '(?i)\swe\sll': " we'll",
                  '(?i)he\sll': " he'll",
                  '(?i)who\sll': "who'll",
                  '(?i)\sthere\sll\s': " there'll ",
                  '\seduca\s': ' education ',
                  '\slity\s': 'lity ',
                  '\smultaneously\s': 'multaneously ',
                  '\stically\s': 'tically ',
                  '\sDa\ssein\s': ' Dasein ',
                  '(?i)\sthey\sll\s': " they'll ",
                  '(?i)\sin\tum\s': ' in turn ',
                  '\scon~\s': ' con',
                  '\sà\s': ' a ',
                  '\sjor\s': ' for ',
                  '\sluminating\s': 'luminating ',
                  '\sselj\s': ' self ',
                  '\stial\s': 'tial ',
                  '\sversal\s': 'versal ',
                  '\sexis\st': ' exist',
                  '\splauded\s': 'plauded ',
                  '\suiry\s': 'uiry ',
                  '\svithin\s': ' within ',
                  '\soj\s': ' of ',
                  '\sposi\st': ' posit',
                  '\sra\sther\s': ' rather ',
                  '(?i)\sthat\sll\s': " that'll ",
                  '(?i)\sa\sll\s': ' all ',
                  '\so\sther\s': ' other ',
                  '\sra\sther\s': ' rather ',
                  '\snei\sther\s': ' neither ',
                  '\sei\sther\s': ' either ',
                  '\sfur\sther\s': ' further ',
                  '\sano\sther': ' another ',
                  '\sneces\s': ' neces',
                  'u\slar\s': 'ular ',
                  '\sference\s': 'ference ',
                  '(?i)it\sll\s': "it'll ",
                  '\stoge\sther': ' together ',
                  '\sknowledgeb\s': ' knowledge ',
                  'r\stain\s': 'rtain ',
                  'on\stain\s': 'ontain',
                  '(?i)j\sect\s': 'ject',
                  '\sob\sect\s': ' object ',
                  '\sbtle\s': 'btle ',
                  '\snition\s': 'nition ',
                  '\sdering\s': 'dering ', 
                  '\sized\s': 'ized ',
                  '\sther\shand': ' other hand',
                  '\ture\s': 'ture ',
                  '\sabso\sl': ' absol',
                  '\stly\s': 'tly ',
                  '\serty\s': 'erty ',
                  '\sobj\se': ' obj',
                  '\sffiir\s': ' for ',
                  '\sndeed\s': ' indeed ',
                  '\sfonn\s': ' form ',
                  '\snally\s': 'nally ',
                  'ain\sty\s': 'ainty ',
                  'ici\sty\s': 'icity ',
                  '\scog\sni': ' cogni',
                  '\sacc\s': ' acc',
                  '\sindi\svid\sual': ' individual', 
                  '\sintu\sit': ' intuit',
                  'r\sance\s': 'rance ',
                  '\ssions\s': 'sions ',
                  '\sances\s': 'ances ',
                  '\sper\sception\s': ' perception ',
                  '\sse\sries\s': ' series ',
                  '\sque\sries\s': ' queries ',
                  '\sessary\s': 'essary ',
                  '\sofa\s': ' of a ',
                  '\scer\stainty\s': ' certainty ',
                  'ec\stivity\s': 'ectivity ',
                  '\stivity\s': 'tivity ',
                  '\slation\s': 'lation ',
                  '\sir\sr': ' irr',
                  '\ssub\sstance\s': ' substance ',
                  'sec\sond\s': 'second ',
                  '\s\.rv': '',
                  '\story\s': 'tory ',
                  '\sture\s': 'ture ',
                  '\sminate\s': 'minate ',
                  '\sing\s': 'ing ',
                  '\splicity\s': 'plicity ',
                  '\ssimi\slar\s': ' similar ',
                  '\scom\smunity\s': ' community ',
                  '\sitselfa\s': ' itself a ',
                  '\ssimp\s': ' simply ',
                  '\scon\stex': ' contex',
                  '\scon\sseq': ' conseq',
                  '\scon\stai': ' contai',
                  '\sofwhat\s': ' of what ',
                  '\sui\s': 'ui',
                  '\sofan\s': ' of an ',
                  '\saccor\sdance\s': ' accordance ',
                  '\stranscen\sdental\s': ' transcendental ',
                  '\sap\spearances\s': ' appearances ',
                  'e\squences\s': 'equences ',
                  '\sorits\s': ' or its ',
                  '\simma\sn': ' imman',
                  '\seq\sua': ' equa',
                  '\simpl\sied\s': ' implied ',
                  '\sbuta\s': ' but a ',
                  '\sa\snd\s': ' and ',
                  '\sence\s': 'ence ',
                  '\stain\s': 'tain ',
                  '\sunder\sstanding\s': ' understanding ',
                  'i\sence\s': 'ience ',
                  'r\sence\s': 'rence ',
                  '\stical\s': 'tical ',
                  '\sobjectsb\s': ' objects ',
                  '\stbe\s': ' the ',
                  '\smul\st': ' mult',
                  '\sgen\seral\s': ' general ',
                  '\suniver\ssal\s': ' universal ',
                  '\scon\stent\s': ' content ',
                  '\spar\sticular\s': ' particular ',
                  'ver\ssity\s': 'versity ',
                  '\sCritiq\s': ' Critique ',
                  '\sphilo\ssophy\s': ' philosophy ',
                  '\seq\s': ' eq'}

In [74]:
# a function that takes the dictionary and returns a dataframe of sentences
def from_raw_to_df(text_dict):
  nlp.max_length = 9000000
  text = text_dict['text']
  text = remove_words(text, text_dict['words to remove'])
  text = baseline_clean(text, capitals=text_dict['remove capitals'],
                        bracketed_fn=text_dict['bracketed fn'],
                        odd_words_dict=odd_words_dict)
  text_nlp = nlp(text, disable=['ner'])
  text_df = pd.DataFrame(columns=['title', 'author', 'school', 'sentence_spacy'])
  text_df['sentence_spacy'] = list(text_nlp.sents)
  text_df['author'] = text_dict['author']
  text_df['title'] = text_dict['title']
  text_df['school'] = text_dict['school']
  text_df['original_publication_date'] = text_dict['original date']
  text_df['corpus_edition_date'] = text_dict['corpus date']
  text_df['sentence_str'] = text_df['sentence_spacy'].apply(lambda x: ''.join(list(str(x))))
  return text_df

In [75]:
# use the function
on_benefits_df = from_raw_to_df(seneca_on_benefits_dict)
on_anger_df = from_raw_to_df(seneca_on_anger_dict)
on_clemency_df = from_raw_to_df(seneca_on_clemency_dict)

df = on_benefits_df.append(on_anger_df, ignore_index=True).append(on_clemency_df, ignore_index=True)

In [76]:
# checking the result
pd.options.display.max_colwidth = 200
df.sample(10)

Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str
2911,On Anger,Seneca,Stoicism,"(They, look, presently, about, them, from, one, to, another, ,, as, who, should, say, ;, ', Do, but, see, ,, my, masters, ,, how, these, rogues, abuse, us, ., ')",45,2017,"They look presently about them from one to another, as who should say; 'Do but see, my masters, how these rogues abuse us.'"
239,On Benefits,Seneca,Stoicism,"(Two, persons, may, part, with, the, same, sum, of, money, ,, and, yet, not, the, same, benefit, :, the, one, had, it, of, his, own, and, it, was, but, a, little, out, of, a, great, deal, the, oth...",59,2017,"Two persons may part with the same sum of money, and yet not the same benefit: the one had it of his own and it was but a little out of a great deal the other borrowed it, and bestowed upon me tha..."
392,On Benefits,Seneca,Stoicism,"(He, that, is, not, illustrious, in, himself, ,, may, yet, be, reputed, so, in, the, right, of, his, ancestors, :, and, there, is, a, gratitude, to, be, entailed, upon, the, offspring, of, famous,...",59,2017,"He that is not illustrious in himself, may yet be reputed so in the right of his ancestors: and there is a gratitude to be entailed upon the offspring of famous progenitors."
2858,On Benefits,Seneca,Stoicism,"(How, happy, is, he, that, owes, nothing, but, to, himself, ,, and, only, that, which, he, can, easily, refuse, or, easily, pay, !)",59,2017,"How happy is he that owes nothing but to himself, and only that which he can easily refuse or easily pay!"
3314,On Anger,Seneca,Stoicism,"(Neither, are, we, moved, at, the, impudence, and, bitterness, of, a, buffoon, though, he, fall, upon, his, own, master, as, well, as, the, guests, ;, but, ,, on, the, contrary, ,, we, encourage, ...",45,2017,"Neither are we moved at the impudence and bitterness of a buffoon though he fall upon his own master as well as the guests; but, on the contrary, we encourage and entertain the freedom."
3706,On Clemency,Seneca,Stoicism,"(It, would, tire, them, out, ,, if, either, they, were, to, execute, all, with, their, own, hands, ,, or, to, wound, others, at, the, peril, of, their, own, lives, .)",55,2017,"It would tire them out, if either they were to execute all with their own hands, or to wound others at the peril of their own lives."
604,On Benefits,Seneca,Stoicism,"(The, counsels, of, a, wise, man, are, certain, ,, but, events, are, uncertain, :, and, yet, if, I, have, passed, a, rash, promise, ,, I, will, in, some, degree, punish, the, temerity, of, making,...",59,2017,"The counsels of a wise man are certain, but events are uncertain: and yet if I have passed a rash promise, I will in some degree punish the temerity of making it with the damage of keeping it, unl..."
1150,On Benefits,Seneca,Stoicism,"(Socrates, looked, a, scandalous, death, in, the, face, with, the, same, constancy, that, he, had, before, practiced, towards, the, thirty, tyrants, :)",59,2017,Socrates looked a scandalous death in the face with the same constancy that he had before practiced towards the thirty tyrants:
2465,On Benefits,Seneca,Stoicism,"(To, have, it, said, ', that, such, a, one, is, never, out, of, his, study, ,, and, sees, nobody, ,, ', etc, ., ;, this, furnishes, matter, for, discourse, .)",59,2017,"To have it said 'that such a one is never out of his study, and sees nobody,' etc.; this furnishes matter for discourse."
3021,On Anger,Seneca,Stoicism,"(There, is, nothing, great, but, what, is, virtuous, ,, nor, indeed, truly, great, ,, but, what, is, also, composed, and, quiet, .)",45,2017,"There is nothing great but what is virtuous, nor indeed truly great, but what is also composed and quiet."


In [77]:
len(df)

3810

#### Remove Short Sentences

In [78]:
df['sentence_length'] = df['sentence_str'].map(lambda x: len(x))
num_of_short_entries = len(df[df['sentence_length'] < 20])
print(f"there are {num_of_short_entries} so-called sentences with fewer than 20 characters")
df[df['sentence_length'] < 20].sample(5)

there are 72 so-called sentences with fewer than 20 characters


Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length
366,On Benefits,Seneca,Stoicism,"(the, brother, ?)",59,2017,the brother?,12
2592,On Benefits,Seneca,Stoicism,"(how, well, .)",59,2017,how well.,9
2396,On Benefits,Seneca,Stoicism,"(Alas, !)",59,2017,Alas!,5
403,On Benefits,Seneca,Stoicism,"(And, it, is, but)",59,2017,And it is but,13
369,On Benefits,Seneca,Stoicism,"(the, mother, ?)",59,2017,the mother?,11


In [79]:
df = df.drop(df[df['sentence_length'] < 20].index)
len(df)

3738

#### Remove Cases of Self-Mention

In [80]:
# change the author name in this cell 

self_mentions = df[df['sentence_str'].str.contains('\s'+'Anselm'.lower())]
print(len(self_mentions))
self_mentions

0


Unnamed: 0,title,author,school,sentence_spacy,original_publication_date,corpus_edition_date,sentence_str,sentence_length


In [81]:
df = df.drop(df[df['sentence_str'].str.contains('\s'+'Augustine'.lower())].index)

len(df)

3738

#### Deal with Duplicates

In [82]:
# find the total number of duplicates
len(df['sentence_str'])-len(df['sentence_str'].drop_duplicates())

0

In [83]:
doubles_df = pd.concat(g for _, g in df.groupby("sentence_str") if len(g) > 1)
doubles_df

ValueError: ignored

In [None]:
df = df.drop(df[df['sentence_str'].duplicated(keep='first')].index)

In [None]:
len(df)

#### Check for Foreign Languages

In [None]:
# checking for 'der', a common article in German
len((df[df['sentence_str'].str.contains('\sder\s')]))

In [None]:
# checking for 'il', a common article in French
len(df[df['sentence_str'].str.contains('\sil\s')])

#### Some Ad Hoc Cleaning

In [None]:
# miscellaneous nonsense sentences
df = df.drop(df[df['sentence_str'].str.contains('\spp\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\stotam\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\srree\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sflir\s')].index)
df = df.drop(df[(df['sentence_str'].str.contains('\smodis\s')) & (df['author'] != 'Kant')].index)

len(df)

In [None]:
# markers of french and notes
df = df.drop(df[df['sentence_str'].str.contains('\schapitre')].index)
df = df.drop(df[df['sentence_str'].str.contains('\salisme')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sHahn')].index)

len(df)

In [None]:
# some notes in Kant
df = df.drop(df[df['sentence_str'].str.contains('\sVorl\s')].index)
df = df.drop(df[df['sentence_str'].str.contains('\sberschwenglich')].index)

len(df)

In [None]:
# a common phrase in Plato / Aristotle footnotes
df = df.drop(df[(df['author']=='Plato') & (df['sentence_str'].str.contains('(?i)reading')) & (df['sentence_length'] < 40)].index)
df = df.drop(df[(df['author']=='Aristotle') & (df['sentence_str'].str.contains('(?i)reading')) & (df['sentence_length'] < 40)].index)

len(df)

In [None]:
# mentions of Aristotle in Plato
df = df.drop(df[(df['author']=='Plato') & df['sentence_str'].str.contains('Aristotle')].index)

len(df)

### Lemmatize and Tokenize

In [None]:
from gensim.utils import simple_preprocess

# use gensim to tokenize sentences
df['tokenized_txt'] = df['sentence_str'].map(lambda x: simple_preprocess(x.lower(),deacc=True,
                                                        max_len=200))

# use spacey to get intelligent lemmatization
def lemmatize_sentence(sentence):
  lemmatized_txt = ''
  for word in sentence:
    lemmatized_txt += ' ' + str(word.lemma_)
  return lemmatized_txt

In [None]:
df['lemmatized_str'] = df['sentence_spacy'].apply(lemmatize_sentence)

In [None]:
df.sample(5)

### Combine with the Old Dataframe & Export to CSV

In [None]:
# load the old version and check it out
og_df = pd.read_csv('/gdrive/MyDrive/Colab_Projects/philosophy_data_project/philosophy_data.csv')
og_df.sample(5)

In [None]:
og_df['author'].value_counts(normalize=True)

In [None]:
len(og_df)

In [None]:
# append the new data
new_df = og_df.append(df)
new_df['author'].value_counts(normalize=True)

In [None]:
new_df[new_df['author']=='Seneca'].sample(5)

In [None]:
# export as csv
from google.colab import files
new_df.to_csv('phil_nlp.csv', index=False) 
files.download('phil_nlp.csv')

###Upload Data to the SQL Server

In [84]:
# prepare to upload to the PostgreSQL database

# note which dataframe you set this to - new_df for the whole dataset, df for 
# just the new text

for_db = new_df
for_db['date'] = for_db['original_publication_date']
for_db['date'] = for_db['date'].apply(lambda x: str(x)[1:]+' BC' if x < 0 else str(x))
for_db['sentence'] = for_db['sentence_str']
for_db['school'] = for_db['school'].apply(lambda x: x.replace('_', ' ').title())
for_db = for_db.drop(['sentence_spacy', 
                      'sentence_length',
                      'sentence_lowered', 
                      'sentence_str', 
                      'tokenized_txt', 
                      'lemmatized_str',
                      'corpus_edition_date',
                      'original_publication_date'], axis=1)
for_db.columns = [i.upper() for i in for_db.columns]

for_db.sample(5)

Unnamed: 0,TITLE,AUTHOR,SCHOOL,DATE,SENTENCE
51317,Aristotle - Complete Works,Aristotle,Aristotle,320 BC,Indeed those animals which are hottest in the belly have the hottest excreta.
16994,Plato - Complete Works,Plato,Plato,350 BC,And has been for two days.
215256,Anti-Oedipus,Deleuze,Continental,1972,"Such neurosis, the displacement of the limit, in order to create a little colon al world of one's own."
170976,The Logic Of Scientific Discovery,Popper,Analytic,1959,"The Theory of Groups and Quantum Mechanics,."
328316,Meditations,Marcus Aurelius,Stoicism,170,(if he have any wit at all) can in a manner (for that they are all of one kind)


In [85]:
len(for_db)

369599

In [86]:
#importing sql library 
from sqlalchemy import create_engine 
  
# create a reference  
# for sql library 
engine = create_engine('post',
                       echo=False)
  
# attach the data frame to the sql server 
for_db.to_sql('phil_nlp', 
               con = engine,
              if_exists='replace',
              index=False,
              method='multi') 
  
# show the completed data as a test
print(engine.execute("""SELECT * FROM phil_nlp WHERE "AUTHOR" = 'Seneca'""").fetchone()) 

('On Benefits', 'Seneca', 'Stoicism', '59', 'and so in infinitum.')


Remember to add to the clipping and other elements to the notebook that creates the database as a whole. Then you're done!

In [71]:
print(engine.execute("""SELECT * FROM phil_nlp where "AUTHOR" = 'Anselm'""").fetchone()) 

('Proslogion', 'Anselm', 'Scholasticism', '1077', 'How is it, then, Lord, that You are all these things?')
