<a href="https://colab.research.google.com/github/kcalizadeh/PDP_data_processing/blob/master/new_text_introduction_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports and Mounting Drive

In [1]:
# this cell mounts drive, sets the correct directory, then imports all functions
# and relevant libraries via the functions.py file
from google.colab import drive
import sys

drive.mount('/gdrive',force_remount=True)

drive_path = '/gdrive/MyDrive/Colab_Projects/philosophy_data_project'

sys.path.append(drive_path)

Mounted at /gdrive


In [2]:
from import_functions import *

In [3]:
import spacy.cli
spacy.cli.download("en_core_web_lg")
import en_core_web_lg
nlp = en_core_web_lg.load()

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


###Load the Text and Clip Front and End Matter

In [97]:
beauvoir_second_sex = get_text(drive_path + '/phil_txts/beauvoir_second_sex.txt')

In [98]:
beauvoir_second_sex = beauvoir_second_sex.split('’s message of freedom and independence')[1][57:].split('* Bold Chronicl')[0]

### Clean the Text



In [109]:
def baseline_clean(to_correct, 
                   capitals=True, 
                   bracketed_fn=False, 
                   odd_words_dict={}):
  # remove utf8 encoding characters and some punctuations
  result = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff\xad\x0c6§\\\£\Â*_<>""⎫•{}Γ~]', ' ', to_correct)
  result = re.sub(r'[\u2014\u2013\u2012-]', ' ', result)

  # replace whitespace characters with actual whitespace
  result = re.sub(r'\s', ' ', result)

  # replace odd quotation marks with a standard
  result = re.sub(r'[‘’“”]', "'", result)

  # replace the ﬀ, ﬃ and ﬁ with the appropriate counterparts
  result = re.sub(r'ﬀ', 'ff', result)
  result = re.sub(r'ﬁ', 'fi', result)
  result = re.sub(r'ﬃ', 'ffi', result)

  # replace some accented characters for ease of searching
  result = re.sub(r'é', 'e', result)

  # remove or standardize some recurring common and meaninless words/phrases
  result = re.sub(r'\s*This\s*page\s*intentionally\s*left\s*blank\s*', ' ', result)
  result = re.sub(r'(?i)Aufgabe\s+', ' ', result)
  result = re.sub(r',*\s+cf\.', ' ', result)

  # some texts have footnotes conveniently in brackets - this removes them all, 
  # with a safety measure for unpaired brackets, and deletes all brackets afterwards
  if bracketed_fn:
    result = re.sub(r'\[.{0,300}\]|{.{0,300}}|{.{0,300}\]|\[.{0,300}}', ' ', result)
  result = re.sub(r'[\[\]{}]', ' ', result)

  # unify some abbreviations
  result = re.sub(r'&', 'and', result)
  result = re.sub(r'\se\.g\.\s', ' eg ', result)
  result = re.sub(r'\si\.e\.\s', ' ie ', result)
  result = re.sub('coroll\.', 'coroll', result)
  result = re.sub('pt\.', 'pt', result)

  # remove roman numerals, first capitalized ones
  result = re.sub(r'\s((I{2,}V*X*\.*)|(IV\.*)|(IX\.*)|(V\.*)|(V+I*\.*)|(X+L*V*I*]\.*))\s', ' ', result)
  # then lowercase
  result = re.sub(r'\s((i{2,}v*x*\.*)|(iv\.*)|(ix\.*)|(v\.*)|(v+i*\.*)|(x+l*v*i*\.*))\s', ' ', result)

  # remove periods and commas flanked by numbers
  result = re.sub(r'\d\.\d', ' ', result)
  result = re.sub(r'\d,\d', ' ', result)

  # remove the number-letter-number pattern used for many citations
  result = re.sub(r'\d*\w{,2}\d', ' ', result)

  # remove numerical characters
  result = re.sub(r'\d+', ' ', result)

  # remove words of 2+ characters that are entirely capitalized 
  # (these are almost always titles, headings, or speakers in a dialogue)
  # remove capital I's that follow capital words - these almost always roman numerals
  # some texts do use these capitalizations meaningfully, so we make this optional
  if capitals:
    result = re.sub(r'[A-Z]{2,}\s+I', ' ', result)
    result = re.sub(r'[A-Z]{2,}', ' ', result)

  # remove isolated colons and semicolons that result from removal of titles
  result = re.sub(r'\s+:\s*', ' ', result)
  result = re.sub(r'\s+;\s*', ' ', result)

  # remove isolated letters (do it several times because strings of isolated letters do not get captured properly)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)
  result = re.sub(r'\s[^aAI\.]\s', ' ', result)

  # remove isolated letters at the end of sentences or before commas
  result = re.sub(r'\s[^aI]\.', '.', result)
  result = re.sub(r'\s[^aI],', ',', result)

  # deal with spaces around periods and commas
  result = re.sub(r'\s+,\s+', ', ', result)
  result = re.sub(r'\s+\.\s+', '. ', result)

  # remove empty parantheses
  result = re.sub(r'(\(\s*\.*\s*\))|(\(\s*,*\s*)\)', ' ', result)
  result = re.sub(r'\.\)\.', '.', result)
  result = re.sub(r'\.\(\.', '.', result)

  # reduce multiple periods, commas, or whitespaces into a single one
  result = re.sub(r'\.+', '.', result)
  result = re.sub(r',+', ',', result)
  result = re.sub(r'\s+', ' ', result)

  # deal with isolated problem cases discovered in the data:
  for key in odd_words_dict.keys():
    result = re.sub(r''+key+'', odd_words_dict[key], result)

  return result

In [110]:
# note extras like bracketed footnotes or specific words to remove



In [111]:
# build a dictionary for the book
book_dict = {
    'author': 'mary wollstonecraft',
    'title': 'vindication of the rights of woman',
    'text': wollstonecraft_vindication,
    'school': 'feminism',
    'words to remove': [],
    'remove capitals': False,
    'bracketed fn': False
}


In [112]:
#@title Oddities Dictionary for Cleaning
# a dictionary of oddities to clean up
odd_words_dict = {'\sderstanding': 'derstanding',
                  '\sditference\s': ' difference ',
                  '\sforthe\s': ' for the ',
                  '\sject': 'ject',
                  '\sjects': 'jects', 
                  '\sness': 'ness',
                  '\sper\scent\s': ' percent ',
                  '\sper\scent\.': ' percent.',
                  '\sper\scent,': ' percent,',
                  '\wi\son': 'ion',
                  '\spri\sori': ' priori',
                  '\stences\s': 'tences ',
                  '\sprincipleb': ' principle',
                  '\ssciousness': 'sciousness',
                  '\stion': 'tion',
                  '\spri\s': ' pri',
                  '\scluding': 'cluding',
                  '\sdom': 'dom',
                  '\sers': 'ers',
                  '\scritiq\s': ' critique ',
                  '\ssensati\s': ' sensation ',
                  '(?i)\syou\sll': " you'll",
                  '\sI\sll': " I'll",
                  '(?i)\swe\sll': " we'll",
                  '(?i)he\sll': " he'll",
                  '(?i)who\sll': "who'll",
                  '(?i)\sthere\sll\s': " there'll ",
                  '\seduca\s': ' education ',
                  '\slity\s': 'lity ',
                  '\smultaneously\s': 'multaneously ',
                  '\stically\s': 'tically ',
                  '\sDa\ssein\s': ' Dasein ',
                  '(?i)\sthey\sll\s': " they'll ",
                  '(?i)\sin\tum\s': ' in turn ',
                  '\scon~\s': ' con',
                  '\sà\s': ' a ',
                  '\sjor\s': ' for ',
                  '\sluminating\s': 'luminating ',
                  '\sselj\s': ' self ',
                  '\stial\s': 'tial ',
                  '\sversal\s': 'versal ',
                  '\sexis\st': ' exist',
                  '\splauded\s': 'plauded ',
                  '\suiry\s': 'uiry ',
                  '\svithin\s': ' within ',
                  '\soj\s': ' of ',
                  '\sposi\st': ' posit',
                  '\sra\sther\s': ' rather ',
                  '(?i)\sthat\sll\s': " that'll ",
                  '(?i)\sa\sll\s': ' all ',
                  '\so\sther\s': ' other ',
                  '\sra\sther\s': ' rather ',
                  '\snei\sther\s': ' neither ',
                  '\sei\sther\s': ' either ',
                  '\sfur\sther\s': ' further ',
                  '\sano\sther': ' another ',
                  '\sneces\s': ' neces',
                  'u\slar\s': 'ular ',
                  '\sference\s': 'ference ',
                  '(?i)it\sll\s': "it'll ",
                  '\stoge\sther': ' together ',
                  '\sknowledgeb\s': ' knowledge ',
                  'r\stain\s': 'rtain ',
                  'on\stain\s': 'ontain',
                  '(?i)j\sect\s': 'ject',
                  '\sob\sect\s': ' object ',
                  '\sbtle\s': 'btle ',
                  '\snition\s': 'nition ',
                  '\sdering\s': 'dering ', 
                  '\sized\s': 'ized ',
                  '\sther\shand': ' other hand',
                  '\ture\s': 'ture ',
                  '\sabso\sl': ' absol',
                  '\stly\s': 'tly ',
                  '\serty\s': 'erty ',
                  '\sobj\se': ' obj',
                  '\sffiir\s': ' for ',
                  '\sndeed\s': ' indeed ',
                  '\sfonn\s': ' form ',
                  '\snally\s': 'nally ',
                  'ain\sty\s': 'ainty ',
                  'ici\sty\s': 'icity ',
                  '\scog\sni': ' cogni',
                  '\sacc\s': ' acc',
                  '\sindi\svid\sual': ' individual', 
                  '\sintu\sit': ' intuit',
                  'r\sance\s': 'rance ',
                  '\ssions\s': 'sions ',
                  '\sances\s': 'ances ',
                  '\sper\sception\s': ' perception ',
                  '\sse\sries\s': ' series ',
                  '\sque\sries\s': ' queries ',
                  '\sessary\s': 'essary ',
                  '\sofa\s': ' of a ',
                  '\scer\stainty\s': ' certainty ',
                  'ec\stivity\s': 'ectivity ',
                  '\stivity\s': 'tivity ',
                  '\slation\s': 'lation ',
                  '\sir\sr': ' irr',
                  '\ssub\sstance\s': ' substance ',
                  'sec\sond\s': 'second ',
                  '\s\.rv': '',
                  '\story\s': 'tory ',
                  '\sture\s': 'ture ',
                  '\sminate\s': 'minate ',
                  '\sing\s': 'ing ',
                  '\splicity\s': 'plicity ',
                  '\ssimi\slar\s': ' similar ',
                  '\scom\smunity\s': ' community ',
                  '\sitselfa\s': ' itself a ',
                  '\ssimp\s': ' simply ',
                  '\scon\stex': ' contex',
                  '\scon\sseq': ' conseq',
                  '\scon\stai': ' contai',
                  '\sofwhat\s': ' of what ',
                  '\sui\s': 'ui',
                  '\sofan\s': ' of an ',
                  '\saccor\sdance\s': ' accordance ',
                  '\stranscen\sdental\s': ' transcendental ',
                  '\sap\spearances\s': ' appearances ',
                  'e\squences\s': 'equences ',
                  '\sorits\s': ' or its ',
                  '\simma\sn': ' imman',
                  '\seq\sua': ' equa',
                  '\simpl\sied\s': ' implied ',
                  '\sbuta\s': ' but a ',
                  '\sa\snd\s': ' and ',
                  '\sence\s': 'ence ',
                  '\stain\s': 'tain ',
                  '\sunder\sstanding\s': ' understanding ',
                  'i\sence\s': 'ience ',
                  'r\sence\s': 'rence ',
                  '\stical\s': 'tical ',
                  '\sobjectsb\s': ' objects ',
                  '\stbe\s': ' the ',
                  '\smul\st': ' mult',
                  '\sgen\seral\s': ' general ',
                  '\suniver\ssal\s': ' universal ',
                  '\scon\stent\s': ' content ',
                  '\spar\sticular\s': ' particular ',
                  'ver\ssity\s': 'versity ',
                  '\sCritiq\s': ' Critique ',
                  '\sphilo\ssophy\s': ' philosophy ',
                  '\seq\s': ' eq'}

In [113]:
# a function that takes the dictionary and returns a dataframe of sentences
def from_raw_to_df(text_dict):
  nlp.max_length = 9000000
  text = text_dict['text']
  text = remove_words(text, text_dict['words to remove'])
  text = baseline_clean(text, capitals=text_dict['remove capitals'],
                        bracketed_fn=text_dict['bracketed fn'],
                        odd_words_dict=odd_words_dict)
  text_nlp = nlp(text, disable=['ner'])
  text_df = pd.DataFrame(columns=['title', 'author', 'school', 'sentence_spacy'])
  text_df['sentence_spacy'] = list(text_nlp.sents)
  text_df['author'] = text_dict['author']
  text_df['title'] = text_dict['title']
  text_df['school'] = text_dict['school']
  text_df['sentence_str'] = text_df['sentence_spacy'].apply(lambda x: ''.join(list(str(x))))
  return text_df

In [114]:
# use the function
df = from_raw_to_df(book_dict)

In [115]:
# checking the result
pd.options.display.max_colwidth = 200
df.sample(10)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str
184,The Second Sex,Simone de Beauvoir,feminism,"(The, same, vicious, circle, can, be, found, in, all, analogous, circumstances, :, when, an, individual, or, a, group, of, individuals, is, kept, in, a, situation, of, inferiority, ,, the, fact, i...","The same vicious circle can be found in all analogous circumstances: when an individual or a group of individuals is kept in a situation of inferiority, the fact is that he or they are inferior."
1651,The Second Sex,Simone de Beauvoir,feminism,"(As, soon, as, the, marriage, is, consummated, ,, ', Beaumanoir, says, ,, ', the, possessions, of, each, party, are, held, in, common, by, virtue, of, the, marriage, and, the, man, is, the, guardi...","As soon as the marriage is consummated,' Beaumanoir says, 'the possessions of each party are held in common by virtue of the marriage and the man is the guardian of them.'"
7659,The Second Sex,Simone de Beauvoir,feminism,"(She, detested, men, .)",She detested men.
6938,The Second Sex,Simone de Beauvoir,feminism,"(Not, enough, has, been, said, not, only, about, the, fear, of, masculine, aggressiveness, but, also, about, a, deep, feeling, of, frustration, at, the, disgust, that, is, mixed, with, her, desire...",Not enough has been said not only about the fear of masculine aggressiveness but also about a deep feeling of frustration at the disgust that is mixed with her desires: sexual satisfaction must be...
8368,The Second Sex,Simone de Beauvoir,feminism,"(This, means, it, must, above, all, be, free, .)",This means it must above all be free.
4321,The Second Sex,Simone de Beauvoir,feminism,"(She, is, a, ', free, genius, ,, something, like, one, of, those, spirits, of, the, air, which, certain, magical, practices, momentarily, permit, us, to, entertain, but, which, we, can, never, ove...","She is a 'free genius, something like one of those spirits of the air which certain magical practices momentarily permit us to entertain but which we can never overcome.'"
9957,The Second Sex,Simone de Beauvoir,feminism,"(The, light, plumage, of, her, lashes, ,, lowered, over, her, cheek, ,, interposed, between, the, scenes, of, earth, and, the, bluish, dream, of, her, eye)","The light plumage of her lashes, lowered over her cheek, interposed between the scenes of earth and the bluish dream of her eye"
1591,The Second Sex,Simone de Beauvoir,feminism,"(When, feudalism, emerges, out, of, the, convulsions, of, the, early, Middle, Ages, ,, woman, 's, condition, looks, very, uncertain, .)","When feudalism emerges out of the convulsions of the early Middle Ages, woman's condition looks very uncertain."
1231,The Second Sex,Simone de Beauvoir,feminism,"(He, must, have, heirs, who, will, extend, his, life, on, earth, because, he, bequeaths, them, his, possessions, ,, and, who, will, give, him, in, turn, ,, beyond, the, tomb, ,, the, necessary, ho...","He must have heirs who will extend his life on earth because he bequeaths them his possessions, and who will give him in turn, beyond the tomb, the necessary honors for the repose of his soul."
487,The Second Sex,Simone de Beauvoir,feminism,"(The, privilege, of, many, female, insects, comes, from, the, fact, that, fertilization, is, generally, a, rapid, process, while, ovulation, and, incubation, of, the, eggs, demand, a, long, period...",The privilege of many female insects comes from the fact that fertilization is generally a rapid process while ovulation and incubation of the eggs demand a long period of time.


In [116]:
len(df)

13646

#### Remove Short Sentences

In [117]:
df['sentence_length'] = df['sentence_str'].map(lambda x: len(x))
num_of_short_entries = len(df[df['sentence_length'] < 20])
print(f"there are {num_of_short_entries} so-called sentences with fewer than 20 characters")
df[df['sentence_length'] < 20].sample(5)

there are 559 so-called sentences with fewer than 20 characters


Unnamed: 0,title,author,school,sentence_spacy,sentence_str,sentence_length
7,The Second Sex,Simone de Beauvoir,feminism,"(And, what, is, it, ?)",And what is it?,15
4590,The Second Sex,Simone de Beauvoir,feminism,"(The, Girls, .)",The Girls.,10
12137,The Second Sex,Simone de Beauvoir,feminism,(…),…,1
7134,The Second Sex,Simone de Beauvoir,feminism,(X.),X.,2
10217,The Second Sex,Simone de Beauvoir,feminism,"(But, my, God, !)",But my God!,11


In [118]:
df = df.drop(df[df['sentence_length'] < 20].index)
len(df)

13087

#### Remove Cases of Self-Mention

In [119]:
self_mentions = df[df['sentence_str'].str.contains('\s'+'Beauvoir'.lower())]
print(len(self_mentions))
self_mentions

0


Unnamed: 0,title,author,school,sentence_spacy,sentence_str,sentence_length


In [120]:
df = df.drop(df[df['sentence_str'].str.contains('\s'+'Beauvoir'.lower())].index)

len(df)

13087

#### Deal with Duplicates

In [121]:
# find the total number of duplicates
len(df['sentence_str'])-len(df['sentence_str'].drop_duplicates())

38

In [122]:
doubles_df = pd.concat(g for _, g in df.groupby("sentence_str") if len(g) > 1)
doubles_df

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,sentence_length
884,The Second Sex,Simone de Beauvoir,feminism,"((, Psychoanalytical, Method, and, the, Doctrine, of, Freud, ), .)",(Psychoanalytical Method and the Doctrine of Freud).,52
12279,The Second Sex,Simone de Beauvoir,feminism,"((, Psychoanalytical, Method, and, the, Doctrine, of, Freud, ), .)",(Psychoanalytical Method and the Doctrine of Freud).,52
934,The Second Sex,Simone de Beauvoir,feminism,"(And, this, has, become, possible, only, within, a, large, modern, industry, that, not, only, accepts, women, 's, work, on, a, grand, scale, but, formally, requires, it, ., ')",And this has become possible only within a large modern industry that not only accepts women's work on a grand scale but formally requires it.',143
2065,The Second Sex,Simone de Beauvoir,feminism,"(And, this, has, become, possible, only, within, a, large, modern, industry, that, not, only, accepts, women, 's, work, on, a, grand, scale, but, formally, requires, it, ., ')",And this has become possible only within a large modern industry that not only accepts women's work on a grand scale but formally requires it.',143
4869,The Second Sex,Simone de Beauvoir,feminism,"(And, yet, the, very, worst, curse, when, one, is, a, woman)",And yet the very worst curse when one is a woman,48
...,...,...,...,...,...,...
3047,The Second Sex,Simone de Beauvoir,feminism,"(You, should, always, dress, in, mourning, and, rags, ., ')",You should always dress in mourning and rags.',46
883,The Second Sex,Simone de Beauvoir,feminism,"(la, doctrine, freudienne)",la doctrine freudienne,22
7805,The Second Sex,Simone de Beauvoir,feminism,"(la, doctrine, freudienne)",la doctrine freudienne,22
7971,The Second Sex,Simone de Beauvoir,feminism,"(of, marriages, are, based, on, money, %, ), .)",of marriages are based on money %).,35


In [123]:
df = df.drop(df[df['sentence_str'].duplicated(keep=False)].index)

In [124]:
len(df)

13022

#### Check for Foreign Languages

In [125]:
# checking for 'der', a common article in German
len((df[df['sentence_str'].str.contains('\sder\s')]))

0

In [126]:
# checking for 'il', a common article in French
len(df[df['sentence_str'].str.contains('\sil\s')])

0

### Lemmatize and Tokenize

In [127]:
from gensim.utils import simple_preprocess

# use gensim to tokenize sentences
df['tokenized_txt'] = df['sentence_str'].map(lambda x: simple_preprocess(x.lower(),deacc=True,
                                                        max_len=200))

# use spacey to get intelligent lemmatization
def lemmatize_sentence(sentence):
  lemmatized_txt = ''
  for word in sentence:
    lemmatized_txt += ' ' + str(word.lemma_)
  return lemmatized_txt

In [128]:
df['lemmatized_str'] = df['sentence_spacy'].apply(lemmatize_sentence)

In [129]:
df.sample(5)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,sentence_length,tokenized_txt,lemmatized_str
1818,The Second Sex,Simone de Beauvoir,feminism,"(one, after, the, other, ,, Mme, de, Prie, ,, Mme, de, Mailly, ,, Mme, de, Ch, teauneuf, ,, Mme, de, Pompadour, ,, and, Mme, du, Barry, govern, Louis, there, is, barely, a, minister, without, his,...","one after the other, Mme de Prie, Mme de Mailly, Mme de Ch teauneuf, Mme de Pompadour, and Mme du Barry govern Louis there is barely a minister without his Egeria, to such a point that Montesquieu...",461,"[one, after, the, other, mme, de, prie, mme, de, mailly, mme, de, ch, teauneuf, mme, de, pompadour, and, mme, du, barry, govern, louis, there, is, barely, minister, without, his, egeria, to, such,...","one after the other , Mme de Prie , Mme de Mailly , Mme de Ch teauneuf , Mme de Pompadour , and Mme du Barry govern Louis there be barely a minister without -PRON- Egeria , to such a point that M..."
13560,The Second Sex,Simone de Beauvoir,feminism,"(The, woman, has, to, understand, that, an, exchange, a, basic, law, of, political, economy, is, negotiated, according, to, the, value, the, proposed, merchandise, has, for, the, buyer, and, not, ...",The woman has to understand that an exchange a basic law of political economy is negotiated according to the value the proposed merchandise has for the buyer and not for the seller: she was duped ...,904,"[the, woman, has, to, understand, that, an, exchange, basic, law, of, political, economy, is, negotiated, according, to, the, value, the, proposed, merchandise, has, for, the, buyer, and, not, for...",the woman have to understand that an exchange a basic law of political economy be negotiate accord to the value the propose merchandise have for the buyer and not for the seller : -PRON- be dupe ...
11854,The Second Sex,Simone de Beauvoir,feminism,"(Socially, Manichaean, ,, the, woman, has, a, deep, need, to, be, ontologically, optimistic, :, the, moralities, of, action, do, not, suit, her, ,, since, it, is, forbidden, for, her, to, act, ;, ...","Socially Manichaean, the woman has a deep need to be ontologically optimistic: the moralities of action do not suit her, since it is forbidden for her to act; she submits to the given: so the give...",321,"[socially, manichaean, the, woman, has, deep, need, to, be, ontologically, optimistic, the, moralities, of, action, do, not, suit, her, since, it, is, forbidden, for, her, to, act, she, submits, t...","socially Manichaean , the woman have a deep need to be ontologically optimistic : the morality of action do not suit -PRON- , since -PRON- be forbid for -PRON- to act ; -PRON- submit to the give ..."
12435,The Second Sex,Simone de Beauvoir,feminism,"(She, first, sought, in, love, a, confirmation, of, what, she, was, ,, her, past, ,, her, personage, ;, but, she, also, commits, her, future, :, to, justify, it, ,, she, destines, it, to, the, one...","She first sought in love a confirmation of what she was, her past, her personage; but she also commits her future: to justify it, she destines it to the one who possesses all values; she thus give...",293,"[she, first, sought, in, love, confirmation, of, what, she, was, her, past, her, personage, but, she, also, commits, her, future, to, justify, it, she, destines, it, to, the, one, who, possesses, ...","-PRON- first seek in love a confirmation of what -PRON- be , -PRON- past , -PRON- personage ; but -PRON- also commit -PRON- future : to justify -PRON- , -PRON- destine -PRON- to the one who posse..."
1380,The Second Sex,Simone de Beauvoir,feminism,"(Woman, can, share, in, her, husband, 's, estate, if, the, son, proves, himself, unworthy, ;, if, she, is, a, ', privileged, wife, ,, ', she, is, entrusted, with, the, guardianship, of, minor, chi...","Woman can share in her husband's estate if the son proves himself unworthy; if she is a 'privileged wife,' she is entrusted with the guardianship of minor children in the case of her husband's dea...",263,"[woman, can, share, in, her, husband, estate, if, the, son, proves, himself, unworthy, if, she, is, privileged, wife, she, is, entrusted, with, the, guardianship, of, minor, children, in, the, cas...","woman can share in -PRON- husband 's estate if the son prove -PRON- unworthy ; if -PRON- be a ' privileged wife , ' -PRON- be entrust with the guardianship of minor child in the case of -PRON- hu..."


### Combine with the Old Dataframe & Export to CSV

In [130]:
# load the old version and check it out
og_df = pd.read_csv('/gdrive/MyDrive/Colab_Projects/philosophy_data_project/philosophy_data.csv')
og_df.sample(5)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str,sentence
672,Plato - Complete Works,Plato,Plato,"The jury now gives its verdict of guilty, and Meletus asks for the penalty of death.","The jury now gives its verdict of guilty, and Meletus asks for the penalty of death.",84,"the jury now gives its verdict of guilty, and meletus asks for the penalty of death.","['the', 'jury', 'now', 'gives', 'its', 'verdict', 'of', 'guilty', 'and', 'meletus', 'asks', 'for', 'the', 'penalty', 'of', 'death']","the jury now give -PRON- verdict of guilty , and Meletus ask for the penalty of death .","The jury now gives its verdict of guilty, and Meletus asks for the penalty of death."
10148,Plato - Complete Works,Plato,Plato,"And though it's not quite fair to those of us who have to speak last, if the first speeches turn out to be good enough and to exhaust our subject, I promise we won't complain.","And though it's not quite fair to those of us who have to speak last, if the first speeches turn out to be good enough and to exhaust our subject, I promise we won't complain.",175,"and though it's not quite fair to those of us who have to speak last, if the first speeches turn out to be good enough and to exhaust our subject, i promise we won't complain.","['and', 'though', 'it', 'not', 'quite', 'fair', 'to', 'those', 'of', 'us', 'who', 'have', 'to', 'speak', 'last', 'if', 'the', 'first', 'speeches', 'turn', 'out', 'to', 'be', 'good', 'enough', 'and...","and though -PRON- be not quite fair to those of -PRON- who have to speak last , if the first speech turn out to be good enough and to exhaust -PRON- subject , -PRON- promise -PRON- will not compl...","And though it's not quite fair to those of us who have to speak last, if the first speeches turn out to be good enough and to exhaust our subject, I promise we won't complain."
293493,Capital,Marx,Communism,"On this supposition, the half hours yield an annual product of one half hour yields half hours yield .e.","On this supposition, the half hours yield an annual product of one half hour yields half hours yield .e.",104,"on this supposition, the half hours yield an annual product of one half hour yields half hours yield .e.","['on', 'this', 'supposition', 'the', 'half', 'hours', 'yield', 'an', 'annual', 'product', 'of', 'one', 'half', 'hour', 'yields', 'half', 'hours', 'yield']","on this supposition , the half hour yield an annual product of one half hour yield half hour yield .e .","On this supposition, the half hours yield an annual product of one half hour yields half hours yield .e."
52442,Aristotle - Complete Works,Aristotle,Aristotle,"They are thus affected because they have never contemplated what is nobler the Universe and the greatest things of the Universe; for if they had properly attended to these things, they would.","They are thus affected because they have never contemplated what is nobler the Universe and the greatest things of the Universe; for if they had properly attended to these things, they would.",191,"they are thus affected because they have never contemplated what is nobler the universe and the greatest things of the universe; for if they had properly attended to these things, they would.","['they', 'are', 'thus', 'affected', 'because', 'they', 'have', 'never', 'contemplated', 'what', 'is', 'nobler', 'the', 'universe', 'and', 'the', 'greatest', 'things', 'of', 'the', 'universe', 'for...","-PRON- be thus affect because -PRON- have never contemplate what be noble the Universe and the great thing of the Universe ; for if -PRON- have properly attend to these thing , -PRON- would .","They are thus affected because they have never contemplated what is nobler the Universe and the greatest things of the Universe; for if they had properly attended to these things, they would."
81504,Aristotle - Complete Works,Aristotle,Aristotle,Each of these three admits of two varieties.,Each of these three admits of two varieties.,44,each of these three admits of two varieties.,"['each', 'of', 'these', 'three', 'admits', 'of', 'two', 'varieties']",each of these three admit of two variety .,Each of these three admits of two varieties.


In [131]:
og_df['author'].value_counts(normalize=True)

Aristotle          0.142557
Plato              0.112125
Hegel              0.066341
Foucault           0.044539
Heidegger          0.044536
Kant               0.041289
Nietzsche          0.039594
Marx               0.039422
Lewis              0.038343
Malebranche        0.037984
Deleuze            0.036648
Kripke             0.036470
Smith              0.034173
Wittgenstein       0.026402
Locke              0.025966
Hume               0.024292
Merleau-Ponty      0.022188
Quine              0.021548
Derrida            0.017532
Husserl            0.016781
Fichte             0.015513
Russell            0.014826
Leibniz            0.014691
Popper             0.013671
Lenin              0.013061
Spinoza            0.011085
Moore              0.010720
Keynes             0.009969
Ricardo            0.009031
Berkeley           0.007990
Marcus Aurelius    0.006465
Descartes          0.003308
Epictetus          0.000944
Name: author, dtype: float64

In [132]:
# append the new data
new_df = og_df.append(df)
new_df['author'].value_counts(normalize=True)

Aristotle             0.137330
Plato                 0.108014
Hegel                 0.063909
Foucault              0.042906
Heidegger             0.042903
Kant                  0.039775
Nietzsche             0.038142
Marx                  0.037976
Lewis                 0.036937
Simone de Beauvoir    0.036662
Malebranche           0.036591
Deleuze               0.035305
Kripke                0.035133
Smith                 0.032920
Wittgenstein          0.025434
Locke                 0.025014
Hume                  0.023401
Merleau-Ponty         0.021374
Quine                 0.020758
Derrida               0.016889
Husserl               0.016166
Fichte                0.014944
Russell               0.014282
Leibniz               0.014153
Popper                0.013170
Lenin                 0.012582
Spinoza               0.010679
Moore                 0.010327
Keynes                0.009603
Ricardo               0.008699
Berkeley              0.007697
Marcus Aurelius       0.006228
Descarte

In [133]:
# export as csv
from google.colab import files
new_df.to_csv('phil_nlp.csv', index=False) 
files.download('phil_nlp.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

###Upload Data to the SQL Server

In [134]:
# prepare to upload to the PostgreSQL database
for_db = df
for_db['sentence'] = for_db['sentence_str']
for_db['school'] = for_db['school'].apply(lambda x: x.replace('_', ' ').title())
for_db = for_db.drop(['sentence_spacy', 'sentence_length', 'sentence_str', 'tokenized_txt', 'lemmatized_str'], axis=1)
for_db.columns = [i.upper() for i in for_db.columns]

for_db.sample(5)

Unnamed: 0,TITLE,AUTHOR,SCHOOL,SENTENCE
5060,The Second Sex,Simone de Beauvoir,Feminism,But what is very important is that there is no fundamental opposition between this objective figure that is his and his will for self affirmation in concrete projects.
488,The Second Sex,Simone de Beauvoir,Feminism,"For termites, the enormous mush stuffed queen that lays an egg a second until she is sterile and then is pitilessly massacred is no less a slave than the dwarf male attached to her abdomen that fe..."
12644,The Second Sex,Simone de Beauvoir,Feminism,"I thought it would never end waiting for you, and now I feel it went too quickly since I did not see you.'"
10367,The Second Sex,Simone de Beauvoir,Feminism,Her erotic qualities are integrated into social life and can only appear in this toned down form.
4121,The Second Sex,Simone de Beauvoir,Feminism,So it was good for him to love me?


In [135]:
len(for_db)

13022

In [136]:
#importing sql library 
from sqlalchemy import create_engine 
  
# create a reference  
# for sql library 
engine = create_engine(URL,
                       echo = False)
  
# attach the data frame to the sql  
for_db.to_sql('phil_nlp', 
               con = engine,
              if_exists='append') 
  
# # show the complete data 
print(engine.execute("""SELECT * FROM phil_nlp WHERE "AUTHOR" = 'Simone de Beauvoir'""").fetchone()) 

(294, 'The Second Sex', 'Simone de Beauvoir', 'Feminism', "Literal translation: 'different but equal.' .")


Remember to add to the clipping and other elements to the notebook that creates the database as a whole. Then you're done!