<a href="https://colab.research.google.com/github/kcalizadeh/phil_nlp/blob/master/add_new_text_to_df.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# google.colab is only available inside a Colab runtime
from google.colab import drive
import sys

# mount Google Drive at /gdrive (force_remount=True asks for a fresh mount on re-runs)
drive.mount('/gdrive',force_remount=True)

# project folder inside the mounted Drive
drive_path = '/gdrive/MyDrive/Colab_Projects/Phil_NLP'

# add the project folder to the module search path
sys.path.append(drive_path)

Mounted at /gdrive


In [12]:
#@title imports
!pip install symspellpy

import re
from google.colab import files
import spacy
import pandas as pd
import pkg_resources
from symspellpy.symspellpy import SymSpell

# download the large English spaCy model into this runtime, then load it as the
# module-level `nlp` pipeline that from_raw_to_df relies on
import spacy.cli
spacy.cli.download("en_core_web_lg")
import en_core_web_lg
nlp = en_core_web_lg.load()

# gets text from a gutenberg URL
def get_guten(url):
    """Download ``url`` and return the response body as text decoded as UTF-8.

    Args:
        url: address of the plain-text document to fetch.

    Returns:
        The body of the HTTP response as a string.
    """
    # the notebook's import cell never imports `requests`, so bring it into
    # scope here; without this the call below fails with NameError
    import requests

    # retrieve the source text
    r = requests.get(url)
    # decode as UTF-8 rather than whatever encoding requests would pick
    r.encoding = 'utf-8'
    return r.text

# gets the text from a txt file
def get_text(path, encoding='utf-8'):
    """Read the whole text file at ``path`` and return its contents.

    Args:
        path: location of the file to read.
        encoding: text encoding used to decode the file (default UTF-8).

    Returns:
        The file contents as a single string.
    """
    # the context manager closes the handle even if read() raises
    with open(path, 'r', encoding=encoding) as f:
        return f.read()

def baseline_clean(to_correct, capitals=True, bracketed_fn=False):
  """Strip extraction noise from a raw text with a fixed sequence of regex passes.

  The order of the passes matters: later ones tidy up what earlier removals
  leave behind (stray punctuation, isolated letters, repeated whitespace).

  Args:
    to_correct: the raw text as one string.
    capitals: when True, delete every run of two or more capital letters
      (titles, headings, speaker names) and a capital I that follows such a run.
    bracketed_fn: when True, delete bracketed spans of up to 300 characters
      (footnotes) before the bracket characters themselves are stripped.

  Returns:
    The cleaned text, with every run of whitespace collapsed to one space.
  """
  # remove utf8 encoding characters and some punctuations
  # NOTE: the adjacent '' in the middle of this literal ends one string and starts
  # another (implicit concatenation), so the class contains the double quote but
  # NOT the apostrophe; `\x0c6` is a form feed followed by the digit 6
  result = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff\xad\x0c6§\\\£\Â‘’“”*_<>''""⎫•{}]', ' ', to_correct)
  result = re.sub(r'[\u2014\u2013\u2012-]', ' ', result)

  # replace whitespace characters with actual whitespace
  result = re.sub(r'\s', ' ', result)

  # replace the ﬀ, ﬃ and ﬁ with the appropriate counterparts
  result = re.sub(r'ﬀ', 'ff', result)
  result = re.sub(r'ﬁ', 'fi', result)
  result = re.sub(r'ﬃ', 'ffi', result)

  # remove some recurring common and meaningless words/phrases
  result = re.sub(r'\s*This\s*page\s*intentionally\s*left\s*blank\s*', ' ', result)
  result = re.sub(r'(?i)Aufgabe\s+', ' ', result)
  result = re.sub(r',*\s+cf\.', ' ', result)
  # raw strings: '\.' in a plain literal is an invalid escape sequence
  result = re.sub(r'coroll\.', 'coroll', result)
  result = re.sub(r'pt\.', 'pt', result)

  # some texts have footnotes conveniently in brackets - this removes them all,
  # with a safety measure for unpaired brackets, and deletes all brackets afterwards
  if bracketed_fn:
    result = re.sub(r'\[.{0,300}\]|{.{0,300}}|{.{0,300}\]|\[.{0,300}}', ' ', result)
  result = re.sub(r'[\[\]{}]', ' ', result)

  # replace ampersands with 'and'
  result = re.sub(r'&', 'and', result)

  # remove roman numerals, first capitalized ones
  # (the last alternative used to contain a stray ']' and therefore never matched
  # numerals starting with X; it now mirrors the lowercase pattern below)
  result = re.sub(r'\s((I{2,}V*X*\.*)|(IV\.*)|(IX\.*)|(V\.*)|(V+I*\.*)|(X+L*V*I*\.*))\s', ' ', result)
  # then lowercase
  result = re.sub(r'\s((i{2,}v*x*\.*)|(iv\.*)|(ix\.*)|(v\.*)|(v+i*\.*)|(x+l*v*i*\.*))\s', ' ', result)

  # remove periods and commas flanked by numbers
  result = re.sub(r'\d\.\d', ' ', result)
  result = re.sub(r'\d,\d', ' ', result)

  # remove the number-letter-number pattern used for many citations
  result = re.sub(r'\d*\w{,2}\d', ' ', result)

  # remove numerical characters
  result = re.sub(r'\d+', ' ', result)

  # remove words of 2+ characters that are entirely capitalized
  # (these are almost always titles, headings, or speakers in a dialogue)
  # remove capital I's that follow capital words - these are almost always roman numerals
  # some texts do use these capitalizations meaningfully, so we make this optional
  if capitals:
    result = re.sub(r'[A-Z]{2,}\s+I', ' ', result)
    result = re.sub(r'[A-Z]{2,}', ' ', result)

  # remove isolated colons and semicolons that result from removal of titles
  result = re.sub(r'\s+:\s*', ' ', result)
  result = re.sub(r'\s+;\s*', ' ', result)

  # remove isolated letters (five passes, because matches cannot overlap and a
  # string of isolated letters is therefore not captured in a single pass)
  for _ in range(5):
    result = re.sub(r'\s[^aAI\.]\s', ' ', result)

  # remove isolated letters at the end of sentences or before commas
  result = re.sub(r'\s[^aI]\.', '.', result)
  result = re.sub(r'\s[^aI],', ',', result)

  # deal with spaces around periods and commas
  result = re.sub(r'\s,\s', ', ', result)
  # (this pattern used to end in ';', which could never match because every
  # whitespace-preceded semicolon has already been removed above)
  result = re.sub(r'\s\.\s', '. ', result)

  # remove empty parentheses
  result = re.sub(r'(\(\s*\.*\s*\))|(\(\s*,*\s*)\)', ' ', result)

  # reduce multiple periods or whitespaces into a single one
  result = re.sub(r'\.+', '.', result)
  result = re.sub(r'\s+', ' ', result)

  return result

def remove_words(text, word_list):
  """Blank out every entry of ``word_list`` in ``text`` and squeeze whitespace.

  Each entry is handed to ``re.sub`` as a regular-expression pattern, so regex
  metacharacters in an entry are interpreted, not matched literally.

  Args:
    text: the string to clean.
    word_list: iterable of pattern strings to replace with a space.

  Returns:
    ``text`` with all matches replaced and whitespace runs collapsed to one space.
  """
  cleaned = text
  for pattern in word_list:
    cleaned = re.sub(pattern, ' ', cleaned)
  return re.sub(r'\s+', ' ', cleaned)

def from_raw_to_df(text_dict):
  """Clean one raw text and split it into a one-sentence-per-row dataframe.

  Args:
    text_dict: mapping with the keys 'text', 'words to remove',
      'remove capitals', 'bracketed fn', 'author', 'title' and 'school'.

  Returns:
    A DataFrame with the columns title, author, school, sentence_spacy
    (the spaCy sentence objects) and sentence_str (their string form).
  """
  # raise the pipeline's length limit so a long text can be parsed in one call
  nlp.max_length = 9000000

  # ad hoc removals first, then the generic cleaning passes
  stripped = remove_words(text_dict['text'], text_dict['words to remove'])
  cleaned = baseline_clean(stripped,
                           capitals=text_dict['remove capitals'],
                           bracketed_fn=text_dict['bracketed fn'])

  # sentence segmentation, with the 'ner' component switched off
  doc = nlp(cleaned, disable=['ner'])

  sentences_df = pd.DataFrame(columns=['title', 'author', 'school', 'sentence_spacy'])
  sentences_df['sentence_spacy'] = list(doc.sents)
  # the scalar metadata values are broadcast to every sentence row
  for field in ('author', 'title', 'school'):
    sentences_df[field] = text_dict[field]
  sentences_df['sentence_str'] = sentences_df['sentence_spacy'].apply(str)
  return sentences_df

# loaded SymSpell instances, keyed by (dict_path, edit_distance, prefix_length),
# so the frequency dictionary is read once per configuration instead of per call
_SYM_SPELL_CACHE = {}

def space_words(str, 
                dict_path=pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt"),
                edit_distance=1,
                prefix_length=3):
  """Re-insert missing spaces into ``str`` using SymSpell word segmentation.

  Args:
    str: the text to segment (the name shadows the builtin but is kept because
      it is part of the existing signature).
    dict_path: path to the frequency dictionary (term in column 0, count in column 1).
    edit_distance: maximum dictionary edit distance given to SymSpell.
    prefix_length: prefix length given to SymSpell.

  Returns:
    The segmented, corrected string.

  Raises:
    FileNotFoundError: if the dictionary at ``dict_path`` cannot be loaded.
  """
  cache_key = (dict_path, edit_distance, prefix_length)
  sym_spell = _SYM_SPELL_CACHE.get(cache_key)
  if sym_spell is None:
    sym_spell = SymSpell(max_dictionary_edit_distance=edit_distance, prefix_length=prefix_length)
    # load_dictionary reports failure through its return value; segmenting
    # against an empty dictionary would silently produce garbage
    if not sym_spell.load_dictionary(dict_path, term_index=0, count_index=1):
      raise FileNotFoundError(f"could not load SymSpell dictionary: {dict_path}")
    _SYM_SPELL_CACHE[cache_key] = sym_spell

  return sym_spell.word_segmentation(str).corrected_string



In [None]:
# load the existing csv so we can add to it
# (the file name is relative, so it is looked up in the current working directory)
main_df = pd.read_csv('phil_nlp.csv')

In [None]:
# load the text
# 'filepath' is a placeholder - replace it with the path of the new .txt file
new_text = get_text('filepath')

In [None]:
# clip the front and end matter
# 'front' and 'end' are placeholders for marker strings found in the text:
# this keeps what follows the first 'front' marker (up to the next one, if it recurs)
# and cuts that at the first 'end' marker; a missing 'front' marker raises IndexError
new_text = new_text.split('front')[1].split('end')[0]

In [None]:
# define terms that need to be removed ad hoc
# the entries are placeholders; remove_words passes each one to re.sub as a regex pattern
new_text_to_rm = ['headers', 'footers', 'oddities']

In [None]:
# build a dictionary for the text
# be sure to mark if capitals should not be removed or if footnotes are bracketed
# title, author and school are not defined anywhere in this notebook - assign them
# (or replace them with string literals) before running this cell
new_text_dict = {'title': title,
                 'author': author,
                 'school': school,
                 'text': new_text,
                 'words to remove': new_text_to_rm,
                 'remove capitals': True,
                 'bracketed fn': False}

In [None]:
# turn the text into a dataframe
new_text_df = from_raw_to_df(new_text_dict)

# number of sentences the text was split into
len(new_text_df)

In [None]:
# preview a random sample of the sentences
# (sample() raises ValueError when asked for more rows than exist, so cap the size)
new_text_df.sample(min(10, len(new_text_df)))

In [None]:
# examine short entries
new_text_df['sentence_length'] = new_text_df['sentence_str'].map(len)
short_entries_df = new_text_df[new_text_df['sentence_length'] < 20]
num_of_short_entries = len(short_entries_df)
print(f"there are {num_of_short_entries} so-called sentences with fewer than 20 characters")
# sample() raises ValueError when asked for more rows than exist, so cap the preview
short_entries_df.sample(min(5, num_of_short_entries))

In [None]:
# drop short entries if you choose
# (rows whose sentence_length is below 20 are removed by index label)
is_short = new_text_df['sentence_length'] < 20
short_labels = new_text_df.loc[is_short].index
new_text_df = new_text_df.drop(index=short_labels)
len(new_text_df)

In [None]:
# check for self-reference (especially if it is an older thinker)
# 'author name' is a placeholder - replace it with the author's name
# from_raw_to_df does not create a 'sentence_lowered' column, so the sentences are
# lowercased here; the raw string keeps \s as a regex whitespace class
author_pattern = r'\s' + 'author name'.lower()
mentions_author = new_text_df['sentence_str'].str.lower().str.contains(author_pattern)
self_mention_df = new_text_df[mentions_author].copy()

len(self_mention_df)

In [None]:
# preview to see if they seem problematic
# (there are often fewer than 10 self-mentions, and sample() raises ValueError
# when asked for more rows than exist, so cap the size)
self_mention_df.sample(min(10, len(self_mention_df)))

In [None]:
# drop self-referring entries
# 'author name' is a placeholder - use the same name as in the check above
# the pattern previously began with '/s' (a literal slash and s) instead of the
# whitespace class \s, and read a 'sentence_lowered' column that is never created
author_pattern = r'\s' + 'author name'.lower()
mentions_author = new_text_df['sentence_str'].str.lower().str.contains(author_pattern)
new_text_df = new_text_df.drop(new_text_df[mentions_author].index)

len(new_text_df)

In [None]:
# check the number of duplicates
# (counts every row whose sentence_str repeats an earlier row)
int(new_text_df['sentence_str'].duplicated().sum())

In [None]:
# examine the duplicate entries
# keep=False flags every copy of a repeated sentence; the stable sort puts the copies
# next to each other. Unlike concatenating groupby groups, this also works when
# there are no duplicates at all (the result is then simply empty)
is_duplicated = new_text_df['sentence_str'].duplicated(keep=False)
doubles_df = new_text_df[is_duplicated].sort_values('sentence_str', kind='stable')
# sample() raises ValueError when asked for more rows than exist, so cap the preview
doubles_df.sample(min(5, len(doubles_df)))

In [None]:
# drop the duplicates
# keep=False flags every copy, so ALL copies of a duplicated sentence are dropped;
# use keep='first' instead if one copy should survive
# (the previous version of this line had unbalanced brackets and could not run)
duplicated_rows = new_text_df[new_text_df['sentence_str'].duplicated(keep=False)]
new_text_df = new_text_df.drop(duplicated_rows.index)

In [None]:
# you're almost done! merge with the main dataframe
# (DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way,
# keeping each frame's own index labels)
main_df = pd.concat([main_df, new_text_df])

In [None]:
# double check the length to make sure all is well
# (it should equal the old length plus the number of rows just added)
len(main_df)

In [None]:
# if you're confident all went well, re-export
# be sure, since this will overwrite the old file
# NOTE(review): to_csv writes the index as an unnamed first column by default, and the
# load cell reads this file back without index_col - confirm whether index=False is wanted
main_df.to_csv('phil_nlp.csv') 
# hand the exported file to the browser as a download (Colab helper)
files.download('phil_nlp.csv')