Skip to content

Commit

Permalink
process now removes sentences unlikely to have meaning, or with only …
Browse files Browse the repository at this point in the history
…one word
  • Loading branch information
jaysonmc committed Jul 30, 2020
1 parent 4a821f3 commit c0e96e3
Showing 1 changed file with 38 additions and 6 deletions.
44 changes: 38 additions & 6 deletions project/processtext.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# from __future__ import unicode_literals, print_function
import spacy
import en_core_web_sm
import settings
from pathlib import Path
import os
Expand All @@ -9,6 +10,31 @@ class ProcessText():

def __init__(self):
self.processText()

def __removeWhitespace(self, line):
    """Strip surrounding whitespace from a single line.

    Args:
        line: One line of text, without its trailing newline.

    Returns:
        The stripped text followed by ``os.linesep``, or an empty string
        when the line is empty or contains only whitespace (so blank
        lines vanish from the concatenated output).
    """
    # Strip once and reuse the result for both the test and the value.
    stripped = line.strip()
    return stripped + os.linesep if stripped else ""

def __removeMeaninglessSentences_helper(self, line):
    """Keep or drop one line based on its spaCy part-of-speech tags.

    A line is dropped (an empty string is returned) when either:
      * it tokenizes into at most one token (empty or single-word), or
      * it has six tokens or fewer and its final token is tagged
        ADJ, ADP, PROPN or NUM.

    NOTE(review): only the LAST token's tag influences the decision.
    This matches the earlier per-token loop, which overwrote its flag on
    every iteration so that only the final token counted. If the intent
    was "any token" or "all tokens" carry one of these tags, this rule
    needs changing -- confirm with the author.

    Args:
        line: One line of text, without its trailing newline.

    Returns:
        ``line + os.linesep`` when the line is kept, otherwise ``""``.
    """
    # Loading the spaCy model is expensive; do it once per instance
    # instead of once per processed line.
    nlp = getattr(self, "_meaningful_nlp", None)
    if nlp is None:
        nlp = en_core_web_sm.load()
        self._meaningful_nlp = nlp

    doc = nlp(line)
    token_count = len(doc)

    # Empty lines and one-word lines are never kept.
    if token_count <= 1:
        return ""

    # Short lines whose final token has one of these tags are dropped.
    if token_count <= 6 and doc[-1].pos_ in ("ADJ", "ADP", "PROPN", "NUM"):
        return ""

    return line + os.linesep

def tokenizeSentences(self, raw_text):
nlp = spacy.load("en_core_web_sm")
Expand All @@ -20,15 +46,21 @@ def processText(self):
pathlist = Path(settings.OUTPUT_FOLDER).rglob('*.html')
for path in pathlist:
output_contents = open(path, "r").read()
output_contents = self.cleanText(output_contents)
print("Cleaning white space from " + path.name)
output_contents = self.cleanWhitespace(output_contents)
print("Removing non-meaningful sentences " + path.name)
output_contents = self.removeMeaninglessSentences(output_contents)
open(settings.PROCESSED_FOLDER + '/' +
path.name, "w+").write('\n'.join(self.tokenizeSentences(output_contents)))

def cleanText(self, output_contents):
    """Alias for :meth:`cleanWhitespace`, kept under the method's old name.

    Retained so that code still calling ``cleanText`` keeps working.
    """
    return self.cleanWhitespace(output_contents)

def cleanWhitespace(self, output_contents):
    """Strip every line of ``output_contents`` and drop blank lines.

    Args:
        output_contents: The full text to clean.

    Returns:
        A single string made of the per-line results of
        ``__removeWhitespace`` concatenated in their original order.
    """
    # join() avoids repeated string reallocation on large inputs.
    return "".join(
        self.__removeWhitespace(line)
        for line in output_contents.splitlines()
    )

def removeMeaninglessSentences(self, output_contents):
    """Filter ``output_contents`` line by line.

    Each line is passed to ``__removeMeaninglessSentences_helper``, which
    returns either the line followed by a line separator or an empty
    string.

    Args:
        output_contents: The full text to filter.

    Returns:
        The helper's per-line results concatenated in their original
        order.
    """
    # join() avoids repeated string reallocation on large inputs.
    return "".join(
        self.__removeMeaninglessSentences_helper(line)
        for line in output_contents.splitlines()
    )

0 comments on commit c0e96e3

Please sign in to comment.