In [1]:
# import logging
import pickle

from nltk.parse.stanford import StanfordParser
from tqdm.notebook import tqdm_notebook as tqdm

from notebook_utils.constants import PROJ_ROOT, GENRES

# logging.basicConfig(filename="text_processor_saver.log", filemode="w", format="%(asctime)s::%(levelname)s - %(message)s", level=logging.INFO)

clausal = ["S", "SBAR", "SBARQ", "SINV", "SQ"]
phrasal = ["ADJP", "ADVP", "CONJP", "FRAG", "INTJ", "LST", "NAC", "NP", "NX", "PP", "PRN", "PRT", "QP", "RRC", "UCP", "VP",
           "WHADJP", "WHAVP", "WHNP", "WHPP", "X"]

In [2]:
def traverse(tree, tags):
    for subtree in tree:
        if not isinstance(subtree, str):
            label = subtree.label()
            if label in clausal:
                if label in tags["clausal"].keys():
                    tags["clausal"][label] += 1
                else:
                    tags["clausal"][label] = 1
            elif label in phrasal:
                if label in tags["phrasal"].keys():
                    tags["phrasal"][label] += 1
                else:
                    tags["phrasal"][label] = 1
            if len(subtree) > 1:
                traverse(subtree, tags)


all_tags = {genre: pickle.load(open(str(PROJ_ROOT.joinpath("data", f"{genre}_pos_tags.txt")), "rb+")) for genre in GENRES}
clausal_data = {genre: [] for genre in GENRES}
phrasal_data = {genre: [] for genre in GENRES}

scp = StanfordParser(path_to_jar=str(PROJ_ROOT.joinpath("stanford-parser-full-2018-10-17", "stanford-parser.jar")),
                     path_to_models_jar=str(PROJ_ROOT.joinpath("stanford-parser-full-2018-10-17", "stanford-parser-3.9.2-models.jar")),
                     java_options="-Xmx4G")

Please use [91mnltk.parse.corenlp.CoreNLPParser[0m instead.


In [3]:
bar_length = sum(len(all_tags[genre]) for genre in GENRES) * 1000

with tqdm(total=bar_length) as pbar:
    for genre in GENRES:
        # logging.info(f"Extracting data from {genre} books...")
        for i, (b_num, tagged_sents) in enumerate(all_tags[genre].items()):
            pbar.set_postfix_str(f" -- {genre} -- [{i + 1}/{len(all_tags[genre])}] ")
            # logging.info(f"Analyzing book {i + 1} of {len(all_tags[genre])}")

            book_clausal = []
            book_phrasal = []
            for sentence in tagged_sents:
                sent_tags = {"clausal": {}, "phrasal": {}}
                
                try:
                    parsed = scp.tagged_parse(sentence)
                    results = [item for item in parsed]
                    traverse(results[0], sent_tags)
                    c = {"Book #": b_num, "@Genre": genre}
                    p = {"Book #": b_num, "@Genre": genre}
                    
                    for k, v in sent_tags["clausal"].items():
                        c.update({k: v})
                    for k, v in sent_tags["phrasal"].items():
                        p.update({k: v})
                        
                    book_clausal.append(c)
                    book_phrasal.append(p)
                    pbar.update(1)
                    
                except (AssertionError, RuntimeError, OSError) as e:
                    print(f"{genre}, {i}")
                    pbar.update(1)
                    continue

            if len(tagged_sents) < 1000:
                pbar.update(1000 - len(tagged_sents))

        with open(str(PROJ_ROOT.joinpath("data", f"{genre}_clausal_data.txt")), "wb+") as f:
            try:
                pickle.dump(clausal_data[genre], f)
            except MemoryError:
                print(f"There was a MemoryError when dumping {genre}_clausal_data")
                # logging.error(f"MemoryError dumping {genre}_clausal_data", exc_info=True)

        with open(str(PROJ_ROOT.joinpath("data", f"{genre}_phrasal_data.txt")), "wb+") as f:
            try:
                pickle.dump(phrasal_data[genre], f)
            except MemoryError:
                print(f"There was a MemoryError when dumping {genre}_phrasal_data")
                # logging.error(f"MemoryError dumping {genre}_phrasal_data", exc_info=True)

HBox(children=(FloatProgress(value=0.0, max=792000.0), HTML(value='')))

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
Parsing file: C:\Users\hgore\AppData\Local\Temp\tmp6me0l5zn
Parsing [sent. 1 len. 1078]: or good advice to cats and kittens , " 1859 ; " the world of ice , or adventures in the polar regions , " 1860 ( 1859 ) ; " the dog crusoe , a tale of the western prairies , " 1861 ( 1860 ) ; " the golden dream , or adventures in the far west , " 1861 ( 1860 ) ; " the gorilla hunters , a tale of the wilds of africa , " 1861 ; " the red eric , or the whaler 's last cruise , " 1861 ; " man on the ocean , a book for boys , " 1863 ( 1862 ) ; " the wild man of the west , a tale of the rocky mountains , " 1863 ( 1862 ) ; " gascoyne , the sandal - wood trader , a tale of the pacific , " 1864 ( 1863 ) ; " the lifeboat , a tale of our coast heroes , " 1864 ; " freaks on the fells , or three months ' r

KeyboardInterrupt: 