In [1]:
import pickle
import re
import string

from collections import Counter

from nltk.parse import CoreNLPParser
import stanza
from tqdm.notebook import tqdm_notebook as tqdm

from book_processor.Book import dump_books_by_genre
from notebook_utils.constants import PROJ_ROOT, NEW_GENRES
from notebook_utils.data_loader import load_all_books

# nlp = stanza.Pipeline("en", processors="tokenize", use_gpu=True)

In [2]:
# Clause-level constituent labels (Penn Treebank bracketing tag set).
clausal = ["S", "SBAR", "SBARQ", "SINV", "SQ"]
# Phrase-level constituent labels (Penn Treebank bracketing tag set).
# "WHADVP" (wh-adverb phrase) was previously misspelled "WHAVP", so the parser's
# WHADVP nodes never matched this list and were miscounted as POS tags.
phrasal = ["ADJP", "ADVP", "CONJP", "FRAG", "INTJ", "LST", "NAC", "NP", "NX", "PP", "PRN", "PRT", "QP", "RRC", "UCP", "VP",
           "WHADJP", "WHADVP", "WHNP", "WHPP", "X"]


def traverse(tree, tags):
    """Tally the label of every node below ``tree`` into ``tags`` (mutated in place).

    The label of ``tree`` itself is not counted; only its descendants are.

    Args:
        tree: An iterable of child nodes. Children that are ``str`` are treated
            as leaf tokens and ignored; every other child must provide a
            ``label()`` method and be iterable itself (e.g. ``nltk.Tree``).
        tags: A dict with the keys ``"clausal"``, ``"phrasal"`` and ``"pos"``,
            each mapping to a ``{label: count}`` dict that is updated.

    Returns:
        None.
    """
    for subtree in tree:
        if isinstance(subtree, str):
            # Leaf token: carries no label.
            continue

        label = subtree.label()
        if label in clausal:
            bucket = tags["clausal"]
        elif label in phrasal:
            bucket = tags["phrasal"]
        elif label not in string.punctuation:
            # This is a substring test against string.punctuation, so
            # single-character punctuation labels such as "," or "." are
            # dropped, while "''" and "``" are not substrings and are counted.
            bucket = tags["pos"]
        else:
            bucket = None

        if bucket is not None:
            bucket[label] = bucket.get(label, 0) + 1

        # Always descend. The previous `len(subtree) > 1` guard stopped at every
        # single-child node, so the POS tag under e.g. (NP (PRP He)) was never
        # counted. Descending into a pre-terminal is harmless because its
        # string leaves are skipped above.
        traverse(subtree, tags)


# Books grouped by genre; the parsing cell indexes this as all_books[genre][i].
all_books = load_all_books()

# Per-genre accumulators: one list of per-book count rows for each tag family.
# The parsing cell appends to the "clausal", "phrasal" and "pos" lists and then
# pickles them, so these keys must exist (without them it fails with KeyError).
# NOTE(review): the "G" / "GG" / "g" / "gG" buckets are not used by any cell
# visible here; they are kept in case cells outside this view read them —
# TODO confirm and remove.
all_tags = {
    genre: {"clausal": [], "phrasal": [], "pos": [], "G": [], "GG": [], "g": [], "gG": []}
    for genre in NEW_GENRES
}

In [3]:
# Book numbers that are skipped without being parsed.
# NOTE(review): this notebook does not record why these books are excluded — TODO document.
SKIPPED_BOOK_NUMBERS = ("19513", "19640", "19678", "19782", "19836", "22326", "1322")
# Progress-bar budget reserved for every book (attribute name suggests at most
# 1000 sentences per book — TODO confirm).
SENTENCES_PER_BOOK = 1000
# Tag families produced by traverse(); also used in the output file names.
TAG_TYPES = ("clausal", "phrasal", "pos")
# Annotators requested from the CoreNLP server for each sentence.
PARSE_PROPERTIES = {"annotators": "tokenize,ssplit,pos,parse"}


def _is_reportable_tag(tag):
    """Return True when ``tag`` should appear in a per-book row (drops quote and punctuation tags)."""
    return tag != "''" and tag != "``" and tag not in string.punctuation


bar_length = sum(len(all_books[genre]) for genre in NEW_GENRES) * SENTENCES_PER_BOOK

parser = CoreNLPParser()

with tqdm(total=bar_length) as pbar:
    for genre in NEW_GENRES:
        genre_books = all_books[genre]
        for i, book in enumerate(genre_books):
            pbar.set_postfix_str(f" -- {genre} -- [{i + 1}/{len(genre_books)}] ")
            if book.book_number in SKIPPED_BOOK_NUMBERS:
                pbar.update(SENTENCES_PER_BOOK)
                continue

            # Number of progress ticks already emitted for this book.
            ticks = 0
            try:
                # Running totals per tag family, accumulated sentence by sentence.
                counts = {tag_type: Counter() for tag_type in TAG_TYPES}

                for sentence in book.first_1k_sentences:
                    results = list(parser.raw_parse(sentence, properties=PARSE_PROPERTIES))
                    sent_tags = {tag_type: {} for tag_type in TAG_TYPES}
                    traverse(results[0], sent_tags)

                    for tag_type in TAG_TYPES:
                        counts[tag_type].update(sent_tags[tag_type])

                    pbar.update(1)
                    ticks += 1

                # One row per tag family: identifying columns, then tag counts, then the outcome.
                full_book_data = {}
                for tag_type in TAG_TYPES:
                    row = {"Book #": book.book_number, "@Genre": genre}
                    row.update({tag: n for tag, n in counts[tag_type].items() if _is_reportable_tag(tag)})
                    row["@Outcome"] = book.success
                    full_book_data[tag_type] = row

                book.first_1k_pos_counts = full_book_data

                for tag_type in TAG_TYPES:
                    # setdefault: do not rely on all_tags having been created with these keys.
                    all_tags[genre].setdefault(tag_type, []).append(full_book_data[tag_type])

            except (AssertionError, RuntimeError):
                # Best effort: a book that fails is reported and left out of the results.
                print(f"{genre}, {book.success}, {book.book_number}")
                continue

            finally:
                # Top the bar up to this book's full budget. Runs on success and on
                # failure, so a book that fails part-way no longer advances the bar
                # by an extra 1000 on top of the ticks already counted.
                if ticks < SENTENCES_PER_BOOK:
                    pbar.update(SENTENCES_PER_BOOK - ticks)

        # dump_books_by_genre(all_books[genre], genre)

        for tag_type in TAG_TYPES:
            with open(str(PROJ_ROOT.joinpath("data", f"{genre}_{tag_type}_data.txt")), "wb") as f:
                try:
                    pickle.dump(all_tags[genre].get(tag_type, []), f)
                except MemoryError:
                    print(f"There was a MemoryError when dumping {genre}_{tag_type}_data")


HBox(children=(FloatProgress(value=0.0, max=799000.0), HTML(value='')))


######## Dumping books as pickle object to C:\Users\hgore\OneDrive - Fordham University\Documents\Fordham\Year 2\Research\Projects\CSE-538-Project-master\data\books_by_genre\all_Adventure_Stories_books.txt ########

######## Dumping books as pickle object to C:\Users\hgore\OneDrive - Fordham University\Documents\Fordham\Year 2\Research\Projects\CSE-538-Project-master\data\books_by_genre\all_Fiction_books.txt ########

######## Dumping books as pickle object to C:\Users\hgore\OneDrive - Fordham University\Documents\Fordham\Year 2\Research\Projects\CSE-538-Project-master\data\books_by_genre\all_Historical_Fiction_books.txt ########

######## Dumping books as pickle object to C:\Users\hgore\OneDrive - Fordham University\Documents\Fordham\Year 2\Research\Projects\CSE-538-Project-master\data\books_by_genre\all_Love_Stories_books.txt ########

######## Dumping books as pickle object to C:\Users\hgore\OneDrive - Fordham University\Documents\Fordham\Year 2\Research\Projects\CSE-538-Project-ma