In [123]:
import nltk
from nltk.corpus import wordnet as wn
from collections import OrderedDict
from multiprocessing import Pool

f = open("cryptonomicon_engl.txt_Ascii.txt", "r", encoding="latin-1")

# chapters is a dict with keys that are chapter names and values that are chapter's content
# chapters in a book are supposed to be separated by a line with a "chapter " keyword in it
chapters = OrderedDict()
chapter = ''
chapter_title = ''

line = f.readline().lower()
while line:
    # next chapter
    if 'chapter ' in line:
        chapter_title = line
        chapters[chapter_title] = chapter
        chapter = ''
    
    chapter += line    
    line = f.readline().lower()
    
# TODO: do the same with "from wordnik import *"

In [124]:
# tokenize all chapters
def tokenize_chapter(title, text):
    tokens = nltk.wordpunct_tokenize(text)
    tokens = [w.lower() for w in tokens if w.isalpha()]
    pos_tokens = nltk.pos_tag(tokens)
    return (title, pos_tokens)

# Parallel(n_jobs = multiprocessing.cpu_count())(delayed(tokenized_chapters)(title, text) for title, text in chapters.items())
pool = Pool()
result = pool.starmap(tokenize_chapter, [[title, text] for title, text in chapters.items()])

for t in result:
    chapters[t[0]] = t[1]

In [137]:
# find least common adjectives for each chapter and add them to dict
tokenized_chapters = OrderedDict()
for title, tokens in chapters.items():
    adjectives = [(word, token) for word, token in tokens if token == "JJ"]
    adjectives_freq = nltk.FreqDist(adjectives)
    
    adjectives_keys = adjectives_freq.keys()
    adjs = [wn.synsets(w, pos=wn.ADJ) for w, t in adjectives_keys]

    # drop na
    adjs = list(filter(None, adjs))

    # flatten list
    adjs = [item for sublist in adjs for item in sublist]
    adjs = [(item.lemma_names()[0], item.definition()) for item in adjs]
    
    fraction = int(len(adjs) * 0.3)
    least_common_adj =  adjs[-fraction:]
    
    adj_dict = {}
    
    for adj in least_common_adj:
        if adj[0] not in adj_dict:
            adj_dict[adj[0]] = adj[1]
        else:
            adj_dict[adj[0]] += "; " + adj[1]
    

    tokenized_chapters[title] = adj_dict

In [140]:
from pylatex import Document, Section, Subsection, Command, Itemize, Enumerate, Description
from pylatex.utils import italic, NoEscape

doc = Document("Thesaurus")
with doc.create(Section("Adjectives")):
    for title, token_dict in tokenized_chapters.items():
        with doc.create(Subsection(title)):
            with doc.create(Description()) as desc:
                    for name, description in token_dict.items():
                        desc.add_item(name, description)

doc.generate_pdf("Adjectives")

{'last': 'immediately past; coming after all others in time or space or degree or being the only one remaining; most unlikely or unsuitable; occurring at the time of death; highest in extent or degree; lowest in rank or importance', 'cheeseparing': 'giving or spending with reluctance', 'biased': 'favoring one person or side over another', 'several': '(used with count nouns) of an indefinite number more than 2 or 3 but not many; distinct and individual', 'final': 'conclusive in a process or progression; not to be altered or undone', 'other': 'not the same one or ones already mentioned or implied; recently past; very unusual; different in character or quality from the normal or expected', 'square': 'having four equal sides and four right angles or forming a right angle; leaving no balance; without evasion or compromise; rigidly conventional or old-fashioned', 'provincial': 'of or associated with a province; characteristic of the provinces or their people', 'retentive': 'good at rememberi