# Set up

In [18]:
#!conda update conda -y
#!conda install Cython -y

In [19]:
#%pip install -U pandas
#%pip install -U pyLDAvis

In [20]:
## imports
import os, sys
import pprint as pp

In [21]:
## 一つ上の階層のファイルを見るように設定
sys.path.append(os.path.join(os.path.dirname("__file__"), '..'))

In [22]:
## Cython module の生成 (必要に応じて)
#!python clean setup.py build_ext --inplace

In [23]:
## Cython を使うかどうか
use_Cython = True
if use_Cython:
    %load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [24]:
## doc
doc_max_size = 30 # max count of words in a sentence
doc_min_size = 5  # min count of words in a sentence

## term: w_skippy4gram often fails
term_types    = [ 'w_1gram', 'w_2gram', 'w_3gram', 'w_4gram',
                'w_skippy2gram', 'w_skippy3gram', 'w_skippy4gram' ]
term_type     = term_types[-1]
# n-gram
ngram_is_inclusive  = True
ngram_inclusiveness = 1 # When the value is k, n_gram contains (n-k)-grams
## skippy n-grams
gap_mark      = "…"
max_gap_ratio = 0.23
max_gap_val   = round(doc_max_size * max_gap_ratio)

## check
print(f"term_type: {term_type}")
print(f"max_gap_val for skippy n-gram: {max_gap_val}")

term_type: w_skippy4gram
max_gap_val for skippy n-gram: 7


In [25]:
## HDP: The following parameters need to be relatively large for HDP, unlike LDA
bot_min_size     = doc_min_size
term_minfreq     = 3
abuse_threshold  = 0.1
n_docs_to_show   = 10
n_terms_to_show  = 15
# flags for HDP with larger topics
explore_90_topics  = False
explore_120_topics = False
explore_150_topics = False

# Get data from files

In [26]:
## variables
random_target = False

## Get target files
import glob
data_dir = "data/Darwin-texts/single-lined/"
target_files = glob.glob(f"{data_dir}/*")
target_files = [ file for file in target_files if ".txt" in file ]
pp.pprint(target_files)

['data/Darwin-texts/single-lined/sl-2485-body.utf-8.txt',
 'data/Darwin-texts/single-lined/sl-1227-body.utf-8.txt']


In [27]:
## Read data from files
import random
import pandas as pd

if random_target:
    file = random.choice(target_files)
else:
    file = target_files[0]
print(f"processing: {file}")
#
if file.endswith(".csv"):
    with open(file, "rt") as f:
        raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = ['sentence'])
elif file.endswith(".txt"):
    with open(file, "rt") as f:
        raw_df = pd.read_table(f, encoding = 'utf8', header = None, names = ['sentence'])
elif file.endswith(".xlsx"):
    with open(file, "rb") as f:
        raw_df = pd.read_excel(f, index_col = 0)
# 
raw_df.sample(10)

processing: data/Darwin-texts/single-lined/sl-2485-body.utf-8.txt


Unnamed: 0,sentence
1348,"When the stem is secured, the tendrils are see..."
487,It is remarkable that the average rate of revo...
1090,The tendrils after clasping a stick become thi...
294,"So it is with two species of _Ceropegia_, as I..."
1556,"In Anguria the lower surface of the tendril, a..."
439,One of the ends touched the angle between a te...
1366,To show the difference in the kind of sensitiv...
957,Thus the plant is secured to its support in a ...
800,"They now seized a thin, upright stick horizont..."
68,"The weather being hot, the plant was allowed t..."


In [28]:
## build w1gram
import re

w_1grams = raw_df['sentence'].apply(lambda x: re.split(r"\s+", x))

## convert to lowercase
w_1grams = [ [ x.lower() for x in w1gram ] for w1gram in w_1grams ]

## remove ineffective characters
removed_chars = r"[-.,:;!?()_\"\'“”‘’]"
w_1grams = [ [ re.sub(removed_chars, "", x) for x in w_1gram ] for w_1gram in w_1grams ]

## exclude single-character words
#w1grams = [ [x for x in w1gram if len(x) > 1 ] for w1gram in w1grams ]
[ words[:5] for words in random.sample(w_1grams, 3) ]

[['hence', 'if', 'the', 'leaves', 'on'],
 ['see', 'dr', 'h', 'de', 'vries'],
 ['the', 'surfaces', 'of', 'these', 'balls']]

In [29]:
## remove too frequent words
from collections import Counter
all_words = [ ]
[ all_words.extend(x) for x in w_1grams ] 
word_counts = Counter(all_words)
reduct_rate = 0.003 # needs to be optimized text-wise
too_frequents = word_counts.most_common(round(len(word_counts) * reduct_rate))
pp.pprint(too_frequents)
print(f"number of removed items: {len(too_frequents)}")

[('the', 4019),
 ('of', 1941),
 ('a', 1519),
 ('and', 1439),
 ('in', 1360),
 ('to', 1025),
 ('is', 527),
 ('it', 509),
 ('that', 486),
 ('as', 473),
 ('with', 460)]
number of removed items: 11


In [30]:
## exclude too frequent words
w_1grams = [ [ x for x in w1gram if not x in too_frequents ] for w1gram in w_1grams ]
raw_df['w_1gram'] = w_1grams
raw_df['size'] = raw_df['w_1gram'].apply(lambda x: len(x))
raw_df

Unnamed: 0,sentence,w_1gram,size
0,The Movement and Habits of Climbing Plants by...,"[the, movement, and, habits, of, climbing, pla...",10
1,PREFACE,[preface],1
2,THIS Essay first appeared in the ninth volume ...,"[this, essay, first, appeared, in, the, ninth,...",18
3,"It is here reproduced in a corrected and, I ho...","[it, is, here, reproduced, in, a, corrected, a...",16
4,"The illustrations were drawn by my son, George...","[the, illustrations, were, drawn, by, my, son,...",9
...,...,...,...
1853,"Sachs’ ‘Text-Book of Botany’ 1875, pp. 766, 785.","[sachs, textbook, of, botany, 1875, pp, 766, 785]",8
1854,Fritz Müller also has shown in relation to our...,"[fritz, müller, also, has, shown, in, relation...",47
1855,Mr. Herbert Spencer has recently argued (‘Prin...,"[mr, herbert, spencer, has, recently, argued, ...",31
1856,"Annales des Sc. Nat. 4th series, Bot. tom. vi....","[annales, des, sc, nat, 4th, series, bot, tom,...",12


In [31]:
## define df by filtering by length
print(f"originally: {len(raw_df)}")
df = raw_df[ (doc_min_size <= raw_df['size']) & (raw_df['size'] <= doc_max_size) ]
print(f"after filtering: {len(df)}")

originally: 1858
after filtering: 1031


In [32]:
df

Unnamed: 0,sentence,w_1gram,size
0,The Movement and Habits of Climbing Plants by...,"[the, movement, and, habits, of, climbing, pla...",10
2,THIS Essay first appeared in the ninth volume ...,"[this, essay, first, appeared, in, the, ninth,...",18
3,"It is here reproduced in a corrected and, I ho...","[it, is, here, reproduced, in, a, corrected, a...",16
4,"The illustrations were drawn by my son, George...","[the, illustrations, were, drawn, by, my, son,...",9
7,These memoirs ought to be carefully studied by...,"[these, memoirs, ought, to, be, carefully, stu...",26
...,...,...,...
1850,The aërial roots of some other species of Phil...,"[the, aërial, roots, of, some, other, species,...",21
1851,"Quoted by Cohn, in his remarkable memoir, “Con...","[quoted, by, cohn, in, his, remarkable, memoir...",20
1852,"Such slight spontaneous movements, I now find,...","[such, slight, spontaneous, movements, i, now,...",30
1853,"Sachs’ ‘Text-Book of Botany’ 1875, pp. 766, 785.","[sachs, textbook, of, botany, 1875, pp, 766, 785]",8


# Build n-grams

In [39]:
## generic function for n-gram generation: for words, seg_joint = " "
reload_module_on_run = False
if reload_module_on_run:
    import importlib
def add_ngram_to_df(dfx, n_for_ngram: int, seg_joint: str = " ", var_prefix: str = "", ngram_is_skippy: bool = False, ngram_is_inclusive: bool = ngram_is_inclusive, ngram_inclusiveness: int = ngram_inclusiveness, use_Cython: bool = use_Cython, check: bool = False):
    """
    generic function for adding n-gram column to df with a specified n for ngram
    """
    print(f"use_Cython: {use_Cython}")
    inclusion_size = (n_for_ngram - ngram_inclusiveness)
    print(f"inclusion_size: {inclusion_size}")
    assert inclusion_size >= 0
    source_var = f"{var_prefix}1gram"
    print(f"source_var: {source_var}")
    unigrams = df[source_var]
    if use_Cython:
        import cy_gen_ngrams
        if reload_module_on_run:
            importlib.reload(cy_gen_ngrams)
        if ngram_is_skippy:
            ngrams = [ [seg_joint.join(x) for x in cy_gen_ngrams.cy_gen_skippy_ngrams(x, n = n_for_ngram, check = False)] for x in unigrams ]
        else:
            ngrams = [ [seg_joint.join(x) for x in cy_gen_ngrams.cy_gen_ngrams(x, n = n_for_ngram, check = False)] for x in unigrams ]
    else:
        import gen_ngrams
        if ngram_is_skippy:
            ngrams = [ gen_ngrams.gen_skippy_ngrams(x, n = n_for_ngram, sep = seg_joint, check = False) for x in unigrams ]
        else:
            ngrams = [ gen_ngrams.gen_ngrams(x, n = n_for_ngram, sep = seg_joint, check = False) for x in unigrams ]
    ## 包括的 2gramの生成
    if ngram_is_inclusive:
        assert (n_for_ngram - inclusion_size) > 0
        if ngram_is_skippy and n_for_ngram > 2:
            supplement_var = f"{var_prefix}skippy{n_for_ngram - 1}gram"
        else:
            supplement_var = f"{var_prefix}{n_for_ngram - 1}gram"
        print(f"supplement_var: {supplement_var}")
        for i, g in enumerate(ngrams):
            included = [ x for x in list(dfx[supplement_var])[i] if len(x) >= inclusion_size ]
            if len(included) > 0:
                g.extend(included)
    ## 変数の追加
    if ngram_is_skippy:
        added_var = f"{var_prefix}skippy{n_for_ngram}gram"
    else:
        added_var = f"{var_prefix}{n_for_ngram}gram"
    print(f"added_var: {added_var}")
    dfx[added_var] = ngrams
    ## check result
    print(dfx[added_var])

In [40]:
## word 2grams
add_ngram_to_df(df, n_for_ngram = 2, var_prefix = "w_", seg_joint = " ", ngram_is_skippy = False, ngram_is_inclusive = True, check = False)

use_Cython: True
inclusion_size: 1
source_var: w_1gram
supplement_var: w_1gram
added_var: w_2gram
0       [the movement, movement and, and habits, habit...
2       [this essay, essay first, first appeared, appe...
3       [it is, is here, here reproduced, reproduced i...
4       [the illustrations, illustrations were, were d...
7       [these memoirs, memoirs ought, ought to, to be...
                              ...                        
1850    [the aërial, aërial roots, roots of, of some, ...
1851    [quoted by, by cohn, cohn in, in his, his rema...
1852    [such slight, slight spontaneous, spontaneous ...
1853    [sachs textbook, textbook of, of botany, botan...
1856    [annales des, des sc, sc nat, nat 4th, 4th ser...
Name: w_2gram, Length: 1031, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfx[added_var] = ngrams


In [41]:
## word 3grams
add_ngram_to_df(df, n_for_ngram = 3, var_prefix = "w_", seg_joint = " ", ngram_is_skippy = False, ngram_is_inclusive = True, check = False)

use_Cython: True
inclusion_size: 2
source_var: w_1gram
supplement_var: w_2gram
added_var: w_3gram
0       [the movement and, movement and habits, and ha...
2       [this essay first, essay first appeared, first...
3       [it is here, is here reproduced, here reproduc...
4       [the illustrations were, illustrations were dr...
7       [these memoirs ought, memoirs ought to, ought ...
                              ...                        
1850    [the aërial roots, aërial roots of, roots of s...
1851    [quoted by cohn, by cohn in, cohn in his, in h...
1852    [such slight spontaneous, slight spontaneous m...
1853    [sachs textbook of, textbook of botany, of bot...
1856    [annales des sc, des sc nat, sc nat 4th, nat 4...
Name: w_3gram, Length: 1031, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfx[added_var] = ngrams


In [42]:
## word 4grams
add_ngram_to_df(df, n_for_ngram = 4, var_prefix = "w_", seg_joint = " ", ngram_is_skippy = False, ngram_is_inclusive = True, check = False)

use_Cython: True
inclusion_size: 3
source_var: w_1gram
supplement_var: w_3gram
added_var: w_4gram
0       [the movement and habits, movement and habits ...
2       [this essay first appeared, essay first appear...
3       [it is here reproduced, is here reproduced in,...
4       [the illustrations were drawn, illustrations w...
7       [these memoirs ought to, memoirs ought to be, ...
                              ...                        
1850    [the aërial roots of, aërial roots of some, ro...
1851    [quoted by cohn in, by cohn in his, cohn in hi...
1852    [such slight spontaneous movements, slight spo...
1853    [sachs textbook of botany, textbook of botany ...
1856    [annales des sc nat, des sc nat 4th, sc nat 4t...
Name: w_4gram, Length: 1031, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfx[added_var] = ngrams


In [43]:
## word skippy 2grams
add_ngram_to_df(df, n_for_ngram = 2, var_prefix = "w_", seg_joint = " ", ngram_is_skippy = True, ngram_is_inclusive = True, check = False) # For words, seg_joint = " "

use_Cython: True
inclusion_size: 1
source_var: w_1gram
supplement_var: w_1gram
added_var: w_skippy2gram
0       [the movement, the … and, the … habits, the … ...
2       [this essay, this … first, this … appeared, th...
3       [it is, it … here, it … reproduced, it … in, i...
4       [the illustrations, the … were, the … drawn, t...
7       [these memoirs, these … ought, these … to, the...
                              ...                        
1850    [the aërial, the … roots, the … of, the … some...
1851    [quoted by, quoted … cohn, quoted … in, quoted...
1852    [such slight, such … spontaneous, such … movem...
1853    [sachs textbook, sachs … of, sachs … botany, s...
1856    [annales des, annales … sc, annales … nat, ann...
Name: w_skippy2gram, Length: 1031, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfx[added_var] = ngrams


In [44]:
## word skippy 3grams
add_ngram_to_df(df, n_for_ngram = 3, var_prefix = "w_", seg_joint = " ", ngram_is_skippy = True, ngram_is_inclusive = True, check = False) # For words, seg_joint = " "

use_Cython: True
inclusion_size: 2
source_var: w_1gram
supplement_var: w_skippy2gram
added_var: w_skippy3gram
0       [the movement and, the movement … habits, the ...
2       [this essay first, this essay … appeared, this...
3       [it is here, it is … reproduced, it is … in, i...
4       [the illustrations were, the illustrations … d...
7       [these memoirs ought, these memoirs … to, thes...
                              ...                        
1850    [the aërial roots, the aërial … of, the aërial...
1851    [quoted by cohn, quoted by … in, quoted by … h...
1852    [such slight spontaneous, such slight … moveme...
1853    [sachs textbook of, sachs textbook … botany, s...
1856    [annales des sc, annales des … nat, annales de...
Name: w_skippy3gram, Length: 1031, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfx[added_var] = ngrams


In [45]:
## word skippy 4grams
if term_type == "w_skippy4gram":
    add_ngram_to_df(df, n_for_ngram = 4, var_prefix = "w_", seg_joint = " ", ngram_is_skippy = True, ngram_is_inclusive = True, check = False) # For words, seg_joint = " "

use_Cython: True
inclusion_size: 3
source_var: w_1gram
supplement_var: w_skippy3gram
added_var: w_skippy4gram
0       [the movement and habits, the movement and … o...
2       [this essay first appeared, this essay first …...
3       [it is here reproduced, it is here … in, it is...
4       [the illustrations were drawn, the illustratio...
7       [these memoirs ought to, these memoirs ought …...
                              ...                        
1850    [the aërial roots of, the aërial roots … some,...
1851    [quoted by cohn in, quoted by cohn … his, quot...
1852    [such slight spontaneous movements, such sligh...
1853    [sachs textbook of botany, sachs textbook of …...
1856    [annales des sc nat, annales des sc … 4th, ann...
Name: w_skippy4gram, Length: 1031, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfx[added_var] = ngrams


# DTM

In [46]:
#df['sentence']

In [47]:
## build doc_dict
doc_dict = { i : x for i, x in enumerate(df['sentence']) }
pp.pprint(random.sample(list(doc_dict.items()), 5)) # list(...) is needed after 3.10

[(815,
  'In short, we have given to the string the regular spiral arrangement of a '
  'tendril caught at both ends.'),
 (828,
  'This experiment can be tried only on the thicker tendrils, which are not '
  'affected by a thin crust of dried paint.'),
 (705,
  'Discs are never developed, as far as I have seen, without the stimulus of '
  'at least temporary contact with some object.'),
 (536,
  'No doubt all the lines would have been curvilinear if the course had been '
  'observed at much shorter intervals.'),
 (522,
  'The first branch then loosed itself, and, arranging its hooks, again caught '
  'hold.')]


In [48]:
## select bots for DTM
print(f"term_type: {term_type}")
bots = list(df[term_type])
bots = [ bot for bot in bots if len(bot) >= bot_min_size ]
random.sample(bots, 2)

term_type: w_skippy4gram


[['the young internodes sweep',
  'the young internodes … large',
  'the young internodes … circles',
  'the young internodes … one',
  'the young internodes … being',
  'the young internodes … completed',
  'the young internodes … in',
  'the young internodes … 2',
  'the young internodes … hrs',
  'the young internodes … 15',
  'the young internodes … m',
  'the young internodes … and',
  'the young internodes … a',
  'the young internodes … second',
  'the young internodes … in',
  'the young internodes … 2',
  'the young internodes … hrs',
  'the young internodes … 55',
  'the young internodes … m',
  'the young … sweep large',
  'the young … sweep … circles',
  'the young … sweep … one',
  'the young … sweep … being',
  'the young … sweep … completed',
  'the young … sweep … in',
  'the young … sweep … 2',
  'the young … sweep … hrs',
  'the young … sweep … 15',
  'the young … sweep … m',
  'the young … sweep … and',
  'the young … sweep … a',
  'the young … sweep … second',
  'th

In [49]:
## build diction, corpus = dtm
from gensim.corpora import Dictionary

## dtm
diction = Dictionary(bots)
print(diction)

## filtering
diction.filter_extremes(no_below = term_minfreq, no_above = abuse_threshold)
print(diction)

## corpus building: allow_update prevents "Not all rows ..." errror but it takes considerably longer.
## Sanitization with nonzero filtering is more effective.
#corpus = [ diction.doc2bow(bot, allow_update = True) for bot in bots ]
corpus = [ diction.doc2bow(bot) for bot in bots if len(bot) >= doc_min_size ]

Dictionary<7985232 unique tokens: ['and', 'and habits', 'and habits of', 'and habits of climbing', 'and habits of … by']...>
Dictionary<53338 unique tokens: ['and … by', 'and … climbing', 'and … climbing plants', 'and … of climbing', 'and … of climbing plants']...>


In [50]:
## sanitize corpus: Crucial for HDP
original_size = len(corpus)
corpus = [ doc for doc in corpus if len(doc) > 0 ]
print(f"discarded {original_size - len(corpus)} docs")

discarded 1 docs


# Run HDP

In [51]:
## topic investigation のutility function
def investigate_topics(target_hdp, n_docs_to_show: int = n_docs_to_show,
                       n_terms_to_show: int = n_terms_to_show, precision: float = 0.4):
    import numpy as np
    import HDP_helper

    ## collect valid date
    documents_topics = np.zeros([target_hdp.m_T, len(corpus)])
    for doc_id, c in enumerate(corpus):
        for topic_id, prob in target_hdp[c]:
            documents_topics[topic_id][doc_id] = prob
    
    ## investigate topics
    target_hdp.optimal_ordering()
    for topic_id, probs in enumerate(documents_topics):
        print(f"==============")
        topic_t = target_hdp.print_topic(topic_id, topn = n_terms_to_show)
        print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
        print(f"nonzero count: ", len(probs.nonzero()[0]))
        for doc_id in probs.argsort()[::-1][:n_docs_to_show]:
            doc = doc_dict[doc_id]
            print(f"\t{probs[doc_id]:{precision}f}: {doc}")

In [52]:
## HDP (max_n_topics = 15)
import gensim.models
import pyLDAvis.gensim

max_n_topics = 15
hdp15 = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data15 = pyLDAvis.gensim.prepare(hdp15, corpus, diction)
pyLDAvis.display(vis_data15)

In [53]:
## topic investigation
investigate_topics(hdp15)

topic_id 0: 0.001 * for … the … of + 0.001 * to … of + 0.001 * the … as + 0.001 * for … the + 0.001 * for … of + 0.001 * for … with + 0.001 * he … the + 0.001 * the … the … as + 0.001 * to … plants + 0.001 * have … for … the + 0.001 * the … spring … the + 0.001 * to … of … plants + 0.001 * that … the + 0.001 * i … the … of + 0.001 * the … they
nonzero count:  749
	0.9998: The young internodes and tendrils of this anomalous member of the family, revolve in the same manner and at about the same rate as those of the _Echinocystis_.
	0.9997: In certain species of _Tropæolum_, both the spontaneous movements of the internodes and the sensitiveness of the petioles have become much enfeebled, and in one species have been completely lost.
	0.9997: Hence it seems probable in this case and in others, that the curvature of the tendril from a touch depends on the contraction of the cells along the concave side.
	0.9997: When the stem is secured, the tendrils are seen to revolve in nearly the same m

In [54]:
## HDP (max_n_topics = 45)
import gensim.models
import pyLDAvis.gensim

max_n_topics = 45
hdp45 = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data45 = pyLDAvis.gensim.prepare(hdp45, corpus, diction)
pyLDAvis.display(vis_data45)

In [55]:
## topic investigation
investigate_topics(hdp45)

topic_id 0: 0.002 * for … the … of + 0.001 * for … of + 0.001 * to … of + 0.001 * for … the + 0.001 * some … of + 0.001 * he … the + 0.001 * to … plants + 0.001 * for … with + 0.001 * have … for … the + 0.001 * to … of … plants + 0.001 * the … spring … the + 0.001 * the … as + 0.001 * the … they + 0.001 * the … the … as + 0.001 * that … the
nonzero count:  358
	0.9995: All these facts taken together, show that the act of clasping a support and the spiral contraction of the whole length of the tendril, are phenomena not necessarily connected.
	0.9994: I have already referred to the case of the twining stem of Cuscuta, which, according to H. de Vries (ibid. p. 322) is sensitive to a touch like a tendril.
	0.9992: The tendrils of _Dicentra_, whilst the plant is young, are short and after attachment only become slightly flexuous; in older plants they are longer and then they contract spirally.
	0.9992: These threads proceeded from the bark of the rootlet at one end, and at the other end we

In [56]:
## HDP (max_n_topics = 90)
if explore_90_topics:
    import gensim.models
    import pyLDAvis.gensim

    max_n_topics = 90
    hdp90 = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
    vis_data90 = pyLDAvis.gensim.prepare(hdp90, corpus, diction)
    pyLDAvis.display(vis_data90)

In [57]:
## topic investigation
if explore_90_topics:
    investigate_topics(hdp90)

In [58]:
## HDP (max_n_topics = 120)
if explore_120_topics:
    import gensim.models
    import pyLDAvis.gensim

    max_n_topics = 120
    hdp120 = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
    vis_data120 = pyLDAvis.gensim.prepare(hdp120, corpus, diction)
    pyLDAvis.display(vis_data120)

In [59]:
## topic investigation
if explore_120_topics:
    investigate_topics(hdp120)

In [60]:
## HDP (max_n_topics = 150)
if explore_150_topics:
    import numpy as np
    import gensim.models
    import pyLDAvis.gensim

    max_n_topics = 150
    hdp150 = gensim.models.HdpModel(corpus, diction, T = max_n_topics, random_state = 1)
    vis_data150 = pyLDAvis.gensim.prepare(hdp150, corpus, diction)
    pyLDAvis.display(vis_data150)

In [61]:
## topic investigation
if explore_150_topics:
    investigate_topics(hdp150)

In [62]:
## save LDAvis output as a html file
save_LDAvis = True
ntops = [ 15, 45, 90, 120, 150 ]
vis_targets = [ f"vis_data{ntop}" for ntop in ntops ]
print(f"vis_targets: {vis_targets}")
vis_target_data = vis_targets[0]
print(f"vis_target_data: {vis_target_data}")
if save_LDAvis:
	output = f"results/LDAvis/Darwin-HDP-max_ntop{max_n_topics}-{term_type}.html"
	pyLDAvis.save_html(eval(vis_target_data), output)

vis_targets: ['vis_data15', 'vis_data45', 'vis_data90', 'vis_data120', 'vis_data150']
vis_target_data: vis_data15
