In [64]:
#%pip install -U pandas

In [65]:
#%pip install -U pyLDAvis

In [66]:
## imports
import os, sys
import pprint as pp

In [67]:
## 一つ上の階層のファイルを見るように設定
sys.path.append(os.path.join(os.path.dirname("__file__"), '..'))

In [68]:
## variables
random_target = False

## HDP: The following parameters need to be relatively large for HDP, unlike LDA
min_bot_size    = 3
term_minfreq    = 3
abuse_threshold = 0.05

## doc
doc_max_size = 30 # max count of words in a sentence
doc_min_size = 5  # min count of words in a sentence

## term
ngram_is_inclusive = True
term_types    = [ 'w1gram', 'w2gram', 'w3gram', 'w4gram',
                'skippy_w2gram', 'skippy_w3gram', 'skippy_w4gram' ]
term_type     = term_types[-1]
## skippy n-grams
gap_mark      = " … "
max_gap_ratio = 0.5
max_gap_val   = round(doc_max_size * max_gap_ratio)

In [None]:
## check
print(f"term_type: {term_type}")
print(f"max_gap for skippy n-gram: {max_gap_val}")

In [69]:
## Get target files
import glob
data_dir = "data/Darwin-texts/single-lined/"
target_files = glob.glob(f"{data_dir}/*")
target_files = [ file for file in target_files if ".txt" in file ]
pp.pprint(target_files)

['data/Darwin-texts/single-lined/sl-2485-body.utf-8.txt',
 'data/Darwin-texts/single-lined/sl-1227-body.utf-8.txt']


In [70]:
## Read data from files
import random
import pandas as pd
if random_target:
    file = random.choice(target_files)
else:
    file = target_files[0]
print(f"processing: {file}")
#
if file.endswith(".csv"):
    with open(file, "rt") as f:
        raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = ['sentence'])
elif file.endswith(".txt"):
    with open(file, "rt") as f:
        raw_df = pd.read_table(f, encoding = 'utf8', header = None, names = ['sentence'])
elif file.endswith(".xlsx"):
    with open(file, "rb") as f:
        raw_df = pd.read_excel(f, index_col = 0)
# 
raw_df.sample(10)

processing: data/Darwin-texts/single-lined/sl-2485-body.utf-8.txt


Unnamed: 0,sentence
1438,"A tendril, on the other hand, which has caught..."
1684,"I have not observed any such cases, and know n..."
664,"They are sensitive on all sides, but in differ..."
436,"When we consider, on the one hand, the thickne..."
462,"These filaments or rudimentary leaves, as well..."
468,The present species would have been classed am...
1847,Fritz Müller informs me that he saw in the for...
1660,But he who believes in the slow modification o...
357,The leaves are so small compared with those of...
1849,"These roots therefore seem to be true twiners,..."


In [71]:
## build w1gram
import re
w1grams = raw_df['sentence'].apply(lambda x: re.split(r"\s+", x))

## convert to lowercase
w1grams = [ [ x.lower() for x in w1gram ] for w1gram in w1grams ]

## remove ineffective characters
removed_chars = r"[-.,:;!?()_\"\'“”‘’]"
w1grams = [ [ re.sub(removed_chars, "", x) for x in w1gram ] for w1gram in w1grams ]

## exclude single-character words
w1grams = [ [x for x in w1gram if len(x) > 1 ] for w1gram in w1grams ]

In [72]:
## remove too frequent words
from collections import Counter
all_words = [ ]
[ all_words.extend(x) for x in w1grams ] 
word_counts = Counter(all_words)
reduct_rate = 0.003 # needs to be optimized text-wise
too_frequents = word_counts.most_common(round(len(word_counts) * reduct_rate))
pp.pprint(too_frequents)
print(f"number of removed items: {len(too_frequents)}")

[('the', 4019),
 ('of', 1941),
 ('and', 1439),
 ('in', 1360),
 ('to', 1025),
 ('is', 527),
 ('it', 509),
 ('that', 486),
 ('as', 473),
 ('with', 460),
 ('this', 414)]
number of removed items: 11


In [73]:
## exclude too frequent words
w1grams = [ [x for x in w1gram if not x in too_frequents ] for w1gram in w1grams ]
raw_df['w1gram'] = w1grams
raw_df['size'] = raw_df['w1gram'].apply(lambda x: len(x))
raw_df

Unnamed: 0,sentence,w1gram,size
0,The Movement and Habits of Climbing Plants by...,"[the, movement, and, habits, of, climbing, pla...",10
1,PREFACE,[preface],1
2,THIS Essay first appeared in the ninth volume ...,"[this, essay, first, appeared, in, the, ninth,...",18
3,"It is here reproduced in a corrected and, I ho...","[it, is, here, reproduced, in, corrected, and,...",14
4,"The illustrations were drawn by my son, George...","[the, illustrations, were, drawn, by, my, son,...",9
...,...,...,...
1853,"Sachs’ ‘Text-Book of Botany’ 1875, pp. 766, 785.","[sachs, textbook, of, botany, 1875, pp, 766, 785]",8
1854,Fritz Müller also has shown in relation to our...,"[fritz, müller, also, has, shown, in, relation...",43
1855,Mr. Herbert Spencer has recently argued (‘Prin...,"[mr, herbert, spencer, has, recently, argued, ...",30
1856,"Annales des Sc. Nat. 4th series, Bot. tom. vi....","[annales, des, sc, nat, 4th, series, bot, tom,...",11


In [74]:
## define df by filtering by length
print(f"originally: {len(raw_df)}")
df = raw_df[ (doc_min_size <= raw_df['size']) & (raw_df['size'] <= doc_max_size) ]
print(f"after filtering: {len(df)}")

originally: 1858
after filtering: 1104


In [75]:
## skippy word 2-grams
#import ngrams_skippy
#skippy_w2grams = [ ngrams_skippy.gen_skippy2grams(x, sep = " ", missing_mark = gap_mark, max_distance = max_gap_val) for x in df['w1gram'] ]
## the inefficient code above was replaced by the following, generalized one 
import gen_ngrams
skippy_w2grams = [ gen_ngrams.gen_skippy_ngrams(x, 2, max_distance = max_gap_val, sep = " ", missing_mark = gap_mark) for x in df['w1gram'] ]
#
if ngram_is_inclusive:
    for i, gram in enumerate(skippy_w2grams):
        gram.extend(list(df['w1gram'])[i])
#
df['skippy_w2gram'] = skippy_w2grams
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['skippy_w2gram'] = skippy_w2grams


Unnamed: 0,sentence,w1gram,size,skippy_w2gram
0,The Movement and Habits of Climbing Plants by...,"[the, movement, and, habits, of, climbing, pla...",10,"[the movement, the … and, the … habits, th..."
2,THIS Essay first appeared in the ninth volume ...,"[this, essay, first, appeared, in, the, ninth,...",18,"[this essay, this … first, this … appeared..."
3,"It is here reproduced in a corrected and, I ho...","[it, is, here, reproduced, in, corrected, and,...",14,"[it is, it … here, it … reproduced, it … ..."
4,"The illustrations were drawn by my son, George...","[the, illustrations, were, drawn, by, my, son,...",9,"[the illustrations, the … were, the … draw..."
7,These memoirs ought to be carefully studied by...,"[these, memoirs, ought, to, be, carefully, stu...",25,"[these memoirs, these … ought, these … to,..."
...,...,...,...,...
1851,"Quoted by Cohn, in his remarkable memoir, “Con...","[quoted, by, cohn, in, his, remarkable, memoir...",18,"[quoted by, quoted … cohn, quoted … in, qu..."
1852,"Such slight spontaneous movements, I now find,...","[such, slight, spontaneous, movements, now, fi...",29,"[such slight, such … spontaneous, such … m..."
1853,"Sachs’ ‘Text-Book of Botany’ 1875, pp. 766, 785.","[sachs, textbook, of, botany, 1875, pp, 766, 785]",8,"[sachs textbook, sachs … of, sachs … botan..."
1855,Mr. Herbert Spencer has recently argued (‘Prin...,"[mr, herbert, spencer, has, recently, argued, ...",30,"[mr herbert, mr … spencer, mr … has, mr …..."


In [76]:
## skippy word 3-grams
#import ngrams_skippy
#skippy_w3grams = [ ngrams_skippy.gen_skippy3grams(x, sep = " ", missing_mark = gap_mark, max_distance = max_gap_val) for x in df['w1gram'] ]
## the inefficient code above was replaced by the following, generalized one
import gen_ngrams
skippy_w3grams = [ gen_ngrams.gen_skippy_ngrams(x, 3, max_distance = max_gap_val, sep = " ", missing_mark = gap_mark) for x in df['w1gram'] ]
#
if ngram_is_inclusive:
    for i, gram in enumerate(skippy_w3grams):
        gram.extend(list(df['skippy_w2gram'])[i])
#
df['skippy_w3gram'] = skippy_w3grams
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['skippy_w3gram'] = skippy_w3grams


Unnamed: 0,sentence,w1gram,size,skippy_w2gram,skippy_w3gram
0,The Movement and Habits of Climbing Plants by...,"[the, movement, and, habits, of, climbing, pla...",10,"[the movement, the … and, the … habits, th...","[the movement and, the movement … habits, th..."
2,THIS Essay first appeared in the ninth volume ...,"[this, essay, first, appeared, in, the, ninth,...",18,"[this essay, this … first, this … appeared...","[this essay first, this essay … appeared, th..."
3,"It is here reproduced in a corrected and, I ho...","[it, is, here, reproduced, in, corrected, and,...",14,"[it is, it … here, it … reproduced, it … ...","[it is here, it is … reproduced, it is … i..."
4,"The illustrations were drawn by my son, George...","[the, illustrations, were, drawn, by, my, son,...",9,"[the illustrations, the … were, the … draw...","[the illustrations were, the illustrations … ..."
7,These memoirs ought to be carefully studied by...,"[these, memoirs, ought, to, be, carefully, stu...",25,"[these memoirs, these … ought, these … to,...","[these memoirs ought, these memoirs … to, th..."
...,...,...,...,...,...
1851,"Quoted by Cohn, in his remarkable memoir, “Con...","[quoted, by, cohn, in, his, remarkable, memoir...",18,"[quoted by, quoted … cohn, quoted … in, qu...","[quoted by cohn, quoted by … in, quoted by ..."
1852,"Such slight spontaneous movements, I now find,...","[such, slight, spontaneous, movements, now, fi...",29,"[such slight, such … spontaneous, such … m...","[such slight spontaneous, such slight … move..."
1853,"Sachs’ ‘Text-Book of Botany’ 1875, pp. 766, 785.","[sachs, textbook, of, botany, 1875, pp, 766, 785]",8,"[sachs textbook, sachs … of, sachs … botan...","[sachs textbook of, sachs textbook … botany,..."
1855,Mr. Herbert Spencer has recently argued (‘Prin...,"[mr, herbert, spencer, has, recently, argued, ...",30,"[mr herbert, mr … spencer, mr … has, mr …...","[mr herbert spencer, mr herbert … has, mr he..."


In [77]:
## skippy word 4-grams
import gen_ngrams
if term_type == 'skippy_w4gram':
    skippy_w4grams = [ gen_ngrams.gen_skippy_ngrams(x, 4, max_distance = max_gap_val, sep = " ", missing_mark = gap_mark) for x in df['w1gram'] ]
    #
    if ngram_is_inclusive:
        for i, gram in enumerate(skippy_w4grams):
            gram.extend(list(df['skippy_w3gram'])[i])
    #
    df['skippy_w4gram'] = skippy_w4grams
    df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['skippy_w4gram'] = skippy_w4grams


Unnamed: 0,sentence,w1gram,size,skippy_w2gram,skippy_w3gram,skippy_w4gram
0,The Movement and Habits of Climbing Plants by...,"[the, movement, and, habits, of, climbing, pla...",10,"[the movement, the … and, the … habits, th...","[the movement and, the movement … habits, th...","[the movement and habits, the movement and … ..."
2,THIS Essay first appeared in the ninth volume ...,"[this, essay, first, appeared, in, the, ninth,...",18,"[this essay, this … first, this … appeared...","[this essay first, this essay … appeared, th...","[this essay first appeared, this essay first ..."
3,"It is here reproduced in a corrected and, I ho...","[it, is, here, reproduced, in, corrected, and,...",14,"[it is, it … here, it … reproduced, it … ...","[it is here, it is … reproduced, it is … i...","[it is here reproduced, it is here … in, it ..."
4,"The illustrations were drawn by my son, George...","[the, illustrations, were, drawn, by, my, son,...",9,"[the illustrations, the … were, the … draw...","[the illustrations were, the illustrations … ...","[the illustrations were drawn, the illustratio..."
7,These memoirs ought to be carefully studied by...,"[these, memoirs, ought, to, be, carefully, stu...",25,"[these memoirs, these … ought, these … to,...","[these memoirs ought, these memoirs … to, th...","[these memoirs ought to, these memoirs ought ..."
...,...,...,...,...,...,...
1851,"Quoted by Cohn, in his remarkable memoir, “Con...","[quoted, by, cohn, in, his, remarkable, memoir...",18,"[quoted by, quoted … cohn, quoted … in, qu...","[quoted by cohn, quoted by … in, quoted by ...","[quoted by cohn in, quoted by cohn … his, qu..."
1852,"Such slight spontaneous movements, I now find,...","[such, slight, spontaneous, movements, now, fi...",29,"[such slight, such … spontaneous, such … m...","[such slight spontaneous, such slight … move...","[such slight spontaneous movements, such sligh..."
1853,"Sachs’ ‘Text-Book of Botany’ 1875, pp. 766, 785.","[sachs, textbook, of, botany, 1875, pp, 766, 785]",8,"[sachs textbook, sachs … of, sachs … botan...","[sachs textbook of, sachs textbook … botany,...","[sachs textbook of botany, sachs textbook of ..."
1855,Mr. Herbert Spencer has recently argued (‘Prin...,"[mr, herbert, spencer, has, recently, argued, ...",30,"[mr herbert, mr … spencer, mr … has, mr …...","[mr herbert spencer, mr herbert … has, mr he...","[mr herbert spencer has, mr herbert spencer …..."


In [78]:
## build doc_dict
doc_dict = { i : x for i, x in enumerate(df['sentence']) }
pp.pprint(random.sample(doc_dict.items(), 5))

[(1053, 'See Dr. H. de Vries (ibid. p. 324) on this subject.'),
 (587,
  'The tendrils, on the other hand, when the internodes and petioles are '
  'secured, describe irregular spires or regular ellipses, exactly like those '
  'made by the internodes.'),
 (585, 'The direction followed is variable, either with or against the sun.'),
 (1042,
  'In the course of some hours it contracts into a spire, dragging up the '
  'stem, and forming an excellent spring.'),
 (98,
  'On August 15th the shoot followed, during a period of 10 hrs. 40 m., a long '
  'and deeply zigzag course and then made a broad ellipse.')]


since Python 3.9 and will be removed in a subsequent version.
  pp.pprint(random.sample(doc_dict.items(), 5))


In [79]:
## select bots for DTM
print(f"term_type: {term_type}")
bots = df[term_type]
bots = [ bot for bot in bots if len(bot) > min_bot_size ]
random.sample(bots, 3)

term_type: skippy_w4gram


[['this surprising difference in',
  'this surprising difference  …  the',
  'this surprising difference  …  leaves',
  'this surprising difference  …  have',
  'this surprising difference  …  also',
  'this surprising difference  …  observed',
  'this surprising difference  …  in',
  'this surprising difference  …  plant',
  'this surprising difference  …  of',
  'this surprising difference  …  marcgravia',
  'this surprising difference  …  dubia',
  'this surprising difference  …  in',
  'this surprising  …  in the',
  'this surprising  …  in  …  leaves',
  'this surprising  …  in  …  have',
  'this surprising  …  in  …  also',
  'this surprising  …  in  …  observed',
  'this surprising  …  in  …  in',
  'this surprising  …  in  …  plant',
  'this surprising  …  in  …  of',
  'this surprising  …  in  …  marcgravia',
  'this surprising  …  in  …  dubia',
  'this surprising  …  in  …  in',
  'this surprising  …  the leaves',
  'this surprising  …  the  …  have',
  'this surprising  …  

In [80]:
## build diction, corpus = dtm
from gensim.corpora import Dictionary

## dtm
diction = Dictionary(bots)
print(diction)
diction.filter_extremes(no_below = term_minfreq, no_above = abuse_threshold)
print(diction)

## corpus
corpus = [ diction.doc2bow(bot) for bot in bots ]

Dictionary<4010444 unique tokens: ['and', 'and  …  by', 'and  …  by  …  darwin', 'and  …  by charles', 'and  …  by charles darwin']...>
Dictionary<31409 unique tokens: ['and  …  by', 'and  …  climbing', 'and  …  climbing plants', 'and  …  of  …  by', 'and  …  of  …  plants']...>


Run HDP

In [81]:
## HDP (max_n_topics = 150)
import numpy as np
import gensim.models
import pyLDAvis.gensim

hdp_full = gensim.models.HdpModel(corpus, diction, random_state = 1)
vis_data = pyLDAvis.gensim.prepare(hdp_full, corpus, diction)
pyLDAvis.display(vis_data)

In [82]:
## HDP (max_n_topics = 90)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 90
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [83]:
## save LDAvis output as a html file
save_LDAvis = True
if save_LDAvis:
	output = f"results/LDAvis/Darwin-HDP-max_ntop{max_n_topics}-{term_type}.html"
	pyLDAvis.save_html(vis_data, output)

In [84]:
## HDP (max_n_topics = 45)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 45
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [85]:
## HDP (max_n_topics = 15)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 15
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)