In [78]:
#%pip install -U pandas

In [79]:
#%pip install -U pyLDAvis

In [80]:
## imports
import os, sys
import pprint as pp

In [81]:
## 一つ上の階層のファイルを見るように設定
sys.path.append(os.path.join(os.path.dirname("__file__"), '..'))

In [82]:
## variables
random_target = False

## HDP: The following parameters need to be relatively large for HDP, unlike LDA
min_bot_size    = 3
term_minfreq    = 3
abuse_threshold = 0.03

## doc
doc_max_size = 30 # max count of words in a sentence
doc_min_size = 5  # min count of words in a sentence

## term
ngram_is_inclusive = True
term_types   = [ 'w1gram', 'w2gram', 'w3gram', 'skippy_w2gram', 'skippy_w3gram' ]
term_type    = term_types[-1]
max_gap_val  = round(doc_max_size * 0.5)
gap_mark     = " … "

In [83]:
## Get target files
import glob
data_dir = "data/Darwin-texts/single-lined/"
target_files = glob.glob(f"{data_dir}/*")
target_files = [ file for file in target_files if ".txt" in file ]
pp.pprint(target_files)

['data/Darwin-texts/single-lined/sl-2485-body.utf-8.txt',
 'data/Darwin-texts/single-lined/sl-1227-body.utf-8.txt']


In [84]:
## Read data from files

import random
import pandas as pd

if random_target:
    file = random.choice(target_files)
else:
    file = target_files[0]
print(f"processing: {file}")

if file.endswith(".csv"):
    with open(file, "rt") as f:
        raw_df = pd.read_csv(f, encoding = 'utf8', header = None, names = ['sentence'])
elif file.endswith(".txt"):
    with open(file, "rt") as f:
        raw_df = pd.read_table(f, encoding = 'utf8', header = None, names = ['sentence'])
elif file.endswith(".xlsx"):
    with open(file, "rb") as f:
        raw_df = pd.read_excel(f, index_col = 0)
# 
raw_df.sample(10)

processing: data/Darwin-texts/single-lined/sl-2485-body.utf-8.txt


Unnamed: 0,sentence
278,"When, however, Dutrochet cut off two whole sho..."
930,An open space is thus left for the next succee...
580,]
1196,After a tendril has spontaneously revolved for...
1321,"These tendrils are, however, but slightly sens..."
1660,But he who believes in the slow modification o...
225,"On the other hand, some plants take 24 hrs. fo..."
190,"Hibbertia dentata (Dilleniaceæ), placed in the..."
1426,The spiral contraction which ensues after a te...
871,A tendril in this state supported nearly seven...


In [85]:
## build w1gram
import re
w1grams = raw_df['sentence'].apply(lambda x: re.split(r"\s+", x))

## convert to lowercase
w1grams = [ [ x.lower() for x in w1gram ] for w1gram in w1grams ]

## remove ineffective characters
removed_chars = r"[-.,:;!?()_\"\'“”‘’]"
w1grams = [ [ re.sub(removed_chars, "", x) for x in w1gram ] for w1gram in w1grams ]

## exclude single-character words
w1grams = [ [x for x in w1gram if len(x) > 1 ] for w1gram in w1grams ]

In [86]:
## remove too frequent words
from collections import Counter
all_words = [ ]
[ all_words.extend(x) for x in w1grams ] 
word_counts = Counter(all_words)
reduct_rate = 0.003 # needs to be optimized text-wise
too_frequents = word_counts.most_common(round(len(word_counts) * reduct_rate))
pp.pprint(too_frequents)
len(too_frequents)

[('the', 4019),
 ('of', 1941),
 ('and', 1439),
 ('in', 1360),
 ('to', 1025),
 ('is', 527),
 ('it', 509),
 ('that', 486),
 ('as', 473),
 ('with', 460),
 ('this', 414)]


11

In [87]:
## exclude too frequent words
w1grams = [ [x for x in w1gram if not x in too_frequents ] for w1gram in w1grams ]
raw_df['w1gram'] = w1grams
raw_df['size'] = raw_df['w1gram'].apply(lambda x: len(x))
raw_df

Unnamed: 0,sentence,w1gram,size
0,The Movement and Habits of Climbing Plants by...,"[the, movement, and, habits, of, climbing, pla...",10
1,PREFACE,[preface],1
2,THIS Essay first appeared in the ninth volume ...,"[this, essay, first, appeared, in, the, ninth,...",18
3,"It is here reproduced in a corrected and, I ho...","[it, is, here, reproduced, in, corrected, and,...",14
4,"The illustrations were drawn by my son, George...","[the, illustrations, were, drawn, by, my, son,...",9
...,...,...,...
1853,"Sachs’ ‘Text-Book of Botany’ 1875, pp. 766, 785.","[sachs, textbook, of, botany, 1875, pp, 766, 785]",8
1854,Fritz Müller also has shown in relation to our...,"[fritz, müller, also, has, shown, in, relation...",43
1855,Mr. Herbert Spencer has recently argued (‘Prin...,"[mr, herbert, spencer, has, recently, argued, ...",30
1856,"Annales des Sc. Nat. 4th series, Bot. tom. vi....","[annales, des, sc, nat, 4th, series, bot, tom,...",11


In [88]:
## define df by filtering by length
print(len(raw_df))
df = raw_df[ (doc_min_size <= raw_df['size']) & (raw_df['size'] <= doc_max_size) ]
print(len(df))

1858
1104


In [89]:
## skippy word 2-grams
import ngrams_skippy
skippy_w2grams = [ ngrams_skippy.gen_skippy2grams(x, sep = " ", missing_mark = gap_mark, max_distance = max_gap_val)
                  for x in df['w1gram'] ]

if ngram_is_inclusive:
    for i, gram in enumerate(skippy_w2grams):
        gram.extend(list(df['w1gram'])[i])

df['skippy_w2gram'] = skippy_w2grams
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['skippy_w2gram'] = skippy_w2grams


Unnamed: 0,sentence,w1gram,size,skippy_w2gram
0,The Movement and Habits of Climbing Plants by...,"[the, movement, and, habits, of, climbing, pla...",10,"[the movement, the … and, the … habits, the … ..."
2,THIS Essay first appeared in the ninth volume ...,"[this, essay, first, appeared, in, the, ninth,...",18,"[this essay, this … first, this … appeared, th..."
3,"It is here reproduced in a corrected and, I ho...","[it, is, here, reproduced, in, corrected, and,...",14,"[it is, it … here, it … reproduced, it … in, i..."
4,"The illustrations were drawn by my son, George...","[the, illustrations, were, drawn, by, my, son,...",9,"[the illustrations, the … were, the … drawn, t..."
7,These memoirs ought to be carefully studied by...,"[these, memoirs, ought, to, be, carefully, stu...",25,"[these memoirs, these … ought, these … to, the..."
...,...,...,...,...
1851,"Quoted by Cohn, in his remarkable memoir, “Con...","[quoted, by, cohn, in, his, remarkable, memoir...",18,"[quoted by, quoted … cohn, quoted … in, quoted..."
1852,"Such slight spontaneous movements, I now find,...","[such, slight, spontaneous, movements, now, fi...",29,"[such slight, such … spontaneous, such … movem..."
1853,"Sachs’ ‘Text-Book of Botany’ 1875, pp. 766, 785.","[sachs, textbook, of, botany, 1875, pp, 766, 785]",8,"[sachs textbook, sachs … of, sachs … botany, s..."
1855,Mr. Herbert Spencer has recently argued (‘Prin...,"[mr, herbert, spencer, has, recently, argued, ...",30,"[mr herbert, mr … spencer, mr … has, mr … rece..."


In [90]:
## skippy word 3-grams
import ngrams_skippy
skippy_w3grams = [ ngrams_skippy.gen_skippy3grams(x, sep = " ", missing_mark = gap_mark, max_distance = max_gap_val)
                  for x in df['w1gram'] ]

if ngram_is_inclusive:
    for i, gram in enumerate(skippy_w3grams):
        gram.extend(list(df['skippy_w2gram'])[i])

df['skippy_w3gram'] = skippy_w3grams
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['skippy_w3gram'] = skippy_w3grams


Unnamed: 0,sentence,w1gram,size,skippy_w2gram,skippy_w3gram
0,The Movement and Habits of Climbing Plants by...,"[the, movement, and, habits, of, climbing, pla...",10,"[the movement, the … and, the … habits, the … ...","[the movement and, the movement … habits, the ..."
2,THIS Essay first appeared in the ninth volume ...,"[this, essay, first, appeared, in, the, ninth,...",18,"[this essay, this … first, this … appeared, th...","[this essay first, this essay … appeared, this..."
3,"It is here reproduced in a corrected and, I ho...","[it, is, here, reproduced, in, corrected, and,...",14,"[it is, it … here, it … reproduced, it … in, i...","[it is here, it is … reproduced, it is … in, i..."
4,"The illustrations were drawn by my son, George...","[the, illustrations, were, drawn, by, my, son,...",9,"[the illustrations, the … were, the … drawn, t...","[the illustrations were, the illustrations … d..."
7,These memoirs ought to be carefully studied by...,"[these, memoirs, ought, to, be, carefully, stu...",25,"[these memoirs, these … ought, these … to, the...","[these memoirs ought, these memoirs … to, thes..."
...,...,...,...,...,...
1851,"Quoted by Cohn, in his remarkable memoir, “Con...","[quoted, by, cohn, in, his, remarkable, memoir...",18,"[quoted by, quoted … cohn, quoted … in, quoted...","[quoted by cohn, quoted by … in, quoted by … h..."
1852,"Such slight spontaneous movements, I now find,...","[such, slight, spontaneous, movements, now, fi...",29,"[such slight, such … spontaneous, such … movem...","[such slight spontaneous, such slight … moveme..."
1853,"Sachs’ ‘Text-Book of Botany’ 1875, pp. 766, 785.","[sachs, textbook, of, botany, 1875, pp, 766, 785]",8,"[sachs textbook, sachs … of, sachs … botany, s...","[sachs textbook of, sachs textbook … botany, s..."
1855,Mr. Herbert Spencer has recently argued (‘Prin...,"[mr, herbert, spencer, has, recently, argued, ...",30,"[mr herbert, mr … spencer, mr … has, mr … rece...","[mr herbert spencer, mr herbert … has, mr herb..."


In [91]:
## build doc_dict
doc_dict = { i : x for i, x in enumerate(df['sentence']) }
pp.pprint(random.sample(doc_dict.items(), 5))

[(903,
  'With most tendrils the lower or basal part is either not at all sensitive, '
  'or sensitive only to prolonged contact.'),
 (325, 'The shoots, however, sometimes stood still.'),
 (45,
  'The stems, on the other hand, which had ascended ordinary rough sticks were '
  'all more or less and generally much twisted.'),
 (240, 'They never become developed into leaves.'),
 (957,
  'Has, for instance, any tendril-bearing plant assumed its present structure '
  'without having previously existed as a leaf-climber or a twiner?')]


since Python 3.9 and will be removed in a subsequent version.
  pp.pprint(random.sample(doc_dict.items(), 5))


In [92]:
## select bots for DTM
print(f"term_type: {term_type}")
bots = df[term_type]
bots = [ bot for bot in bots if len(bot) > min_bot_size ]
random.sample(bots, 3)

term_type: skippy_w3gram


[['this plant presents',
  'this plant … case',
  'this plant … not',
  'this plant … observed',
  'this plant … by',
  'this plant … me',
  'this plant … in',
  'this plant … any',
  'this plant … other',
  'this plant … leafclimber',
  'this plant … or',
  'this plant … twiner',
  'this plant … namely',
  'this plant … that',
  'this … presents case',
  'this … presents … not',
  'this … presents … observed',
  'this … presents … by',
  'this … presents … me',
  'this … presents … in',
  'this … presents … any',
  'this … presents … other',
  'this … presents … leafclimber',
  'this … presents … or',
  'this … presents … twiner',
  'this … presents … namely',
  'this … presents … that',
  'this … case not',
  'this … case … observed',
  'this … case … by',
  'this … case … me',
  'this … case … in',
  'this … case … any',
  'this … case … other',
  'this … case … leafclimber',
  'this … case … or',
  'this … case … twiner',
  'this … case … namely',
  'this … case … that',
  'this … 

In [93]:
## build diction, corpus = dtm
from gensim.corpora import Dictionary

## dtm
diction = Dictionary(bots)
print(diction)
diction.filter_extremes(no_below = term_minfreq, no_above = abuse_threshold)
print(diction)

## corpus
corpus = [ diction.doc2bow(bot) for bot in bots ]

Dictionary(963540 unique tokens: ['and', 'and habits', 'and habits of', 'and habits … by', 'and habits … charles']...)
Dictionary(25613 unique tokens: ['and … climbing', 'and … climbing plants', 'and … of climbing', 'and … of … by', 'and … of … plants']...)


Run HDP

In [94]:
## HDP (max_n_topics = 150)
import numpy as np
import gensim.models
import pyLDAvis.gensim

hdp_full = gensim.models.HdpModel(corpus, diction, random_state = 1)
vis_data = pyLDAvis.gensim.prepare(hdp_full, corpus, diction)
pyLDAvis.display(vis_data)

In [95]:
## HDP (max_n_topics = 90)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 90
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [96]:
## HDP (max_n_topics = 45)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 45
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)

In [97]:
## HDP (max_n_topics = 15)
import gensim.models
import pyLDAvis.gensim
max_n_topics = 15
hdp = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data = pyLDAvis.gensim.prepare(hdp, corpus, diction)
pyLDAvis.display(vis_data)