# Set up

In [18]:
#!conda update conda -y
#!conda install Cython -y

: 

In [19]:
#%pip install -U pandas
#%pip install -U pyLDAvis

In [20]:
## imports
import os, sys
import pprint as pp

In [21]:
## make modules located one directory above importable
## (dirname of the literal string "__file__" is "", so the entry appended is "..")
parent_dir = os.path.join(os.path.dirname("__file__"), '..')
sys.path.append(parent_dir)

In [22]:
## Build the Cython module (only when needed)
#!python clean setup.py build_ext --inplace

In [None]:
## Whether to use Cython: this flag is the default of add_ngram_to_df()'s use_Cython
## argument, which selects cy_gen_ngrams (True) or gen_ngrams (False)
use_Cython = True
if use_Cython:
    # IPython line magic: load the Cython extension into this kernel
    %load_ext Cython

In [None]:
## document size limits (counted in words per sentence)
doc_max_size = 30 # upper bound
doc_min_size = 5  # lower bound

## available term types; the last one (w_skippy4gram) often fails
term_types = [ f"w_{n}gram" for n in range(1, 5) ] + [ f"w_skippy{n}gram" for n in range(2, 5) ]
term_type  = term_types[-1]

## n-gram options
ngram_is_inclusive  = True
ngram_inclusiveness = 1 # When the value is k, n_gram contains (n-k)-grams

## skippy n-gram options
gap_mark      = "…"
max_gap_ratio = 0.23
max_gap_val   = round(doc_max_size * max_gap_ratio)

## report the effective settings
print(f"term_type: {term_type}")
print(f"max_gap_val for skippy n-gram: {max_gap_val}")

In [25]:
## HDP settings: unlike LDA, HDP needs relatively large values for these
bot_min_size    = doc_min_size  # minimum number of terms in a bag of terms
term_minfreq    = 3             # passed to filter_extremes() as no_below
abuse_threshold = 0.1           # passed to filter_extremes() as no_above
n_docs_to_show  = 10            # default number of documents listed per topic
n_terms_to_show = 15            # default number of terms listed per topic

## switches for the optional HDP runs with larger topic limits
explore_90_topics = explore_120_topics = explore_150_topics = False

# Get data from files

In [None]:
## variables
random_target = False # True: pick one target file at random; False: use the first one

## Get target files
import glob
data_dir = "data/Darwin-texts/single-lined/"
## glob.glob() returns paths in arbitrary (OS-dependent) order, so sort them to make
## target_files[0] the same file on every run; keep only paths that END with ".txt"
## (a substring test would also accept names such as "x.txt.bak")
target_files = sorted(file for file in glob.glob(f"{data_dir}/*") if file.endswith(".txt"))
pp.pprint(target_files)

In [None]:
## Read data from files
import random
import pandas as pd

## choose the file to process
if random_target:
    file = random.choice(target_files)
else:
    file = target_files[0]
print(f"processing: {file}")
## Hand the path itself to pandas so that pandas does the decoding with the requested
## encoding (a text handle opened without an encoding is already decoded with the
## locale's default before pandas sees the data) and no file handle has to be managed.
if file.endswith(".csv"):
    raw_df = pd.read_csv(file, encoding = 'utf8', header = None, names = ['sentence'])
elif file.endswith(".txt"):
    raw_df = pd.read_table(file, encoding = 'utf8', header = None, names = ['sentence'])
elif file.endswith(".xlsx"):
    raw_df = pd.read_excel(file, index_col = 0)
else:
    ## fail here rather than with a NameError on raw_df further down
    raise ValueError(f"unsupported file type: {file}")
## show up to 10 random rows (sample(10) raises when there are fewer than 10 rows)
raw_df.sample(min(10, len(raw_df)))

In [None]:
## build w1gram
import re

## characters that are stripped from every token
removed_chars = r"[-.,:;!?()_\"\'“”‘’]"

## split each sentence on whitespace, convert to lowercase, remove ineffective characters
w_1grams = [ [ re.sub(removed_chars, "", token.lower()) for token in re.split(r"\s+", sentence) ]
             for sentence in raw_df['sentence'] ]

## drop empty tokens: leading/trailing whitespace makes re.split() return "" and
## punctuation-only tokens become "" after the removal above; keeping them would
## inflate sentence sizes and put empty segments into the n-grams
w_1grams = [ [ token for token in tokens if token ] for tokens in w_1grams ]

## exclude single-character words
#w_1grams = [ [ x for x in w_1gram if len(x) > 1 ] for w_1gram in w_1grams ]
[ words[:5] for words in random.sample(w_1grams, 3) ]

In [None]:
## find too frequent words
from collections import Counter
## flatten the tokenized sentences into a single list of words
all_words = [ word for words in w_1grams for word in words ]
word_counts = Counter(all_words)
reduct_rate = 0.003 # needs to be optimized text-wise
## the top reduct_rate share of the distinct words is treated as too frequent;
## most_common() returns (word, count) pairs
n_too_frequents = round(len(word_counts) * reduct_rate)
too_frequents = word_counts.most_common(n_too_frequents)
pp.pprint(too_frequents)
print(f"number of removed items: {len(too_frequents)}")

In [None]:
## exclude too frequent words
## too_frequents holds (word, count) pairs as returned by Counter.most_common(), so a
## plain `word in too_frequents` never matches; extract the words first (a set also
## makes each membership test constant-time). Bare strings are accepted as well.
too_frequent_words = { item[0] if isinstance(item, tuple) else item for item in too_frequents }
w_1grams = [ [ x for x in w1gram if x not in too_frequent_words ] for w1gram in w_1grams ]
raw_df['w_1gram'] = w_1grams
## size = number of remaining words in the sentence
raw_df['size'] = raw_df['w_1gram'].apply(len)
raw_df

In [None]:
## define df by filtering by length
print(f"originally: {len(raw_df)}")
## .copy(): n-gram columns are assigned to df later on; without an explicit copy df is a
## selection of raw_df and those assignments trigger pandas' SettingWithCopyWarning
df = raw_df[ (doc_min_size <= raw_df['size']) & (raw_df['size'] <= doc_max_size) ].copy()
print(f"after filtering: {len(df)}")

In [None]:
## show the length-filtered data frame
df

# Build n-grams

In [39]:
## generic function for n-gram generation: for words, seg_joint = " "
## When True, add_ngram_to_df() applies importlib.reload() to cy_gen_ngrams on every call
reload_module_on_run = False
if reload_module_on_run:
    import importlib
def add_ngram_to_df(dfx, n_for_ngram: int, seg_joint: str = " ", var_prefix: str = "", ngram_is_skippy: bool = False, ngram_is_inclusive: bool = ngram_is_inclusive, ngram_inclusiveness: int = ngram_inclusiveness, use_Cython: bool = use_Cython, check: bool = False):
    """
    Add an n-gram column to dfx, built from its "{var_prefix}1gram" column.

    The new column is named "{var_prefix}{n}gram", or "{var_prefix}skippy{n}gram" when
    ngram_is_skippy is True. dfx is modified in place and nothing is returned; progress
    information and the resulting column are printed.

    Parameters:
        dfx: data frame holding the "{var_prefix}1gram" column (and, for inclusive
            n-grams, the (n-1)-gram column that is merged in).
        n_for_ngram: the n of the n-grams to generate.
        seg_joint: string used to join the segments of one n-gram (" " for words).
        var_prefix: prefix of the column names, e.g. "w_".
        ngram_is_skippy: generate skippy n-grams instead of regular ones.
        ngram_is_inclusive: also add terms from the (n-1)-gram column to each row.
        ngram_inclusiveness: subtracted from n_for_ngram to get inclusion_size, the
            minimum len() a term of the (n-1)-gram column needs in order to be added.
            NOTE(review): len() of a joined term is its character length, not its
            number of segments -- confirm this is the intended measure.
        use_Cython: use cy_gen_ngrams (True) or gen_ngrams (False).
        check: NOTE(review): accepted but not forwarded; the generators are always
            called with check = False.

    Raises:
        AssertionError: if n_for_ngram - ngram_inclusiveness is negative, or if
            ngram_is_inclusive is set and ngram_inclusiveness is not positive.
    """
    print(f"use_Cython: {use_Cython}")
    inclusion_size = (n_for_ngram - ngram_inclusiveness)
    print(f"inclusion_size: {inclusion_size}")
    assert inclusion_size >= 0
    source_var = f"{var_prefix}1gram"
    print(f"source_var: {source_var}")
    ## read the unigrams from the frame that was passed in, not from the global df
    unigrams = dfx[source_var]
    if use_Cython:
        import cy_gen_ngrams
        if reload_module_on_run:
            ## imported here so that reloading works whenever the flag is set
            import importlib
            importlib.reload(cy_gen_ngrams)
        ## the Cython generators yield segment sequences, which are joined here
        if ngram_is_skippy:
            ngrams = [ [seg_joint.join(x) for x in cy_gen_ngrams.cy_gen_skippy_ngrams(x, n = n_for_ngram, check = False)] for x in unigrams ]
        else:
            ngrams = [ [seg_joint.join(x) for x in cy_gen_ngrams.cy_gen_ngrams(x, n = n_for_ngram, check = False)] for x in unigrams ]
    else:
        import gen_ngrams
        ## the pure-Python generators take the separator themselves
        if ngram_is_skippy:
            ngrams = [ gen_ngrams.gen_skippy_ngrams(x, n = n_for_ngram, sep = seg_joint, check = False) for x in unigrams ]
        else:
            ngrams = [ gen_ngrams.gen_ngrams(x, n = n_for_ngram, sep = seg_joint, check = False) for x in unigrams ]
    ## make the n-grams inclusive: merge in terms of the (n-1)-gram column
    if ngram_is_inclusive:
        assert (n_for_ngram - inclusion_size) > 0
        if ngram_is_skippy and n_for_ngram > 2:
            supplement_var = f"{var_prefix}skippy{n_for_ngram - 1}gram"
        else:
            supplement_var = f"{var_prefix}{n_for_ngram - 1}gram"
        print(f"supplement_var: {supplement_var}")
        ## convert the column once instead of once per row
        supplements = list(dfx[supplement_var])
        for g, supplement in zip(ngrams, supplements):
            g.extend(x for x in supplement if len(x) >= inclusion_size)
    ## add the new column
    if ngram_is_skippy:
        added_var = f"{var_prefix}skippy{n_for_ngram}gram"
    else:
        added_var = f"{var_prefix}{n_for_ngram}gram"
    print(f"added_var: {added_var}")
    dfx[added_var] = ngrams
    ## check result
    print(dfx[added_var])

In [None]:
## word 2grams: adds column w_2gram to df
add_ngram_to_df(
    df,
    n_for_ngram = 2,
    var_prefix = "w_",
    seg_joint = " ",
    ngram_is_skippy = False,
    ngram_is_inclusive = True,
    check = False,
)

In [None]:
## word 3grams: adds column w_3gram to df
add_ngram_to_df(
    df,
    n_for_ngram = 3,
    var_prefix = "w_",
    seg_joint = " ",
    ngram_is_skippy = False,
    ngram_is_inclusive = True,
    check = False,
)

In [None]:
## word 4grams: adds column w_4gram to df
add_ngram_to_df(
    df,
    n_for_ngram = 4,
    var_prefix = "w_",
    seg_joint = " ",
    ngram_is_skippy = False,
    ngram_is_inclusive = True,
    check = False,
)

In [None]:
## word skippy 2grams: adds column w_skippy2gram to df
add_ngram_to_df(
    df,
    n_for_ngram = 2,
    var_prefix = "w_",
    seg_joint = " ", # For words, seg_joint = " "
    ngram_is_skippy = True,
    ngram_is_inclusive = True,
    check = False,
)

In [None]:
## word skippy 3grams: adds column w_skippy3gram to df
add_ngram_to_df(
    df,
    n_for_ngram = 3,
    var_prefix = "w_",
    seg_joint = " ", # For words, seg_joint = " "
    ngram_is_skippy = True,
    ngram_is_inclusive = True,
    check = False,
)

In [None]:
## word skippy 4grams: generated only when they are the selected term type
if term_type == "w_skippy4gram":
    add_ngram_to_df(
        df,
        n_for_ngram = 4,
        var_prefix = "w_",
        seg_joint = " ", # For words, seg_joint = " "
        ngram_is_skippy = True,
        ngram_is_inclusive = True,
        check = False,
    )

# DTM

In [46]:
#df['sentence']

In [None]:
## build doc_dict: running row number of df -> sentence
doc_dict = dict(enumerate(df['sentence']))
sampled_docs = random.sample(list(doc_dict.items()), 5) # list(...) is needed after 3.10
pp.pprint(sampled_docs)

In [None]:
## select bots for DTM: the term lists of the chosen term type that are long enough
print(f"term_type: {term_type}")
bots = [ bot for bot in df[term_type] if len(bot) >= bot_min_size ]
random.sample(bots, 2)

In [None]:
## build diction, corpus = dtm
from gensim.corpora import Dictionary

## dictionary over all bots
diction = Dictionary(bots)
print(diction)

## drop terms found in fewer than term_minfreq documents or in more than
## the abuse_threshold fraction of the documents
diction.filter_extremes(no_below = term_minfreq, no_above = abuse_threshold)
print(diction)

## corpus building: allow_update prevents "Not all rows ..." error but it takes considerably longer.
## Sanitization with nonzero filtering is more effective.
#corpus = [ diction.doc2bow(bot, allow_update = True) for bot in bots ]
long_enough_bots = [ bot for bot in bots if len(bot) >= doc_min_size ]
corpus = [ diction.doc2bow(bot) for bot in long_enough_bots ]

In [None]:
## sanitize corpus: Crucial for HDP
## investigate_topics() looks sentences up in doc_dict by their position in corpus, so
## corpus and doc_dict must be filtered together. Rebuild (sentence, bag-of-words) pairs
## from df with the same length filters that were used for bots and corpus, drop the pairs
## whose bag-of-words is empty, and re-key doc_dict by the position in the sanitized corpus.
## Everything is recomputed from df, so re-running this cell gives the same result.
min_bot_size = max(bot_min_size, doc_min_size)
paired_docs = [ (sentence, diction.doc2bow(bot))
                for sentence, bot in zip(df['sentence'], df[term_type])
                if len(bot) >= min_bot_size ]
original_size = len(paired_docs)
paired_docs = [ (sentence, doc) for sentence, doc in paired_docs if len(doc) > 0 ]
corpus = [ doc for _, doc in paired_docs ]
doc_dict = { i : sentence for i, (sentence, _) in enumerate(paired_docs) }
print(f"discarded {original_size - len(corpus)} docs")

# Run HDP

In [51]:
## utility function for topic investigation
def investigate_topics(target_hdp, n_docs_to_show: int = n_docs_to_show,
                       n_terms_to_show: int = n_terms_to_show, precision: float = 0.4):
    """
    Print, for every topic of target_hdp, its top terms and its top documents.

    Reads the module-level `corpus` (bag-of-words documents) and `doc_dict`
    (document id -> sentence). Nothing is returned.

    Parameters:
        target_hdp: model that is indexed with a document, yielding (topic_id, prob)
            pairs, and that provides m_T, optimal_ordering() and print_topic().
        n_docs_to_show: number of documents listed per topic.
        n_terms_to_show: number of terms listed per topic.
        precision: put in front of "f" in the format spec of the printed values,
            so 0.4 formats them as "0.4f".

    NOTE(review): optimal_ordering() is called after the per-document values were
    collected; confirm that the topic ids used for collecting and for print_topic()
    refer to the same topics.
    """
    import numpy as np
    import HDP_helper

    ## doc_topic_probs[t][d] = value reported for topic t on document d (0 when not reported)
    n_topics = target_hdp.m_T
    doc_topic_probs = np.zeros([n_topics, len(corpus)])
    for d, bow in enumerate(corpus):
        for t, p in target_hdp[bow]:
            doc_topic_probs[t][d] = p

    ## report topic by topic
    target_hdp.optimal_ordering()
    separator = "=============="
    for topic_id in range(len(doc_topic_probs)):
        probs = doc_topic_probs[topic_id]
        print(separator)
        topic_t = target_hdp.print_topic(topic_id, topn = n_terms_to_show)
        print(f"topic_id {topic_id}: {HDP_helper.reformat_topic (topic_t, n_terms_to_show)}")
        n_nonzero = len(probs.nonzero()[0])
        print("nonzero count: ", n_nonzero)
        ## documents with the highest values come first
        top_doc_ids = probs.argsort()[::-1][:n_docs_to_show]
        for doc_id in top_doc_ids:
            doc = doc_dict[doc_id]
            print(f"\t{probs[doc_id]:{precision}f}: {doc}")

In [None]:
## HDP (max_n_topics = 15)
import gensim.models
import pyLDAvis
## the gensim bridge is named pyLDAvis.gensim_models since pyLDAvis 3.3
## and pyLDAvis.gensim in older releases: accept either
try:
    import pyLDAvis.gensim_models as pyLDAvis_gensim
except ImportError:
    import pyLDAvis.gensim as pyLDAvis_gensim

max_n_topics = 15
hdp15 = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data15 = pyLDAvis_gensim.prepare(hdp15, corpus, diction)
pyLDAvis.display(vis_data15)

In [None]:
## topic investigation for the 15-topic model
investigate_topics(target_hdp = hdp15)

In [None]:
## HDP (max_n_topics = 45)
import gensim.models
import pyLDAvis
## the gensim bridge is named pyLDAvis.gensim_models since pyLDAvis 3.3
## and pyLDAvis.gensim in older releases: accept either
try:
    import pyLDAvis.gensim_models as pyLDAvis_gensim
except ImportError:
    import pyLDAvis.gensim as pyLDAvis_gensim

max_n_topics = 45
hdp45 = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
vis_data45 = pyLDAvis_gensim.prepare(hdp45, corpus, diction)
pyLDAvis.display(vis_data45)

In [None]:
## topic investigation for the 45-topic model
investigate_topics(target_hdp = hdp45)

In [56]:
## HDP (max_n_topics = 90)
if explore_90_topics:
    import gensim.models
    import pyLDAvis
    from IPython.display import display
    ## the gensim bridge is named pyLDAvis.gensim_models since pyLDAvis 3.3
    ## and pyLDAvis.gensim in older releases: accept either
    try:
        import pyLDAvis.gensim_models as pyLDAvis_gensim
    except ImportError:
        import pyLDAvis.gensim as pyLDAvis_gensim

    max_n_topics = 90
    hdp90 = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
    vis_data90 = pyLDAvis_gensim.prepare(hdp90, corpus, diction)
    ## a value computed inside an `if` block is not auto-displayed: show it explicitly
    display(pyLDAvis.display(vis_data90))

In [57]:
## topic investigation for the 90-topic model (only when it was built)
if explore_90_topics:
    investigate_topics(target_hdp = hdp90)

In [58]:
## HDP (max_n_topics = 120)
if explore_120_topics:
    import gensim.models
    import pyLDAvis
    from IPython.display import display
    ## the gensim bridge is named pyLDAvis.gensim_models since pyLDAvis 3.3
    ## and pyLDAvis.gensim in older releases: accept either
    try:
        import pyLDAvis.gensim_models as pyLDAvis_gensim
    except ImportError:
        import pyLDAvis.gensim as pyLDAvis_gensim

    max_n_topics = 120
    hdp120 = gensim.models.HdpModel(corpus, diction, random_state = 1, T = max_n_topics)
    vis_data120 = pyLDAvis_gensim.prepare(hdp120, corpus, diction)
    ## a value computed inside an `if` block is not auto-displayed: show it explicitly
    display(pyLDAvis.display(vis_data120))

In [59]:
## topic investigation for the 120-topic model (only when it was built)
if explore_120_topics:
    investigate_topics(target_hdp = hdp120)

In [60]:
## HDP (max_n_topics = 150)
if explore_150_topics:
    import numpy as np
    import gensim.models
    import pyLDAvis
    from IPython.display import display
    ## the gensim bridge is named pyLDAvis.gensim_models since pyLDAvis 3.3
    ## and pyLDAvis.gensim in older releases: accept either
    try:
        import pyLDAvis.gensim_models as pyLDAvis_gensim
    except ImportError:
        import pyLDAvis.gensim as pyLDAvis_gensim

    max_n_topics = 150
    hdp150 = gensim.models.HdpModel(corpus, diction, T = max_n_topics, random_state = 1)
    vis_data150 = pyLDAvis_gensim.prepare(hdp150, corpus, diction)
    ## a value computed inside an `if` block is not auto-displayed: show it explicitly
    display(pyLDAvis.display(vis_data150))

In [61]:
## topic investigation for the 150-topic model (only when it was built)
if explore_150_topics:
    investigate_topics(target_hdp = hdp150)

In [None]:
## save LDAvis output as a html file
import os
save_LDAvis = True
ntops = [ 15, 45, 90, 120, 150 ]
vis_targets = [ f"vis_data{ntop}" for ntop in ntops ]
print(f"vis_targets: {vis_targets}")
## index into ntops / vis_targets of the visualization to save
vis_target_index = 0
vis_target_data = vis_targets[vis_target_index]
vis_target_ntop = ntops[vis_target_index]
print(f"vis_target_data: {vis_target_data}")
if save_LDAvis:
    ## name the file after the topic limit of the data that is actually saved;
    ## max_n_topics holds whatever the most recently run HDP cell assigned
    output = f"results/LDAvis/Darwin-HDP-max_ntop{vis_target_ntop}-{term_type}.html"
    ## make sure the output directory exists before writing
    os.makedirs(os.path.dirname(output), exist_ok = True)
    ## look the variable up by name instead of eval()-ing the string
    pyLDAvis.save_html(globals()[vis_target_data], output)