# LDA Pipeline for Contracts

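Generates a topic-by-topic graph (K=16) comparing the mean of obligation_constraint with the mean of permission_entitlement for employers, and then a second set of graphs making the same comparison for workers. (Note: the pipeline in Part 0 is currently initialized with `num_lda_topics=20`, not 16.)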

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import glob
import os

from tqdm import tqdm
import pandas as pd
import gensim
#from contracts_global import doc2tokens, batchStreamObjectBranches
from joblib import Parallel, delayed

import pipeline
import plutil

# Part 0: Initialize the pipeline

In [130]:
# Build the pipeline object that the later cells query for file paths and
# settings (e.g. pl.num_lda_topics, pl.use_aws) and use for verbose printing.
# NOTE(review): num_lda_topics=20 here, while the notebook intro mentions
# K=16 -- confirm which value is intended.
pl = pipeline.Pipeline("canadian", config_fname="canadian_jj.conf", mode="s3",
                       lang_list=["eng"], splitter="elliott", batch_mode="contract",
                       use_aws=False, num_lda_topics=20, verbose=True)

Looking for config file ./configs/canadian_jj.conf
Full output path: C:/Dropbox/Labor_Contracts_Canada/./analysis/output/analysis/2022-12-jeff-python/
Looking for config file ./configs/canadian_jj.conf


# Part 1: Prepare LDA inputs

## Step 1: Extract object branches

In [6]:
def parse_obj_branch(obj_branch_lists):
    """Flatten one row's object branches into a single space-separated string.

    `obj_branch_lists` is iterated as a sequence of word sequences: the words
    of each branch are joined with spaces, and the resulting branch strings
    are then joined with spaces as well.
    """
    return ' '.join(' '.join(word_list) for word_list in obj_branch_lists)

# This will hold DFs for each contract, which we combine at the end to make a
# single obranch.pkl file
all_obranch_dfs = []
pdata_pkl_path = pl.get_pdata_output_path()
pl.vprint("Loading pdata files from " + str(pdata_pkl_path))
pdata_pkl_fpaths = glob.glob(os.path.join(pdata_pkl_path,"*.pkl"))
if not pdata_pkl_fpaths:
    # pd.concat([]) would fail below with an unhelpful "No objects to
    # concatenate" message, so stop here with the actual cause
    raise ValueError(f"No pdata .pkl files found in {pdata_pkl_path}")
pdata_iter = tqdm(pdata_pkl_fpaths)
for cur_pdata_pkl_fpath in pdata_iter:
    pkl_fname = os.path.basename(cur_pdata_pkl_fpath)
    # Show the file currently being processed in the progress bar
    pdata_iter.set_description(pkl_fname)
    pdata_df = plutil.safe_load_pickle(cur_pdata_pkl_fpath)
    object_df = pdata_df[["contract_id","article_num","sentence_num",
                          "statement_num","object_branches"]].copy()
    # Collapse each row's object branches into one space-separated string
    object_df["obj_branch_doc"] = object_df["object_branches"].apply(parse_obj_branch)
    # So now we don't need the list version anymore
    del object_df["object_branches"]
    # But now we need to "sum" (concatenate) up to the article level
    # First to the sentence level. agg(" ".join) is crucial here: it
    # concatenates the strings within each group
    sent_df = object_df.groupby(["contract_id","article_num","sentence_num"], as_index=False)["obj_branch_doc"].agg(" ".join)
    # And then the article level
    art_df = sent_df.groupby(["contract_id","article_num"], as_index=False)["obj_branch_doc"].agg(" ".join)
    # Now we have a df of object branches at the article level!
    all_obranch_dfs.append(art_df)
# Now we combine the obranch dfs into a single DF across all contracts
# NOTE(review): the index is not reset here, so index labels restart at 0 for
# every contract (see the displayed obranch_df) -- use positional access only
obranch_df = pd.concat(all_obranch_dfs)
obranch_pkl_fpath = pl.get_obranch_fpath(ext="pkl")
obranch_csv_fpath = pl.get_obranch_fpath(ext="csv")
#pl.vprint(f"Object branches parsed. Saving {obranch_pkl_fpath}")
plutil.safe_to_pickle(obranch_df, obranch_pkl_fpath)
plutil.safe_to_csv(obranch_df, obranch_csv_fpath)


Loading pdata files from C:/Dropbox/Labor_Contracts_Canada/./analysis/output/analysis/2022-12-jeff-python/03b_pdata_pkl


0003405a_eng.pkl: 100%|██████████| 20/20 [00:00<00:00, 48.25it/s]


## Step 2: Construct Gensim dictionary

In [7]:
# Uses gensim instead of spacy to run the topic model
import codecs
import csv
import os
from six import iteritems
import string

import gensim
import joblib
import pandas as pd
import numpy as np

import plutil

In [8]:
# Preprocessing options
# Number of most-frequent tokens to filter out of the gensim dictionary once
# it has been built (set to -1 to not remove any tokens)
remove_top_n = -1

In [10]:
# We can just directly load the .pkl file of the exported df from
# extract_object_branches
obranch_fpath = pl.get_obranch_fpath(ext="pkl")
obranch_df = plutil.safe_load_pickle(obranch_fpath)
article_stream = obranch_df["obj_branch_doc"]

# Preprocess every article exactly once and keep the results, since the same
# preprocessed docs are needed for the dictionary, the joblib dump, and the
# doc_clean column below. Building the list up front (rather than appending
# from inside a generator) means the list can never get out of step with
# obranch_df, no matter how many times the docs are iterated.
preprocessed_articles = [pl.preprocess_text(cur_article)
                         for cur_article in tqdm(article_stream)]

# Initialize the dictionary and feed it the preprocessed docs
contract_dict = gensim.corpora.Dictionary()
contract_dict.add_documents(preprocessed_articles)
# Final preprocessing step: remove tokens that appear in too many contracts
if remove_top_n > -1:
    contract_dict.filter_n_most_frequent(remove_top_n)
dict_fpath = pl.get_lda_dict_fpath()
# Make sure the dirs exist before saving
plutil.make_dirs(dict_fpath)
contract_dict.save(dict_fpath)
# And save the list of preprocessed docs as well
joblib.dump(preprocessed_articles, pl.get_preprocessed_fpath())
# AND if we're working with the object branches, add it as a column in the
# original object branch df and save the new version as well
obranch_df["doc_clean"] = preprocessed_articles
plutil.safe_to_pickle(obranch_df, pl.get_preprocessed_df_fpath())

100%|██████████| 423/423 [00:00<00:00, 444.31it/s]


In [11]:
obranch_df

Unnamed: 0,contract_id,article_num,obj_branch_doc,doc_clean
0,0000102a,0,the general purpose of this agreement to set f...,"[purpos, agreement, set, forth, work, condit, ..."
1,0000102a,1,into force on the date of its by the party sha...,"[forc, date, parti, shall, remain, forc, effec..."
2,0000102a,2,in wood operation in t h e province of newfoun...,"[wood, oper, provinc, newfoundland, save, wood..."
3,0000102a,3,for the purpose of erect structure outside the...,"[purpos, erect, structur, outsid, limit, compa..."
4,0000102a,4,s h agreement a t i t h e procurement of woo...,"[agreement, procur, wood, manufactur, comparis..."
...,...,...,...,...
21,0003405a,21,a vision care plan which will provide for expe...,"[vision, care, plan, provid, expens, incur, em..."
22,0003405a,23,applicable for the aforesaid employee where be...,"[applic, aforesaid, employe, collect, agreemen..."
23,0003405a,24,to become eligible under this plan,"[elig, plan]"
24,0003405a,25,if be an authorized leave of absence have be u...,"[author, leav, absenc, term, plan, employe, co..."


# Part 2: AWS processing

## Step 3: Split the object branch docs into chunks

In [61]:
# Split the full set of docs up into smaller chunks, so corpora can be constructed
# for each chunk (on different AWS instances) and then combined at the end
import pandas as pd
import numpy as np
import gensim

#import aws_api
import plutil

In [79]:
def txt_to_doc(cur_txt, dictionary):
    """Convert a plaintext document into a gensim-format (bag-of-words) doc.

    The text is lowercased and split on whitespace, and the resulting tokens
    are handed to `dictionary.doc2bow`, whose result is returned unchanged.
    """
    tokens = cur_txt.lower().split()
    bow_doc = dictionary.doc2bow(tokens)
    return bow_doc

def split_and_save(art_df):
    """Pickle the article df, split into chunks if the pipeline asks for it.

    Reads the chunk count from `pl.get_num_lda_chunks()`. A value of -1 means
    "no splitting": the whole df is saved to `pl.get_lda_doclist_fpath()`.
    Otherwise the rows are divided, in order, into that many nearly-equal
    pieces and each piece is saved to `pl.get_lda_doclist_fpath(i)`.

    Returns the list of file paths that were written.
    """
    # Read the setting once, through the getter, so the split and the log
    # message are guaranteed to agree
    num_chunks = pl.get_num_lda_chunks()
    if num_chunks == -1:
        # No splitting required, just save the df
        output_fpath = pl.get_lda_doclist_fpath()
        plutil.safe_to_pickle(art_df, output_fpath)
        pl.vprint(f"Saved {output_fpath}")
        return [output_fpath]
    # Otherwise, if we're here, split into a list of dfs, one for each subcorpus.
    # We split the row *positions* and select with iloc, rather than passing
    # the DataFrame itself to np.array_split, which relies on the deprecated
    # DataFrame.swapaxes in recent pandas versions. The chunk sizes are the same.
    position_chunks = np.array_split(np.arange(len(art_df)), num_chunks)
    df_list = [art_df.iloc[positions] for positions in position_chunks]
    df_lens = [len(df) for df in df_list]
    pl.vprint("DF split into " + str(num_chunks) + " pieces, lengths: " + str(df_lens))
    # Finally, save each one to a separate pickle file
    fpath_list = []
    for doclist_num, cur_doclist in enumerate(df_list):
        output_fpath = pl.get_lda_doclist_fpath(doclist_num)
        plutil.safe_to_pickle(cur_doclist, output_fpath)
        pl.vprint("Saved " + str(output_fpath))
        fpath_list.append(output_fpath)
    return fpath_list


In [97]:
pl.vprint("split_docs()")
# Load the preprocessed object-branch df saved in Step 2 and hand it to
# split_and_save(), which pickles it whole or in chunks.
# (Disabled guard kept from an earlier version of this step:)
#if not pl.num_lda_subsets:
#    raise Exception("num_lda_subsets must be specified in the Pipeline "
#                    + "constructor if you are running the LDA pipeline")
# It's a bit easier in object branch mode, since we just split a big df
obranch_df_fpath = pl.get_preprocessed_df_fpath()
pl.vprint("Loading " + str(obranch_df_fpath))
obranch_df = plutil.safe_load_pickle(obranch_df_fpath)
# Split (if configured) and save; keeps the list of written file paths
obranch_fpath_list = split_and_save(obranch_df)

split_docs()
Loading C:/Dropbox/Labor_Contracts_Canada/./analysis/output/analysis/2022-12-jeff-python/lda02c_preprocessed_df.pkl
Saved C:/Dropbox/Labor_Contracts_Canada/./analysis/output/analysis/2022-12-jeff-python/lda03_doclist.pkl


In [98]:
obranch_df

Unnamed: 0,contract_id,article_num,obj_branch_doc,doc_clean
0,0000102a,0,the general purpose of this agreement to set f...,"[purpos, agreement, set, forth, work, condit, ..."
1,0000102a,1,into force on the date of its by the party sha...,"[forc, date, parti, shall, remain, forc, effec..."
2,0000102a,2,in wood operation in t h e province of newfoun...,"[wood, oper, provinc, newfoundland, save, wood..."
3,0000102a,3,for the purpose of erect structure outside the...,"[purpos, erect, structur, outsid, limit, compa..."
4,0000102a,4,s h agreement a t i t h e procurement of woo...,"[agreement, procur, wood, manufactur, comparis..."
...,...,...,...,...
21,0003405a,21,a vision care plan which will provide for expe...,"[vision, care, plan, provid, expens, incur, em..."
22,0003405a,23,applicable for the aforesaid employee where be...,"[applic, aforesaid, employe, collect, agreemen..."
23,0003405a,24,to become eligible under this plan,"[elig, plan]"
24,0003405a,25,if be an authorized leave of absence have be u...,"[author, leav, absenc, term, plan, employe, co..."


## Step 4: Export to AWS for batch processing

In [99]:
# Need to install paramiko and fabric if you're using AWS
#!pip install paramiko
#!pip install fabric

In [100]:
# Exports the necessary files to the various AWS instances, before
# remote_control_aws runs them
import datetime
import glob
import os

import aws_api
import fabric_api

In [101]:
# SSH settings
# Format-string templates used by export_to_aws() when it prints shell-script
# lines instead of copying files from Python.
# Shell array entry for one instance: {i} = instance number, {the_dns} = its DNS
dns_template = "DNS[{i}]=\"{the_dns}\""
# Shell `export` line storing one instance's DNS in an environment variable
env_var_template = "export DNS{i}=\"{dns}\""
# Full ssh command that sets up and launches the corpus run on one instance
# ({dns} = instance DNS, {inst_num} = instance number).
# NOTE(review): the key path (/home/jjacobs/aws/jeff.pem) and the remote
# anaconda paths are hard-coded -- consider moving them to the config file.
ssh_template = "ssh -i /home/jjacobs/aws/jeff.pem -o StrictHostKeyChecking=no ubuntu@{dns} \"mkdir obranch_lda; mv 02* 03* obranch_lda; /home/ubuntu/anaconda3/bin/conda install -y -q fabric; /home/ubuntu/anaconda3/bin/python canadian_aws_corpus_run.py {inst_num};\""

In [102]:
def export_to_aws(custom_glob=None, instances=None):
    """Start the AWS instances and export the LDA pipeline files to them.

    Args:
        custom_glob: Optional glob pattern selecting the LDA pipeline files to
            copy. When omitted, os.path.join("lda_pipeline", "lda*") is used.
        instances: Optional collection of instance numbers (positions in the
            DNS list returned by aws_api.get_all_dns()) to copy to. When
            omitted (None), the files are copied to every instance.
    """
    # Set to True if you want to export the files via shell script rather than
    # within Python
    create_shell_script = False
    if custom_glob:
        pl.iprint("Using custom glob: \"" + custom_glob + "\"")
        lda_glob = custom_glob
    else:
        lda_glob = os.path.join("lda_pipeline","lda*")
    # First we have to launch the aws instances (the returned ids aren't needed)
    aws_api.start_instances(pl.num_lda_subsets)
    # Get their DNS addresses
    dns_list = aws_api.get_all_dns()
    if create_shell_script:
        # Print the DNS array for the aws_corpus_export.sh file
        for dns_num, cur_dns in enumerate(dns_list):
            print(dns_template.format(i=dns_num, the_dns=cur_dns))
        # Now print the export commands that will store the DNS names
        for dns_num, cur_dns in enumerate(dns_list):
            print(env_var_template.format(i=dns_num, dns=cur_dns))
        # Print the ssh commands for the ssh_run_aws.sh file, all on one line,
        # each one backgrounded with " & "
        ssh_list = []
        for dns_num, cur_dns in enumerate(dns_list):
            ssh_list.append("( " + ssh_template.format(dns=cur_dns, inst_num=dns_num) + " ) & ")
        ssh_str = "".join(ssh_list)
        print(ssh_str)
    else:
        # Do the exporting directly through python
        fpath_list = []
        # Standard set of files
        # NOTE(review): this config name differs from the one the pipeline is
        # initialized with in this notebook -- confirm it is the intended file
        fpath_list.append(os.path.join("configs","canadian_dominik.conf"))
        lda_fpath_list = glob.glob(lda_glob)
        fpath_list.extend(lda_fpath_list)
        fpath_list.append("pipeline.py")
        fpath_list.append("pipeline_util.py")
        fpath_list.append("canadian_aws_corpus_run.py")
        fpath_list.append("aws_api.py")
        fpath_list.append("fabric_api.py")
        #fpath_list.append(pl.get_lda_dict_fpath())
        # And the actual exporting
        for dns_num, cur_dns in enumerate(dns_list):
            # instances=None means "all instances"; without this check the
            # membership test below raises TypeError for the default argument
            if instances is not None and dns_num not in instances:
                continue
            print("Copying to instance #" + str(dns_num))
            # Need to make sure to copy the specific doclist file
            cur_fpath_list = fpath_list.copy()
            #cur_fpath_list.append(pl.get_lda_doclist_fpath(dns_num))
            fabric_api.copy_to_instance(cur_fpath_list, cur_dns)

## Step 5: Remote control the AWS instances

In [103]:
# Use Fabric to run construct_corpus on AWS remotely
import datetime
import os

import plutil
import aws_api
import fabric_api

In [104]:
# Only relevant when the pipeline is configured to run on AWS
if pl.use_aws:
    dns_list = aws_api.get_all_dns()
    # Run the python file remotely on each instance
    commands = [
        "mkdir lda",
        "mv 02* 03* lda",
        "/home/ubuntu/anaconda3/bin/python canadian_aws_corpus_run.py $INST_NUM",
        #"scp obranch_lda/05* jjacobs@textlab.econ.columbia.edu:~/ashmacnaidu/canadian_data/obranch_lda/"
    ]
    result_dict = fabric_api.run_commands(dns_list, commands)
    print("DNS info:")
    print(list(enumerate(dns_list)))
    # Build the timestamp with strftime so it contains no ":" characters,
    # which are not allowed in Windows filenames (str(datetime.now()) yields
    # e.g. "12:34:56" for the time part)
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    result_fpath = os.path.join(".","logs","aws_output_" + timestamp + ".pkl")
    print("Saving result_dict to " + result_fpath)
    plutil.safe_to_pickle(result_dict, result_fpath)

## Step 6: Construct the LDA corpus

(The key step, necessary whether or not you're running on AWS)

In [105]:
# Uses joblib to parallelize the construction of the corpora.
import itertools
import logging
import multiprocessing
import os
import sys

import pandas as pd
import numpy as np
import gensim
import joblib
from joblib import Parallel, delayed

import plutil

In [116]:
class CanadianCorpus(object):
    """Streaming bag-of-words corpus over the pipeline's object-branch docs.

    Each iteration re-streams the documents from the pipeline object's
    `stream_object_branches()`, lowercases and whitespace-splits each one, and
    converts it with `dictionary.doc2bow`, so nothing is held in memory.

    Args:
        dictionary: Object providing `doc2bow(tokens)`.
        pl_obj: Optional pipeline object providing `stream_object_branches()`.
            When omitted, the notebook-level `pl` is used.
    """
    def __init__(self, dictionary, pl_obj=None):
        self.dictionary = dictionary
        self.pl_obj = pl_obj

    def __iter__(self):
        # The original called a bare stream_object_branches(), which is not
        # defined anywhere; the streaming method lives on the pipeline object
        # (cf. stream_to_static_corpus)
        source = self.pl_obj if self.pl_obj is not None else pl
        return (self.dictionary.doc2bow(f.lower().split())
                for f in source.stream_object_branches())

def construct_streaming_corpus(pl, dictionary):
    """Set up the streaming corpus for topic modeling.

    Returns a CanadianCorpus that streams documents from the given pipeline
    object `pl` and vectorizes them with `dictionary`.
    """
    #corpus = CanadianObjectBranchCorpus(dictionary)
    return CanadianCorpus(dictionary, pl)

def load_doclist_df(doclist_num):
    """Load the pickled doc-list df for chunk number `doclist_num`.

    The file path comes from `pl.get_lda_doclist_fpath(chunk_num=...)`, so
    where the file actually lives depends on the pipeline configuration.
    """
    return plutil.safe_load_pickle(
        pl.get_lda_doclist_fpath(chunk_num=doclist_num)
    )

def save_static_corpus(subcorpus, subcorp_num):
    """Serialize a bag-of-words corpus to disk in Matrix Market format.

    A non-negative `subcorp_num` saves to that subcorpus's path
    (`pl.get_lda_subcorp_fpath`); any other value means this is the full
    corpus, which is saved to `pl.get_lda_corpus_fpath()`.
    """
    is_subcorpus = subcorp_num >= 0
    if not is_subcorpus:
        # Full corpus
        pl.vprint("Saving full corpus")
        target_fpath = pl.get_lda_corpus_fpath()
    else:
        pl.vprint("Saving subcorpus #" + str(subcorp_num))
        target_fpath = pl.get_lda_subcorp_fpath(subcorp_num)
    # Make sure the destination directory exists before writing
    plutil.make_dirs(target_fpath)
    gensim.corpora.MmCorpus.serialize(target_fpath, subcorpus)
    pl.vprint(f"Corpus serialized to {target_fpath}")

def stream_to_static_corpus(pl, dictionary):
    """Materialize the streamed object branches as an in-memory corpus.

    Every document yielded by `pl.stream_object_branches()` is lowercased,
    split on whitespace and converted with `dictionary.doc2bow`; the
    resulting vectors are returned as a plain (serializable) list.
    """
    static_corpus = []
    for branch_doc in pl.stream_object_branches():
        tokens = branch_doc.lower().split()
        static_corpus.append(dictionary.doc2bow(tokens))
    return static_corpus

def vectorize_docs(dictionary, cur_doc_list):
    """Convert each document in `cur_doc_list` to a bag-of-words vector.

    Each element is passed as-is to `dictionary.doc2bow`; the vectors are
    returned as a list in the same order as the input documents.
    """
    bow_vectors = []
    for doc_tokens in cur_doc_list:
        bow_vectors.append(dictionary.doc2bow(doc_tokens))
    return bow_vectors


In [117]:
# [NOTE: Used to be construct_static_corpus(), until 2019-02-19 -JJ]
# Construct the static corpus in parallel, to make the weight computations
# quicker
# subcorp_num stays -1 for a local (non-AWS) run, which save_static_corpus()
# treats as "this is the full corpus"
subcorp_num = -1
if pl.use_aws:
    # Get the subcorpus number for this AWS/TL instance
    subcorp_num = pl.get_instance_num()
    # Load the doclist for this subcorpus number
    doclist_df = load_doclist_df(subcorp_num)
else:
    doclist_df_fpath = pl.get_lda_doclist_fpath()
    pl.vprint(f"Loading doclist_df from {doclist_df_fpath}")
    doclist_df = plutil.safe_load_pickle(doclist_df_fpath)
doc_list = list(doclist_df["doc_clean"])

# Now load the dictionary so we have the doc2bow function.
# The dictionary was written with gensim's Dictionary.save() in Step 2, so it
# is read back with the matching Dictionary.load() rather than joblib.load(),
# which is only documented for files produced by joblib.dump()
dictionary = gensim.corpora.Dictionary.load(pl.get_lda_dict_fpath())
pl.vprint("Dictionary loaded")

# Use multiprocessing to process batches of docs in parallel
batch_size = 1000
doc_chunks = [doc_list[i:i+batch_size] for i in range(0,len(doc_list),batch_size)]
num_workers = plutil.get_num_workers()
par_obj = Parallel(n_jobs=num_workers, verbose=5)
chunk_vectors = par_obj(delayed(vectorize_docs)(dictionary, cur_chunk) for cur_chunk in doc_chunks)
# Now put the chunks together (in their original order) and save in mm format
all_docs = [doc_vector for chunk in chunk_vectors for doc_vector in chunk]
pl.vprint("Corpus construction complete.")
save_static_corpus(all_docs, subcorp_num)

Loading doclist_df from C:/Dropbox/Labor_Contracts_Canada/./analysis/output/analysis/2022-12-jeff-python/lda03_doclist.pkl
Dictionary loaded


[Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.


Corpus construction complete.
Saving full corpus
Corpus serialized to C:/Dropbox/Labor_Contracts_Canada/./analysis/output/analysis/2022-12-jeff-python/lda06_gensim_corpus.pkl


[Parallel(n_jobs=15)]: Done   1 out of   1 | elapsed:    0.4s finished


## Step 7: Combine AWS corpora

In [118]:
# Takes the separate corpora created on the various AWS instances and combines
# them into one final corpus for use by the LDA model in the next step
import itertools

import gensim

def load_subcorp(pl, subcorp_num):
    """Open subcorpus number `subcorp_num` as a gensim MmCorpus.

    The file path is obtained from `pl.get_lda_subcorp_fpath` and reported
    through `pl.iprint` before the corpus is opened.
    """
    mm_fpath = pl.get_lda_subcorp_fpath(subcorp_num=subcorp_num)
    pl.iprint("Loading subcorpus #" + str(subcorp_num) + " from " + str(mm_fpath))
    return gensim.corpora.MmCorpus(mm_fpath)

def combine_corpora(pl):
    """Concatenate all subcorpora into the single final LDA corpus file.

    Loads subcorpora 0 .. pl.num_lda_subsets-1 with load_subcorp() and
    serializes them, chained in that order, to `pl.get_lda_corpus_fpath()`.
    """
    # See http://thread.gmane.org/gmane.comp.ai.gensim/1842
    subcorpora = []
    for subcorp_idx in range(pl.num_lda_subsets):
        subcorpora.append(load_subcorp(pl, subcorp_idx))
    # chain.from_iterable walks through each subcorpus in turn, so the
    # documents are written out as one continuous stream
    combined_stream = itertools.chain.from_iterable(subcorpora)
    gensim.corpora.MmCorpus.serialize(pl.get_lda_corpus_fpath(), combined_stream)

## Step 8: Run LDA

In [119]:
# Trains an LDA model using gensim
import logging
import os

import gensim
import joblib

import plutil

In [133]:
def compute_lda_similarity(lda_model, doc1, doc2):
    """Return the cosine similarity between two docs in LDA topic space.

    Each doc is mapped through `lda_model[...]` and the two results are
    compared with `gensim.matutils.cossim`.
    """
    topic_vec_a = lda_model[doc1]
    topic_vec_b = lda_model[doc2]
    return gensim.matutils.cossim(topic_vec_a, topic_vec_b)

def compute_sims(lda_model, dictionary):
    """Compute LDA-based similarities for a list of contract pairs.

    Reads the pairs CSV (columns `contract_id` and `prev_id`), converts both
    contracts of each pair to bag-of-words docs with `dictionary`, and stores
    the cosine similarity of their LDA topic vectors in a new `sim_lda` column.

    Returns the pairs DataFrame with the `sim_lda` column filled in.
    """
    # NOTE(review): `pairs_file` and `loadTxtById` are not defined in the
    # visible part of this notebook -- they must be provided before this
    # function can run.
    pairs_df = pd.read_csv(pairs_file)
    pairs_df["sim_lda"] = np.nan
    for row_num, row_data in pairs_df.iterrows():
        pl.vprint("*** Pair #" + str(row_num))
        pl.vprint(str(row_data))
        # Now get the corresponding docs and compute their LDA sims
        contract_id = int(row_data["contract_id"])
        prev_id = int(row_data["prev_id"])
        cur_doc = txt_to_doc(loadTxtById(contract_id), dictionary)
        prev_doc = txt_to_doc(loadTxtById(prev_id), dictionary)
        sim = compute_lda_similarity(lda_model, cur_doc, prev_doc)
        # DataFrame.set_value() was removed in pandas 1.0; .at is the
        # label-based scalar setter that replaces it
        pairs_df.at[row_num, "sim_lda"] = sim
    return pairs_df

def launch_lda(corpus, dictionary, random_state=None):
    """Train a gensim LdaMulticore model on `corpus`.

    Args:
        corpus: Bag-of-words corpus to train on.
        dictionary: Passed to gensim as `id2word`.
        random_state: Optional seed forwarded to LdaMulticore so that a run
            can be reproduced. The default (None) keeps the previous,
            unseeded behavior.

    The number of topics comes from `pl.num_lda_topics` and the number of
    worker processes from `plutil.get_num_workers()`.

    Returns the trained model.
    """
    pl.vprint("Launching LDA model with " + str(pl.num_lda_topics) + " topics")
    # Show what the model is being trained on
    print(corpus)
    print(dictionary)
    model = gensim.models.LdaMulticore(corpus, id2word=dictionary,
                                       num_topics=pl.num_lda_topics,
                                       workers=plutil.get_num_workers(),
                                       iterations=100,
                                       random_state=random_state)
    return model

def load_lda(filename):
    """Load a previously saved LdaMulticore model from `filename`."""
    return gensim.models.LdaMulticore.load(filename)

def pairwise_similarities(pl):
    """Compute the pairwise contract similarities and save them to CSV.

    Calls compute_sims() and save_sims(). The original called `computeSims`
    and `saveSims`, which do not exist in this notebook.
    """
    # NOTE(review): `model`, `canadian_dict` and `sims_file` are read from the
    # notebook namespace; `canadian_dict` and `sims_file` are not defined in
    # the visible cells -- confirm they are set before calling this.
    ## Compute similarities
    sims = compute_sims(model, canadian_dict)
    save_sims(sims, sims_file)

def print_lda(lda_model, num_topics=20, num_words=20):
    """Print the topics of `lda_model`, one per line, each followed by a rule.

    Args:
        lda_model: Model providing `show_topics(num_topics=..., num_words=...)`.
        num_topics: How many topics to request (default 20, as before).
        num_words: How many words to request per topic (default 20, as before).
    """
    all_topics = lda_model.show_topics(num_topics=num_topics, num_words=num_words)
    for topic in all_topics:
        # topic[0] is printed first and topic[1] second, separated by a space
        print(topic[0],topic[1])
        print("-----")

def save_lda(pl, model):
    """Save `model` to the path given by `pl.get_lda_model_fpath()`."""
    model_fpath = pl.get_lda_model_fpath()
    model.save(model_fpath)

def save_sims(pairs_df, filename):
    """Write the pairs df to `filename` as CSV, without the index column."""
    csv_options = {"index": False}
    pairs_df.to_csv(filename, **csv_options)

def similarity_test(lda_model):
    """Sanity check: print a doc's LDA similarity to itself and to another doc.

    Reads two hard-coded text files, converts each to a bag-of-words doc,
    maps both through `lda_model`, and prints first the similarity of doc 1
    with itself and then the similarity of doc 1 with doc 2.
    """
    ## Similarity test
    dictionary = loadDictionary(os.path.join(LDA_PATH, CORPUS_NAME + "_dict.pkl"))
    txt_fpaths = ["/home/ubuntu/mongo_txts/00001_eng.txt",
                  "/home/ubuntu/mongo_txts/00002_eng.txt"]
    bow_docs = []
    for txt_fpath in txt_fpaths:
        with codecs.open(txt_fpath,'r','utf-8') as f:
            raw_txt = f.read()
            bow_docs.append(dictionary.doc2bow(raw_txt.lower().split()))
    first_lda = lda_model[bow_docs[0]]
    second_lda = lda_model[bow_docs[1]]
    # Self-similarity first, then the cross-document similarity
    print(gensim.matutils.cossim(first_lda, first_lda))
    print(gensim.matutils.cossim(first_lda, second_lda))


In [134]:
# TODO: Split into "regular" LDA run and then the pairwise contract similarities
# run (i.e., make two separate pipelines for these two separate tasks)
## Load the dictionary and corpus
lda_dict_fpath = pl.get_lda_dict_fpath()
pl.vprint("Loading LDA dictionary from " + str(lda_dict_fpath))
# The dictionary was written with gensim's Dictionary.save() in Step 2, so it
# is read back with the matching Dictionary.load() rather than joblib.load(),
# which is only documented for files produced by joblib.dump()
lda_dict = gensim.corpora.Dictionary.load(lda_dict_fpath)
# Load the combined corpus
corpus_fpath = pl.get_lda_corpus_fpath()
pl.vprint("Loading LDA corpus from " + str(corpus_fpath))
corpus = gensim.corpora.MmCorpus(corpus_fpath)
# Construct LDA model
# If a saved model already exists, either announce the overwrite (when
# pl.force_overwrite is set) or pause for confirmation before retraining
if os.path.isfile(pl.get_lda_model_fpath()):
    if pl.force_overwrite:
        pl.vprint("OVERWRITING LDA MODEL")
    else:
        input("LDA MODEL ALREADY EXISTS. Press Enter to continue and overwrite it "
              + "or Ctrl+C to kill this script...")
model = launch_lda(corpus, lda_dict)
save_lda(pl, model)

print_lda(model)

Loading LDA dictionary from C:/Dropbox/Labor_Contracts_Canada/./analysis/output/analysis/2022-12-jeff-python/lda02a_gensim_dict.pkl
Loading LDA corpus from C:/Dropbox/Labor_Contracts_Canada/./analysis/output/analysis/2022-12-jeff-python/lda06_gensim_corpus.pkl
Launching LDA model with 20 topics
MmCorpus(423 documents, 2977 features, 31956 non-zero entries)
Dictionary<2977 unique tokens: ['abil', 'adjust', 'advanc', 'agreement', 'ail']...>
0 0.039*"classif" + 0.035*"agreement" + 0.035*"job" + 0.031*"chang" + 0.026*"rate" + 0.022*"schedul" + 0.019*"wage" + 0.019*"establish" + 0.017*"attach" + 0.015*"employe" + 0.015*"compani" + 0.013*"dai" + 0.013*"alter" + 0.013*"oper" + 0.013*"grievanc" + 0.012*"parti" + 0.011*"work" + 0.010*"method" + 0.010*"thirti" + 0.010*"advers"
-----
1 0.048*"dai" + 0.037*"work" + 0.033*"employe" + 0.024*"time" + 0.021*"holidai" + 0.016*"compani" + 0.013*"period" + 0.012*"senior" + 0.011*"job" + 0.011*"oper" + 0.010*"pai" + 0.010*"vacat" + 0.009*"agreement" + 0.0

## Step 9: Save LDA topics for further analysis

In [136]:
# Print and save the LDA topic list to a .txt file
import gensim

import plutil

In [138]:
# Reload the trained model from disk and list its topics in topic-id order
lda_model_fpath = pl.get_lda_model_fpath()
lda_model = gensim.models.LdaMulticore.load(lda_model_fpath)
all_topics = sorted(lda_model.show_topics(num_topics=pl.num_lda_topics),
                    key=lambda topic: topic[0])
# Collect the pieces of the report and join them once at the end
output_chunks = []
for cur_topic in all_topics:
    topic_label = "Topic " + str(cur_topic[0]) + "\n"
    topic_terms = cur_topic[1] + "\n\n"
    output_chunks.append(topic_label)
    print(topic_label)
    output_chunks.append(topic_terms)
    print(topic_terms)
output_buffer = "".join(output_chunks)
# Write the same text that was printed above to the LDA output file
plutil.safe_write_to_file(output_buffer, pl.get_lda_output_fpath())

Topic 0

0.039*"classif" + 0.035*"agreement" + 0.035*"job" + 0.031*"chang" + 0.026*"rate" + 0.022*"schedul" + 0.019*"wage" + 0.019*"establish" + 0.017*"attach" + 0.015*"employe"


Topic 1

0.048*"dai" + 0.037*"work" + 0.033*"employe" + 0.024*"time" + 0.021*"holidai" + 0.016*"compani" + 0.013*"period" + 0.012*"senior" + 0.011*"job" + 0.011*"oper"


Topic 2

0.032*"holidai" + 0.030*"pai" + 0.027*"dai" + 0.024*"work" + 0.024*"employe" + 0.019*"time" + 0.017*"oper" + 0.017*"compani" + 0.015*"disabl" + 0.013*"agreement"


Topic 3

0.026*"employe" + 0.017*"work" + 0.013*"compani" + 0.013*"provid" + 0.012*"benefit" + 0.011*"employ" + 0.011*"agreement" + 0.011*"plan" + 0.011*"job" + 0.011*"period"


Topic 4

0.036*"work" + 0.023*"employe" + 0.018*"provid" + 0.018*"requir" + 0.018*"transport" + 0.018*"board" + 0.016*"lodg" + 0.013*"marshal" + 0.013*"camp" + 0.012*"agreement"


Topic 5

0.037*"employe" + 0.036*"work" + 0.022*"dai" + 0.021*"employ" + 0.018*"compani" + 0.016*"oper" + 0.013*"hour" 

## Step 10: Compute subnorm weights