In [1]:
import pickle

import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy
import scattertext as st

from scattertext import RankDifference

from sklearn.feature_extraction.text import CountVectorizer
from itertools import combinations 
import umap
from gensim.models.word2vec import Word2Vec

from tqdm import tqdm

from collections import Counter

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load data
def _load_pickle(path):
    """Unpickle and return the object stored in the file at ``path``.

    The file is opened in a ``with`` block so its handle is closed as soon as
    loading finishes (the previous ``pickle.load(open(...))`` form left every
    handle open).

    NOTE: ``pickle.load`` can execute arbitrary code while loading; only use
    this on pickle files you produced yourself or otherwise trust.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


first_corpus = _load_pickle("pickles/first_party_corpus.p")
second_corpus = _load_pickle("pickles/second_party_corpus.p")
third_corpus = _load_pickle("pickles/third_party_corpus.p")
fourth_corpus = _load_pickle("pickles/fourth_party_corpus.p")
fifth_corpus = _load_pickle("pickles/fifth_party_corpus.p")
sixth_corpus = _load_pickle("pickles/sixth_party_corpus.p")

# Load stop words
stop_words = list(_load_pickle("pickles/stop_words.p"))

# Load Spacy english model
nlp = spacy.load('en_core_web_sm')
# Raise spaCy's per-document character limit so long joined texts can be parsed.
nlp.max_length = 1500000

In [3]:
def sent_to_words(sentences):
    """Lazily tokenize every item of ``sentences``.

    Each item is converted with ``str()`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True`` (gensim strips accent marks from
    the tokens when this flag is set).

    Yields
    ------
    list
        The token list produced for one input item, in input order.
    """
    for raw_text in sentences:
        as_text = str(raw_text)
        tokens = gensim.utils.simple_preprocess(as_text, deacc=True)
        yield tokens

In [4]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document and drop every token found in ``stop_words``.

    Parameters
    ----------
    texts : iterable
        Documents; each one is converted with ``str()`` and tokenized with
        gensim's ``simple_preprocess`` before filtering.

    Returns
    -------
    list of list of str
        One filtered token list per input document, in input order.
    """
    # ``stop_words`` is a module-level list; build a set once per call so each
    # membership test is a hash lookup instead of a linear scan of the list.
    stopword_lookup = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stopword_lookup]
        for doc in texts
    ]

In [5]:
def make_bigrams(texts, bigram_mod):
    """Return ``bigram_mod[doc]`` for every document in ``texts``.

    ``bigram_mod`` only needs to support indexing with a document; the callers
    in this notebook pass a gensim ``Phraser``.
    """
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

In [6]:
def make_trigrams(texts, bigram_mod, trigram_mod):
    """Return ``trigram_mod[bigram_mod[doc]]`` for every document in ``texts``.

    The bigram model is applied first and its output is fed to the trigram
    model; both only need to support indexing with a document.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

In [7]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize token lists, keeping only tokens with an allowed POS tag.

    Each token list is joined with single spaces and parsed with the
    module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    texts : iterable of iterable of str
        Tokenized documents.
    allowed_postags : container of str
        Values of ``token.pos_`` to keep.

    Returns
    -------
    list of list of str
        The ``lemma_`` of every kept token, one list per input document.
    """
    def keep_lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [keep_lemmas(tokens) for tokens in texts]

In [8]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Train one MALLET LDA model per topic count and score each with c_v coherence.

    Topic counts are taken from ``range(start, limit, step)``, so ``limit``
    itself is excluded. The MALLET launcher location is read from the
    module-level ``mallet_path``.

    Parameters:
    ----------
    dictionary : Gensim dictionary (passed as ``id2word`` and to the scorer)
    corpus : Gensim bag-of-words corpus
    texts : List of tokenized input texts used by the coherence model
    limit : Exclusive upper bound on the number of topics
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of trained LdaMallet models, one per topic count
    coherence_values : c_v coherence of the model at the same position
    """

    fitted_models = []
    scores = []
    for topic_count in range(start, limit, step):
        mallet_lda = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=topic_count, id2word=dictionary)
        fitted_models.append(mallet_lda)
        scorer = CoherenceModel(model=mallet_lda, texts=texts, dictionary=dictionary, coherence='c_v')
        scores.append(scorer.get_coherence())

    return fitted_models, scores

In [9]:
def process_presidential_speech(president, corpus, filename, ngram = 1, display_output = False):
    """Fit LDA topic models for one president's transcripts and save a pyLDAvis page.

    The selected transcripts are tokenized, stop-word filtered, optionally
    phrase-merged and lemmatized. MALLET models are scored with c_v coherence
    for the topic counts in ``range(2, 40, 6)``; the first count whose
    coherence is higher than that of the next count is selected, or the last
    count when coherence never drops. A gensim ``LdaModel`` (used for the
    visualization and perplexity) and a MALLET model (used for coherence and
    the returned topics) are then trained with that topic count.

    Parameters
    ----------
    president : str
        Value matched against ``corpus.presidents``; also used in the output
        file name.
    corpus : table with ``presidents`` and ``transcripts`` columns
        (presumably a pandas DataFrame -- confirm against the caller).
    filename : str
        Folder / file-name prefix under ``Visualizations/``.
    ngram : int
        ``2`` applies the bigram model, ``3`` the bigram and trigram models;
        any other value leaves the tokens as unigrams.
    display_output : bool
        When true, print the coherence sweep, perplexity and coherence score
        and plot coherence against topic count.

    Returns
    -------
    tuple
        ``(log_perplexity, mallet_coherence, mallet_topics)`` where
        ``mallet_topics`` is ``ldamallet.show_topics(formatted=False)``.
    """
    # Topic counts tried during the coherence sweep: range(start, limit, step).
    start, limit, step = 2, 40, 6

    # No spaCy model is loaded here: lemmatization() uses the module-level
    # ``nlp``, so a second local model would only cost load time.
    text = list(corpus[corpus.presidents == president].transcripts.values)

    words = list(sent_to_words(text))

    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(words, min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[words], threshold=100)

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # Remove Stop Words
    words = remove_stopwords(words)

    # Merge phrases according to the requested n-gram level
    if ngram == 2:
        words = make_bigrams(words, bigram_mod)
    elif ngram == 3:
        words = make_trigrams(words, bigram_mod, trigram_mod)

    data_lemmatized = lemmatization(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)

    # Term Document Frequency (kept under its own name so the ``corpus``
    # argument is not shadowed)
    bow_corpus = [id2word.doc2bow(doc) for doc in data_lemmatized]

    # Coherence sweep over the candidate topic counts
    _models, coherence_values = compute_coherence_values(dictionary=id2word, corpus=bow_corpus, texts=data_lemmatized, start=start, limit=limit, step=step)
    topic_counts = range(start, limit, step)
    scored = list(zip(topic_counts, coherence_values))

    # Pick the first local peak. Default to the last candidate so that a
    # coherence curve that never drops selects its final (highest) point
    # instead of silently ignoring it.
    optimal_topics = scored[-1][0]
    for (topic_count, cv), (_next_count, next_cv) in zip(scored, scored[1:]):
        if cv > next_cv:
            optimal_topics = topic_count
            break

    if display_output:
        for topic_count, cv in scored:
            print("Num Topics =", topic_count, " has Coherence Value of", round(cv, 4))
        print("\nOptimal Topic Count: {}".format(optimal_topics))

    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=bow_corpus,
                                               id2word=id2word,
                                               num_topics=optimal_topics,
                                               random_state=100,
                                               update_every=1,
                                               chunksize=100,
                                               passes=10,
                                               alpha='auto',
                                               per_word_topics=True)

    # Compute Perplexity once so the printed and returned values agree
    perplexity = lda_model.log_perplexity(bow_corpus)
    if display_output:
        print('\nPerplexity: ', perplexity)  # a measure of how good the model is. lower the better.

    ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=bow_corpus, num_topics=optimal_topics, id2word=id2word)

    # Compute Coherence Score
    coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_ldamallet = coherence_model_ldamallet.get_coherence()
    if display_output:
        print('\nCoherence Score: ', coherence_ldamallet)

    # Visualize the topics
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, id2word)
    path = "Visualizations/" + filename + "/LDA/" + filename + "_" + president + "_LDA_visualization.html"
    pyLDAvis.save_html(vis, path)

    if display_output:
        plt.plot(topic_counts, coherence_values)
        plt.xlabel("Num Topics")
        plt.ylabel("Coherence score")
        # One-element tuple: a bare string would be iterated character by
        # character and label the line "c".
        plt.legend(("coherence_values",), loc='best')
        plt.show()

    return (perplexity, coherence_ldamallet, ldamallet.show_topics(formatted=False))

In [10]:
def generate_party_LDA_visualizations(corpus, filename, ngram = 1, display_output = False):
    """Save one pyLDAvis LDA page per distinct value of ``corpus.party``.

    NOTE: this notebook defines a function with this same name again in a
    later cell; when the cells run in order, that later definition replaces
    this one.

    For every party the matching transcripts are tokenized, stop-word
    filtered, optionally phrase-merged and lemmatized. MALLET models are
    scored with c_v coherence for the topic counts in ``range(2, 40, 6)``; the
    first count whose coherence is higher than that of the next count is
    selected, or the last count when coherence never drops. A gensim
    ``LdaModel`` with that topic count is visualized and written to
    ``Visualizations/<filename>/LDA/``.

    Parameters
    ----------
    corpus : table with ``party`` and ``transcripts`` columns
        (presumably a pandas DataFrame -- confirm against the caller).
    filename : str
        Folder / file-name prefix under ``Visualizations/``.
    ngram : int
        ``2`` applies the bigram model, ``3`` the bigram and trigram models;
        any other value leaves the tokens as unigrams.
    display_output : bool
        When true, print the coherence sweep and the selected topic count.
    """
    # Topic counts tried during the coherence sweep: range(start, limit, step).
    start, limit, step = 2, 40, 6

    # No spaCy model is loaded here: lemmatization() uses the module-level
    # ``nlp``, so a second local model would only cost load time.
    for party in set(corpus.party):

        text = list(corpus[corpus.party == party].transcripts.values)

        words = list(sent_to_words(text))

        # Build the bigram and trigram models
        bigram = gensim.models.Phrases(words, min_count=5, threshold=100) # higher threshold fewer phrases.
        trigram = gensim.models.Phrases(bigram[words], threshold=100)

        # Faster way to get a sentence clubbed as a trigram/bigram
        bigram_mod = gensim.models.phrases.Phraser(bigram)
        trigram_mod = gensim.models.phrases.Phraser(trigram)

        # Remove Stop Words
        words = remove_stopwords(words)

        # Merge phrases according to the requested n-gram level
        if ngram == 2:
            words = make_bigrams(words, bigram_mod)
        elif ngram == 3:
            words = make_trigrams(words, bigram_mod, trigram_mod)

        data_lemmatized = lemmatization(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

        # Create Dictionary
        id2word = corpora.Dictionary(data_lemmatized)

        # Term Document Frequency
        tdf_corpus = [id2word.doc2bow(doc) for doc in data_lemmatized]

        # Coherence sweep over the candidate topic counts
        _models, coherence_values = compute_coherence_values(dictionary=id2word, corpus=tdf_corpus, texts=data_lemmatized, start=start, limit=limit, step=step)
        scored = list(zip(range(start, limit, step), coherence_values))

        # Pick the first local peak. Default to the last candidate so that a
        # coherence curve that never drops selects its final (highest) point
        # instead of silently ignoring it.
        optimal_topics = scored[-1][0]
        for (topic_count, cv), (_next_count, next_cv) in zip(scored, scored[1:]):
            if cv > next_cv:
                optimal_topics = topic_count
                break

        if display_output:
            for topic_count, cv in scored:
                print("Num Topics =", topic_count, " has Coherence Value of", round(cv, 4))
            print("\nOptimal Topic Count: {}".format(optimal_topics))

        # Build LDA model
        lda_model = gensim.models.ldamodel.LdaModel(corpus=tdf_corpus,
                                                   id2word=id2word,
                                                   num_topics=optimal_topics,
                                                   random_state=100,
                                                   update_every=1,
                                                   chunksize=100,
                                                   passes=10,
                                                   alpha='auto',
                                                   per_word_topics=True)

        # The extra MALLET model that used to be trained here was never read,
        # so that training run has been dropped.

        # Visualize the topics
        pyLDAvis.enable_notebook()
        vis = pyLDAvis.gensim.prepare(lda_model, tdf_corpus, id2word)
        path = "Visualizations/" + filename + "/LDA/" + filename + "_" + party + "party_LDA_visualization.html"
        pyLDAvis.save_html(vis, path)


In [11]:
def generate_corpus_LDA_visualizations(corpus, filename, ngram = 1, display_output = False):
    """Save a pyLDAvis LDA page built from every transcript in ``corpus``.

    All transcripts are tokenized, stop-word filtered, optionally
    phrase-merged and lemmatized. MALLET models are scored with c_v coherence
    for the topic counts in ``range(2, 40, 6)``; the first count whose
    coherence is higher than that of the next count is selected, or the last
    count when coherence never drops. A gensim ``LdaModel`` with that topic
    count is visualized and written to
    ``Visualizations/<filename>/LDA/<filename>_LDA_visualization.html``.

    Parameters
    ----------
    corpus : table with a ``transcripts`` column
        (presumably a pandas DataFrame -- confirm against the caller).
    filename : str
        Folder / file-name prefix under ``Visualizations/``.
    ngram : int
        ``2`` applies the bigram model, ``3`` the bigram and trigram models;
        any other value leaves the tokens as unigrams.
    display_output : bool
        When true, print the coherence sweep and the selected topic count.
    """
    # Topic counts tried during the coherence sweep: range(start, limit, step).
    start, limit, step = 2, 40, 6

    # No spaCy model is loaded here: lemmatization() uses the module-level
    # ``nlp``, so a second local model would only cost load time.
    text = list(corpus.transcripts.values)

    words = list(sent_to_words(text))

    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(words, min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[words], threshold=100)

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # Remove Stop Words
    words = remove_stopwords(words)

    # Merge phrases according to the requested n-gram level
    if ngram == 2:
        words = make_bigrams(words, bigram_mod)
    elif ngram == 3:
        words = make_trigrams(words, bigram_mod, trigram_mod)

    data_lemmatized = lemmatization(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)

    # Term Document Frequency
    tdf_corpus = [id2word.doc2bow(doc) for doc in data_lemmatized]

    # Coherence sweep over the candidate topic counts
    _models, coherence_values = compute_coherence_values(dictionary=id2word, corpus=tdf_corpus, texts=data_lemmatized, start=start, limit=limit, step=step)
    scored = list(zip(range(start, limit, step), coherence_values))

    # Pick the first local peak. Default to the last candidate so that a
    # coherence curve that never drops selects its final (highest) point
    # instead of silently ignoring it.
    optimal_topics = scored[-1][0]
    for (topic_count, cv), (_next_count, next_cv) in zip(scored, scored[1:]):
        if cv > next_cv:
            optimal_topics = topic_count
            break

    if display_output:
        for topic_count, cv in scored:
            print("Num Topics =", topic_count, " has Coherence Value of", round(cv, 4))
        print("\nOptimal Topic Count: {}".format(optimal_topics))

    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=tdf_corpus,
                                               id2word=id2word,
                                               num_topics=optimal_topics,
                                               random_state=100,
                                               update_every=1,
                                               chunksize=100,
                                               passes=10,
                                               alpha='auto',
                                               per_word_topics=True)

    # The extra MALLET model that used to be trained here was never read, so
    # that training run has been dropped.

    # Visualize the topics
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, tdf_corpus, id2word)
    path = "Visualizations/" + filename + "/LDA/" + filename + "_LDA_visualization.html"
    pyLDAvis.save_html(vis, path)


In [12]:
def generate_scattertext(corpus, cat_col, texts2analyze, html_category, html_label, html_other_label, metadata, filename):
    """Build a scattertext explorer page for one category and write it to disk.

    Parameters
    ----------
    corpus : pandas DataFrame handed to ``st.CorpusFromPandas``.
    cat_col : str
        Column holding the category of each document.
    texts2analyze : str
        Column holding the text of each document.
    html_category : category value plotted against all other documents.
    html_label : str
        Display name of ``html_category``; also used in the file name.
    html_other_label : str
        Display name of the remaining documents; also used in the file name.
    metadata : str
        Column of the built corpus' DataFrame passed as per-document metadata.
    filename : str
        Folder / file-name prefix under ``Visualizations/``.

    The text is parsed with the module-level spaCy pipeline ``nlp``.
    """
    st_corpus = st.CorpusFromPandas(corpus,
                              category_col=cat_col,
                              text_col=texts2analyze,
                              nlp=nlp).build()

    html = st.produce_scattertext_explorer(st_corpus,
          category= html_category,
          category_name= html_label,
          not_category_name= html_other_label,
          width_in_pixels=1000,
          metadata=st_corpus.get_df()[metadata])

    path = "Visualizations/" + filename + "/Scattertexts/" + filename + "_" +  html_label + "_" + html_other_label + "_scattertext_visualization.html"
    # Context manager so the handle is flushed and closed deterministically.
    with open(path, 'wb') as out_file:
        out_file.write(html.encode('utf-8'))

In [13]:
def generate_empath_topics(corpus, cat_col, texts2analyze, html_category, html_label, html_other_label, metadata, filename):
    """Build a scattertext explorer over Empath topic features and write it to disk.

    Parameters
    ----------
    corpus : pandas DataFrame handed to ``st.CorpusFromParsedDocuments``.
    cat_col : str
        Column holding the category of each document.
    texts2analyze : str
        Column passed as ``parsed_col``.
        NOTE(review): the caller in this notebook passes a raw-text column
        here -- confirm that scattertext accepts unparsed text for this use.
    html_category : category value plotted against all other documents.
    html_label : str
        Display name of ``html_category``; also used in the file name.
    html_other_label : str
        Display name of the remaining documents; also used in the file name.
    metadata : str
        Column of ``corpus`` passed as per-document metadata.
    filename : str
        Folder / file-name prefix under ``Visualizations/``.
    """
    feat_builder = st.FeatsFromOnlyEmpath()

    empath_corpus = st.CorpusFromParsedDocuments(corpus,
                                             category_col=cat_col,
                                             feats_from_spacy_doc=feat_builder,
                                             parsed_col=texts2analyze).build()

    html = st.produce_scattertext_explorer(empath_corpus,
                                       category=html_category,
                                       category_name=html_label,
                                       not_category_name=html_other_label,
                                        width_in_pixels=1000,
                                       metadata=corpus[metadata],
                                      use_non_text_features=True,
                                       use_full_doc=True,
                                       topic_model_term_lists=feat_builder.get_top_model_term_lists())

    path = "Visualizations/" + filename + "/empaths/" + filename + "_" +  html_label + "_" + html_other_label + "_empath_topics_visualization.html"
    # Context manager so the handle is flushed and closed deterministically.
    with open(path, 'wb') as out_file:
        out_file.write(html.encode('utf-8'))

In [14]:
def generate_term_frequency_visualization(corpus, filename):
    """Write one term-frequency scattertext page per unordered pair of parties.

    For each pair, all transcripts of each party are joined into a single
    document, term counts are computed with ``CountVectorizer`` and the
    resulting term-by-party table is rendered with scattertext. A progress
    line is printed after every pair.

    Parameters
    ----------
    corpus : table with ``party`` and ``transcripts`` columns
        (presumably a pandas DataFrame -- confirm against the caller).
    filename : str
        Folder / file-name prefix under ``Visualizations/``.
    """
    # combinations() over a set already yields each unordered pair of
    # distinct parties exactly once, so no extra de-duplication is needed.
    party_pairs = list(combinations(set(corpus.party.values), 2))

    n = len(party_pairs)
    for i, (party1, party2) in enumerate(party_pairs):
        transcript_1 = ' '.join(corpus[corpus.party == party1].transcripts.values)
        transcript_2 = ' '.join(corpus[corpus.party == party2].transcripts.values)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform([transcript_1, transcript_2])

        # Newer scikit-learn releases expose get_feature_names_out() and no
        # longer provide get_feature_names(); support both.
        if hasattr(vectorizer, "get_feature_names_out"):
            feature_names = vectorizer.get_feature_names_out()
        else:
            feature_names = vectorizer.get_feature_names()

        df = pd.DataFrame(data=X.toarray())
        df.columns = feature_names
        df.index = [party1, party2]
        df = df.T  # rows: terms, columns: the two parties

        term_cat_freq = st.TermCategoryFrequencies(df)

        html = st.produce_scattertext_explorer(
            term_cat_freq,
            category=party1,
            category_name=party1,
            not_category_name=party2)

        path = "Visualizations/" + filename + "/Term Frequency/" + filename + "_" + party1 + "_" + party2 + "_term_frequency_visualization.html"
        # Context manager so the handle is flushed and closed deterministically.
        with open(path, 'wb') as out_file:
            out_file.write(html.encode('utf-8'))

        print("{}Processed {:.2f}%".format(' '*(len(filename) + 2), ((i+1)/n)*100))

In [15]:
def get_topic_list(topic_data):
    """Flatten topic entries into a single list of terms.

    Each entry of ``topic_data`` is indexed at ``[1]`` to get its sequence of
    term records, and the first element of every record is collected, in
    order.
    """
    return [record[0] for entry in topic_data for record in entry[1]]

In [16]:
def categorize(word, categories):
    """Return the entry of ``categories`` most similar to ``word``.

    Both are parsed with the module-level spaCy pipeline ``nlp`` and compared
    with ``Doc.similarity``. Scores are used as dictionary keys, so when two
    categories get exactly the same score the later one in ``categories`` is
    kept. An empty ``categories`` raises ``ValueError`` from ``max``.
    """
    target = nlp(word)
    label_by_score = {target.similarity(nlp(label)): label for label in categories}
    best_score = max(label_by_score)
    return label_by_score[best_score]

In [17]:
def build_topic_model(topic_items, categories):
    """Group topic terms under the category each one is most similar to.

    Parameters
    ----------
    topic_items : iterable
        Each item is passed to ``get_topic_list`` to extract its terms.
    categories : iterable of str
        Category labels; every label becomes a key of the result.

    Returns
    -------
    dict
        Maps each category to the list of distinct terms assigned to it by
        ``categorize``, in first-seen order. Categories that attract no term
        map to an empty list.
    """
    topic_models = {category: [] for category in categories}

    # Each distinct term is categorized once: categorize() parses the term and
    # every category, so repeating it for duplicates is wasted work and the
    # duplicates were discarded afterwards anyway.
    seen_tokens = set()
    for item in topic_items:
        for token in get_topic_list(item):
            if token in seen_tokens:
                continue
            seen_tokens.add(token)
            topic_models[categorize(token, categories)].append(token)

    return topic_models

In [18]:
def generate_topic_frequency_word_similarity(corpus, texts2analyze, cat_col, html_category, html_label, html_other_label, metadata, list_of_topics, filename):
    """Build a scattertext explorer over policy-category topic features and write it to disk.

    The terms in ``list_of_topics`` are grouped under a fixed list of policy
    categories with ``build_topic_model``; the resulting term lists are used
    as topic features for the documents in ``corpus``.

    Parameters
    ----------
    corpus : pandas DataFrame
        Documents to plot. It is not modified; the parsed column is added to
        a copy.
    texts2analyze : str
        Column holding the raw text of each document.
    cat_col : str
        Column holding the category of each document.
    html_category : category value plotted against all other documents.
    html_label : str
        Display name of ``html_category``.
    html_other_label : str
        Display name of the remaining documents.
    metadata : str
        Column of ``corpus`` passed as per-document metadata.
    list_of_topics : iterable
        Topic items forwarded to ``build_topic_model``.
    filename : str
        Folder / file-name prefix under ``Visualizations/``.
    """
    categories = ['Agricultural',
                  'Climate change',
                  'Commercial',
                  'Cultural',
                  'Domestic',
                  'Drug reform',
                  'Economic',
                  'Fiscal',
                  'Incomes',
                  'Industrial',
                  'Investment',
                  'Monetary',
                  'Tax',
                  'Education',
                  'Energy',
                  'Nuclear energy',
                  'Renewable energy',
                  'Environmental',
                  'Food',
                  'Foreign',
                  'Health',
                  'Pharmaceutical',
                  'Vaccination',
                  'Housing',
                  'Immigration',
                  'Knowledge',
                  # These two were fused into one 'Language Military' entry by
                  # a missing comma.
                  'Language',
                  'Military',
                  'Science',
                  'Stem cell',
                  'Space',
                  'Technology',
                  'Social']

    categories = [category.lower() for category in categories]

    topic_models = build_topic_model(list_of_topics, categories)

    # assign() returns a new frame, so the caller's (possibly sliced)
    # DataFrame is not mutated.
    corpus = corpus.assign(parse=corpus[texts2analyze].apply(st.whitespace_nlp_with_sentences))

    topic_feature_builder = st.FeatsFromTopicModel(topic_models)

    topic_corpus = st.CorpusFromParsedDocuments(
            corpus,
            category_col=cat_col,
            parsed_col='parse',
            feats_from_spacy_doc=topic_feature_builder
    ).build()

    html = st.produce_scattertext_explorer(
            topic_corpus,
            category=html_category,
            category_name=html_label,
            not_category_name=html_other_label,
            width_in_pixels=1000,
            metadata=corpus[metadata],
            use_non_text_features=True,
            use_full_doc=True,
            pmi_threshold_coefficient=0,
            topic_model_term_lists=topic_feature_builder.get_top_model_term_lists())

    # NOTE(review): the path does not include the party labels, so calls for
    # different party pairs with the same ``filename`` write to the same file
    # -- confirm whether that is intended.
    path = "Visualizations/" + filename + "/Topic Frequency/" + filename + "_topic_frequency_word_similarity_visualization.html"

    # Context manager so the handle is flushed and closed deterministically.
    with open(path, 'wb') as out_file:
        out_file.write(html.encode('utf-8'))

In [19]:
def generate_word_similarity_plot(corpus, texts2analyze, corpus_cat_col, html_category, html_label, html_other_label, metadata, filename):
    """Build a scattertext projection explorer over stop-listed unigrams and write it to disk.

    Parameters
    ----------
    corpus : pandas DataFrame
        Documents to plot. It is not modified; the parsed column is added to
        a copy.
    texts2analyze : str
        Column holding the raw text of each document.
    corpus_cat_col : str
        Column holding the category of each document.
    html_category : category value plotted against all other documents.
    html_label : str
        Display name of ``html_category``.
    html_other_label : str
        Display name of the remaining documents.
    metadata : str
        Column of ``corpus`` passed as per-document metadata.
    filename : str
        Folder / file-name prefix under ``Visualizations/``.
    """
    # assign() returns a new frame, so the caller's (possibly sliced)
    # DataFrame is not mutated.
    corpus = corpus.assign(parse=corpus[texts2analyze].apply(st.whitespace_nlp_with_sentences))

    st_corpus = (st.CorpusFromParsedDocuments(corpus, category_col=corpus_cat_col, parsed_col='parse')
          .build().get_stoplisted_unigram_corpus())

    html = st.produce_projection_explorer(st_corpus,
                                          category=html_category,
                                          category_name=html_label,
                                          not_category_name=html_other_label,
                                          metadata=corpus[metadata])

    # NOTE(review): the path does not include the party labels, so calls for
    # different party pairs with the same ``filename`` write to the same file
    # -- confirm whether that is intended.
    path = "Visualizations/" + filename + "/Word Similarity/" + filename + "_word_similarity_plot_visualization.html"
    # Context manager so the handle is flushed and closed deterministically.
    with open(path, 'wb') as out_file:
        out_file.write(html.encode('utf-8'))

In [20]:
# Location of the MALLET launcher. The LDA helpers in this notebook read this
# module-level name when they call gensim's LdaMallet wrapper, so this cell
# must have run before any of them is called.
mallet_path = 'Visualizations/mallet-2.0.8/bin/mallet' # update this path

In [21]:
def filter_corpus(corpus, party1, party2):
    """Return the rows of ``corpus`` whose ``'party'`` value is ``party1`` or ``party2``."""
    wanted_parties = [party1, party2]
    row_mask = corpus['party'].isin(wanted_parties)
    return corpus[row_mask]

In [22]:
def generate_party_pairs(corpus):
    """Return every unordered pair of distinct values in ``corpus.party`` as a list of 2-tuples.

    The order of the pairs follows the iteration order of a ``set`` and is
    therefore not guaranteed.
    """
    distinct_parties = set(corpus.party)
    return [pair for pair in combinations(distinct_parties, 2)]

In [23]:
def generate_party_LDA_visualizations(corpus, filename, ngram = 1, display_output = False):
    """Save one pyLDAvis LDA page per distinct value of ``corpus.party``.

    NOTE: an earlier cell of this notebook defines a function with this same
    name; when the cells run in order, this definition replaces it.

    For every party the matching transcripts are tokenized, stop-word
    filtered, optionally phrase-merged and lemmatized. MALLET models are
    scored with c_v coherence for the topic counts in ``range(2, 40, 6)``; the
    first count whose coherence is higher than that of the next count is
    selected, or the last count when coherence never drops. A gensim
    ``LdaModel`` with that topic count is visualized and written to
    ``Visualizations/<filename>/LDA/``.

    Parameters
    ----------
    corpus : table with ``party`` and ``transcripts`` columns
        (presumably a pandas DataFrame -- confirm against the caller).
    filename : str
        Folder / file-name prefix under ``Visualizations/``.
    ngram : int
        ``2`` applies the bigram model, ``3`` the bigram and trigram models;
        any other value leaves the tokens as unigrams.
    display_output : bool
        When true, print the coherence sweep and the selected topic count.
    """
    # Topic counts tried during the coherence sweep: range(start, limit, step).
    start, limit, step = 2, 40, 6

    # No spaCy model is loaded here: lemmatization() uses the module-level
    # ``nlp``, so a second local model would only cost load time.
    for party in set(corpus.party):

        text = list(corpus[corpus.party == party].transcripts.values)

        words = list(sent_to_words(text))

        # Build the bigram and trigram models
        bigram = gensim.models.Phrases(words, min_count=5, threshold=100) # higher threshold fewer phrases.
        trigram = gensim.models.Phrases(bigram[words], threshold=100)

        # Faster way to get a sentence clubbed as a trigram/bigram
        bigram_mod = gensim.models.phrases.Phraser(bigram)
        trigram_mod = gensim.models.phrases.Phraser(trigram)

        # Remove Stop Words
        words = remove_stopwords(words)

        # Merge phrases according to the requested n-gram level
        if ngram == 2:
            words = make_bigrams(words, bigram_mod)
        elif ngram == 3:
            words = make_trigrams(words, bigram_mod, trigram_mod)

        data_lemmatized = lemmatization(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

        # Create Dictionary
        id2word = corpora.Dictionary(data_lemmatized)

        # Term Document Frequency
        tdf_corpus = [id2word.doc2bow(doc) for doc in data_lemmatized]

        # Coherence sweep over the candidate topic counts
        _models, coherence_values = compute_coherence_values(dictionary=id2word, corpus=tdf_corpus, texts=data_lemmatized, start=start, limit=limit, step=step)
        scored = list(zip(range(start, limit, step), coherence_values))

        # Pick the first local peak. Default to the last candidate so that a
        # coherence curve that never drops selects its final (highest) point
        # instead of silently ignoring it.
        optimal_topics = scored[-1][0]
        for (topic_count, cv), (_next_count, next_cv) in zip(scored, scored[1:]):
            if cv > next_cv:
                optimal_topics = topic_count
                break

        if display_output:
            for topic_count, cv in scored:
                print("Num Topics =", topic_count, " has Coherence Value of", round(cv, 4))
            print("\nOptimal Topic Count: {}".format(optimal_topics))

        # Build LDA model
        lda_model = gensim.models.ldamodel.LdaModel(corpus=tdf_corpus,
                                                   id2word=id2word,
                                                   num_topics=optimal_topics,
                                                   random_state=100,
                                                   update_every=1,
                                                   chunksize=100,
                                                   passes=10,
                                                   alpha='auto',
                                                   per_word_topics=True)

        # The extra MALLET model that used to be trained here was never read,
        # so that training run has been dropped.

        # Visualize the topics
        pyLDAvis.enable_notebook()
        vis = pyLDAvis.gensim.prepare(lda_model, tdf_corpus, id2word)
        path = "Visualizations/" + filename + "/LDA/" + filename + "_" + party + "party_LDA_visualization.html"
        pyLDAvis.save_html(vis, path)


In [24]:
def generate_corpus_visualizations(corpus, corpus_name):
    """Run every visualization stage of this notebook for one corpus.

    The index of ``corpus`` is turned into a ``presidents`` column and the
    ``Party`` column is renamed to ``party``. The stages then run in this
    order: per-president LDA, per-party LDA, whole-corpus LDA, and for the
    party pairs scattertext, Empath topics, term frequency, topic frequency
    and word similarity. Progress is printed after each item.

    Parameters
    ----------
    corpus : pandas DataFrame
        NOTE(review): presumably indexed by president name with ``Party`` and
        ``transcripts`` columns -- confirm against the pickled data.
    corpus_name : str
        Label used in progress messages and as the output folder / file-name
        prefix.
    """
    corpus = corpus.reset_index().rename(columns={'index':'presidents', 'Party':'party'})

    indent = ' ' * (len(corpus_name) + 2)

    def report_progress(done, total):
        print("{}Processed {:.2f}%".format(indent, (done / total) * 100))

    presidents = list(corpus.presidents)
    president_count = len(presidents)

    print("{}: Processing presidential speeches...".format(corpus_name))
    presidential_data = {}
    for done, president in enumerate(presidents, start=1):
        presidential_data[president] = process_presidential_speech(president, corpus, corpus_name, ngram = 2)
        report_progress(done, president_count)

    print("{}: Processing topics...".format(corpus_name))
    model_topic_list = []
    for done, president in enumerate(presidents, start=1):
        # Element 2 of each result is the MALLET topic list.
        model_topic_list.append(presidential_data[president][2])
        report_progress(done, president_count)

    print("{}: Generating LDA visualizations for all parties...".format(corpus_name))
    generate_party_LDA_visualizations(corpus, corpus_name, ngram = 2)

    print("{}: Generating LDA visualizations for corpus...".format(corpus_name))
    generate_corpus_LDA_visualizations(corpus, corpus_name, ngram = 2)

    # The pair stages report progress against the number of party pairs; the
    # president count used before gave wrong percentages.
    party_pairs = generate_party_pairs(corpus)
    pair_count = len(party_pairs)

    print("{}: Generating scattertext visualization...".format(corpus_name))
    for done, (party, other_party) in enumerate(party_pairs, start=1):
        temp_corpus = filter_corpus(corpus, party, other_party)
        generate_scattertext(temp_corpus, cat_col='party', texts2analyze='transcripts', html_category=party, html_label=party, html_other_label=other_party, metadata='presidents', filename = corpus_name)
        report_progress(done, pair_count)

    print("{}: Generating empath topics visualization...".format(corpus_name))
    for done, (party, other_party) in enumerate(party_pairs, start=1):
        temp_corpus = filter_corpus(corpus, party, other_party)
        generate_empath_topics(temp_corpus, cat_col='party', texts2analyze='transcripts', html_category=party, html_label=party, html_other_label=other_party, metadata='presidents', filename = corpus_name)
        report_progress(done, pair_count)

    print("{}: Generating term frequency visualization...".format(corpus_name))
    generate_term_frequency_visualization(corpus, filename = corpus_name)

    print("{}: Generating topic frequency word similarity visualization...".format(corpus_name))
    for done, (party, other_party) in enumerate(party_pairs, start=1):
        temp_corpus = filter_corpus(corpus, party, other_party)
        generate_topic_frequency_word_similarity(temp_corpus, texts2analyze = 'transcripts', cat_col = 'party', html_category = party, html_label = party, html_other_label = other_party, metadata = 'presidents', list_of_topics = model_topic_list, filename = corpus_name)
        report_progress(done, pair_count)

    print("{}: Generating word similarity visualization...".format(corpus_name))
    for done, (party, other_party) in enumerate(party_pairs, start=1):
        temp_corpus = filter_corpus(corpus, party, other_party)
        generate_word_similarity_plot(temp_corpus, texts2analyze = 'transcripts', corpus_cat_col = 'party', html_category = party, html_label = party, html_other_label = other_party, metadata = 'presidents', filename = corpus_name)
        report_progress(done, pair_count)

    print("\nProcess Complete!\n\n")

In [25]:
# Output-folder name for each corpus, in the same order as ``corpus_files``.
corpus_filenames = [
    "first_corpus",
    "second_corpus",
    "third_corpus",
    "fourth_corpus",
    "fifth_corpus",
    "sixth_corpus",
]
corpus_files = [
    first_corpus,
    second_corpus,
    third_corpus,
    fourth_corpus,
    fifth_corpus,
    sixth_corpus,
]
# Pair every name with its corpus.
corpus_list = list(zip(corpus_filenames, corpus_files))

# Run the full visualization pipeline for each corpus in turn.
for filename, file in corpus_list:
    generate_corpus_visualizations(file, filename)

'\ncorpus_filenames = ["first_corpus", "second_corpus", "third_corpus", "fourth_corpus", "fifth_corpus", "sixth_corpus"]\ncorpus_files = [first_corpus, second_corpus, third_corpus, fourth_corpus, fifth_corpus, sixth_corpus]\ncorpus_list = list(zip(corpus_filenames, corpus_files))\n\nfor filename, file in corpus_list:\n    generate_corpus_visualizations(file, filename)\n'