## Topic-reduction

The model initially found 229 topics, which is too many to interpret. We therefore reduced the number of topics using top2vec's hierarchical topic reduction method.

This notebook contains the data-preparation part of topic reduction; for the visualizations, please visit this [repository](https://github.com/hcss-utils/streamlit-topic-reduction).

In [1]:
import umap
import pandas as pd
from top2vec import Top2Vec

In [2]:
def reduce_topics(model, min_topics, max_topics, n_words):
    """Lazily run hierarchical topic reduction for a range of topic counts.

    For every target count from ``min_topics`` to ``max_topics`` (inclusive)
    the model is reduced to that many topics and one single-key dict is
    yielded, mapping the target count to a dict with:

    * ``"unique_words"`` -- one label per reduced topic, built from the
      topic's first ``n_words`` words joined by ``", "``;
    * ``"classification_num"`` -- ``model.doc_top_reduced`` as-is;
    * ``"classification_score"`` -- ``model.doc_dist_reduced`` as-is;
    * ``"classification_label"`` -- the label looked up for each entry of
      ``model.doc_top_reduced``.

    The model attributes are stored by reference, not copied. Nothing is
    yielded when ``min_topics > max_topics``.
    """
    for target_count in range(min_topics, max_topics + 1):
        model.hierarchical_topic_reduction(target_count)
        # Only the words and the topic ids are needed; the word scores are ignored.
        reduced_words, _word_scores, reduced_ids = model.get_topics(reduced=True)

        topic_labels = []
        for topic_id in reduced_ids:
            leading_words = reduced_words[topic_id][:n_words]
            topic_labels.append(", ".join(leading_words))

        # Translate each assigned topic number into its human-readable label.
        assigned_labels = [topic_labels[assigned] for assigned in model.doc_top_reduced]

        summary = {
            "unique_words": topic_labels,
            "classification_num": model.doc_top_reduced,
            "classification_score": model.doc_dist_reduced,
            "classification_label": assigned_labels,
        }
        yield {target_count: summary}

In [3]:
# Load the saved Top2Vec model from disk.
model_path = "../models/pq-model"
model = Top2Vec.load(model_path)

In [4]:
# Run the reduction for every target size from 2 to 20 topics, labelling each
# topic with its first 5 words, and merge the single-key results into one dict
# keyed by the number of topics.
classifications = reduce_topics(model, min_topics=2, max_topics=20, n_words=5)
mappings = {}
for per_size_result in classifications:
    mappings.update(per_size_result)

In [5]:
# Fit a 2-component UMAP projection (cosine metric) on the model's document
# vectors, requested with norm=False. random_state is fixed at 42.
umap_reducer = umap.UMAP(n_neighbors=15, n_components=2, metric="cosine", random_state=42)
document_vectors = model._get_document_vectors(norm=False)
umap_model = umap_reducer.fit(document_vectors)

In [6]:
# Wrap the fitted UMAP coordinates in a frame with named axis columns.
coordinate_columns = ("x", "y")
embeddings = pd.DataFrame(data=umap_model.embedding_, columns=coordinate_columns)

In [7]:
# For every reduced topic count, attach three columns to the embedding frame:
# the assigned topic number, its word label and the classification score.
# `values` is the per-count dict already yielded by .items(), so it is used
# directly instead of looking the same key up again for each column.
for num, values in mappings.items():
    embeddings[f"num_label_{num}"] = values["classification_num"]
    embeddings[f"word_label_{num}"] = values["classification_label"]
    embeddings[f"score_{num}"] = values["classification_score"]

In [8]:
# Preview the assembled frame; the rendered output elides middle rows and columns with "...".
embeddings

Unnamed: 0,x,y,num_label_2,word_label_2,score_2,num_label_3,word_label_3,score_3,num_label_4,word_label_4,...,score_17,num_label_18,word_label_18,score_18,num_label_19,word_label_19,score_19,num_label_20,word_label_20,score_20
0,8.063369,-0.291642,0,"pyongyang, hardly, surely, scarcely, inevitably",0.216458,0,"pyongyang, hardly, sophism, scarcely, surely",0.227752,0,"hardly, scarcely, surely, fashionable, inevitably",...,0.321827,6,"republicans, republican, gop, democrat, democrats",0.319669,6,"republicans, republican, gop, democrat, democrats",0.322677,7,"republicans, republican, gop, democrat, democrats",0.320961
1,8.119796,-0.541421,0,"pyongyang, hardly, surely, scarcely, inevitably",0.251642,0,"pyongyang, hardly, sophism, scarcely, surely",0.268144,0,"hardly, scarcely, surely, fashionable, inevitably",...,0.304642,6,"republicans, republican, gop, democrat, democrats",0.303649,6,"republicans, republican, gop, democrat, democrats",0.305321,7,"republicans, republican, gop, democrat, democrats",0.305281
2,10.569654,5.105814,0,"pyongyang, hardly, surely, scarcely, inevitably",0.150307,0,"pyongyang, hardly, sophism, scarcely, surely",0.145502,0,"hardly, scarcely, surely, fashionable, inevitably",...,0.355095,8,"nato, russia, enlargement, russian, moscow",0.355415,7,"nato, russia, enlargement, russian, moscow",0.355408,6,"nato, russia, enlargement, russian, moscow",0.356287
3,11.484039,5.563009,0,"pyongyang, hardly, surely, scarcely, inevitably",0.189021,0,"pyongyang, hardly, sophism, scarcely, surely",0.199527,0,"hardly, scarcely, surely, fashionable, inevitably",...,0.265259,8,"nato, russia, enlargement, russian, moscow",0.265297,7,"nato, russia, enlargement, russian, moscow",0.264855,6,"nato, russia, enlargement, russian, moscow",0.265003
4,11.507264,5.589423,1,"airmen, warfighter, dod, cyber, sustainment",0.199532,1,"airmen, warfighter, dod, cyber, sustainment",0.199105,1,"airmen, warfighter, dod, cyber, sustainment",...,0.321802,13,"cyber, malicious, cyberattacks, cybersecurity,...",0.321942,13,"cyber, malicious, cyberattacks, cybersecurity,...",0.320756,13,"cyber, malicious, cyberattacks, cybersecurity,...",0.319590
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26520,9.129981,4.927581,1,"airmen, warfighter, dod, cyber, sustainment",0.189706,1,"airmen, warfighter, dod, cyber, sustainment",0.190536,0,"hardly, scarcely, surely, fashionable, inevitably",...,0.358555,8,"nato, russia, enlargement, russian, moscow",0.359681,7,"nato, russia, enlargement, russian, moscow",0.359415,6,"nato, russia, enlargement, russian, moscow",0.358017
26521,8.797188,4.527788,0,"pyongyang, hardly, surely, scarcely, inevitably",0.236410,0,"pyongyang, hardly, sophism, scarcely, surely",0.225602,0,"hardly, scarcely, surely, fashionable, inevitably",...,0.391227,8,"nato, russia, enlargement, russian, moscow",0.392030,7,"nato, russia, enlargement, russian, moscow",0.391933,6,"nato, russia, enlargement, russian, moscow",0.389229
26522,9.312355,-0.398733,0,"pyongyang, hardly, surely, scarcely, inevitably",0.217323,2,"arsenals, arsenal, reductions, weapons, abm",0.237136,2,"arsenals, reductions, arsenal, abm, weapons",...,0.216329,8,"nato, russia, enlargement, russian, moscow",0.216781,5,"arsenals, weapons, arsenal, nuclear, reductions",0.218185,5,"arsenals, weapons, arsenal, nuclear, reductions",0.217783
26523,6.655655,1.643605,0,"pyongyang, hardly, surely, scarcely, inevitably",0.221382,2,"arsenals, arsenal, reductions, weapons, abm",0.236196,2,"arsenals, reductions, arsenal, abm, weapons",...,0.259019,15,"labour, tory, cameron, trident, mps",0.258917,14,"labour, tory, cameron, mps, trident",0.259117,14,"labour, tory, cameron, mps, trident",0.258237


In [9]:
# Write the assembled frame to CSV; index=False leaves the row index out of the file.
output_path = "../data/processed/deciding-on-topic-reduction.csv"
embeddings.to_csv(output_path, index=False)