In [21]:
import json
import pandas as pd
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer

# Load the first and second JSON files
with open(r'C:\Users\leonc\Thesis 2024\Toxic-Symbology\Text Clustering\ontoxGPAHE.json', 'r') as f1, open(r'C:\Users\leonc\Thesis 2024\Toxic-Symbology\tools\OnToxMeme_dict.json', 'r') as f2:
    data1 = json.load(f1)
    data2 = json.load(f2)

# Extract keys from the second file that have "Referenced_in_meme"
keys_with_references = {
    key for key, value in data2.items() if "Referenced_in_meme" in value
}

# Filter entries from the first file based on these keys
filtered_entries = {
    key: value for key, value in data1.items() if key in keys_with_references
}

# Save the filtered entries to a new JSON file
with open('ontoxGPAHE_56.json', 'w') as output:
    json.dump(filtered_entries, output, indent=4)

In [22]:
with open(r'C:\Users\leonc\Thesis 2024\Toxic-Symbology\Text Clustering\ontoxGPAHE_56.json', 'r') as f:
    data = json.load(f)

descriptions_1 = [entry["Description"] for entry in data.values()]
descriptions_2 = [entry["Description"] + entry["Title"] for entry in data.values()]
descriptions_3 = [entry["Description"] + entry["Title"] + entry["Ideology"] for entry in data.values()]

In [23]:
from sentence_transformers import SentenceTransformer

sentence_model = SentenceTransformer('all-mpnet-base-v2')
embeddings_1 = sentence_model.encode(descriptions_1, show_progress_bar=True)

sentence_model = SentenceTransformer('all-mpnet-base-v2')
embeddings_2 = sentence_model.encode(descriptions_2, show_progress_bar=True)

sentence_model = SentenceTransformer('all-mpnet-base-v2')
embeddings_3 = sentence_model.encode(descriptions_3, show_progress_bar=True)

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

First Run

In [24]:
# Dimensionality Reduction
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=10)

# Clustering (this is where the number of topics are generated, if we boost min_cluster_size we get fewer topics; we can also use a different cluster algorithm)
hdbscan_model = HDBSCAN(min_cluster_size=5, min_samples=2, metric='euclidean', cluster_selection_method='eom')     # the default algorithm
# hdbscan_model = KMeans(n_clusters=50)                                                                               # K-Means allows us to force every document into a cluster

# Tokenizing (stopwords do help, and we can also add n-grams, and add a document frequency cutoff)
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))

# Weighting Scheme (we'll use the same as in the basic version)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Representation Tuning (makes the extracted keywords better)
representation_model = KeyBERTInspired()

# Train our topic model with BERTopic
topic_model = BERTopic(
    embedding_model=sentence_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model
)

topics_1, probs_1 = topic_model.fit_transform(descriptions_1, embeddings_1)

In [25]:
new_topics_1 = topic_model.reduce_outliers(descriptions_1, topics_1, strategy="c-tf-idf")
topic_model.update_topics(descriptions_1, topics=new_topics_1, vectorizer_model=vectorizer_model)
file = topic_model.get_topic_info()
file.to_csv("description.csv")



In [28]:
topic_model.get_topic(1)

[('white', 0.04388445138418539),
 ('term', 0.043465373854853086),
 ('racist', 0.038755921696056315),
 ('people', 0.03760234005512059),
 ('lgbtq', 0.03207332730907055),
 ('used', 0.030746749222875226),
 ('anti', 0.025889581685194365),
 ('4chan', 0.025255584173958043),
 ('trans', 0.023020331993949653),
 ('women', 0.022236623215192857)]

Second Run

In [29]:
# Train our topic model with BERTopic
topic_model = BERTopic(
    embedding_model=sentence_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model
)

In [30]:
topics_2, probs_2 = topic_model.fit_transform(descriptions_2, embeddings_2)

In [31]:
new_topics_2 = topic_model.reduce_outliers(descriptions_2, topics_2, strategy="c-tf-idf")
topic_model.update_topics(descriptions_2, topics=new_topics_2, vectorizer_model=vectorizer_model)
file = topic_model.get_topic_info()
file.to_csv("description_title.csv")



In [32]:
topic_model.get_topic(4)

[('lgb', 0.05118351331695788),
 ('partner', 0.05118351331695788),
 ('scientific', 0.05118351331695788),
 ('trans', 0.0467576395388898),
 ('people', 0.046430170280825966),
 ('acronym', 0.03646749440404747),
 ('ywnbaw woman', 0.03646749440404747),
 ('explains', 0.03646749440404747),
 ('finding', 0.03646749440404747),
 ('globohomo', 0.03646749440404747)]

Third Run

In [33]:
# Train our topic model with BERTopic
topic_model = BERTopic(
    embedding_model=sentence_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model
)

In [34]:
topics_3, probs_3 = topic_model.fit_transform(descriptions_3, embeddings_3)

In [54]:
# new_topics_3 = topic_model.reduce_outliers(descriptions_3, topics_3, strategy="c-tf-idf")
topic_model.update_topics(descriptions_3, topics=new_topics_3, vectorizer_model=vectorizer_model)
file = topic_model.get_topic_info()
file.to_csv("description_title_ideology.csv")



In [36]:
topic_model.get_topic(4)

[('theory', 0.05810835371490405),
 ('conspiracy', 0.054099676752202404),
 ('world', 0.049181524320184),
 ('great', 0.043238715966755874),
 ('21', 0.03977728205066006),
 ('agenda', 0.03783387647091139),
 ('global', 0.03783387647091139),
 ('agenda 21', 0.035117861565955207),
 ('government', 0.034094813186280055),
 ('conspiracy theory', 0.032429036975066904)]

Toxic symbols in each topic

In [57]:
# Load the symbol data from the JSON file
with open(r'C:\Users\leonc\Thesis 2024\Toxic-Symbology\Text Clustering\ontoxGPAHE_56.json', 'r') as f:
    symbol_data = json.load(f)

# Create a list of symbols and their descriptions from the JSON file
symbols = []
for symbol_id, symbol_info in symbol_data.items():
    symbols.append({
        "Symbol_ID": symbol_id,
        "Title": symbol_info.get("Title", ""),
        "Description": symbol_info.get("Description", ""),
    })

# Create a DataFrame for symbols
symbol_df_1 = pd.DataFrame(symbols)

# Ensure that 'topics_1' (the topic assignments from BERTopic) matches the order of the symbols
# The length of 'topics_1' must be the same as the number of symbols (descriptions_1)
symbol_df_1['Topic'] = new_topics_1  # topics_1 should be the topic assignments from BERTopic

# Now, group by the topic to see which symbols belong to each topic
symbols_per_topic_1 = symbol_df_1.groupby('Topic').agg({
    'Symbol_ID': 'unique', 
    'Title': 'unique', 
    'Description': 'unique'
})

# Create a DataFrame for symbols
symbol_df_2 = pd.DataFrame(symbols)

# Ensure that 'topics_1' (the topic assignments from BERTopic) matches the order of the symbols
# The length of 'topics_1' must be the same as the number of symbols (descriptions_1)
symbol_df_2['Topic'] = new_topics_2  # topics_1 should be the topic assignments from BERTopic

# Now, group by the topic to see which symbols belong to each topic
symbols_per_topic_2 = symbol_df_2.groupby('Topic').agg({
    'Symbol_ID': 'unique', 
    'Title': 'unique', 
    'Description': 'unique'
})

# Create a DataFrame for symbols
symbol_df_3 = pd.DataFrame(symbols)

# Ensure that 'topics_1' (the topic assignments from BERTopic) matches the order of the symbols
# The length of 'topics_1' must be the same as the number of symbols (descriptions_1)
symbol_df_3['Topic'] = new_topics_3  # topics_1 should be the topic assignments from BERTopic

# Now, group by the topic to see which symbols belong to each topic
symbols_per_topic_3 = symbol_df_3.groupby('Topic').agg({
    'Symbol_ID': 'unique', 
    'Title': 'unique', 
    'Description': 'unique'
})

In [58]:
symbols_per_topic_3["Title"][4]

array(['Agenda 21', 'Cultural Marxism',
       'Grand Remplacement / être grand remplacé (Great Replacement)',
       'I will not eat the bugs', 'Revolt Against The Modern World',
       'You will own nothing and be happy', 'ZOG'], dtype=object)