In [None]:
!pip install bertopic

# Section One: Model Training and Hyperparameter Tuning


In [None]:
!pip install optuna

In [None]:
!pip install numpy==1.26.4 gensim==4.3.3


In [None]:
import os

current_working_directory = os.getcwd()
current_working_directory

In [None]:
# Load data (original text not pre-processed)
import pandas as pd
df = pd.read_csv('cleaned_wiredbert.csv')

In [None]:
# Build the list of documents fed to the embedding and topic models.
# Missing values (None / NaN) become empty strings and every other value is
# coerced to str, so each entry is guaranteed to be a string.
# This uses pandas only: the earlier version called np.isnan here although
# numpy is not imported until much later in the notebook, which raises a
# NameError on a fresh kernel as soon as a NaN value is encountered.
docs = df['text_content'].fillna("").astype(str).tolist()

# Sanity check: report anything that is still not a string.
non_string_elements = [doc for doc in docs if not isinstance(doc, str)]
if non_string_elements:
    print("Non-string elements found:", non_string_elements)
    

In [None]:
!pip install scikit-learn

In [None]:
!pip install spacy

In [None]:
import spacy

In [None]:
!pip install plotly kaleido


In [None]:
!pip install -U kaleido


In [None]:
# Run default out of the box bertopic model
from bertopic import BERTopic

defaultmodel = BERTopic()
topics, probs = defaultmodel.fit_transform(docs)


In [None]:
defaultmodel.get_topic_info()

In [None]:
from gensim.models import CoherenceModel
from gensim import corpora

# Tokenise the corpus with the vectorizer of the default model fitted above,
# so the gensim dictionary is built from the same tokens the topic words
# were drawn from.
vectorizer = defaultmodel.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Inputs required by gensim's CoherenceModel
words = vectorizer.get_feature_names_out()
tokens = [analyzer(doc) for doc in docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

# Collect the top words of every topic of the default model. Ids that do not
# resolve to a topic are skipped by the truthiness check.
topic_words = []
for topic_num in range(len(defaultmodel.get_topics())):
    topic = defaultmodel.get_topic(topic_num)
    if topic:  # Ensure the topic is not empty or invalid
        topic_words.append([word for word, _ in topic])

In [None]:
# Score the default model with NPMI coherence. The metric is 'c_npmi' (not
# 'u_mass') so that this baseline is measured on the same scale as the
# coherence scores computed for the other models in this notebook.
coherence_model = CoherenceModel(topics=topic_words,
                                 texts=tokens,
                                 corpus=corpus,
                                 dictionary=dictionary,
                                 coherence='c_npmi')

# Calculate the NPMI coherence score
coherence = coherence_model.get_coherence()

print(f"NPMI Coherence Score: {coherence}")

In [None]:
from transformers import BertTokenizer
from sentence_transformers import SentenceTransformer

# Try models with longer maximum sequence length than the default 256

# Candidate embedding models and the sequence length each one is listed with.
# The 'tokenizer' entry holds a tokenizer object for every candidate: for the
# SentenceTransformer checkpoints it is taken from the loaded model's
# `.tokenizer` attribute rather than storing the whole model under that key.
models = [
    {
        'name': 'distilbert-base-cased  (512 tokens)',
        'tokenizer': BertTokenizer.from_pretrained('distilbert-base-cased'),
        'max_seq_length': 512
    },
    {
        'name': 'BAAI/bge-base-en-v1.5 (512 tokens)',
        'tokenizer': SentenceTransformer('BAAI/bge-base-en-v1.5').tokenizer,
        'max_seq_length': 512
    },
    {
        'name': 'all-MiniLM-L6-v2 (256 tokens)',
        'tokenizer': SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2').tokenizer,
        # Kept in line with the 256-token limit stated in the name above
        # (the entry previously said 384).
        'max_seq_length': 256
    }
]


In [None]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained default model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


In [None]:
tokenizer = model.tokenizer

# Measure truncation against the limit the SentenceTransformer itself applies
# when encoding (`model.max_seq_length`). The tokenizer's `model_max_length`
# can be larger than that limit, which would under-report the loss.
max_seq_length = model.max_seq_length
print(f"Maximum Sequence Length: {max_seq_length}")

# Per-document results
num_tokens_list = []
tokens_lost_list = []
percentage_lost_list = []

# Tokenize each document without truncation and work out how many tokens
# (absolute and as a percentage) fall beyond the model's limit.
for document in docs:
    num_tokens = len(tokenizer.encode(document, truncation=False))
    num_tokens_list.append(num_tokens)

    tokens_lost = max(num_tokens - max_seq_length, 0)
    percentage_lost = (tokens_lost / num_tokens) * 100 if tokens_lost else 0.0

    tokens_lost_list.append(tokens_lost)
    percentage_lost_list.append(percentage_lost)

In [None]:
!pip install matplotlib

import matplotlib.pyplot as plt

# Plot percentage loss of documents 

# Create a histogram of percentage loss
plt.figure(figsize=(10, 6))
plt.hist(percentage_lost_list, bins=20, color='skyblue', edgecolor='black')
plt.title('Distribution of Percentage Loss Across Documents')
plt.xlabel('Percentage Loss')
plt.ylabel('Number of Documents')
plt.grid(True)
plt.savefig('percentage_loss_distribution.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Try second model 

model2 = SentenceTransformer('BAAI/bge-base-en-v1.5')

In [None]:
tokenizer = model2.tokenizer

# Measure truncation against the limit the SentenceTransformer itself applies
# when encoding (`model2.max_seq_length`) rather than the tokenizer's
# `model_max_length`, which is not guaranteed to be the same value.
max_seq_length = model2.max_seq_length
print(f"Maximum Sequence Length: {max_seq_length}")

# Per-document results
num_tokens_list = []
tokens_lost_list = []
percentage_lost_list = []

# Tokenize each document without truncation and work out how many tokens
# (absolute and as a percentage) fall beyond the model's limit.
for document in docs:
    num_tokens = len(tokenizer.encode(document, truncation=False))
    num_tokens_list.append(num_tokens)

    tokens_lost = max(num_tokens - max_seq_length, 0)
    percentage_lost = (tokens_lost / num_tokens) * 100 if tokens_lost else 0.0

    tokens_lost_list.append(tokens_lost)
    percentage_lost_list.append(percentage_lost)

In [None]:
# Create a histogram of percentage loss
plt.figure(figsize=(10, 6))
plt.hist(percentage_lost_list, bins=20, color='skyblue', edgecolor='black')
plt.title('Distribution of Percentage Loss Across Documents')
plt.xlabel('Percentage Loss')
plt.ylabel('Number of Documents')
plt.grid(True)
plt.savefig('percentage_loss_distribution_baii.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
tokenizer2 = model2.tokenizer

# Check the maximum sequence length
max_seq_length = tokenizer2.model_max_length

print(f"Maximum Sequence Length: {max_seq_length}")


In [None]:
# Third model: all-distilroberta-v1, loaded through sentence-transformers

model3 = SentenceTransformer('sentence-transformers/all-distilroberta-v1')

tokenizer3 = model3.tokenizer

# Measure truncation against the limit the SentenceTransformer itself applies
# when encoding (`model3.max_seq_length`) rather than the tokenizer's
# `model_max_length`, which is not guaranteed to be the same value.
max_seq_length = model3.max_seq_length

print(f"Maximum Sequence Length: {max_seq_length}")

# Per-document results
num_tokens_list = []
tokens_lost_list = []
percentage_lost_list = []

# Tokenize each document without truncation and work out how many tokens
# (absolute and as a percentage) fall beyond the model's limit.
for document in docs:
    num_tokens = len(tokenizer3.encode(document, truncation=False))
    num_tokens_list.append(num_tokens)

    tokens_lost = max(num_tokens - max_seq_length, 0)
    percentage_lost = (tokens_lost / num_tokens) * 100 if tokens_lost else 0.0

    tokens_lost_list.append(tokens_lost)
    percentage_lost_list.append(percentage_lost)


In [None]:
# Histogram of the per-document truncation loss for all-distilroberta-v1.
# Saved under its own file name so it does not overwrite the plot that was
# written for the BGE model.
plt.figure(figsize=(10, 6))
plt.hist(percentage_lost_list, bins=20, color='skyblue', edgecolor='black')
plt.title('Distribution of Percentage Loss Across Documents')
plt.xlabel('Percentage Loss')
plt.ylabel('Number of Documents')
plt.grid(True)
plt.savefig('percentage_loss_distribution_distilroberta.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
from sentence_transformers import SentenceTransformer

# final selection of embedding model from MTEB leaderboard 

embedding_model_gte = SentenceTransformer('thenlper/gte-small')


In [None]:
tokenizer4 = embedding_model_gte.tokenizer

# Use the limit the SentenceTransformer itself applies when encoding
# (`max_seq_length` on the model) rather than the tokenizer's
# `model_max_length`, which is not guaranteed to be the same value.
max_seq_length = embedding_model_gte.max_seq_length

print(f"Maximum Sequence Length: {max_seq_length}")


In [None]:
# Per-document truncation statistics for the GTE tokenizer
num_tokens_list = []
tokens_lost_list = []
percentage_lost_list = []

for i, document in enumerate(docs):
    # Full, untruncated token ids of this document
    tokens = tokenizer4.encode(document, truncation=False)
    num_tokens = len(tokens)
    num_tokens_list.append(num_tokens)

    # Default to "nothing lost"; overwrite only when the document is too long
    tokens_lost, percentage_lost = 0, 0.0
    if num_tokens > max_seq_length:
        tokens_lost = num_tokens - max_seq_length
        percentage_lost = (tokens_lost / num_tokens) * 100

    tokens_lost_list.append(tokens_lost)
    percentage_lost_list.append(percentage_lost)


In [None]:
# Histogram of the per-document truncation loss for the GTE model.
# Saved under its own file name so it does not overwrite the plot that was
# written for the BGE model.
plt.figure(figsize=(10, 6))
plt.hist(percentage_lost_list, bins=20, color='skyblue', edgecolor='black')
plt.title('Distribution of Percentage Loss Across Documents')
plt.xlabel('Percentage Loss')
plt.ylabel('Number of Documents')
plt.grid(True)
plt.savefig('percentage_loss_distribution_gte.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Pre-calculate embeddings as recommended by documentation best practice

gte_embeddings = embedding_model_gte.encode(docs, show_progress_bar=True)


In [None]:
import pickle

# Pickle calculated embeddings 

with open('gte_embeddings.pkl', 'wb') as file:
    pickle.dump(gte_embeddings, file)

In [None]:
import pickle

# Load pickled embeddings for faster repeated model training 

with open('gte_embeddings.pkl', 'rb') as file:
    gte_embeddings = pickle.load(file)

In [None]:
from bertopic import BERTopic

# Train model with chosen embeddings but default hyperparameters as baseline 

topic_model_gte = BERTopic(embedding_model=embedding_model_gte)


In [None]:
topics, probs = topic_model_gte.fit_transform(docs, gte_embeddings)


In [None]:
topic_model_gte.get_topic_info()

In [None]:
# Calculate coherence npmi 

from gensim.models import CoherenceModel
from gensim import corpora

vectorizer = topic_model_gte.vectorizer_model
analyzer = vectorizer.build_analyzer()


words = vectorizer.get_feature_names_out()
tokens = [analyzer(doc) for doc in docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
topic_words = []
for topic_num in range(len(topic_model_gte.get_topics())):
    topic = topic_model_gte.get_topic(topic_num)
    if topic:  # Ensure the topic is not empty or invalid
        topic_words.append([word for word, _ in topic])

In [None]:
coherence_model = CoherenceModel(topics=topic_words, 
                                 texts=tokens, 
                                 corpus=corpus,
                                 dictionary=dictionary, 
                                 coherence='c_npmi')

# Step 7: Calculate the NPMI coherence score
coherence = coherence_model.get_coherence()

print(f"NPMI Coherence Score: {coherence}")

In [None]:
# Visualize hierarchical clustering 

topic_model_gte.visualize_hierarchy()

In [None]:
fig_gte = topic_model_gte.visualize_topics()

In [None]:
# Save interactive plots as html 

fig_gte.write_html("fig_gte_intertopic.html")

In [None]:
# Hyperparameter tuning with Optuna, with the objective of maximizing NPMI coherence


import optuna
from bertopic import BERTopic
from hdbscan import HDBSCAN
import umap
from sklearn.metrics import silhouette_score
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.utils import simple_preprocess
import numpy as np

def calculate_npmi_score(model, docs):
    """Return the NPMI topic coherence of a fitted BERTopic model.

    Args:
        model: Fitted BERTopic model; its `vectorizer_model` analyzer is used
            to tokenise `docs` and `get_topics()` supplies the topic words.
        docs: The documents the model was fitted on.

    Returns:
        The gensim 'c_npmi' coherence over all non-outlier topics, or NaN
        when the model produced no scorable topic.
    """
    # Tokenise with the model's own analyzer so topic words and corpus tokens
    # come from the same tokenisation.
    analyzer = model.vectorizer_model.build_analyzer()
    tokens = [analyzer(doc) for doc in docs]

    # Create Gensim dictionary and corpus
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]

    # Top words per topic. Topic -1 is BERTopic's outlier bucket and is left
    # out, matching the baseline coherence cells; empty-string words are
    # dropped because they cannot be looked up in the dictionary.
    top_n_words = []
    for topic_id, topic in model.get_topics().items():
        if topic_id == -1:
            continue
        topic_terms = [word for word, _ in topic if word]
        if topic_terms:
            top_n_words.append(topic_terms)

    if not top_n_words:
        # Nothing to score (e.g. every document was labelled an outlier).
        return float("nan")

    # Compute NPMI using Gensim
    coherence_model = CoherenceModel(
        topics=top_n_words,
        texts=tokens,
        dictionary=dictionary,
        corpus=corpus,
        coherence='c_npmi'
    )
    return coherence_model.get_coherence()

def objective(trial):
    """Optuna objective: fit BERTopic with sampled UMAP/HDBSCAN settings and
    return the NPMI coherence of the resulting topics.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The value of `calculate_npmi_score` for the fitted model (maximised
        by the study).
    """
    # Set hyperparameters ranges
    n_neighbors = trial.suggest_int('n_neighbors', 15, 100)
    min_dist = trial.suggest_float('min_dist', 0.0, 0.5)
    min_cluster_size = trial.suggest_int('min_cluster_size', 5, 100)
    min_samples = trial.suggest_int('min_samples', 5, 100)

    # Only the four sampled values vary between trials. The remaining
    # settings are pinned to the ones used when the final model is trained
    # (5 UMAP components, fixed seed, euclidean/eom HDBSCAN); otherwise the
    # search would run on UMAP's 2-component default and be irreproducible.
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        n_components=5,
        min_dist=min_dist,
        metric='cosine',
        random_state=42,
    )
    hdbscan_model = HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric='euclidean',
        cluster_selection_method='eom',
        prediction_data=True,
    )

    # Create BERTopic model; embeddings are pre-calculated and passed to fit
    topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model)

    # Fit the model using the full dataset
    topic_model.fit_transform(docs, embeddings=gte_embeddings)

    # Calculate NPMI coherence score using the vectorizer method
    return calculate_npmi_score(topic_model, docs)

# Create study and optimize. The sampler is seeded so that re-running the
# notebook proposes the same sequence of hyperparameters.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Best parameters
print("Best Parameters: ", study.best_params)

In [None]:
# Save study object dataframe with all the trial information

trials_df = study.trials_dataframe()
sorted_trials_df = trials_df.sort_values(by='value', ascending=False)
sorted_trials_df.to_csv('optuna_trials_sorted2.csv', index=False)


In [None]:
# Train the model with the hyperparameters values of the best trial 

from umap import UMAP

umap_model = UMAP(n_neighbors=57, n_components=5, min_dist=0.023, metric='cosine', random_state=42)

In [None]:
from hdbscan import HDBSCAN

hdbscan_model = HDBSCAN(min_cluster_size=29, min_samples=26, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 1))

In [None]:
from bertopic.representation import KeyBERTInspired
keybert_model = KeyBERTInspired()

In [None]:
from bertopic.vectorizers import ClassTfidfTransformer

ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)

In [None]:
from bertopic import BERTopic

# Assemble the optimised model from the components defined in the cells above.
topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model_gte,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  # The stop-word-filtering CountVectorizer built earlier was never handed to
  # the model, so it had no effect; pass it in explicitly.
  vectorizer_model=vectorizer_model,
  ctfidf_model=ctfidf_model,
  representation_model=keybert_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

# Fit on the documents, reusing the pre-computed embeddings
topics, probs = topic_model.fit_transform(docs, gte_embeddings)

In [None]:
topic_info = topic_model.get_topic_info()

In [None]:
# Load topic information for optimised model 

topic_info_df_optimisedgte = pd.DataFrame(topic_info)


In [None]:
topic_info_df_optimisedgte

In [None]:
topic_info_df_optimisedgte.to_csv('topic_info_optimisedgte.csv', index=False)


In [None]:
# Create intertopic chart for optimized model 

gte_optimised_vis = topic_model.visualize_topics()

In [None]:
gte_optimised_vis

In [None]:
gte_optimised_vis.write_html("intertopic_gte_optimised.html")

In [None]:
# Reduce outliers with c-tfidf method and 0.1 as probability threshold 

new_topics = topic_model.reduce_outliers(docs, topics , strategy="c-tf-idf", threshold=0.1)


In [None]:
# Update model with reduced outliers topics.
# The model's current vectorizer, c-TF-IDF and representation models are
# passed back in explicitly: update_topics falls back to library defaults for
# any of these that is omitted (NOTE(review): confirm against the installed
# BERTopic version), which would silently drop the custom configuration.
topic_model.update_topics(
    docs,
    topics=new_topics,
    vectorizer_model=topic_model.vectorizer_model,
    ctfidf_model=topic_model.ctfidf_model,
    representation_model=topic_model.representation_model,
)


In [None]:
reducedoutlier_gte = topic_model.get_topic_info()

In [None]:
topic_info_df_reduced_gte = pd.DataFrame(reducedoutlier_gte)


In [None]:
topic_info_df_reduced_gte.to_csv('topic_info_reducedgte.csv', index=False)


In [None]:
topic_info_df_reduced_gte

# Section Two: Visualizing Optimized Model Results 

In [None]:
topic_model.visualize_topics()

In [None]:
# Calculate topic diversity 

top_n_words = 10  
topic_words = []
for topic_num in range(len(topic_model.get_topics())):
    topic = topic_model.get_topic(topic_num)
    if topic:  # Ensure the topic is not empty or invalid
        words = [word for word, _ in topic[:top_n_words]]
        topic_words.extend(words)  # Collect all top words across all topics

In [None]:
unique_words = set(topic_words)
unique_word_count = len(unique_words)
total_words = len(topic_words)

In [None]:
topic_diversity = unique_word_count / total_words
print(f"Topic Diversity: {topic_diversity:.4f}")

In [None]:
# Visualize hierachical clustering for optimized model 

topic_model.visualize_hierarchy()

In [None]:
!python -m spacy download en_core_web_sm


In [None]:
# Run multi-aspect topic modelling for the different representations 

from bertopic.representation import PartOfSpeech
from bertopic.representation import MaximalMarginalRelevance

aspect_model1 = PartOfSpeech("en_core_web_sm")
aspect_model2 = [KeyBERTInspired(top_n_words=30), MaximalMarginalRelevance(diversity=.5)]

In [None]:
representation_model = {
   "Keybert": keybert_model,
   "POS":  aspect_model1,
   "MMR":  aspect_model2 
}

In [None]:
# Same pipeline as the optimised model, but with several topic
# representations (the `representation_model` dict) computed side by side.
multi_topic_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model_gte,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  # The stop-word-filtering CountVectorizer built earlier was never handed to
  # the model, so it had no effect; pass it in explicitly.
  vectorizer_model=vectorizer_model,
  ctfidf_model=ctfidf_model,
  representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

# Fit on the documents, reusing the pre-computed embeddings
topics, probs = multi_topic_model.fit_transform(docs, gte_embeddings)

In [None]:
new_topics2 = multi_topic_model.reduce_outliers(docs, topics , strategy="c-tf-idf", threshold=0.1)


In [None]:
# Re-compute topic representations for the outlier-reduced assignments.
# The model's current vectorizer, c-TF-IDF and representation models are
# passed back in explicitly: update_topics falls back to library defaults for
# any of these that is omitted (NOTE(review): confirm against the installed
# BERTopic version), which would leave the multi-aspect representations
# un-refreshed.
multi_topic_model.update_topics(
    docs,
    topics=new_topics2,
    vectorizer_model=multi_topic_model.vectorizer_model,
    ctfidf_model=multi_topic_model.ctfidf_model,
    representation_model=multi_topic_model.representation_model,
)


In [None]:
multi_topic_model.get_topic_info()

In [None]:
# Create hierarchical clustering plot with linkages 

from scipy.cluster import hierarchy as sch

linkage_function = lambda x: sch.linkage(x, 'complete', optimal_ordering=True)
hierarchical_topics = multi_topic_model.hierarchical_topics(docs, linkage_function=linkage_function)

In [None]:
multi_vis = multi_topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)


In [None]:
multi_vis.write_html("multi_hierarchy_complete.html")

In [None]:
# Reduce embeddings to 2D for clustering visualization.
# A fixed random_state keeps the layout identical between runs, so saved
# figures can be regenerated.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2,
                          min_dist=0.0, metric='cosine',
                          random_state=42).fit_transform(gte_embeddings)

In [None]:
multi_topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings, 
                                hide_document_hover=True, hide_annotations=True)

In [None]:
multi = multi_topic_model.get_topic_info()

multi_df = pd.DataFrame(multi)

multi_df.to_csv('topic_info_multi_gte.csv', index=False)


In [None]:
!pip install bertopic transformers


# Section Three: Experimentation with NLP techniques for results interpretation


In [None]:
from transformers import pipeline
from bertopic.representation import TextGeneration

# Initialize the text generation pipeline for summary label generation.
# Use the first GPU when one is available and fall back to CPU (-1) otherwise,
# so the cell also runs on machines without CUDA.
import torch

generation_device = 0 if torch.cuda.is_available() else -1
generator = pipeline('text2text-generation', model='google/flan-t5-base', device=generation_device)



In [None]:
def generate_topic_label(topic_model, topic_id, generator):
    """Generate a short (at most two-word) label for one topic.

    Args:
        topic_model: Object whose `get_topic(topic_id)` returns the topic's
            (word, score) pairs, or a falsy value when the topic is unknown.
        topic_id: Id of the topic to label.
        generator: Callable taking a prompt plus generation keyword arguments
            and returning a list whose first item has a 'generated_text' key.

    Returns:
        The cleaned label, or an empty string when the topic has no keywords.
    """
    # Retrieve keywords for the specified topic
    topic_keywords = topic_model.get_topic(topic_id)
    if not topic_keywords:
        # Unknown or empty topic: nothing to describe, so skip the model call.
        return ""

    # Format the keywords as a single string
    keywords = ", ".join(word for word, _ in topic_keywords)

    # Set the prompt
    prompt_template = (
    "I have a topic described by the following keywords: [KEYWORDS]. "
    "Based on these keywords, provide a two-word descriptive label that best summarizes the topic. "
    "For example, if the keywords were 'AI, machine learning, deep learning', a good label might be 'AI Techniques'."
    )
    prompt = prompt_template.replace("[KEYWORDS]", keywords)

    # Generate the label.
    # NOTE(review): top_k/temperature are sampling controls; no do_sample flag
    # is passed, so confirm whether the pipeline actually applies them.
    generated_label = generator(prompt, max_length=20, num_return_sequences=1, top_k=50, temperature=0.7)[0]['generated_text']

    # Clean up the label and ensure it is at most two words
    label_words = generated_label.strip().split()[:2]

    # Further clean-up: drop a leading question word. The first word is
    # compared as a whole so that labels such as "Howard Stern" are not
    # mistaken for a "how ..." question.
    if len(label_words) > 1 and label_words[0].lower() in ('what', 'how', 'why'):
        label_words = label_words[1:]

    return ' '.join(label_words)


In [None]:
def generate_labels_for_all_topics(topic_model, generator):
    """Label every topic of `topic_model` except the outlier topic (-1).

    Each label is produced by `generate_topic_label` and echoed to stdout.

    Returns:
        dict mapping topic id to its generated label.
    """
    topic_labels = {}

    # Walk the 'Topic' column of the topic overview; -1 marks the outliers
    for tid in topic_model.get_topic_info()['Topic'].tolist():
        if tid != -1:
            topic_labels[tid] = generate_topic_label(topic_model, tid, generator)
            print(f"Generated label for topic {tid}: {topic_labels[tid]}")

    return topic_labels

topic_labels = generate_labels_for_all_topics(multi_topic_model, generator)

In [None]:
# Add the generated labels as the final column in the dataframe with the multi-representation topics

topic_info_all_gte = multi_topic_model.get_topic_info()

# Map each topic id to its generated label (topics without a label get NaN)
topic_info_all_gte['Generated_Label'] = topic_info_all_gte['Topic'].map(topic_labels)

# Display the DataFrame with the new column
print(topic_info_all_gte)

# Write the table and report the file that was actually written (the message
# previously named a different file than the one being saved).
labelled_topics_csv = 'gtemodel_all_multi.csv'
topic_info_all_gte.to_csv(labelled_topics_csv, index=False)

print(f"DataFrame saved to '{labelled_topics_csv}'")

In [None]:
topic_0_info = topic_info_all_gte[topic_info_all_gte['Topic'] == 0]

# Extract all the representations
topic_0_representations = topic_0_info[['Topic', 'Representation', 'Keybert', 'POS', 'MMR', 'Generated_Label']]  # Adjust columns as needed

# Save the extracted information to a CSV file
topic_0_representations.to_csv('topic_0_representations.csv', index=False)

#print("Topic 3 representations saved to 'topic_3_representations.csv'")

In [None]:
!pip install tabulate

In [None]:
from IPython.display import display, HTML
display(HTML(topic_0_representations.to_html(index=False)))



In [None]:
# Create zero shot classification pipeline with transformer model for labelling candidate labels to topics.
# Use the first GPU when one is available and fall back to CPU (-1) otherwise,
# so the cell also runs on machines without CUDA.
import torch

zero_shot_device = 0 if torch.cuda.is_available() else -1
zero_shot_classifier = pipeline("zero-shot-classification", model="valhalla/distilbart-mnli-12-1", device=zero_shot_device)


In [None]:
# Candidate labels offered to the zero-shot classifier. Spelling matters here
# because the label text itself is what the classifier scores
# ("global network" and "government" were previously misspelt).
candidate_labels = ["global network", "gadgets", "virtual community", "freedom", "regulation", "cyberspace", "privacy", "international affairs", "silicon valley", "entrepreneur", "success",
                   'innovation', 'new', 'revolution', 'progress','equal', 'communication', 'collaboration', 'risk', 'control', 'government', 'hierarchy', 'protection', 'threat', 'danger']


In [None]:
topics_keywords = multi_topic_model.get_topics()


In [None]:
# Classify each topic to a candidate label with highest probability with a minimum threshold of 0.2 

def generate_zero_shot_labels(topics_keywords, classifier, candidate_labels, threshold=0.2):
    """Assign each topic the first label returned by a zero-shot classifier.

    Args:
        topics_keywords: dict of topic id -> list of (word, score) pairs.
        classifier: Callable `(text, candidate_labels)` returning a dict with
            parallel 'labels' and 'scores' lists.
        candidate_labels: Labels the classifier may choose from.
        threshold: Minimum score of the first label for it to be accepted.

    Returns:
        dict of topic id -> chosen label, or "Uncertain" when the first score
        is below `threshold`. The outlier topic (-1) is omitted.
    """
    assigned = {}
    for tid, word_scores in topics_keywords.items():
        if tid != -1:
            # The topic's keywords, comma-separated, form the text to classify
            text = ", ".join(term for term, _ in word_scores)
            prediction = classifier(text, candidate_labels)
            accepted = prediction['scores'][0] >= threshold
            assigned[tid] = prediction['labels'][0] if accepted else "Uncertain"
    return assigned

In [None]:
topic_labels = generate_zero_shot_labels(topics_keywords, zero_shot_classifier, candidate_labels, threshold=0.2)


In [None]:
topic_info_all_gte['Zero_Shot_Label'] = topic_info_all_gte['Topic'].map(topic_labels)

print(topic_info_all_gte)

In [None]:
topic_info_all_gte.to_csv('topic_labels_with_zero_shot_new.csv', index=False)


In [None]:
import pandas as pd
topic_info_all_gtedf = pd.read_csv('topic_labels_with_zero_shot_new.csv')

In [None]:
# Keywords whose most similar topics are looked up with find_topics.
# Spelling matters because each keyword is embedded as text
# ("government" was previously misspelt).
keyword_list = ["global network", "gadgets", "virtual community", "freedom", "regulation", "cyberspace", "privacy", "international affairs", "silicon valley", "entrepreneur", "success",
                   'innovation', 'new', 'revolution', 'progress','equal', 'communication', 'collaboration', 'risk', 'control', 'government', 'hierarchy', 'protection', 'threat', 'danger']


In [None]:
# Look up the five topics most similar to the keyword "control" and show the
# words of the third match. (Leftover ">>>" doctest prompts removed so the
# cell is plain Python.)
similar_topics, similarity = multi_topic_model.find_topics("control", top_n=5)
multi_topic_model.get_topic(similar_topics[2])

In [None]:
topic_id = similar_topics[0]
topic_similarity_score = similarity[0]

In [None]:
similar_topics

In [None]:
similarity

In [None]:
!pip install networkx

In [None]:
import matplotlib.pyplot as plt
import os


In [None]:
import networkx as nx

# Create node graphs for keywords connected with their top 3 most similar topics


def create_and_save_keyword_topic_subgraphs(keyword_list, topic_model, output_dir, top_n=3):
    """Save one keyword-to-topic similarity graph per keyword as a PNG.

    For each keyword, the `top_n` most similar topics (from
    `topic_model.find_topics`) are drawn as nodes linked from the keyword,
    with the similarity score as edge label.

    Args:
        keyword_list: Keywords to draw a graph for.
        topic_model: Object providing `find_topics(keyword, top_n=...)`
            returning (topic ids, similarity scores).
        output_dir: Directory the PNG files are written to; created if absent.
        top_n: Number of similar topics per keyword.
    """
    os.makedirs(output_dir, exist_ok=True)

    for keyword in keyword_list:
        # Create a directed graph for each keyword
        G = nx.DiGraph()

        # Add the keyword node
        keyword_node = f"Keyword: {keyword}"
        G.add_node(keyword_node)

        # Find the top_n topics for the keyword
        similar_topics, similarity = topic_model.find_topics(keyword, top_n=top_n)

        # One edge per similar topic, weighted by its similarity score
        for topic_id, similarity_score in zip(similar_topics, similarity):
            G.add_edge(keyword_node, f"Topic {topic_id}", weight=similarity_score)

        # Draw on a dedicated figure so the function neither paints over nor
        # closes a figure that happens to be open in the notebook.
        fig, ax = plt.subplots()
        pos = nx.spring_layout(G)
        edge_labels = nx.get_edge_attributes(G, 'weight')

        # Draw nodes and edges with labels
        nx.draw(G, pos, ax=ax, with_labels=True, node_color='lightblue', edge_color='gray',
                node_size=3000, font_size=12, font_weight='bold', arrows=True)
        nx.draw_networkx_edge_labels(G, pos, ax=ax,
                                     edge_labels={k: f"{v:.2f}" for k, v in edge_labels.items()},
                                     font_color='red', font_size=14)

        ax.set_title(f"Keyword-Topic Similarity Graph for '{keyword}'", fontsize=16)

        fig.savefig(os.path.join(output_dir, f"{keyword}_topic_graph.png"))
        plt.close(fig)


In [None]:
# Draw the similarity graph of every keyword in keyword_list.
# (The call previously referenced `newlist`, which is not defined anywhere in
# this notebook — NOTE(review): confirm keyword_list is the intended input.)
output_dir = "keyword_topic_graphs_new"
create_and_save_keyword_topic_subgraphs(keyword_list, multi_topic_model, output_dir)

In [None]:
def save_topic_representations_for_keywords(keyword_list, topic_model, top_n_topics=3, output_csv='topic_representations.csv'):
    """Write the words of the topics most similar to each keyword to a CSV.

    One row is produced per (keyword, similar topic) pair with the columns
    'Keyword', 'Topic ID' and 'Topic Representation' (the topic's words
    joined by ", ").

    Args:
        keyword_list: Keywords to look up.
        topic_model: Object providing `find_topics` and `get_topic`.
        top_n_topics: Number of similar topics fetched per keyword.
        output_csv: Path of the CSV file to write.
    """
    keyword_column, topic_id_column, representation_column = [], [], []

    for kw in keyword_list:
        matched_topics, _scores = topic_model.find_topics(kw, top_n=top_n_topics)

        for tid in matched_topics:
            # Keep only the words of the (word, score) pairs
            terms = ', '.join(term for term, _ in topic_model.get_topic(tid))

            keyword_column.append(kw)
            topic_id_column.append(tid)
            representation_column.append(terms)

    pd.DataFrame({
        'Keyword': keyword_column,
        'Topic ID': topic_id_column,
        'Topic Representation': representation_column,
    }).to_csv(output_csv, index=False)

In [None]:
# Export the topic words of the top 3 topics for every keyword in keyword_list.
# (The call previously referenced `newlist`, which is not defined anywhere in
# this notebook — NOTE(review): confirm keyword_list is the intended input.)
output_csv = "topic_representations.csv"  # Output CSV file path
save_topic_representations_for_keywords(keyword_list, multi_topic_model, top_n_topics=3, output_csv=output_csv)

In [None]:
def get_representative_docs_and_titles_for_topics(topic_ids, topic_model, df, top_n=3):
    """Collect representative documents, with their titles, for given topics.

    Args:
        topic_ids: Topic ids to look up.
        topic_model: Object whose `get_representative_docs(topic_id)` returns
            the topic's representative document texts.
        df: DataFrame with 'text_content' and 'title' columns; the title is
            found by exact match of the document text.
        top_n: Maximum number of representative documents kept per topic.

    Returns:
        DataFrame with columns 'topic_id', 'title' and 'document_text'. The
        title is None when no row of `df` matches the document text.
    """
    columns = ['topic_id', 'title', 'document_text']
    output_data = []

    for topic_id in topic_ids:
        # A topic without representative documents contributes no rows
        representative_docs = topic_model.get_representative_docs(topic_id) or []

        for doc in representative_docs[:top_n]:
            # Title of the first row whose text equals the document; a
            # document with no exact match gets no title instead of raising.
            matching_titles = df.loc[df['text_content'] == doc, 'title']
            title = matching_titles.iloc[0] if not matching_titles.empty else None

            output_data.append({
                'topic_id': topic_id,
                'title': title,
                'document_text': doc
            })

    # Naming the columns keeps the frame usable even when it has no rows
    return pd.DataFrame(output_data, columns=columns)


In [None]:
# Topic ids of the "china" theme. Defined here because this is the first cell
# that uses the list; without it the cell raises a NameError on a fresh
# kernel run from top to bottom.
china = [121, 66, 110]

# Representative documents (and their titles) for the theme's topics
output_df = get_representative_docs_and_titles_for_topics(china, multi_topic_model, df, top_n=3)

# NOTE(review): the list is formatted straight into the file name, so the name
# contains brackets, commas and spaces — confirm that this is intended.
output_df.to_csv(f"representative_docs_for_topic_{china}.csv", index=False)


In [None]:
output_df.head()

In [None]:
from collections import Counter

topic_assignments = multi_topic_model.topics_  # Get the topic assignment for each document


# Count the number of documents assigned to each topic
topic_counts = Counter(topic_assignments)

# Get the number of documents for each selected topic
selected_topic_counts = {topic: topic_counts.get(topic, 0) for topic in china}

# Calculate the total number of documents assigned to the selected topics
total_documents = sum(selected_topic_counts.values())

print("Document counts per selected topic:")
for topic, count in selected_topic_counts.items():
    print(f"Topic {topic}: {count} documents")

print(f"\nTotal number of documents in selected topics: {total_documents}")

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Create a graph
# Undirected graph linking each keyword to each entity found in the same row.
G = nx.Graph()

# Add edges based on co-occurrence of keywords and entities
# NOTE(review): the 'technoutopian_keywords' and 'entities' columns are not
# created anywhere in the visible notebook — confirm they exist in df.
# The loops assume each cell holds an iterable (of keywords, and of entity
# items whose first element is the entity text); if df was read straight from
# CSV these cells may be plain strings, which would be iterated character by
# character — TODO confirm.
for idx, row in df.iterrows():
    keywords = row['technoutopian_keywords']
    entities = [ent[0] for ent in row['entities']]
    for keyword in keywords:
        for entity in entities:
            G.add_edge(keyword, entity)

# Draw the whole graph with node labels
plt.figure(figsize=(10, 8))
nx.draw_networkx(G, with_labels=True)
plt.show()

In [None]:
print(output_df['document_text'])

In [None]:
!pip install numpy matplotlib scikit-learn pandas datashader scikit-image numba requests jinja2


In [None]:
topic_representations = multi_topic_model.get_topic_info()

topic_ids = topic_representations['Topic'].values
topic_names = topic_representations['Name'].values

In [None]:
labels = [f"Topic {topic_id}: {topic_repr}" for topic_id, topic_repr in zip(topic_ids, topic_names)]

In [None]:
# Topic id assigned to every document
topic_per_doc = multi_topic_model.get_document_info(docs)["Topic"].values

# Look labels up by topic id rather than by list position: `labels` follows
# the row order of get_topic_info(), so when an outlier topic (-1) is present
# positional indexing is shifted by one and -1 would pick the last label.
label_by_topic = dict(zip(topic_ids, labels))
named_topic_per_doc = [label_by_topic[topic_id] for topic_id in topic_per_doc]

In [None]:
# datamapplot is used below but was never imported in this notebook.
import datamapplot

# Static map of the 2D-reduced document embeddings, one label per document
figure, axes = datamapplot.create_plot(
    reduced_embeddings,
    named_topic_per_doc,
    figsize=(10, 8),
    title="Topic Map",
    sub_title="Visualization of Topics with IDs and Representations"
)


plt.show()

In [None]:
# Final topic lists for defined theme categories
# Each list holds the topic ids of multi_topic_model that were grouped under
# one theme.
# NOTE(review): topic 11 is listed twice in Online_activity, and topic 53
# appears in both culture and cybersecurity — confirm whether a different id
# was intended in either place.

china =[121,66,110]
culture = [51, 129, 24, 107, 29, 19, 83, 71, 90, 34, 53]
Silicon_valley = [92,80, 64]
Online_activity = [11, 61, 11, 8, 4, 103, 1, 9, 97, 63, 38, 33, 22, 69]
cybersecurity = [32, 5, 16, 15, 14, 6, 13, 10, 53, 42, 88, 39, 119, 105, 95]
techcompanies = [127, 130, 37, 3, 124, 27, 46, 118]
stock = [100, 94, 23, 25, 114, 28, 12, 0, 101]

In [None]:
# Get document information
document_info = multi_topic_model.get_document_info(docs)


In [None]:
multi_topic_model.visualize_barchart(topics=[121, 66, 110])


In [None]:
# topic_aspects_ is keyed by representation name first and topic id second
# (it is indexed as topic_aspects[representation_type][topic_id] further
# down), so its top-level keys are representation names, not topic ids.
topic_aspects = multi_topic_model.topic_aspects_
print("Available representation types:", list(topic_aspects.keys()))

# Topic ids that occur under at least one representation
available_topic_ids = sorted({topic_id for aspect in topic_aspects.values() for topic_id in aspect})
print("Available topic IDs:", available_topic_ids)

In [None]:
import matplotlib.pyplot as plt

# Which representation and which topic to chart
representation_type = 'POS'
topic_id = 121

# Look the topic up first; plot only when the lookup succeeds
try:
    representation = topic_aspects[representation_type][topic_id]
except KeyError:
    print(f"Topic {topic_id} not found in the '{representation_type}' representation.")
else:
    # Split the (term, score) pairs into two parallel tuples
    terms, scores = zip(*representation)

    # Horizontal bars, reversed so the first term ends up at the top
    plt.figure(figsize=(10, 6))
    plt.barh(terms[::-1], scores[::-1], color='skyblue')
    plt.xlabel('Importance')
    plt.ylabel('Terms')
    plt.title(f'Term Chart for Topic {topic_id} ({representation_type} Representation)')
    plt.show()

In [None]:
import matplotlib.pyplot as plt

# Three side-by-side panels; zip() below pairs them with the first three tech-company topics
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(24, 8))
topic_aspects = multi_topic_model.topic_aspects_

representation_type = 'POS'
# Font settings
font_size_title = 16
font_size_labels = 14
font_size_ticks = 12

for ax, topic_id in zip(axes, techcompanies):
    try:
        representation = topic_aspects[representation_type][topic_id]
    except KeyError:
        # Missing topic: keep a titled but otherwise blank panel
        ax.set_title(f'Topic {topic_id} not found', fontsize=font_size_title)
        ax.axis('off')
    else:
        terms, scores = zip(*representation)

        # Reverse both sequences so the first term is drawn at the top
        ax.barh(terms[::-1], scores[::-1], color='skyblue')
        ax.set_xlabel('Importance', fontsize=font_size_labels)
        ax.set_ylabel('Terms', fontsize=font_size_labels)
        ax.set_title(f'Topic {topic_id} ({representation_type})', fontsize=font_size_title)
        ax.tick_params(axis='both', which='major', labelsize=font_size_ticks)

plt.tight_layout()
# The file name embeds the first three topic IDs and the representation type
plt.savefig(f'term_charts_topics_{techcompanies[0]}_{techcompanies[1]}_{techcompanies[2]}_{representation_type}.png', format='png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Print every requested representation for every selected topic.
# NOTE(review): selected_topics is not assigned in the cells visible here — confirm it is defined earlier.
representation_types = ['Keybert', 'POS', 'MMR']

for topic in selected_topics:
    for rep_type in representation_types:
        try:
            representation = topic_aspects[rep_type][topic]
            print(f"Topic {topic} ({rep_type}) Representation: {representation}")
        except KeyError:
            # Raised when either the representation name or the topic ID is missing;
            # the message reports both cases as a missing topic.
            print(f"Topic {topic} not found in the '{rep_type}' representation.")

In [None]:
import math

# Lay the culture topics out on a grid with a fixed number of columns
num_topics = len(culture)
cols = 3
rows = max(1, math.ceil(num_topics / cols))

# Create a figure sized for this grid instead of reusing axes left over from an earlier cell.
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(8 * cols, 6 * rows), squeeze=False)
flat_axes = axes.ravel()

for ax, topic_id in zip(flat_axes, culture):
    try:
        representation = topic_aspects[representation_type][topic_id]
        if not representation:
            raise ValueError("Empty representation data")

        terms, scores = zip(*representation)  # Unzip terms and scores

        print(f"Topic {topic_id} terms: {terms}")
        print(f"Topic {topic_id} scores: {scores}")

        # Reverse both sequences so the first term is drawn at the top
        ax.barh(terms[::-1], scores[::-1], color='skyblue')
        ax.set_xlabel('Importance')
        ax.set_ylabel('Terms')
        ax.set_title(f'Topic {topic_id} ({representation_type})')

    except KeyError:
        ax.set_title(f'Topic {topic_id} not found')
        ax.axis('off')  # Hide the axis if the topic is not found
    except ValueError as ve:
        print(f"ValueError: {ve}")
        ax.set_title(f'Topic {topic_id} Error')
        ax.axis('off')
    except Exception as e:
        # Best effort: one broken topic should not prevent the other panels from rendering
        print(f"Unexpected error: {e}")
        ax.set_title(f'Topic {topic_id} Error')
        ax.axis('off')

# Blank out grid cells that have no topic assigned to them
for ax in flat_axes[num_topics:]:
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
def extract_documents_by_topic_ids(topic_ids, document_info, docs):
    """Group documents by the topic they were assigned to.

    Args:
        topic_ids: Iterable of topic IDs to collect documents for.
        document_info: Mapping or DataFrame with a 'Topic' entry holding one
            topic ID per document, in the same order as ``docs``.
        docs: Sequence of documents (list, array or Series).

    Returns:
        dict mapping each requested topic ID to the list of its documents, in
        corpus order. Topics with no documents map to an empty list.

    Raises:
        ValueError: If ``docs`` is empty, 'Topic' is missing from
            ``document_info``, or the two inputs differ in length.
    """
    # len() works for lists, arrays and Series alike; `not docs` is ambiguous for the latter two
    if docs is None or len(docs) == 0:
        raise ValueError("The 'docs' list is empty.")

    if 'Topic' not in document_info:
        raise ValueError("'Topic' key not found in document_info.")

    doc_topics = document_info['Topic']
    if len(doc_topics) != len(docs):
        raise ValueError("Length mismatch between 'document_info' and 'docs'.")

    # Pre-create one bucket per requested topic, then fill them in a single pass over the corpus
    topic_docs = {topic_id: [] for topic_id in topic_ids}
    for doc, topic in zip(docs, doc_topics):
        if topic in topic_docs:
            topic_docs[topic].append(doc)

    return topic_docs

In [None]:
# Documents grouped by topic ID for the topics in the `china` list
extracted_docs = extract_documents_by_topic_ids(china, document_info, docs)


In [None]:
# Report how many documents were collected for each extracted topic
for topic_id, documents in extracted_docs.items():
    print(f"Topic {topic_id}: {len(documents)} documents")

In [None]:
# Join each topic's documents into one space-separated string.
# NOTE: combined_text is overwritten on every iteration, so only the last topic's text survives the loop.
for topic_id, documents in extracted_docs.items():
    print(f"Processing Topic ID: {topic_id}")
    
    # Combine all documents for this topic into a single text
    combined_text = ' '.join(documents)

In [None]:
def perform_ngram_analysis(text, ngram_range=(2, 2), top_n=20):
    """Return the ``top_n`` most frequent n-grams of ``text``.

    Args:
        text: Single string to analyse.
        ngram_range: (min_n, max_n) passed to CountVectorizer.
        top_n: Number of rows to keep.

    Returns:
        DataFrame indexed by n-gram with one 'frequency' column, sorted in
        descending order of frequency.
    """
    # English stop words are removed by the vectorizer before n-grams are formed
    counter = CountVectorizer(ngram_range=ngram_range, stop_words='english')

    # The whole text is treated as a single document, giving a 1 x vocabulary count matrix
    count_matrix = counter.fit_transform([text])

    # Densify and flatten so there is one count per vocabulary entry
    counts = count_matrix.toarray().flatten()

    frequency_table = pd.DataFrame(
        {'frequency': counts},
        index=counter.get_feature_names_out(),
    )

    return frequency_table.sort_values(by='frequency', ascending=False).head(top_n)

# Collect one result frame per topic and concatenate once at the end; growing a frame
# with pd.concat inside the loop copies all earlier rows on every iteration.
ngram_frames = []

for topic_id, documents in extracted_docs.items():
    print(f"\nAnalyzing Topic ID: {topic_id}")

    combined_text = ' '.join(documents)

    # Change ngram_range as needed; assign() tags the rows without mutating the returned frame
    ngram_results = perform_ngram_analysis(
        combined_text, ngram_range=(2, 2), top_n=20
    ).assign(Topic_ID=topic_id)

    ngram_frames.append(ngram_results)

# pd.concat rejects an empty list, so fall back to an empty frame when nothing was extracted
all_ngram_results = pd.concat(ngram_frames) if ngram_frames else pd.DataFrame()

all_ngram_results.to_csv('bigram_analysis_results.csv', index=True)

In [None]:
# Display the combined n-gram table as the cell output
all_ngram_results

In [None]:
import spacy

# Load the small English spaCy pipeline; `nlp` is used by the NER and phrase-matching helpers below
nlp = spacy.load("en_core_web_sm")

In [None]:
# Load phrasematcher to match phrases with ner entities 
from spacy.matcher import PhraseMatcher



In [None]:
def apply_ner(documents):
    """Run named-entity recognition over a collection of documents.

    Args:
        documents: Iterable of document strings.

    Returns:
        A list with one entry per document, each a list of
        (entity text, entity label) tuples in the order spaCy reports them.
    """
    # nlp.pipe processes the texts as a batched stream instead of one pipeline call per document
    return [
        [(ent.text, ent.label_) for ent in spacy_doc.ents]
        for spacy_doc in nlp.pipe(documents)
    ]

In [None]:
def process_topic_ids_and_apply_ner(topic_ids, document_info, docs):
    """Extract the documents of the given topics and run NER on them.

    Args:
        topic_ids: Topic IDs whose documents should be analysed.
        document_info: Per-document table with a 'Topic' entry aligned with ``docs``.
        docs: The documents themselves.

    Returns:
        dict mapping each topic ID to the output of ``apply_ner`` for that
        topic's documents (one list of (entity, label) tuples per document).
    """
    # Use the caller's topic list; a fixed category here would make every call return the same topics
    extracted_docs = extract_documents_by_topic_ids(topic_ids, document_info, docs)

    return {
        topic_id: apply_ner(documents)
        for topic_id, documents in extracted_docs.items()
    }

In [None]:
def find_context_sentences_for_ner(ner_results, term):
    """Collect the sentences that mention ``term`` for every topic in ``ner_results``.

    Only the keys of ``ner_results`` (topic IDs) are used; the documents are
    re-extracted from the notebook-level ``document_info`` and ``docs``.

    Args:
        ner_results: dict keyed by topic ID (as returned by
            ``process_topic_ids_and_apply_ner``).
        term: Phrase to search for, matched on exact token text.

    Returns:
        dict mapping each topic ID to the list of sentence texts containing
        the term. A sentence is listed once even if the term occurs in it
        several times.
    """
    term_sentences = {}

    # make_doc only tokenizes, which is all PhraseMatcher needs for a pattern
    phrase_matcher = PhraseMatcher(nlp.vocab)
    # Patterns are passed as a list; the (key, on_match, *patterns) form was removed in spaCy v3
    phrase_matcher.add('TERM', [nlp.make_doc(term)])

    for topic_id in ner_results:
        docs_for_topic = extract_documents_by_topic_ids([topic_id], document_info, docs)[topic_id]
        context_sentences = []

        for spacy_doc in nlp.pipe(docs_for_topic):
            for sent in spacy_doc.sents:
                # Match on the already-parsed sentence rather than running the pipeline on it again
                if phrase_matcher(sent.as_doc()):
                    context_sentences.append(sent.text)

        term_sentences[topic_id] = context_sentences

    return term_sentences

# Term to look up and the topic category to search in
term = "India" # Select NER term 
selected_topic_ids = Silicon_valley  

# Apply NER and extract context sentences for the given term
entities = process_topic_ids_and_apply_ner(selected_topic_ids, document_info, docs)
context_sentences = find_context_sentences_for_ner(entities, term)

# Print the matching sentences grouped by topic, separated by a blank line
for topic_id, sentences in context_sentences.items():
    print(f"Topic ID {topic_id}:")
    # Limit to the first 10 sentences
    for sentence in sentences[:10]:
        print(f"  {sentence}")
    print() 

In [None]:
# NER results requested for the `stock` topic list
stock_entities = process_topic_ids_and_apply_ner(stock, document_info, docs)


In [None]:
# Display the per-document table as the cell output (the result is not stored)
multi_topic_model.get_document_info(docs)

In [None]:
def save_ner_to_csv(ner_results, filename):
    """Flatten per-topic NER results into a CSV file.

    Args:
        ner_results: dict mapping topic ID to a list of documents, where each
            document is a list of (entity, label) tuples.
        filename: Path of the CSV file to write.

    The file has one row per entity with the columns 'Topic ID',
    'Document Index' (position of the document within its topic), 'Entity'
    and 'Label'. The header row is written even when there are no entities.
    """
    columns = ['Topic ID', 'Document Index', 'Entity', 'Label']

    rows = [
        {
            'Topic ID': topic_id,
            'Document Index': doc_index,
            'Entity': entity,
            'Label': label,
        }
        for topic_id, documents in ner_results.items()
        for doc_index, doc_entities in enumerate(documents)
        for entity, label in doc_entities
    ]

    # Passing the columns explicitly keeps the header when `rows` is empty
    ner_table = pd.DataFrame(rows, columns=columns)
    ner_table.to_csv(filename, index=False)


In [None]:
# Save the stock-category NER results; `entities` holds the results of the Silicon Valley run,
# so writing it here would store the wrong category under the stock filename.
save_ner_to_csv(stock_entities, 'stock_ner_results.csv')


In [None]:
!python -m spacy download en_core_web_sm



In [None]:
import spacy

# Load the spaCy model.
# NOTE: this repeats the spacy.load call of an earlier cell and rebinds `nlp` to a fresh pipeline.
nlp = spacy.load("en_core_web_sm")

In [None]:
import csv

# Write one CSV row per (topic, entity, label) triple found in topic_entities.
# NOTE(review): topic_entities is not assigned at notebook level in the cells visible here (it only
# appears as a local inside process_topic_ids_and_apply_ner) — confirm where it comes from.
# NOTE(review): the inner loop unpacks every item as an (entity, label) pair, i.e. it expects a flat
# list of pairs per topic, whereas process_topic_ids_and_apply_ner returns one list of pairs per document.
with open('china_topic_entities.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Topic_ID', 'Entity', 'Label'])
    # The loop variable rebinds the notebook-level name `entities`
    for topic_id, entities in topic_entities.items():
        for entity, label in entities:
            writer.writerow([topic_id, entity, label])

In [None]:
# Representative documents the model stores for topic 121
topic_id = 121 
select_docs = multi_topic_model.get_representative_docs(topic_id)

In [None]:
# Show the first representative document, or say that there is none
if not select_docs:
    print("No documents found for topic ID", topic_id)
else:
    first_document = select_docs[0]
    print("First document for topic ID", topic_id, ":", first_document)

In [None]:
from spacy import displacy


In [None]:
# Filter NER results by a selected entity label

filename = 'Silicon_valley_ner_results.csv'
# Keep the NER table under its own name: rebinding `df` would replace the corpus frame
# that later cells still read ('text_content', 'year').
ner_df = pd.read_csv(filename)

# Filter rows for the selected label, such as GPE or ORG
org_df = ner_df[ner_df['Label'] == 'GPE']

# Count the occurrences of each entity carrying that label
org_counts = org_df['Entity'].value_counts()

# Get the top 10 most frequent entities
top_10_orgs = org_counts.nlargest(10)


In [None]:
# Plot frequency count for top 10 selected entity 
import matplotlib.pyplot as plt

# Bar chart of the ten most frequent entities computed in the previous cell.
# NOTE(review): the axis label and title say "Organization" and the file is named
# 'top_10_people.png', while the previous cell filters on the 'GPE' label — confirm the intended wording.
plt.figure(figsize=(10, 6))
top_10_orgs.plot(kind='bar', color='skyblue')
plt.xlabel('Organization')
plt.ylabel('Count')
plt.title('Top 10 Organization Entities')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()


# Save before show() so the written image contains the rendered figure
plt.savefig('top_10_people.png', format='png', dpi=300) 
plt.show()

In [None]:
import plotly.io as pio


# Plot the similarity matrix heatmap for a topic category

# Topic list for the Online activity category.
# NOTE(review): this list repeats topic 11 and is not used in this cell — the heatmap below is
# drawn for Silicon_valley; confirm which category was intended.
Online_activity = [11, 61, 11, 8, 4, 103, 1, 9, 97, 63, 38, 33, 22, 69]



heatmap_fig = multi_topic_model.visualize_heatmap(topics=Silicon_valley)

#heatmap_fig.write_html('cyber_heatmap_visualization.html', auto_open=True)

heatmap_fig.update_coloraxes(colorbar_x=1.1)  # Moves the color bar further right

# Fixed pixel size for the rendered figure
heatmap_fig.update_layout(width=900, height=800)

heatmap_fig.show()


In [None]:
# Term bar charts for the topics in the `techcompanies` list
multi_topic_model.visualize_barchart(topics=techcompanies)


In [None]:
representation_type = 'POS' 
Online_activity = [11, 61, 8, 4, 103, 1, 9, 97, 63, 38, 33, 22, 69]
topic_aspects = multi_topic_model.topic_aspects_

# One record per culture topic: the topic ID plus parallel lists of terms and scores
data = []

for topic_id in culture:
    try:
        # Representation of this topic under the chosen representation type
        representation = topic_aspects[representation_type][topic_id]
    except KeyError:
        print(f"Topic ID {topic_id} or representation type {representation_type} not found.")
        continue

    # Materialise the (term, score) pairs once, then split them into two columns
    terms_scores = [(term, score) for term, score in representation]
    topic_record = {
        'Topic ID': topic_id,
        'Terms': [pair[0] for pair in terms_scores],
        'Scores': [pair[1] for pair in terms_scores],
    }
    data.append(topic_record)

culturedf = pd.DataFrame(data)
print(culturedf)

culturedf.to_csv('culture_representations_POS.csv', index=False)


In [None]:
# Topic overview table; its 'Topic', 'MMR', 'POS' and 'Keybert' columns are read by the following cells
topic_info_df = multi_topic_model.get_topic_info()


In [None]:
# Keep only the topic-table rows whose topic ID is in the `china` list

filtered_df = topic_info_df[topic_info_df['Topic'].isin(china)]


In [None]:
# Per-representation views of the filtered topics: topic ID next to one representation column each
mmr_terms = filtered_df[['Topic', 'MMR']]
pos_terms = filtered_df[['Topic', 'POS']]
keybert_terms = filtered_df[['Topic', 'Keybert']]

In [None]:
# Map each filtered topic to the unique terms found across its MMR, POS and Keybert representations
combined_terms = {}

for _, row in filtered_df.iterrows():
    topic_id = row['Topic']

    # Concatenate the three term lists without rebinding the notebook-level
    # mmr_terms / pos_terms / keybert_terms frames
    topic_terms = row['MMR'] + row['POS'] + row['Keybert']

    # dict.fromkeys removes duplicates while keeping first-seen order, so the result is
    # the same on every run (a set would reorder string terms between kernel sessions)
    combined_terms[topic_id] = list(dict.fromkeys(topic_terms))

In [None]:
# Two-column table: topic ID and its list of combined terms
combined_terms_df = pd.DataFrame(list(combined_terms.items()), columns=['Topic', 'Combined_Terms'])


In [None]:
# Document text, topic assignment and publication year side by side.
# NOTE(review): the only visible assignment of `topics` comes from defaultmodel.fit_transform —
# confirm it holds the multi_topic_model assignments at this point.
largedf = pd.DataFrame({"Document": df['text_content'], "Topic": topics, "Year": df['year']})

# Years are cast to integers before being used as timestamps
largedf['Year'] = largedf['Year'].astype(int)

In [None]:
# Compute topics over time, using the Year column taken from the original dataframe as the timestamp

topics_over_time = multi_topic_model.topics_over_time(
    docs=largedf['Document'].tolist(),  # List of documents
    timestamps=largedf['Year'].tolist()  # Corresponding years
)

In [None]:
# Topics-over-time figure restricted to the tech-company topics.
# NOTE: the figure is only stored here; this cell does not display or save it.
tech_time = multi_topic_model.visualize_topics_over_time(topics_over_time, topics= techcompanies)

In [None]:
# Display the topic overview table as the cell output
multi_topic_model.get_topic_info()

In [None]:
# Ten topics most similar to the search term "outsource", with their similarity scores
# (the leftover ">>>" REPL prompt was removed so the cell is plain Python)
similar_topics, similarity = multi_topic_model.find_topics("outsource", top_n=10)


In [None]:
# Terms of the best-matching topic (the leftover ">>>" REPL prompt was removed)
multi_topic_model.get_topic(similar_topics[0])


In [None]:
# NOTE: this rebinds `topic_ids`, which an earlier cell filled from the 'Topic' column of the topic table
topic_ids = similar_topics  # Extracting topic IDs

# Print the topic IDs
print(topic_ids)

In [None]:
pip install transformers torch


In [None]:
! pip install torch

In [None]:
# Conduct sentiment analysis using FinBERT model for documents under the financial stock market category


from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

import torch

# Load FinBERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')

# Use the first GPU when CUDA is available and fall back to the CPU (-1) otherwise,
# so the cell also runs on machines without a GPU.
sentiment_device = 0 if torch.cuda.is_available() else -1

# Create a pipeline for sentiment analysis
sentiment_analysis = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer, device=sentiment_device)

In [None]:
# Rebuild the per-document table from the fitted model
document_info = multi_topic_model.get_document_info(docs)

# Filter documents based on topic IDs: keep those assigned to a topic in the `stock` list
filtered_documents = document_info[document_info['Topic'].isin(stock)]

# Extract the texts of the filtered documents
texts_to_analyze = filtered_documents['Document'].tolist()

In [None]:
# Number of documents selected for sentiment analysis
len(texts_to_analyze)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
import torch

def truncate_texts(texts, tokenizer, max_length=512):
    """Shorten every text to at most ``max_length`` tokens.

    Each text is encoded with truncation enabled and decoded back to a string
    with special tokens skipped.

    Args:
        texts: Iterable of strings.
        tokenizer: Object providing ``encode`` and ``decode``.
        max_length: Token limit passed to ``encode``.

    Returns:
        List of decoded strings, one per input text, in input order.
    """
    def _shorten(text):
        # Round trip: text -> truncated token ids -> text
        token_ids = tokenizer.encode(text, truncation=True, max_length=max_length)
        return tokenizer.decode(token_ids, skip_special_tokens=True)

    return [_shorten(text) for text in texts]

# Truncate texts to avoid exceeding max token length
texts_to_analyze_truncated = truncate_texts(texts_to_analyze, sentiment_analysis.tokenizer)

# Run sentiment analysis on truncated texts. truncation=True is a second guard: decoding and
# re-encoding a truncated text is not guaranteed to give exactly the same number of tokens.
sentiment_results = sentiment_analysis(texts_to_analyze_truncated, truncation=True)
print("Raw sentiment results:")

for result in sentiment_results:
    print(result)

# Keep the topic assignment next to the truncated text; the index is reset so rows line up
# positionally with sentiment_results.
filtered_documents = (
    filtered_documents[['Topic']]
    .reset_index(drop=True)
    .assign(Document=texts_to_analyze_truncated)
)

# Extract sentiment results
sentiments = []
positive_scores = []
negative_scores = []
neutral_scores = []

for result in sentiment_results:
    # The pipeline returns one {'label', 'score'} dict per text by default and a list of such
    # dicts when all class scores are requested; treat both shapes the same way.
    label_scores = result if isinstance(result, list) else [result]

    # Scores default to 0.0 for labels the pipeline did not report
    scores_by_label = {'POSITIVE': 0.0, 'NEGATIVE': 0.0, 'NEUTRAL': 0.0}
    for label_score in label_scores:
        if isinstance(label_score, dict):
            # Compare case-insensitively: the label casing depends on the model's configuration
            label = str(label_score['label']).upper()
            if label in scores_by_label:
                scores_by_label[label] = label_score['score']

    pos_score = scores_by_label['POSITIVE']
    neg_score = scores_by_label['NEGATIVE']
    neu_score = scores_by_label['NEUTRAL']

    # Determine the overall sentiment; ties fall through to NEUTRAL
    sentiments.append('POSITIVE' if pos_score > max(neg_score, neu_score) else
                       'NEGATIVE' if neg_score > max(pos_score, neu_score) else
                       'NEUTRAL')
    positive_scores.append(pos_score)
    negative_scores.append(neg_score)
    neutral_scores.append(neu_score)

# Add sentiment results to the DataFrame
filtered_documents['Sentiment'] = sentiments
filtered_documents['Positive Score'] = positive_scores
filtered_documents['Negative Score'] = negative_scores
filtered_documents['Neutral Score'] = neutral_scores

output_csv_path = 'filtered_documents_with_sentiment_scores.csv'

filtered_documents[['Document', 'Sentiment', 'Positive Score', 'Negative Score', 'Neutral Score']].to_csv(output_csv_path, index=False)

print(f"Data saved to {output_csv_path}")

In [None]:
# Both inputs must describe the same documents in the same order
if len(sentiment_results) != len(filtered_documents):
    raise ValueError("The number of sentiment results does not match the number of documents.")

# Create DataFrame for sentiment results (columns 'label' and 'score')
sentiment_df = pd.DataFrame(sentiment_results)

# Attach the document text by position; .to_numpy() avoids index alignment against
# a filtered frame whose index is not 0..n-1
sentiment_df['Document'] = filtered_documents['Document'].to_numpy()

# Combine the two frames row by row. Joining on the document text instead would
# multiply rows whenever the same text occurs more than once.
merged_df = pd.concat(
    [filtered_documents.reset_index(drop=True), sentiment_df[['label', 'score']]],
    axis=1,
)

merged_df = merged_df.rename(columns={
    'label': 'Sentiment Label',
    'score': 'Sentiment Score'
})

# Add Positive, Negative, and Neutral Score columns: the reported score goes into the column
# of the predicted label, the other two stay at 0.0. Labels are compared case-insensitively.
predicted_label = merged_df['Sentiment Label'].astype(str).str.upper()
merged_df['Positive Score'] = merged_df['Sentiment Score'].where(predicted_label == 'POSITIVE', 0.0)
merged_df['Negative Score'] = merged_df['Sentiment Score'].where(predicted_label == 'NEGATIVE', 0.0)
merged_df['Neutral Score'] = merged_df['Sentiment Score'].where(predicted_label == 'NEUTRAL', 0.0)

print(merged_df[['Document', 'Sentiment Label', 'Sentiment Score', 'Positive Score', 'Negative Score', 'Neutral Score']])
output_csv_path = 'filtered_documents_with_sentiment_scores.csv'
merged_df.to_csv(output_csv_path, index=False)

print(f"Data saved to {output_csv_path}")

In [None]:
! pip install seaborn

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count occurrences of each sentiment label
sentiment_counts = merged_df['Sentiment Label'].value_counts()

# Create a bar plot
plt.figure(figsize=(10, 6))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette='viridis')

# Add labels and title
plt.xlabel('Sentiment Label')
plt.ylabel('Count')
plt.title('Distribution of Sentiment Labels')
plt.xticks(rotation=45)  # Rotate labels for better readability

# Apply the layout before saving so the written image matches what is displayed
plt.tight_layout()
output_img_path = 'sentiment_distribution.png'
plt.savefig(output_img_path)

# Display the plot
plt.show()

In [None]:
# Get top 10 documents with highest positive scores
top_positive_docs = merged_df.sort_values(by='Positive Score', ascending=False).head(10)

# Get top 10 documents with highest negative scores
top_negative_docs = merged_df.sort_values(by='Negative Score', ascending=False).head(10)

# Print the results
print("Top 10 Documents with Highest Positive Scores:")
print(top_positive_docs[['Document', 'Positive Score']])

print("\nTop 10 Documents with Highest Negative Scores:")
print(top_negative_docs[['Document', 'Negative Score']])

In [None]:
# Number of documents per overall sentiment
sentiment_counts = filtered_documents['Sentiment'].value_counts()

# Print the counts
print("Sentiment Counts:")
print(sentiment_counts)

# Show only the summary columns that are present: 'Topic' is missing whenever
# filtered_documents has been rebuilt from the document texts alone.
summary_columns = [
    column for column in ['Topic', 'Document', 'Sentiment']
    if column in filtered_documents.columns
]
print(filtered_documents[summary_columns])

In [None]:
# Notebook-level lookup from topic ID to category name, filled by add_to_mapping
topic_to_category = {}

def add_to_mapping(topics, category_name):
    """Register every topic ID in ``topics`` under ``category_name``.

    A topic that was registered before is re-pointed to the new category.
    """
    topic_to_category.update(dict.fromkeys(topics, category_name))

# Add each category and its topics to the mapping.
# Later calls overwrite earlier ones for a shared topic ID: topic 53 appears in both the
# `culture` and `cybersecurity` lists, so it ends up mapped to 'Cybersecurity'.
add_to_mapping(china, 'China')
add_to_mapping(culture, 'Culture')
add_to_mapping(Silicon_valley, 'Silicon Valley')
add_to_mapping(Online_activity , 'Online Activity')
add_to_mapping(cybersecurity, 'Cybersecurity')
add_to_mapping(techcompanies, 'Tech Companies')
add_to_mapping(stock, 'Stock')

In [None]:
# Inspect the complete topic ID -> category mapping
print(topic_to_category)

In [None]:
# Rebuild the per-document table so the category column below is added to a fresh copy
document_info = multi_topic_model.get_document_info(docs)


In [None]:
# Translate each document's topic ID into its category; topics absent from the mapping become NaN
document_info['Category'] = document_info['Topic'].map(topic_to_category)


In [None]:
# Documents whose topic belongs to no category are marked 'Unlabeled'.
# The result is assigned back: an in-place fillna on a selected column acts on an intermediate
# object, which pandas deprecates and which has no effect under copy-on-write.
document_info['Category'] = document_info['Category'].fillna('Unlabeled')


In [None]:
import plotly.express as px


In [None]:
# Plotting table: the first two columns of reduced_embeddings as x/y plus each document's category.
# NOTE(review): assumes the rows of reduced_embeddings are in the same order as document_info — confirm.
df_plot = pd.DataFrame({
    'x': reduced_embeddings[:, 0],   
    'y': reduced_embeddings[:, 1],   
    'Category': document_info['Category']  # Category labels for each document
})

In [None]:
# Plot 2D UMAP embedding visualization: one point per document, coloured by category

fig = px.scatter(
    df_plot, x='x', y='y',
    color='Category',                
    # Fixed colour per category name
    color_discrete_map={  
        'China': 'red',
        'Culture': 'blue',
        'Silicon Valley': 'green',
        'Online Activity': 'purple',
        'Cybersecurity': 'orange',
        'Tech Companies': 'cyan',
        'Stock': 'magenta',
        'Unlabeled': 'gray'  # Color for unlabeled points
    },
    title="UMAP Visualization of Documents by Category",
    template="plotly_dark",  # Use a dark background
    width=1000,              # Set width
    height=1200               # Set height
)

# Save the interactive plot as an HTML file
fig.write_html("umap_visualization.html")

fig.show()

In [None]:
# Summary plot: how many topics were assigned to each category

# Count the category names in the mapping and order them alphabetically by category
category_counts = pd.Series(topic_to_category.values()).value_counts().sort_index()

# Two-column table for Plotly: category name and number of topics
df_plot = category_counts.rename_axis('Category').reset_index(name='Count')

# Horizontal bar chart on a light theme
bar_settings = dict(
    x='Count',
    y='Category',
    orientation='h',
    title="Number of Topics in Each Category",
    labels={'Count': 'Number of Topics', 'Category': 'Category'},
    template="plotly_white",
)
fig = px.bar(df_plot, **bar_settings)

# Larger axis titles and explicit margins
axis_title_font = dict(size=18)
fig.update_layout(
    xaxis=dict(title='Number of Topics', title_font=axis_title_font),
    yaxis=dict(title='Category', title_font=axis_title_font),
    margin=dict(l=100, r=50, t=50, b=50),
)

fig.show()

# Save the plot as an HTML file
fig.write_html("topics_per_category_bar_chart.html")