### Openers

In [None]:
import pandas as pd

def _peek_columns(csv_path, label):
    """Read a single data row of `csv_path`, print its column labels and return the frame.

    Only one row is read (`nrows=1`) because the header is all that is needed here.
    `label` is the short file name shown in the printed heading.
    """
    preview = pd.read_csv(csv_path, nrows=1)
    print(f"Column labels for '{label}':")
    print(preview.columns.tolist())
    return preview


# Sentence-level file first, then the document-level file
df1 = _peek_columns('G:/BERTopic/attachment/attach_sen2_sentencedata.csv',
                    'attach_sen2_sentencedata.csv')
df2 = _peek_columns('G:/BERTopic/attachment/attach_processed_length10.csv',
                    'attach_processed_length10.csv')


In [None]:
import torch
# Report the CUDA environment seen by PyTorch.
# The device-specific queries (current_device / get_device_name) raise when no
# usable CUDA device exists, so they are only made after the availability check;
# on a CPU-only machine the cell now reports that fact instead of crashing.
cuda_available = torch.cuda.is_available()
gpu_count = torch.cuda.device_count()
print("CUDA availability:", cuda_available)
print("Number of GPUs:", gpu_count)
if cuda_available and gpu_count > 0:
    active_gpu = torch.cuda.current_device()
    print("Active GPU:", active_gpu)
    # Name the device that is actually active rather than assuming index 0
    print("GPU name:", torch.cuda.get_device_name(active_gpu))
else:
    print("No CUDA device detected; PyTorch will run on the CPU.")


# BERTopic

## Step 1: Preparation and Embedding Generation (To Be Run Once)

In [None]:
import pandas as pd
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import numpy as np
import pickle

import os  # local import: only needed to create the output folders below

# Set the embedding save path
embeddings_path = 'G:/BERTopic/attachment/models/attach_sen2_embeddings.npy'

# Path to save the dataframe
dataframe_save_path = 'attachment/attach_sen2_sentencedata.csv'
# NOTE(review): the Step 2 cell loads both files from 'G:/BERTopic/attachment/models/sen2/...'.
# Either move the outputs there after this cell or align the paths - confirm which is intended.

# Load your dataset
data_path = 'G:/BERTopic/attachment/attach_processed_length10.csv'  # Update with your actual data path
df = pd.read_csv(data_path)

# Ensure there's a unique ID for each document
if 'id' not in df.columns:
    df['id'] = range(len(df))

# Create the output folders up front so a missing directory cannot make the
# save fail after the (expensive) embedding pass has already completed.
for target_path in (embeddings_path, dataframe_save_path):
    target_dir = os.path.dirname(target_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

# Tokenize documents into sentences and map sentences to their originating document ID.
# Iterating over the two columns directly avoids the per-row Series that iterrows() builds.
all_sentences = []
doc_id_for_each_sentence = []
for doc_id, text in zip(df['id'], df['text']):
    if pd.isna(text):
        # A missing text is read by pandas as a float NaN, which sent_tokenize cannot handle
        continue
    sentences = sent_tokenize(str(text))
    all_sentences.extend(sentences)
    # IDs are stored as strings so they survive the CSV round trip unchanged
    doc_id_for_each_sentence.extend([str(doc_id)] * len(sentences))

# Generate embeddings for each sentence
model = SentenceTransformer('thenlper/gte-large')
embeddings = model.encode(all_sentences, show_progress_bar=True)

# Row i of `embeddings` must describe all_sentences[i]; Step 2 relies on this alignment
assert len(embeddings) == len(all_sentences), "embedding/sentence count mismatch"

np.save(embeddings_path, embeddings)

# Save the 'all_sentences' and 'doc_id_for_each_sentence' to a dataframe and then to a CSV
sentence_data = pd.DataFrame({
    'all_sentences': all_sentences,
    'doc_id_for_each_sentence': doc_id_for_each_sentence
})
sentence_data.to_csv(dataframe_save_path, index=False)

print(f"Embeddings saved to {embeddings_path}")
print(f"Sentence data saved to {dataframe_save_path}")

## Step 2: BERTopic (To Be Run for Each Experiment)

### Running BERTopic

In [None]:
import logging
import pickle
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sentence_transformers import SentenceTransformer
import collections
from bertopic.representation import KeyBERTInspired
from tqdm import tqdm
from collections import defaultdict
import numpy as np

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

### Paths set in CODE BLOCK 1 ###
embeddings_path = 'G:/BERTopic/attachment/models/sen2/attach_sen2_embeddings.npy'
sentence_data_load_path = 'G:/BERTopic/attachment/models/sen2/attach_sen2_sentencedata.csv'
original_data_path = 'G:/BERTopic/attachment/attach_processed_length10.csv'

### Paths for saving results ###
doc_topics_data_path = 'G:/BERTopic/attachment/attach_sen2_SenTopics.csv'
model_save_path = 'G:/BERTopic/attachment/models/sen2/attach_sen2_model_dir'

# Hugging Face identifier of the embedding model. It is used both to build the model
# and as the pointer stored with the saved topic model (see the save call below).
embedding_model_name = 'thenlper/gte-large'

logging.info("Loading Embeddings")
# Load the embeddings
embeddings = np.load(embeddings_path)

# Load the sentence data from the CSV
sentence_data = pd.read_csv(sentence_data_load_path)
# Sentences are forced to str because read_csv turns empty / NA-like cells into float NaN
all_sentences = [str(sentence) for sentence in sentence_data['all_sentences'].tolist()]
# IDs were written as strings in Step 1; read_csv may re-infer them as integers
doc_id_for_each_sentence = sentence_data['doc_id_for_each_sentence'].astype(str).tolist()

# The embeddings are passed to BERTopic positionally, so both inputs must line up
if len(all_sentences) != embeddings.shape[0]:
    raise ValueError(
        f"{len(all_sentences)} sentences but {embeddings.shape[0]} embeddings - "
        "the sentence CSV and the .npy file do not come from the same Step 1 run"
    )

logging.info("Loading Dataset")
# Load the original dataset
df = pd.read_csv(original_data_path)

# Step 1 only adds 'id' in memory, so the reloaded CSV may lack it; recreate it the same way
if 'id' not in df.columns:
    df['id'] = range(len(df))

# Ensure there's a 'docs' column if 'text' is used in CSV
if 'docs' not in df.columns and 'text' in df.columns:
    df.rename(columns={'text': 'docs'}, inplace=True)

logging.info("Preparing CountVectorizer")
vocab = collections.Counter()
tokenizer = CountVectorizer(ngram_range=(1, 3)).build_tokenizer()
for doc in df['docs'].dropna().astype(str).tolist():
    vocab.update(tokenizer(doc))
vocab = [word for word, frequency in vocab.items() if frequency >= 100]
logging.info(f"Vocabulary extracted. Size: {len(vocab)}")
# NOTE(review): `vocab` is only logged - it is never handed to the CountVectorizer below
# (e.g. via `vocabulary=vocab`). Confirm whether restricting the vocabulary was intended.

logging.info("Preparing BERTopic model...")
# BERTopic model preparation
embedding_model = SentenceTransformer(embedding_model_name)

# UMAP parameters
# BERTopic fits this model itself inside fit_transform, so it is not fitted here:
# a separate fit_transform call would only repeat the most expensive step.
umap_model = UMAP(
        n_components=5,  # has a wild impact, hard to predict
        n_neighbors=60,  # higher captures more global structure
        min_dist=0.01,   # lower value means more dense packing
        random_state=42, # reproducibility
        metric="cosine", # have to pick something
        n_jobs=-1        # speed
        )

# HDBSCAN model
hdbscan_model = HDBSCAN(
            min_cluster_size=200,           # smallest size group considered
            min_samples=25,                 # larger is more conservative - more noise
            leaf_size=40,                   # number of points per leaf node in the tree - default 40
            gen_min_span_tree=True,         # True creates minimum spanning trees - increasing RAM
            prediction_data=True,           # generates extra cached data of prediction labels for new data or reuse
            cluster_selection_method='leaf', # eom is normal - leaf might get more homogeneous clusters
            cluster_selection_epsilon=0.0,  # default - merges clusters below threshold
            core_dist_n_jobs=-1,            # For speed
            )

vectorizer_model = CountVectorizer(stop_words="english")
representation_model = KeyBERTInspired()

# Using these in BERTopic
topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_model,
        verbose=True
)

logging.info("Fitting BERTopic model...")
topics, probs = topic_model.fit_transform(all_sentences, embeddings)

# Save the model first so a problem in the table-building below cannot lose the fit.
# With safetensors serialization `save_embedding_model` must be the model's string
# identifier; passing the SentenceTransformer object stores no pointer, and the
# reloaded model then cannot embed new documents in transform().
logging.info("Saving BERTopic model...")
topic_model.save(model_save_path, serialization="safetensors", save_ctfidf=True,
                 save_embedding_model=embedding_model_name)
logging.info("BERTopic model saved successfully as a .safetensors file.")

# Putting stuff back into the DF for saving
sentence_level_df = pd.DataFrame({
    'sentence_id': range(len(all_sentences)),
    'id': doc_id_for_each_sentence,
    'topic_id': topics,
    'probability': probs,
    'sentence_text': all_sentences
})

# Merge to include additional columns from the original dataset.
# The key is cast to str on both sides so the join does not depend on CSV dtype inference.
metadata_columns = ['id', 'link_id', 'author', 'created_utc', 'subreddit', 'score', 'author_flair_text']
document_metadata = df[metadata_columns].copy()
document_metadata['id'] = document_metadata['id'].astype(str)
if not document_metadata['id'].is_unique:
    logging.warning("Document ids are not unique; the merge will repeat sentences for duplicated ids.")
merged_df = pd.merge(sentence_level_df, document_metadata, on='id', how='left')

# Save the merged DataFrame as a CSV
merged_df.to_csv(doc_topics_data_path, index=False)
logging.info("Merged BERTopic sentence-level analysis data saved successfully as CSV.")


### Passing documents with outlier reduction - c-tf-idf

In [None]:
from bertopic import BERTopic
import pandas as pd

print("Step 1: Loading the model...")
# Load the BERTopic model
topic_model = BERTopic.load('G:/BERTopic/attachment/models/attach_sen2_DocTopics_model_dir')

print("Step 2: Preparing the documents...")
# Load the data
doc_file = 'attachment/attach_processed_length10.csv'
df = pd.read_csv(doc_file, usecols=['text', 'created_utc'], low_memory=False)

docs = df['text'].tolist()
# Timestamps for the topics-over-time step; previously this name was never defined in the cell.
# NOTE(review): values are passed through as stored in 'created_utc' (presumably Unix
# epoch seconds) - convert with pd.to_datetime(..., unit='s') if real dates are wanted.
timestamps = df['created_utc'].tolist()

print("Step 3: Applying the model...")
# Apply the model to the documents
topics, probs = topic_model.transform(docs)

print("Step 4: Reducing outliers...")
# reduce_outliers returns a single list of topic ids (one per document), not a tuple,
# so its result must not be unpacked.
# Use the "c-TF-IDF" strategy with a threshold
new_topics = topic_model.reduce_outliers(docs, topics, strategy="c-tf-idf", threshold=0.1)

# Reduce all outliers that are left with the "distributions" strategy
final_topics = topic_model.reduce_outliers(docs, new_topics, strategy="distributions")

print("Step 5: Updating the topic representations...")
# Update the topic representations with the new topics
topic_model.update_topics(docs, topics=final_topics)

print("Step 6: Performing topics over time analysis...")
# Perform topics over time analysis
topics_over_time = topic_model.topics_over_time(docs, final_topics, timestamps, nr_bins=40)

print("Step 7: Saving the results...")
# Save the topics and probabilities to the DataFrame.
# 'Probability' comes from transform(), i.e. it belongs to the topic assigned
# before outlier reduction, not necessarily to the final 'Topic'.
df['Topic'] = final_topics
df['Probability'] = probs

# Save the DataFrame with the updated topics and probabilities
df.to_excel('G:/BERTopic/attachment/analysis/attach_sen2_DocTopics.xlsx', index=False)

# Save the topics over time results
topics_over_time.to_excel('G:/BERTopic/attachment/analysis/attach_sen2_TopicsOverTime.xlsx', index=False)

In [None]:
# Column labels after LIWC analysis:

# ID	Topic #	Document Text	Subreddit	Probability	Score	Author	link_id	created_utc	author_flair_text	Segment	WC	Analytic	Clout	Authentic	Tone	WPS	BigWords	Dic	Linguistic	function	pronoun	ppron	i	we	you	shehe	they	ipron	det	article	number	prep	auxverb	adverb	conj	negate	verb	adj	quantity	Drives	affiliation	achieve	power	Cognition	allnone	cogproc	insight	cause	discrep	tentat	certitude	differ	memory	Affect	tone_pos	tone_neg	emotion	emo_pos	emo_neg	emo_anx	emo_anger	emo_sad	swear	Social	socbehav	prosocial	polite	conflict	moral	comm	socrefs	family	friend	female	male	Culture	politic	ethnicity	tech	Lifestyle	leisure	home	work	money	relig	Physical	health	illness	wellness	mental	substances	sexual	food	death	need	want	acquire	lack	fulfill	fatigue	reward	risk	curiosity	allure	Perception	attention	motion	space	visual	auditory	feeling	time	focuspast	focuspresent	focusfuture	Conversation	netspeak	assent	nonflu	filler	AllPunc	Period	Comma	QMark	Exclam	Apostro	OtherP	Emoji


### Passing Documents without Outlier Reduction

In [None]:
# PASS DOCUMENTS WITHOUT USING AN OUTLIER REDUCTION STRATEGY

from bertopic import BERTopic
import pandas as pd
import numpy as np

# Defined paths
data_save_path = 'G:/BERTopic/attachment/attach_processed_length10.csv'
model_save_path = 'G:/BERTopic/attachment/models/attach_sen1_model.pkl'

# Load the documents and the BERTopic model
df_docs = pd.read_csv(data_save_path)
# Same fallback as in Step 1: the CSV is not guaranteed to carry an 'id' column
if 'id' not in df_docs.columns:
    df_docs['id'] = range(len(df_docs))
documents = df_docs['text'].tolist()
bertopic_model = BERTopic.load(model_save_path)

# Apply the model to the documents
topics, probs = bertopic_model.transform(documents)

# Create a DataFrame with the original document texts and their assigned topics
df_topics = pd.DataFrame({'id': df_docs['id'], 'document': documents, 'topic': topics})

# Determine if the row is a comment or submission based on the 'title' column
df_docs['type'] = np.where(df_docs['title'].isna(), 'comment', 'submission')

# Attach the type, subreddit, and author information.
# df_topics was built row-for-row from df_docs (same index), so the columns are
# assigned directly; joining the frame to itself on 'id' would repeat rows
# whenever an id occurs more than once.
merged_df = df_topics.assign(
    type=df_docs['type'],
    subreddit=df_docs['subreddit'],
    author=df_docs['author'],
)

# Reorder and select the specified columns to match the final requirement
final_df = merged_df[['id', 'document', 'type', 'subreddit', 'author', 'topic']]

# Save the enriched DataFrame to an .xlsx file
final_df.to_excel('G:/BERTopic/attachment/analysis/attach_sen1_doc_level_enriched.xlsx', index=False)


# Loading models

In [None]:
import pickle
from bertopic import BERTopic
import pandas as pd

print("Step 1: Loading the model...")
# Load the BERTopic model.
# The file is loaded once, through BERTopic.load. The earlier extra pickle.load of
# this same file into `df` was dropped: it unpickled the model (not a DataFrame) and
# its result was overwritten straight away by the CSV read below.
# NOTE: a .pkl model is unpickled on load - only load files from a trusted source.
topic_model = BERTopic.load('G:/BERTopic/attachment/models/attach_sen2_DocTopics_model.pkl')

print("Step 2: Preparing the documents...")
# Load the data from the path defined here, not from whatever `data_path`
# happens to hold from another cell.
doc_file = 'attachment/attach_processed_length10.csv'
# NOTE(review): 'created_utc' is the time column the other cells read from this file;
# the former 'timestamps' column name does not appear anywhere else - confirm.
df = pd.read_csv(doc_file, usecols=['text', 'created_utc'], low_memory=False)

# Specify what the 'docs' are
docs = df['text'].tolist()
timestamps = df['created_utc'].tolist()

In [None]:
import pickle
from bertopic import BERTopic
import pandas as pd

# Load the BERTopic model
# NOTE: a .pkl model is unpickled on load - only load files from a trusted source.
topic_model = BERTopic.load('G:/BERTopic/attachment/models/attach_sen2_DocTopics_model.pkl')

# Load the data from the path defined here, not from whatever `data_path`
# happens to hold from another cell.
doc_file = 'attachment/attach_processed_length10.csv'
# NOTE(review): 'created_utc' is the time column the other cells read from this file;
# the former 'timestamps' column name does not appear anywhere else - confirm.
df = pd.read_csv(doc_file, usecols=['text', 'created_utc'], low_memory=False)

print("Step 2: Preparing the documents...")
# Specify what the 'docs' are
docs = df['text'].tolist()
timestamps = df['created_utc'].tolist()

# Analysis

### Big File

In [None]:
from bertopic import BERTopic

# Extract the results from the already-loaded `topic_model`
topics = topic_model.get_topics()
topic_freq = topic_model.get_topic_freq()
topic_info = topic_model.get_topic_info()
representative_docs = topic_model.get_representative_docs()

txt_file_path = 'attachment/analysis/attach_sen2_DocTopics_analysis.txt'
csv_file_path = 'attachment/analysis/attach_sen2_DocTopics_analysis.csv'

# Save the results in a more structured and readable manner.
# UTF-8 is set explicitly: the platform default encoding may be unable to
# represent characters that occur in the documents (emoji, accented letters).
with open(txt_file_path, 'w', encoding='utf-8') as f:
    # Topics: each term is a (word, score) pair; only the word is written
    f.write("TOPICS:\n")
    for topic_num, terms in topics.items():
        terms_str = ', '.join([term[0] for term in terms])
        f.write(f"Topic {topic_num}: {terms_str}\n")
    f.write("\n")

    # Topic Frequency
    f.write("TOPIC FREQUENCY:\n")
    for topic_num, count in zip(topic_freq['Topic'], topic_freq['Count']):
        f.write(f"Topic {topic_num}: {count} entries\n")
    f.write("\n")

    # Representative Docs
    # The loop variables are deliberately not called `docs`/`doc`: at cell level
    # that would overwrite the notebook-wide `docs` list used by other cells.
    f.write("REPRESENTATIVE DOCS:\n")
    for topic_num, topic_rep_docs in representative_docs.items():
        f.write(f"Topic {topic_num} representative docs:\n")
        for rep_doc in topic_rep_docs:
            f.write(f"  - {rep_doc}\n")
        f.write("\n")

# Convert 'topic_info' DataFrame directly to CSV
topic_info.to_csv(csv_file_path, index=False)


### Pulling Documents from Target Topics

In [None]:
import pandas as pd

# Input workbook (documents with topic assignments and LIWC scores) and output location
data_path = 'G:/BERTopic/attachment/models/attach_sen1_doc_topics_liwc.xlsx'
output_path = 'G:/BERTopic/attachment/analysis/attach_sen1_juanita_topics.xlsx'

# Topic numbers to extract
topics = [14, 20, 51, 54, 83, 98, 102, 112, 116]

# Read the workbook and keep only the rows whose 'Topic #' is one of the targets
df = pd.read_excel(data_path)
filtered_docs = df.loc[df['Topic #'].isin(topics)]

# Write the selection to a new Excel file, without the index column
filtered_docs.to_excel(output_path, index=False)


In [None]:
import pandas as pd

from bertopic import BERTopic  # this cell uses BERTopic but did not import it

# Load the dataset
data_path = 'G:/BERTopic/attachment/models/attach_sen1_doc_topics_liwc.xlsx'
df = pd.read_excel(data_path)

# BERTopic model loading and specified topics
topic_model = BERTopic.load(model_save_path)  # Ensure model_save_path is defined
topics = [14, 20, 51, 54, 83, 98, 102, 112, 116]

# Identifying / metadata columns to keep in the per-document sheet
filtered_columns = ['ID', 'Topic #', 'Document Text', 'Subreddit', 'Probability', 'Score', 'Author', 'link_id', 'created_utc', 'author_flair_text', 'author_flair_richtext']

# `stats_columns` was used below without ever being defined.
# NOTE(review): it is taken here to be every numeric column that is not one of the
# metadata columns above (i.e. presumably the LIWC measures) - confirm, or replace
# with an explicit list.
stats_columns = [
    column for column in df.select_dtypes(include='number').columns
    if column not in filtered_columns
]

# Retrieve representations and calculate mean statistics.
# One record per topic is collected and the frame is built once at the end,
# instead of growing a DataFrame with pd.concat on every iteration.
summary_rows = []
for topic in topics:
    topic_representation = topic_model.get_topic(topic)
    if topic_representation:
        representation_str = ', '.join([f"{word} ({score:.2f})" for word, score in topic_representation])
    else:
        # get_topic gives a falsy result when the model has no such topic
        representation_str = ''

    topic_docs = df[df['Topic #'] == topic]
    summary_row = topic_docs[stats_columns].mean().to_dict()  # Compute mean
    summary_row['Topic #'] = topic
    summary_row['Representations'] = representation_str
    summary_rows.append(summary_row)

# Rearrange columns as specified for aggregated_stats_df
columns_order = ['Topic #', 'Representations'] + stats_columns
aggregated_stats_df = pd.DataFrame(summary_rows, columns=columns_order)

# Filter documents and rearrange columns as specified for filtered_docs.
# Only columns that exist in the workbook are selected, so one absent
# metadata column (e.g. 'author_flair_richtext') does not abort the export.
available_columns = [column for column in filtered_columns if column in df.columns]
filtered_docs = df.loc[df['Topic #'].isin(topics), available_columns]

# Save to Excel with the two sheets
output_path = 'G:/BERTopic/attachment/analysis/attach_sen1_juanita_topics.xlsx'
with pd.ExcelWriter(output_path) as writer:
    aggregated_stats_df.to_excel(writer, sheet_name='Descriptive Statistics', index=False)
    filtered_docs.to_excel(writer, sheet_name='Filtered Documents', index=False)

