# Biterm Topic Modeling (BTM) for Community Notes
# This script performs topic modeling on English Community Notes

In [4]:
!pip install --use-pep517 biterm

Collecting biterm
  Using cached biterm-0.1.5.tar.gz (79 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[50 lines of output][0m
  [31m   [0m <string>:2: _DeprecatedInstaller: setuptools.installer and fetch_build_eggs are deprecated.
  [31m   [0m !!
  [31m   [0m 
  [31m   [0m         ********************************************************************************
  [31m   [0m         Requirements should be satisfied by a PEP 517 installer.
  [31m   [0m         If you are using pip, you can try `pip install --use-pep517`.
  [31m   [0m         ********************************************************************************
  [31m   [0m 
  [31m   [0m !!
  [31m   [0m /Users/yunkaili/spring2025/NLP/project/.ven

In [None]:
import sys
# NOTE(review): this appends a hard-coded absolute site-packages path to the
# import search path, so the cell only has an effect on a machine where that
# exact directory exists.
sys.path.append('/Users/yunkaili/spring2025/NLP/project/.venv/lib/python3.10/site-packages')

In [2]:
import sys
print(sys.executable)

/Users/yunkaili/spring2025/NLP/project/.venv/bin/python


In [3]:
pip --version

pip 25.0.1 from /Users/yunkaili/spring2025/NLP/project/.venv/lib/python3.10/site-packages/pip (python 3.10)
Note: you may need to restart the kernel to use updated packages.


In [1]:
!which python
!which pip

/Users/yunkaili/spring2025/NLP/project/.venv/bin/python
/Users/yunkaili/spring2025/NLP/project/.venv/bin/pip


In [5]:
pip show biterm


[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import numpy as np
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from biterm.btm import oBTM
from biterm.utility import vec_to_biterms
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from wordcloud import WordCloud
import pyLDAvis
import pyLDAvis.sklearn
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Ensure the NLTK data used by the preprocessing step is available locally.
# If any one of the lookups fails, all three packages are downloaded.
try:
    for resource_path in ('corpora/stopwords', 'tokenizers/punkt', 'corpora/wordnet'):
        nltk.data.find(resource_path)
except LookupError:
    for package_name in ('stopwords', 'punkt', 'wordnet'):
        nltk.download(package_name)

# Input file and output directory (relative paths); the directory is created
# on demand.
english_notes_path = "../english_only/english_notes-00000.tsv"
output_dir = "../topics/"
os.makedirs(output_dir, exist_ok=True)

# Read the tab-separated notes file into a DataFrame
print("Loading English notes...")
notes_df = pd.read_csv(english_notes_path, sep='\t')
print(f"Loaded {notes_df.shape[0]} English notes")

# Show which columns were loaded
print("\nData columns:")
print(notes_df.columns.tolist())

# The rest of the script reads the 'summary' column, so fail early without it
has_summary_column = 'summary' in notes_df.columns
if not has_summary_column:
    raise ValueError("The 'summary' column is not found in the dataset.")

# Cap the corpus size; the sample is seeded so reruns pick the same notes
max_notes = 50000  # Adjust this based on your computational resources
if notes_df.shape[0] > max_notes:
    print(f"\nSampling {max_notes} notes for topic modeling due to computational constraints...")
    notes_df = notes_df.sample(n=max_notes, random_state=42)

# Text preprocessing function
# Lazily populated cache so the stop-word set and the lemmatizer are built
# once for the whole corpus instead of once per note.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalize one note into a space-separated string of lemmatized tokens.

    Steps: lowercase; strip URLs, @mentions and #hashtags; strip punctuation
    and digits; tokenize; drop English stop words and tokens of length <= 2;
    lemmatize.

    Args:
        text: The raw note text. Missing values (``None``/NaN) are accepted,
            and non-string values are converted with ``str()``.

    Returns:
        The cleaned text, or ``""`` when the input is missing or nothing is
        left after cleaning.
    """
    if pd.isna(text):
        return ""
    if not isinstance(text, str):
        # Guard against numeric or other non-string cells, which have no .strip()/.lower()
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove URLs, mentions, and hashtags
    text = re.sub(r'http\S+|@\S+|#\S+', '', text)

    # Remove special characters and numbers
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)

    # Nothing left to tokenize (also covers blank input)
    if not text.strip():
        return ""

    stop_words, lemmatizer = _get_preprocess_resources()

    # Tokenize, filter stop words / very short tokens, then lemmatize
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words and len(word) > 2
    ]

    return " ".join(tokens)

# Preprocess the text data
print("\nPreprocessing text data...")
# Series.progress_apply only exists after tqdm has been registered with pandas
tqdm.pandas(desc="Preprocessing")
notes_df['processed_text'] = notes_df['summary'].progress_apply(preprocess_text)

# Remove empty texts after preprocessing
notes_df = notes_df[notes_df['processed_text'].str.strip() != ""]
print(f"After preprocessing, {len(notes_df)} notes remain")

# Save preprocessed data
notes_df[['noteId', 'processed_text']].to_csv(os.path.join(output_dir, 'preprocessed_notes.csv'), index=False)

# Vectorize the texts
print("\nVectorizing the texts...")
vec = CountVectorizer(max_df=0.8, min_df=10)
X = vec.fit_transform(notes_df['processed_text']).toarray()
vocab = np.array(vec.get_feature_names_out())
print(f"Vocabulary size: {len(vocab)}")

# Generate biterms (one list of word-index pairs per document)
print("Generating biterms...")
biterms = vec_to_biterms(X)

# A note with fewer than two distinct in-vocabulary words has no biterms.
# Its topic distribution would be 0/0 = NaN, and argmax would then silently
# assign it to topic 0, so such notes are dropped before training.
has_biterms = np.array([len(doc_biterms) > 0 for doc_biterms in biterms], dtype=bool)
if not has_biterms.all():
    print(f"Dropping {int((~has_biterms).sum())} notes that produce no biterms")
    notes_df = notes_df[has_biterms].copy()
    X = X[has_biterms]
    biterms = [doc_biterms for doc_biterms, keep in zip(biterms, has_biterms) if keep]

# len(biterms) is the number of documents; count the pairs themselves
print(f"Number of biterms: {sum(len(doc_biterms) for doc_biterms in biterms)}")

# Set up BTM model
# Tune the number of topics based on your dataset
num_topics = 10
# Per the biterm package README, V is the vocabulary array itself
# (the model calls len(V)), not the vocabulary size.
btm = oBTM(num_topics=num_topics, V=vocab)

# Train the model
print(f"\nTraining BTM model with {num_topics} topics...")
btm.fit(biterms, iterations=20)
print("Training complete!")

# Get the document-topic distribution; transform() consumes the per-document
# biterm lists, not the count matrix.
topics = btm.transform(biterms)
print("Topic distribution matrix shape:", topics.shape)

# Save topic model
np.save(os.path.join(output_dir, 'topics_matrix.npy'), topics)
np.save(os.path.join(output_dir, 'vocabulary.npy'), vocab)

# Assign each document its most probable topic and that topic's probability
notes_df['topic'] = topics.argmax(axis=1)
notes_df['topic_probability'] = topics.max(axis=1)

# Save documents with assigned topics
notes_df[['noteId', 'topic', 'topic_probability']].to_csv(
    os.path.join(output_dir, 'notes_with_topics.csv'), index=False)

# Generate topic visualization
def _top_words_per_topic(btm_model, vocab, num_topics, num_top_words):
    """Return one ``{word: weight}`` dict per topic holding its highest-weight words."""
    if hasattr(btm_model, 'get_topic_words'):
        # Keep supporting models that expose this helper directly
        per_topic = btm_model.get_topic_words(num_words=num_top_words)
        return [dict(per_topic[k]) for k in range(num_topics)]

    # NOTE(review): biterm's oBTM is expected to expose the fitted topic-word
    # matrix as `phi_wz` with shape (vocabulary, topics) - confirm against the
    # installed biterm version.
    phi_wz = np.asarray(btm_model.phi_wz)
    words = np.asarray(vocab)
    top_words = []
    for k in range(num_topics):
        top_idx = np.argsort(phi_wz[:, k])[::-1][:num_top_words]
        top_words.append({str(words[j]): float(phi_wz[j, k]) for j in top_idx})
    return top_words


def visualize_topics(btm_model, vocab, num_topics, num_top_words=15):
    """Save per-topic word clouds, a topic-word table and a topic-size bar chart.

    Everything is written under ``<output_dir>/topic_visualizations``. The
    function reads the module-level ``output_dir`` and ``notes_df`` (the latter
    must already contain a ``'topic'`` column).

    Args:
        btm_model: Fitted topic model.
        vocab: Vocabulary words, indexed like the rows of the model's
            topic-word matrix.
        num_topics: Number of topics to visualize.
        num_top_words: How many top words to keep per topic.

    Returns:
        DataFrame indexed by word with one ``Topic_<n>`` column per topic;
        a cell is NaN when the word is not among that topic's top words.
    """
    # Create directory for topic visualizations
    topic_vis_dir = os.path.join(output_dir, 'topic_visualizations')
    os.makedirs(topic_vis_dir, exist_ok=True)

    # Get topic words distribution
    topic_words = _top_words_per_topic(btm_model, vocab, num_topics, num_top_words)

    # For each topic, create a word cloud image
    for i, words in enumerate(topic_words):
        wordcloud = WordCloud(width=800, height=400, background_color='white', max_words=num_top_words)
        wordcloud.generate_from_frequencies(words)

        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(f'Topic {i+1}')
        plt.savefig(os.path.join(topic_vis_dir, f'topic_{i+1}_wordcloud.png'), bbox_inches='tight')
        plt.close()

    # Build the table in one step so the index is the union of every topic's
    # words. Assigning columns one by one would align each topic to the first
    # topic's words and silently drop the rest.
    topic_word_df = pd.DataFrame(
        {f'Topic_{i+1}': pd.Series(words, dtype=float) for i, words in enumerate(topic_words)}
    )

    # Save topic words to CSV
    topic_word_df.to_csv(os.path.join(topic_vis_dir, 'topic_words.csv'))

    # Get documents per topic
    topic_counts = notes_df['topic'].value_counts().sort_index()

    # Plot topic distribution
    plt.figure(figsize=(12, 6))
    ax = sns.barplot(x=topic_counts.index, y=topic_counts.values)
    plt.title('Number of Documents per Topic')
    plt.xlabel('Topic')
    plt.ylabel('Number of Documents')

    # Add count labels just above the bars; anchoring at the bar top keeps the
    # label placement independent of the scale of the counts.
    for i, count in enumerate(topic_counts.values):
        ax.text(i, count, str(count), ha='center', va='bottom')

    plt.savefig(os.path.join(topic_vis_dir, 'topic_distribution.png'), bbox_inches='tight')
    plt.close()

    return topic_word_df

# Generate visualizations
print("\nGenerating topic visualizations...")
topic_words_df = visualize_topics(btm, vocab, num_topics)

# Create an interactive visualization (optional, best effort)
try:
    print("\nCreating interactive visualization...")
    # pyLDAvis.sklearn.prepare expects a fitted scikit-learn LDA estimator,
    # which oBTM is not. Feed the BTM matrices to the generic
    # pyLDAvis.prepare instead, as the biterm README does.
    doc_topic = np.asarray(topics, dtype=float)
    # pyLDAvis validates that every document row is a proper distribution, so
    # skip any rows that are not finite.
    valid_docs = np.isfinite(doc_topic).all(axis=1)
    panel = pyLDAvis.prepare(
        np.asarray(btm.phi_wz).T,        # topic-term distributions (topics x vocabulary)
        doc_topic[valid_docs],           # document-topic distributions
        X[valid_docs].sum(axis=1),       # document lengths in vocabulary tokens
        vocab,
        X.sum(axis=0),                   # corpus-wide term frequencies
    )
    pyLDAvis.save_html(panel, os.path.join(output_dir, 'topic_visualization.html'))
    print(f"Interactive visualization saved to {os.path.join(output_dir, 'topic_visualization.html')}")
except Exception as e:
    # The HTML view is optional: report the failure and carry on
    print(f"Could not create interactive visualization: {str(e)}")

# Extract some example notes from each topic
def save_topic_examples(df, num_examples=5):
    """Write the highest-probability notes of every topic to a text report.

    The report goes to ``<output_dir>/topic_examples/topic_examples.txt``.
    Uses the module-level ``output_dir`` and ``num_topics``.

    Args:
        df: DataFrame with 'topic', 'topic_probability', 'summary' and
            'processed_text' columns.
        num_examples: Maximum number of notes listed per topic.
    """
    examples_dir = os.path.join(output_dir, 'topic_examples')
    os.makedirs(examples_dir, exist_ok=True)
    report_path = os.path.join(examples_dir, 'topic_examples.txt')

    with open(report_path, 'w', encoding='utf-8') as report:
        for topic_id in range(num_topics):
            report.write(f"===== TOPIC {topic_id+1} =====\n\n")

            # Rank this topic's notes from most to least probable
            in_topic = df.loc[df['topic'] == topic_id]
            ranked = in_topic.sort_values('topic_probability', ascending=False)
            best_rows = ranked.head(num_examples)

            for rank, (_, note) in enumerate(best_rows.iterrows(), start=1):
                report.write(
                    f"Example {rank} (probability: {note['topic_probability']:.4f}):\n"
                    f"Original: {note['summary']}\n"
                    f"Processed: {note['processed_text']}\n"
                    "\n"
                )

            report.write("\n\n")

# Save topic examples
# Writes topic_examples/topic_examples.txt under output_dir from the notes
# that already carry the 'topic' and 'topic_probability' columns.
print("\nSaving example notes for each topic...")
save_topic_examples(notes_df)

print("\nTopic modeling complete! Results saved to:", output_dir)

# OPTIONAL: Hyperparameter tuning for number of topics
def tune_num_topics(biterms, vocab_size, topic_range, iterations=10, vocabulary=None):
    """Fit one BTM model per candidate topic count and collect coherence scores.

    Args:
        biterms: Per-document biterm lists used to fit each model.
        vocab_size: Vocabulary size (int). For convenience the vocabulary
            array itself is also accepted here.
        topic_range: Iterable of topic counts to try.
        iterations: Number of sampling iterations per model.
        vocabulary: Optional vocabulary array. When omitted, it is taken from
            ``vocab_size`` (an index range is used if only the size is known).

    Returns:
        List of coherence scores, one per entry of ``topic_range``, in order.
    """
    # oBTM calls len(V) on its vocabulary argument (see the biterm README,
    # which passes the vocabulary array), so a bare integer size cannot be
    # forwarded as-is.
    if vocabulary is None:
        if np.ndim(vocab_size) == 0:
            vocabulary = np.arange(int(vocab_size))
        else:
            vocabulary = np.asarray(vocab_size)

    coherence_scores = []

    for k in topic_range:
        print(f"Testing model with {k} topics...")
        model = oBTM(num_topics=k, V=vocabulary)
        model.fit(biterms, iterations=iterations)

        # Score the model with the vocabulary resolved above rather than a
        # module-level global, so the function only depends on its arguments.
        coherence = calculate_coherence(model, vocabulary)
        coherence_scores.append(coherence)

        print(f"Topics: {k}, Coherence: {coherence}")

    return coherence_scores

def calculate_coherence(model, vocabulary):
    """Return a coherence score for ``model`` (currently a fixed stub value).

    Both arguments are ignored for now: no real metric is computed and every
    model receives the same constant score. A proper metric such as NPMI or
    UMass should replace this stub.
    """
    placeholder_score = 0.5  # Placeholder
    return placeholder_score

# Hyperparameter tuning is disabled: the code below is wrapped in a string
# literal and never executes. Remove the surrounding triple quotes to run it.
# It calls tune_num_topics for 5, 10, ..., 25 topics and saves a line plot of
# the returned scores to topic_coherence.png in output_dir.
"""
print("\nRunning hyperparameter tuning for number of topics...")
topic_range = range(5, 30, 5)
coherence_scores = tune_num_topics(biterms, vocab.size, topic_range)

# Plot coherence scores
plt.figure(figsize=(10, 6))
plt.plot(list(topic_range), coherence_scores, marker='o')
plt.xlabel('Number of Topics')
plt.ylabel('Coherence Score')
plt.title('Topic Coherence by Number of Topics')
plt.grid(True)
plt.savefig(os.path.join(output_dir, 'topic_coherence.png'), bbox_inches='tight')
plt.close()
"""