In [1]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
from umap import UMAP
import hdbscan
import logging
import warnings
warnings.filterwarnings("ignore")
logging.basicConfig(level=logging.WARNING)

# Set random seed for reproducibility
np.random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def preprocess_text(text):
    """Clean and preprocess dialogue text"""
    if not isinstance(text, str):
        return ""
    
    # Remove leading/trailing whitespace
    text = text.strip()
    
    # Remove scene directions in parentheses or brackets
    text = re.sub(r'\(.*?\)', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    
    return text

In [3]:
def extract_speaker(person_scene):
    """Extract the speaker name from the person_scene column"""
    if not isinstance(person_scene, str):
        return None
    
    # Extract the speaker name (assuming format is "Name" or "Name:")
    match = re.match(r'^([^:]+)', person_scene.strip())
    if match:
        return match.group(1).strip()
    return None

In [4]:
def extract_sheldon_conversations(df, main_cast):
    """
    Extract conversations between Sheldon and main cast members
    Returns a dictionary with cast member names as keys and lists of conversations as values
    """
    # Initialize dictionary to store conversations
    sheldon_conversations = {member: [] for member in main_cast if member != "Sheldon"}
    sheldon_conversations["all"] = []  # For all Sheldon dialogues
    
    # Sort by series, episode, and implied sequence
    df = df.sort_values(by=["series", "episode"])
    
    # Group by episode
    episode_groups = df.groupby(["series", "episode"])
    
    for (series, episode), episode_df in episode_groups:
        # Reset index to get sequence within episode
        episode_df = episode_df.reset_index(drop=True)
        
        # Process each line in the episode
        for i in range(len(episode_df) - 1):  # -1 to avoid index out of bounds
            current_row = episode_df.iloc[i]
            next_row = episode_df.iloc[i + 1]
            
            # Extract speaker names
            current_speaker = extract_speaker(current_row["person_scene"])
            next_speaker = extract_speaker(next_row["person_scene"])
            
            # Skip if either speaker is not identifiable
            if not current_speaker or not next_speaker:
                continue
            
            # If Sheldon is speaking
            if current_speaker == "Sheldon":
                # Add to all Sheldon dialogues
                cleaned_dialogue = preprocess_text(current_row["dialogue"])
                if cleaned_dialogue:
                    sheldon_conversations["all"].append(cleaned_dialogue)
                
                # If the next speaker is a main cast member
                if next_speaker in main_cast and next_speaker != "Sheldon":
                    sheldon_line = preprocess_text(current_row["dialogue"])
                    response_line = preprocess_text(next_row["dialogue"])
                    
                    # Add the dialogue pair to the corresponding cast member's list
                    if sheldon_line and response_line:
                        sheldon_conversations[next_speaker].append(
                            {"sheldon": sheldon_line, "response": response_line}
                        )
            
            # If a main cast member is speaking and Sheldon responds
            elif current_speaker in main_cast and next_speaker == "Sheldon":
                cast_line = preprocess_text(current_row["dialogue"])
                sheldon_line = preprocess_text(next_row["dialogue"])
                
                # Add to all Sheldon dialogues
                if sheldon_line:
                    sheldon_conversations["all"].append(sheldon_line)
                
                # Add the dialogue pair to the corresponding cast member's list
                if cast_line and sheldon_line:
                    sheldon_conversations[current_speaker].append(
                        {"cast": cast_line, "sheldon": sheldon_line}
                    )
    
    return sheldon_conversations

In [6]:
def run_bertopic_analysis(conversations, output_prefix=""):
    """
    Run BERTopic analysis on the conversations
    
    Args:
        conversations: List of text documents to analyze
        output_prefix: Prefix for output files
    
    Returns:
        BERTopic model and topics
    """
    # Initialize sentence transformer model
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    
    # Initialize dimensionality reduction with lower n_components to avoid the eigenvalue error
    # Reducing n_components to 2 instead of 5
    umap_model = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=42)
    
    # Initialize clustering model with a smaller min_cluster_size
    # Change min_cluster_size from 15 to a smaller value if dataset is small
    hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    
    # Initialize vectorizer with n-grams
    vectorizer = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.01, max_df=0.9)
    
    # Initialize BERTopic model
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer,
        verbose=True
    )
    
    # Fit model and transform documents
    topics, probs = topic_model.fit_transform(conversations)
    
    n_real_topics = len({t for t in topics if t != -1})   # exclude outliers (-1)
    
    # Modify visualization approach to avoid the error
    if n_real_topics > 2:
        try:
            # Try to visualize topics, but with a contingency plan
            fig = topic_model.visualize_topics()
            fig.write_html(f"{output_prefix}topic_visualization.html")
        except TypeError as e:
            # If we get the eigenvalue error, handle it gracefully
            print(f"Visualization error: {e}")
            print("Skipping interactive visualization, but still saving topic information.")
    else:
        print(f"{n_real_topics} topic(s) detected – skipping 2-D visualization.")

    # 4. exports --------------------------------------------------------------
    topic_info = topic_model.get_topic_info()
    topic_info.to_csv(f"{output_prefix}topic_info.csv", index=False)

    with open(f"{output_prefix}top_topics.txt", "w") as f:
        for topic in topic_info.itertuples():
            if topic.Topic != -1:
                f.write(f"Topic {topic.Topic}: {topic.Name}\n")
                words = topic_model.get_topic(topic.Topic)
                f.write("Key terms: " +
                        ", ".join([f"{w} ({s:.3f})" for w, s in words[:10]]) +
                        "\n\n")

    return topic_model, topics

# Alternative approach if you need to modify analyze_sheldon_topics function
def analyze_sheldon_topics(csv_file):
    """Main function to analyze topics in Sheldon's conversations"""
    print("Loading data...")
    df = pd.read_csv(csv_file)
    
    # Define main cast
    main_cast = ["Sheldon", "Leonard", "Penny", "Howard", "Raj", "Amy", "Bernadette"]
    
    print("Extracting conversations...")
    conversations = extract_sheldon_conversations(df, main_cast)
    
    # Print statistics
    print("\nConversation Statistics:")
    print(f"Total Sheldon dialogues: {len(conversations['all'])}")
    for member in main_cast:
        if member != "Sheldon":
            print(f"Sheldon-{member} conversations: {len(conversations[member])}")
    
    # Create output directory
    import os
    os.makedirs("results", exist_ok=True)
    
    # Analyze all Sheldon dialogues
    print("\nAnalyzing all Sheldon dialogues...")
    sheldon_model, sheldon_topics = run_bertopic_analysis(
        conversations["all"], 
        output_prefix="results/all_sheldon_"
    )
    
    # For very small datasets, we might need to skip topic modeling
    # Adding a minimum threshold (e.g., 50 conversations)
    min_samples_for_topics = 100
    
    # Analyze Sheldon's conversations with each main cast member
    for member in main_cast:
        if member != "Sheldon":
            # Skip if not enough conversations
            if len(conversations[member]) < min_samples_for_topics:
                print(f"Skipping {member} due to insufficient data (less than {min_samples_for_topics} samples)")
                continue
                
            print(f"\nAnalyzing Sheldon-{member} conversations...")
            # Extract Sheldon's lines from conversations with this cast member
            sheldon_lines = []
            for conv in conversations[member]:
                if "sheldon" in conv:
                    sheldon_lines.append(conv["sheldon"])
            
            if len(sheldon_lines) < min_samples_for_topics:
                print(f"Skipping {member} due to insufficient data after preprocessing")
                continue
                
            # Run topic modeling with try/except to handle potential errors
            try:
                member_model, member_topics = run_bertopic_analysis(
                    sheldon_lines,
                    output_prefix=f"results/sheldon_{member.lower()}_"
                )
            except Exception as e:
                print(f"Error analyzing {member} conversations: {str(e)}")
                print(f"Skipping {member} analysis.")
    
    print("\nAnalysis complete. Results saved to 'results' directory.")
    
    # Return the main model for further analysis if needed
    return sheldon_model

In [7]:
def analyze_topic_evolution(csv_file, topic_model=None):
    """
    Analyze how topics evolve across different seasons.
    
    Args:
        csv_file: Path to the CSV file containing script data
        topic_model: An existing BERTopic model (if None, a new model will be created)
        
    Returns:
        Visualizations and data showing topic trends over seasons
    """
    print("Analyzing topic evolution across seasons...")
    
    # Load data
    df = pd.read_csv(csv_file)
    
    # Define main cast
    main_cast = ["Sheldon", "Leonard", "Penny", "Howard", "Raj", "Amy", "Bernadette"]
    
    # Filter for Sheldon's dialogues only
    sheldon_df = df[df["person_scene"].apply(
        lambda x: isinstance(x, str) and extract_speaker(x) == "Sheldon"
    )].copy()
    
    # Clean dialogues
    sheldon_df["clean_dialogue"] = sheldon_df["dialogue"].apply(
        lambda x: preprocess_text(x) if isinstance(x, str) else ""
    )
    
    # Remove empty dialogues
    sheldon_df = sheldon_df[sheldon_df["clean_dialogue"] != ""]
    
    # Group by season
    season_groups = sheldon_df.groupby("series")
    
    # Track topics by season
    seasons = sorted(sheldon_df["series"].unique())
    season_dialogues = {}
    
    for season in seasons:
        if season in season_groups.groups:
            season_data = season_groups.get_group(season)
            season_dialogues[season] = season_data["clean_dialogue"].tolist()
    
    # Create a new topic model if one isn't provided
    if topic_model is None:
        print("Creating new topic model for all seasons combined...")
        all_dialogues = [d for s in season_dialogues.values() for d in s]
        topic_model, _ = run_bertopic_analysis(all_dialogues, "results/all_seasons_")
    
    # Create output directory
    import os
    os.makedirs("results", exist_ok=True)
    
    # Analyze each season with the same model
    print("Analyzing topics by season...")
    season_topics = {}
    season_topic_counts = {}
    
    for season, dialogues in season_dialogues.items():
        if len(dialogues) < 30:  # Skip seasons with too few dialogues
            print(f"Skipping season {season} (insufficient data)")
            continue
            
        print(f"Processing season {season} ({len(dialogues)} dialogues)")
        
        # Use the same model to transform this season's dialogues
        topics, probs = topic_model.transform(dialogues)
        
        # Store the topics
        season_topics[season] = topics
        
        # Count topic occurrences
        topic_counts = pd.Series(topics).value_counts()
        season_topic_counts[season] = topic_counts
    
    # Get topic info
    topic_info = topic_model.get_topic_info()
    
    # Create a mapping of topic IDs to names
    topic_names = {}
    for _, row in topic_info.iterrows():
        if row["Topic"] != -1:  # Skip outlier topic
            # Get the topic words
            words = [word for word, _ in topic_model.get_topic(row["Topic"])[:3]]
            topic_names[row["Topic"]] = f"Topic {row['Topic']}: {' '.join(words)}"
        else:
            topic_names[-1] = "Outliers"
    
    # Create a dataframe for the evolution of topics
    evolution_data = []
    
    for season, counts in season_topic_counts.items():
        for topic, count in counts.items():
            # Skip outlier topic for cleaner visualization
            if topic != -1:
                name = topic_names.get(topic, f"Topic {topic}")
                proportion = count / len(season_dialogues[season])
                evolution_data.append({
                    "Season": season,
                    "Topic": topic,
                    "Topic Name": name,
                    "Count": count,
                    "Proportion": proportion
                })
    
    evolution_df = pd.DataFrame(evolution_data)
    
    # Save to CSV
    evolution_df.to_csv("results/topic_evolution.csv", index=False)
    
    # Create visualizations
    
    # 1. Line plot showing topic trends over seasons
    plt.figure(figsize=(16, 10))
    
    # Get top 10 topics by overall count
    top_topics = evolution_df.groupby("Topic")["Count"].sum().nlargest(10).index
    
    # Filter for top topics and create pivot table
    top_evolution_df = evolution_df[evolution_df["Topic"].isin(top_topics)]
    pivot_df = top_evolution_df.pivot_table(
        index="Season", 
        columns="Topic Name", 
        values="Proportion",
        fill_value=0
    )
    
    # Plot trends
    ax = pivot_df.plot(kind="line", marker="o", ax=plt.gca())
    plt.title("Evolution of Top Topics Across Seasons", fontsize=16)
    plt.xlabel("Season", fontsize=14)
    plt.ylabel("Topic Proportion", fontsize=14)
    plt.grid(True, alpha=0.3)
    plt.legend(title="Topics", fontsize=10)
    
    # Save the plot
    plt.savefig("results/topic_evolution_line.png", dpi=300, bbox_inches="tight")
    plt.close()
    
    # 2. Heatmap of all topics across seasons
    plt.figure(figsize=(16, 12))
    
    # Create pivot table with all non-outlier topics
    non_outlier_df = evolution_df[evolution_df["Topic"] != -1]
    pivot_all = non_outlier_df.pivot_table(
        index="Topic Name", 
        columns="Season", 
        values="Proportion",
        fill_value=0
    )
    
    # Sort rows by sum of proportions
    pivot_all = pivot_all.loc[pivot_all.sum(axis=1).sort_values(ascending=False).index]
    
    # Keep only top 20 topics for readability
    if len(pivot_all) > 20:
        pivot_all = pivot_all.iloc[:20]
    
    # Create heatmap
    sns.heatmap(pivot_all, annot=True, fmt=".2f", cmap="YlGnBu", linewidths=0.5)
    plt.title("Topic Proportion Heatmap by Season", fontsize=16)
    plt.tight_layout()
    
    # Save heatmap
    plt.savefig("results/topic_heatmap_by_season.png", dpi=300, bbox_inches="tight")
    plt.close()
    
    # 3. Character-specific topic evolution (for top characters)
    top_chars = ["Leonard", "Penny", "Amy"]
    
    for char in top_chars:
        print(f"Analyzing Sheldon-{char} conversations across seasons...")
        
        # Filter conversations where Sheldon interacts with this character
        char_df = df.copy()
        
        # Group by episode to find interactions
        episode_groups = char_df.groupby(["series", "episode"])
        
        # Store dialogues by season
        char_season_dialogues = {season: [] for season in seasons}
        
        for (season, episode), episode_df in episode_groups:
            # Reset index for proper sequence
            episode_df = episode_df.reset_index(drop=True)
            
            # Find Sheldon's lines when talking with this character
            for i in range(len(episode_df) - 1):
                current_row = episode_df.iloc[i]
                next_row = episode_df.iloc[i + 1]
                
                # Extract speakers
                current_speaker = extract_speaker(current_row.get("person_scene", ""))
                next_speaker = extract_speaker(next_row.get("person_scene", ""))
                
                if current_speaker == "Sheldon" and next_speaker == char:
                    dialogue = preprocess_text(current_row.get("dialogue", ""))
                    if dialogue:
                        if season in char_season_dialogues:
                            char_season_dialogues[season].append(dialogue)
                
                elif current_speaker == char and next_speaker == "Sheldon":
                    dialogue = preprocess_text(next_row.get("dialogue", ""))
                    if dialogue:
                        if season in char_season_dialogues:
                            char_season_dialogues[season].append(dialogue)
        
        # Analyze topics for each season
        char_evolution_data = []
        
        for season, dialogues in char_season_dialogues.items():
            if len(dialogues) < 30:  # Skip if not enough data
                continue
                
            # Transform dialogues using existing model
            topics, _ = topic_model.transform(dialogues)
            
            # Count topics
            topic_counts = pd.Series(topics).value_counts()
            
            # Store data
            for topic, count in topic_counts.items():
                if topic != -1:  # Skip outliers
                    name = topic_names.get(topic, f"Topic {topic}")
                    proportion = count / len(dialogues)
                    char_evolution_data.append({
                        "Season": season,
                        "Topic": topic,
                        "Topic Name": name,
                        "Count": count,
                        "Proportion": proportion,
                        "Character": char
                    })
        
        # Create dataframe
        if char_evolution_data:
            char_evolution_df = pd.DataFrame(char_evolution_data)
            
            # Create heatmap
            plt.figure(figsize=(14, 10))
            
            char_pivot = char_evolution_df.pivot_table(
                index="Topic Name", 
                columns="Season", 
                values="Proportion",
                fill_value=0
            )
            
            # Sort rows by sum of proportions
            char_pivot = char_pivot.loc[char_pivot.sum(axis=1).sort_values(ascending=False).index]
            
            # Keep only top 15 topics for readability
            if len(char_pivot) > 15:
                char_pivot = char_pivot.iloc[:15]
            
            # Create heatmap
            sns.heatmap(char_pivot, annot=True, fmt=".2f", cmap="YlOrRd", linewidths=0.5)
            plt.title(f"Sheldon-{char} Topic Evolution by Season", fontsize=16)
            plt.tight_layout()
            
            # Save heatmap
            plt.savefig(f"results/sheldon_{char.lower()}_topic_evolution.png", dpi=300, bbox_inches="tight")
            plt.close()
    
    # Return the combined results dataframe for further analysis
    return evolution_df

In [None]:
def generate_topics_csv(csv_file, topic_model):
    """
    Generate a CSV file showing topics over seasons between Sheldon and other characters
    in the format: year,character,topic_name,n,prop
    
    Args:
        csv_file: Path to the CSV file containing script data
        topic_model: An existing BERTopic model
    """
    print("Generating topics CSV...")
    
    # Load data
    df = pd.read_csv(csv_file)
    
    # Define main cast
    main_cast = ["Leonard", "Penny", "Howard", "Raj", "Amy", "Bernadette"]
    
    # Create output directory
    import os
    os.makedirs("results", exist_ok=True)
    
    # Initialize a list to store all data
    all_topics_data = []
    
    # Get topic info
    topic_info = topic_model.get_topic_info()
    
    # Create a mapping of topic IDs to names
    topic_names = {}
    for _, row in topic_info.iterrows():
        if row["Topic"] != -1:  # Skip outlier topic
            # Get the topic words
            words = [word for word, _ in topic_model.get_topic(row["Topic"])[:3]]
            topic_names[row["Topic"]] = " ".join(words)
        else:
            topic_names[-1] = "outliers"
    
    # Process each character's conversations with Sheldon by season
    for char in main_cast:
        print(f"Processing Sheldon-{char} conversations...")
        
        # Filter conversations where Sheldon interacts with this character
        char_df = df.copy()
        
        # Group by episode to find interactions
        episode_groups = char_df.groupby(["series", "episode"])
        
        # Store dialogues by season
        seasons = sorted(char_df["series"].unique())
        char_season_dialogues = {season: [] for season in seasons}
        
        for (season, episode), episode_df in episode_groups:
            # Reset index for proper sequence
            episode_df = episode_df.reset_index(drop=True)
            
            # Find Sheldon's lines when talking with this character
            for i in range(len(episode_df) - 1):
                current_row = episode_df.iloc[i]
                next_row = episode_df.iloc[i + 1]
                
                # Extract speakers
                current_speaker = extract_speaker(current_row.get("person_scene", ""))
                next_speaker = extract_speaker(next_row.get("person_scene", ""))
                
                if current_speaker == "Sheldon" and next_speaker == char:
                    dialogue = preprocess_text(current_row.get("dialogue", ""))
                    if dialogue:
                        if season in char_season_dialogues:
                            char_season_dialogues[season].append(dialogue)
                
                elif current_speaker == char and next_speaker == "Sheldon":
                    dialogue = preprocess_text(next_row.get("dialogue", ""))
                    if dialogue:
                        if season in char_season_dialogues:
                            char_season_dialogues[season].append(dialogue)
        
        # Analyze topics for each season
        for season, dialogues in char_season_dialogues.items():
            if len(dialogues) < 30:  # Skip if not enough data
                continue
                
            # Transform dialogues using existing model
            topics, _ = topic_model.transform(dialogues)
            
            # Count topics
            topic_counts = pd.Series(topics).value_counts()
            
            # Store data
            for topic, count in topic_counts.items():
                if topic != -1:  # Skip outliers
                    name = topic_names.get(topic, f"Topic_{topic}")
                    proportion = count / len(dialogues)
                    all_topics_data.append({
                        "year": int(season) if season.is_integer() else season,
                        "character": char,
                        "name": name,
                        "n": count,
                        "prop": proportion
                    })
    
    # Create dataframe
    topics_df = pd.DataFrame(all_topics_data)
    
    # Save to CSV
    topics_df.to_csv("results/sheldon_character_topics.csv", index=False)
    
    print(f"CSV file created: results/sheldon_character_topics.csv")
    
    return topics_df

In [None]:
if __name__ == "__main__":
    # Replace with your file path


    file = "big_bang_scripts.csv"
    
    # First run the regular analysis
    topic_model = analyze_sheldon_topics(file)
    
    # Then analyze evolution over time using the same model
    evolution_df = analyze_topic_evolution(file, topic_model)

    # Generate the character-topic CSV in the requested format
    topics_df = generate_topics_csv(file, topic_model)

Loading data...
Extracting conversations...

Conversation Statistics:
Total Sheldon dialogues: 20659
Sheldon-Leonard conversations: 6758
Sheldon-Penny conversations: 4417
Sheldon-Howard conversations: 1863
Sheldon-Raj conversations: 1668
Sheldon-Amy conversations: 3221
Sheldon-Bernadette conversations: 289

Analyzing all Sheldon dialogues...


2025-04-30 01:18:57,317 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 646/646 [00:07<00:00, 87.28it/s] 
2025-04-30 01:19:04,876 - BERTopic - Embedding - Completed ✓
2025-04-30 01:19:04,877 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-30 01:19:49,303 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:19:49,306 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-30 01:19:52,593 - BERTopic - Cluster - Completed ✓
2025-04-30 01:19:52,604 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-30 01:19:53,316 - BERTopic - Representation - Completed ✓



Analyzing Sheldon-Leonard conversations...


2025-04-30 01:20:06,367 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 212/212 [00:02<00:00, 87.32it/s] 
2025-04-30 01:20:08,851 - BERTopic - Embedding - Completed ✓
2025-04-30 01:20:08,852 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-30 01:20:16,153 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:20:16,155 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-30 01:20:16,299 - BERTopic - Cluster - Completed ✓
2025-04-30 01:20:16,302 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-30 01:20:16,506 - BERTopic - Representation - Completed ✓



Analyzing Sheldon-Penny conversations...


2025-04-30 01:20:17,999 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 139/139 [00:01<00:00, 88.67it/s] 
2025-04-30 01:20:19,604 - BERTopic - Embedding - Completed ✓
2025-04-30 01:20:19,605 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-30 01:20:24,383 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:20:24,384 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-30 01:20:24,477 - BERTopic - Cluster - Completed ✓
2025-04-30 01:20:24,480 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-30 01:20:24,623 - BERTopic - Representation - Completed ✓



Analyzing Sheldon-Howard conversations...


2025-04-30 01:20:25,856 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 59/59 [00:00<00:00, 86.57it/s]
2025-04-30 01:20:26,557 - BERTopic - Embedding - Completed ✓
2025-04-30 01:20:26,557 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-30 01:20:30,511 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:20:30,511 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-30 01:20:30,551 - BERTopic - Cluster - Completed ✓
2025-04-30 01:20:30,553 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-30 01:20:30,632 - BERTopic - Representation - Completed ✓



Analyzing Sheldon-Raj conversations...


2025-04-30 01:20:31,703 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 53/53 [00:00<00:00, 82.21it/s]
2025-04-30 01:20:32,366 - BERTopic - Embedding - Completed ✓
2025-04-30 01:20:32,367 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-30 01:20:35,640 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:20:35,641 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-30 01:20:35,676 - BERTopic - Cluster - Completed ✓
2025-04-30 01:20:35,679 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-30 01:20:35,772 - BERTopic - Representation - Completed ✓



Analyzing Sheldon-Amy conversations...


2025-04-30 01:20:36,921 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 101/101 [00:01<00:00, 88.23it/s]
2025-04-30 01:20:38,095 - BERTopic - Embedding - Completed ✓
2025-04-30 01:20:38,096 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-30 01:20:47,668 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:20:47,669 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-30 01:20:47,741 - BERTopic - Cluster - Completed ✓
2025-04-30 01:20:47,744 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-30 01:20:47,849 - BERTopic - Representation - Completed ✓



Analyzing Sheldon-Bernadette conversations...


2025-04-30 01:20:49,092 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 10/10 [00:00<00:00, 72.38it/s]
2025-04-30 01:20:49,236 - BERTopic - Embedding - Completed ✓
2025-04-30 01:20:49,238 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-30 01:20:49,552 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:20:49,553 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-04-30 01:20:49,561 - BERTopic - Cluster - Completed ✓
2025-04-30 01:20:49,564 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-04-30 01:20:49,577 - BERTopic - Representation - Completed ✓


2 topic(s) detected – skipping 2-D visualization.

Analysis complete. Results saved to 'results' directory.
Analyzing topic evolution across seasons...
Analyzing topics by season...
Processing season 1.0 (1096 dialogues)


Batches: 100%|██████████| 35/35 [00:00<00:00, 86.43it/s]
2025-04-30 01:20:50,226 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-30 01:21:08,035 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:21:08,036 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-30 01:21:08,081 - BERTopic - Cluster - Completed ✓


Processing season 2.0 (1343 dialogues)


Batches: 100%|██████████| 42/42 [00:00<00:00, 77.93it/s]
2025-04-30 01:21:08,642 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-30 01:21:09,080 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:21:09,081 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-30 01:21:09,130 - BERTopic - Cluster - Completed ✓


Processing season 3.0 (1307 dialogues)


Batches: 100%|██████████| 41/41 [00:00<00:00, 80.39it/s]
2025-04-30 01:21:09,662 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-30 01:21:10,082 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:21:10,083 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-30 01:21:10,133 - BERTopic - Cluster - Completed ✓


Processing season 4.0 (1328 dialogues)


Batches: 100%|██████████| 42/42 [00:00<00:00, 87.84it/s]
2025-04-30 01:21:10,632 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-30 01:21:11,083 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:21:11,084 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-30 01:21:11,134 - BERTopic - Cluster - Completed ✓


Processing season 5.0 (1019 dialogues)


Batches: 100%|██████████| 32/32 [00:00<00:00, 79.61it/s]
2025-04-30 01:21:11,556 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-30 01:21:11,912 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:21:11,913 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-30 01:21:11,950 - BERTopic - Cluster - Completed ✓


Processing season 6.0 (987 dialogues)


Batches: 100%|██████████| 31/31 [00:00<00:00, 85.46it/s]
2025-04-30 01:21:12,331 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-30 01:21:12,695 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:21:12,696 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-30 01:21:12,735 - BERTopic - Cluster - Completed ✓


Processing season 7.0 (1092 dialogues)


Batches: 100%|██████████| 35/35 [00:00<00:00, 81.23it/s]
2025-04-30 01:21:13,186 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-30 01:21:13,573 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:21:13,574 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-30 01:21:13,619 - BERTopic - Cluster - Completed ✓


Processing season 8.0 (1096 dialogues)


Batches: 100%|██████████| 35/35 [00:00<00:00, 84.94it/s]
2025-04-30 01:21:14,050 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-30 01:21:14,446 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:21:14,447 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-30 01:21:14,491 - BERTopic - Cluster - Completed ✓


Processing season 9.0 (1098 dialogues)


Batches: 100%|██████████| 35/35 [00:00<00:00, 83.93it/s]
2025-04-30 01:21:14,926 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-30 01:21:15,329 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:21:15,330 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-30 01:21:15,372 - BERTopic - Cluster - Completed ✓


Processing season 10.0 (1071 dialogues)


Batches: 100%|██████████| 34/34 [00:00<00:00, 90.58it/s]
2025-04-30 01:21:15,765 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-30 01:21:16,168 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:21:16,169 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-30 01:21:16,211 - BERTopic - Cluster - Completed ✓


Analyzing Sheldon-Leonard conversations across seasons...


Batches: 100%|██████████| 32/32 [00:00<00:00, 78.27it/s]
2025-04-30 01:21:22,296 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-30 01:21:22,613 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:21:22,614 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-30 01:21:22,650 - BERTopic - Cluster - Completed ✓
Batches: 100%|██████████| 28/28 [00:00<00:00, 89.20it/s]
2025-04-30 01:21:22,982 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-30 01:21:23,270 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:21:23,270 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-30 01:21:23,302 - BERTopic - Cluster - Completed ✓
Batches: 100%|██████████| 25/25 [00:00<00:00, 80.10it/s]
2025-04-30 01:21:23,631 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-30 01:21:23,893 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:21:23,894 

Analyzing Sheldon-Penny conversations across seasons...


Batches: 100%|██████████| 16/16 [00:00<00:00, 83.40it/s]
2025-04-30 01:21:31,733 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-30 01:21:31,899 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:21:31,900 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-30 01:21:31,920 - BERTopic - Cluster - Completed ✓
Batches: 100%|██████████| 26/26 [00:00<00:00, 85.21it/s]
2025-04-30 01:21:32,238 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-30 01:21:32,523 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:21:32,524 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-30 01:21:32,554 - BERTopic - Cluster - Completed ✓
Batches: 100%|██████████| 22/22 [00:00<00:00, 91.10it/s]
2025-04-30 01:21:32,808 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-30 01:21:33,043 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:21:33,044 

Analyzing Sheldon-Amy conversations across seasons...


Batches: 100%|██████████| 11/11 [00:00<00:00, 78.25it/s]
2025-04-30 01:21:39,793 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-30 01:21:39,937 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:21:39,938 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-30 01:21:39,952 - BERTopic - Cluster - Completed ✓
Batches: 100%|██████████| 9/9 [00:00<00:00, 77.76it/s]
2025-04-30 01:21:40,077 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-30 01:21:40,196 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:21:40,197 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-04-30 01:21:40,208 - BERTopic - Cluster - Completed ✓
Batches: 100%|██████████| 13/13 [00:00<00:00, 77.35it/s]
2025-04-30 01:21:40,386 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-04-30 01:21:40,537 - BERTopic - Dimensionality - Completed ✓
2025-04-30 01:21:40,538 - 