In [1]:
# Import necessary libraries
import pandas as pd
import nltk
import os
from collections import Counter
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources used later in this notebook.
# 'stopwords' supplies the list read by stopwords.words('english') in clean_text;
# 'punkt' is presumably the tokenizer data behind word_tokenize — TODO confirm
# against the installed NLTK version.
# NOTE(review): nothing in the visible cells uses 'wordnet' — confirm it is still needed.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dungu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dungu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dungu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Main cast characters analysed throughout the notebook.
# These strings are compared exactly against the 'person_scene' column
# (via isin), so spelling and capitalisation must match the dataset.
MAIN_CAST = ['Sheldon', 'Penny', 'Amy', 'Howard', 'Bernadette', 'Leonard', 'Raj']

In [3]:
# Load the dataset
def load_dialogue_data(file_path):
    """Read the dialogue CSV at ``file_path`` into a DataFrame.

    Prints the number of rows on success. Any exception raised while
    loading is reported on stdout and ``None`` is returned instead of a
    DataFrame, so callers must check for ``None``.
    """
    try:
        frame = pd.read_csv(file_path)
        print(f"Data loaded successfully with {len(frame)} rows.")
    except Exception as err:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error loading data: {err}")
        frame = None
    return frame

# Relative path: the CSV is looked up in the notebook's working directory.
file_path = 'big_bang_scripts.csv'
dialogue_df = load_dialogue_data(file_path)

# Display the first few rows to confirm the data structure
# NOTE(review): load_dialogue_data returns None on failure, in which case
# this .head() call raises AttributeError.
dialogue_df.head()

Data loaded successfully with 54406 rows.


Unnamed: 0,episode_name,dialogue,person_scene,series,episode,episode_name_only
0,Series 01 Episode 01 – Pilot Episode,A corridor at a sperm bank.,Scene,1.0,1.0,Pilot Episode
1,Series 01 Episode 01 – Pilot Episode,So if a photon is directed through a plane wi...,Sheldon,1.0,1.0,Pilot Episode
2,Series 01 Episode 01 – Pilot Episode,"Agreed, what’s your point?",Leonard,1.0,1.0,Pilot Episode
3,Series 01 Episode 01 – Pilot Episode,"There’s no point, I just think it’s a good id...",Sheldon,1.0,1.0,Pilot Episode
4,Series 01 Episode 01 – Pilot Episode,Excuse me?,Leonard,1.0,1.0,Pilot Episode


In [4]:
# Helper function to clean text

# Script-specific filler words, pronouns and auxiliaries that are removed in
# addition to NLTK's English stop-word list.
_SCRIPT_STOP_WORDS = frozenset({
    'oh', 'uh', 'um', 'like', 'just', 'well', 'yeah', 'okay', 'ok', 'so', 'hey', 'ah',
    'gonna', 'wanna', 'gotta', 'would', 'could', 'should', 'i', 'you', 'he', 'she', 'it', 'we',
    'they', 'me', 'him', 'her', 'them', 'my', 'your', 'his', 'its', 'our', 'their', 'that',
    'this', 'what', 'going', 'get', 'got', 'do', 'does', 'did', 'doing', 'don', 'doesn',
    'didn', 'is', 'am', 'are', 'was', 'were', 'been', 'being', 'have', 'has', 'had', 'having', 'na', 'gon',
})

# Combined stop-word set; built on the first clean_text() call and reused afterwards.
_STOP_WORDS_CACHE = None


def _get_stop_words():
    """Return the combined NLTK + script-specific stop-word set, building it once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english')) | _SCRIPT_STOP_WORDS
    return _STOP_WORDS_CACHE


def clean_text(text):
    """Tokenize a line of dialogue and keep only its meaningful words.

    The text is lower-cased and split with ``word_tokenize``. A token is kept
    only if it is purely alphabetic, longer than one character, and not a
    stop word (NLTK English list plus ``_SCRIPT_STOP_WORDS``).

    Args:
        text: The dialogue string. Any non-string value (e.g. NaN from a
            missing cell) is treated as empty.

    Returns:
        list[str]: The surviving tokens in their original order; ``[]`` for
        non-string input.
    """
    if not isinstance(text, str):
        return []

    # The stop-word set does not depend on the input, so it is fetched from a
    # cache instead of being rebuilt for every dialogue line.
    stop_words = _get_stop_words()

    return [
        word
        for word in word_tokenize(text.lower())
        if word.isalpha() and word not in stop_words and len(word) > 1
    ]

In [5]:
def process_dialogue_data(df):
    """Collect the cleaned words spoken by each main cast character in each series.

    Args:
        df: DataFrame with 'person_scene', 'dialogue' and 'series' columns,
            or ``None`` (what ``load_dialogue_data`` returns when loading fails).

    Returns:
        dict: ``{character: {series: [word, ...]}}`` with one entry for every
        name in ``MAIN_CAST``. Rows whose series is missing are all collected
        under a single NaN key. Returns ``None`` (after printing the reason)
        when ``df`` is ``None``, a required column is missing, or no row
        belongs to the main cast.
    """
    if df is None:
        print("No dataframe provided; load the dialogue data first.")
        return None

    required_columns = ('person_scene', 'dialogue', 'series')
    if any(column not in df.columns for column in required_columns):
        print("Required columns not found in the dataframe.")
        return None

    # Filter for main cast; this also drops scene descriptions, whose
    # 'person_scene' value is not a cast member's name.
    df_dialogue = df[df['person_scene'].isin(MAIN_CAST)]

    # Check if we have any dialogue for the main cast
    if len(df_dialogue) == 0:
        print("No dialogue found for the specified main cast. Check character names in the dataset.")
        # Print all unique character names to help troubleshoot
        print("Available characters in the dataset:", df['person_scene'].unique())
        return None

    print(f"Found {len(df_dialogue)} dialogue lines for the main cast.")

    character_series_words = {character: {} for character in MAIN_CAST}

    # NaN never compares equal to another NaN, so using each row's own NaN as
    # a dict key would create one key per row. Reusing a single NaN object
    # lets the dict find it again by identity, giving one shared bucket.
    missing_series_key = float('nan')

    # Iterate the three needed columns directly; building a Series per row
    # with iterrows() is unnecessary here.
    rows = zip(df_dialogue['person_scene'], df_dialogue['series'], df_dialogue['dialogue'])
    for character, series, dialogue in rows:
        if pd.isna(series):
            series = missing_series_key
        character_series_words[character].setdefault(series, []).extend(clean_text(dialogue))

    return character_series_words

In [6]:
# Get top words for each character by series
def get_top_words(character_series_words, top_n=15, characters=None):
    """Get the top N most common words for each character by series.

    Args:
        character_series_words: ``{character: {series: [word, ...]}}`` as
            produced by ``process_dialogue_data``.
        top_n: Maximum number of words reported per character and series.
        characters: Names to report on, in output order. Defaults to
            ``MAIN_CAST``. Names absent from ``character_series_words`` are
            skipped.

    Returns:
        pandas.DataFrame with columns ``id`` (1-based running number),
        ``character``, ``series``, ``word`` and ``count``. The columns are
        present even when there are no rows, so callers can always select them.
    """
    if characters is None:
        characters = MAIN_CAST

    columns = ['id', 'character', 'series', 'word', 'count']
    records = []

    for character in characters:
        series_words = character_series_words.get(character)
        if not series_words:
            continue

        for series, words in series_words.items():
            for word, count in Counter(words).most_common(top_n):
                records.append({
                    'id': len(records) + 1,
                    'character': character,
                    'series': series,
                    'word': word,
                    'count': count,
                })

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(records, columns=columns)

In [7]:
def get_unique_words(character_series_words, top_n=15, characters=None, min_count=3):
    """Find the words most characteristic of each character across all series.

    For every word a character uses at least ``min_count`` times, the score is

        count / total * count

    where ``count`` is the character's own usage and ``total`` is the usage
    summed over all ``characters``. The score grows with the character's own
    usage and shrinks as other characters use the word too.

    Args:
        character_series_words: ``{character: {series: [word, ...]}}`` as
            produced by ``process_dialogue_data``.
        top_n: Maximum number of words reported per character.
        characters: Names to compare, in output order. Defaults to
            ``MAIN_CAST``. Names without any words are skipped.
        min_count: Words the character used fewer times than this are ignored.

    Returns:
        pandas.DataFrame with columns ``id`` (1-based running number),
        ``character``, ``word``, ``count`` and ``uniqueness_score``, ordered by
        descending score within each character. The columns are present even
        when there are no rows.
    """
    if characters is None:
        characters = MAIN_CAST

    columns = ['id', 'character', 'word', 'count', 'uniqueness_score']

    # Pool each character's words over all series, and everybody's words together.
    word_freq_by_character = {}
    total_word_freq = Counter()
    for character in characters:
        char_word_freq = Counter()
        for words in character_series_words.get(character, {}).values():
            char_word_freq.update(words)
        word_freq_by_character[character] = char_word_freq
        total_word_freq.update(char_word_freq)

    records = []
    for character in characters:
        char_word_freq = word_freq_by_character[character]

        # total_word_freq[word] >= count >= 1 for every word seen here, so the
        # division needs no zero guard.
        uniqueness_scores = {
            word: count / total_word_freq[word] * count
            for word, count in char_word_freq.items()
            if count >= min_count
        }

        top_unique_words = sorted(
            uniqueness_scores.items(), key=lambda item: item[1], reverse=True
        )[:top_n]

        for word, score in top_unique_words:
            records.append({
                'id': len(records) + 1,
                'character': character,
                'word': word,
                'count': char_word_freq[word],
                'uniqueness_score': score,
            })

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(records, columns=columns)

In [8]:
# Create output directories
def create_output_dirs():
    """Ensure the output folder and its two sub-folders exist.

    Existing directories are left untouched. A confirmation line is printed
    once all three paths have been handled.
    """
    for directory in ('output', 'output/by_season', 'output/by_character'):
        os.makedirs(directory, exist_ok=True)
    print("Output directories created.")

# Save results to CSV
def save_results(top_words_df, unique_words_df):
    """Write the top-word and unique-word tables to CSV files under ``output/``.

    Produces the two overall files, one top-words file per distinct value of
    the ``series`` column, and one top-words file per ``MAIN_CAST`` member
    that appears in ``top_words_df``. The target directories must already
    exist. A confirmation line is printed for every file written.
    """
    # Overall tables
    top_words_df.to_csv('output/main_cast_top_words.csv', index=False)
    print("Saved top words to output/main_cast_top_words.csv")

    unique_words_df.to_csv('output/main_cast_unique_words.csv', index=False)
    print("Saved unique words to output/main_cast_unique_words.csv")

    # One file per series value
    for series in top_words_df['series'].unique():
        in_series = top_words_df['series'] == series
        top_words_df[in_series].to_csv(f'output/by_season/series_{series}_top_words.csv', index=False)
        print(f"Saved Series {series} top words to output/by_season/series_{series}_top_words.csv")

    # One file per main cast member that actually has rows
    for character in MAIN_CAST:
        is_character = top_words_df['character'] == character
        if not is_character.any():
            continue
        top_words_df[is_character].to_csv(f'output/by_character/{character}_top_words.csv', index=False)
        print(f"Saved {character}'s top words to output/by_character/{character}_top_words.csv")


In [9]:
def _save_bar_chart(labels, heights, title, y_label, out_path):
    """Draw a single bar chart, write it to ``out_path`` and close the figure."""
    plt.figure(figsize=(12, 6))
    plt.bar(labels, heights)
    plt.title(title)
    plt.xlabel("Word")
    plt.ylabel(y_label)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()


def create_visualizations(top_words_df, unique_words_df):
    """Save bar charts of top words and unique words for each main cast member.

    For every ``MAIN_CAST`` name present in ``top_words_df`` the per-series
    counts are summed per word and the 15 largest are plotted to
    ``output/<name>_top_words.png``. For every name present in
    ``unique_words_df`` its first 15 rows are plotted by uniqueness score to
    ``output/<name>_unique_words.png``.
    """
    # Top-word charts
    for character in MAIN_CAST:
        is_character = top_words_df['character'] == character
        if not is_character.any():
            continue

        # Collapse the per-series rows into one total per word.
        totals = top_words_df[is_character].groupby('word')['count'].sum().reset_index()
        top_15_words = totals.sort_values('count', ascending=False).head(15)

        _save_bar_chart(
            top_15_words['word'],
            top_15_words['count'],
            f"Top 15 Words Used by {character}",
            "Count",
            f"output/{character}_top_words.png",
        )

    # Unique-word charts
    for character in MAIN_CAST:
        is_character = unique_words_df['character'] == character
        if not is_character.any():
            continue

        char_df = unique_words_df[is_character].head(15)

        _save_bar_chart(
            char_df['word'],
            char_df['uniqueness_score'],
            f"Top 15 Unique Words for {character}",
            "Uniqueness Score",
            f"output/{character}_unique_words.png",
        )

In [10]:
# def main():
#     """Main execution flow."""
#     # Create output directories
#     create_output_dirs()
    
#     # Process the dialogue data
#     character_series_words = process_dialogue_data(dialogue_df)
    
#     if character_series_words:
#         # Get top words for each character by series
#         top_words_df = get_top_words(character_series_words)
        
#         # Get unique words for each character
#         unique_words_df = get_unique_words(character_series_words)
        
#         # Save results to CSV
#         save_results(top_words_df, unique_words_df)
        
#         # Create visualizations
#         create_visualizations(top_words_df, unique_words_df)
        
#         # Display sample of results
#         print("\nSample of top words by character and series:")
#         display(top_words_df.head(10))
        
#         print("\nSample of unique words by character:")
#         display(unique_words_df.head(10))
        
#         print("\nCharacter statistics:")
#         character_counts = top_words_df['character'].value_counts()
#         display(character_counts)
        
#         print("\nAnalysis complete!")
#     else:
#         print("Failed to process dialogue data.")

# # Execute the main function
# if __name__ == "__main__":
#     main()

In [11]:
def get_unique_words_by_season(character_series_words, top_n=10, characters=None, min_count=2):
    """Find the words most characteristic of each character within each season.

    Within one season, every word a character uses at least ``min_count``
    times is scored as

        count / season_total * count

    where ``count`` is the character's usage in that season and
    ``season_total`` is the usage by all ``characters`` in that season.

    Args:
        character_series_words: ``{character: {season: [word, ...]}}`` as
            produced by ``process_dialogue_data``. Season keys that are NaN
            are ignored.
        top_n: Maximum number of words reported per character and season.
        characters: Names to compare, in output order. Defaults to
            ``MAIN_CAST``. Names without words in a season are skipped for
            that season.
        min_count: Words the character used fewer times than this within the
            season are ignored.

    Returns:
        pandas.DataFrame with columns ``id`` (1-based running number),
        ``character``, ``season``, ``rank`` (1 = highest score), ``word``,
        ``count`` and ``uniqueness_score``; rows are ordered by season, then
        character, then rank. The columns are present even when there are no
        rows, so grouping by ``character``/``season`` always works.
    """
    if characters is None:
        characters = MAIN_CAST

    columns = ['id', 'character', 'season', 'rank', 'word', 'count', 'uniqueness_score']

    # All seasons with valid data (exclude NaN keys)
    seasons = sorted({
        season
        for series_words in character_series_words.values()
        for season in series_words
        if not pd.isna(season)
    })

    records = []
    for season in seasons:
        # Word frequencies per character for this season, plus the season total.
        season_freq_by_character = {
            character: Counter(character_series_words.get(character, {}).get(season, ()))
            for character in characters
        }
        season_word_freq = Counter()
        for char_word_freq in season_freq_by_character.values():
            season_word_freq.update(char_word_freq)

        for character in characters:
            char_word_freq = season_freq_by_character[character]

            # season_word_freq[word] >= count >= 1 for every word seen here,
            # so the division needs no zero guard.
            uniqueness_scores = {
                word: count / season_word_freq[word] * count
                for word, count in char_word_freq.items()
                if count >= min_count
            }

            top_unique_words = sorted(
                uniqueness_scores.items(), key=lambda item: item[1], reverse=True
            )[:top_n]

            for rank, (word, score) in enumerate(top_unique_words, 1):
                records.append({
                    'id': len(records) + 1,
                    'character': character,
                    'season': season,
                    'rank': rank,
                    'word': word,
                    'count': char_word_freq[word],
                    'uniqueness_score': score,
                })

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(records, columns=columns)

def save_season_uniqueness_results(season_uniqueness_df):
    """Write the per-season uniqueness table to CSV and print a sanity check.

    The table goes to ``output/season_character_unique_words.csv`` (the
    ``output`` directory is created if needed). Afterwards the number of
    rows per (character, season) pair is printed, together with a warning
    if any pair has more than 10 rows.
    """
    os.makedirs('output', exist_ok=True)

    season_uniqueness_df.to_csv('output/season_character_unique_words.csv', index=False)
    print(f"Saved all seasons uniqueness data with {len(season_uniqueness_df)} entries to output/season_character_unique_words.csv")

    # Rows per character/season pair, used to verify the expected cap of 10.
    word_counts = (
        season_uniqueness_df
        .groupby(['character', 'season'])
        .size()
        .reset_index(name='word_count')
    )
    print("\nVerifying entry counts per character per season:")
    print(word_counts.head(10))

    over_limit = word_counts['word_count'].max() > 10
    print(
        "WARNING: Some character-season combinations have more than 10 words!"
        if over_limit
        else "All character-season combinations have 10 or fewer words as expected."
    )

# Update the main function to only include the necessary analysis
def main():
    """Run the per-season uniqueness analysis end to end.

    Ensures the ``output`` directory exists, processes the global
    ``dialogue_df``, and — if processing yields data — computes the
    per-season unique words, saves them and displays the first ten rows.
    Prints a failure message instead when processing yields nothing.
    """
    os.makedirs('output', exist_ok=True)
    print("Output directory created.")

    character_series_words = process_dialogue_data(dialogue_df)

    # Nothing to analyse: report and stop early.
    if not character_series_words:
        print("Failed to process dialogue data.")
        return

    season_uniqueness_df = get_unique_words_by_season(character_series_words)
    save_season_uniqueness_results(season_uniqueness_df)

    print("\nSample of unique words by character and season:")
    display(season_uniqueness_df.head(10))

    print("\nAnalysis complete!")

# Execute the main function.
# The guard runs main() only when this code executes as the top-level module
# (__name__ == "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Output directory created.
Found 44967 dialogue lines for the main cast.
Saved all seasons uniqueness data with 650 entries to output/season_character_unique_words.csv

Verifying entry counts per character per season:
    character  season  word_count
0         Amy     4.0          10
1         Amy     5.0          10
2         Amy     6.0          10
3         Amy     7.0          10
4         Amy     8.0          10
5         Amy     9.0          10
6         Amy    10.0          10
7  Bernadette     3.0          10
8  Bernadette     4.0          10
9  Bernadette     5.0          10
All character-season combinations have 10 or fewer words as expected.

Sample of unique words by character and season:


Unnamed: 0,id,character,season,rank,word,count,uniqueness_score
0,1,Sheldon,1.0,1,knock,64,59.362319
1,2,Sheldon,1.0,2,leonard,73,32.10241
2,3,Sheldon,1.0,3,think,47,22.09
3,4,Sheldon,1.0,4,time,46,19.962264
4,5,Sheldon,1.0,5,penny,50,17.730496
5,6,Sheldon,1.0,6,yes,40,16.666667
6,7,Sheldon,1.0,7,one,45,16.071429
7,8,Sheldon,1.0,8,course,22,13.828571
8,9,Sheldon,1.0,9,need,32,13.473684
9,10,Sheldon,1.0,10,please,22,13.081081



Analysis complete!
