In [None]:
# Import necessary libraries
import pandas as pd
import nltk
import os
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources this notebook relies on.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK
# releases load the tokenizer models used by word_tokenize from
# 'punkt_tab'; without it tokenization fails with a LookupError there.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Names of the main cast members; used to filter dialogue rows and to
# drive the per-character analysis.
MAIN_CAST = [
    'Sheldon',
    'Penny',
    'Amy',
    'Howard',
    'Bernadette',
    'Leonard',
    'Raj',
]

In [None]:
# Helper function to clean text

# Script-specific filler words, pronouns and auxiliaries that are filtered
# out in addition to NLTK's English stop-word list.
_ADDITIONAL_STOP_WORDS = frozenset({
    'oh', 'uh', 'um', 'like', 'just', 'well', 'yeah', 'okay', 'ok', 'so', 'hey', 'ah',
    'gonna', 'wanna', 'gotta', 'would', 'could', 'should', 'i', 'you', 'he', 'she', 'it', 'we',
    'they', 'me', 'him', 'her', 'them', 'my', 'your', 'his', 'its', 'our', 'their', 'that',
    'this', 'what', 'going', 'get', 'got', 'do', 'does', 'did', 'doing', 'don', 'doesn',
    'didn', 'is', 'am', 'are', 'was', 'were', 'been', 'being', 'have', 'has', 'had', 'having',
    'na', 'gon',
})

# Combined stop-word set; built on first use by _get_stop_words().
_STOP_WORDS_CACHE = None


def _get_stop_words():
    """Return the combined stop-word set, building it only on the first call."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        # Reading the corpus is loop-invariant work; clean_text is called once
        # per dialogue row, so the set is assembled once and reused.
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english')) | _ADDITIONAL_STOP_WORDS
    return _STOP_WORDS_CACHE


def clean_text(text):
    """Tokenize text and return its lowercase content words.

    Args:
        text: The raw text to clean. Anything that is not a ``str``
            (for example a missing value) is treated as having no words.

    Returns:
        list[str]: Lowercased tokens, in their original order, that are purely
        alphabetic, longer than one character, and not in the stop-word set
        (NLTK English stop words plus the script-specific additions).
        Returns an empty list for non-string input.
    """
    if not isinstance(text, str):
        return []

    stop_words = _get_stop_words()

    # isalpha() drops punctuation, numbers and contraction fragments such as "n't".
    return [
        word
        for word in word_tokenize(text.lower())
        if word.isalpha() and word not in stop_words and len(word) > 1
    ]

In [None]:
def load_dialogue_data(file_path):
    """Read the dialogue CSV at ``file_path`` into a DataFrame.

    Prints the number of rows on success. Any exception raised while
    loading is reported with a printed message instead of propagating.

    Returns:
        The loaded DataFrame, or None when loading failed.
    """
    try:
        dialogue_frame = pd.read_csv(file_path)
        print(f"Data loaded successfully with {len(dialogue_frame)} rows.")
        return dialogue_frame
    except Exception as load_error:
        print(f"Error loading data: {load_error}")
        return None

In [None]:
def get_unique_words_by_season(character_series_words, top_n=10, min_count=2):
    """Rank the most character-specific words for each main cast member per season.

    For every season and every character in ``MAIN_CAST`` a score is computed
    for each word the character used at least ``min_count`` times:

        score = (character_count / season_total_count) * character_count

    where ``season_total_count`` is the word's count across all main cast
    members in that season. The score grows when the character uses the word
    often and the rest of the cast uses it rarely.

    Args:
        character_series_words: Mapping ``{character: {season: [word, ...]}}``.
            Seasons for which ``pd.isna`` is true are ignored.
        top_n: Maximum number of words kept per character per season.
        min_count: Minimum number of uses by the character within the season
            for a word to be scored.

    Returns:
        pd.DataFrame: One row per selected word with the columns ``id``,
        ``character``, ``season``, ``rank``, ``word``, ``count`` and
        ``uniqueness_score``. The columns are present even when no word
        qualifies, so callers can group or select on them safely.
    """
    # Fixed column order; passing it to the constructor keeps the schema
    # stable when there are no result rows at all.
    result_columns = ['id', 'character', 'season', 'rank', 'word', 'count', 'uniqueness_score']

    # Every season that appears for any character, excluding missing values.
    seasons = sorted({
        season
        for season_words in character_series_words.values()
        for season in season_words
        if not pd.isna(season)
    })

    all_results = []
    id_counter = 1

    for season in seasons:
        # Words spoken by each main cast member in this season (empty if none).
        season_character_words = {
            character: character_series_words.get(character, {}).get(season, [])
            for character in MAIN_CAST
        }

        # Word frequencies across the whole main cast for this season.
        season_word_freq = Counter()
        for words in season_character_words.values():
            season_word_freq.update(words)

        for character in MAIN_CAST:
            character_words = season_character_words[character]
            if not character_words:
                continue

            char_word_freq = Counter(character_words)

            # The season total always includes this character's own uses, so
            # it is never zero and the division is safe.
            uniqueness_scores = {
                word: count / season_word_freq[word] * count
                for word, count in char_word_freq.items()
                if count >= min_count
            }

            # Highest score first; the stable sort keeps first-seen order for ties.
            top_unique_words = sorted(
                uniqueness_scores.items(), key=lambda item: item[1], reverse=True
            )[:top_n]

            # Each word appears at most once per character per season.
            for rank, (word, score) in enumerate(top_unique_words, 1):
                all_results.append({
                    'id': id_counter,
                    'character': character,
                    'season': season,
                    'rank': rank,  # 1-based position within the character's top words
                    'word': word,
                    'count': char_word_freq[word],
                    'uniqueness_score': score
                })
                id_counter += 1

    return pd.DataFrame(all_results, columns=result_columns)

In [None]:
def process_dialogue_data(df):
    """Collect the cleaned words spoken by each main cast member per series.

    Args:
        df: DataFrame that must contain the columns ``person_scene``,
            ``dialogue`` and ``series``.

    Returns:
        dict: ``{character: {series: [word, ...]}}`` with an entry for every
        name in ``MAIN_CAST``, or None when a required column is missing or
        no row belongs to the main cast.
    """
    required_columns = ('person_scene', 'dialogue', 'series')
    if any(column not in df.columns for column in required_columns):
        print("Required columns not found in the dataframe.")
        return None

    # Keep only rows whose speaker is a main cast member; this also drops
    # rows such as scene descriptions.
    main_cast_rows = df[df['person_scene'].isin(MAIN_CAST)]

    if len(main_cast_rows) == 0:
        print("No dialogue found for the specified main cast. Check character names in the dataset.")
        # List the speakers that do exist to help with troubleshooting.
        print("Available characters in the dataset:", df['person_scene'].unique())
        return None

    print(f"Found {len(main_cast_rows)} dialogue lines for the main cast.")

    # One word list per character per series, filled in row order.
    words_by_character = {name: {} for name in MAIN_CAST}

    for _, record in main_cast_rows.iterrows():
        tokens = clean_text(record['dialogue'])
        per_series = words_by_character[record['person_scene']]
        per_series.setdefault(record['series'], []).extend(tokens)

    return words_by_character

In [None]:
def main():
    """Run the full pipeline and write the per-season unique-word CSV.

    Steps: create the ``output`` directory, load ``big_bang_scripts.csv``,
    collect the main cast's words per series, rank the most
    character-specific words, save them to ``output/testing.csv`` and check
    that no character/season pair exceeds the requested number of words.
    Progress and failures are reported with printed messages; nothing is
    returned.
    """
    # Words requested per character per season; also the verification limit,
    # so the two can never drift apart.
    top_n = 10

    # Create output directory
    os.makedirs('output', exist_ok=True)

    # Load the dataset
    file_path = 'big_bang_scripts.csv'
    dialogue_df = load_dialogue_data(file_path)
    if dialogue_df is None:
        print("Failed to load dialogue data.")
        return

    # Process the dialogue data
    character_series_words = process_dialogue_data(dialogue_df)
    if not character_series_words:
        print("Failed to process dialogue data.")
        return

    # Get unique words by season for each character
    season_uniqueness_df = get_unique_words_by_season(character_series_words, top_n=top_n)

    # Save the CSV file
    output_path = 'output/testing.csv'
    season_uniqueness_df.to_csv(output_path, index=False)
    print(f"Success! Saved file with {len(season_uniqueness_df)} entries to {output_path}")

    # An empty result has nothing to group; grouping it could fail on missing
    # columns or report a spurious warning, so stop here.
    if season_uniqueness_df.empty:
        print("No words met the selection criteria; nothing to verify.")
        return

    # Verify that each character has at most top_n words per season
    word_counts = season_uniqueness_df.groupby(['character', 'season']).size().reset_index(name='word_count')
    if word_counts['word_count'].max() <= top_n:
        print("All character-season combinations have the expected number of words.")
    else:
        print(f"WARNING: Some character-season combinations have more than {top_n} words!")
# Execute the main function only when this code runs as the top-level
# program (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()