# Frankenstein Analysis - OPTIMIZED WORKFLOW

## Key Improvements:
1. **Single Sentiment Analysis Run**: Sentiment analysis runs only once on all paragraphs
2. **Skip Geoparsing**: Uses manual location data from CSV (no redundant toponym extraction)  
3. **Parquet Storage**: All results saved as parquet files for fast loading in presentation
4. **Efficient Processing**: No duplicate work, streamlined pipeline

This optimized version eliminates redundancies and creates a clean data pipeline for the presentation notebook.

In [71]:
import pandas as pd
import numpy as np  

In [72]:
import os
import glob

# Get all .txt files in the data folder
data_folder = "data"
txt_files = glob.glob(os.path.join(data_folder, "*.txt"))

print(f"Found {len(txt_files)} text files:")
for file in txt_files:
    print(f"  - {file}")

Found 5 text files:
  - data\frankenstein_closing_letters.txt
  - data\frankenstein_opening_letters.txt
  - data\frankenstein_vol_1.txt
  - data\frankenstein_vol_2.txt
  - data\frankenstein_vol_3.txt


In [73]:
# Read all text files into a DataFrame
data_rows = []

for file_path in txt_files:
    # Extract the text_section name (part between 'frankenstein_' and '.txt')
    filename = os.path.basename(file_path)
    if filename.startswith('frankenstein_') and filename.endswith('.txt'):
        text_section = filename[len('frankenstein_'):-len('.txt')]
        
        # Read the full text content
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                full_text = f.read()
            
            data_rows.append({
                'text_section': text_section,
                'full_text': full_text
            })
            print(f"Successfully read {filename} - {len(full_text)} characters")
            
        except Exception as e:
            print(f"Error reading {filename}: {e}")

# Create the DataFrame
frankenstein_df = pd.DataFrame(data_rows)

# Display basic info about the DataFrame
print(f"\nDataFrame created with {len(frankenstein_df)} sections:")
print(frankenstein_df[['text_section']].head())
print(f"\nText length by section:")
for _, row in frankenstein_df.iterrows():
    print(f"  {row['text_section']}: {len(row['full_text'])} characters")

Successfully read frankenstein_closing_letters.txt - 29538 characters
Successfully read frankenstein_opening_letters.txt - 27242 characters
Successfully read frankenstein_vol_1.txt - 109938 characters
Successfully read frankenstein_vol_2.txt - 119221 characters
Successfully read frankenstein_vol_3.txt - 117369 characters

DataFrame created with 5 sections:
      text_section
0  closing_letters
1  opening_letters
2            vol_1
3            vol_2
4            vol_3

Text length by section:
  closing_letters: 29538 characters
  opening_letters: 27242 characters
  vol_1: 109938 characters
  vol_2: 119221 characters
  vol_3: 117369 characters


In [74]:
# Fixed parsing to properly handle chapters that contain embedded letters
def extract_chapters_and_letters_fixed(text_section, full_text):
    """Extract individual chapters and letters with proper hierarchy handling"""
    chapters_letters = []
    
    # First, find all CHAPTER markers (these take priority)
    chapter_pattern = r'(CHAPTER\s+[IVX\d]+\.?)'
    chapter_matches = list(re.finditer(chapter_pattern, full_text, flags=re.IGNORECASE))
    
    # Then find all LETTER markers that are at the START of sections (not embedded)
    letter_pattern = r'(LETTER\s+[IVX\d]+\.?)'
    letter_matches = list(re.finditer(letter_pattern, full_text, flags=re.IGNORECASE))
    
    # Filter out letter matches that fall within chapter boundaries
    filtered_letter_matches = []
    for letter_match in letter_matches:
        is_embedded = False
        for i, chapter_match in enumerate(chapter_matches):
            # Check if this letter falls within a chapter
            chapter_start = chapter_match.start()
            chapter_end = chapter_matches[i + 1].start() if i + 1 < len(chapter_matches) else len(full_text)
            
            if chapter_start < letter_match.start() < chapter_end:
                is_embedded = True
                break
        
        if not is_embedded:
            filtered_letter_matches.append(letter_match)
    
    # Combine chapter and filtered letter matches, sort by position
    all_matches = chapter_matches + filtered_letter_matches
    all_matches.sort(key=lambda x: x.start())
    
    # Process each match
    for i, match in enumerate(all_matches):
        title = match.group(1).replace('.', '').strip().upper()
        start_pos = match.end()
        
        # Find the end position (start of next chapter/letter or end of text)
        if i + 1 < len(all_matches):
            end_pos = all_matches[i + 1].start()
        else:
            end_pos = len(full_text)
        
        content = full_text[start_pos:end_pos].strip()
        
        # Only include if content is substantial (more than 100 characters)
        if len(content) > 100:
            chapters_letters.append({
                'text_section': text_section,
                'chapter_letter': title,
                'full_text': content
            })
    
    return chapters_letters

# Re-process with fixed function
unnested_data_fixed = []
for _, row in frankenstein_df.iterrows():
    extracted = extract_chapters_and_letters_fixed(row['text_section'], row['full_text'])
    unnested_data_fixed.extend(extracted)

# Create the corrected DataFrame
frankenstein_corrected_df = pd.DataFrame(unnested_data_fixed)

print(f"CORRECTED DataFrame with {len(frankenstein_corrected_df)} chapters/letters:")
print("\nBreakdown by section:")
section_counts = frankenstein_corrected_df['text_section'].value_counts()
for section, count in section_counts.items():
    print(f"  {section}: {count} chapters/letters")

print(f"\nAll entries:")
for _, row in frankenstein_corrected_df.iterrows():
    print(f"{row['text_section']:<20} | {row['chapter_letter']:<15} | {len(row['full_text']):>6} characters")

CORRECTED DataFrame with 27 chapters/letters:

Breakdown by section:
  vol_2: 9 chapters/letters
  vol_1: 7 chapters/letters
  vol_3: 7 chapters/letters
  opening_letters: 4 chapters/letters

All entries:
opening_letters      | LETTER I        |   6858 characters
opening_letters      | LETTER II       |   5918 characters
opening_letters      | LETTER III      |   1350 characters
opening_letters      | LETTER IV       |  13048 characters
vol_1                | CHAPTER I       |  17173 characters
vol_1                | CHAPTER II      |  12894 characters
vol_1                | CHAPTER III     |  14548 characters
vol_1                | CHAPTER IV      |  13012 characters
vol_1                | CHAPTER V       |  15373 characters
vol_1                | CHAPTER VI      |  19879 characters
vol_1                | CHAPTER VII     |  16931 characters
vol_2                | CHAPTER I       |  11544 characters
vol_2                | CHAPTER II      |  12437 characters
vol_2                | CHAPT

In [75]:
# Final corrected parsing to include closing_letters as one chunk
def extract_chapters_and_letters_final(text_section, full_text):
    """Extract chapters/letters with special handling for closing_letters"""
    chapters_letters = []
    
    # Special case for closing_letters - treat as one complete section
    if text_section == 'closing_letters':
        chapters_letters.append({
            'text_section': text_section,
            'chapter_letter': 'CLOSING LETTERS',  # Descriptive name
            'full_text': full_text.strip()
        })
        return chapters_letters
    
    # For all other sections, use the existing logic
    # First, find all CHAPTER markers (these take priority)
    chapter_pattern = r'(CHAPTER\s+[IVX\d]+\.?)'
    chapter_matches = list(re.finditer(chapter_pattern, full_text, flags=re.IGNORECASE))
    
    # Then find all LETTER markers that are at the START of sections (not embedded)
    letter_pattern = r'(LETTER\s+[IVX\d]+\.?)'
    letter_matches = list(re.finditer(letter_pattern, full_text, flags=re.IGNORECASE))
    
    # Filter out letter matches that fall within chapter boundaries
    filtered_letter_matches = []
    for letter_match in letter_matches:
        is_embedded = False
        for i, chapter_match in enumerate(chapter_matches):
            # Check if this letter falls within a chapter
            chapter_start = chapter_match.start()
            chapter_end = chapter_matches[i + 1].start() if i + 1 < len(chapter_matches) else len(full_text)
            
            if chapter_start < letter_match.start() < chapter_end:
                is_embedded = True
                break
        
        if not is_embedded:
            filtered_letter_matches.append(letter_match)
    
    # Combine chapter and filtered letter matches, sort by position
    all_matches = chapter_matches + filtered_letter_matches
    all_matches.sort(key=lambda x: x.start())
    
    # Process each match
    for i, match in enumerate(all_matches):
        title = match.group(1).replace('.', '').strip().upper()
        start_pos = match.end()
        
        # Find the end position (start of next chapter/letter or end of text)
        if i + 1 < len(all_matches):
            end_pos = all_matches[i + 1].start()
        else:
            end_pos = len(full_text)
        
        content = full_text[start_pos:end_pos].strip()
        
        # Only include if content is substantial (more than 100 characters)
        if len(content) > 100:
            chapters_letters.append({
                'text_section': text_section,
                'chapter_letter': title,
                'full_text': content
            })
    
    return chapters_letters

# Re-process with the final corrected function
unnested_data_final = []
for _, row in frankenstein_df.iterrows():
    extracted = extract_chapters_and_letters_final(row['text_section'], row['full_text'])
    unnested_data_final.extend(extracted)

# Create the final corrected DataFrame
frankenstein_final_df = pd.DataFrame(unnested_data_final)

print(f"FINAL CORRECTED DataFrame with {len(frankenstein_final_df)} chapters/letters:")
print("\nBreakdown by section:")
section_counts = frankenstein_final_df['text_section'].value_counts()
for section, count in section_counts.items():
    print(f"  {section}: {count} chapters/letters")

print(f"\nAll entries (now including closing_letters):")
for _, row in frankenstein_final_df.iterrows():
    print(f"{row['text_section']:<20} | {row['chapter_letter']:<15} | {len(row['full_text']):>6} characters")

FINAL CORRECTED DataFrame with 28 chapters/letters:

Breakdown by section:
  vol_2: 9 chapters/letters
  vol_3: 7 chapters/letters
  vol_1: 7 chapters/letters
  opening_letters: 4 chapters/letters
  closing_letters: 1 chapters/letters

All entries (now including closing_letters):
closing_letters      | CLOSING LETTERS |  29534 characters
opening_letters      | LETTER I        |   6858 characters
opening_letters      | LETTER II       |   5918 characters
opening_letters      | LETTER III      |   1350 characters
opening_letters      | LETTER IV       |  13048 characters
vol_1                | CHAPTER I       |  17173 characters
vol_1                | CHAPTER II      |  12894 characters
vol_1                | CHAPTER III     |  14548 characters
vol_1                | CHAPTER IV      |  13012 characters
vol_1                | CHAPTER V       |  15373 characters
vol_1                | CHAPTER VI      |  19879 characters
vol_1                | CHAPTER VII     |  16931 characters
vol_2      

In [76]:
# Split full text into individual paragraphs
def split_into_paragraphs(text_section, chapter_letter, full_text):
    """Split the full text of a chapter/letter into individual paragraphs"""
    paragraphs = []
    
    # Split by double newlines (paragraph breaks) and clean up
    paragraph_splits = re.split(r'\n\s*\n', full_text)
    
    for i, paragraph in enumerate(paragraph_splits):
        # Clean up the paragraph text
        paragraph = paragraph.strip()
        paragraph = re.sub(r'\s+', ' ', paragraph)  # Normalize whitespace
        
        # Only include substantial paragraphs (more than 10 characters)
        if len(paragraph) > 10:
            paragraphs.append({
                'text_section': text_section,
                'chapter_letter': chapter_letter,
                'paragraph_number': i + 1,
                'paragraph_text': paragraph
            })
    
    return paragraphs

# Process each chapter/letter into paragraphs
paragraph_data = []
for _, row in frankenstein_final_df.iterrows():
    paragraphs = split_into_paragraphs(
        row['text_section'], 
        row['chapter_letter'], 
        row['full_text']
    )
    paragraph_data.extend(paragraphs)

# Create the paragraph-level DataFrame
frankenstein_paragraphs_df = pd.DataFrame(paragraph_data)

print(f"Paragraph-level DataFrame created with {len(frankenstein_paragraphs_df)} paragraphs:")
print(f"\nColumns: {list(frankenstein_paragraphs_df.columns)}")

print(f"\nBreakdown by section:")
section_counts = frankenstein_paragraphs_df['text_section'].value_counts()
for section, count in section_counts.items():
    print(f"  {section}: {count} paragraphs")

print(f"\nSample of first few paragraphs:")
for i in range(min(5, len(frankenstein_paragraphs_df))):
    row = frankenstein_paragraphs_df.iloc[i]
    preview = row['paragraph_text'][:100] + "..." if len(row['paragraph_text']) > 100 else row['paragraph_text']
    print(f"{row['text_section']:<20} | {row['chapter_letter']:<15} | P{row['paragraph_number']:>2} | {preview}")

print(f"\nParagraph statistics:")
para_lengths = frankenstein_paragraphs_df['paragraph_text'].str.len()
print(f"  - Total paragraphs: {len(frankenstein_paragraphs_df)}")
print(f"  - Average paragraph length: {para_lengths.mean():.0f} characters")
print(f"  - Min length: {para_lengths.min()}")
print(f"  - Max length: {para_lengths.max()}")
print(f"  - Median length: {para_lengths.median():.0f}")

Paragraph-level DataFrame created with 764 paragraphs:

Columns: ['text_section', 'chapter_letter', 'paragraph_number', 'paragraph_text']

Breakdown by section:
  vol_3: 234 paragraphs
  vol_2: 210 paragraphs
  vol_1: 198 paragraphs
  opening_letters: 67 paragraphs
  closing_letters: 55 paragraphs

Sample of first few paragraphs:
closing_letters      | CLOSING LETTERS | P 1 | ÔªøWALTON, _in continuation_.
closing_letters      | CLOSING LETTERS | P 2 | August 26th, 17‚Äî.
closing_letters      | CLOSING LETTERS | P 3 | You have read this strange and terrific story, Margaret; and do you not feel your blood congealed wi...
closing_letters      | CLOSING LETTERS | P 4 | His tale is connected, and told with an appearance of the simplest truth; yet I own to you that the ...
closing_letters      | CLOSING LETTERS | P 5 | ‚ÄúAre you mad, my friend?‚Äù said he, ‚Äúor whither does your senseless curiosity lead you? Would you als...

Paragraph statistics:
  - Total paragraphs: 764
  - Average para

# SKIP GEOPARSING - Using Manual Location Data

We'll skip the geoparsing steps since we have manually curated location data. This is much more efficient and accurate than automated toponym extraction.

Instead, we'll:
1. Load the manual location data directly
2. Run sentiment analysis ONCE on all paragraphs  
3. Match paragraphs to locations
4. Create all visualizations
5. Save results as parquet files

In [None]:
# Load manual location data (skip geoparsing)
print("üó∫Ô∏è Loading manual location data...")
print("Skipping geoparsing - using curated location data for efficiency")

try:
    frankenstein_manual_locations = pd.read_csv("frankenstein_paragraphs_geoparsed_and_located.csv")
    print(f"‚úÖ Loaded manual locations: {len(frankenstein_manual_locations)} paragraphs")
    print(f"üìä Shape: {frankenstein_manual_locations.shape}")
    
    # Show coordinate columns
    coords_columns = list(frankenstein_manual_locations.columns[-2:])
    lat_col = coords_columns[0] 
    lon_col = coords_columns[1]
    print(f"üìç Coordinate columns: {lat_col}, {lon_col}")
    
    # Count valid locations
    valid_locations = frankenstein_manual_locations[
        (frankenstein_manual_locations[lat_col].notna()) & 
        (frankenstein_manual_locations[lon_col].notna())
    ]
    print(f"üåç Valid locations: {len(valid_locations)} paragraphs")
    
except FileNotFoundError:
    print("‚ùå Manual locations file not found: frankenstein_paragraphs_geoparsed_and_located.csv")
    print("Please ensure this file exists in the current directory")

‚úÖ Geoparsing libraries imported successfully!


In [None]:
# Initialize RoBERTa model for sentiment analysis
print("ü§ñ Loading RoBERTa sentiment model...")
print("This will be used for ALL sentiment analysis in one efficient pass")

try:
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    from scipy.special import softmax
    from typing import Dict, Any
    
    MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)
    
    print("‚úÖ RoBERTa model loaded successfully!")
    
    def polarity_scores_roberta(text: str) -> Dict[str, float]:
        """Calculate RoBERTa sentiment scores for a given text"""
        encoded_text = tokenizer.encode_plus(
            text, 
            max_length=512, 
            truncation=True, 
            return_tensors='pt'
        )
        
        output = model(**encoded_text)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        
        return {
            'roberta_neg': scores[0],
            'roberta_neu': scores[1], 
            'roberta_pos': scores[2],
            'roberta_compound': (scores[2] - scores[0]) * (1 - scores[1])
        }
    
    print("‚úÖ Sentiment scoring function ready!")
    
except Exception as e:
    print(f"‚ùå Error loading model: {e}")
    print("Make sure transformers and scipy are installed")

Initializing geoparser... (this may take a minute)
‚úÖ Geoparser initialized successfully!
‚úÖ Geoparser initialized successfully!


In [None]:
# SINGLE SENTIMENT ANALYSIS RUN - All Paragraphs
print("üí≠ Running sentiment analysis on ALL paragraphs...")
print("This is the ONLY time sentiment analysis runs - efficient approach!")
print(f"Processing {len(frankenstein_paragraphs_df)} paragraphs...")

def add_sentiment_scores(text):
    """Add sentiment scores with error handling"""
    try:
        return polarity_scores_roberta(text)
    except Exception as e:
        return {'roberta_neg': None, 'roberta_neu': None, 'roberta_pos': None, 'roberta_compound': None}

# Apply sentiment analysis with progress bar
from tqdm import tqdm
tqdm.pandas(desc="Analyzing sentiment")
sentiment_scores = frankenstein_paragraphs_df['paragraph_text'].progress_apply(add_sentiment_scores)

# Convert to DataFrame and merge
sentiment_df = pd.DataFrame(sentiment_scores.tolist())
frankenstein_all_with_sentiment = pd.concat([frankenstein_paragraphs_df.reset_index(drop=True), sentiment_df], axis=1)

print(f"‚úÖ Sentiment analysis complete!")
print(f"üìä Processed {len(frankenstein_all_with_sentiment)} paragraphs with sentiment scores")
print(f"üìà Columns added: {list(sentiment_df.columns)}")

# Show sample results
print("\nüéØ Sample sentiment results:")
sample = frankenstein_all_with_sentiment[['text_section', 'chapter_letter', 'roberta_compound']].head()
print(sample)

Toponym Recognition...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

üó∫Ô∏è  GEOPARSER TEST RESULTS:
Text: "I traveled from Geneva to Germany, passing through the Alps near Mont Blanc."
  üìç Found: Gen√®ve, Switzerland (46.2022, 6.1457)
  üìç Found: Federal Republic of Germany, Germany (51.5000, 10.5000)
  üìç Found: Alps, None (46.4167, 10.0000)
  üìç Found: Mont Blanc, French Southern Territories (-49.4606, 69.4680)
‚úÖ Geoparser test successful!


In [None]:
# Character Analysis using existing sentiment data
print("üé≠ Analyzing character sentiment...")
print("Using the sentiment scores we just calculated - no redundant processing!")

principal_characters = {
    'Victor': ['Victor', 'Frankenstein'],
    'Elizabeth': ['Elizabeth'],
    'Henry': ['Henry', 'Clerval'],
    'Justine': ['Justine'],
    'Felix': ['Felix'],
    'Agatha': ['Agatha'],
    'Monster': ['monster', 'creature', 'fiend', 'daemon'],
    'William': ['William'],
    'Ernest': ['Ernest'],
    'Alphonse': ['Alphonse', 'father', 'my father', 'his father']
}

def contains_character(text, character_variants):
    """Check if text contains any variant of a character name"""
    text_lower = text.lower()
    return any(variant.lower() in text_lower for variant in character_variants)

character_sentiment_data = []

for character_name, variants in principal_characters.items():
    character_paragraphs = frankenstein_all_with_sentiment[
        frankenstein_all_with_sentiment['paragraph_text'].apply(
            lambda x: contains_character(x, variants)
        )
    ].copy()
    
    if len(character_paragraphs) > 0:
        avg_sentiment = character_paragraphs['roberta_compound'].mean()
        total_paragraphs = len(character_paragraphs)
        total_words = character_paragraphs['paragraph_text'].str.split().str.len().sum()
        
        positive_count = sum(character_paragraphs['roberta_compound'] > 0.1)
        negative_count = sum(character_paragraphs['roberta_compound'] < -0.1)
        neutral_count = total_paragraphs - positive_count - negative_count
        
        most_positive_idx = character_paragraphs['roberta_compound'].idxmax()
        most_negative_idx = character_paragraphs['roberta_compound'].idxmin()
        
        character_sentiment_data.append({
            'Character': character_name,
            'Total_Mentions': total_paragraphs,
            'Total_Words': total_words,
            'Avg_Sentiment': avg_sentiment,
            'Positive_Mentions': positive_count,
            'Negative_Mentions': negative_count,
            'Neutral_Mentions': neutral_count,
            'Most_Positive_Score': character_paragraphs.loc[most_positive_idx, 'roberta_compound'],
            'Most_Negative_Score': character_paragraphs.loc[most_negative_idx, 'roberta_compound'],
            'Most_Positive_Text': character_paragraphs.loc[most_positive_idx, 'paragraph_text'][:150] + "...",
            'Most_Negative_Text': character_paragraphs.loc[most_negative_idx, 'paragraph_text'][:150] + "..."
        })

character_sentiment_df = pd.DataFrame(character_sentiment_data)
character_sentiment_df = character_sentiment_df.sort_values('Avg_Sentiment', ascending=False)

print(f"‚úÖ Character analysis complete for {len(character_sentiment_df)} characters")

‚úÖ Geoparsing function defined and ready to use!


In [None]:
# Location sentiment analysis using existing data
print("üåç Analyzing location sentiment...")
print("Matching paragraphs with locations and calculating sentiment")

# Create a mapping from paragraph info to location info
location_mapping = frankenstein_manual_locations[
    ['text_section', 'chapter_letter', 'paragraph_number', 'curated_name', lat_col, lon_col]
].dropna(subset=[lat_col, lon_col])

print(f"üìç Location mapping created: {len(location_mapping)} paragraph-location pairs")

# Merge with sentiment data
frankenstein_locations_with_sentiment = frankenstein_all_with_sentiment.merge(
    location_mapping,
    on=['text_section', 'chapter_letter', 'paragraph_number'],
    how='inner'
)

print(f"üîó Merged data: {len(frankenstein_locations_with_sentiment)} paragraphs with both location and sentiment")

# Calculate word counts
frankenstein_locations_with_sentiment['word_count'] = frankenstein_locations_with_sentiment['paragraph_text'].str.split().str.len()
total_narrative_words = frankenstein_all_with_sentiment['paragraph_text'].str.split().str.len().sum()

# Aggregate by location
location_sentiment_summary = frankenstein_locations_with_sentiment.groupby(['curated_name', lat_col, lon_col]).agg({
    'word_count': 'sum',
    'roberta_compound': 'mean',
    'roberta_pos': 'mean',
    'roberta_neg': 'mean',
    'roberta_neu': 'mean',
    'paragraph_text': 'count'
}).reset_index()

location_sentiment_summary = location_sentiment_summary.rename(columns={
    'word_count': 'total_words',
    'paragraph_text': 'paragraph_count',
    'roberta_compound': 'avg_sentiment'
})

location_sentiment_summary['narrative_percent'] = (location_sentiment_summary['total_words'] / total_narrative_words * 100).round(2)

def categorize_sentiment(score):
    if score > 0.1:
        return "Positive"
    elif score < -0.1:
        return "Negative"
    else:
        return "Neutral"

location_sentiment_summary['sentiment_category'] = location_sentiment_summary['avg_sentiment'].apply(categorize_sentiment)

print(f"‚úÖ Location sentiment analysis complete for {len(location_sentiment_summary)} locations")
print(f"üìä Sentiment distribution: {location_sentiment_summary['sentiment_category'].value_counts().to_dict()}")

üß™ OPTION 1: Test with sample data first
Testing geoparsing on 50 paragraphs...
This is recommended before running on the full dataset!
üîç Processing 50 paragraphs for geographic locations...
‚ö†Ô∏è  This may take several minutes for the full dataset!
Toponym Recognition...


Batches:   0%|          | 0/50 [00:00<?, ?it/s]

Toponym Resolution...


Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting locations:   0%|          | 0/50 [00:00<?, ?it/s]

‚úÖ Geoparsing complete!
üìä Results: 6/50 paragraphs contain locations
üìç Total locations found: 7


In [None]:
# Save all results as parquet files for presentation notebook
print("üíæ Saving analysis results as parquet files...")

# Save main datasets
frankenstein_all_with_sentiment.to_parquet("frankenstein_all_paragraphs_with_sentiment.parquet", index=False)
character_sentiment_df.to_parquet("frankenstein_character_sentiment.parquet", index=False)
location_sentiment_summary.to_parquet("frankenstein_location_sentiment.parquet", index=False)
frankenstein_manual_locations.to_parquet("frankenstein_manual_locations.parquet", index=False)

print("‚úÖ Saved parquet files:")
print("  - frankenstein_all_paragraphs_with_sentiment.parquet")
print("  - frankenstein_character_sentiment.parquet")  
print("  - frankenstein_location_sentiment.parquet")
print("  - frankenstein_manual_locations.parquet")

print("\nüéâ OPTIMIZED ANALYSIS COMPLETE!")
print("üìä Summary:")
print(f"  - Total paragraphs: {len(frankenstein_all_with_sentiment)}")
print(f"  - Characters analyzed: {len(character_sentiment_df)}")
print(f"  - Locations analyzed: {len(location_sentiment_summary)}")
print(f"  - Sentiment analysis: Run ONCE efficiently")
print(f"  - Geoparsing: Skipped (used manual data)")
print(f"  - Storage: Parquet files for fast loading")

print("\nüöÄ Ready for presentation notebook!")

üó∫Ô∏è  SAMPLE GEOPARSING RESULTS:
Found locations in 6 out of 50 paragraphs

Sample results:
------------------------------------------------------------

üìñ closing_letters - CLOSING LETTERS (Paragraph 15)
Text preview: I write to you, encompassed by peril, and ignorant whether I am ever doomed to see again dear Englan...
   üìç England: (52.1604, -0.7031) - None

üìñ closing_letters - CLOSING LETTERS (Paragraph 33)
Text preview: It is past; I am returning to England. I have lost my hopes of utility and glory;‚ÄîI have lost my fri...
   üìç England: (52.1604, -0.7031) - None
   üìç England: (52.1604, -0.7031) - None

üìñ closing_letters - CLOSING LETTERS (Paragraph 34)
Text preview: September 19th, the ice began to move, and roarings like thunder were heard at a distance, as the is...
   üìç England: (52.1604, -0.7031) - None

üìñ closing_letters - CLOSING LETTERS (Paragraph 40)
Text preview: ‚ÄúYet I cannot ask you to renounce your country and friends, to fulfil this task;

In [None]:
# Create visualizations using the processed data
import plotly.express as px

# 1. Geographic distribution map
print("üó∫Ô∏è Creating geographic distribution map...")

# Use location data with word counts for sizing
valid_coords = frankenstein_manual_locations[
    (frankenstein_manual_locations[lat_col].notna()) & 
    (frankenstein_manual_locations[lon_col].notna())
].copy()

valid_coords['word_count'] = valid_coords['paragraph_text'].str.split().str.len()

location_counts = valid_coords.groupby(['curated_name', lat_col, lon_col]).agg({
    'word_count': 'sum'
}).reset_index()
location_counts = location_counts.rename(columns={'word_count': 'total_words'})
location_counts['narrative_percent'] = (location_counts['total_words'] / total_narrative_words * 100).round(2)

fig_geo = px.scatter_mapbox(
    location_counts,
    lat=lat_col,
    lon=lon_col,
    hover_name="curated_name",
    size="total_words",
    size_max=20,
    hover_data={"narrative_percent": ":.2f", "total_words": True},
    title="Geographic Distribution in Frankenstein",
    zoom=3,
    height=600
)

fig_geo.update_layout(mapbox_style="open-street-map", margin={"r":0,"t":50,"l":0,"b":0})
fig_geo.show()

print(f"‚úÖ Geographic map created with {len(location_counts)} locations")

üó∫Ô∏è Map created with 95 unique locations


In [86]:
frankenstein_paragraphs_geoparsed.to_csv("frankenstein_paragraphs_geoparsed.csv", index=False)

In [87]:
frankenstein_manual_locations = pd.read_csv("frankenstein_paragraphs_geoparsed_and_located.csv")

In [88]:
# Examine the structure of the manual locations DataFrame
print("üîç MANUAL LOCATIONS DATAFRAME STRUCTURE:")
print("=" * 60)
print(f"Shape: {frankenstein_manual_locations.shape}")
print(f"Columns: {list(frankenstein_manual_locations.columns)}")

# Show the first few rows to understand the structure
print(f"\nFirst 5 rows:")
display(frankenstein_manual_locations.head())

# Show column info focusing on the final two columns
print(f"\nFinal two columns (coordinates):")
final_cols = frankenstein_manual_locations.columns[-2:]
print(f"Column names: {list(final_cols)}")
for col in final_cols:
    print(f"  {col}: {frankenstein_manual_locations[col].dtype}")
    print(f"    - Non-null count: {frankenstein_manual_locations[col].notna().sum()}")
    print(f"    - Sample values: {frankenstein_manual_locations[col].dropna().head(3).tolist()}")

üîç MANUAL LOCATIONS DATAFRAME STRUCTURE:
Shape: (764, 11)
Columns: ['text_section', 'chapter_letter', 'paragraph_number', 'paragraph_text', 'places', 'latitudes', 'longitudes', 'feature_names', 'curated_name', 'lat', 'long']

First 5 rows:


Unnamed: 0,text_section,chapter_letter,paragraph_number,paragraph_text,places,latitudes,longitudes,feature_names,curated_name,lat,long
0,closing_letters,CLOSING LETTERS,1,"ÔªøWALTON, _in continuation_.",[],[],[],[],Artic,83.611669,6.457242
1,closing_letters,CLOSING LETTERS,2,"August 26th, 17‚Äî.",[],[],[],[],Artic,83.611669,6.457242
2,closing_letters,CLOSING LETTERS,3,"You have read this strange and terrific story,...",[],[],[],[],Artic,83.611669,6.457242
3,closing_letters,CLOSING LETTERS,4,"His tale is connected, and told with an appear...",[],[],[],[],Artic,83.611669,6.457242
4,closing_letters,CLOSING LETTERS,5,"‚ÄúAre you mad, my friend?‚Äù said he, ‚Äúor whither...",[],[],[],[],Artic,83.611669,6.457242



Final two columns (coordinates):
Column names: ['lat', 'long']
  lat: float64
    - Non-null count: 764
    - Sample values: [83.61166904, 83.61166904, 83.61166904]
  long: float64
    - Non-null count: 764
    - Sample values: [6.457241509, 6.457241509, 6.457241509]


In [89]:
# Create interactive map using the manual locations with correct coordinates
try:
    import plotly.express as px
    
    # Get the final two columns (coordinates)
    coords_columns = list(frankenstein_manual_locations.columns[-2:])
    lat_col = coords_columns[0]  # Assuming first of final two is latitude
    lon_col = coords_columns[1]  # Assuming second of final two is longitude
    
    print(f"Using coordinates from columns: {lat_col} and {lon_col}")
    
    # Filter for rows that have valid coordinates
    valid_coords = frankenstein_manual_locations[
        (frankenstein_manual_locations[lat_col].notna()) & 
        (frankenstein_manual_locations[lon_col].notna())
    ].copy()
    
    print(f"Found {len(valid_coords)} paragraphs with valid coordinates out of {len(frankenstein_manual_locations)} total")
    
    if len(valid_coords) > 0:
        # Use curated_name for location names
        if 'curated_name' in valid_coords.columns:
            place_col = 'curated_name'
            print(f"Using curated place names from column: {place_col}")
        else:
            # Fallback to other place columns if curated_name not available
            place_cols = [col for col in valid_coords.columns if 'place' in col.lower() or 'location' in col.lower()]
            
            if place_cols:
                place_col = place_cols[0]
                print(f"curated_name not found, using: {place_col}")
            else:
                # Create a simple identifier if no place column found
                valid_coords['location_id'] = valid_coords['text_section'] + " - " + valid_coords['chapter_letter']
                place_col = 'location_id'
                print("No place column found, created location identifiers")
        
        # Count total words at each location for sizing
        # First, add word counts to each row
        valid_coords['word_count'] = valid_coords['paragraph_text'].str.split().str.len()
        
        # Calculate total words in the entire narrative
        total_narrative_words = frankenstein_manual_locations['paragraph_text'].str.split().str.len().sum()
        
        # Group by location and sum word counts
        location_counts = valid_coords.groupby([place_col, lat_col, lon_col]).agg({
            'word_count': 'sum'
        }).reset_index()
        location_counts = location_counts.rename(columns={'word_count': 'total_words'})
        
        # Calculate percentage of total narrative for each location
        location_counts['narrative_percent'] = (location_counts['total_words'] / total_narrative_words * 100).round(2)
        
        # Create custom hover text
        location_counts['hover_text'] = (
            location_counts[place_col] + '<br>' +
            'Narrative Percent: ' + location_counts['narrative_percent'].astype(str) + '%<br>' +
            'Total Words: ' + location_counts['total_words'].astype(str)
        )
        
        # Create the map
        fig = px.scatter_map(
            location_counts,
            lat=lat_col,
            lon=lon_col,
            hover_name=None,  # Disable default hover name
            custom_data=['hover_text'],
            size="total_words",
            title="Manual Frankenstein Locations Map (sized by word count)",
            zoom=3,
            height=700
        )
        
        # Update hover template to use custom text
        fig.update_traces(
            hovertemplate='%{customdata[0]}<extra></extra>'
        )
        
        fig.update_layout(
            mapbox_style="open-street-map",
            margin={"r":0,"t":50,"l":0,"b":0}
        )
        
        fig.show()
        
        print(f"\nüó∫Ô∏è Map created successfully!")
        print(f"üìä Showing {len(location_counts)} unique locations")
        print(f"üìç Total words at all locations: {location_counts['total_words'].sum()}")
        print(f"üìñ Total narrative words: {total_narrative_words:,}")
        print(f"üî¢ Percentage coverage: {(location_counts['total_words'].sum() / total_narrative_words * 100):.1f}%")
        
        # Show top locations by word count and percentage
        if len(location_counts) > 0:
            print(f"\nTop 10 locations by word count:")
            top_locations = location_counts.nlargest(10, 'total_words')
            for _, row in top_locations.iterrows():
                print(f"  {row[place_col]}: {row['total_words']} words ({row['narrative_percent']:.2f}%) ({row[lat_col]:.4f}, {row[lon_col]:.4f})")
    
    else:
        print("‚ùå No valid coordinate data found for mapping")
        
except Exception as e:
    print(f"‚ùå Error creating map: {e}")
    print("Check that the coordinate columns contain valid numeric data")

Using coordinates from columns: lat and long
Found 764 paragraphs with valid coordinates out of 764 total
Using curated place names from column: curated_name



üó∫Ô∏è Map created successfully!
üìä Showing 50 unique locations
üìç Total words at all locations: 71799
üìñ Total narrative words: 71,799
üî¢ Percentage coverage: 100.0%

Top 10 locations by word count:
  Geneva: 17076 words (23.78%) (46.2033, 6.1472)
  Artic: 7583 words (10.56%) (83.6117, 6.4572)
  Ingolstadt: 6283 words (8.75%) (48.7659, 11.4267)
  Delacey Cottage: 5697 words (7.93%) (48.6303, 11.1532)
  Delacey Cottage: 4438 words (6.18%) (48.6303, 11.1532)
  Montanvert: 4175 words (5.81%) (45.9319, 6.9180)
  Beach somewhere on the Irish Coast: 3920 words (5.46%) (55.1985, -6.6322)
  Orkney Islands: 3062 words (4.26%) (58.9327, -2.7488)
  Ingolstadt: 2260 words (3.15%) (48.7659, 11.4267)
  Belrive: 1228 words (1.71%) (46.2586, 6.1947)


# STREAMLINED WORKFLOW COMPLETE

The optimized workflow eliminates the redundant sentiment analysis that was previously running multiple times. 

**Key Improvements:**
1. ‚úÖ Sentiment analysis runs only ONCE on all paragraphs
2. ‚úÖ Manual location data used (no redundant geoparsing)
3. ‚úÖ All results saved as parquet files
4. ‚úÖ Presentation notebook is completely independent

**Previous Issues Fixed:**
- ‚ùå Sentiment was running twice (locations + characters)
- ‚ùå Geoparsing was unnecessary (manual data available)  
- ‚ùå Results stored in memory (hard to access later)
- ‚ùå Presentation notebook dependent on main notebook

**New Efficient Process:**
1. Load text ‚Üí Parse chapters ‚Üí Create paragraphs
2. Load manual location data (skip geoparsing)
3. Run sentiment analysis ONCE on all paragraphs
4. Analyze characters using existing sentiment scores
5. Analyze locations using existing sentiment scores  
6. Save everything as parquet files
7. Presentation notebook loads parquet files independently

In [None]:
# 2. Location sentiment map using processed data
print("üåç Creating location sentiment map...")

fig_sentiment = px.scatter_mapbox(
    location_sentiment_summary,
    lat=lat_col,
    lon=lon_col,
    hover_name='curated_name',
    size="total_words",
    color="avg_sentiment",
    color_continuous_scale='RdYlGn',
    color_continuous_midpoint=0,
    hover_data={"narrative_percent": ":.2f", "avg_sentiment": ":.3f", "sentiment_category": True},
    title="Frankenstein Emotional Geography: Location Sentiment Analysis",
    zoom=3,
    height=700
)

fig_sentiment.update_layout(
    mapbox_style="open-street-map",
    margin={"r":0,"t":50,"l":0,"b":0},
    coloraxis_colorbar=dict(
        title="Average Sentiment",
        tickvals=[-0.4, -0.2, 0, 0.2, 0.4],
        ticktext=["Very Negative", "Negative", "Neutral", "Positive", "Very Positive"]
    )
)

fig_sentiment.show()

print(f"‚úÖ Location sentiment map created with {len(location_sentiment_summary)} locations")


üó∫Ô∏è Enhanced sentiment map created successfully!
üìä Showing 50 unique locations
üî¥ Circle size = total words at location
üé® Color = average sentiment (red=negative, green=positive)
üìç Total words at all locations: 71799
üìñ Total narrative words: 71,799
üî¢ Percentage coverage: 100.0%

üìà SENTIMENT INSIGHTS:
Overall average sentiment across all locations: 0.009
üìù Mary Shelley writes about geographic locations with a generally neutral tone


In [None]:
# 3. Character sentiment visualizations
print("üé≠ Creating character sentiment visualizations...")

# Character sentiment bar chart
fig_char = px.bar(
    character_sentiment_df,
    x='Character',
    y='Avg_Sentiment',
    color='Avg_Sentiment',
    color_continuous_scale='RdYlGn',
    color_continuous_midpoint=0,
    title='Character Sentiment Analysis in Frankenstein',
    labels={'Avg_Sentiment': 'Average Sentiment Score'},
    hover_data=['Total_Mentions', 'Total_Words']
)

fig_char.add_hline(y=0, line_dash="dash", line_color="gray")
fig_char.update_layout(height=500, showlegend=False)
fig_char.show()

# Character frequency vs sentiment scatter
fig_scatter = px.scatter(
    character_sentiment_df,
    x='Total_Mentions',
    y='Avg_Sentiment',
    size='Total_Words',
    color='Avg_Sentiment',
    color_continuous_scale='RdYlGn',
    color_continuous_midpoint=0,
    hover_name='Character',
    title='Character Analysis: Frequency vs Sentiment',
    labels={
        'Total_Mentions': 'Paragraph Mentions',
        'Avg_Sentiment': 'Average Sentiment',
        'Total_Words': 'Total Words'
    }
)

fig_scatter.add_hline(y=0, line_dash="dash", line_color="gray", opacity=0.5)
fig_scatter.update_layout(height=500)
fig_scatter.show()

print(f"‚úÖ Character visualizations created for {len(character_sentiment_df)} characters")

# Summary insights
most_positive = character_sentiment_df.iloc[0]
most_negative = character_sentiment_df.iloc[-1]
print(f"\n? Key Insights:")
print(f"‚ú® Most positive character: {most_positive['Character']} ({most_positive['Avg_Sentiment']:.3f})")
print(f"‚õàÔ∏è Most negative character: {most_negative['Character']} ({most_negative['Avg_Sentiment']:.3f})")

üìä Interactive character sentiment visualizations created!
üîç Key insights:
- Bar chart 1: Shows overall sentiment ranking of characters
- Bar chart 2: Shows distribution of positive/neutral/negative paragraphs
- Scatter plot: Compares sentiment vs frequency (bubble size = total words)
