In [None]:
# === 0) Mount Google Drive ===
# Colab-only setup: (re)mount Google Drive at /content/drive.
from google.colab import drive
try:
    # Best-effort: release any existing mount before remounting. Errors are
    # ignored on purpose so the mount call below always runs.
    drive.flush_and_unmount()
except Exception:
    pass
drive.mount('/content/drive', force_remount=True)

# === 1) Set project working directory on Drive ===
# Relative paths used later in this file (e.g. the 'dialogues_filtered'
# input folder) resolve against this directory.
import os
os.chdir('/content/drive/MyDrive/digphil')
print("‚úÖ Working in:", os.getcwd())

# === 2) Imports ===
import pandas as pd
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re

# === 3) NLTK data ===
nltk.download('stopwords', quiet=True)

# Tokenizer resources to ensure are present, as
# (lookup path, download id, ignore download errors?).
# A resource is downloaded only when the lookup fails. Download errors
# propagate for 'punkt' but are swallowed (best-effort) for 'punkt_tab'.
tokenizer_resources = (
    ('tokenizers/punkt', 'punkt', False),
    ('tokenizers/punkt_tab', 'punkt_tab', True),
)
for lookup_path, download_id, ignore_errors in tokenizer_resources:
    try:
        nltk.data.find(lookup_path)
    except LookupError:
        try:
            nltk.download(download_id, quiet=True)
        except Exception:
            if not ignore_errors:
                raise

# === 4) Stopwords ===
# Held as a set so the per-token membership test in the chapter loop is O(1).
stop_words = {*stopwords.words('english')}

# === 5) Folders on Drive ===
# Input folder is relative to the current working directory; the output
# folder is an absolute Drive path and is created if it does not exist yet.
dialogue_dir = 'dialogues_filtered'
output_dir = '/content/drive/MyDrive/digphil/wordclouds'
os.makedirs(output_dir, exist_ok=True)

# === 6) Helper: safe tokenizer ===
def safe_tokenize(text: str):
    """Split *text* into tokens, preferring NLTK's ``word_tokenize``.

    If ``word_tokenize`` raises ``LookupError`` (its resources are missing),
    the result of ``wordpunct_tokenize`` is returned instead.
    """
    try:
        tokens = word_tokenize(text)
    except LookupError:
        tokens = wordpunct_tokenize(text)
    return tokens

# === 7) Collect outputs for CSVs ===
summary_rows = []             # one row per chapter: its top words, comma-separated
long_rows = []                # long-form: one row per (chapter, word, frequency), top 50

# Compiled once, outside the loop: a token counts as a word only if it
# consists entirely of lowercase ASCII letters.
word_pattern = re.compile(r'[a-z]+')


def _read_dialogue_text(filepath):
    """Return all dialogue text in the CSV at *filepath* as one string.

    Uses the 'dialogue' column when present, otherwise the first column.
    Returns '' when the file has no columns at all, so the caller's
    "no content" check handles it instead of the run crashing.
    """
    try:
        # keep_default_na=False keeps cells such as "None", "NA" or "null" as
        # the literal text they contain instead of converting them to NaN.
        df = pd.read_csv(filepath, encoding='utf-8', on_bad_lines='skip',
                         keep_default_na=False)
    except pd.errors.EmptyDataError:
        # Nothing to parse (e.g. a zero-byte file).
        return ''

    if 'dialogue' in df.columns:
        column = df['dialogue']
    elif df.shape[1] > 0:
        # fall back to first column
        column = df.iloc[:, 0]
    else:
        return ''

    # Drop any remaining missing values: astype(str) would turn them into the
    # string 'nan', which would then be counted as a word.
    return ' '.join(column.dropna().astype(str))


# === 8) Generate word clouds and CSV details for chapters 1-19 ===
for i in range(1, 20):
    filename = f'chapter_{i}_dialogues.csv'
    filepath = os.path.join(dialogue_dir, filename)

    if not os.path.isfile(filepath):
        print(f"‚ö†Ô∏è  Warning: {filename} not found.")
        continue

    # Read and combine text
    text = _read_dialogue_text(filepath)

    # Tokenize & clean (lowercase, keep a-z only, remove stopwords)
    tokens = safe_tokenize(text.lower())
    words = [w for w in tokens if word_pattern.fullmatch(w) and w not in stop_words]

    # Count top 50 words
    top50 = Counter(words).most_common(50)
    if not top50:
        print(f"‚ö†Ô∏è  No content for Chapter {i}.")
        continue

    # Build frequency dict for word cloud
    freq_dict = dict(top50)

    # Generate word cloud (high-res)
    wc = WordCloud(width=1600, height=800, background_color='white', colormap='viridis')
    wc.generate_from_frequencies(freq_dict)

    # Save image
    out_img = os.path.join(output_dir, f'chapter_{i}.png')
    wc.to_file(out_img)
    print(f"‚úÖ  Saved word cloud for Chapter {i} ‚Üí {out_img}")

    # Add compact summary row (top words only, comma-separated)
    summary_rows.append({
        'chapter': f'chapter_{i}',
        'most_frequent_words': ', '.join(w for w, _ in top50)
    })

    # Add long-form rows (word + frequency)
    long_rows.extend({'chapter': i, 'word': w, 'frequency': c} for w, c in top50)

# === 9) Save CSVs to Drive ===
# Columns are named explicitly so both frames keep their schema when no chapter
# produced any rows; otherwise sort_values below raises KeyError on an empty,
# column-less frame.
summary_df = pd.DataFrame(summary_rows, columns=['chapter', 'most_frequent_words'])
summary_csv_path = '/content/drive/MyDrive/digphil/chapter_word_summary.csv'
summary_df.to_csv(summary_csv_path, index=False)

long_df = pd.DataFrame(long_rows, columns=['chapter', 'word', 'frequency']).sort_values(
    ['chapter', 'frequency'], ascending=[True, False])
long_csv_path = '/content/drive/MyDrive/digphil/chapter_word_frequencies_top50.csv'
long_df.to_csv(long_csv_path, index=False)

print("\nüé®  All word clouds saved in /MyDrive/digphil/wordclouds/")
print(f"üßæ  Summary saved to: {summary_csv_path}")
print(f"üìä  Detailed frequencies saved to: {long_csv_path}")


Mounted at /content/drive
‚úÖ Working in: /content/drive/MyDrive/digphil
‚úÖ  Saved word cloud for Chapter 1 ‚Üí /content/drive/MyDrive/digphil/wordclouds/chapter_1.png
‚úÖ  Saved word cloud for Chapter 2 ‚Üí /content/drive/MyDrive/digphil/wordclouds/chapter_2.png
‚úÖ  Saved word cloud for Chapter 3 ‚Üí /content/drive/MyDrive/digphil/wordclouds/chapter_3.png
‚úÖ  Saved word cloud for Chapter 4 ‚Üí /content/drive/MyDrive/digphil/wordclouds/chapter_4.png
‚úÖ  Saved word cloud for Chapter 5 ‚Üí /content/drive/MyDrive/digphil/wordclouds/chapter_5.png
‚úÖ  Saved word cloud for Chapter 6 ‚Üí /content/drive/MyDrive/digphil/wordclouds/chapter_6.png
‚úÖ  Saved word cloud for Chapter 7 ‚Üí /content/drive/MyDrive/digphil/wordclouds/chapter_7.png
‚úÖ  Saved word cloud for Chapter 8 ‚Üí /content/drive/MyDrive/digphil/wordclouds/chapter_8.png
‚úÖ  Saved word cloud for Chapter 9 ‚Üí /content/drive/MyDrive/digphil/wordclouds/chapter_9.png
‚úÖ  Saved word cloud for Chapter 10 ‚Üí /content/drive/MyDrive