In [None]:
import os
import glob
import requests
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from google.colab import drive
# Best-effort unmount before remounting: any error raised here (for example
# when nothing is mounted yet) is deliberately ignored.
try:
    drive.flush_and_unmount()
except Exception:
    pass

drive.mount('/content/drive', force_remount=True)

# Download NLTK resources
nltk.download('punkt', quiet=True)
# NLTK 3.9+ loads the Punkt model used by word_tokenize from 'punkt_tab'
# instead of 'punkt'; fetch both so tokenization works on either version.
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

# ---------------- Normalization + Tokenization + Cleaning ---------------- #
import re, unicodedata

# Negations & intensifiers to preserve (important for sentiment).
# These are subtracted from the stopword list so they are never filtered out.
NEGATIONS = {"not", "no", "nor", "n't", "never", "none", "nobody", "nowhere", "neither"}
INTENSIFIERS = {"very", "so", "too", "quite", "rather", "really", "extremely", "highly", "barely", "hardly"}

# Auxiliaries for which the token pair "<aux> n't" is collapsed to a single "not".
_AUX_FOR_NT = {
    "do", "does", "did",
    "is", "are", "was", "were",
    "have", "has", "had",
    "will", "would", "can", "could", "should", "shall", "may", "might", "must",
}
# Stems the tokenizer leaves behind when it splits an irregular contraction:
# can't -> ca + n't, won't -> wo + n't, shan't -> sha + n't, ain't -> ai + n't.
# "ai" is included so that "ain't" collapses to "not" like the others instead
# of leaving a meaningless "ai" token in the output.
_AUX_NT_STEMS = {"ca", "wo", "sha", "ai"}

# Keep only alphabetic tokens a–z (drop ALL punctuation and numbers).
_ALPHA_RE = re.compile(r"^[a-z]+$")

def normalize_text(text: str) -> str:
    """
    Canonicalize a string before tokenization.

    Steps, in order:
    - apply Unicode NFKC normalization
    - map curly single/double quotes to their straight ASCII forms
    - lowercase
    - collapse every whitespace run to one space and trim both ends

    Anything that is not a str is handed back untouched.
    """
    if not isinstance(text, str):
        return text

    straight_quotes = str.maketrans({
        "\u2019": "'",
        "\u2018": "'",
        "\u201C": '"',
        "\u201D": '"',
    })
    cleaned = unicodedata.normalize("NFKC", text).translate(straight_quotes).lower()
    return re.sub(r"\s+", " ", cleaned).strip()

def tokenize(text: str):
    """Split already-normalized text into word tokens; non-strings give []."""
    return word_tokenize(text) if isinstance(text, str) else []

def _fix_contractions(tokens):
    """
    Rewrite negative contractions as a plain "not" and drop possessive clitics.

    Rules, applied left to right over the token list:
    - ['aux', "n't"] -> ['not'] when 'aux' is in _AUX_FOR_NT or _AUX_NT_STEMS
      (the auxiliary/stem itself is discarded).
    - any other "n't" (e.g. after "need") -> 'not', keeping the preceding
      token. Without this the "n't" token would reach remove_stopwords'
      a–z-only filter and be deleted, silently losing the negation.
    - "'s" / "’s" tokens are dropped.

    Returns a new list; the input list is not modified.
    """
    nt_forms = {"n't", "n’t"}
    possessives = {"'s", "’s"}

    fixed = []
    i = 0
    n = len(tokens)
    while i < n:
        t = tokens[i]
        # "aux n't" -> "not" (handle standard auxiliaries and stems)
        if i + 1 < n and tokens[i + 1] in nt_forms and (t in _AUX_FOR_NT or t in _AUX_NT_STEMS):
            fixed.append("not")
            i += 2
            continue
        # stray "n't" whose host word is not a known auxiliary: keep the negation
        if t in nt_forms:
            fixed.append("not")
            i += 1
            continue
        # drop possessive clitic
        if t in possessives:
            i += 1
            continue
        fixed.append(t)
        i += 1
    return fixed

# ---------------- Stopword setup ---------------- #
GIST_URL = "https://gist.githubusercontent.com/rishg2/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt"

def load_stopwords():
    """
    Build the stopword set used for filtering.

    Starts from NLTK's English stopword list, adds the extra words served at
    GIST_URL (one per line, blank lines ignored) when that download succeeds,
    and finally removes every word in NEGATIONS and INTENSIFIERS so those are
    never filtered out.

    If the download fails, a warning is printed and only the NLTK list is
    used, so a run that silently differs from an online run can be spotted.

    Returns:
        A set of lowercase stopwords.
    """
    merged = {w.strip().lower() for w in stopwords.words('english')}
    try:
        resp = requests.get(GIST_URL, timeout=15)
        resp.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: the extra list is optional, but say so instead of
        # quietly producing a different stopword set.
        print(f"Warning: could not download extra stopwords ({exc}); using the NLTK list only.")
    else:
        merged.update(
            line.strip().lower() for line in resp.text.splitlines() if line.strip()
        )
    return merged - NEGATIONS - INTENSIFIERS

# Built once when the cell runs; remove_stopwords() reads this module-level set.
stop_words = load_stopwords()

# ---------------- Stopword removal ---------------- #
# Destination folder for the filtered CSVs. It is created up front (no error
# if it already exists) so the processing loop can write into it.
output_dir = '/content/drive/MyDrive/digphil/dialogues_filtered'
os.makedirs(output_dir, exist_ok=True)

def remove_stopwords(text):
    """
    Clean one cell of text and return the surviving words joined by spaces.

    Pipeline: normalize_text → tokenize → _fix_contractions, then a token is
    kept only if it is (a) not a stopword, or is a negation/intensifier, and
    (b) made up solely of the letters a–z. Non-string input (NaN, numbers,
    None) is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    tokens = _fix_contractions(tokenize(normalize_text(text)))

    survivors = []
    for tok in tokens:
        key = tok.lower()
        protected = key in NEGATIONS or key in INTENSIFIERS
        # stopwords go, unless they are protected negations/intensifiers
        if key in stop_words and not protected:
            continue
        # punctuation, numbers and mixed tokens go as well
        if _ALPHA_RE.fullmatch(tok) is None:
            continue
        survivors.append(tok)

    return " ".join(survivors).strip()

# ---------------- Process all CSVs (CSV ONLY) ---------------- #
# Sorted so the processing order (and the log) is the same on every run;
# the order glob returns depends on the file system.
csv_paths = sorted(glob.glob('/content/drive/MyDrive/digphil/dialogues/chapter_*_dialogues.csv'))

for csv_path in csv_paths:
    df = pd.read_csv(csv_path)

    # Apply to every column (safe even if some columns are non-text:
    # remove_stopwords returns non-string values unchanged)
    for col in df.columns:
        df[col] = df[col].apply(remove_stopwords)

    # Drop rows that became entirely empty across all columns
    df = df.replace('', np.nan).dropna(how='all')

    # Same file name as the input, written into the filtered folder
    out_path = os.path.join(output_dir, os.path.basename(csv_path))
    df.to_csv(out_path, index=False)
    print(f"Filtered file saved: {out_path}")

if csv_paths:
    print("✅ All dialogue files processed and saved to 'dialogues_filtered' folder (CSV only).")
else:
    # Do not report success when the pattern matched nothing (wrong path,
    # Drive not mounted, ...).
    print("⚠️ No chapter_*_dialogues.csv files found; nothing was processed.")


Mounted at /content/drive
Filtered file saved: /content/drive/MyDrive/digphil/dialogues_filtered/chapter_1_dialogues.csv
Filtered file saved: /content/drive/MyDrive/digphil/dialogues_filtered/chapter_10_dialogues.csv
Filtered file saved: /content/drive/MyDrive/digphil/dialogues_filtered/chapter_11_dialogues.csv
Filtered file saved: /content/drive/MyDrive/digphil/dialogues_filtered/chapter_12_dialogues.csv
Filtered file saved: /content/drive/MyDrive/digphil/dialogues_filtered/chapter_13_dialogues.csv
Filtered file saved: /content/drive/MyDrive/digphil/dialogues_filtered/chapter_14_dialogues.csv
Filtered file saved: /content/drive/MyDrive/digphil/dialogues_filtered/chapter_15_dialogues.csv
Filtered file saved: /content/drive/MyDrive/digphil/dialogues_filtered/chapter_16_dialogues.csv
Filtered file saved: /content/drive/MyDrive/digphil/dialogues_filtered/chapter_17_dialogues.csv
Filtered file saved: /content/drive/MyDrive/digphil/dialogues_filtered/chapter_18_dialogues.csv
Filtered file s