<a href="https://colab.research.google.com/github/kamangirkhan/Data110/blob/main/ArashNateghian_Project4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Arash Nateghian

In [None]:
from dataclasses import dataclass
from collections import Counter
from typing import List, Tuple, Dict, Optional
import os, re, sys, glob, textwrap, csv

# --- Optional libraries (used if available) ---
try:
    from wordcloud import WordCloud
except Exception:
    WordCloud = None  # type: ignore

try:
    from readability import Readability
    _HAS_READABILITY = True
except Exception:
    _HAS_READABILITY = False

try:
    import textstat  # fallback only
except Exception:
    textstat = None  # type: ignore

# --- Output directory (works in script or notebook) ---
# A script run has __file__ in its module globals; a notebook/REPL session does
# not, so the current working directory is used as the base there.
if "__file__" in globals():
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
else:
    BASE_DIR = os.getcwd()
# All generated artifacts (word clouds, CSVs) go under <BASE_DIR>/output,
# which is created at import time if it does not exist yet.
OUTPUT_DIR = os.path.join(BASE_DIR, "output")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Simple English stopwords (lowercase) ---
# Built by whitespace-splitting the block below, so entries are single lowercase
# tokens. Besides the usual function words it lists contractions written with a
# straight apostrophe ("don't", "it's"), the bare fragments "s", "t" and "re",
# and a few ordinary words ("thank", "time", "today", "together") that are
# excluded from the top-word counts as well.
STOPWORDS = set(
    """
    a about above after again against all am an and any are aren't as at be because been before being
    below between both but by can can't cannot could couldn't did didn't do does doesn't doing don't down during each every
    few for from further had hadn't has hasn't have haven't having he he'd he'll he's her here here's hers herself
    him himself his how how's i i'd i'll i'm i've if in into is isn't it it's its itself let let's me more most much must mustn't
    my myself no nor not now of off on one once only or other ought our ours ourselves out over own re s same shan't she she'd
    she'll she's should shouldn't so some such t thank than that that's the their theirs them themselves then there there's
    these they they'd they'll they're they've this those through time to today together too under until up us very was wasn't we we'd we'll
    we're we've were weren't what what's when when's where where's which while will who who's whom why why's with won't
    would wouldn't you you'd you'll you're you've your yours yourself yourselves
    """.split()
)

# --- Regex helpers ---
# A word is a run of letters optionally continued by hyphen- or apostrophe-joined
# letter runs ("well-being", "don't", "nation's"); anything else that counts is a
# run of digits. The apostrophe (straight or curly) must stay inside the token:
# otherwise "don't" is split into "don" + "t" and "nation's" into "nation" + "s",
# so the contraction entries in the stopword list and POSSESSIVE_RE could never
# match, and word counts would be inflated by the stray fragments.
# A quote mark that is not followed by a letter (e.g. a closing quote) is not
# part of the word.
WORD_RE = re.compile(r"[A-Za-z]+(?:[-'’][A-Za-z]+)*|\d+")
# Captures everything before a trailing 's / ’s (either case) in group 1.
POSSESSIVE_RE = re.compile(r"(.*?)('s|’s)$", re.IGNORECASE)
# Sentence boundary: whitespace that directly follows ".", "!" or "?".
SENT_SPLIT_RE = re.compile(r"(?<=[.!?])\s+")

@dataclass
class AnalysisResult:
    """Summary of one analyzed text, as assembled by ``analyze_text``."""

    # Label shown in reports; the menu passes the source file's basename.
    name: str
    # Number of tokens produced by tokenize_words over the whole text (numbers included).
    total_words: int
    # Number of non-empty pieces returned by split_sentences.
    sentence_count: int
    # Mean tokens per sentence, rounded to 2 decimals; 0.0 when there are no sentences.
    avg_sentence_len: float
    # (word, count) pairs from Counter.most_common, excluding stopwords and all-digit tokens.
    top_words: List[Tuple[str, int]]
    # Scores keyed by metric name; this file only sets "flesch_kincaid_grade". Empty if none computed.
    readability: Dict[str, float]
    # Path of the saved word-cloud image, or None when no image was produced.
    wordcloud_path: Optional[str]
    # Path of the per-text top-words CSV.
    top_words_csv_path: Optional[str]

# --- Text processing ---

def strip_possessive(token: str) -> str:
    """Return ``token`` without a trailing ``'s`` / ``’s`` (either case).

    Tokens that do not end in a possessive suffix are returned unchanged.
    """
    possessive = POSSESSIVE_RE.match(token)
    if possessive is None:
        return token
    # Group 1 is everything in front of the suffix.
    return possessive.group(1)


def tokenize_words(text: str) -> List[str]:
    """Split ``text`` into lowercase word and number tokens.

    The text is lowercased, scanned with ``WORD_RE``, and every non-numeric
    token is passed through ``strip_possessive``; all-digit tokens are kept
    as they are.
    """
    return [
        tok if tok.isdigit() else strip_possessive(tok)
        for tok in WORD_RE.findall(text.lower())
    ]


def split_sentences(text: str) -> List[str]:
    """Split ``text`` into trimmed, non-empty sentences.

    Boundaries are the positions matched by ``SENT_SPLIT_RE``. Empty or
    whitespace-only input yields an empty list.
    """
    body = text.strip()
    if not body:
        return []
    trimmed = (piece.strip() for piece in SENT_SPLIT_RE.split(body))
    return [piece for piece in trimmed if piece]

# --- Core analytics ---

def compute_stats(text: str, stopwords: set, top_n: int = 7):
    """Compute word and sentence statistics for ``text``.

    Args:
        text: The raw text to analyze.
        stopwords: Tokens to leave out of the frequency count.
        top_n: How many of the most frequent content words to return.

    Returns:
        A tuple ``(total_words, sentence_count, avg_sentence_len, top, freq)``:
        the token count of the whole text, the number of sentences, the mean
        tokens per sentence rounded to 2 decimals (0.0 without sentences), the
        ``top_n`` most common content words as ``(word, count)`` pairs, and the
        full ``Counter`` of content words.
    """
    sentences = split_sentences(text)
    words = tokenize_words(text)

    sentence_count = len(sentences)
    if sentence_count:
        # Each sentence is re-tokenized with the same tokenizer as the full text.
        tokens_in_sentences = sum(len(tokenize_words(s)) for s in sentences)
        mean_len = tokens_in_sentences / sentence_count
    else:
        mean_len = 0.0
    avg_sentence_len = round(mean_len, 2)

    # Content words: neither all-digit tokens nor stopwords.
    freq = Counter(w for w in words if not (w.isdigit() or w in stopwords))
    return len(words), sentence_count, avg_sentence_len, freq.most_common(top_n), freq


def generate_wordcloud(freq: Counter, out_path: str) -> Optional[str]:
    """Render ``freq`` as a word-cloud image at ``out_path``.

    Returns:
        ``out_path`` when the image was written, or ``None`` when the optional
        ``wordcloud`` dependency is missing or rendering/saving raised.
    """
    if WordCloud is not None:
        try:
            cloud = WordCloud(width=1200, height=700, background_color="white")
            cloud.generate_from_frequencies(dict(freq))
            cloud.to_file(out_path)
        except Exception:
            # Best effort: the image is optional, so any failure just means "no image".
            return None
        else:
            return out_path
    return None


def compute_readability(text: str) -> Dict[str, float]:
    """Return readability scores for ``text`` keyed by metric name.

    The ``readability`` package is tried first when it was importable; if it
    is unavailable or raises, ``textstat`` is used as the fallback. The result
    holds at most the key ``"flesch_kincaid_grade"`` and is empty when neither
    library produced a score.
    """
    metric = "flesch_kincaid_grade"

    if _HAS_READABILITY:
        try:
            return {metric: float(Readability(text).flesch_kincaid().score)}
        except Exception:
            # Fall through to the textstat fallback below.
            pass

    if textstat is not None:
        try:
            return {metric: float(textstat.flesch_kincaid_grade(text))}
        except Exception:
            # Scores are optional; report "no score" rather than failing the analysis.
            pass

    return {}

# --- Utilities ---

def slugify(name: str) -> str:
    """Turn ``name`` into a lowercase, filename-friendly slug.

    Runs of characters other than ASCII letters, digits, ``_`` and ``-``
    become a single ``-``; repeated dashes are collapsed and leading/trailing
    dashes removed. If nothing is left, ``"address"`` is returned.
    """
    dashed = re.sub(r"[^a-zA-Z0-9_-]+", "-", name.strip())
    collapsed = re.sub(r"-+", "-", dashed)
    slug = collapsed.strip("-").lower()
    if not slug:
        return "address"
    return slug


def read_text_file(path: str) -> str:
    """Read ``path`` as UTF-8 text and return its full contents.

    The ``utf-8-sig`` codec is used so that a leading byte-order mark (as
    written by some Windows editors) is dropped instead of ending up as a
    stray U+FEFF character at the start of the text; files without a BOM are
    decoded exactly as plain UTF-8.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(path, "r", encoding="utf-8-sig") as f:
        return f.read()


def save_top_words_csv(slug: str, top: List[Tuple[str, int]]) -> str:
    """Write ``top`` to ``<OUTPUT_DIR>/<slug>_top_words.csv`` and return that path.

    The file gets a ``word,count`` header followed by one row per
    ``(word, count)`` pair, in the order given.
    """
    csv_path = os.path.join(OUTPUT_DIR, f"{slug}_top_words.csv")
    with open(csv_path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle)
        writer.writerow(["word", "count"])
        writer.writerows([word, count] for word, count in top)
    return csv_path


def print_report(res: AnalysisResult) -> None:
    """Print a human-readable report for one analysis result to stdout.

    The report lists the word/sentence totals, the top words, the
    Flesch–Kincaid grade (when present) and where the word cloud and
    top-words CSV were saved.
    """
    report = [
        "\n" + "=" * 72,
        f"Analysis: {res.name}",
        "-" * 72,
        f"Total words:           {res.total_words}",
        f"Sentence count:        {res.sentence_count}",
        f"Avg words/sentence:    {res.avg_sentence_len}",
        "\nTop 7 words (ex stopwords & numbers):",
    ]

    if res.top_words:
        report.extend(f"  {word:<16} {count}" for word, count in res.top_words)
    else:
        report.append("  (none)")

    report.append("\nReadability:")
    if res.readability:
        grade = res.readability.get("flesch_kincaid_grade")
        # A non-empty dict without this key prints nothing further here.
        if grade is not None:
            report.append(
                f"Flesch–Kincaid Grade: {grade:.2f}\n"
                f"This means the text is written at about a US grade {round(grade)} reading level.\n"
                "Higher scores = harder text. Lower scores = easier to read.\n"
                "This helps judge how understandable a speech or essay is for the general public."
            )
    else:
        report.append("  (readability libraries unavailable)")

    if res.wordcloud_path:
        report.append(f"\nWord cloud saved:      {res.wordcloud_path}")
    else:
        report.append("\nWord cloud:            (skipped — install 'wordcloud')")
    if res.top_words_csv_path:
        report.append(f"Top-words CSV:         {res.top_words_csv_path}")

    print("\n".join(report))

# --- Folder/file selection ---

def choose_from_folder(folder: str) -> List[str]:
    """List the ``.txt`` files in ``folder`` and let the user pick some of them.

    The files are shown as a numbered list (sorted by path, so the numbering
    is the same on every run) and the user enters either ``a`` for all files
    or comma-separated numbers.

    Args:
        folder: Directory to look in (not searched recursively).

    Returns:
        The selected paths in the order the user entered them; an empty list
        when the folder has no ``.txt`` files or nothing valid was selected.
    """
    # glob.escape keeps "[", "]", "*" and "?" in the folder name literal;
    # unescaped, a folder such as "speeches[2024]" would match nothing.
    txts = sorted(glob.glob(os.path.join(glob.escape(folder), "*.txt")))
    if not txts:
        print("No .txt files found.")
        return []
    print("Files found:")
    for i, p in enumerate(txts, 1):
        print(f"  {i}) {os.path.basename(p)}")
    sel = input("Enter numbers (comma) or 'a' for all: ").strip().lower()
    if sel == "a":
        return txts

    chosen: List[str] = []
    ignored: List[str] = []
    for part in sel.split(","):
        part = part.strip()
        if not part:
            continue  # stray comma
        try:
            idx = int(part) - 1  # menu numbers are 1-based
        except ValueError:
            ignored.append(part)
            continue
        if 0 <= idx < len(txts):
            chosen.append(txts[idx])
        else:
            ignored.append(part)
    if ignored:
        # Tell the user what was dropped instead of discarding it silently.
        print(f"  ! Ignored invalid selection(s): {', '.join(ignored)}")
    return chosen


def pick_files_prompt() -> List[str]:
    """Ask the user for comma-separated file paths and return the existing ones.

    Surrounding whitespace and quote characters are removed from each entry.
    Entries that do not name an existing file are reported and skipped, so a
    mistyped path is visible to the user instead of vanishing silently.

    Returns:
        The paths that exist as files, in the order entered; an empty list
        when the user just presses ENTER or no entry is a file.
    """
    raw = input("\nEnter .txt paths (comma-separated), or press ENTER to cancel:\n> ").strip()
    if not raw:
        return []
    found: List[str] = []
    for entry in raw.split(","):
        candidate = entry.strip().strip('"\'')
        if not candidate:
            continue  # stray comma
        if os.path.isfile(candidate):
            found.append(candidate)
        else:
            print(f"  ! Skipped (not a file): {candidate}")
    return found

# --- Main analysis wrapper ---

def analyze_text(name: str, text: str, stopwords: set) -> AnalysisResult:
    """Run the full analysis for one text and save its artifacts.

    Computes the statistics, writes the word-cloud image (when possible) and
    the top-words CSV under ``OUTPUT_DIR`` using a slug derived from ``name``,
    and computes the readability scores.

    Args:
        name: Label for the text; also the basis of the output file names.
        text: The raw text to analyze.
        stopwords: Tokens excluded from the word-frequency count.

    Returns:
        An ``AnalysisResult`` holding the statistics and artifact paths.
    """
    word_total, sentence_total, mean_sentence_len, top_words, frequencies = compute_stats(
        text, stopwords
    )
    file_stem = slugify(name)

    # Artifacts are written in this order: word cloud first, then the CSV.
    cloud_target = os.path.join(OUTPUT_DIR, f"{file_stem}_wordcloud.png")
    cloud_path = generate_wordcloud(frequencies, cloud_target)
    csv_path = save_top_words_csv(file_stem, top_words)

    readability_scores = compute_readability(text)

    return AnalysisResult(
        name=name,
        total_words=word_total,
        sentence_count=sentence_total,
        avg_sentence_len=mean_sentence_len,
        top_words=top_words,
        readability=readability_scores,
        wordcloud_path=cloud_path,
        top_words_csv_path=csv_path,
    )

# --- Menu loop ---

def _append_session_row(res: "AnalysisResult") -> None:
    """Append one result as a row of ``<OUTPUT_DIR>/session_summary.csv``."""
    row = {
        "name": res.name,
        "total_words": str(res.total_words),
        "sentence_count": str(res.sentence_count),
        "avg_sentence_len": f"{res.avg_sentence_len}",
        "fk_grade": f"{res.readability.get('flesch_kincaid_grade', '')}",
        "wordcloud_path": res.wordcloud_path or "",
    }
    path_csv = os.path.join(OUTPUT_DIR, "session_summary.csv")
    # The file is appended to across runs; the header is needed only once,
    # i.e. when the file is new or exists but is still empty.
    write_header = not os.path.exists(path_csv) or os.path.getsize(path_csv) == 0
    with open(path_csv, "a", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=list(row))
        if write_header:
            w.writeheader()
        w.writerow(row)


def _analyze_paths(paths: List[str], stops: set, analyzed: "List[AnalysisResult]") -> None:
    """Analyze each file in ``paths``, print its report and log it to the session CSV.

    Results are appended to ``analyzed``; unreadable files are reported and skipped.
    """
    for p in paths:
        try:
            text = read_text_file(p)
        except (OSError, ValueError) as e:
            # ValueError covers UnicodeDecodeError for files that are not valid UTF-8.
            print(f"  ! Failed to read {p}: {e}")
            continue
        res = analyze_text(os.path.basename(p), text, stops)
        analyzed.append(res)
        print_report(res)
        _append_session_row(res)


def menu_loop():
    """Run the interactive text menu until the user chooses to quit.

    Choices: ``1`` analyzes files given by path, ``2`` prints a summary of the
    analyses done in this session, ``3`` quits, ``4`` analyzes files picked
    from a folder. Every analysis is reported on stdout and appended to the
    session summary CSV.
    """
    stops = STOPWORDS
    analyzed: List[AnalysisResult] = []

    while True:
        print("\n" + "="*72)
        print("Address Analyzer — Menu")
        print("="*72)
        print("1) Analyze .txt file(s)")
        print("2) Show session summary")
        print("3) Quit")
        print("4) Analyze .txt files from a folder")
        choice = input("> ").strip()

        if choice == "1":
            paths = pick_files_prompt()
            if paths:
                _analyze_paths(paths, stops, analyzed)

        elif choice == "2":  # summary
            if not analyzed:
                print("No analyses yet.")
                continue
            print("\nSession summary (most recent first):")
            for res in reversed(analyzed):
                top1 = res.top_words[0][0] if res.top_words else "—"
                print(f"- {res.name}: words={res.total_words}, avg_wps={res.avg_sentence_len}, top1={top1}")

        elif choice == "3":
            print("Bye.")
            break

        elif choice == "4":  # folder flow
            folder = input("Enter folder path: ").strip()
            if not os.path.isdir(folder):
                print("Invalid folder.")
                continue
            paths = choose_from_folder(folder)
            if not paths:
                print("No files selected.")
                continue
            _analyze_paths(paths, stops, analyzed)

        else:
            print("Invalid choice. Try 1-4.")

# --- Entry point ---
# Start the interactive menu only when this code is executed directly
# (module name "__main__"), not when it is imported.
if __name__ == "__main__":
    try:
        menu_loop()
    except KeyboardInterrupt:
        # Ctrl+C at any prompt ends the session with a short message instead of a traceback.
        print("\nInterrupted. Bye.")



Address Analyzer — Menu
1) Analyze .txt file(s)
2) Show session summary
3) Quit
4) Analyze .txt files from a folder


>  4
Enter folder path:  D:\MC\PY\64B


Files found:
  1) Barack Obama 1st.txt
  2) Barack Obama 2nd.txt
  3) Donald J Trump 1st.txt
  4) Donald J Trump 2nd.txt
  5) George Bush.txt
  6) George W. Bush 1st.txt
  7) George W. Bush 2nd.txt
  8) Joseph R Biden Jr.txt
  9) Ronald Reagan 1st.txt
  10) Ronald Reagan 2nd.txt
  11) William J Clinton 2nd.txt
  12) William J. Clinton 1st.txt


Enter numbers (comma) or 'a' for all:  1,3,5,7



Analysis: Barack Obama 1st.txt
------------------------------------------------------------------------
Total words:           2417
Sentence count:        121
Avg words/sentence:    19.98

Top 7 words (ex stopwords & numbers):
  nation           12
  new              11
  america          10
  people           7
  less             7
  world            7
  work             6

Readability:
Flesch–Kincaid Grade: 9.80
This means the text is written at about a US grade 10 reading level.
Higher scores = harder text. Lower scores = easier to read.
This helps judge how understandable a speech or essay is for the general public.

Word cloud saved:      C:\Users\arash\output\barack-obama-1st-txt_wordcloud.png
Top-words CSV:         C:\Users\arash\output\barack-obama-1st-txt_top_words.csv

Analysis: Donald J Trump 1st.txt
------------------------------------------------------------------------
Total words:           1487
Sentence count:        90
Avg words/sentence:    16.52

Top 7 words (ex sto