In [None]:
# Mount Google Drive at /content/drive (this import is only available on Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Colab's magic installs into the same interpreter you're running
# NOTE(review): versions are unpinned, so a fresh runtime may resolve different
# releases of these packages — consider pinning once known-good versions are confirmed.
%pip install -q requests beautifulsoup4 openai

In [None]:
cd /content/drive/MyDrive/Northern_Ireland_ScriptBot

/content/drive/MyDrive/Northern_Ireland_ScriptBot


In [None]:
import sys, textwrap
from pathlib import Path
from typing   import List, Tuple
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from openai import OpenAI

# ───────────────────────────────── edit here ────────────────────────────────
# Group-1 sources
#   URLS_FILE_*: text file with one URL per line; blank lines and lines that
#                start with "#" are skipped. A missing file yields no URLs.
#   FILES_DIR_*: folder whose direct child files are read as text. A missing
#                folder yields no files.
URLS_FILE_1 : str  = "urls1.txt"
FILES_DIR_1 : str = "texts1"
# Group-2 sources
# NOTE(review): these point at the same file and folder as group 1, so both
# groups will hold identical content — confirm this is intended.
URLS_FILE_2 : str = "urls1.txt"
FILES_DIR_2 : str  = "texts1"
# Question you want ChatGPT to answer (appended to the end of the prompt)
QUESTION: str = "Compare the two source groups – how do they differ and where do they overlap? Or they are just same?"
# OpenAI
# Leave API_KEY empty to construct the client without an explicit key (it then
# relies on its own environment configuration); otherwise set it to "sk-…".
# Avoid committing a real key with the notebook.
API_KEY : str  = ""
MODEL   : str       = "o1"
# Limits: maximum number of characters kept from each fetched page / local file
MAX_CHARS_PER_SITE = 8_000
MAX_CHARS_PER_FILE = 8_000
# ────────────────────────────────────────────────────────────────────────────

# Request headers sent with every page download.
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; scrape_and_query/1.0)"}

# ─────────────────────────── generic helpers ────────────────────────────────
def read_lines(path: Path) -> List[str]:
    """Return the meaningful lines of a UTF-8 text file.

    Every line is stripped of surrounding whitespace. Blank lines and lines
    whose first non-blank character is ``#`` are left out.

    Args:
        path: File to read.

    Returns:
        The remaining lines, in file order.
    """
    kept: List[str] = []
    for raw in path.read_text(encoding="utf-8").splitlines():
        entry = raw.strip()
        # Skip empty lines and comment lines.
        if not entry or entry.startswith("#"):
            continue
        kept.append(entry)
    return kept

def load_urls(urls_file: str  ) -> List[str]:
    """Load the URL list for one source group.

    Args:
        urls_file: Path of a text file with one URL per line. An empty value
            means "no URL list".

    Returns:
        The lines produced by ``read_lines`` for that file, or an empty list
        when ``urls_file`` is empty or does not name a regular file.
    """
    if not urls_file:
        return []
    source = Path(urls_file)
    # is_file() rather than exists(): a directory also "exists", but trying to
    # read it as text would raise instead of yielding an empty list.
    if not source.is_file():
        return []
    return read_lines(source)

def load_files(files_dir: str  ) -> List[Path]:
    """List the regular files directly inside a folder.

    Args:
        files_dir: Folder to scan (not recursive). An empty value means
            "no folder".

    Returns:
        The paths of the folder's direct child files in sorted order, or an
        empty list when ``files_dir`` is empty or is not a directory.
    """
    if not files_dir:
        return []
    folder = Path(files_dir)
    if not folder.is_dir():
        return []
    # iterdir() yields entries in arbitrary order; sorting keeps the generated
    # prompt identical from run to run.
    return sorted(entry for entry in folder.iterdir() if entry.is_file())

def fetch_url_text(url: str, max_chars: int) -> Tuple[str, List[str]]:
    """Download a page and return its extracted text plus its image URLs.

    Args:
        url: Address of the page to download.
        max_chars: Maximum number of characters of page text to keep.

    Returns:
        A ``(text, image_urls)`` pair. ``text`` is the page text with pieces
        separated by single spaces, cut to ``max_chars``. ``image_urls`` holds
        the ``src`` of every ``<img>`` tag resolved against ``url``.
        If the request fails for any reason (including an HTTP error status),
        ``text`` is an ``"[Error fetching …]"`` message and the list is empty.
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=15)
        resp.raise_for_status()
    except Exception as exc:
        # Best effort: report the failure inline so the other sources still get processed.
        return f"[Error fetching {url}: {exc}]", []

    soup = BeautifulSoup(resp.text, "html.parser")
    img_srcs: List[str] = []
    for img in soup.find_all("img"):
        src = (img.get("src") or "").strip()
        # Skip <img> tags without a usable src: urljoin(url, "") would just
        # return the page URL itself and masquerade as an image link.
        if src:
            img_srcs.append(urljoin(url, src))
    return soup.get_text(" ", strip=True)[:max_chars], img_srcs

def read_file_text(path: Path, max_chars: int) -> str:
    """Read a local file as UTF-8 text, cut to ``max_chars`` characters.

    Bytes that cannot be decoded are dropped. If reading fails, an
    ``"[Error reading …]"`` message is returned instead of raising.

    Args:
        path: File to read.
        max_chars: Maximum number of characters to return.

    Returns:
        The leading ``max_chars`` characters of the file, or the error message.
    """
    try:
        content = path.read_text(encoding="utf-8", errors="ignore")
    except Exception as exc:
        # Report the problem inline rather than aborting the whole run.
        return f"[Error reading {path}: {exc}]"
    else:
        return content[:max_chars]

# ───────────────────────── prompt construction ──────────────────────────────
def build_prompt(
    group1_sites: List[str], group1_files: List[str],
    group2_sites: List[str], group2_files: List[str],
    question: "str | None" = None,
) -> str:
    """Assemble the prompt that presents both source groups and the question.

    Args:
        group1_sites: Ready-made text blurbs for group 1 web pages.
        group1_files: Ready-made text blurbs for group 1 local files.
        group2_sites: Ready-made text blurbs for group 2 web pages.
        group2_files: Ready-made text blurbs for group 2 local files.
        question: Question placed at the end of the prompt. When omitted, the
            module-level ``QUESTION`` is used.

    Returns:
        The prompt text, stripped of leading and trailing whitespace. A group
        with no content is rendered as ``[no content]``.
    """
    if question is None:
        question = QUESTION

    def _section(blurbs: List[str]) -> str:
        # Separate blurbs with a blank line; otherwise the content of one
        # source runs straight into the header of the next.
        return "\n\n".join(blurbs) or "[no content]"

    g1 = _section(group1_sites + group1_files)
    g2 = _section(group2_sites + group2_files)

    # Built line by line instead of dedenting an interpolated template, so the
    # whitespace inside the source excerpts is passed through untouched.
    return "\n".join([
        "You are given **two source groups** (web pages & local files).",
        "Use only these excerpts when answering.  Whenever you cite a fact,",
        "mention the *exact URL* or *file name* it came from.",
        "",
        "### Group 1",
        g1,
        "",
        "### Group 2",
        g2,
        "",
        "————————————————————————",
        f"**Question:** {question}",
    ]).strip()

# ───────────────────────────────── main ──────────────────────────────────────
def collect_group(urls_file: str  , files_dir: str
) -> Tuple[List[str], List[str]]:
    """Gather the text blurbs for one source group.

    Args:
        urls_file: Path of the URL list for the group.
        files_dir: Folder holding the group's local files.

    Returns:
        ``(site_blurbs, file_blurbs)``: one ``URL: …`` blurb per listed URL and
        one ``FILE: …`` blurb per local file, each followed by its content.
    """
    # Web pages: only the page text is used; the image URLs returned alongside
    # it are discarded.
    site_blurbs = [
        f"URL: {url}\nCONTENT:\n{fetch_url_text(url, MAX_CHARS_PER_SITE)[0]}"
        for url in load_urls(urls_file)
    ]

    # Local files, labelled by bare file name.
    file_blurbs = [
        f"FILE: {fp.name}\nCONTENT:\n{read_file_text(fp, MAX_CHARS_PER_FILE)}"
        for fp in load_files(files_dir)
    ]

    return site_blurbs, file_blurbs

def main() -> None:
    """Collect both source groups, build the prompt, and print the model's answer.

    The prompt is printed and also written to ``output.txt`` (overwriting any
    existing file) before the request is sent. Exits with an error message
    when neither group produced any content.
    """
    sites_a, files_a = collect_group(URLS_FILE_1, FILES_DIR_1)
    sites_b, files_b = collect_group(URLS_FILE_2, FILES_DIR_2)

    nothing_collected = not (sites_a or files_a or sites_b or files_b)
    if nothing_collected:
        sys.exit("[error] Nothing to analyse – both groups are empty.")

    prompt = build_prompt(sites_a, files_a, sites_b, files_b)
    print(prompt)
    # Keep a copy of the exact prompt on disk; an existing file is replaced.
    Path("output.txt").write_text(prompt, encoding="utf-8")

    # Only pass a key explicitly when one is configured.
    if API_KEY:
        client = OpenAI(api_key=API_KEY)
    else:
        client = OpenAI()

    conversation = [
        {"role": "system", "content": "You are a meticulous web analyst."},
        {"role": "user", "content": prompt},
    ]
    # No temperature is passed with the request.
    completion = client.chat.completions.create(model=MODEL, messages=conversation)

    print("\n— ChatGPT answer —\n")
    print(completion.choices[0].message.content)

# Run the whole pipeline when this cell (or the file as a script) is executed directly.
if __name__ == "__main__":
    main()


You are given **two source groups** (web pages & local files).
Use only these excerpts when answering.  Whenever you cite a fact,
mention the *exact URL* or *file name* it came from.

### Group 1
URL: https://en.wikipedia.org/wiki/Northern_Ireland
CONTENT:
Northern Ireland - Wikipedia Jump to content Main menu Main menu move to sidebar hide Navigation Main page Contents Current events Random article About Wikipedia Contact us Contribute Help Learn to edit Community portal Recent changes Upload file Special pages Search Search Appearance Donate Create account Log in Personal tools Donate Create account Log in Pages for logged out editors learn more Contributions Talk Contents move to sidebar hide (Top) 1 History Toggle History subsection 1.1 Home Rule Crisis 1.2 Partition of Ireland 1.3 1925–1965 1.4 The Troubles 1.5 Peace process 1.6 Executive crisis 2022–2024 2 Politics Toggle Politics subsection 2.1 Background 2.2 Governance 2.3 Descriptions 2.4 Alternative names 2.4.1 Unionist 2.4.2