# Setup and Dependencies

In [None]:
!pip install pymupdf requests beautifulsoup4 transformers torch selenium webdriver_manager easyocr

Collecting pymupdf
  Downloading pymupdf-1.25.4-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.4-cp39-abi3-win_amd64.whl (16.6 MB)
   ---------------------------------------- 0.0/16.6 MB ? eta -:--:--
   ---------------------------------------  16.5/16.6 MB 86.9 MB/s eta 0:00:01
   ---------------------------------------- 16.6/16.6 MB 74.8 MB/s eta 0:00:00
Installing collected packages: pymupdf
Successfully installed pymupdf-1.25.4


In [None]:
import os
import fitz  # PyMuPDF for PDF reading
import requests
from bs4 import BeautifulSoup
import csv
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from urllib.parse import urlparse

import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

from langchain_ollama import ChatOllama

# Create directories if not present
os.makedirs("chunks", exist_ok=True)
os.makedirs("pdf", exist_ok=True)  # Place your PDF files here

  from .autonotebook import tqdm as notebook_tqdm


# Chunking PDF Documents

In [2]:
chunk_size = 300  # characters per chunk
chunks_data = []  # list to store dictionaries with keys: "chunk_name" and "text"

# Helper function to chunk text and return list of chunk strings.
def chunk_text(text, chunk_size):
    chunks = []
    text = text.replace("\n", " ").strip()
    start = 0
    while start < len(text):
        chunk = text[start:start+chunk_size]
        if start + chunk_size < len(text):
            # try not to cut in the middle of a word
            last_space = chunk.rfind(" ")
            if last_space != -1 and last_space > 0:
                chunk = chunk[:last_space]
                start += last_space
            else:
                start += chunk_size
        else:
            start += chunk_size
        chunk = chunk.strip()
        if chunk:
            chunks.append(chunk)
    return chunks

# --- Process PDFs ---
pdf_files = [f for f in os.listdir("pdf") if f.lower().endswith(".pdf")]
print(f"Found {len(pdf_files)} PDF files.")

for pdf_file in pdf_files:
    pdf_path = os.path.join("pdf", pdf_file)
    doc = fitz.open(pdf_path)
    pdf_basename = os.path.splitext(pdf_file)[0]
    for page_num, page in enumerate(doc, start=1):
        page_text = page.get_text()
        if not page_text.strip():
            continue
        page_chunks = chunk_text(page_text, chunk_size)
        # Save each chunk with a name like: <pdf_basename>_page<page_num>_chunk<chunk_index>.txt
        for chunk_idx, chunk in enumerate(page_chunks, start=1):
            chunk_filename = f"{pdf_basename}_page{page_num}_chunk{chunk_idx}.txt"
            chunks_data.append({"chunk_name": chunk_filename, "text": chunk})
    doc.close()
print(f"Total PDF chunks created: {len(chunks_data)}")


Found 1 PDF files.
Total PDF chunks created: 26


# Scraping and Chunking Website

In [None]:
def create_session_with_retries():
    session = requests.Session()
    retries = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        raise_on_status=False,
    )
    adapter = HTTPAdapter(max_retries=retries)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session

def get_web_name(url):
    """Extracts a sanitized web name from a URL."""
    parsed = urlparse(url)
    netloc = parsed.netloc
    if netloc.startswith("www."):
        netloc = netloc[4:]
    path = parsed.path.strip("/")
    if path:
        name = f"{netloc}_{path}"
    else:
        name = netloc
    # Remove any remaining problematic characters
    return name.replace("/", "_").replace(" ", "_")

def scrape_text(url):
    """Fetches and cleans text content from a URL using a session with retries.
    If the requests-based retrieval fails, falls back to using Selenium to capture a full-page screenshot
    and uses EasyOCR to extract text from the image.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/91.0.4472.124 Safari/537.36"
        ),
        "Accept-Language": "en-US,en;q=0.9"
    }
    
    session = create_session_with_retries()
    
    try:
        resp = session.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
    except Exception as e:
        print(f"Failed to retrieve {url} via requests: {e}")
        # Fallback: Use Selenium to capture a full-page screenshot and perform OCR using EasyOCR
        try:
            from selenium import webdriver
            from selenium.webdriver.chrome.options import Options
            from selenium.webdriver.chrome.service import Service
            from webdriver_manager.chrome import ChromeDriverManager
            
            options = Options()
            options.add_argument("--headless")
            options.add_argument("--disable-gpu")
            options.add_argument("--window-size=1920,1080")
            
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
            driver.get(url)
            # Get total scrollable height of the page
            total_height = driver.execute_script("return document.body.scrollHeight")
            driver.set_window_size(1920, total_height)
            screenshot_path = "screenshot.png"
            driver.save_screenshot(screenshot_path)
            driver.quit()
            print(f"Captured full-page screenshot for {url} as {screenshot_path}.")
            
            try:
                import easyocr
                # Create an EasyOCR reader object; set gpu=False if needed
                reader = easyocr.Reader(['en','th'], gpu=False)
                result = reader.readtext(screenshot_path, detail=0, paragraph=True)
                text = "\n".join(result)
                os.remove(screenshot_path)
                return text
            except Exception as ocr_e:
                print(f"EasyOCR failed: {ocr_e}")
                os.remove(screenshot_path)
                return ""
        except Exception as selenium_e:
            print(f"Selenium failed for {url}: {selenium_e}")
            return ""
    
    soup = BeautifulSoup(resp.text, "html.parser")
    for tag in soup(["script", "style"]):
        tag.extract()
    text = " ".join(soup.get_text(separator=" ").split())
    return text

# List of URLs to scrape
urls = [
    "https://onlinemedia.idea2mobile.com/?p=6570",
    "https://wisesight.com/about-us/",
    "https://wisesight.com/zocialeye",
    "https://wisesight.com/warroom",
    "https://wisesight.com/command-center",
    "https://wisesight.com/research",
    "https://wisesight.com/consulting",
    "https://wisesight.com/monitoring",
    "https://wisesight.com/chatbot-service/",
    "https://wisesight.com/trend-24hours",
    "https://thailand.zocialawards.com/2022/"
]

chunks_data = []  # This list will store your chunk data.
chunk_size = 300  # Define your chunk size.

def chunk_text(text, chunk_size=300):
    chunks = []
    text = text.replace("\n", " ").strip()
    start = 0
    while start < len(text):
        chunk = text[start:start+chunk_size]
        if start + chunk_size < len(text):
            last_space = chunk.rfind(" ")
            if last_space != -1 and last_space > 0:
                chunk = chunk[:last_space]
                start += last_space
            else:
                start += chunk_size
        else:
            start += chunk_size
        chunk = chunk.strip()
        if chunk:
            chunks.append(chunk)
    return chunks

# Process each URL and create chunks
for url in urls:
    page_text = scrape_text(url)
    if not page_text:
        continue
    web_chunks = chunk_text(page_text, chunk_size)
    web_name = get_web_name(url)  # Get a name based on the URL
    for idx, chunk in enumerate(web_chunks, start=1):
        # Save each file as "{web_name}_chunk{idx}.txt"
        chunk_filename = f"{web_name}_chunk{idx}.txt"
        chunks_data.append({"chunk_name": chunk_filename, "text": chunk})
print(f"Total chunks after adding website content: {len(chunks_data)}")

Failed to retrieve https://wisesight.com/about-us/ via requests: HTTPSConnectionPool(host='wisesight.com', port=443): Max retries exceeded with url: /about-us/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))
Captured full-page screenshot for https://wisesight.com/about-us/ as screenshot.png.


Using CPU. Note: This module is much faster with a GPU.


Failed to retrieve https://wisesight.com/zocialeye via requests: HTTPSConnectionPool(host='wisesight.com', port=443): Max retries exceeded with url: /zocialeye (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Using CPU. Note: This module is much faster with a GPU.


Captured full-page screenshot for https://wisesight.com/zocialeye as screenshot.png.
Failed to retrieve https://wisesight.com/warroom via requests: HTTPSConnectionPool(host='wisesight.com', port=443): Max retries exceeded with url: /warroom (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Using CPU. Note: This module is much faster with a GPU.


Captured full-page screenshot for https://wisesight.com/warroom as screenshot.png.
Failed to retrieve https://wisesight.com/command-center via requests: HTTPSConnectionPool(host='wisesight.com', port=443): Max retries exceeded with url: /command-center (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Using CPU. Note: This module is much faster with a GPU.


Captured full-page screenshot for https://wisesight.com/command-center as screenshot.png.
Failed to retrieve https://wisesight.com/research via requests: HTTPSConnectionPool(host='wisesight.com', port=443): Max retries exceeded with url: /research (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Using CPU. Note: This module is much faster with a GPU.


Captured full-page screenshot for https://wisesight.com/research as screenshot.png.
Failed to retrieve https://wisesight.com/consulting via requests: HTTPSConnectionPool(host='wisesight.com', port=443): Max retries exceeded with url: /consulting (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Using CPU. Note: This module is much faster with a GPU.


Captured full-page screenshot for https://wisesight.com/consulting as screenshot.png.
Failed to retrieve https://wisesight.com/monitoring via requests: HTTPSConnectionPool(host='wisesight.com', port=443): Max retries exceeded with url: /monitoring (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Using CPU. Note: This module is much faster with a GPU.


Captured full-page screenshot for https://wisesight.com/monitoring as screenshot.png.
Failed to retrieve https://wisesight.com/chatbot-service/ via requests: HTTPSConnectionPool(host='wisesight.com', port=443): Max retries exceeded with url: /chatbot-service/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Using CPU. Note: This module is much faster with a GPU.


Captured full-page screenshot for https://wisesight.com/chatbot-service/ as screenshot.png.
Failed to retrieve https://wisesight.com/trend-24hours via requests: HTTPSConnectionPool(host='wisesight.com', port=443): Max retries exceeded with url: /trend-24hours (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Using CPU. Note: This module is much faster with a GPU.


Captured full-page screenshot for https://wisesight.com/trend-24hours as screenshot.png.
Failed to retrieve https://thailand.zocialawards.com/2022/ via requests: HTTPSConnectionPool(host='thailand.zocialawards.com', port=443): Max retries exceeded with url: /2022/ (Caused by ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')))


Using CPU. Note: This module is much faster with a GPU.


Captured full-page screenshot for https://thailand.zocialawards.com/2022/ as screenshot.png.
Total chunks after adding website content: 80


# Save Chunks as Text File

In [4]:
for entry in chunks_data:
    file_path = os.path.join("chunks", entry["chunk_name"])
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(entry["text"])

print(f"Saved chunk files")


Saved chunk files


# Embedding and Storing into CSV

In [5]:
model_name = "BAAI/bge-m3"
print(f"Loading embedding model: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Model loaded on device:", device)

# Compute embeddings in batches
embeddings_list = []  # list of lists (each inner list is a 1024-dim embedding)
batch_size = 16
texts = [entry["text"] for entry in chunks_data]
for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i+batch_size]
    inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the CLS token embedding (first token)
    cls_emb = outputs.last_hidden_state[:, 0, :]
    cls_emb = torch.nn.functional.normalize(cls_emb, p=2, dim=1)
    cls_emb = cls_emb.cpu().numpy()
    for emb in cls_emb:
        embeddings_list.append(emb.tolist())
print(f"Computed embeddings for {len(embeddings_list)} chunks (each {len(embeddings_list[0])} dimensions).")


Loading embedding model: BAAI/bge-m3
Model loaded on device: cpu
Computed embeddings for 80 chunks (each 1024 dimensions).


In [None]:
# Save embeddings to a CSV file with a header: first column "chunk_name", then columns for each dimension.
csv_filename = "embeddings.csv"
num_dims = len(embeddings_list[0])
header = ["chunk_name"] + [f"emb_{i}" for i in range(num_dims)]

with open(csv_filename, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)
    for entry, emb in zip(chunks_data, embeddings_list):
        writer.writerow([entry["chunk_name"]] + emb)
print(f"Embeddings saved to {csv_filename}.")


Embeddings saved to embeddings.csv.
