In [2]:
# ──────────────────────────────────────────────────────────────────────────────
# 1. OCR the page ──────────────────────────────────────────────────────────────
# ──────────────────────────────────────────────────────────────────────────────
!pip install -q easyocr            # silent install

from PIL import Image
import easyocr, re, numpy as np
from sklearn.cluster import KMeans

image_path = "/content/Screenshot 2025-05-03 at 15.04.21.png"
Image.open(image_path)             # just so the notebook shows the image

reader  = easyocr.Reader(['en'])
result  = reader.readtext(image_path)   # [(bbox, text, conf), …]

for bbox, text, conf in result:
    print(f"{text} (Confidence: {conf:.2f})")



Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |--------------------------------------------------| 0.0% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.3% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.7% Complet

In [3]:

# ──────────────────────────────────────────────────────────────────────────────
# 2. Pick a SINGLE‑YEAR column (here: 2015) and grab ONLY its numbers
# ──────────────────────────────────────────────────────────────────────────────
YEAR_TARGET = "2015"               # ← change this to "2023", "2024", …

# 2‑A. collect all numeric boxes (index, bbox, text) + their x‑centres
numeric_boxes, x_centres = [], []
for idx, (bbox, text, conf) in enumerate(result):
    token = text.strip()
    # The character-class match alone would also accept fragments such as "-" or
    # "." that contain no digit at all, so additionally require one digit.
    if conf >= 0.5 and re.fullmatch(r"[\d,.\-]+", token) and re.search(r"\d", token):
        numeric_boxes.append((idx, bbox, token))
        xs = [p[0] for p in bbox]          # bbox = 4 corner points
        x_centres.append([sum(xs) / 4])

if not numeric_boxes:
    # Fail with a clear message instead of an IndexError further down.
    raise ValueError("No numeric OCR boxes with confidence >= 0.5 were found")

# 2‑B. cluster those x‑centres (=> columns)
# Never ask for more clusters than there are distinct x positions.
n_distinct = len({c[0] for c in x_centres})
k = min(8, n_distinct)                     # assume ≤8 numeric columns
if k >= 2:
    kmeans   = KMeans(n_clusters=k, n_init="auto", random_state=0).fit(x_centres)
    clusters = kmeans.labels_
    centres  = kmeans.cluster_centers_.flatten()
else:                                       # only one column → trivial
    clusters = np.zeros(len(x_centres), dtype=int)
    centres  = np.array([x_centres[0][0]])

# 2‑C. find which cluster aligns with the YEAR_TARGET header
# re.escape keeps YEAR_TARGET literal even if it contains regex metacharacters.
header_pattern = re.compile(rf"\b{re.escape(YEAR_TARGET)}\b")
header_x, header_idx = None, None
for candidate_idx, (bbox, text, _) in enumerate(result):
    if header_pattern.search(text):
        header_x   = sum(p[0] for p in bbox) / 4
        header_idx = candidate_idx
        break
if header_x is None:
    raise ValueError(f"Could not locate a header containing: {YEAR_TARGET}")

# The column whose cluster centre is horizontally closest to the header.
year_cluster = int(np.argmin(np.abs(centres - header_x)))

# (ocr_index, numeric_text) ONLY in that cluster. The header box itself is numeric
# (e.g. "2015") and sits in the same column, so it is excluded explicitly —
# otherwise the year could be returned as if it were a data value.
numbers = [
    (idx, txt)
    for (idx, _, txt), c in zip(numeric_boxes, clusters)
    if c == year_cluster and idx != header_idx
]

# ──────────────────────────────────────────────────────────────────────────────
# 3. Same label/number pairing logic as before  (only numbers list changed)
# ──────────────────────────────────────────────────────────────────────────────
# Candidate label fragments: (ocr_index, text) for every OCR entry whose
# confidence is at least 0.7.
labels = [
    (position, entry[1])
    for position, entry in enumerate(result)
    if entry[2] >= 0.7
]

def find_number_after(index, max_lookahead=3, candidates=None):
    """Return the first numeric text found shortly after OCR entry `index`.

    Args:
        index: OCR index of the label entry.
        max_lookahead: how many following OCR indices (index + 1 … index +
            max_lookahead) are inspected, nearest first.
        candidates: iterable of ``(ocr_index, numeric_text)`` pairs to search.
            Defaults to the notebook-level ``numbers`` list, which keeps existing
            calls working while allowing the function to be used (and tested)
            without relying on global state.

    Returns:
        The numeric text of the closest following candidate, or ``None`` when no
        candidate lies within the look-ahead window.
    """
    if candidates is None:
        candidates = numbers

    # Index the candidates once; setdefault keeps the first entry per OCR index,
    # matching a front-to-back scan of the list.
    by_index = {}
    for num_idx, num_text in candidates:
        by_index.setdefault(num_idx, num_text)

    for offset in range(1, max_lookahead + 1):   # same OCR line distance heuristic
        position = index + offset
        if position in by_index:
            return by_index[position]
    return None

# Output label → keywords; a label fragment matches when it contains ANY of the
# keywords (case-insensitive substring test).
targets = {
    "Scope 1 (Direct Emissions)": ["DIRECT EMISSIONS", "SCOPE1"],
    "Scope 2 (Location-based)": ["Location based"],
    "Scope 2 (Market-based)": ["Market based"],
    "Scope 3 (Other Indirect Emissions)": ["OTHER INDIRECT EMISSIONS", "SCOPE3"],
    "Scope 3 Cat 1 (Purchased goods and services)": ["Cat.1", "goods and services"],
    "Scope 3 Cat 2 (Capital goods)": ["Cat.2", "Capital"],
    "Scope 3 Cat 3 (Fuel and energy activities)": ["Cat.3", "Fuel"],
    "Scope 3 Cat 4 (Upstream transportation and distribution)": ["Cat.4", "Upstream"],
    "Scope 3 Cat 6 (Business travel)": ["Cat.6", "Business"],
    "Scope 3 Cat 7 (Employee commuting)": ["Cat.7", "Commuting"],
    "Scope 3 Cat 11 (Natural gas retail market)": ["Cat.11", "natural gas"]
}

emissions = {}
for label, keywords in targets.items():
    lowered_keywords = [kw.lower() for kw in keywords]
    for idx, text in labels:
        haystack = text.lower()
        if not any(kw in haystack for kw in lowered_keywords):
            continue
        val = find_number_after(idx)
        if val is not None:
            emissions[label] = val
            # Stop only once a value was actually found. Breaking on the first
            # keyword hit regardless (as before) meant that a heading or caption
            # mentioning the keyword, with no number nearby, hid the real table row.
            break

print(f"\nExtracted {YEAR_TARGET} column:")
for lbl, val in emissions.items():
    print(f"{lbl}: {val}")


Extracted 2015 column:
Scope 3 Cat 3 (Fuel and energy activities): 1,491
Scope 3 Cat 4 (Upstream transportation and distribution): 53,539
Scope 3 Cat 6 (Business travel): 12,857
Scope 3 Cat 7 (Employee commuting): 3,439


In [5]:
!pip install -qU langchain-openai

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/62.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.8/62.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/437.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m430.1/437.6 kB[0m [31m17.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m437.6/437.6 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.2/1.2 MB[0m [31m66.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
# ──────────────────────────────────────────────────────────────────────────────
# 4. Embeddings & semantic search (unchanged)
# ──────────────────────────────────────────────────────────────────────────────
from langchain_openai import OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
import os, getpass

# One "label: value" sentence per extracted emission figure.
embedding_inputs = [f"{lbl}: {val}" for lbl, val in emissions.items()]
if not embedding_inputs:
    # Nothing was extracted: fail early with a clear message instead of sending
    # an empty request and crashing inside cosine_similarity.
    raise ValueError("No emission values were extracted; nothing to embed")

# Never hardcode credentials in the notebook: prompt for the key when it is not
# already present in the environment.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

embeddings     = OpenAIEmbeddings(model="text-embedding-3-large")
value_vectors  = embeddings.embed_documents(embedding_inputs)
# Keep label, embedded text and vector together for ranking/printing.
embedded_vals  = [
    {"label": lbl, "text": text, "vector": vec}
    for lbl, text, vec in zip(emissions, embedding_inputs, value_vectors)
]

query          = "Total emissions from business travel"
query_vector   = embeddings.embed_query(query)

# Cosine similarity of the query against every embedded value, best match first.
sims   = cosine_similarity([query_vector], [d["vector"] for d in embedded_vals])[0]
ranked = sorted(zip(embedded_vals, sims), key=lambda x: x[1], reverse=True)

print("\nTop semantic matches:")
for item, score in ranked[:5]:
    print(f"🔹 {item['label']} | Similarity: {score:.4f} | Text: {item['text']}")


Top semantic matches:
🔹 Scope 3 Cat 6 (Business travel) | Similarity: 0.5937 | Text: Scope 3 Cat 6 (Business travel): 12,857
🔹 Scope 3 Cat 7 (Employee commuting) | Similarity: 0.5173 | Text: Scope 3 Cat 7 (Employee commuting): 3,439
🔹 Scope 3 Cat 4 (Upstream transportation and distribution) | Similarity: 0.4770 | Text: Scope 3 Cat 4 (Upstream transportation and distribution): 53,539
🔹 Scope 3 Cat 3 (Fuel and energy activities) | Similarity: 0.4522 | Text: Scope 3 Cat 3 (Fuel and energy activities): 1,491


## It takes too long to pass the whole PDF through OCR, so ideally we would first find the pages that contain the scopes table and then apply OCR to only that page (or those few candidate pages). However, to make this effective, I think a layout parser of some kind is needed.

In [None]:
import fitz  # Clearly there are too many candidate pages lol

pdf_path = "/content/ghg-inventory-2024.pdf"
# NOTE: `doc` is intentionally left open so it can still be used after this cell;
# call doc.close() once the PDF is no longer needed.
doc = fitz.open(pdf_path)

keywords = ["scope 1", "scope 2", "scope 3", "emissions", "ghg inventory", "market based"]
# Minimum number of DISTINCT keywords a page must contain to count as a candidate.
# 1 reproduces the "any keyword" rule; raise it to narrow the list when too many
# pages match.
MIN_KEYWORD_HITS = 1
candidate_pages = []

for i, page in enumerate(doc):
    text = page.get_text().lower()
    hits = sum(kw in text for kw in keywords)   # count of distinct keywords present
    if hits >= MIN_KEYWORD_HITS:
        candidate_pages.append(i)               # 0-based page index

print("📄 Candidate pages:", candidate_pages)
