In [10]:
import os
import fitz  # PyMuPDF
from pathlib import Path

# Base path to your dataset
BASE_PATH = Path.home() / "Downloads" / "F1-Regulations-Project"

# Gather all PDF files recursively
pdf_files = list(BASE_PATH.rglob("*.pdf"))

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

# Extract and store all documents as a dictionary
documents = {}
for pdf in pdf_files:
    try:
        documents[pdf.name] = extract_text_from_pdf(pdf)
    except Exception as e:
        print(f"Error reading {pdf.name}: {e}")

print(f"Extracted {len(documents)} documents.")


Extracted 142 documents.


In [11]:
import re

# Function to split a regulatory document into articles/sections
def segment_document(text):
    pattern = r"(Article\s+\d+(\.\d+)*|Appendix\s+[A-Z]+|Section\s+\d+(\.\d+)*)"
    matches = list(re.finditer(pattern, text, re.IGNORECASE))
    
    segments = []
    for i in range(len(matches)):
        start = matches[i].start()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        section_title = matches[i].group().strip()
        content = text[start:end].strip()
        segments.append((section_title, content))
    return segments

# Segment one example document
sample_doc_key = list(documents.keys())[0]
segmented = segment_document(documents[sample_doc_key])

for section, content in segmented[:5]:  # Show first 5
    print(f"--- {section} ---\n{content[:300]}...\n")


In [36]:
import re

def segment_document(text):
    """
    Segments FIA regulation text into structured sections based on headers like:
    'Article 3.1', '3.1.1', 'Appendix A', etc.
    
    Parameters:
        text (str): Raw text from a regulation PDF.

    Returns:
        List of tuples: (section_title, content)
    """
    # Pattern captures:
    # - Article 3.1
    # - Section 2.1.4
    # - Appendix A
    # - Pure numeric sections like 3.1.1, 4.3, etc.
    pattern = r"(?i)(Article\s+\d+(\.\d+)*|Section\s+\d+(\.\d+)*|Appendix\s+[A-Z]|\n\d+(\.\d+)+)"
    
    matches = list(re.finditer(pattern, text))
    segments = []

    for i in range(len(matches)):
        start = matches[i].start()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        section_title = matches[i].group().strip().replace("\n", " ")
        content = text[start:end].strip()
        segments.append((section_title, content))

    return segments

# Try to find the first document that has at least 2 valid segments
segmented = []
for doc_name, text in documents.items():
    segments = segment_document(text)
    if len(segments) >= 2:
        segmented = segments
        print(f"✅ Using: {doc_name} ({len(segments)} sections found)")
        break

# Display first few detected section titles
if segmented:
    print("\nSample section headers:")
    for s in segmented[:5]:
        print(f"- {s[0]}")
else:
    print("❌ No suitable document with valid segments found.")


✅ Using: l22_approved_test_houses_harness_8854-1998_0.pdf (5 sections found)

Sample section headers:
- 02.12.2024
- 12.19
- 09.23
- 10.27
- 01.26


In [37]:
print([seg[0] for seg in segmented])


['02.12.2024', '12.19', '09.23', '10.27', '01.26']


In [42]:
from difflib import SequenceMatcher

def compare_rule_versions(documents, article_id):
    """
    Compare the same article across multiple documents and print changes.
    
    Parameters:
        documents (dict): {filename: raw_text}
        article_id (str): e.g., "Article 3.15"
    """
    found_versions = []

    for name, text in sorted(documents.items()):
        segments = segment_document(text)
        for title, content in segments:
            if article_id.lower() in title.lower():
                found_versions.append((name, content))
                break

    if len(found_versions) < 2:
        print(f"Not enough data to compare {article_id}")
        return

    print(f"🔍 Changes in {article_id}:\n")
    for i in range(len(found_versions) - 1):
        name1, text1 = found_versions[i]
        name2, text2 = found_versions[i + 1]
        ratio = SequenceMatcher(None, text1, text2).ratio()
        print(f"\n{article_id} Diff: {name1} → {name2} | Similarity: {round(ratio * 100, 2)}%")
        print("––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––")
        for line in SequenceMatcher(None, text1.splitlines(), text2.splitlines()).get_opcodes():
            if line[0] != 'equal':
                print(f"{line[0].upper()}: {text1.splitlines()[line[1]] if line[1] < len(text1.splitlines()) else ''}")
                print(f"{line[0].upper()}: {text2.splitlines()[line[3]] if line[3] < len(text2.splitlines()) else ''}")


In [43]:
compare_rule_versions(documents, "Article 3.15")


🔍 Changes in Article 3.15:


Article 3.15 Diff: F1_Technical_Regulations - [2022].pdf → F1_Technical_Regulations - [2023].pdf | Similarity: 2.96%
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
REPLACE: Article 3.15.6 whilst on the circuit 
REPLACE: Article 3.15.7.c. Such features must be the minimal required for applying the loads 

Article 3.15 Diff: F1_Technical_Regulations - [2023].pdf → F1_Technical_Regulations - [2024].pdf | Similarity: 100.0%
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––

Article 3.15 Diff: F1_Technical_Regulations - [2024].pdf → F1_Technical_Regulations - [2025].pdf | Similarity: 100.0%
––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––


In [45]:
# Keywords that signal ambiguity or loophole risk
LOOPHOLE_TRIGGERS = [
    r"\b(minimal|required|adequate|sufficient|expected|if applicable|if required|where possible|as necessary)\b",
    r"\b(flex|move|deform|displace|flow)\b",
    r"\b(must be able to withstand|should not)\b"
]

def detect_loophole_candidates(segments):
    """
    Scan regulation segments for potentially vague or risky language.

    Parameters:
        segments (list): List of (section_title, content)

    Returns:
        List of flagged sections with reasons
    """
    flagged = []

    for section, content in segments:
        found_terms = []
        for pattern in LOOPHOLE_TRIGGERS:
            if re.search(pattern, content, re.IGNORECASE):
                found_terms.append(re.findall(pattern, content, re.IGNORECASE))
        if found_terms:
            flagged.append({
                "Section": section,
                "Content": content[:500],  # Preview
                "Triggers": sum(found_terms, [])
            })

    return flagged


In [47]:
# Filter only 2025 technical document
doc_2025_key = [k for k in documents if "2025" in k and "Technical" in k][0]
segments_2025 = segment_document(documents[doc_2025_key])

# Detect potential loophole articles
loophole_flags = detect_loophole_candidates(segments_2025)

# Show results
for flag in loophole_flags[:5]:  # show top 5
    print(f"⚠️ Section: {flag['Section']}")
    print(f"Trigger Words: {set(flag['Triggers'])}")
    print(f"Excerpt:\n{flag['Content']}...\n")


⚠️ Section: 1.5
Trigger Words: {'required'}
Excerpt:
1.5 
New systems or technologies 
Any new system, procedure or technology not specifically covered by these regulations, but 
which is deemed permissible by the FIA Formula One Technical Department, will only be 
admitted until the end of the Championship during which it is introduced. Following this the 
Formula One Commission will be asked to review the technology concerned and, if they feel 
it adds no value to Formula One in general, it may be specifically prohibited by the FIA. 
Any team wh...

⚠️ Section: 3.1.3
Trigger Words: {'flow'}
Excerpt:
3.1.3 
External air stream 
The flow of air around the car which has a primary impact on its aerodynamic performance....

⚠️ Section: 3.1.10
Trigger Words: {'flow'}
Excerpt:
3.1.10 Aerodynamic seal 
The function by which the flow between two regions of different pressure is kept to the 
minimum feasible magnitude....

⚠️ Section: Article 3
Trigger Words: {'required'}
Excerpt:
Article 3 is

In [48]:
import pandas as pd

# Convert flagged loophole candidates to a DataFrame
df_loopholes = pd.DataFrame(loophole_flags)

# Save to CSV
df_loopholes.to_csv("f1_2025_loophole_candidates.csv", index=False)
print("✅ Exported flagged rules to f1_2025_loophole_candidates.csv")


✅ Exported flagged rules to f1_2025_loophole_candidates.csv


In [7]:
import fitz  # PyMuPDF
import re
from pathlib import Path
from random import choice

# 1. Load and parse all PDFs
BASE_PATH = Path.home() / "Downloads" / "F1-Regulations-Project"
pdf_files = list(BASE_PATH.rglob("*.pdf"))

def extract_text_from_pdf(pdf_path):
    try:
        if pdf_path.stat().st_size == 0:
            return None
        doc = fitz.open(pdf_path)
        return "".join([page.get_text() for page in doc])
    except Exception as e:
        print(f"Error reading {pdf_path.name}: {e}")
        return None

documents = {}
for pdf in pdf_files:
    text = extract_text_from_pdf(pdf)
    if text:
        documents[pdf.name] = text

# 2. Segmenter (improved)
def segment_document(text):
    pattern = r"(?i)(Article\s+\d+(\.\d+)*|Section\s+\d+(\.\d+)*|Appendix\s+[A-Z]|\n\d+(\.\d+)+)"
    matches = list(re.finditer(pattern, text))
    segments = []
    for i in range(len(matches)):
        start = matches[i].start()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        section_title = matches[i].group().strip().replace("\n", " ")
        content = text[start:end].strip()
        segments.append((section_title, content))
    return segments

# 3. Trigger-based loophole detector
LOOPHOLE_TRIGGERS = [
    r"\b(minimal|required|adequate|sufficient|expected|if applicable|if required|where possible|as necessary)\b",
    r"\b(flex|move|deform|displace|flow)\b",
    r"\b(must be able to withstand|should not)\b"
]

def detect_loophole_candidates(segments):
    flagged = []
    for section, content in segments:
        found_terms = []
        for pattern in LOOPHOLE_TRIGGERS:
            if re.search(pattern, content, re.IGNORECASE):
                found_terms.append(re.findall(pattern, content, re.IGNORECASE))
        if found_terms:
            flagged.append({
                "Section": section,
                "Content": content[:1000],
                "Triggers": sum(found_terms, [])
            })
    return flagged

# 4. Simulate AI loophole analysis
def simulate_ai_loophole_analysis(rule_text):
    sample_exploit = choice([
        "Design adaptive aero parts that deform only above 250 km/h.",
        "Use vibration damping to mimic non-flexibility in floor components.",
        "Exploit temperature-sensitive coatings that soften under load testing.",
        "Redirect airflow through brake ducts under loosely defined dimensions.",
        "Reduce cooling aperture under 'minimal required' wording."
    ])
    rating = choice(["High Potential", "Medium", "Low", "Impractical"])
    return {
        "Suggested Loophole": sample_exploit,
        "Usefulness Rating": rating
    }

# 5. Run entire loop for 2025 technical regulations
doc_2025_key = [k for k in documents if "2025" in k and "Technical" in k][0]
segments_2025 = segment_document(documents[doc_2025_key])
loophole_flags = detect_loophole_candidates(segments_2025)

ai_insights = []
for flag in loophole_flags:
    insight = simulate_ai_loophole_analysis(flag["Content"])
    ai_insights.append({
        "Section": flag["Section"],
        "Trigger Words": list(set(flag["Triggers"])),
        "Excerpt": flag["Content"][:300],
        "Suggested Loophole": insight["Suggested Loophole"],
        "Usefulness": insight["Usefulness Rating"]
    })

print(f"✅ Rebuilt everything: {len(ai_insights)} loophole forecasts ready.")


✅ Rebuilt everything: 124 loophole forecasts ready.


In [8]:
import ipywidgets as widgets
from IPython.display import display, Markdown

# Dropdown to select sections
section_options = [f"{item['Section']} - ({item['Usefulness']})" for item in ai_insights]
dropdown = widgets.Dropdown(
    options=section_options,
    description='Section:',
    layout=widgets.Layout(width='95%')
)

# Display function
def show_loophole_details(section_label):
    idx = section_options.index(section_label)
    data = ai_insights[idx]
    display(Markdown(f"### 🔍 Section: {data['Section']}"))
    display(Markdown(f"**Trigger Words**: {', '.join(data['Trigger Words'])}"))
    display(Markdown(f"**Excerpt**:\n```\n{data['Excerpt']}\n```"))
    display(Markdown(f"**💡 Suggested Loophole**: _{data['Suggested Loophole']}_"))
    display(Markdown(f"**📊 Usefulness**: **{data['Usefulness']}**"))

# Hook dropdown to function
widgets.interact(show_loophole_details, section_label=dropdown)


interactive(children=(Dropdown(description='Section:', layout=Layout(width='95%'), options=('1.5 - (Medium)', …

<function __main__.show_loophole_details(section_label)>

In [9]:
import pandas as pd

# Save raw 2025 loophole triggers
pd.DataFrame(loophole_flags).to_csv("f1_2025_loophole_candidates.csv", index=False)

# Save AI-inferred forecasts
pd.DataFrame(ai_insights).to_csv("f1_2026_loophole_forecast.csv", index=False)

print("✅ Exported:")
print(" - f1_2025_loophole_candidates.csv")
print(" - f1_2026_loophole_forecast.csv")


✅ Exported:
 - f1_2025_loophole_candidates.csv
 - f1_2026_loophole_forecast.csv


In [17]:
from langchain_ollama import OllamaLLM
from langchain.prompts import PromptTemplate

# Load your local gemma3 model from Ollama
llm = OllamaLLM(model="gemma3:latest")

# Define a structured prompt
prompt = PromptTemplate.from_template("""
You're an F1 car engineer and regulation strategist.

Given the regulation text below, do the following:
1. Identify a realistic technical loophole that a team might exploit.
2. Rate its potential: [High / Medium / Low / Impractical].
3. Predict how the FIA may respond in the 2026 regulation update.

Regulation:
\"\"\"{rule_text}\"\"\"
""")

# Build runnable chain
chain = prompt | llm

# Test one regulation
sample_rule = loophole_flags[0]["Content"]
response = chain.invoke({"rule_text": sample_rule})
print(response)


Okay, let’s dive into this. As an F1 engineer and regulation strategist, this regulation – 1.5 – is both incredibly interesting and, frankly, a recipe for chaos. It's a deliberate, almost *designed* loophole.

**1. Realistic Technical Loophole & Exploitation:**

The most realistic loophole lies within the interpretation of "adds no value to Formula One in general." Let's focus on **advanced thermal management systems** specifically relating to the engine and battery pack. 

Here's how a team like Red Bull or Ferrari could exploit it:

* **The Tech:** They’d develop a radically new, integrated thermal management system. This wouldn’t be a simple addition like a bigger heat exchanger. It would be a *system-level* approach.  This could involve:
    * **Microchannel Cooling Matrix:**  Instead of relying solely on traditional heat exchangers, they’d develop a complex network of microchannels directly integrated into the engine block and battery pack casing. These channels would be filled wi

In [34]:
import os
import json
import time
import re
import random
import pandas as pd
from datetime import datetime
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_ollama import OllamaLLM
from langchain.prompts import PromptTemplate

# === Config ===
CACHE_FILE = "tavily_cache.json"
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, "r") as f:
        search_cache = json.load(f)
else:
    search_cache = {}

tavily = TavilySearchResults()
rating_priority = {"High": 1, "Medium": 2, "Low": 3, "Impractical": 4, "Unrated": 5}

# === Prompt ===
template = """
You are a Formula 1 regulation strategist.

Given this FIA rule excerpt:

\"\"\"{context}\"\"\"

In max 5 bullet points, provide:
- A realistic technical loophole
- Usefulness rating: High / Medium / Low / Impractical
- What FIA might do in 2026 to address it
"""
prompt = PromptTemplate.from_template(template)

# === Retry Logic ===
def retry(func, max_attempts=2, wait_seconds=2, label=""):
    for attempt in range(max_attempts):
        try:
            return func()
        except Exception as e:
            print(f"⚠️ [{label}] Attempt {attempt + 1} failed: {e}")
            if attempt < max_attempts - 1:
                time.sleep(wait_seconds + random.random() * 2)
            else:
                raise e

# === Usefulness Extraction ===
def extract_rating(text):
    match = re.search(r"(High|Medium|Low|Impractical)", text, re.IGNORECASE)
    return match.group(1).capitalize() if match else "Unrated"

# === Main Pipeline ===
def auto_analyze_articles_dual_llm(article_list, output_csv="full_loop_forecast.csv", use_cache=True):
    results = []

    # Stage 1: Gemma3 for fast loop
    gemma = OllamaLLM(model="gemma3:latest")
    fast_chain = prompt | gemma

    for article in article_list:
        print(f"\n🔍 [{datetime.now().strftime('%H:%M:%S')}] Scanning {article} (Gemma)...")

        try:
            # Tavily web search with retry and caching
            if use_cache and article in search_cache:
                search = search_cache[article]
            else:
                search = retry(
                    lambda: tavily.invoke({"query": f"FIA Formula 1 2025 {article} site:fia.com"}),
                    max_attempts=2,
                    wait_seconds=2,
                    label=f"Tavily: {article}"
                )
                search_cache[article] = search
                with open(CACHE_FILE, "w") as f:
                    json.dump(search_cache, f, indent=2)

            if not search:
                results.append({
                    "Article": article,
                    "URL": None,
                    "Gemma Forecast": "No search results",
                    "Rating": "Unrated",
                    "Rating Score": 5,
                    "Deep Analysis (Top LLM)": ""
                })
                continue

            top = search[0]
            context = top.get("content", "")
            if len(context.strip()) < 100:
                results.append({
                    "Article": article,
                    "URL": top.get("url", ""),
                    "Gemma Forecast": "Too little context",
                    "Rating": "Unrated",
                    "Rating Score": 5,
                    "Deep Analysis (Top LLM)": ""
                })
                continue

            # Gemma3 forecast with retry
            gemma_output = retry(
                lambda: fast_chain.invoke({"context": context}),
                max_attempts=2,
                wait_seconds=1,
                label=f"Gemma: {article}"
            )

            rating = extract_rating(gemma_output)

            results.append({
                "Article": article,
                "URL": top["url"],
                "Gemma Forecast": gemma_output,
                "Rating": rating,
                "Rating Score": rating_priority.get(rating, 5),
                "Deep Analysis (Top LLM)": "",
                "Raw Context": context
            })

        except Exception as e:
            results.append({
                "Article": article,
                "URL": None,
                "Gemma Forecast": f"Error: {str(e)}",
                "Rating": "Unrated",
                "Rating Score": 5,
                "Deep Analysis (Top LLM)": ""
            })

    df = pd.DataFrame(results)
    df = df.sort_values(by="Rating Score")

    # Stage 2: DeepSeek only on high-value loopholes
    deepseek = OllamaLLM(model="deepseek-r1:7b")
    deep_chain = prompt | deepseek

    print("\n🧠 Running DeepSeek on top-rated loopholes...")

    for idx, row in df[df["Rating"] == "High"].iterrows():
        try:
            deep_output = retry(
                lambda: deep_chain.invoke({"context": row["Raw Context"]}),
                max_attempts=2,
                wait_seconds=2,
                label=f"DeepSeek: {row['Article']}"
            )
            df.at[idx, "Deep Analysis (Top LLM)"] = deep_output
        except Exception as e:
            df.at[idx, "Deep Analysis (Top LLM)"] = f"Error: {e}"

    df.drop(columns=["Raw Context"], inplace=True)
    df.to_csv(output_csv, index=False)
    print(f"\n✅ Exported ranked results to {output_csv}")


In [35]:
articles = [f"Article {i}.{j}" for i in range(1, 4) for j in range(1, 5)]
auto_analyze_articles_dual_llm(articles, output_csv="ranked_with_retry.csv")



🔍 [03:08:12] Scanning Article 1.1 (Gemma)...

🔍 [03:09:09] Scanning Article 1.2 (Gemma)...

🔍 [03:09:46] Scanning Article 1.3 (Gemma)...

🔍 [03:10:29] Scanning Article 1.4 (Gemma)...

🔍 [03:11:14] Scanning Article 2.1 (Gemma)...

🔍 [03:11:49] Scanning Article 2.2 (Gemma)...

🔍 [03:12:25] Scanning Article 2.3 (Gemma)...

🔍 [03:13:06] Scanning Article 2.4 (Gemma)...

🔍 [03:13:43] Scanning Article 3.1 (Gemma)...

🔍 [03:14:15] Scanning Article 3.2 (Gemma)...

🔍 [03:15:03] Scanning Article 3.3 (Gemma)...

🔍 [03:15:41] Scanning Article 3.4 (Gemma)...

🧠 Running DeepSeek on top-rated loopholes...

✅ Exported ranked results to ranked_with_retry.csv


In [37]:
import os
import json
import time
import re
import random
import pandas as pd
from datetime import datetime
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_ollama import OllamaLLM
from langchain.prompts import PromptTemplate

# ========== CONFIG ==========
CACHE_FILE = "tavily_cache.json"
CHECKPOINT_FILE = "checkpoint_loopholes.json"
tavily = TavilySearchResults()
rating_priority = {"High": 1, "Medium": 2, "Low": 3, "Impractical": 4, "Unrated": 5}

# ========== HISTORIC LOOPHOLES ==========
historic_loopholes = [
    {"name": "Dual-Axis Steering (DAS)", "year": 2020, "team": "Mercedes", "keywords": ["DAS", "toe", "steering"], "risk": "Medium", "budget": "top"},
    {"name": "Double Diffuser", "year": 2009, "team": "Brawn", "keywords": ["double diffuser"], "risk": "Low", "budget": "mid"},
    {"name": "Mass Damper", "year": 2005, "team": "Renault", "keywords": ["mass damper"], "risk": "Medium", "budget": "mid"},
    {"name": "F-duct", "year": 2010, "team": "McLaren", "keywords": ["f-duct", "stall rear wing"], "risk": "Low", "budget": "top"},
    {"name": "Blown Diffuser", "year": 2011, "team": "Red Bull", "keywords": ["blown diffuser", "exhaust"], "risk": "High", "budget": "top"},
    {"name": "Flexible Front Wing", "year": 2021, "team": "Red Bull", "keywords": ["flex wing"], "risk": "Medium", "budget": "top"},
    {"name": "Ground-effect Skirts", "year": 1978, "team": "Lotus", "keywords": ["ground effect", "skirt"], "risk": "High", "budget": "mid"},
    {"name": "Floor Flex TD039", "year": 2022, "team": "Multi", "keywords": ["floor flex", "TD039"], "risk": "Medium", "budget": "top"},
]

def match_historic(loophole_text):
    txt = loophole_text.lower()
    for item in historic_loopholes:
        if any(k in txt for k in item["keywords"]):
            return item
    return None

# ========== PROMPT BUILDER ==========
def build_prompt(context, risk_mode="safe", team_profile="top"):
    risk_note = "Avoid grey-area or easily banned ideas." if risk_mode == "safe" else \
                "You may propose aggressive grey-area ideas if they plausibly survive scrutiny."
    budget_note = {
        "top":   "Assume big budget / fast development.",
        "mid":   "Assume moderate budget – aero updates OK, but no bespoke power-unit magic.",
        "low":   "Assume limited budget – prefer simple mechanical or operational tricks."
    }[team_profile]

    return f"""
You are a Formula 1 regulation strategist for a {team_profile}-budget team.  
{budget_note}

{risk_note}

Given this FIA rule excerpt:

'''{context}'''

Provide **one loophole idea** in ≤5 bullets that:
- Improves race pace or strategy
- Realistically deploys during a season
- Matches the budget & risk notes above

Include:
• Loophole concept  
• Performance gain  
• Usefulness rating (High / Medium / Low / Impractical)  
• Detection/regulation difficulty  
• Likely FIA response in 2026
"""

# ========== UTILITY FUNCTIONS ==========
def retry(func, max_attempts=2, wait_seconds=2, label=""):
    for attempt in range(max_attempts):
        try:
            return func()
        except Exception as e:
            print(f"⚠️ [{label}] Attempt {attempt + 1} failed: {e}")
            if attempt < max_attempts - 1:
                time.sleep(wait_seconds + random.random() * 2)
            else:
                raise e

def extract_rating(text):
    match = re.search(r"(High|Medium|Low|Impractical)", text, re.IGNORECASE)
    return match.group(1).capitalize() if match else "Unrated"

def score_row(row, team_profile, risk_mode):
    base = rating_priority.get(row["Rating"], 5)
    hist = match_historic(row["Gemma Forecast"])
    risk_penalty = 1 if hist and hist["risk"] == "High" and risk_mode == "safe" else 0
    budget_penalty = 1 if hist and hist["budget"] != team_profile else 0
    return base + risk_penalty + budget_penalty

# ========== MAIN FUNCTION ==========
def auto_analyze_articles_dual_llm(article_list, output_csv, risk_mode="safe", team_profile="top", use_cache=True):
    if os.path.exists(CACHE_FILE):
        with open(CACHE_FILE, "r") as f:
            search_cache = json.load(f)
    else:
        search_cache = {}

    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE, "r") as f:
            completed = set(json.load(f))
    else:
        completed = set()

    results = []
    gemma = OllamaLLM(model="gemma3:latest")
    deepseek = OllamaLLM(model="deepseek-r1:7b")

    for article in article_list:
        if article in completed:
            print(f"⏭ Skipping {article} (already completed)")
            continue

        print(f"\n🔍 [{datetime.now().strftime('%H:%M:%S')}] Scanning {article}...")

        try:
            if use_cache and article in search_cache:
                search = search_cache[article]
            else:
                search = retry(lambda: tavily.invoke({"query": f"FIA Formula 1 2025 {article} site:fia.com"}), label=f"Tavily: {article}")
                search_cache[article] = search
                with open(CACHE_FILE, "w") as f:
                    json.dump(search_cache, f, indent=2)

            if not search:
                results.append({"Article": article, "URL": None, "Gemma Forecast": "No search results", "Rating": "Unrated", "Rating Score": 5, "Deep Analysis (Top LLM)": ""})
                continue

            top = search[0]
            context = top.get("content", "")
            if len(context.strip()) < 100:
                results.append({"Article": article, "URL": top.get("url", ""), "Gemma Forecast": "Too little context", "Rating": "Unrated", "Rating Score": 5, "Deep Analysis (Top LLM)": ""})
                continue

            fast_prompt = build_prompt(context, risk_mode=risk_mode, team_profile=team_profile)
            gemma_output = retry(lambda: gemma.invoke(fast_prompt), label=f"Gemma: {article}")
            rating = extract_rating(gemma_output)

            results.append({
                "Article": article,
                "URL": top.get("url", ""),
                "Gemma Forecast": gemma_output,
                "Rating": rating,
                "Rating Score": rating_priority.get(rating, 5),
                "Deep Analysis (Top LLM)": "",
                "Raw Context": context
            })

            completed.add(article)
            with open(CHECKPOINT_FILE, "w") as f:
                json.dump(list(completed), f)

        except Exception as e:
            results.append({"Article": article, "URL": None, "Gemma Forecast": f"Error: {str(e)}", "Rating": "Unrated", "Rating Score": 5, "Deep Analysis (Top LLM)": ""})

    df = pd.DataFrame(results)

    print("\n🧠 Running DeepSeek on top-rated loopholes...")
    for idx, row in df[df["Rating"] == "High"].iterrows():
        try:
            deep_prompt = build_prompt(row["Raw Context"], risk_mode=risk_mode, team_profile=team_profile)
            deep_output = retry(lambda: deepseek.invoke(deep_prompt), label=f"DeepSeek: {row['Article']}")
            df.at[idx, "Deep Analysis (Top LLM)"] = deep_output
        except Exception as e:
            df.at[idx, "Deep Analysis (Top LLM)"] = f"Error: {e}"

    df.drop(columns=["Raw Context"], inplace=True)
    df["Total Score"] = df.apply(lambda r: score_row(r, team_profile, risk_mode), axis=1)
    df = df.sort_values("Total Score")
    df.to_csv(output_csv, index=False)
    print(f"\n✅ Exported ranked results to: {output_csv}")

    top_summary = df[df["Rating"] == "High"][["Article", "Gemma Forecast", "Deep Analysis (Top LLM)"]].head(5)
    print("\n🏁 Top 5 High-Rated Loopholes:")
    print(top_summary.to_markdown(index=False))
    return df

# ========== RUN EXAMPLE ==========
if __name__ == "__main__":
    risk_mode = "safe"         # or "risky"
    team_profile = "top"       # "top", "mid", or "low"
    full_fia_articles = [f"Article {i}.{j}" for i in range(1, 16) for j in range(1, 10)]
    output_path = f"fia_loophole_ranking_{risk_mode}_{team_profile}.csv"
    auto_analyze_articles_dual_llm(full_fia_articles, output_path, risk_mode, team_profile)



🔍 [05:09:53] Scanning Article 1.1...

🔍 [05:10:49] Scanning Article 1.2...

🔍 [05:11:27] Scanning Article 1.3...

🔍 [05:12:01] Scanning Article 1.4...

🔍 [05:12:35] Scanning Article 1.5...

🔍 [05:13:22] Scanning Article 1.6...

🔍 [05:14:05] Scanning Article 1.7...

🔍 [05:14:43] Scanning Article 1.8...

🔍 [05:15:24] Scanning Article 1.9...

🔍 [05:16:08] Scanning Article 2.1...

🔍 [05:16:38] Scanning Article 2.2...

🔍 [05:17:15] Scanning Article 2.3...

🔍 [05:17:46] Scanning Article 2.4...

🔍 [05:18:29] Scanning Article 2.5...

🔍 [05:19:10] Scanning Article 2.6...

🔍 [05:19:55] Scanning Article 2.7...

🔍 [05:20:35] Scanning Article 2.8...

🔍 [05:21:15] Scanning Article 2.9...

🔍 [05:22:00] Scanning Article 3.1...

🔍 [05:22:37] Scanning Article 3.2...

🔍 [05:23:13] Scanning Article 3.3...

🔍 [05:23:50] Scanning Article 3.4...

🔍 [05:24:31] Scanning Article 3.5...

🔍 [05:25:13] Scanning Article 3.6...

🔍 [05:25:54] Scanning Article 3.7...

🔍 [05:26:28] Scanning Article 3.8...

🔍 [05:27:09