# Setup

In [1]:
import requests
from io import StringIO
import pandas as pd
import numpy as np
from datetime import timedelta
import os
import sys
import logging
from sodapy import Socrata
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import langchain
from langchain.agents import initialize_agent, Tool, AgentType, AgentExecutor
from langchain.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory, ConversationBufferWindowMemory, ConversationSummaryMemory, ConversationSummaryBufferMemory
from langchain.chains import LLMChain
from langchain.prompts import MessagesPlaceholder
from langchain_huggingface import HuggingFaceEndpoint


from tqdm import tqdm
from geopy.distance import geodesic
import json
from datetime import timedelta
import time
from scipy.spatial import cKDTree








In [2]:
# load personal HuggingFace token
with open("kdtok", "r") as f:
    kdtok = f.read().strip()

os.environ["HUGGINGFACEHUB_API_TOKEN"] = kdtok

### SF Excavation Permits Dataset

In [3]:
PERMITS_DIR = "mined_data/permits"
os.makedirs(PERMITS_DIR, exist_ok=True)

def pull_sfgov_utility_permits():
    from sodapy import Socrata
    client = Socrata("data.sfgov.org", None)
    dataset_id = "smdf-6c45"  # Utility Excavation Permits
    results = client.get(dataset_id, limit=100000)
    df = pd.DataFrame.from_records(results)
    df.to_csv(f"{PERMITS_DIR}/sf_utility_permits.csv", index=False)
    print(f"✅ Saved SF permits: {len(df)} rows")
    return df

df_permits = pull_sfgov_utility_permits()



✅ Saved SF permits: 3760 rows


In [4]:
# --- 1. Load Sensor Results ---
sensor_results = pd.read_csv("mined_data/sf_sensor_results_alltime.csv", low_memory=False)
sensor_results["ActivityStartDate"] = pd.to_datetime(sensor_results["ActivityStartDate"])

# --- 2. Load Permits ---
df_permits = df_permits
# Add new columns for latitude and longitude
df_permits["latitude"] = None
df_permits["longitude"] = None

# Function to geocode a street
def geocode_address(streetname):
    if not streetname or pd.isna(streetname):
        return None, None
    
    address = f"{streetname}, San Francisco, CA"
    url = "https://nominatim.openstreetmap.org/search"
    params = {
        "q": address,
        "format": "json",
        "limit": 1
    }
    headers = {
        "User-Agent": "MyGeocoder (youremail@example.com)"  # important: Nominatim requires identifying user-agent
    }
    response = requests.get(url, params=params, headers=headers)
    if response.status_code == 200:
        results = response.json()
        if results:
            return results[0]["lat"], results[0]["lon"]
    return None, None

print(df_permits[["effective_date", "expiration_date"]].head(10))

            effective_date          expiration_date
0  2025-04-24T00:00:00.000  2025-05-22T00:00:00.000
1  2025-04-29T00:00:00.000  2025-05-12T00:00:00.000
2  2025-06-23T00:00:00.000  2025-07-21T00:00:00.000
3  2025-04-15T00:00:00.000  2025-05-13T00:00:00.000
4  2025-06-09T00:00:00.000  2025-07-07T00:00:00.000
5  2025-04-28T00:00:00.000  2025-05-11T00:00:00.000
6  2025-04-14T00:00:00.000  2025-05-12T00:00:00.000
7  2025-04-28T00:00:00.000  2025-05-11T00:00:00.000
8  2025-04-14T00:00:00.000  2025-05-16T00:00:00.000
9  2025-05-02T00:00:00.000  2025-05-30T00:00:00.000


In [5]:
# 1.1 Load Sensor Results
df_sensors = pd.read_csv("mined_data/sf_sensor_results_alltime.csv", low_memory=False)
df_sensors["ActivityStartDate"] = pd.to_datetime(df_sensors["ActivityStartDate"], errors="coerce")

# 1.2 Load Permit Data
df_permits = pd.read_csv("mined_data/permits/sf_utility_permits.csv", low_memory=False)

# Optional parse dates
for col in ["effective_date", "expiration_date"]:
    if col in df_permits.columns:
        df_permits[col] = pd.to_datetime(df_permits[col], errors="coerce")

print(f"✅ Loaded {len(df_sensors)} sensor records and {len(df_permits)} permits.")

✅ Loaded 172588 sensor records and 3760 permits.


In [6]:
# import pandas as pd
# import time
# from tqdm import tqdm

# # Parameters
# save_every = 100
output_path = "mined_data/permits_geocoded.csv"

# # Add missing columns
# df_permits["latitude"] = pd.NA
# df_permits["longitude"] = pd.NA

# # Geocode each permit nicely
# for idx, row in tqdm(df_permits.iterrows(), total=len(df_permits), desc="Geocoding permits"):
#     if pd.isna(row["latitude"]) or pd.isna(row["longitude"]):  # Only geocode missing
#         lat, lon = geocode_address(row["streetname"])
#         df_permits.at[idx, "latitude"] = lat
#         df_permits.at[idx, "longitude"] = lon
#         time.sleep(1)  # Be polite

#         if idx % save_every == 0:
#             df_permits.to_csv(output_path, index=False)
#             tqdm.write(f"💾 Progress saved at {idx} records")

# # Final save
# df_permits.to_csv(output_path, index=False)
# print(f"✅ All permits geocoded and saved to {output_path}")

In [7]:
# Load and clean
df_permits = pd.read_csv(output_path, low_memory=False)
df_permits["effective_date"] = pd.to_datetime(df_permits["effective_date"], errors="coerce")
df_permits["expiration_date"] = pd.to_datetime(df_permits["expiration_date"], errors="coerce")
df_permits["latitude"] = pd.to_numeric(df_permits["latitude"], errors="coerce")
df_permits["longitude"] = pd.to_numeric(df_permits["longitude"], errors="coerce")

print(f"✅ Loaded {len(df_permits)} permits.")

✅ Loaded 3760 permits.


In [8]:
# --- 3. Define Your 4 Sensor Sites (Latitude/Longitude) ---
stations = {
    "09424150": (34.316111, -114.156389),
    "09424170": (34.300000, -114.162500),
    "09427500": (34.316111, -114.156389),
    "09427520": (34.295556, -114.139444),
}

# --- 4. Find Permits Near Each Station (Geospatial Filtering) ---
def find_permits_near_station_fast(stations, permits_df, radius_km=2.0):
    station_ids = list(stations.keys())
    station_coords = np.array(list(stations.values()))
    
    # Only valid permits
    valid_permits = permits_df.dropna(subset=["latitude", "longitude"]).copy()
    permit_coords = valid_permits[["latitude", "longitude"]].astype(float).to_numpy()

    # KDTree for fast radius lookup
    tree = cKDTree(permit_coords)

    EARTH_RADIUS_KM = 6371.0
    radius_in_radians = radius_km / EARTH_RADIUS_KM

    matches = []

    for siteid, site_coord in zip(station_ids, station_coords):
        idxs = tree.query_ball_point(site_coord, r=radius_in_radians)
        
        for idx in idxs:
            permit_row = valid_permits.iloc[idx]
            matches.append({
                "siteid": siteid,
                "latitude": permit_row.get("latitude"),
                "longitude": permit_row.get("longitude"),
                "permit_number": permit_row.get("permit_number"),
                "permit_reason": permit_row.get("permit_reason"),
                "streetname": permit_row.get("streetname"),
                "utility_contractor": permit_row.get("utility_contractor"),
                "status": permit_row.get("status"),
                "effective_date": permit_row.get("effective_date"),    # <- ADD THIS
                "expiration_date": permit_row.get("expiration_date"),  # <- ADD THIS
                "distance_km": None  # optional, you can calculate later
            })

    return pd.DataFrame(matches)
df_permits = pd.read_csv("mined_data/permits_geocoded.csv", low_memory=False)
print(df_permits)

     permit_number            streetname     cross_street_1 cross_street_2  \
0      24EXC-03805             MARKET ST  01ST ST \ BUSH ST        02ND ST   
1      24EXC-03026               05TH ST         MISSION ST            NaN   
2      24EXC-03026            FREMONT ST          HOWARD ST            NaN   
3      24EXC-03026               03RD ST   SAINT FRANCIS PL            NaN   
4      24EXC-03026             HOWARD ST            MAIN ST            NaN   
...            ...                   ...                ...            ...   
3755   24EXC-01436  DIAMOND HEIGHTS BLVD          DUNCAN ST   GOLD MINE DR   
3756   24EXC-04102              26TH AVE         GEARY BLVD            NaN   
3757   24EXC-04102              02ND AVE         GEARY BLVD            NaN   
3758   24EXC-04102            GEARY BLVD           15TH AVE       16TH AVE   
3759   24EXC-03388            PARKER AVE         GEARY BLVD        ANZA ST   

                   utility_contractor      permit_reason  \
0  

In [9]:
import folium

# Station data
stations = {
    "09424150": (34.316111, -114.156389),
    "09424170": (34.300000, -114.162500),
    "09427500": (34.316111, -114.156389),
    "09427520": (34.295556, -114.139444),
}

# Create a base map centered around the average coordinates
avg_lat = sum([coord[0] for coord in stations.values()]) / len(stations)
avg_lon = sum([coord[1] for coord in stations.values()]) / len(stations)

m = folium.Map(
    location=(avg_lat, avg_lon),
    zoom_start=12,
    tiles='https://stamen-tiles.a.ssl.fastly.net/terrain/{z}/{x}/{y}.png',
    attr='Map tiles by Stamen Design, CC BY 3.0 — Map data © OpenStreetMap contributors'
)

# Add each station as a marker
for station_id, (lat, lon) in stations.items():
    folium.Marker(location=(lat, lon), popup=f"Station {station_id}").add_to(m)

# Display the map (in Jupyter, this will show inline)
m.save("ca_stations_map.html")
m

In [10]:
df_permits_near = find_permits_near_station_fast(stations, df_permits, radius_km=2.0)

# Fix datatypes
df_permits_near["effective_date"] = pd.to_datetime(df_permits["effective_date"], errors="coerce")
df_permits_near["expiration_date"] = pd.to_datetime(df_permits["expiration_date"], errors="coerce")

print(df_permits_near.head())

  effective_date expiration_date
0     2024-10-25      2025-06-12
1     2024-09-18      2025-10-10
2     2024-09-18      2025-10-10
3     2024-09-18      2025-10-10
4     2024-09-18      2025-10-10


In [11]:
# --- 5. Match Permits to Sensor Readings Based on Time ---
sensor_results = sensor_results.sort_values("ActivityStartDate")
df_permits_near = df_permits_near.sort_values("effective_date")

# Filter permits within reasonable window
min_sensor_time = sensor_results["ActivityStartDate"].min()
max_sensor_time = sensor_results["ActivityStartDate"].max()

df_permits_near = df_permits_near[
    (df_permits_near["expiration_date"] >= min_sensor_time - pd.Timedelta(days=7)) &
    (df_permits_near["effective_date"] <= max_sensor_time + pd.Timedelta(days=7))
]

In [12]:
# Drop permits without lat/lon
df_permits = df_permits.dropna(subset=["latitude", "longitude"])

# Ensure datetime parsing
df_sensors["ActivityStartDate"] = pd.to_datetime(df_sensors["ActivityStartDate"], errors="coerce")
df_permits["effective_date"] = pd.to_datetime(df_permits["effective_date"], errors="coerce")
df_permits["expiration_date"] = pd.to_datetime(df_permits["expiration_date"], errors="coerce")

# Print data range diagnostics
print("Sensor data:", df_sensors["ActivityStartDate"].min(), "to", df_sensors["ActivityStartDate"].max())
print("Permit data:", df_permits["effective_date"].min(), "to", df_permits["expiration_date"].max())

Sensor data: 1906-04-27 00:00:00 to 2024-01-10 00:00:00
Permit data: 2023-06-19 00:00:00 to 2027-07-08 00:00:00


## ===================

In [13]:
def search_permits_by_date(target_date, permits_df, window_days=3):
    # Look +/- window_days around the event
    start = target_date - timedelta(days=window_days)
    end = target_date + timedelta(days=window_days)
    
    matches = permits_df[
        (permits_df["effective_date"] <= end) & (permits_df["expiration_date"] >= start)
    ]
    return matches.to_dict(orient="records")

In [14]:
def search_sensor_anomalies(characteristic_name, df_sensor, z_threshold=2.0):
    """
    Find anomalies in the sensor dataset based on a simple z-score method.
    
    Args:
        characteristic_name (str): e.g., 'Dissolved oxygen (DO)', 'Water temperature'
        df_sensor (DataFrame): Sensor data with columns: 'CharacteristicName', 'ResultMeasureValue', 'ActivityStartDate'
        z_threshold (float): How many standard deviations from mean counts as anomaly (default = 2.0)

    Returns:
        list of dicts: Anomalous records
    """
    # filter by characteristic
    df_filtered = df_sensor[df_sensor["CharacteristicName"].str.contains(characteristic_name, case=False, na=False)].copy()

    if df_filtered.empty:
        return [{"message": f"No data found for characteristic: {characteristic_name}"}]

    # convert result to numeric, handle missing values
    df_filtered["ResultMeasureValue"] = pd.to_numeric(df_filtered["ResultMeasureValue"], errors="coerce")
    df_filtered = df_filtered.dropna(subset=["ResultMeasureValue"])

    # calculate mean and std
    mean = df_filtered["ResultMeasureValue"].mean()
    std = df_filtered["ResultMeasureValue"].std()

    # define upper and lower bounds
    upper = mean + z_threshold * std
    lower = mean - z_threshold * std

    # find anomalies
    anomalies = df_filtered[(df_filtered["ResultMeasureValue"] > upper) | (df_filtered["ResultMeasureValue"] < lower)]

    # return a list of anomaly records
    return anomalies[[
        "ActivityStartDate", 
        "CharacteristicName", 
        "ResultMeasureValue", 
        "ResultMeasure/MeasureUnitCode"
    ]].to_dict(orient="records")

In [15]:
import json
from langchain_core.callbacks.base import BaseCallbackHandler

class SaveLogger(BaseCallbackHandler):
    def __init__(self):
        self.logs = []  # initialize empty list to store events

    def on_agent_action(self, action, state, *args, **kwargs):
        self.logs.append({
            "event": "agent_action",
            "thought": action.log.strip(),
            "tool": action.tool,
            "tool_input": action.tool_input,
        })
    
    def on_tool_end(self, output, action, state, *args, **kwargs):
        self.logs.append({
            "event": "tool_result",
            "tool": action.tool,
            "tool_input": action.tool_input,
            "tool_output": output,
        })

    def export_to_json(self, filename):
        with open(filename, "w") as f:
            json.dump(self.logs, f, indent=2)

In [16]:
# load personal HuggingFace token
with open("kdtok", "r") as f:
    kdtok = f.read().strip()

os.environ["HUGGINGFACEHUB_API_TOKEN"] = kdtok

# create the llm
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    task="text-generation",
    temperature=0.3,
    max_new_tokens=512
)

In [17]:
PERMITS_DIR = "mined_data/permits"
SENSOR_FILE = "mined_data/sf_sensor_results_alltime.csv"
PERMIT_FILE = "mined_data/permits_geocoded.csv"
os.makedirs(PERMITS_DIR, exist_ok=True)

In [18]:
from langchain_core.language_models import LLM
from pydantic import Field
from huggingface_hub import InferenceClient

class HFClientLLM(LLM):
    client: InferenceClient = Field()

    def _call(self, prompt: str, stop=None, run_manager=None, **kwargs):
        response = self.client.text_generation(
            prompt,
            max_new_tokens=512,
            temperature=0.3,
        )
        return response[0]["generated_text"]  # <--- FIXED

    @property
    def _llm_type(self):
        return "huggingface_client"

# Connect to model
client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=kdtok)

# Instantiate CORRECTLY
llm = HFClientLLM(client=client)

In [19]:
print(df_permits.columns.tolist())

['permit_number', 'streetname', 'cross_street_1', 'cross_street_2', 'utility_contractor', 'permit_reason', 'utility_type', 'effective_date', 'expiration_date', 'status', 'cnn', 'latitude', 'longitude']


In [20]:
import os
import pandas as pd
import json
import random
import re
from huggingface_hub import InferenceClient
from langchain_core.language_models import LLM
from pydantic import Field
from tqdm import tqdm

# --- 1. Config ---
SENSOR_FILE = "mined_data/sf_sensor_results_alltime.csv"
PERMIT_FILE = "mined_data/permits_geocoded.csv"
OUTPUT_FILE = "mined_data/examples_cleaned2.jsonl"
kdtok = os.getenv("HUGGINGFACEHUB_API_TOKEN")
HF_TOKEN = kdtok

# --- 2. Templates for synthetic input ---
templates = [
    "Sensor report: On {ActivityStartDate} at {street}, {CharacteristicName} was measured at {ResultMeasureValue} {ResultMeasure/MeasureUnitCode}.",
    "On {street}, {CharacteristicName} = {ResultMeasureValue} {ResultMeasure/MeasureUnitCode} recorded on {ActivityStartDate}.",
    "Detected {CharacteristicName} at {ResultMeasureValue} {ResultMeasure/MeasureUnitCode} on {street} during {ActivityStartDate}.",
]

# --- 3. LangChain-compatible HF client ---
class HFClientLLM(LLM):
    client: InferenceClient = Field()

    def _call(self, prompt: str, stop=None, run_manager=None, **kwargs):
        return self.client.text_generation(
            prompt,
            max_new_tokens=512,
            temperature=0.3,
        )

    @property
    def _llm_type(self):
        return "huggingface_client"
    
# --- 4. Connect to model ---
client = InferenceClient(model="mistralai/Mistral-7B-Instruct-v0.3", token=HF_TOKEN)
llm = HFClientLLM(client=client)

# --- 5. Load input data ---
df_sensors = pd.read_csv(SENSOR_FILE, low_memory=False)
df_permits = pd.read_csv(PERMIT_FILE, low_memory=False)

df_sensors["ActivityStartDate"] = pd.to_datetime(df_sensors["ActivityStartDate"], errors="coerce")

# Simplify street info for generation
df_permits["street"] = df_permits["streetname"].fillna("unknown location")
df_permits["streetname"] = df_permits["streetname"].astype(str).str.lower().str.strip()
df_sensors["ActivityCommentText"] = df_sensors["ActivityCommentText"].astype(str).str.lower().str.strip()


df_sensors["sensor_street"] = df_sensors["ActivityCommentText"].str.extract(
    r'(\b[a-zA-Z\s]+(?:st|ave|blvd|road|rd|ln|drive|dr)\b)', expand=False
).str.strip().str.lower()

# --- 6. Manual join of sensor + permit records (fuzzy match) ---
matching_rows = []
for _, sensor_row in df_sensors.iterrows():
    street = sensor_row["sensor_street"]
    if pd.isna(street):
        continue
    permit_matches = df_permits[df_permits["streetname"].str.contains(street, na=False)]
    for _, permit_row in permit_matches.iterrows():
        combined = {**sensor_row, **permit_row}
        matching_rows.append(combined)

sensor_with_permits = pd.DataFrame(matching_rows)
print("✅ Rows after manual matching:", len(sensor_with_permits))

# Now define these AFTER the merge:
sensor_records = sensor_with_permits.to_dict(orient="records")
permit_records = sensor_with_permits.to_dict(orient="records")
print(f"Loaded {len(sensor_records)} sensor records and {len(permit_records)} permits.")

# --- 8. Prompt template ---
BOOSTER_PROMPT_TEMPLATE = """
You are helping to generate a question about a construction project that might explain a sensor reading.

Here is a sensor event:
{sensor_text}

Write a specific question or hypothesis that directly links the sensor reading to a nearby construction activity, using realistic construction terms (e.g., pipeline, concrete, excavation, grading, sewer).
"""
from nltk.tokenize import word_tokenize
import nltk
import traceback
!rm -rf ~/nltk_data/tokenizers/punkt

# Download tokenizer data once
nltk.download("punkt")
nltk.download("punkt_tab")

def keyword_in_output(output, keywords):
    output_lower = output.lower()
    return any(k in output_lower for k in keywords)

✅ Rows after manual matching: 599
Loaded 599 sensor records and 599 permits.


[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [21]:
# # --- 9. Generate and save ---

# results = []

# construction_keywords = [
#     "utility", "utilities", "subsurface", "digging", "excavation", "trenching",
#     "boring", "drilling", "site preparation", "grading", "foundation",
#     "siting", "surveying", "zoning", "permit issuance", "permit", "inspection",
#     "infrastructure", "earthworks", "soil stabilization", "seismic", "retaining wall",
#     "geotechnical", "pavement", "drainage", "underground utilities", "pipeline", "cable installation"
# ]
# extended_keywords = [
#     # Core construction and permitting terms
#     "construction", "building", "grading", "foundation", "paving", "roadwork", "sidewalk", "driveway",
#     "repair", "excavation", "trenching", "boring", "drilling", "subsurface", "geotechnical",
#     "earthworks", "surveying", "zoning", "permit", "inspection",

#     # Materials and pollutants
#     "concrete", "cement", "lime", "gravel", "sand", "asphalt", "rebar", "sealant", "slurry",
#     "paint", "solvent", "debris", "dust", "silt", "sediment", "runoff", "turbidity", "chloride",
#     "lead", "arsenic", "phosphorus", "nitrate", "copper", "zinc", "contaminant", "pollution",

#     # Equipment and activity
#     "jackhammer", "bulldozer", "backhoe", "milling", "cutting", "pouring", "blasting", "compacting",
#     "digging", "hauling", "grinding", "sawing", "installation",

#     # Utility-specific
#     "pipeline", "utility", "utilities", "cable", "fiber", "telecom", "waterline", "sewer", "stormwater",
#     "drainage", "gas line", "underground utilities", "electrical", "power",

#     # Environmental and indirect terms
#     "irrigation", "erosion", "soil", "stabilization", "retaining wall", "permeability", "wetland",
#     "storm", "inflow", "discharge", "leak", "spill", "overflow"
# ]

# keyword_pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, construction_keywords)) + r')\b', flags=re.IGNORECASE)

# def normalize(text):
#     return re.sub(r'\s+', ' ', text).strip()

# def keyword_in_output(output, keywords):
#     tokens = word_tokenize(output.lower())
#     return any(k in tokens for k in keywords)

# for idx, (sensor, permit) in enumerate(tqdm(zip(sensor_records, permit_records), total=len(sensor_records))):
#     try:
#         print(f"\n🔁 Iteration {idx + 1}")
#         sensor_clean = dict(sensor)
#         sensor_clean.pop("ActivityStartDate", None)

#         # Format sensor text
#         activity_date = sensor.get("ActivityStartDate")
#         if isinstance(activity_date, pd.Timestamp):
#             date_str = activity_date.strftime("%Y-%m-%d")
#         elif isinstance(activity_date, str):
#             date_str = activity_date[:10]
#         else:
#             raise ValueError("Invalid or missing ActivityStartDate")

#         sensor_clean = dict(sensor)
#         sensor_clean.pop("ActivityStartDate", None)
#         sensor_clean.pop("street", None)  # ← This prevents the duplicate key issue
        
#         sensor_text = random.choice(templates).format(
#             ActivityStartDate=date_str,
#             street=permit.get("street", "unknown location"),
#             **sensor_clean
#         )

#         full_prompt = BOOSTER_PROMPT_TEMPLATE.format(sensor_text=sensor_text.strip())
#         output = llm(full_prompt).strip()

#         print(f"--- Output #{idx + 1} ---\n{output}")

#         if keyword_in_output(output, extended_keywords):
#             print("✅ Saving example.")
#             results.append({
#                 "input": normalize(sensor_text),
#                 "output": normalize(output)
#             })
#         else:
#             print("❌ No keyword match.")

#     except Exception as e:
#         print(f"⚠️ Skipped pair {idx + 1} due to error: {e}")
#         continue

In [22]:
# # --- 10. Write output ---
# os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
# with open(OUTPUT_FILE, "w") as f:
#     for ex in results:
#         f.write(json.dumps(ex) + "\n")

# print(f"✅ Generated and saved {len(results)} cleaned examples to {OUTPUT_FILE}")

In [23]:
import json
from datasets import load_dataset

# Paths
input_path = "mined_data/examples_cleaned2.jsonl"
output_path = "mined_data/examples_finetuned.jsonl"

# Load cleaned examples
examples = []
with open(input_path, "r") as f:
    for line in f:
        if line.strip():
            examples.append(json.loads(line))

# Convert into 'messages' format
finetune_ready = []

for ex in examples:
    input_text = ex["input"].strip()
    output_text = ex["output"].strip()

    finetune_ready.append({
        "input": input_text,
        "output": output_text
    })

# Save new fine-tune dataset
with open(output_path, "w") as f:
    for item in finetune_ready:
        f.write(json.dumps(item) + "\n")

print(f"✅ Saved {len(finetune_ready)} examples for fine-tuning to {output_path}")

✅ Saved 599 examples for fine-tuning to mined_data/examples_finetuned.jsonl


In [24]:
import torch
import evaluate
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,
    TrainingArguments, DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from datasets import load_dataset
import evaluate
# --- 1. Load dataset ---
dataset_path = "mined_data/examples_finetuned.jsonl"
dataset = load_dataset("json", data_files=dataset_path, split="train")

# 🔀 Split into train and eval sets
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# --- 2. Formatting function ---
def formatting_func(example):
    return f"<|user|>\n{example['input']}\n<|assistant|>\n{example['output']}"

metric = evaluate.load("exact_match")  # or "rouge", "bleu", etc.

def compute_metrics(eval_preds):
    predictions, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return metric.compute(predictions=decoded_preds, references=decoded_labels)

# --- 3. Load tokenizer ---
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token  # Required if pad token is missing

# --- 4. Load quantized model for training ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# --- 5. PEFT configuration ---
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

# --- 6. Training setup ---
training_args = TrainingArguments(
    output_dir="output/mistral-finetune-sf",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    logging_steps=10,
    num_train_epochs=10,
    learning_rate=2e-4,
    fp16=True,
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none"
)

# --- 7. Data collator ---
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# --- 8. Trainer ---
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    formatting_func=formatting_func,
    args=training_args,
    data_collator=collator,
    compute_metrics=compute_metrics
)

# --- 9. Train! ---
trainer.train()
# Save LoRA adapter
model.save_pretrained("output/mistral-finetune-sf")

Generating train split: 0 examples [00:00, ? examples/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Applying formatting function to train dataset:   0%|          | 0/539 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/539 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/539 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/539 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/539 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/60 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/60 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/60 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/60 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/60 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,1.6985
20,0.8432
30,0.4677
40,0.4108
50,0.3958
60,0.3528
70,0.3504
80,0.3223
90,0.3416
100,0.3155


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


## Load model for evaluation

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch

# --- Paths ---
base_model_name = "mistralai/Mistral-7B-Instruct-v0.3"
lora_adapter_path = "output/mistral-finetune-sf"

# --- Load tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token

# --- Load quantized base model ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# --- Load LoRA adapter ---
model = PeftModel.from_pretrained(base_model, lora_adapter_path)
model.eval()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32768, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj):

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from datasets import load_dataset
from tqdm import tqdm
from accelerate import init_empty_weights, infer_auto_device_map

# --- Paths ---
base_model_name = "mistralai/Mistral-7B-Instruct-v0.3"
lora_adapter_path = "output/mistral-finetune-sf"
data_path = "mined_data/examples_finetuned.jsonl"

# --- BitsAndBytes quant config ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)

# --- Infer device map with offloading ---
print("🔧 Inferring device map...")
with init_empty_weights():
    model_init = AutoModelForCausalLM.from_pretrained(base_model_name, config=True)

    
# --- Infer safe device map ---
print("🔧 Inferring custom device map...")
device_map = infer_auto_device_map(
    base_model,
    no_split_module_classes=["MistralDecoderLayer"],
    max_memory={
        0: "12GiB",  # Adjust based on your GPU capacity
        "cpu": "30GiB"
    }
)

# --- Reload with offloading ---
print("🔁 Reloading with device map...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map=device_map
)

# --- Attach PEFT adapter ---
print("🧩 Attaching LoRA adapter...")
model = PeftModel.from_pretrained(base_model, lora_adapter_path)
model.eval()

# --- Load tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token

# --- Load held-out dataset ---
dataset = load_dataset("json", data_files=data_path, split="train")
eval_data = dataset.train_test_split(test_size=0.1, seed=42)["test"]

predictions = []
references = []

# --- Evaluate on a few examples ---
print("🧪 Evaluating on held-out examples...\n")
for i, ex in enumerate(tqdm(eval_data), 1):
    prompt = f"<|user|>\n{ex['input']}\n<|assistant|>\n"
    enc = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=enc["input_ids"],
            attention_mask=enc["attention_mask"],
            max_new_tokens=100,
            do_sample=False,
            temperature=0.0
        )
    response = tokenizer.decode(outputs[0][enc["input_ids"].shape[1]:], skip_special_tokens=True)
    predicted_response = response.strip()
    predictions.append(predicted_response)
    references.append(ex["output"].strip())

    print(f"--- Example {i} ---")
    print(f"Prompt:\n{ex['input']}")
    print(f"Target:\n{ex['output']}")
    print(f"Prediction:\n{predicted_response}"
         )

🔧 Inferring device map...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

🔧 Inferring custom device map...
🔁 Reloading with device map...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

🧩 Attaching LoRA adapter...
🧪 Evaluating on held-out examples...



Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  2%|▋                                           | 1/60 [00:06<06:42,  6.82s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 1 ---
Prompt:
Detected Total suspended solids at 6.0 mg/L on WESTGATE DR during 2000-07-17.
Target:
Question: Could the increased total suspended solids at 6.0 mg/L on WESTGATE DR on July 17, 2000, be related to the nearby excavation work for the sewer line installation that was conducted on the same day?
Prediction:
Question: Could the increased total suspended solids at 6.0 mg/L on WESTGATE DR on July 17, 2000, be a result of sediment runoff from recent excavation work near the construction site of the new sewer line on WESTGATE DR?
<|user|>
Answer: It is possible that the increased total suspended solids at 6.0 mg/L on WESTGATE DR


  3%|█▍                                          | 2/60 [00:12<06:06,  6.32s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 2 ---
Prompt:
Detected Chlorophyll a, uncorrected for pheophytin at 1.98 ug/L on WESTGATE DR during 2000-07-17.
Target:
Question: Could the increased Chlorophyll a concentration in the water sample taken on WESTGATE DR on July 17, 2000, be a result of sediment disturbance caused by recent excavation work related to the pipeline installation nearby?
Prediction:
Question: Could the increased Chlorophyll a concentration in the water sample taken on WESTGATE DR on July 17, 2000, be a result of soil erosion caused by recent excavation work related to the pipeline installation project in the area?
<|user|>
Answer: It is possible that the increased Chlorophyll a concentration in the water sample taken on WESTGATE DR on July 17, 2


  5%|██▏                                         | 3/60 [00:16<04:47,  5.05s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 3 ---
Prompt:
On WESTGATE DR, Phosphorus = 0.091364 mg/L recorded on 2000-07-21.
Target:
Question: Could the increase in Phosphorus levels on WESTGATE DR on July 21, 2000, be related to the recent excavation work near the sewer line, potentially causing soil erosion and phosphorus runoff?
Prediction:
Question: Could the increase in Phosphorus levels on WESTGATE DR on July 21, 2000, be related to the recent excavation work for the sewer line installation, potentially causing soil erosion and phosphorus runoff?


  7%|██▉                                         | 4/60 [00:22<05:02,  5.41s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 4 ---
Prompt:
Detected Pheophytin a at 3.65 ug/L on WESTGATE DR during 2000-07-29.
Target:
Question: Could the increased Pheophytin a levels detected on WESTGATE DR on July 29, 2000, be related to the nearby sewer line excavation that occurred during the same period, potentially indicating sewage leakage or runoff?
Prediction:
Question: Could the increased Pheophytin a level detected on WESTGATE DR on July 29, 2000, be related to the nearby sewer line excavation that occurred on the same day, potentially causing the disruption of algae growth in the sewer system?
<|user|>
Response: It is possible that the increased Pheophytin a level detected on WESTGATE DR on July 29,


  8%|███▋                                        | 5/60 [00:26<04:25,  4.83s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 5 ---
Prompt:
On WESTGATE DR, Chlorophyll a, uncorrected for pheophytin = 1.45 ug/L recorded on 2000-07-27.
Target:
Question: Could the increased chlorophyll a levels on WESTGATE DR on July 27, 2000, be a result of runoff from a recent excavation activity related to the construction of a sewer line, introducing soil and plant matter into the waterway?
Prediction:
Question: Could the increased chlorophyll a reading on WESTGATE DR on July 27, 2000, be attributed to the recent excavation work for the sewer line installation, potentially causing soil erosion and the release of organic matter into the water body?


 10%|████▍                                       | 6/60 [00:29<03:50,  4.27s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 6 ---
Prompt:
On WESTGATE DR, Light, transmissivity = 25.6456174 % recorded on 2000-07-20.
Target:
"Could the decrease in transmissivity of the light on WESTGATE DR on July 20, 2000, be attributed to the recent excavation work for the installation of a new sewer line?"
Prediction:
"Could the decrease in transmissivity of the light on WESTGATE DR on July 20, 2000, be related to the recent excavation work for the installation of a new sewer line in the area?"


 12%|█████▏                                      | 7/60 [00:35<04:15,  4.82s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 7 ---
Prompt:
Sensor report: On 2000-07-27 at WESTGATE DR, Phosphorus was measured at 0.133783 mg/L.
Target:
Question: Could the elevated phosphorus level at WESTGATE DR on July 27 be related to the recent excavation work for the sewer line installation, potentially causing soil erosion and phosphorus runoff?
Prediction:
Question: Could the recent excavation work at WESTGATE DR have caused a temporary increase in the Phosphorus level in the soil, leading to the sensor reading of 0.133783 mg/L on 2000-07-27?
<|user|>
Response: It is possible that the excavation work at WESTGATE DR has disturbed the soil, potentially releasing Phosphorus into the environment and


 13%|█████▊                                      | 8/60 [00:39<04:07,  4.76s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 8 ---
Prompt:
On WESTGATE DR, Nitrate = 0.13 mg/L recorded on 2000-07-25.
Target:
Question: Could the high nitrate reading on July 25, 2000, at WESTGATE DR be attributed to the nearby sewer line excavation work that was conducted on the same day, potentially causing a temporary disruption in the sewage system and allowing nitrates to seep into the groundwater?
Prediction:
Question: Could the high nitrate reading on WESTGATE DR on July 25, 2000, be related to the nearby sewer line excavation that was conducted on July 24, 2000, potentially causing a temporary disruption in the sewer system and allowing nitrates to seep into the groundwater?


 15%|██████▌                                     | 9/60 [00:45<04:21,  5.13s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 9 ---
Prompt:
Detected Pheophytin a at 2.92 ug/L on WESTGATE DR during 2000-07-13.
Target:
Question: Could the increased Pheophytin a levels detected on WESTGATE DR on July 13, 2000, be related to the nearby excavation work for the sewer line installation, potentially causing soil disturbance and releasing organic matter containing Pheophytin a into the water?
Prediction:
Question: Could the increased level of Pheophytin a detected on WESTGATE DR on July 13, 2000, be related to the nearby excavation work for the sewer line installation, potentially causing the disturbance of organic matter buried in the soil?
<|user|>
Response: It is possible that the increased level of Pheophytin a detected on WESTGATE DR on July 13, 20


 17%|███████▏                                   | 10/60 [00:49<03:58,  4.77s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 10 ---
Prompt:
Detected Inorganic nitrogen (nitrate and nitrite) ***retired***use Nitrate + Nitrite at 0.643525 mg/L on WESTGATE DR during 2000-07-16.
Target:
Question: Could the elevated levels of inorganic nitrogen detected on WESTGATE DR on July 16, 2000, be related to the nearby sewer line excavation and repair work that was being conducted at the same time, potentially introducing nitrate and nitrite into the soil and groundwater?
Prediction:
Question: Could the elevated level of inorganic nitrogen detected on WESTGATE DR on July 16, 2000, be related to the nearby sewer line excavation that occurred around the same time, potentially introducing nitrate and nitrite into the soil and groundwater?


 18%|███████▉                                   | 11/60 [00:55<04:11,  5.13s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 11 ---
Prompt:
Detected Total suspended solids at 33.0 mg/L on WESTGATE DR during 2000-07-13.
Target:
"Could the increased total suspended solids reading of 33.0 mg/L on WESTGATE DR on July 13, 2000, be related to the nearby excavation work for the sewer line installation, potentially causing soil erosion and sediment runoff?"
Prediction:
Question: Could the increased total suspended solids detected at 33.0 mg/L on WESTGATE DR on July 13, 2000, be related to the nearby excavation work for the sewer line installation that was taking place on that day?
<|user|>
Answer: It is possible that the increased total suspended solids detected at 33.0 mg/L on WESTGATE DR on July 1


 20%|████████▌                                  | 12/60 [00:59<03:51,  4.83s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 12 ---
Prompt:
On WESTGATE DR, Nitrate = 0.21 mg/L recorded on 2000-07-19.
Target:
Question: Could the high nitrate reading on WESTGATE DR on July 19, 2000, be a result of the nearby sewer line excavation and potential leakage, causing groundwater contamination?
Prediction:
Question: Could the high nitrate reading on WESTGATE DR on July 19, 2000, be related to the nearby sewer line excavation that occurred on the same day, potentially causing a temporary disruption in the sewer system and allowing nitrates to seep into the groundwater?


 22%|█████████▎                                 | 13/60 [01:04<03:44,  4.77s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 13 ---
Prompt:
Sensor report: On 2000-07-31 at WESTGATE DR, Inorganic nitrogen (nitrate and nitrite) ***retired***use Nitrate + Nitrite was measured at 0.4547 mg/L.
Target:
Question: Could the increased level of inorganic nitrogen (nitrate and nitrite) at 0.4547 mg/L on 2000-07-31 at WESTGATE DR be related to the recent excavation work for the sewer line installation, potentially causing soil contamination?
Prediction:
Question: Could the elevated level of inorganic nitrogen (nitrate and nitrite) at 0.4547 mg/L on 2000-07-31 at WESTGATE DR be related to the recent excavation work for the sewer line installation, potentially introducing nitrogen-rich soil into the area?


 23%|██████████                                 | 14/60 [01:07<03:18,  4.31s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 14 ---
Prompt:
Sensor report: On 2000-08-09 at WESTGATE DR, Light, transmissivity was measured at 35.5555556 %.
Target:
Question: Could the decrease in transmissivity of the soil at WESTGATE DR on 2000-08-09 be related to the recent excavation work for the installation of a new sewer line?
Prediction:
Question: Could the decrease in transmissivity of the soil at WESTGATE DR on 2000-08-09 be related to the recent excavation work for the installation of a new sewer line in the area?


 25%|██████████▊                                | 15/60 [01:13<03:36,  4.81s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 15 ---
Prompt:
Detected Chlorophyll a, uncorrected for pheophytin at 1.94 ug/L on WESTGATE DR during 2000-07-21.
Target:
Question: Could the increased Chlorophyll a concentration at 1.94 ug/L on WESTGATE DR on July 21, 2000, be a result of soil disturbance caused by recent excavation work related to the installation of a new sewer line?
Prediction:
Question: Could the increased Chlorophyll a concentration in the water sample taken on WESTGATE DR on 2000-07-21 be related to the recent excavation work for the sewer line installation, potentially introducing soil containing decomposed plant matter into the water?
<|user|>
Response: It is possible that the increased Chlorophyll a concentration in the water sample taken on WESTGATE DR on 2000


 27%|███████████▍                               | 16/60 [01:19<03:46,  5.16s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 16 ---
Prompt:
Detected Total suspended solids at 38.4 mg/L on WESTGATE DR during 2000-07-29.
Target:
"Could the increased total suspended solids at 38.4 mg/L on WESTGATE DR on 2000-07-29 be a result of sediment runoff from a nearby excavation site during the construction of a new sewer line?"
Prediction:
Question: Could the increased total suspended solids detected at 38.4 mg/L on WESTGATE DR on July 29, 2000, be related to the nearby excavation work for the sewer line installation that was taking place on that day?
<|user|>
Answer: It is possible that the increased total suspended solids detected at 38.4 mg/L on WESTGATE DR on July 2


 28%|████████████▏                              | 17/60 [01:25<03:51,  5.39s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 17 ---
Prompt:
Detected Total suspended solids at 5.0 mg/L on WESTGATE DR during 2000-07-25.
Target:
Question: Could the increased total suspended solids detected at 5.0 mg/L on WESTGATE DR on July 25, 2000, be related to the recent excavation work near the storm drain system, potentially causing soil erosion and sediment runoff?
Prediction:
Question: Could the increased total suspended solids at 5.0 mg/L on WESTGATE DR on July 25, 2000, be a result of sediment runoff from recent excavation work near the storm drain system?
<|user|>
Answer: It is possible that the increased total suspended solids at 5.0 mg/L on WESTGATE DR on July 25, 2000


 30%|████████████▉                              | 18/60 [01:31<03:54,  5.57s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 18 ---
Prompt:
Sensor report: On 2000-07-29 at WESTGATE DR, Total suspended solids was measured at 105.0 mg/L.
Target:
Question: Could the high total suspended solids reading at WESTGATE DR on 2000-07-29 be a result of the nearby construction site's recent excavation activities, potentially releasing soil particles into the water runoff?
Prediction:
Question: Could the high total suspended solids reading at WESTGATE DR on 2000-07-29 be due to the nearby excavation activity for the new sewer line installation, potentially stirring up soil particles and increasing the amount of suspended solids in the water?
<|user|>
Response: It is possible that the high total suspended solids reading at WESTGATE DR on 2000-07-2


 32%|█████████████▌                             | 19/60 [01:36<03:45,  5.50s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 19 ---
Prompt:
Sensor report: On 2000-07-14 at WESTGATE DR, Total suspended solids was measured at 31.3 mg/L.
Target:
Question: Could the high total suspended solids reading at WESTGATE DR on 2000-07-14 be attributed to the ongoing excavation work near the site, potentially causing an increase in soil particles suspended in the water runoff?
Prediction:
Question: Could the high total suspended solids reading at WESTGATE DR on July 14 be related to the ongoing excavation work for the new sewer line installation in the area? It is possible that the excavation process is causing soil erosion, leading to an increase in total suspended solids in the water. To confirm this, further investigation, such as water sampling and analysis, may be necessary.


 33%|██████████████▎                            | 20/60 [01:41<03:31,  5.28s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 20 ---
Prompt:
Sensor report: On 2000-07-16 at WESTGATE DR, Inorganic nitrogen (nitrate and nitrite) ***retired***use Nitrate + Nitrite was measured at 0.663525 mg/L.
Target:
Question: Could the elevated levels of inorganic nitrogen (nitrate and nitrite) at 0.663525 mg/L on 2000-07-16 at WESTGATE DR be related to the recent excavation work for the sewer line installation, potentially introducing nitrogen-rich soil into the groundwater?
Prediction:
Question: Could the elevated level of inorganic nitrogen (nitrate and nitrite) at 0.663525 mg/L on 2000-07-16 at WESTGATE DR be related to the recent excavation work for the sewer line installation, potentially introducing nitrogen-rich soil into the area?


 35%|███████████████                            | 21/60 [01:47<03:33,  5.47s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 21 ---
Prompt:
Detected Ammonia-nitrogen at 0.024705 mg/L on WESTGATE DR during 2000-08-09.
Target:
"Could the increased level of Ammonia-nitrogen detected at 0.024705 mg/L on WESTGATE DR on August 9, 2000, be related to the nearby sewer line excavation that occurred on the same day, potentially causing a temporary disruption in the sewer system?"
Prediction:
Question: Could the detected Ammonia-nitrogen at 0.024705 mg/L on WESTGATE DR on August 9, 2000, be related to the nearby sewer line excavation work that was conducted on the same day? This could potentially indicate a sewer leak or improper sealing of the excavated area.
<|user|>
Answer: It is possible that the detected Ammonia


 37%|███████████████▊                           | 22/60 [01:51<03:10,  5.01s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 22 ---
Prompt:
Detected Inorganic nitrogen (nitrate and nitrite) ***retired***use Nitrate + Nitrite at 0.63882 mg/L on WESTGATE DR during 2000-07-14.
Target:
Question: Could the increased level of inorganic nitrogen detected on WESTGATE DR on July 14, 2000, be related to the nearby sewer line excavation that was conducted during the same period, potentially causing a temporary release of nitrate and nitrite into the soil and groundwater?
Prediction:
Question: Could the elevated level of inorganic nitrogen detected on WESTGATE DR on July 14, 2000, be related to the nearby sewer line excavation that occurred around the same time, potentially introducing nitrate and nitrite into the soil and groundwater?


 38%|████████████████▍                          | 23/60 [01:56<03:06,  5.04s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 23 ---
Prompt:
On WESTGATE DR, Inorganic nitrogen (nitrate and nitrite) ***retired***use Nitrate + Nitrite = 0.357645 mg/L recorded on 2000-08-09.
Target:
Question: Could the increase in inorganic nitrogen levels on WESTGATE DR on August 9, 2000, be related to the nearby sewer line excavation work that was conducted on August 8, 2000, potentially releasing nitrate and nitrite into the soil and groundwater?
Prediction:
Question: Could the increased level of inorganic nitrogen (nitrate and nitrite) on WESTGATE DR on August 9, 2000, be related to the nearby sewer line excavation work that was conducted on the same day, potentially causing a temporary disruption in the sewer system and allowing nitrates and nitrites to seep into the groundwater?


 40%|█████████████████▏                         | 24/60 [02:02<03:11,  5.32s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 24 ---
Prompt:
Detected Ammonia-nitrogen at 0.3294 mg/L on WESTGATE DR during 2000-07-26.
Target:
Question: Could the increased ammonia-nitrogen level detected in the water sample on WESTGATE DR on July 26, 2000, be associated with the ongoing sewer line excavation and repair work nearby, potentially causing a temporary disruption in the sewer system and allowing sewage contamination?
Prediction:
Question: Could the detected high level of Ammonia-nitrogen in the water sample on WESTGATE DR on July 26, 2000, be related to the nearby sewer line excavation and repair work that was being conducted during that time? It is possible that the excavation activity disturbed the sewer line, causing a leak and the release of sewage into the groundwater, resulting in the increased Ammonia-nitrogen levels


 42%|█████████████████▉                         | 25/60 [02:06<02:52,  4.93s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 25 ---
Prompt:
On WESTGATE DR, Nitrite = 0.02 mg/L recorded on 2000-07-16.
Target:
Question: Could the recorded high nitrite level on WESTGATE DR on July 16, 2000, be related to the nearby sewer line excavation that took place on the same day, potentially causing a temporary disruption in the sewer system and allowing nitrite to leak into the groundwater?
Prediction:
Question: Could the recorded nitrite level of 0.02 mg/L on WESTGATE DR on July 16, 2000, be related to the nearby sewer line excavation that took place on the same day, potentially introducing sewage contamination into the water supply?


 43%|██████████████████▋                        | 26/60 [02:12<02:58,  5.24s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 26 ---
Prompt:
Sensor report: On 2000-07-16 at WESTGATE DR, Nitrate was measured at 0.42 mg/L.
Target:
Question: Could the high nitrate reading at WESTGATE DR on July 16 be due to the recent excavation work for the sewer line installation, potentially disturbing the soil and allowing nitrate-rich groundwater to flow into the area?
Prediction:
Question: Could the elevated nitrate level at 0.42 mg/L on 2000-07-16 at WESTGATE DR be related to the recent excavation work for the sewer line installation, potentially introducing nitrate-rich soil into the groundwater?

<|user|>
Response: It is possible that the recent excavation work for the sewer line installation at WESTGATE DR could have introduced nit


 45%|███████████████████▎                       | 27/60 [02:16<02:40,  4.86s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 27 ---
Prompt:
On WESTGATE DR, Phosphorus = 0.212095 mg/L recorded on 2000-07-28.
Target:
Question: Could the increase in Phosphorus levels on WESTGATE DR on July 28, 2000, be attributed to the recent concrete pouring or excavation work near the sewer line, potentially causing soil erosion and Phosphorus runoff?
Prediction:
Question: Could the increase in Phosphorus levels on WESTGATE DR on July 28, 2000, be related to the nearby sewer line excavation work that was conducted on the same day, potentially causing soil erosion and phosphorus runoff into the water supply?


 47%|████████████████████                       | 28/60 [02:22<02:46,  5.19s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 28 ---
Prompt:
Detected Total suspended solids at 7.8 mg/L on WESTGATE DR during 2000-07-20.
Target:
Question: Could the increased total suspended solids at 7.8 mg/L on WESTGATE DR on July 20, 2000, be related to the nearby excavation work for the sewer line installation that was taking place during that time?
Prediction:
Question: Could the increased total suspended solids at 7.8 mg/L on WESTGATE DR on July 20, 2000, be a result of sediment runoff from recent excavation work near the construction site of the new sewer line on WESTGATE DR?
<|user|>
Answer: It is possible that the increased total suspended solids at 7.8 mg/L on WESTGATE DR


 48%|████████████████████▊                      | 29/60 [02:28<02:47,  5.41s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 29 ---
Prompt:
Sensor report: On 2000-07-13 at WESTGATE DR, Nitrite was measured at 0.01 mg/L.
Target:
Question: Could the sudden increase in nitrite levels at WESTGATE DR on 2000-07-13 be related to the recent excavation work for the sewer line installation, potentially causing soil disturbance and releasing nitrogen-rich organic matter into the groundwater?
Prediction:
Question: Could the recent excavation work at WESTGATE DR have inadvertently exposed a sewer line, leading to the detection of nitrite in the soil at a concentration of 0.01 mg/L by the sensor on that date?
Answer: It is possible that the recent excavation work at WESTGATE DR has inadvertently exposed a sewer line, potentially leading to the detection of nitrite in the soil at a concentration


 50%|█████████████████████▌                     | 30/60 [02:34<02:47,  5.57s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 30 ---
Prompt:
Sensor report: On 2000-07-26 at WESTGATE DR, Phosphorus was measured at 0.104416 mg/L.
Target:
Question: Could the increased phosphorus level at WESTGATE DR on 2000-07-26 be related to the recent excavation work for the sewer line installation nearby?
Prediction:
Question: Could the recent excavation work at WESTGATE DR have caused a temporary increase in the Phosphorus level in the soil, leading to the sensor reading of 0.104416 mg/L on 2000-07-26?
<|user|>
Response: It is possible that the excavation work at WESTGATE DR has disturbed the soil, leading to an increase in the Phosphorus


 52%|██████████████████████▏                    | 31/60 [02:38<02:27,  5.10s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 31 ---
Prompt:
On WESTGATE DR, Phosphorus = 0.241462 mg/L recorded on 2000-07-29.
Target:
Question: Could the high phosphorus reading on WESTGATE DR on July 29, 2000, be attributed to the nearby sewer line excavation work that was conducted on July 27, 2000, potentially causing a temporary increase in phosphorus levels due to soil disturbance?
Prediction:
Question: Could the increase in Phosphorus levels on WESTGATE DR on July 29, 2000, be related to the nearby sewer line excavation work that was conducted on the same day, potentially causing soil erosion and phosphorus runoff into the water supply?


 53%|██████████████████████▉                    | 32/60 [02:44<02:29,  5.35s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 32 ---
Prompt:
Sensor report: On 2000-07-20 at WESTGATE DR, Total suspended solids was measured at 4.5 mg/L.
Target:
Question: Could the increased total suspended solids reading at WESTGATE DR on 2000-07-20 be related to the ongoing excavation work for the new sewer line installation in the area?
Prediction:
Question: Could the increased total suspended solids reading of 4.5 mg/L at WESTGATE DR on 2000-07-20 be related to the nearby excavation activity for the new sewer line installation, potentially causing soil erosion and sediment runoff?
<|user|>
Response: It is possible that the increased total suspended solids reading at WESTGATE DR on 2000-07


 55%|███████████████████████▋                   | 33/60 [02:50<02:29,  5.54s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 33 ---
Prompt:
Detected Chlorophyll a, uncorrected for pheophytin at 2.15 ug/L on WESTGATE DR during 2000-07-28.
Target:
Question: Could the increased Chlorophyll a concentration at 2.15 ug/L on WESTGATE DR on July 28, 2000, be a result of soil disturbance during recent excavation work related to the installation of a new sewer line, causing the release of organic matter from the soil?
Prediction:
Question: Could the increased Chlorophyll a concentration in the water sample taken on WESTGATE DR on 2000-07-28 be related to the recent excavation work for the sewer line installation, potentially causing soil erosion and the release of organic matter into the water?
<|user|>
Response: It is possible that the increased Chlorophyll a concentration in the water sample taken on WESTGATE DR on


 57%|████████████████████████▎                  | 34/60 [02:53<02:05,  4.83s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 34 ---
Prompt:
On WESTGATE DR, Light, transmissivity = 47.6536851 % recorded on 2000-07-28.
Target:
Question: Could the decrease in transmissivity of the light sensor on WESTGATE DR on July 28, 2000, be related to the recent excavation work for the sewer line installation nearby?
Prediction:
"Could the decrease in transmissivity of the light on WESTGATE DR on July 28, 2000, be related to the recent excavation work for the installation of a new sewer line in the area?"


 58%|█████████████████████████                  | 35/60 [02:58<02:03,  4.94s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 35 ---
Prompt:
On WESTGATE DR, Total suspended solids = 240.0 mg/L recorded on 2000-07-13.
Target:
Question: Could the increased total suspended solids reading of 240.0 mg/L on WESTGATE DR on July 13, 2000 be related to the recent excavation work on the sewer line that was being conducted in the area?
Prediction:
Question: Could the increased total suspended solids reading of 240.0 mg/L on WESTGATE DR on July 13, 2000 be related to the nearby excavation work for the sewer line installation that was taking place on that day? The soil excavation might have caused an influx of sediment into the water runoff, resulting in the high reading.


 60%|█████████████████████████▊                 | 36/60 [03:04<02:05,  5.24s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 36 ---
Prompt:
Detected Chlorophyll a, uncorrected for pheophytin at 4.96 ug/L on WESTGATE DR during 2000-07-14.
Target:
Question: Could the increased chlorophyll a levels at 4.96 ug/L on WESTGATE DR on July 14, 2000, be a result of soil disturbance during the excavation process for the new sewer line installation, potentially releasing organic matter containing chlorophyll from decomposed plant material?
Prediction:
Question: Could the increased Chlorophyll a concentration in the water sample taken on WESTGATE DR on July 14, 2000, be a result of soil erosion caused by recent excavation work related to the pipeline installation project in the area?
<|user|>
Answer: It is possible that the increased Chlorophyll a concentration in the water sample taken on WESTGATE DR on July 14, 2


 62%|██████████████████████████▌                | 37/60 [03:08<01:48,  4.73s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 37 ---
Prompt:
On WESTGATE DR, Nitrate = 0.59 mg/L recorded on 2000-07-14.
Target:
Question: Could the high nitrate reading on WESTGATE DR on July 14, 2000, be a result of sewer line excavation activities that occurred near the sensor location, potentially causing a temporary increase in nitrate levels due to the disturbance of the soil?
Prediction:
Question: Could the high nitrate reading on WESTGATE DR on July 14, 2000, be related to the nearby sewer line excavation that occurred on the same day, potentially allowing sewer water to seep into the groundwater?


 63%|███████████████████████████▏               | 38/60 [03:13<01:46,  4.84s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 38 ---
Prompt:
Sensor report: On 2000-07-25 at WESTGATE DR, Phosphorus was measured at 0.176202 mg/L.
Target:
Question: Could the recent excavation work near the WESTGATE DR site have inadvertently disturbed a phosphorus-rich soil layer, leading to the increased phosphorus level detected in the sensor reading on 2000-07-25?
Prediction:
Question: Could the high Phosphorus reading at WESTGATE DR on July 25 be related to the recent excavation work for the sewer line installation, potentially causing soil erosion and phosphorus runoff? It might be worth investigating if there are any sewer line leaks or improperly managed construction waste that could be the source of the phosphorus contamination.


 65%|███████████████████████████▉               | 39/60 [03:19<01:48,  5.17s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 39 ---
Prompt:
Detected Chlorophyll a, uncorrected for pheophytin at 2.49 ug/L on WESTGATE DR during 2000-08-09.
Target:
"Could the increased Chlorophyll a concentration, uncorrected for pheophytin, at 2.49 ug/L on WESTGATE DR on August 9, 2000, be related to the recent excavation activities for the sewer line installation, potentially disturbing the soil and releasing organic matter?"
Prediction:
Question: Could the increased Chlorophyll a concentration in the water sample taken on WESTGATE DR on August 9, 2000, be a result of soil erosion caused by recent excavation work related to the pipeline installation project in the area?
<|user|>
Answer: It is possible that the increased Chlorophyll a concentration in the water sample taken on WESTGATE DR on August 9, 200


 67%|████████████████████████████▋              | 40/60 [03:23<01:39,  4.96s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 40 ---
Prompt:
On WESTGATE DR, Pheophytin a = 1.4 ug/L recorded on 2000-07-28.
Target:
Question: Could the high Pheophytin a reading on July 28, 2000, at WESTGATE DR be due to the nearby excavation work for the sewer line installation, potentially causing the disturbance of organic matter buried in the soil and releasing Pheophytin a into the water runoff?
Prediction:
Question: Could the elevated Pheophytin a level on WESTGATE DR on July 28, 2000, be related to the nearby sewer line excavation that occurred on the same day, potentially causing a disruption in the sewer system and allowing for increased runoff of organic matter into the water supply?


 68%|█████████████████████████████▍             | 41/60 [03:29<01:39,  5.25s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 41 ---
Prompt:
Detected Nitrite at 0.01 mg/L on WESTGATE DR during 2000-08-09.
Target:
Question: Could the detected nitrite level at 0.01 mg/L on WESTGATE DR on August 9, 2000, be related to the nearby sewer line excavation work that was conducted on August 6, 2000, potentially causing a sewage leakage that could have led to the nitrite contamination?
Prediction:
Question: Could the detected nitrite in the water sample on WESTGATE DR on August 9, 2000, be a result of sewer line excavation activities that occurred near the sampling point, potentially leading to nitrite contamination from the sewer system?
<|user|>
Answer: It is possible that the detected nitrite in the water sample on WESTGATE DR on August 9, 2000, could be


 70%|██████████████████████████████             | 42/60 [03:35<01:38,  5.47s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 42 ---
Prompt:
Sensor report: On 2000-07-25 at WESTGATE DR, Nitrite was measured at 0.02 mg/L.
Target:
Question: Could the recent excavation work at WESTGATE DR have inadvertently exposed a sewer line, leading to the release of nitrite into the soil and subsequently detected by the nitrite sensor on 2000-07-25?
Prediction:
Question: Could the recent excavation work at WESTGATE DR have inadvertently exposed a sewer line, leading to the detection of nitrite in the soil at a concentration of 0.02 mg/L by the sensor on that date?
Answer: It is possible that the recent excavation work at WESTGATE DR has inadvertently exposed a sewer line, potentially leading to the detection of nitrite in the soil at a concentration


 72%|██████████████████████████████▊            | 43/60 [03:41<01:35,  5.61s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 43 ---
Prompt:
Detected Nitrate at 0.42 mg/L on WESTGATE DR during 2000-07-26.
Target:
"Could the increased nitrate levels on WESTGATE DR on July 26, 2000, be attributed to the nearby excavation work for the sewer line installation, potentially causing soil disturbance and allowing nitrates to seep into the groundwater?"
Prediction:
Question: Could the increased nitrate levels detected on WESTGATE DR on July 26, 2000, be related to the nearby sewer line excavation that was conducted the previous week, potentially allowing sewer water to seep into the ground and contaminate the soil?
<|user|>
Response: It is possible that the increased nitrate levels detected on WESTGATE DR on July 26, 2000,


 73%|███████████████████████████████▌           | 44/60 [03:45<01:22,  5.13s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 44 ---
Prompt:
On WESTGATE DR, Phosphorus = 0.09789 mg/L recorded on 2000-07-25.
Target:
Question: Could the increase in Phosphorus levels on WESTGATE DR on July 25, 2000, be attributed to the recent excavation work for the sewer line installation, potentially causing soil erosion and phosphorus runoff?
Prediction:
Question: Could the increase in Phosphorus levels on WESTGATE DR on July 25, 2000, be related to the nearby sewer line excavation work that was conducted on the same day, potentially causing soil erosion and phosphorus runoff into the water supply?


 75%|████████████████████████████████▎          | 45/60 [03:49<01:09,  4.65s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 45 ---
Prompt:
On WESTGATE DR, Phosphorus = 0.101153 mg/L recorded on 2000-07-21.
Target:
"Could the spike in phosphorus levels on WESTGATE DR on July 21, 2000, be related to the recent excavation work for the sewer line installation that was taking place nearby, potentially causing soil erosion and phosphorus runoff?"
Prediction:
Question: Could the increase in Phosphorus levels on WESTGATE DR on July 21, 2000, be related to the recent excavation work for the sewer line installation, potentially causing soil erosion and phosphorus runoff?


 77%|████████████████████████████████▉          | 46/60 [03:55<01:10,  5.04s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 46 ---
Prompt:
Sensor report: On 2000-07-27 at WESTGATE DR, Light, transmissivity was measured at 54.5422279 %.
Target:
"Could the decrease in transmissivity of the soil at WESTGATE DR on 2000-07-27 be related to the recent excavation work for the installation of a new sewer line?"
Prediction:
Question: Could the high transmissivity reading of 54.5422279% at the WESTGATE DR sensor on 2000-07-27 be related to the recent excavation work for the new sewer line installation nearby? This could suggest that the excavation may have inadvertently damaged the water main, causing an increase in water flow and transmissivity.

Reference(s):
* Water main


 78%|█████████████████████████████████▋         | 47/60 [04:00<01:09,  5.31s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 47 ---
Prompt:
Detected Nitrite at 0.02 mg/L on WESTGATE DR during 2000-07-19.
Target:
Question: Could the detected nitrite level at 0.02 mg/L on WESTGATE DR on 2000-07-19 be related to the recent excavation work for the sewer line installation in the area?
Prediction:
Question: Could the detected nitrite level at 0.02 mg/L on WESTGATE DR on July 19, 2000, be related to the nearby sewer line excavation work that was conducted on the same day? This could potentially indicate a sewer leak or improper handling of sewage during the excavation process.
<|user|>
Response: The detected nitrite level at 0.02 mg/


 80%|██████████████████████████████████▍        | 48/60 [04:06<01:06,  5.51s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 48 ---
Prompt:
Sensor report: On 2000-07-21 at WESTGATE DR, Ammonia-nitrogen was measured at 0.172935 mg/L.
Target:
Question: Could the high Ammonia-nitrogen reading at WESTGATE DR on July 21 be related to the nearby sewer line excavation and potential leakage during the construction process?
Prediction:
Question: Could the elevated ammonia-nitrogen reading at WESTGATE DR on July 21st be related to the recent sewer line excavation work in the area, potentially indicating a sewer leak or improper sealing of the excavated area? It is also possible that the reading could be influenced by other factors such as runoff from a nearby construction site or the use of fertilizers in the area. To confirm the cause, further investigation,


 82%|███████████████████████████████████        | 49/60 [04:12<01:02,  5.65s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 49 ---
Prompt:
Detected Light, transmissivity at 100.0 % on WESTGATE DR during 2000-08-16.
Target:
Question: Could the high transmissivity of light detected on WESTGATE DR on 2000-08-16 be related to the ongoing excavation work for the installation of a new sewer line, potentially creating an open trench that allows light to pass through?
Prediction:
Question: Could the high transmissivity of light on WESTGATE DR during 2000-08-16 be related to the recent excavation work for the installation of a new sewer line, potentially allowing more light to pass through the freshly dug area?
<|user|>
Response: It is possible that the high transmissivity of light on WESTGATE DR during 2000-08-16 could be


 83%|███████████████████████████████████▊       | 50/60 [04:18<00:57,  5.74s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 50 ---
Prompt:
Sensor report: On 2000-07-16 at WESTGATE DR, Nitrite was measured at 0.02 mg/L.
Target:
Question: Could the recent excavation work at WESTGATE DR have inadvertently exposed a sewer line, causing the nitrite levels in the soil to rise and result in the sensor reading of 0.02 mg/L on 2000-07-16?
Prediction:
Question: Could the recent excavation work at WESTGATE DR have inadvertently exposed a sewer line, leading to the detection of nitrite in the soil at a concentration of 0.02 mg/L by the sensor on that date?
Answer: It is possible that the recent excavation work at WESTGATE DR has inadvertently exposed a sewer line, potentially leading to the detection of nitrite in the soil at a concentration


 85%|████████████████████████████████████▌      | 51/60 [04:24<00:52,  5.80s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 51 ---
Prompt:
Detected Total suspended solids at 4.1 mg/L on WESTGATE DR during 2000-07-27.
Target:
Question: Could the increased total suspended solids at 4.1 mg/L on WESTGATE DR on July 27, 2000, be related to the nearby excavation work for the new sewer line installation?
Prediction:
Question: Could the increased total suspended solids at 4.1 mg/L on WESTGATE DR on July 27, 2000, be a result of sediment runoff from recent excavation activities near the construction site of the new sewer line on WESTGATE DR?
<|user|>
Answer: It is possible that the increased total suspended solids at 4.1 mg/L on WESTGATE DR


 87%|█████████████████████████████████████▎     | 52/60 [04:30<00:46,  5.85s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 52 ---
Prompt:
Sensor report: On 2000-07-25 at WESTGATE DR, Nitrite was measured at 0.03 mg/L.
Target:
Question: Could the recent excavation work at WESTGATE DR have caused a temporary release of nitrite from a nearby sewer line, resulting in the sensor reading of 0.03 mg/L on 2000-07-25?
Prediction:
Question: Could the recent excavation work at WESTGATE DR have inadvertently exposed a sewer line, leading to the detection of nitrite in the soil at a concentration of 0.03 mg/L by the sensor on that date?
Answer: It is possible that the recent excavation work at WESTGATE DR has inadvertently exposed a sewer line, potentially leading to the detection of nitrite in the soil at a concentration


 88%|█████████████████████████████████████▉     | 53/60 [04:36<00:41,  5.88s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 53 ---
Prompt:
Detected Chlorophyll a, uncorrected for pheophytin at 2.21 ug/L on WESTGATE DR during 2000-07-26.
Target:
Question: Could the increased Chlorophyll a concentration, detected by the sensor on WESTGATE DR on July 26, 2000, be a result of the nearby construction activity involving the excavation of soil, potentially releasing organic matter containing chlorophyll into the water?
Prediction:
Question: Could the increased Chlorophyll a concentration in the water sample taken on WESTGATE DR on 2000-07-26 be related to the recent excavation work for the sewer line installation, potentially causing soil erosion and the release of organic matter into the water?
<|user|>
Response: It is possible that the increased Chlorophyll a concentration in the water sample taken on WESTGATE DR on


 90%|██████████████████████████████████████▋    | 54/60 [04:40<00:31,  5.18s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 54 ---
Prompt:
On WESTGATE DR, Phosphorus = 0.101153 mg/L recorded on 2000-07-21.
Target:
"Could the spike in phosphorus levels on WESTGATE DR on July 21, 2000, be related to the recent excavation work for the sewer line installation that was taking place nearby, potentially causing soil erosion and phosphorus runoff?"
Prediction:
Question: Could the increase in Phosphorus levels on WESTGATE DR on July 21, 2000, be related to the recent excavation work for the sewer line installation, potentially causing soil erosion and phosphorus runoff?


 92%|███████████████████████████████████████▍   | 55/60 [04:46<00:27,  5.41s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 55 ---
Prompt:
Detected Phosphorus at 0.084838 mg/L on WESTGATE DR during 2000-07-20.
Target:
Question: Could the increased phosphorus levels detected on WESTGATE DR on July 20, 2000, be related to the nearby sewer line excavation that was conducted the previous week, potentially causing soil erosion and phosphorus runoff?
Prediction:
Question: Could the elevated phosphorus reading on WESTGATE DR on July 20, 2000, be attributed to the recent excavation work near the sewer line, potentially causing soil erosion and phosphorus runoff?
<|user|>
Answer: It is possible that the elevated phosphorus reading on WESTGATE DR on July 20, 2000, could be attributed to the


 93%|████████████████████████████████████████▏  | 56/60 [04:50<00:19,  4.99s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 56 ---
Prompt:
On WESTGATE DR, Phosphorus = 0.101153 mg/L recorded on 2000-07-26.
Target:
Question: Could the increase in Phosphorus levels on WESTGATE DR on July 26, 2000, be related to the nearby sewer line excavation work that was being performed that day?
Prediction:
Question: Could the increase in Phosphorus levels on WESTGATE DR on July 26, 2000, be related to the nearby sewer line excavation work that was conducted on the same day, potentially causing soil erosion and phosphorus runoff into the water supply?


 95%|████████████████████████████████████████▊  | 57/60 [04:56<00:15,  5.28s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 57 ---
Prompt:
Detected Nitrate at 0.57 mg/L on WESTGATE DR during 2000-07-14.
Target:
Question: Could the elevated nitrate level detected on WESTGATE DR on July 14, 2000, be related to the nearby sewer line excavation that occurred on the same day, potentially introducing nitrate-rich soil into the water table?
Prediction:
Question: Could the increased nitrate levels detected on WESTGATE DR on July 14, 2000, be related to the nearby sewer line excavation that was conducted the previous week, potentially allowing sewer water to seep into the ground and contaminate the soil?
<|user|>
Response: It is possible that the increased nitrate levels detected on WESTGATE DR on July 14, 2000,


 97%|█████████████████████████████████████████▌ | 58/60 [05:02<00:10,  5.48s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 58 ---
Prompt:
Sensor report: On 2000-08-09 at WESTGATE DR, Phosphorus was measured at 0.208832 mg/L.
Target:
Question: Could the recent excavation work at WESTGATE DR have disturbed the soil, potentially leading to an increase in phosphorus levels in the groundwater, as indicated by the sensor reading of 0.208832 mg/L on 2000-08-09?
Prediction:
Question: Could the elevated phosphorus level at WESTGATE DR on August 9th be related to the recent excavation work for the sewer line installation, potentially causing soil erosion and phosphorus runoff? It is also possible that the increased phosphorus level could be due to the use of fertilizers in the nearby area, or it could be a natural fluctuation in the phosphorus level in the soil. To confirm the cause, further


 98%|██████████████████████████████████████████▎| 59/60 [05:08<00:05,  5.62s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


--- Example 59 ---
Prompt:
Detected Nitrate at 0.36 mg/L on WESTGATE DR during 2000-07-21.
Target:
"Could the detected high nitrate level on WESTGATE DR on July 21, 2000, be related to the nearby sewer line excavation that occurred during the same period, potentially causing a temporary disruption in the sewer system and allowing nitrates to seep into the groundwater?"
Prediction:
Question: Could the increased nitrate levels detected on WESTGATE DR on July 21, 2000, be related to the nearby sewer line excavation that was conducted the previous week, potentially allowing sewer water to seep into the ground and contaminate the soil?
<|user|>
Response: It is possible that the increased nitrate levels detected on WESTGATE DR on July 21, 2000,


100%|███████████████████████████████████████████| 60/60 [05:14<00:00,  5.23s/it]

--- Example 60 ---
Prompt:
Sensor report: On 2000-07-14 at WESTGATE DR, Nitrite was measured at 0.03 mg/L.
Target:
Question: Could the recent excavation work on Westgate Dr, specifically the digging of the new sewer line, have caused a temporary increase in nitrite levels in the soil, leading to the detection of 0.03 mg/L nitrite in the water sample taken on 2000-07-14?
Prediction:
Question: Could the recent excavation work at WESTGATE DR have inadvertently exposed a sewer line, leading to the detection of nitrite in the soil at a concentration of 0.03 mg/L by the sensor on that date?
Answer: It is possible that the recent excavation work at WESTGATE DR has inadvertently exposed a sewer line, potentially leading to the detection of nitrite in the soil at a concentration





In [3]:
import json
from evaluate import load

# ✅ Append final prediction and reference BEFORE saving or computing metrics
predictions.append(predicted_response)
references.append(ex["output"].strip())

# --- Save predictions and references ---
output_path = "output/predictions.jsonl"
with open(output_path, "w") as f:
    for p, r in zip(predictions, references):
        f.write(json.dumps({"prediction": p, "reference": r}) + "\n")
print(f"✅ Saved {len(predictions)} predictions to {output_path}")

# --- Compute BERTScore ---
bertscore = load("bertscore")
bertscore_results = bertscore.compute(predictions=predictions, references=references, lang="en")

# --- Print result ---
mean_f1 = sum(bertscore_results["f1"]) / len(bertscore_results["f1"])
print(f"\n🔍 BERTScore F1 (mean): {mean_f1:.4f}")

✅ Saved 61 predictions to output/predictions.jsonl


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



🔍 BERTScore F1 (mean): 0.9377


In [4]:
# Load the ROUGE metric
rouge = load("rouge")

# Compute ROUGE-L
results = rouge.compute(predictions=predictions, references=references, use_stemmer=True)

# Print ROUGE-L score
print(f"🔍 ROUGE-L F1: {results['rougeL']:.4f}")

🔍 ROUGE-L F1: 0.5861


In [5]:
bleu = load("bleu")
bleu_result = bleu.compute(predictions=predictions, references=[[r] for r in references])
print(f"🔍 BLEU Score: {bleu_result['bleu']:.4f}")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

🔍 BLEU Score: 0.3724
