In [4]:
import os
import re
import json
import requests
import joblib
import torch
import pandas as pd
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from urllib.parse import urlparse
from dotenv import load_dotenv, find_dotenv
from firebase import db
from openai import OpenAI

In [5]:
# Load env and init Firebase
dotenv = find_dotenv()
if dotenv:
    load_dotenv(dotenv)


openai = OpenAI(api_key=os.getenv("OPENAI_RANDY_KEY"))

In [6]:
# ——— Helper Functions ———

def tokenize(text: str) -> list[str]:
    return re.findall(r"\w+", text.lower())

def extract_features(result: dict, query: str) -> dict:
    title, snippet, url = result["title"], result["snippet"], result["url"]
    parsed = urlparse(url)
    domain = parsed.netloc.lower()
    tld = domain.split(".")[-1] if domain else ""
    q_tokens = set(tokenize(query))
    t_tokens = tokenize(title)
    s_tokens = tokenize(snippet)
    feats = {
        "title_overlap":   sum(1 for w in q_tokens if w in t_tokens),
        "snippet_overlap": sum(1 for w in q_tokens if w in s_tokens),
        "url_length":      len(url),
        "url_depth":       parsed.path.count("/"),
        "has_date":        int(any(y in title or y in snippet for y in ("2023","2024"))),
        "score_trust":     int(domain.endswith((".gov",".org",".edu"))),
    }
    # one-hot TLD
    feats[f"tld_{tld}"] = 1
    return feats

def is_scrapable(url: str) -> bool:
    try:
        r = requests.get(url, timeout=5)
        return r.status_code == 200
    except:
        return False

def get_gpt_score(result: dict, query: str) -> int:
    prompt = (
        f"Rate the relevance of this search result to the query:\n"
        f"Query: \"{query}\"\n"
        f"Title: \"{result['title']}\"\n"
        f"Snippet: \"{result['snippet']}\"\n"
        f"URL: {result['url']}\n\n"
        "On a scale of 0 (irrelevant) to 5 (perfectly relevant), return a single integer."
    )
    resp = openai.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You rate search result relevance."},
            {"role": "user",   "content": prompt}
        ],
        temperature=0.0
    ).choices[0].message.content
    # extract first integer
    match = re.search(r"\d+", resp)
    if not match:
        return 0
    score = int(match.group())
    return max(0, min(5, score))


# ——— Data Fetching ———

def fetch_ranking_data(collection: str = "ranking_training_data") -> list[dict]:
    """
    Retrieve all search results from Firestore collection.
    Each document must have 'title', 'snippet', and 'url' fields.
    """
    docs = db.collection(collection).stream()
    results = []
    for d in docs:
        data = d.to_dict()
        results.append({
            "title":   data.get("title", ""),
            "snippet": data.get("snippet", ""),
            "url":     data.get("url", "")
        })
    return results


# ——— Dataset Builders ———

def build_ffnn_dataset(results: list[dict], query: str) -> pd.DataFrame:
    """
    Builds a DataFrame of feedforward features + GPT score + scrapability.
    Columns: all extracted features, 'gpt_score' (0–5), 'malformed_url' (0/1), 'label' placeholder.
    """
    records = []
    for res in results:
        feats = extract_features(res, query)
        # scrapability
        malformed = 0 if is_scrapable(res["url"]) else 1
        feats["malformed_url"] = malformed
        # GPT-based relevance score
        feats["gpt_score"] = 0 if malformed else get_gpt_score(res, query)
        records.append(feats)
    df = pd.DataFrame(records).fillna(0)
    # one-hot any missing TLD columns
    df = pd.get_dummies(df, columns=[c for c in df if c.startswith("tld_")])
    # placeholder label (to be set later)
    df["label"] = 0
    return df

def build_rnn_dataset(results: list[dict], query: str) -> list[dict]:
    """
    Builds an RNN-style dataset: raw text fields + GPT score + scrapability + placeholder label.
    Each record is a dict with keys: 'query','title','snippet','gpt_score','malformed_url','label'
    """
    dataset = []
    for res in results:
        malformed = 0 if is_scrapable(res["url"]) else 1
        score     = 0 if malformed else get_gpt_score(res, query)
        dataset.append({
            "query":        query,
            "title":        res["title"],
            "snippet":      res["snippet"],
            "malformed_url":malformed,
            "gpt_score":    score,
            "label":        0
        })
    return dataset


# ——— Firestore Storage ———

def store_ffnn_dataset(df: pd.DataFrame, collection: str = "ffnn_dataset"):
    """
    Stores each row of the FFNN DataFrame into Firestore.
    Overwrites any existing documents with the same numeric ID.
    """
    col = db.collection(collection)
    # optional: clear existing
    for d in col.stream():
        col.document(d.id).delete()
    for idx, row in df.reset_index(drop=True).iterrows():
        data = row.to_dict()
        col.document(str(idx)).set(data)

def store_rnn_dataset(records: list[dict], collection: str = "rnn_dataset"):
    """
    Stores each record of the RNN dataset into Firestore.
    """
    col = db.collection(collection)
    # optional: clear existing
    for d in col.stream():
        col.document(d.id).delete()
    for idx, rec in enumerate(records):
        col.document(str(idx)).set(rec)


# ——— Combined Builder & Storer ———

def build_and_store_datasets(
    query: str,
    ranking_collection: str = "ranking_training_data",
    ffnn_collection:   str = "ffnn_dataset",
    rnn_collection:    str = "rnn_dataset"
) -> tuple[pd.DataFrame, list[dict]]:
    """
    Fetches raw search results, builds both FFNN and RNN datasets,
    and stores them in Firestore under the specified collections.
    Returns (ffnn_df, rnn_records).
    """
    results = fetch_ranking_data(ranking_collection)
    ffnn_df  = build_ffnn_dataset(results, query)
    rnn_data = build_rnn_dataset(results, query)

    store_ffnn_dataset(ffnn_df, ffnn_collection)
    store_rnn_dataset(rnn_data, rnn_collection)

    return ffnn_df, rnn_data

In [7]:
query = "Find a list of hospitals in Ontario"
ffnn_df, rnn_data = build_and_store_datasets(query)
print("Built FFNN dataset shape:", ffnn_df.shape)
print("Built RNN dataset size:", len(rnn_data))

Built FFNN dataset shape: (4665, 129)
Built RNN dataset size: 4665
