In [19]:
import pandas as pd
import json
import requests
import os

class RSIMetadataBuilder:
    """Profile DataFrame columns and enrich the profile with LLM-inferred semantics.

    First-order statistics are computed locally with pandas. Semantic fields are
    requested from the OpenRouter chat-completions endpoint and cached per column
    name in a JSON knowledge base (KB) file so known columns are not re-queried.
    """

    # Keys the model is asked to return; also used as the all-None fallback.
    _ENRICHMENT_FIELDS = (
        "semantic_type",
        "possible_meaning",
        "expected_format",
        "potential_issues",
    )

    def __init__(self, api_key, model_name="openai/gpt-4o", kb_path="metadata_kb.json",
                 request_timeout=60):
        """Store the connection settings and load the KB from ``kb_path``.

        Args:
            api_key: Bearer token sent to OpenRouter.
            model_name: Model identifier placed in the request payload.
            kb_path: Path of the JSON file used as the persistent KB.
            request_timeout: Seconds to wait for the HTTP call before giving up.
        """
        self.api_key = api_key
        self.model_name = model_name
        self.kb_path = kb_path
        self.request_timeout = request_timeout
        self.kb = self._load_kb()

    def _load_kb(self):
        """Return the KB stored at ``self.kb_path``, or an empty dict if the file is absent."""
        if os.path.exists(self.kb_path):
            with open(self.kb_path, "r", encoding="utf-8") as f:
                return json.load(f)
        return {}

    def _save_kb(self):
        """Write the in-memory KB to ``self.kb_path`` as indented JSON."""
        with open(self.kb_path, "w", encoding="utf-8") as f:
            json.dump(self.kb, f, indent=4)

    def build_first_order_metadata(self, df: pd.DataFrame):
        """Compute basic per-column statistics for ``df``.

        Returns:
            dict mapping each column name to its ``dtype`` (as a string),
            ``missing_percent``, ``unique_values`` count and up to five
            distinct non-null ``sample_values``.
        """
        metadata = {}
        for col in df.columns:
            series = df[col]
            metadata[col] = {
                "dtype": str(series.dtype),
                # Cast to built-in numbers so the result is plain JSON-serialisable data.
                "missing_percent": float(series.isna().mean() * 100),
                "unique_values": int(series.nunique()),
                "sample_values": series.dropna().unique()[:5].tolist()
            }
        return metadata

    def _call_openrouter(self, prompt):
        """Send ``prompt`` as a single user message and return the reply text.

        Returns:
            The stripped message content, or ``None`` if the request fails or
            the response does not have the expected shape (the error is printed).
        """
        url = "https://openrouter.ai/api/v1/chat/completions"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        }
        payload = {
            "model": self.model_name,
            "messages": [{"role": "user", "content": prompt}]
        }
        try:
            # Without a timeout a stalled connection would block the kernel indefinitely.
            response = requests.post(
                url,
                headers=headers,
                data=json.dumps(payload),
                timeout=self.request_timeout,
            )
            response.raise_for_status()
            result = response.json()
            # extract model's content
            return result["choices"][0]["message"]["content"].strip()
        except requests.RequestException as e:
            # Network failure, timeout, or non-2xx status.
            print(f"[OpenRouter Error] {e}")
            return None
        except (KeyError, IndexError, TypeError, AttributeError, ValueError) as e:
            # Body was not JSON or did not contain choices[0].message.content as text.
            print(f"[OpenRouter Error] {e}")
            return None

    @staticmethod
    def _build_prompt(col, sample_values):
        """Return the enrichment prompt for one column."""
        return f"""
You are a data expert. Given the column name '{col}' and sample values {sample_values}, 
infer the following as a valid JSON object **only** (no extra text):

- semantic_type
- possible_meaning
- expected_format
- potential_issues

Example output:
{{ 
  "semantic_type": "currency",
  "possible_meaning": "Total transaction amount",
  "expected_format": "float",
  "potential_issues": "Missing values or negative numbers"
}}
"""

    @classmethod
    def _parse_enrichment(cls, ai_text):
        """Extract the enrichment dict from the model reply.

        Accepts bare JSON as well as JSON wrapped in Markdown code fences or
        surrounding prose. Only the requested fields are kept (absent ones
        become ``None``) so a reply cannot overwrite first-order statistics.

        Returns:
            dict with exactly the enrichment fields, or ``None`` when the reply
            is missing, not parseable, not a JSON object, or has none of the
            requested fields.
        """
        if not isinstance(ai_text, str):
            return None

        text = ai_text.strip()
        candidates = [text]
        # Fallback: the outermost {...} span, which skips code fences and stray prose.
        start, end = text.find("{"), text.rfind("}")
        if 0 <= start < end:
            candidates.append(text[start:end + 1])

        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except ValueError:
                continue
            if isinstance(parsed, dict) and any(k in parsed for k in cls._ENRICHMENT_FIELDS):
                return {field: parsed.get(field) for field in cls._ENRICHMENT_FIELDS}
        return None

    def ai_augment_metadata(self, metadata):
        """Merge semantic enrichment into each column's first-order metadata.

        Columns already present in the KB reuse the stored entry. Other columns
        are sent to the model; a usable reply is merged and cached in the KB.
        When no usable reply is obtained the enrichment fields are set to
        ``None`` and nothing is cached, so the column is retried on a later run
        instead of being served as a permanent "KB hit" with empty values.

        Args:
            metadata: dict as produced by ``build_first_order_metadata``; each
                entry must contain ``sample_values``.

        Returns:
            dict mapping column name to the merged metadata.
        """
        enriched_metadata = {}

        for col, info in metadata.items():
            # KB lookup first
            if col in self.kb:
                print(f"[KB HIT] Using stored metadata for '{col}'")
                enriched_metadata[col] = {**info, **self.kb[col]}
                continue

            ai_text = self._call_openrouter(self._build_prompt(col, info["sample_values"]))
            enrichment = self._parse_enrichment(ai_text)

            if enrichment is None:
                print(f"[Enrichment Failed] No usable model output for '{col}'; not cached.")
                enriched_metadata[col] = {**info, **dict.fromkeys(self._ENRICHMENT_FIELDS)}
                continue

            enriched_metadata[col] = {**info, **enrichment}

            # Store in KB for RSI self-improvement
            self.kb[col] = enrichment

        self._save_kb()
        return enriched_metadata

    def update_feedback(self, column_name, corrected_metadata):
        """Replace the KB entry for ``column_name`` with ``corrected_metadata`` and persist it."""
        self.kb[column_name] = corrected_metadata
        self._save_kb()
        print(f"[Feedback Updated] KB updated for '{column_name}'.")

    def save_metadata(self, metadata, file_path="metadata.json"):
        """Write ``metadata`` to ``file_path`` as indented JSON."""
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=4)
        print(f"Metadata saved to {file_path}")


In [24]:
# Location of the employee dataset, relative to the notebook's working directory.
data_path = '../data/employees.csv'
# Parse the CSV into the DataFrame that the following cells profile.
df = pd.read_csv(filepath_or_buffer=data_path)

In [25]:
# Preview only the leading rows so the rendered output stays bounded if the dataset grows.
df.head(10)

Unnamed: 0,Emp_ID,Name,Gender,Age,City,Department,Salary,Email
0,1,Tariq,Male,47.0,Karachi,Sales,123609.72,tariq76@gmail.com
1,2,Hina,Female,26.0,Quetta,HR,125683.94,hina90@gmail.com
2,3,Usman,Male,36.0,Lahore,Sales,,usman71@gmail.com
3,4,Sana,Female,21.0,Quetta,Finance,147901.41,sana72@gmail.com
4,5,Ayesha,Male,40.0,lahore,HR,123880.8,ayesha25@gmail.com
5,6,Ahmed,Male,,Quetta,Finance,50115.7,ahmed33@gmail.com
6,7,Bilal,Female,60.0,Peshawar,Finance,32984.18,bilal93@gmail.com
7,8,Ahmed,Female,,Quetta,IT,,ahmed89@gmail.com
8,9,Usman,Male,,Quetta,Sales,,usman35@gmail.com
9,10,Sara,Female,,Islamabad,Finance,127273.37,sara73@gmail.com


In [27]:
# Read the OpenRouter key from the environment so the secret never lives in the
# notebook source or its saved outputs. NOTE(review): any key that was ever
# committed in this cell should be treated as leaked and revoked.
api_key = os.environ.get("OPENROUTER_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the OPENROUTER_API_KEY environment variable before running this cell."
    )

builder = RSIMetadataBuilder(api_key=api_key, model_name="openai/gpt-4o")
first_order = builder.build_first_order_metadata(df)
enriched = builder.ai_augment_metadata(first_order)
builder.save_metadata(enriched)


Metadata saved to metadata.json
