In [7]:
import pandas as pd
import numpy as np
from typing import List, Dict, Any, Optional
import json
from dataclasses import dataclass
from sentence_transformers import SentenceTransformer
import faiss
from pathlib import Path
import pickle
import warnings
warnings.filterwarnings('ignore')


RuntimeError: Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [None]:
@dataclass
class GenericRecord:
    """One row of tabular data together with the identifier used to report it.

    The ``@dataclass`` decorator generates the ``__init__(data, primary_key)``
    constructor. Without it the class only declares annotations and cannot be
    instantiated with keyword arguments.
    """

    # Column name -> cell value for this row.
    data: Dict[str, Any]
    # Identifier of the row, stored as a string.
    primary_key: str

    def to_text(self, text_columns: Optional[List[str]] = None) -> str:
        """Render the record as a single sentence-like string.

        Args:
            text_columns: Column names to include. When ``None`` or empty,
                every column in ``data`` is used.

        Returns:
            ``"Key: value"`` fragments joined by ``". "`` and terminated with a
            period. Keys have underscores replaced by spaces and are
            title-cased. Missing values (per ``pd.notna``) and empty strings
            are skipped; if nothing remains the result is just ``"."``.
        """
        if text_columns:
            relevant_data = {k: v for k, v in self.data.items() if k in text_columns}
        else:
            relevant_data = self.data

        text_parts = []
        for key, value in relevant_data.items():
            if pd.notna(value) and value != '':
                key_formatted = key.replace('_', ' ').title()
                text_parts.append(f"{key_formatted}: {value}")

        return ". ".join(text_parts) + "."

    def get_metadata(self) -> Dict[str, Any]:
        """Return a shallow copy of ``data`` so callers cannot mutate the record's dict."""
        return self.data.copy()



In [None]:
class VectorStore:
    """Semantic index over ``GenericRecord`` objects.

    Records are rendered to text, embedded with a SentenceTransformer model
    using normalised embeddings, and stored in a FAISS ``IndexFlatIP`` index.
    The reported similarity score is therefore the inner product of normalised
    vectors (cosine similarity).
    """

    # Comparison operators accepted in ``(operator, threshold)`` filter tuples.
    _COMPARATORS = {
        '>': lambda left, right: left > right,
        '>=': lambda left, right: left >= right,
        '<': lambda left, right: left < right,
        '<=': lambda left, right: left <= right,
        '==': lambda left, right: left == right,
        '!=': lambda left, right: left != right,
    }

    def __init__(self, embedding_model: str = 'all-MiniLM-L6-v2'):
        """Load the embedding model and create an empty inner-product index.

        Args:
            embedding_model: Model name passed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(embedding_model)
        self.dimension = self.model.get_sentence_embedding_dimension()
        self.index = faiss.IndexFlatIP(self.dimension)
        # records, texts and metadata are parallel lists: position i in each
        # corresponds to vector i in the index.
        self.records: List["GenericRecord"] = []
        self.texts: List[str] = []
        self.metadata: List[Dict] = []
        self.text_columns: Optional[List[str]] = None

    def add_records(self, records: List["GenericRecord"], text_columns: Optional[List[str]] = None):
        """Embed ``records`` and append them to the index.

        Args:
            records: Records to add. An empty list is a no-op apart from
                recording ``text_columns``.
            text_columns: Columns used to build the embedded text; forwarded to
                ``GenericRecord.to_text`` and remembered on the store.
        """
        self.text_columns = text_columns
        if not records:
            # Nothing to embed; avoid sending an empty batch to the encoder/index.
            return

        texts = [record.to_text(text_columns) for record in records]
        embeddings = self.model.encode(texts, normalize_embeddings=True, show_progress_bar=False)

        self.index.add(embeddings.astype('float32'))
        self.records.extend(records)
        self.texts.extend(texts)
        self.metadata.extend([record.get_metadata() for record in records])

    def search(self, query: str, k: int = 5, filters: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
        """Return up to ``k`` records most similar to ``query``.

        Args:
            query: Free-text query to embed.
            k: Maximum number of results. Values below 1 yield an empty list.
            filters: Optional mapping of column name to a filter value, in the
                forms accepted by ``_matches_filters``.

        Returns:
            A list of dicts with keys ``primary_key``, ``similarity_score`` and
            ``record``, in the order returned by the index.

        Raises:
            ValueError: If a filter tuple uses an unsupported operator.
        """
        total = len(self.records)
        if total == 0 or k <= 0:
            return []

        query_vector = self.model.encode([query], normalize_embeddings=True).astype('float32')
        # Over-fetch so that filtering still has candidates to choose from.
        search_k = min(k * 5, total)

        while True:
            distances, indices = self.index.search(query_vector, search_k)

            results = []
            for dist, idx in zip(distances[0], indices[0]):
                # FAISS pads missing neighbours with -1, which would otherwise
                # index the last record through Python's negative indexing.
                if not 0 <= idx < total:
                    continue

                if filters and not self._matches_filters(self.metadata[idx], filters):
                    continue

                record = self.records[idx]
                results.append({
                    'primary_key': record.primary_key,
                    'similarity_score': float(dist),
                    'record': record.data
                })

                if len(results) >= k:
                    break

            # Only a filtered search can come up short while unseen candidates
            # remain; widen the candidate pool until k hits or the index is exhausted.
            if len(results) >= k or not filters or search_k >= total:
                return results
            search_k = min(search_k * 2, total)

    def _matches_filters(self, metadata: Dict, filters: Dict) -> bool:
        """Check whether ``metadata`` satisfies every entry in ``filters``.

        Filter value forms:
            * ``list``: the metadata value must be one of the listed values.
            * ``(operator, threshold)`` tuple: numeric comparison after casting
              both sides to ``float``; a value that cannot be cast fails.
            * anything else: case-insensitive string equality.

        Filters on keys that are absent from ``metadata`` are ignored.

        Raises:
            ValueError: If a tuple filter names an operator that is not one of
                ``>``, ``>=``, ``<``, ``<=``, ``==``, ``!=``.
        """
        for key, expected in filters.items():
            if key not in metadata:
                continue

            actual = metadata[key]

            # List of values (OR condition)
            if isinstance(expected, list):
                if actual not in expected:
                    return False

            # Comparison operators
            elif isinstance(expected, tuple) and len(expected) == 2:
                op, threshold = expected
                compare = self._COMPARATORS.get(op) if isinstance(op, str) else None
                if compare is None:
                    # Previously an unknown operator silently matched everything.
                    raise ValueError(
                        f"Unsupported filter operator {op!r} for '{key}'. "
                        f"Supported: {', '.join(self._COMPARATORS)}"
                    )
                try:
                    actual_number = float(actual)
                    threshold_number = float(threshold)
                except (ValueError, TypeError):
                    return False

                if not compare(actual_number, threshold_number):
                    return False

            # Exact match
            elif str(actual).lower() != str(expected).lower():
                return False

        return True

    def save(self, path: str):
        """Write the FAISS index and the pickled record data into directory ``path``.

        The directory is created if needed; files are ``index.faiss`` and ``data.pkl``.
        """
        save_path = Path(path)
        save_path.mkdir(parents=True, exist_ok=True)
        faiss.write_index(self.index, str(save_path / "index.faiss"))
        with open(save_path / "data.pkl", 'wb') as f:
            pickle.dump({
                'records': self.records,
                'texts': self.texts,
                'metadata': self.metadata,
                'text_columns': self.text_columns
            }, f)

    def load(self, path: str):
        """Load vector store contents previously written by ``save`` from ``path``."""
        load_path = Path(path)
        self.index = faiss.read_index(str(load_path / "index.faiss"))
        # SECURITY: pickle.load can execute arbitrary code. Only load stores
        # that were written by a trusted source.
        with open(load_path / "data.pkl", 'rb') as f:
            data = pickle.load(f)
            self.records = data['records']
            self.texts = data['texts']
            self.metadata = data['metadata']
            self.text_columns = data.get('text_columns')

In [None]:
class UniversalDataIndexer:
    """Manage several named ``VectorStore`` instances, each built from one CSV file."""

    def __init__(self, embedding_model: str = 'all-MiniLM-L6-v2'):
        """Create an indexer with no stores.

        Args:
            embedding_model: Model name handed to every ``VectorStore`` created here.
        """
        self.stores: Dict[str, "VectorStore"] = {}
        self.embedding_model = embedding_model
        # store name -> {'columns', 'primary_key', 'text_columns'}
        self.csv_schemas: Dict[str, Dict] = {}

    def index_csv(self,
                  csv_path: str,
                  store_name: str,
                  primary_key_column: Optional[str] = None,
                  text_columns: Optional[List[str]] = None,
                  exclude_columns: Optional[List[str]] = None):
        """Read a CSV and index every row into a new store called ``store_name``.

        Args:
            csv_path: Path of the CSV file to read.
            store_name: Name under which the store and its schema are registered;
                an existing store with the same name is replaced.
            primary_key_column: Column whose value identifies each row. Defaults
                to the first column of the file.
            text_columns: Columns used to build the embedded text (all columns
                when ``None``).
            exclude_columns: Columns dropped from the stored record data. Names
                not present in the file are ignored. The primary key column may
                be excluded; its values are still used as row identifiers.

        Raises:
            KeyError: If ``primary_key_column`` is not a column of the CSV.
        """
        df = pd.read_csv(csv_path)

        if primary_key_column is None:
            primary_key_column = df.columns[0]

        if primary_key_column not in df.columns:
            raise KeyError(
                f"Primary key column '{primary_key_column}' not found in '{csv_path}'. "
                f"Available: {list(df.columns)}"
            )

        # Capture keys before dropping columns so that excluding the key column
        # from the record data does not break indexing.
        primary_keys = [str(value) for value in df[primary_key_column].tolist()]

        if exclude_columns:
            df = df.drop(columns=exclude_columns, errors='ignore')

        # to_dict keeps each column's own type; iterrows() would upcast a row of
        # mixed int/float columns to float (turning key 1 into '1.0').
        records = [
            GenericRecord(
                data={col: ('' if pd.isna(value) else value) for col, value in row.items()},
                primary_key=key,
            )
            for key, row in zip(primary_keys, df.to_dict(orient='records'))
        ]

        store = VectorStore(self.embedding_model)
        if records:
            store.add_records(records, text_columns)
        else:
            # Header-only CSV: keep an empty store instead of embedding nothing.
            store.text_columns = text_columns

        # Register schema and store together, only once the store was built,
        # so a failure above cannot leave a schema without a store.
        self.csv_schemas[store_name] = {
            'columns': list(df.columns),
            'primary_key': primary_key_column,
            'text_columns': text_columns
        }
        self.stores[store_name] = store

        print(f"✓ Indexed {len(records)} records from '{csv_path}' into '{store_name}' store")

    def search(self, store_name: str, query: str, k: int = 5, filters: Optional[Dict] = None) -> List[Dict]:
        """Search a vector store.

        Raises:
            ValueError: If ``store_name`` is not a registered store.
        """
        if store_name not in self.stores:
            raise ValueError(f"Store '{store_name}' not found. Available: {list(self.stores.keys())}")
        return self.stores[store_name].search(query, k, filters)

    def get_schema(self, store_name: str) -> Dict:
        """Return the schema recorded for ``store_name``, or ``{}`` if none is known."""
        return self.csv_schemas.get(store_name, {})

    def list_stores(self) -> List[str]:
        """Return the names of all registered stores."""
        return list(self.stores.keys())

    def save_all(self, base_path: str = "./vector_stores"):
        """Save every store into its own sub-directory of ``base_path`` plus ``schemas.json``."""
        for store_name, store in self.stores.items():
            store.save(f"{base_path}/{store_name}")

        schema_path = Path(base_path) / "schemas.json"
        schema_path.parent.mkdir(parents=True, exist_ok=True)
        with open(schema_path, 'w', encoding='utf-8') as f:
            json.dump(self.csv_schemas, f, indent=2)

        print(f"✓ Saved all stores to '{base_path}'")

    def load_all(self, base_path: str = "./vector_stores"):
        """Load all stores found under ``base_path`` and merge them into this indexer.

        Only sub-directories containing both ``index.faiss`` and ``data.pkl``
        are treated as stores. Schemas from ``schemas.json`` are merged into the
        existing ones so stores indexed earlier in this session keep theirs.
        """
        base = Path(base_path)

        schema_path = base / "schemas.json"
        if schema_path.exists():
            with open(schema_path, 'r', encoding='utf-8') as f:
                self.csv_schemas.update(json.load(f))

        for store_dir in sorted(base.iterdir()):
            if not store_dir.is_dir():
                continue
            # Skip unrelated directories instead of failing the whole load.
            if not ((store_dir / "index.faiss").is_file() and (store_dir / "data.pkl").is_file()):
                continue

            store = VectorStore(self.embedding_model)
            store.load(str(store_dir))
            self.stores[store_dir.name] = store

        print(f"✓ Loaded {len(self.stores)} stores from '{base_path}'")



In [None]:
def parse_result_count(raw: str, default: int = 5) -> int:
    """Parse the "number of results" prompt answer.

    Args:
        raw: Text typed by the user; blank means "use the default".
        default: Value returned for blank input.

    Returns:
        The requested count as a positive integer.

    Raises:
        ValueError: If the text is not a whole number or is below 1.
    """
    text = raw.strip()
    if not text:
        return default
    try:
        count = int(text)
    except ValueError:
        raise ValueError(f"Number of results must be a whole number, got '{text}'") from None
    if count < 1:
        raise ValueError("Number of results must be at least 1")
    return count


def parse_filter_expression(column: str, raw_value: str) -> Dict[str, Any]:
    """Turn a typed filter value into the filter dict expected by ``search``.

    A value starting with ``>=``, ``<=``, ``>`` or ``<`` becomes
    ``{column: (operator, float_threshold)}``; anything else becomes an
    exact-match filter ``{column: value}``.

    Raises:
        ValueError: If a comparison operator is not followed by a number.
    """
    text = raw_value.strip()
    # Two-character operators first, otherwise '>=5' would be read as '>' and '=5'.
    for op in ('>=', '<=', '>', '<'):
        if text.startswith(op):
            operand = text[len(op):].strip()
            try:
                return {column: (op, float(operand))}
            except ValueError:
                raise ValueError(f"Filter threshold must be numeric, got '{operand}'") from None
    return {column: text}


if __name__ == "__main__":
    # Interactive menu loop: index CSV files, search them, and save/load stores.

    indexer = UniversalDataIndexer()

    print("\n" + "="*60)
    print("MULTI-MODAL CSV DATA INDEXER")
    print("="*60)

    while True:
        print("\n--- MENU ---")
        print("1. Index a CSV file")
        print("2. Search indexed data")
        print("3. List indexed stores")
        print("4. Save all stores")
        print("5. Load saved stores")
        print("6. Exit")

        choice = input("\nSelect option (1-6): ").strip()

        if choice == '1':
            # Index CSV
            csv_path = input("Enter CSV file path: ").strip()
            store_name = input("Enter store name: ").strip()
            primary_key = input("Enter primary key column (or press Enter for auto): ").strip()

            try:
                indexer.index_csv(
                    csv_path=csv_path,
                    store_name=store_name,
                    primary_key_column=primary_key if primary_key else None
                )
            except Exception as e:
                print(f"✗ Error: {e}")

        elif choice == '2':
            # Search
            if not indexer.list_stores():
                print("✗ No stores indexed yet. Please index a CSV first.")
                continue

            print(f"\nAvailable stores: {', '.join(indexer.list_stores())}")
            store_name = input("Enter store name: ").strip()

            if store_name not in indexer.list_stores():
                print(f"✗ Store '{store_name}' not found")
                continue

            query = input("Enter search query: ").strip()

            # Bad numeric input returns to the menu instead of killing the loop.
            try:
                k = parse_result_count(input("Number of results (default 5): "))

                # Optional filters
                filters = None
                if input("Add filters? (y/n): ").strip().lower() == 'y':
                    filter_key = input("  Filter column: ").strip()
                    filter_value = input("  Filter value (or >value, >=value, <value, <=value): ")
                    filters = parse_filter_expression(filter_key, filter_value)
            except ValueError as e:
                print(f"✗ Error: {e}")
                continue

            try:
                results = indexer.search(store_name, query, k, filters)

                print(f"\n--- RESULTS ({len(results)} found) ---")
                for i, result in enumerate(results, 1):
                    print(f"\n{i}. {result['primary_key']} (Score: {result['similarity_score']:.3f})")
                    # Show only the first five fields to keep the listing short.
                    for key, value in list(result['record'].items())[:5]:
                        print(f"   {key}: {value}")
            except Exception as e:
                print(f"✗ Error: {e}")

        elif choice == '3':
            # List stores
            stores = indexer.list_stores()
            if not stores:
                print("\n✗ No stores indexed yet")
            else:
                print(f"\n--- INDEXED STORES ({len(stores)}) ---")
                for store in stores:
                    # A store loaded from disk may have no schema entry.
                    schema = indexer.get_schema(store)
                    columns = schema.get('columns', [])
                    preview = ', '.join(str(col) for col in columns[:5])
                    suffix = '...' if len(columns) > 5 else ''
                    print(f"\n{store}:")
                    print(f"  Records: {len(indexer.stores[store].records)}")
                    print(f"  Primary Key: {schema.get('primary_key', 'unknown')}")
                    print(f"  Columns: {preview}{suffix}")

        elif choice == '4':
            # Save
            path = input("Save path (default: ./vector_stores): ").strip()
            path = path if path else "./vector_stores"
            try:
                indexer.save_all(path)
            except Exception as e:
                print(f"✗ Error: {e}")

        elif choice == '5':
            # Load
            path = input("Load path (default: ./vector_stores): ").strip()
            path = path if path else "./vector_stores"
            try:
                indexer.load_all(path)
            except Exception as e:
                print(f"✗ Error: {e}")

        elif choice == '6':
            print("\nGoodbye!")
            break

        else:
            print("✗ Invalid option")