# Imports and Setup

In [1]:
import os
import pandas as pd
import numpy as np
from typing import List, Dict, Any, Optional
import json
from dataclasses import dataclass
from sentence_transformers import SentenceTransformer
import faiss
from pathlib import Path
import pickle
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# GenericRecord Class

In [2]:
@dataclass
class GenericRecord:
    """A single row of tabular data together with its identifying key."""

    # Mapping of column name -> cell value for this row.
    data: Dict[str, Any]
    # Identifier of the row (annotated as a string).
    primary_key: str

    def to_text(self, text_columns: Optional[List[str]] = None) -> str:
        """Render the record as ``"Column Name: value"`` sentences.

        Args:
            text_columns: Columns to include. When ``None`` or empty, every
                column in ``data`` is used.

        Returns:
            The selected, non-empty fields joined with ``". "`` and ending in
            a period. Column names have underscores replaced by spaces and
            are title-cased. Fields appear in the order they occur in
            ``data``; missing (NaN/None) and empty-string values are skipped.
        """
        if not text_columns:
            selected = self.data
        else:
            selected = {
                name: cell
                for name, cell in self.data.items()
                if name in text_columns
            }

        sentences = [
            f"{name.replace('_', ' ').title()}: {cell}"
            for name, cell in selected.items()
            if pd.notna(cell) and cell != ''
        ]
        return ". ".join(sentences) + "."

    def get_metadata(self) -> Dict[str, Any]:
        """Return a shallow copy of the record's column->value mapping."""
        snapshot = self.data.copy()
        return snapshot

# VectorStore Class

In [3]:
class VectorStore:
    """In-memory semantic index over ``GenericRecord`` objects.

    Records are rendered to text, embedded with a SentenceTransformer model
    (embeddings are normalized) and added to a FAISS ``IndexFlatIP`` index,
    so the score returned by a search is the inner product of normalized
    vectors. ``records``, ``texts`` and ``metadata`` are parallel lists whose
    positions correspond to the positions in the FAISS index.
    """

    # Comparison operators accepted in ``(operator, threshold)`` filters.
    _COMPARATORS = {
        '>': lambda left, right: left > right,
        '>=': lambda left, right: left >= right,
        '<': lambda left, right: left < right,
        '<=': lambda left, right: left <= right,
        '==': lambda left, right: left == right,
        '!=': lambda left, right: left != right,
    }

    def __init__(self, embedding_model: str = 'all-MiniLM-L6-v2'):
        """Load the embedding model and create an empty inner-product index.

        Args:
            embedding_model: Model name passed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(embedding_model)
        self.dimension = self.model.get_sentence_embedding_dimension()
        self.index = faiss.IndexFlatIP(self.dimension)
        self.records: List["GenericRecord"] = []
        self.texts: List[str] = []
        self.metadata: List[Dict] = []
        self.text_columns: Optional[List[str]] = None

    def add_records(self, records: List["GenericRecord"], text_columns: Optional[List[str]] = None):
        """Embed ``records`` and append them to the index.

        Args:
            records: Records to add. An empty list is a no-op apart from
                recording ``text_columns``.
            text_columns: Columns used to build the text that gets embedded;
                ``None`` means all columns. The value is remembered on the
                store (overwriting any previous value).
        """
        self.text_columns = text_columns
        if not records:
            # Nothing to embed; avoid handing an empty batch to the encoder
            # and to FAISS.
            return

        texts = [record.to_text(text_columns) for record in records]
        embeddings = self.model.encode(texts, normalize_embeddings=True, show_progress_bar=False)

        self.index.add(embeddings.astype('float32'))
        self.records.extend(records)
        self.texts.extend(texts)
        self.metadata.extend([record.get_metadata() for record in records])

    def search(self, query: str, k: int = 5, filters: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
        """Return up to ``k`` records most similar to ``query``.

        Args:
            query: Free-text query; embedded with the same model as the
                records.
            k: Maximum number of results. Values <= 0 yield an empty list.
            filters: Optional ``{column: condition}`` mapping applied to each
                candidate's metadata (see ``_matches_filters``).

        Returns:
            A list of dicts with keys ``primary_key``, ``similarity_score``
            (inner product of the normalized embeddings) and ``record`` (the
            record's data dict), in the order FAISS returned them.

        Raises:
            ValueError: If a filter uses an unknown comparison operator.
        """
        if k <= 0 or not self.records:
            return []

        # Never ask FAISS for more neighbours than both the index and the
        # record list can provide.
        available = min(len(self.records), self.index.ntotal)
        if available == 0:
            return []

        # Filters are applied after retrieval, so with filters every record
        # is ranked; a fixed oversampling factor would silently drop matches
        # that rank below it. Without filters the top-k are enough.
        search_k = available if filters else min(k, available)

        query_embedding = self.model.encode([query], normalize_embeddings=True)
        distances, indices = self.index.search(query_embedding.astype('float32'), search_k)

        results = []
        for dist, idx in zip(distances[0], indices[0]):
            # FAISS pads missing neighbours with -1; a negative index would
            # otherwise wrap around to the end of the Python lists.
            if not 0 <= idx < len(self.records):
                continue

            record = self.records[idx]
            metadata = self.metadata[idx]

            if filters and not self._matches_filters(metadata, filters):
                continue

            results.append({
                'primary_key': record.primary_key,
                'similarity_score': float(dist),
                'record': record.data
            })

            if len(results) >= k:
                break

        return results

    def _matches_filters(self, metadata: Dict, filters: Dict) -> bool:
        """Check one record's metadata against every filter condition.

        Condition forms, keyed by column name:
          * ``list``: the metadata value must be a member of the list.
          * 2-tuple ``(operator, threshold)``: numeric comparison with one of
            ``>``, ``>=``, ``<``, ``<=``, ``==``, ``!=``; a value or threshold
            that cannot be converted to ``float`` does not match.
          * anything else: case-insensitive string equality.

        Filters on columns absent from ``metadata`` are ignored.

        Raises:
            ValueError: If a tuple condition names an unknown operator
                (previously such a filter silently matched every record).
        """
        for key, value in filters.items():
            if key not in metadata:
                # Lenient by design: a filter on an unknown column does not
                # exclude the record.
                continue

            meta_value = metadata[key]

            if isinstance(value, list):
                if meta_value not in value:
                    return False

            elif isinstance(value, tuple) and len(value) == 2:
                operator, threshold = value
                compare = self._COMPARATORS.get(operator) if isinstance(operator, str) else None
                if compare is None:
                    raise ValueError(
                        f"Unsupported filter operator {operator!r} for '{key}'. "
                        f"Supported: {list(self._COMPARATORS)}"
                    )

                try:
                    meta_number = float(meta_value)
                    threshold_number = float(threshold)
                except (ValueError, TypeError):
                    return False

                if not compare(meta_number, threshold_number):
                    return False
            else:
                if str(meta_value).lower() != str(value).lower():
                    return False

        return True

    def save(self, path: str):
        """Write the FAISS index and the record lists into directory ``path``.

        Creates the directory if needed and writes ``index.faiss`` plus a
        pickled ``data.pkl`` containing records, texts, metadata and
        text_columns.
        """
        save_path = Path(path)
        save_path.mkdir(parents=True, exist_ok=True)
        faiss.write_index(self.index, str(save_path / "index.faiss"))
        with open(save_path / "data.pkl", 'wb') as f:
            pickle.dump({
                'records': self.records,
                'texts': self.texts,
                'metadata': self.metadata,
                'text_columns': self.text_columns
            }, f)

    def load(self, path: str):
        """Replace this store's index and record lists with those in ``path``.

        Expects the ``index.faiss`` and ``data.pkl`` files written by
        ``save``.
        """
        load_path = Path(path)
        self.index = faiss.read_index(str(load_path / "index.faiss"))
        # SECURITY: unpickling executes arbitrary code embedded in the file.
        # Only load store directories that come from a trusted source.
        with open(load_path / "data.pkl", 'rb') as f:
            data = pickle.load(f)
        self.records = data['records']
        self.texts = data['texts']
        self.metadata = data['metadata']
        self.text_columns = data.get('text_columns')

# UniversalDataIndexer Class

In [4]:
class UniversalDataIndexer:
    """Builds and manages one named ``VectorStore`` per indexed CSV file."""

    def __init__(self, embedding_model: str = 'all-MiniLM-L6-v2'):
        """Create an empty indexer.

        Args:
            embedding_model: Model name handed to every ``VectorStore`` this
                indexer creates.
        """
        self.stores: Dict[str, "VectorStore"] = {}
        self.embedding_model = embedding_model
        self.csv_schemas: Dict[str, Dict] = {}

    def index_csv(self,
                  csv_path: str,
                  store_name: str,
                  primary_key_column: Optional[str] = None,
                  text_columns: Optional[List[str]] = None,
                  exclude_columns: Optional[List[str]] = None):
        """Read a CSV file and index its rows under ``store_name``.

        Args:
            csv_path: Path of the CSV file to read.
            store_name: Name under which the store and schema are registered;
                an existing store with that name is replaced.
            primary_key_column: Column whose value (as a string) identifies
                each record. Defaults to the first column of the file.
            text_columns: Columns embedded for semantic search; ``None``
                embeds every kept column.
            exclude_columns: Columns dropped from the stored record data.
                Names not present in the file are ignored.

        Raises:
            KeyError: If ``primary_key_column`` is not a column of the file.
        """
        df = pd.read_csv(csv_path)

        if primary_key_column is None:
            primary_key_column = df.columns[0]
        if primary_key_column not in df.columns:
            raise KeyError(
                f"Primary key column '{primary_key_column}' not found in '{csv_path}'"
            )

        # Read the keys before dropping columns, so the key column may also
        # be listed in exclude_columns. Going through the column (not through
        # a row Series) keeps integer keys from being upcast to floats.
        primary_keys = [str(value) for value in df[primary_key_column].tolist()]

        if exclude_columns:
            df = df.drop(columns=exclude_columns, errors='ignore')

        # Missing cells are stored as empty strings.
        records = [
            GenericRecord(
                data={col: ('' if pd.isna(value) else value) for col, value in row.items()},
                primary_key=key,
            )
            for key, row in zip(primary_keys, df.to_dict(orient='records'))
        ]

        store = VectorStore(self.embedding_model)
        if records:
            store.add_records(records, text_columns)
        else:
            # Header-only file: keep an empty store but remember the columns.
            store.text_columns = text_columns

        # Register the store and its schema only after indexing succeeded so
        # a failure does not leave a schema without a store.
        self.stores[store_name] = store
        self.csv_schemas[store_name] = {
            'columns': list(df.columns),
            'primary_key': primary_key_column,
            'text_columns': text_columns
        }

        print(f"✓ Indexed {len(records)} records from '{csv_path}' into '{store_name}' store")

    def search(self, store_name: str, query: str, k: int = 5, filters: Optional[Dict] = None) -> List[Dict]:
        """Search one store and return its results.

        Raises:
            ValueError: If ``store_name`` has not been indexed or loaded.
        """
        if store_name not in self.stores:
            raise ValueError(f"Store '{store_name}' not found. Available: {list(self.stores.keys())}")
        return self.stores[store_name].search(query, k, filters)

    def get_schema(self, store_name: str) -> Dict:
        """Return the recorded schema for ``store_name`` or ``{}`` if unknown."""
        return self.csv_schemas.get(store_name, {})

    def list_stores(self) -> List[str]:
        """Return the names of all registered stores."""
        return list(self.stores.keys())

    def save_all(self, base_path: str = "./vector_stores"):
        """Save every store to ``base_path/<store_name>`` plus ``schemas.json``."""
        base = Path(base_path)
        base.mkdir(parents=True, exist_ok=True)

        for store_name, store in self.stores.items():
            store.save(str(base / store_name))

        with open(base / "schemas.json", 'w', encoding='utf-8') as f:
            json.dump(self.csv_schemas, f, indent=2)

        print(f"✓ Saved all stores to '{base_path}'")

    def load_all(self, base_path: str = "./vector_stores"):
        """Load ``schemas.json`` and every saved store found under ``base_path``.

        Only subdirectories containing both ``index.faiss`` and ``data.pkl``
        are treated as stores; other directories are skipped instead of
        aborting the load. Loaded stores are added to (or replace entries in)
        ``self.stores``.

        Raises:
            FileNotFoundError: If ``base_path`` is not an existing directory.
        """
        base = Path(base_path)
        if not base.is_dir():
            raise FileNotFoundError(f"Vector store directory '{base_path}' does not exist")

        schema_path = base / "schemas.json"
        if schema_path.exists():
            with open(schema_path, 'r', encoding='utf-8') as f:
                self.csv_schemas = json.load(f)

        # Sorted for a deterministic load order.
        for store_dir in sorted(base.iterdir()):
            if not store_dir.is_dir():
                continue
            has_index = (store_dir / "index.faiss").is_file()
            has_data = (store_dir / "data.pkl").is_file()
            if not (has_index and has_data):
                continue

            store = VectorStore(self.embedding_model)
            store.load(str(store_dir))
            self.stores[store_dir.name] = store

        print(f"✓ Loaded {len(self.stores)} stores from '{base_path}'")

# Using the Indexer

In [5]:
# Create the indexer. No embedding model is passed, so the constructor's
# default ('all-MiniLM-L6-v2') is used for every store built from it.
indexer = UniversalDataIndexer()
print("Indexer created successfully!")

Indexer created successfully!


## Index all CSV files

In [6]:
# Absolute path of the folder that holds the raw CSV files. The relative path
# is resolved against the current working directory.
base_path = os.path.abspath("../data_directory/unclean_data")

1. Index Accounts

In [7]:
# Index accounts.csv into the "accounts" store, keyed by the "account" column.
indexer.index_csv(
    csv_path=os.path.join(base_path, "accounts.csv"),
    store_name="accounts",
    primary_key_column="account",
    text_columns=["sector", "subsidiary_of", "office_location"]  # Only these columns are embedded; the account name itself is not part of the searchable text
)

✓ Indexed 85 records from '/Users/nissiotoo/Documents/Team2B/data_directory/unclean_data/accounts.csv' into 'accounts' store


2. Index Products

In [8]:
# Index products.csv into the "products" store, keyed by the "product" column.
indexer.index_csv(
    csv_path=os.path.join(base_path, "products.csv"),
    store_name="products",
    primary_key_column="product",
    text_columns=["product", "series"]  # Product name and series are the embedded (searchable) text
)

✓ Indexed 7 records from '/Users/nissiotoo/Documents/Team2B/data_directory/unclean_data/products.csv' into 'products' store


3. Index Sales Pipeline

In [9]:
# Index sales_pipeline.csv into the "pipeline" store, keyed by "opportunity_id".
indexer.index_csv(
    csv_path=os.path.join(base_path, "sales_pipeline.csv"),
    store_name="pipeline",
    primary_key_column="opportunity_id",
    text_columns=["account", "product", "sales_agent", "deal_stage"]  # Embedded text; other columns are kept in the record but are not searchable by meaning
)

✓ Indexed 8800 records from '/Users/nissiotoo/Documents/Team2B/data_directory/unclean_data/sales_pipeline.csv' into 'pipeline' store


4. Index Sales Teams

In [10]:
# Index sales_teams.csv into the "teams" store, keyed by the agent name.
indexer.index_csv(
    csv_path=os.path.join(base_path, "sales_teams.csv"),
    store_name="teams",
    primary_key_column="sales_agent",  # key column, passed explicitly
    text_columns=["sales_agent", "manager", "regional_office"] # All three columns are embedded for search
)

✓ Indexed 35 records from '/Users/nissiotoo/Documents/Team2B/data_directory/unclean_data/sales_teams.csv' into 'teams' store


5. Index Data Dictionary

In [11]:
# Index data_dictionary.csv into the "dictionary" store.
# NOTE(review): "Table" is used as the primary key; if a table has several
# fields the key values repeat, so primary_key is presumably not unique here
# — confirm whether that matters to consumers of the search results.
indexer.index_csv(
    csv_path=os.path.join(base_path, "data_dictionary.csv"),
    store_name="dictionary",
    primary_key_column="Table",  # key column, passed explicitly
    text_columns=["Table", "Field", "Description"]  # All three columns are embedded for search
)

✓ Indexed 21 records from '/Users/nissiotoo/Documents/Team2B/data_directory/unclean_data/data_dictionary.csv' into 'dictionary' store


## Verify the stores

In [12]:
# List all indexed stores
stores = indexer.list_stores()
print(f"Available stores: {stores}\n")

# Show schema for each store
for store in stores:
    schema = indexer.get_schema(store)
    print(f"{store.upper()}:")
    # get_schema returns {} for an unknown store; every name here comes from
    # list_stores(), and index_csv records a schema for each store it builds.
    print(f"   Primary Key: {schema['primary_key']}")
    print(f"   Columns: {schema['columns'][:5]}...")  # Show at most the first 5 columns; the "..." is printed even when there are fewer
    print(f"   Total Records: {len(indexer.stores[store].records)}\n")

Available stores: ['accounts', 'products', 'pipeline', 'teams', 'dictionary']

ACCOUNTS:
   Primary Key: account
   Columns: ['account', 'sector', 'year_established', 'revenue', 'employees']...
   Total Records: 85

PRODUCTS:
   Primary Key: product
   Columns: ['product', 'series', 'sales_price']...
   Total Records: 7

PIPELINE:
   Primary Key: opportunity_id
   Columns: ['opportunity_id', 'sales_agent', 'product', 'account', 'deal_stage']...
   Total Records: 8800

TEAMS:
   Primary Key: sales_agent
   Columns: ['sales_agent', 'manager', 'regional_office']...
   Total Records: 35

DICTIONARY:
   Primary Key: Table
   Columns: ['Table', 'Field', 'Description']...
   Total Records: 21



# Search accounts

In [13]:
# Find accounts in specific sectors.
# The query is compared against the text embedded for the "accounts" store
# (sector, subsidiary_of, office_location as passed to index_csv above).
results = indexer.search(
    store_name="accounts",
    query="healthcare technology companies",
    k=5
)

print("HEALTHCARE/TECH ACCOUNTS:\n")
for i, result in enumerate(results, 1):
    # similarity_score is the inner product of the normalized query and
    # record embeddings.
    print(f"{i}. Score: {result['similarity_score']:.3f}")
    print(f"   Account: {result['record'].get('account', 'N/A')}")
    print(f"   Sector: {result['record'].get('sector', 'N/A')}")
    print(f"   Revenue: {result['record'].get('revenue', 'N/A')}")
    print(f"   Employees: {result['record'].get('employees', 'N/A')}\n")

HEALTHCARE/TECH ACCOUNTS:

1. Score: 0.510
   Account: Zumgoity
   Sector: medical
   Revenue: 441.08
   Employees: 1210

2. Score: 0.510
   Account: The New York Inquirer
   Sector: medical
   Revenue: 439.21
   Employees: 792

3. Score: 0.510
   Account: Silis
   Sector: medical
   Revenue: 2818.38
   Employees: 6290

4. Score: 0.510
   Account: Ron-tech
   Sector: medical
   Revenue: 3922.42
   Employees: 6837

5. Score: 0.510
   Account: Lexiqvolax
   Sector: medical
   Revenue: 1618.89
   Employees: 3889



# Search sales pipeline

In [14]:
# Find low-value closed-won deals (the heading now matches the query text).
# NOTE: the pipeline store embeds only account, product, sales_agent and
# deal_stage, so "low value" is matched purely as text; close_value is
# neither filtered nor ranked on by this search.
results = indexer.search(
    store_name="pipeline",
    query="low value closed won deals",
    k=5
)

print("LOW-VALUE CLOSED-WON DEALS:\n")
for i, result in enumerate(results, 1):
    print(f"{i}. Score: {result['similarity_score']:.3f}")
    print(f"   Account: {result['record'].get('account', 'N/A')}")
    print(f"   Product: {result['record'].get('product', 'N/A')}")
    print(f"   Close Value: {result['record'].get('close_value', 'N/A')}")
    print(f"   Stage: {result['record'].get('deal_stage', 'N/A')}\n")

HIGH-VALUE DEALS:

1. Score: 0.365
   Account: Doncon
   Product: MG Special
   Close Value: 58.0
   Stage: Won

2. Score: 0.350
   Account: Stanredtax
   Product: MG Special
   Close Value: 60.0
   Stage: Won

3. Score: 0.349
   Account: Donquadtech
   Product: MG Special
   Close Value: 59.0
   Stage: Won

4. Score: 0.348
   Account: Treequote
   Product: MG Special
   Close Value: 60.0
   Stage: Won

5. Score: 0.348
   Account: Doncon
   Product: GTX Basic
   Close Value: 529.0
   Stage: Won



# Search with filters

In [15]:
# Find accounts with revenue > $1,000
# The (operator, threshold) tuple is a numeric comparison that the store
# applies to retrieved candidates as a post-filter on their metadata.
results = indexer.search(
    store_name="accounts",
    query="large enterprise accounts",
    k=5,
    filters={"revenue": ('>', 1000)}  # Filter for revenue > $1,000
)

print("LARGE ENTERPRISE ACCOUNTS (Revenue > $1,000):\n")
for i, result in enumerate(results, 1):
    print(f"{i}. {result['record'].get('account', 'N/A')}")
    print(f"   Sector: {result['record'].get('sector', 'N/A')}")
    # NOTE(review): the ',' format spec only works for numbers; if 'revenue'
    # were ever absent, the 'N/A' fallback string would raise ValueError here.
    print(f"   Revenue: ${result['record'].get('revenue', 'N/A'):,}\n")

LARGE ENTERPRISE ACCOUNTS (Revenue > $1,000):

1. Cheers
   Sector: entertainment
   Revenue: $4,269.9

2. Zotware
   Sector: software
   Revenue: $4,478.47

3. Kan-code
   Sector: software
   Revenue: $11,698.03

4. Dontechi
   Sector: software
   Revenue: $4,618.0

5. Scotfind
   Sector: software
   Revenue: $6,354.87



# Search sales teams

In [16]:
# Find sales agents by region.
# This is a similarity search over the embedded sales_agent, manager and
# regional_office text with no filter, so results are the nearest matches
# rather than an exact selection by office.
results = indexer.search(
    store_name="teams",
    query="west coast sales representatives",
    k=5
)

print("WEST COAST SALES TEAM:\n")
for i, result in enumerate(results, 1):
    print(f"{i}. Agent: {result['record'].get('sales_agent', 'N/A')}")
    print(f"   Manager: {result['record'].get('manager', 'N/A')}")
    print(f"   Office: {result['record'].get('regional_office', 'N/A')}\n")

WEST COAST SALES TEAM:

1. Agent: Carol Thompson
   Manager: Celia Rouche
   Office: West

2. Agent: Elizabeth Anderson
   Manager: Cara Losch
   Office: East

3. Agent: Markita Hansen
   Manager: Celia Rouche
   Office: West

4. Agent: Wilburn Farren
   Manager: Cara Losch
   Office: East

5. Agent: Vicki Laflamme
   Manager: Celia Rouche
   Office: West



# Cross-Reference Search

In [17]:
# Find which products a specific account bought
account_name = "sonron"  # Search any account name from accounts.csv

# First find the account.
# The accounts store does not embed the account name, so a plain k=1
# similarity search would always return *some* account and the "not found"
# branch could never run. Instead, rank every account and keep only those
# whose "account" column equals the name (the filter is case-insensitive).
account_results = indexer.search(
    store_name="accounts",
    query=account_name,
    k=len(indexer.stores["accounts"].records),
    filters={"account": account_name}
)

if account_results:
    print(f"DEALS FOR: {account_name}\n")

    # Then find deals for that account. The filter guarantees that every
    # listed deal really belongs to it, not merely to a similar-looking row.
    pipeline_results = indexer.search(
        store_name="pipeline",
        query=account_name,
        k=10,
        filters={"account": account_name}
    )

    for i, result in enumerate(pipeline_results, 1):
        print(f"{i}. Product: {result['record'].get('product', 'N/A')}")
        print(f"   Close Value: {result['record'].get('close_value', 'N/A')}")
        print(f"   Stage: {result['record'].get('deal_stage', 'N/A')}\n")
else:
    print(f"Account '{account_name}' not found")

DEALS FOR: sonron

1. Product: MG Advanced
   Close Value: 3250.0
   Stage: Won

2. Product: MG Advanced
   Close Value: 3527.0
   Stage: Won

3. Product: MG Advanced
   Close Value: 3689.0
   Stage: Won

4. Product: MG Advanced
   Close Value: 3484.0
   Stage: Won

5. Product: GTX Plus Pro
   Close Value: 5816.0
   Stage: Won

6. Product: GTX Plus Pro
   Close Value: 4631.0
   Stage: Won

7. Product: GTXPro
   Close Value: 4936.0
   Stage: Won

8. Product: GTXPro
   Close Value: 4683.0
   Stage: Won

9. Product: MG Special
   Close Value: 0.0
   Stage: Lost

10. Product: GTX Basic
   Close Value: 432.0
   Stage: Won

