In [1]:
import asyncio
import os
import sys
import pandas as pd
import time
import text_lloom.workbench as wb
from text_lloom.llm import Model, EmbedModel
from dotenv import load_dotenv
from google import genai

# Pull any variables defined in a local .env file into the process environment.
load_dotenv()

# --- Gemini API Key ---
# The key is read from the environment; a missing or empty value stops the cell.
print("üîç Step 1: Loading API Key...")
api_key = os.environ.get("GOOGLE_API_KEY")
if api_key:
    print("‚úÖ API Key loaded successfully")
else:
    print("‚ùå Error: GOOGLE_API_KEY not found in .env")
    sys.exit(1)

# --- Model Setup ---

print("\nüîç Step 2: Setting up model functions...")
# SETUP functions: each takes the API key and returns a genai.Client.
# They are passed further down as `setup_fn` to the LLooM Model / EmbedModel wrappers.
def setup_llm_fn(api_key):
    """Create the Gemini client used for text-generation calls.

    Args:
        api_key: Gemini API key handed to ``genai.Client``.

    Returns:
        The newly constructed ``genai.Client``.
    """
    print("   ‚öôÔ∏è  Setting up LLM...")
    llm_client = genai.Client(api_key=api_key)
    print("   ‚úÖ LLM setup complete")
    return llm_client

def setup_embed_fn(api_key):
    """Create the Gemini client used for embedding calls.

    Args:
        api_key: Gemini API key handed to ``genai.Client``.

    Returns:
        The newly constructed ``genai.Client``.
    """
    print("   ‚öôÔ∏è  Setting up Embedding model...")
    embed_client = genai.Client(api_key=api_key)
    print("   ‚úÖ Embedding model setup complete")
    return embed_client

print("‚úÖ Model functions ready")

# CALL functions: passed further down as `fn` to the LLooM Model / EmbedModel wrappers.
# Each receives the wrapper object (`model`) plus its input and returns (result, tokens).
print("\nüîç Step 3: Setting up call functions...")
async def call_llm_fn(model, prompt):
    """Send one prompt to Gemini and return its text plus token usage.

    Args:
        model: Wrapper object exposing ``args`` (a mutable dict of options),
            ``client`` (the ``genai.Client`` built by the setup function) and
            ``name`` (the Gemini model id).
        prompt: Prompt text to send.

    Returns:
        A ``(text, [input_tokens, output_tokens])`` tuple. ``text`` is ``None``
        when the response carries no text or the request fails; the token
        counts fall back to 0 when the response has no usage metadata.

    Notes:
        * Defaults for ``system_prompt`` and ``temperature`` are stored on
          ``model.args`` so later calls reuse them.
        * Any exception is reported and converted into ``(None, [0, 0])`` so a
          single failed request does not abort the surrounding batch.
    """
    model.args.setdefault(
        "system_prompt",
        "You are a helpful assistant who helps with identifying patterns in text examples.",
    )
    model.args.setdefault("temperature", 0)

    try:
        config = {
            "temperature": model.args["temperature"],
            "max_output_tokens": 8192,
        }

        # Forward the system prompt; previously it was stored but never sent.
        if model.args["system_prompt"]:
            config["system_instruction"] = model.args["system_prompt"]

        # Only enforce JSON output when the prompt asks for it (any letter case).
        if "json" in prompt.lower():
            config["response_mime_type"] = "application/json"

        # The SDK call is synchronous; run it in a worker thread so this
        # coroutine does not block the event loop while waiting on the network.
        res = await asyncio.to_thread(
            model.client.models.generate_content,
            model=model.name,
            contents=prompt,
            config=config,
        )

        res_parsed = getattr(res, "text", None)

        # Report real usage when the response provides it, else zeros.
        usage = getattr(res, "usage_metadata", None)
        tokens = [
            getattr(usage, "prompt_token_count", 0) or 0,
            getattr(usage, "candidates_token_count", 0) or 0,
        ]
        return res_parsed, tokens
    except Exception as e:
        print(f"   ‚ùå LLM Error: {str(e)}")
        return None, [0, 0]

def call_embed_fn(model, text_arr):
    """Embed a list of texts with Gemini, preserving input order.

    Blank or non-string entries are not sent to the API; their slots in the
    result are filled with zero vectors so the output stays aligned with
    ``text_arr``.

    Args:
        model: Wrapper object exposing ``client`` (a ``genai.Client``) and,
            optionally, ``name`` (the embedding model id; falls back to
            ``"gemini-embedding-001"`` when absent or empty).
        text_arr: A single string or a sequence of strings.

    Returns:
        ``(embeddings, [0, 0])`` where ``embeddings`` has one vector per input
        item. Token usage is not tracked for embeddings.

    Raises:
        RuntimeError: if a batch yields no embeddings after the retries, or
            the API returns fewer vectors than texts in the batch.
    """
    fallback_dim = 3072  # zero-vector width used when nothing could be embedded
    batch_size = 10
    max_retries = 3
    model_name = getattr(model, "name", None) or "gemini-embedding-001"

    # Normalize a bare string first so the count logged below is in items, not characters.
    if isinstance(text_arr, str):
        text_arr = [text_arr]

    print(f"   üîó Embedding call initiated with {len(text_arr)} items...")

    # Empty strings cause API errors, so only non-blank strings are submitted.
    valid_indices = [
        i for i, t in enumerate(text_arr) if isinstance(t, str) and t.strip()
    ]

    if not valid_indices:
        print("   ‚ö†Ô∏è Warning: No valid text to embed.")
        # One fresh list per slot so callers never share a single vector object.
        return [[0.0] * fallback_dim for _ in range(len(text_arr))], [0, 0]

    embeddings_map = {}

    # Process in small batches.
    for start_idx in range(0, len(valid_indices), batch_size):
        batch_indices = valid_indices[start_idx:start_idx + batch_size]
        batch = [text_arr[i] for i in batch_indices]

        batch_embeddings = []
        last_error = None

        for attempt in range(max_retries):
            try:
                res = model.client.models.embed_content(
                    model=model_name,
                    contents=batch,
                )
                returned = getattr(res, "embeddings", None)
                if returned:
                    batch_embeddings = [e.values for e in returned]
                    break
            except Exception as e:
                last_error = e
                message = str(e)
                if "429" in message:
                    reason = "Rate limited"
                elif "500" in message or "503" in message:
                    reason = "Server error"
                else:
                    reason = None

                # Give up on non-transient errors, and do not sleep after the
                # final attempt since no further request will follow it.
                if reason is None or attempt == max_retries - 1:
                    print(f"   ‚ùå Embedding Error: {message}")
                    break

                sleep_time = 2 * (attempt + 1)
                print(f"   ‚ö†Ô∏è {reason}. Waiting {sleep_time}s...")
                time.sleep(sleep_time)

        if not batch_embeddings:
            print(f"   ‚ùå Batch failed (size {len(batch)}). API returned no embeddings.")
            raise RuntimeError(
                f"Embedding failed for batch starting at {start_idx}"
            ) from last_error

        # A short response would otherwise be silently padded with zero vectors.
        if len(batch_embeddings) < len(batch):
            raise RuntimeError(
                f"Embedding API returned {len(batch_embeddings)} vectors for "
                f"{len(batch)} texts in batch starting at {start_idx}"
            )

        # zip stops at the batch length, so any surplus vectors are ignored.
        for original_idx, emb in zip(batch_indices, batch_embeddings):
            embeddings_map[original_idx] = emb

    # Size the zero vectors to match what the API actually returned so every
    # row of the result has the same width.
    sample = next(iter(embeddings_map.values()))
    pad_dim = len(sample) if sample is not None and len(sample) > 0 else fallback_dim

    # Rebuild the full list in the original order.
    result_embeddings = [
        embeddings_map[i] if i in embeddings_map else [0.0] * pad_dim
        for i in range(len(text_arr))
    ]

    tokens = [0, 0]
    print(f"   ‚úÖ Embedding complete. {len(result_embeddings)} vectors.")
    return result_embeddings, tokens

print("‚úÖ Call functions ready")

# --- LLooM Instance ---

print("\nüîç Step 4: Loading CSV file...")
df = pd.read_csv("events.csv")
print(f"‚úÖ Loaded {len(df)} events")

# A header-only CSV would otherwise fail below with an opaque ZeroDivisionError.
if df.empty:
    raise ValueError("events.csv contains no rows; nothing to analyze.")

# Ensure we have enough data (duplicate if too small for testing UMAP).
MIN_ROWS_FOR_UMAP = 15  # below this, the frame is duplicated
TARGET_ROWS = 20        # duplication repeats the frame until it exceeds this many rows
if len(df) < MIN_ROWS_FOR_UMAP:
    print("‚ö†Ô∏è Warning: Dataset is very small. Duplicating data for UMAP stability...")
    multiplier = (TARGET_ROWS // len(df)) + 1
    df = pd.concat([df] * multiplier, ignore_index=True)
    print(f"   New shape: {df.shape}")

print("\nüîç Step 5: Creating LLooM instance with Gemini models...")

# Keyword arguments shared by the three Gemini text-generation models;
# only the `cost` pair differs between them.
gemini_llm_kwargs = dict(
    setup_fn=setup_llm_fn,
    fn=call_llm_fn,
    name="gemini-2.5-flash",
    rate_limit=(60, 60),
    context_window=32000,
    api_key=api_key,
)

# Build the LLooM workbench over the `event` column, wiring in the custom
# Gemini models in the order distill -> cluster -> synth -> score.
l = wb.lloom(
    df=df,
    text_col="event",
    distill_model=Model(cost=[0.0005 / 1000, 0.0015 / 1000], **gemini_llm_kwargs),
    cluster_model=EmbedModel(
        setup_fn=setup_embed_fn,
        fn=call_embed_fn,
        name="gemini-embedding-001",
        cost=0.00001 / 1000,
        batch_size=10,
        api_key=api_key,
    ),
    synth_model=Model(cost=[0.01 / 1000, 0.03 / 1000], **gemini_llm_kwargs),
    score_model=Model(cost=[0.0005 / 1000, 0.0015 / 1000], **gemini_llm_kwargs),
)

print("‚úÖ LLooM instance created successfully!")

üîç Step 1: Loading API Key...
‚úÖ API Key loaded successfully

üîç Step 2: Setting up model functions...
‚úÖ Model functions ready

üîç Step 3: Setting up call functions...
‚úÖ Call functions ready

üîç Step 4: Loading CSV file...
‚úÖ Loaded 26 events

üîç Step 5: Creating LLooM instance with Gemini models...
   ‚öôÔ∏è  Setting up LLM...
   ‚úÖ LLM setup complete
   ‚öôÔ∏è  Setting up Embedding model...
   ‚úÖ Embedding model setup complete
   ‚öôÔ∏è  Setting up LLM...
   ‚úÖ LLM setup complete
   ‚öôÔ∏è  Setting up LLM...
   ‚úÖ LLM setup complete
No `id_col` provided. Created an ID column named 'id'.
‚úÖ LLooM instance created successfully!


In [2]:
print("\n" + "="*60)
print("üöÄ Running LLooM Auto Generation")
print("="*60)

print("\nüìä Generating features with auto parameters...")
print("   (This may take several minutes...)\n")

# Run gen_auto without interactive prompt by using debug=False
score_df = await l.gen_auto(
    max_concepts=5,
    debug=False  # Skip interactive prompt
)

print("\n‚úÖ Generation and scoring complete!")
print(f"   Score DataFrame shape: {score_df.shape if score_df is not None else 'None'}")

# Display results
if score_df is not None:
    print("\nüìã Score Results Preview:")
    print(score_df.head(10))



üöÄ Running LLooM Auto Generation

üìä Generating features with auto parameters...
   (This may take several minutes...)

Cost estimates not available for distill model `gemini-2.5-flash`
Cost estimates not available for cluster model `gemini-embedding-001`
Cost estimates not available for synth model `gemini-2.5-flash`


[48;5;117mDistill-filter[0m
‚úÖ Done    


[48;5;117mDistill-summarize[0m
‚úÖ Done    


[48;5;117mCluster[0m
‚†ã Loading   üîó Embedding call initiated with 10 items...
‚†ß Loading    ‚úÖ Embedding complete. 10 vectors.
   üîó Embedding call initiated with 10 items...
‚†¥ Loading    ‚úÖ Embedding complete. 10 vectors.
   üîó Embedding call initiated with 10 items...
‚†ã Loading    ‚úÖ Embedding complete. 10 vectors.
   üîó Embedding call initiated with 10 items...
‚†º Loading    ‚úÖ Embedding complete. 10 vectors.
   üîó Embedding call initiated with 10 items...
‚†π Loading    ‚úÖ Embedding complete. 10 vectors.
   üîó Embedding call initiated with 2 i

In [3]:
# Last expression of the cell, so its return value (a ConceptSelectWidget) is what gets displayed.
l.select()

<text_lloom.__init__.ConceptSelectWidget object at 0x13e47c1d0>

In [4]:
# Last expression of the cell, so its return value (a MatrixWidget) is what gets displayed.
l.vis()

<text_lloom.__init__.MatrixWidget object at 0x110ca8990>

In [None]:
# Prints the currently active concepts with their prompts.
# NOTE(review): this cell has no execution count and its saved output lists 14 active
# concepts although gen_auto above was called with max_concepts=5 — the output may be
# from an earlier session; re-run after Restart & Run All to confirm.
l.show_selected()



[1mActive concepts[0m (n=14):
- [1mHarassment Allegation Inquiry[0m: Does the text describe an inquiry, allegation, or specific comment related to harassment, discrimination, or bias?
- [1mDinner Invitation Event[0m: Does the text mention an invitation, specifically for dinner, or a dinner event occurring at a specific time or place?
- [1mUniversity Job Change[0m: Does the text describe someone joining a new university position or leaving one institution for another?
- [1mMisconduct Allegations[0m: Does the text describe accusations, inappropriate behavior, harassment, or formal findings of misconduct against Lawrence Krauss?
- [1mPublic Engagements[0m: Does the text describe Lawrence Krauss participating in public events, conventions, social gatherings, or professional meetings?
- [1mInternal Communications[0m: Does the text describe Lawrence Krauss's internal emails, administrative meetings, or formal communications within an organization?
- [1mEpstein Controversy[0