In [19]:
# Cell 1: Imports & API Setup
import shap
import pandas as pd
import numpy as np
import google.generativeai as genai
import os
import re
import time
from google.generativeai.types import HarmCategory, HarmBlockThreshold

# --- CONFIGURATION ---
# Ensure your API Key is set in your environment variables
# OR uncomment the line below and paste it directly (not recommended for sharing)
# os.environ["GOOGLE_API_KEY"] = "YOUR_ACTUAL_API_KEY"

genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))

# Use the STABLE model compatible with high volume (as of Dec 2025)
MODEL_NAME = 'gemma-3-12b-it' 

print(f"âœ… Setup Complete. Using Model: {MODEL_NAME}")

âœ… Setup Complete. Using Model: gemma-3-12b-it


In [20]:
# Cell 2: Load Data & Filter for ONLY Important Features

# 1. Define your whitelist
IMPORTANT_FEATURES = [
    "Neighborhood", "Lot Area", "Lot Frontage", "Overall Quality", "Overall Condition",
    "Year Built", "Year Remodeled/Added", "Above Ground Living Area (sqft)",
    "First Floor Area (sqft)", "Total Basement Area (sqft)", "Basement Finished Area 1 (sqft)",
    "Basement Full Bathrooms", "Full Bathrooms", "Bedrooms Above Ground",
    "Kitchen Quality", "Heating Quality", "Central Air Conditioning",
    "Exterior Quality", "Exterior Condition", "Basement Height Quality",
    "Basement Condition", "Garage Capacity (Cars)", "Garage Area (sqft)", "Garage Type"
]

# 2. Load Sample
def load_and_filter(filepath):
    # Parse file
    data_dict = {}
    with open(filepath, 'r') as f:
        for line in f:
            if ":" in line:
                key, val = line.split(":", 1)
                data_dict[key.strip()] = val.strip()
    
    df = pd.DataFrame([data_dict])
    
    # Filter columns immediately
    # We use 'intersection' so code doesn't crash if a specific name is slightly misspelled
    valid_cols = [c for c in IMPORTANT_FEATURES if c in df.columns]
    df_filtered = df[valid_cols].copy()
    
    # Convert numbers
    for col in df_filtered.columns:
        df_filtered[col] = pd.to_numeric(df_filtered[col], errors='ignore')
        
    return df_filtered, valid_cols

sample_df, final_columns = load_and_filter("random_sample.txt")

# 3. Load Background (AmesHousing.csv) and match columns
full_df = pd.read_csv("AmesHousing.csv")

# Ensure background has exact same columns in exact same order
background_pool = full_df[final_columns].copy()

# Fix Missing Values (SHAP hates NaNs)
# Numeric columns get Median, Text columns get "Unknown"
for col in background_pool.columns:
    if pd.api.types.is_numeric_dtype(background_pool[col]):
        background_pool[col] = background_pool[col].fillna(background_pool[col].median())
    else:
        background_pool[col] = background_pool[col].fillna("Unknown")

print(f"âœ… Data Optimized.")
print(f"   Original Features: 80+")
print(f"   Selected Features: {len(final_columns)}")
print(f"   Sample Shape: {sample_df.shape}")

âœ… Data Optimized.
   Original Features: 80+
   Selected Features: 22
   Sample Shape: (1, 22)


  df_filtered[col] = pd.to_numeric(df_filtered[col], errors='ignore')


In [21]:
# Cell 3: API Bridge (30s Delay Logic)

def call_gemini_api(prompt):
    """
    API call with 30s delay to match your strict constraint.
    """
    # Print a countdown so you know it's not frozen
    print("   [Waiting 30s...]", end="\r")
    time.sleep(30) 
    
    try:
        model = genai.GenerativeModel(MODEL_NAME)
        
        # Safety Settings
        safety_settings = {
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
        }
        
        generation_config = genai.types.GenerationConfig(
            candidate_count=1,
            temperature=0.0, 
            max_output_tokens=500 
        )
        
        full_prompt = (
            "Act as a Real Estate Calculator. "
            "Predict the price for this house based on these features:\n"
            f"{prompt}\n"
            "Output ONLY the number."
        )
        
        response = model.generate_content(
            full_prompt,
            generation_config=generation_config,
            safety_settings=safety_settings
        )
        
        if not response.parts: return 0.0
        clean_number = re.sub(r"[^\d.]", "", response.text)
        return float(clean_number) if clean_number else 0.0

    except Exception as e:
        print(f"\n   Error: {e}")
        return 0.0

def llm_predict_wrapper(data_numpy):
    df_temp = pd.DataFrame(data_numpy, columns=sample_df.columns)
    predictions = []
    
    # Progress Bar
    total = len(df_temp)
    for i, (_, row) in enumerate(df_temp.iterrows()):
        print(f"Processing {i+1}/{total}...", end="\r")
        
        prompt_text = ""
        for col, val in row.items():
            prompt_text += f"- {col}: {val}\n"
        
        price = call_gemini_api(prompt_text)
        predictions.append(price)
        
    return np.array(predictions)

In [22]:
# Cell 4: Initialize Single-Row Background (Critical Optimization)

# Strategy: Use ONE background sample (Median House).
# If we used 10 background samples, your 25 mins would turn into 4 hours.
# Using the median provides a perfectly valid "Average Reference Point".

# 1. Calculate Median/Mode row
median_row = {}
for col in background_pool.columns:
    if pd.api.types.is_numeric_dtype(background_pool[col]):
        median_row[col] = background_pool[col].median()
    else:
        # For text, take the most common value (Mode)
        median_row[col] = background_pool[col].mode()[0]

background_summary = pd.DataFrame([median_row])

# 2. Init Explainer
explainer = shap.KernelExplainer(llm_predict_wrapper, background_summary)

print("Explainer Ready (Using Single Median Background for Max Speed)")

Explainer Ready (Using Single Median Background for Max Speed)


In [23]:
# Cell 5: Run Calculation

# With 24 features, nsamples=50 is the mathematical minimum (2 * features + 2).
# 50 samples * 1 background * 30 seconds = ~25 minutes.

print(f"Starting calculation for {len(sample_df.columns)} features.")
print("Estimated Time: ~25 Minutes. Do not close this tab.")

shap_values = explainer.shap_values(sample_df, nsamples=50)

print("\nDONE!")

Starting calculation for 22 features.
Estimated Time: ~25 Minutes. Do not close this tab.


  0%|          | 0/1 [00:00<?, ?it/s]

   [Waiting 30s...]
DONE!


In [24]:
# Cell 6: Save Data for Future Merging
import json

# 1. Extract Parcel ID for the filename (Safe Method)
# Since 'Parcel ID' was filtered out of sample_df to save tokens, we read it from the file again.
current_id = "unknown_id"
try:
    with open("random_sample.txt", "r") as f:
        for line in f:
            if "Parcel ID" in line:
                current_id = line.split(":")[1].strip()
                break
except:
    pass

# 2. Create Output Folder
output_dir = "shap_results"
os.makedirs(output_dir, exist_ok=True)

# 3. Save SHAP Values (Numpy Binary Format)
# This stores the output matrix (Price impact of each feature)
shap_filename = os.path.join(output_dir, f"shap_values_{current_id}.npy")
np.save(shap_filename, shap_values)

# 4. Save Feature Data (CSV)
# This stores the input values (What the house actually looked like)
data_filename = os.path.join(output_dir, f"features_{current_id}.csv")
sample_df.to_csv(data_filename, index=False)

print(f"âœ… SUCCESS: Analysis Saved for House {current_id}")
print(f"   ðŸ“‚ {shap_filename}")
print(f"   ðŸ“‚ {data_filename}")
print("-" * 50)
print("NEXT STEP: Run this notebook again for the next house/sample.")
print("Once you have done 5 houses, run the 'Combiner' code below.")

âœ… SUCCESS: Analysis Saved for House 0528326110
   ðŸ“‚ shap_results/shap_values_0528326110.npy
   ðŸ“‚ shap_results/features_0528326110.csv
--------------------------------------------------
NEXT STEP: Run this notebook again for the next house/sample.
Once you have done 5 houses, run the 'Combiner' code below.
