In [0]:
%sql
-- Create the Unity Catalog volume that holds the downloaded driver images.
-- CREATE VOLUME has no OR REPLACE form; IF NOT EXISTS keeps this cell re-runnable.
CREATE VOLUME IF NOT EXISTS formula1.default.driver_images

In [0]:
# Delete the volume directory contents; the second argument (True) requests a
# recursive delete. Presumably this clears images left over from earlier runs
# before the download cell writes fresh ones.
dbutils.fs.rm("/Volumes/formula1/default/driver_images/", True)

In [0]:
import requests
from PIL import Image
from io import BytesIO

# Driver portrait files to fetch from the pinned commit of the upstream dataset.
image_filenames = [
    "Alexander_Albon_23.png",
    "Carlos_Sainz_55.png",
    "Charles_Leclerc_16.png",
    "Esteban_Ocon_31.png",
    "Fernando_Alonso_14.png",
    "Gabriel_Bortoleto_5.png",
    "George_Russell_63.png",
    "Isack_Hadjar_6.png",
    "Jack_Doohan_7.png",
    "Kimi_Antonelli_12.png",
    "Lance_Stroll_18.png",
    "Lando_Norris_4.png",
    "Lewis_Hamilton_44.png",
    "Liam_Lawson_30.png",
    "Max_Verstappen_1.png",
    "Nico_Hulkenberg_27.png",
    "Oliver_Bearman_87.png",
    "Oscar_Piastri_81.png",
    "Pierre_Gasly_10.png",
    "Yuki_Tsunoda_22.png"
]

base_url = "https://raw.githubusercontent.com/toUpperCase78/formula1-datasets/be28da6b5a94315dd5fc8c3fc5f240fdccf6f723/F1%202025%20Season%20Drivers/"
volume_path = "/Volumes/formula1/default/driver_images/"

# Bounding box for the thumbnail: aspect ratio is kept and the longest side
# is capped at 800px (adjust as needed).
max_size = (800, 800)

# Seconds to wait for the server before giving up; without a timeout
# requests.get can block indefinitely on a stalled connection.
request_timeout = 30

# Files that could not be fetched, reported once the loop has finished.
failed_downloads = []

for filename in image_filenames:
    url = f"{base_url}{filename}"

    # Network errors are handled like HTTP errors: report, remember, move on.
    try:
        response = requests.get(url, timeout=request_timeout)
    except requests.RequestException as exc:
        print(f"Failed to download: {filename} ({exc})")
        failed_downloads.append(filename)
        continue

    if response.status_code != 200:
        print(f"Failed to download: {filename} (HTTP {response.status_code})")
        failed_downloads.append(filename)
        continue

    output_filename = filename.replace('.png', '.jpg')

    # The context manager releases the decoder's resources once the file is saved.
    with Image.open(BytesIO(response.content)) as img:
        # Shrinks in place; images already inside the box are left unchanged.
        img.thumbnail(max_size, Image.Resampling.LANCZOS)

        # JPEG has no alpha channel, hence the RGB conversion.
        # NOTE(review): convert('RGB') discards transparency without compositing
        # onto a background - confirm this is acceptable for the source PNGs.
        # Quality 85 is a good balance between size and fidelity.
        img.convert('RGB').save(
            f"{volume_path}{output_filename}",
            'JPEG',
            quality=85,
            optimize=True
        )
    print(f"Saved compressed: {output_filename}")

# Surface partial failures: the caption table built later only contains the
# files that actually reached the volume.
if failed_downloads:
    print(f"{len(failed_downloads)} of {len(image_filenames)} images were not saved: {failed_downloads}")

In [0]:
%sql
-- Build the caption table: one row per file read from the driver_images volume,
-- holding the file path and a model-generated description of the person's mood.
-- Change Data Feed is enabled because a Delta Sync vector search index is
-- created on top of this table later in the notebook.
CREATE OR REPLACE TABLE formula1.default.driver_images_table
TBLPROPERTIES (delta.enableChangeDataFeed = true)
AS
SELECT
  ai_query(
    'databricks-claude-sonnet-4',
    -- Prompt sent with each image (the duplicated "of the person" was removed).
    'Please describe the mood of the person in the image',
    files => files.content
  ) AS enriched_caption,
  files.path
FROM READ_FILES('/Volumes/formula1/default/driver_images', format => 'binaryFile') AS files

In [0]:
%sql
-- Inspect the generated captions next to their source file paths.
SELECT * FROM formula1.default.driver_images_table

In [0]:
%pip install -U --quiet databricks-sdk==0.49.0 "databricks-langchain>=0.4.0" databricks-agents mlflow[databricks] databricks-vectorsearch==0.55 langchain==0.3.25 langchain_core==0.3.59 bs4==0.0.2 markdownify==0.14.1 pydantic==2.10.1
# Restart the Python process so later cells import the package versions
# installed above.
# NOTE(review): the restart is expected to discard Python variables defined by
# earlier cells - confirm nothing below relies on them.
dbutils.library.restartPython()

In [0]:
# Client used by the following cells to create the vector search endpoint and index.
from databricks.vector_search.client import VectorSearchClient
# disable_notice=True presumably suppresses the client's startup notice - confirm.
vsc = VectorSearchClient(disable_notice=True)

In [0]:
import time

# Create unique names with timestamp.
# NOTE(review): because the names embed the current time, every run of this
# cell provisions a brand-new endpoint and index; remove old ones when done.
timestamp = str(int(time.time()))
endpoint_name = f'f1_drivers_endpoint_{timestamp}'
index_name = f'formula1.default.f1_drivers_index_{timestamp}'

print(f"Creating new endpoint: {endpoint_name}")
print(f"Creating new index: {index_name}")

# Create the vector search endpoint
try:
    endpoint = vsc.create_endpoint(
        name=endpoint_name, 
        endpoint_type='STANDARD'
    )
    print(f"✅ Successfully created endpoint: {endpoint_name}")
except Exception as e:
    print(f"❌ Error creating endpoint: {e}")
    raise

# Block until the endpoint is ready. A fixed sleep cannot guarantee this
# because provisioning time varies, so poll through the client instead.
print("Waiting for endpoint to be ready...")
try:
    vsc.wait_for_endpoint(endpoint_name)
    print(f"✅ Endpoint is ready: {endpoint_name}")
except Exception as e:
    print(f"❌ Endpoint did not become ready: {e}")
    raise

# Create the vector search index.
# The index syncs from the caption table (TRIGGERED pipeline), is keyed by the
# image path, and embeds the enriched_caption column with the named endpoint.
try:
    index = vsc.create_delta_sync_index(
        endpoint_name=endpoint_name,
        index_name=index_name,
        source_table_name='formula1.default.driver_images_table',
        pipeline_type="TRIGGERED",
        primary_key="path",
        embedding_source_column='enriched_caption',
        embedding_model_endpoint_name='databricks-gte-large-en'
    )
    print(f"✅ Successfully created index: {index_name}")
except Exception as e:
    print(f"❌ Error creating index: {e}")
    raise

print("\n🎉 Vector search setup complete!")
print(f"Endpoint: {endpoint_name}")
print(f"Index: {index_name}")

In [0]:
# Use the index name we just created: `index_name` comes from the
# index-creation cell. It is deliberately NOT reassigned to a literal here,
# because a hard-coded timestamped name goes stale as soon as that cell is
# re-run and a new index is created.


def search_drivers_by_mood(index_name, query_text, num_results):
    """Run a semantic search against the driver caption index.

    Args:
        index_name: Fully qualified name of the vector search index.
        query_text: Free-text description of the mood to look for. It is
            interpolated into the SQL text as-is, so pass trusted literals
            only (no single quotes).
        num_results: Maximum number of matches to return.

    Returns:
        A Spark DataFrame with search_score, driver_name (file name without
        the .jpg extension) and enriched_caption, best match first.
    """
    return spark.sql(f"""
    SELECT 
        search_score,
        REGEXP_EXTRACT(path, r'([^/]+)\\.jpg$', 1) as driver_name,
        enriched_caption
    FROM VECTOR_SEARCH(
        index => '{index_name}',
        query_text => '{query_text}',
        num_results => {num_results}
    )
    ORDER BY search_score DESC
    """)


print(f"Testing vector search with index: {index_name}")
print("Attempting semantic search...")

try:
    # Test semantic search for confident drivers
    results = search_drivers_by_mood(
        index_name, 'confident and determined professional athlete', 5
    )
    print("\n✅ Vector search is working! \n🏆 Top 5 drivers with confident mood:")
    display(results)

    # Test another semantic search
    results2 = search_drivers_by_mood(
        index_name, 'happy celebrating victory triumph', 3
    )
    print("\n🎉 Top 3 drivers with celebratory mood:")
    display(results2)

    # Test one more search
    results3 = search_drivers_by_mood(
        index_name, 'serious focused intense concentration', 3
    )
    print("\n🎯 Top 3 drivers with serious/focused mood:")
    display(results3)

except Exception as e:
    # Best-effort fallback: any failure above is reported and replaced with a
    # plain keyword match on the caption table so the cell still shows output.
    print(f"\n⚠️ Vector search not ready yet: {e}")
    print("The index is still building embeddings. This typically takes 5-10 minutes.")
    print("\n🔍 Showing text-based search as fallback:")
    
    confident_drivers = spark.sql("""
    SELECT 
        REGEXP_EXTRACT(path, r'([^/]+)\\.jpg$', 1) as driver_name,
        enriched_caption
    FROM formula1.default.driver_images_table
    WHERE LOWER(enriched_caption) LIKE '%confident%'
    ORDER BY driver_name
    """)
    display(confident_drivers)

In [0]:
# Load one stored portrait through the binaryFile reader and show it.
verstappen_image = (
    spark.read
    .format("binaryFile")
    .load("dbfs:/Volumes/formula1/default/driver_images/Max_Verstappen_1.jpg")
)
display(verstappen_image)