# Load the GTE embedding model inside the current Python environment

In [0]:
# Name of the registered model; a later cell combines it with a version
# number to build a "models:/<name>/<version>" URI.
model_name = "system.ai.gte_large_en_v1_5"

In [0]:
import mlflow
import subprocess
import sys
import os

def install_model_dependencies(model_uri):
    """
    Get and install model dependencies using MLflow.

    Asks MLflow for the requirements file of ``model_uri``, lists the
    requirements it contains, and installs them into the interpreter that
    is currently running (``sys.executable``).

    The file is handed to pip with ``-r`` instead of passing each line as a
    separate command-line argument. Lines that carry an option and a value
    (``-c constraints.txt``, ``-r other.txt``, ``--extra-index-url URL``) or
    an inline comment are only parsed correctly by pip when it reads the
    requirements file itself.

    Args:
        model_uri: MLflow model URI whose dependencies should be installed.

    Returns:
        True if the dependencies were installed, or if the requirements file
        contains no entries. False if no requirements file is available or
        if any step raised an exception (the error is printed, not raised).
    """
    print(f"Getting dependencies for model: {model_uri}")

    try:
        # Get model dependencies (returns path to requirements.txt)
        deps_file = mlflow.pyfunc.get_model_dependencies(model_uri)
        print(f"Dependencies file: {deps_file}")

        if not deps_file or not os.path.exists(deps_file):
            print("No dependencies file found or file doesn't exist.")
            return False

        print(f"\nReading dependencies from: {deps_file}")

        # Read the file only to report what will be installed; blank lines
        # and full-line comments are skipped in the listing.
        with open(deps_file, "r", encoding="utf-8") as f:
            stripped_lines = (line.strip() for line in f)
            requirements = [
                line for line in stripped_lines
                if line and not line.startswith("#")
            ]

        if not requirements:
            print("No dependencies found in requirements file.")
            return True

        print(f"Found {len(requirements)} dependencies:")
        for req in requirements:
            print(f"  - {req}")

        print("\nInstalling dependencies...")
        # Let pip parse the requirements file itself so option lines,
        # inline comments and relative includes are handled correctly.
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "-r", deps_file]
        )
        print("✅ Dependencies installed successfully!")
        return True

    except Exception as e:
        # Deliberately broad: callers use the boolean result to decide
        # whether to fall back to a manual installation.
        print(f"Error processing dependencies: {e}")
        return False

# Build the URI for version 2 of the registered model and try to install
# the dependencies that MLflow reports for it.
model_uri = "models:/{}/2".format(model_name)
success = install_model_dependencies(model_uri)

# When the MLflow-driven install reports failure, install transformers and
# torch directly with pip instead.
if not success:
    print("\nFalling back to manual installation...")
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "transformers", "torch"]
    )
    print("✅ Manual dependencies installed!")

print("\nDependencies are ready!")

In [0]:
print("Loading the model...")

# Load the model referenced by model_uri through MLflow's pyfunc loader
model = mlflow.pyfunc.load_model(model_uri)

# Single print call; sep="\n" puts each message on its own line
print(
    f"✅ Successfully loaded model: {model_name} version 2",
    f"Model type: {type(model)}",
    "Model ready for use!",
    sep="\n",
)

In [0]:
import pandas as pd
import numpy as np

# Sample sentences used to try out the embedding model
sample_texts = [
    "This is a sample sentence for embedding.",
    "Machine learning models can generate text embeddings.",
    "Databricks provides powerful AI capabilities."
]

print("Generating embeddings for sample texts...")
for i, text in enumerate(sample_texts):
    # Show a 1-based position next to each sentence
    print("{}. {}".format(i + 1, text))

# Put the sentences in a one-column DataFrame; 'input' is the column name
# this notebook assumes the model reads.
input_data = pd.DataFrame(sample_texts, columns=['input'])

# Run the loaded model on the whole batch
result = model.predict(input_data)

print("\nModel output structure:")
print(f"- Type: {type(result)}")
print(f"- Keys: {list(result.keys())}")

# The embeddings are looked up under the 'data' key of the result
if 'data' not in result:
    print("No 'data' key found in result")
else:
    data = result['data']
    print("\nData structure:")
    print(f"- Type: {type(data)}")
    print(f"- Length: {len(data) if hasattr(data, '__len__') else 'N/A'}")

    # Keep the 'embedding' value of every dict item that has one; print any
    # item that does not match so its structure can be inspected.
    embeddings = []
    for item in data:
        if not (isinstance(item, dict) and 'embedding' in item):
            print(f"Item structure: {item}")
            continue
        embeddings.append(item['embedding'])

    if not embeddings:
        print("Could not extract embeddings from the data structure")
    else:
        embeddings = np.array(embeddings)
        print("\n✅ Successfully extracted embeddings!")
        print(f"- Shape: {embeddings.shape}")
        print(f"- Embedding dimension: {embeddings.shape[1]}")
        print(f"- Number of texts processed: {embeddings.shape[0]}")
        print(f"- First embedding (first 5 values): {embeddings[0][:5]}")

        # Cosine similarity of the first two embeddings: dot product
        # divided by the product of their Euclidean norms.
        if len(embeddings) > 1:
            similarity = np.dot(embeddings[0], embeddings[1]) / (
                np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1])
            )
            print(f"- Cosine similarity between first two texts: {similarity:.4f}")