In [1]:
!pip install -q torch torchvision transformers timm sentencepiece

In [2]:
import torch
from transformers import AutoModel, AutoTokenizer, pipeline
from torchvision import transforms as T
from PIL import Image

# Set device for the language model only
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize FLAVA model and tokenizer on CPU
flava_model = AutoModel.from_pretrained("facebook/flava-full").to("cpu").eval()
flava_tokenizer = AutoTokenizer.from_pretrained("facebook/flava-full")

# Initialize GPT-J or GPT-Neo, using the GPU if available
language_model = pipeline("text-generation", model="EleutherAI/gpt-neo-2.7B", device=0 if torch.cuda.is_available() else -1)


In [3]:
def preprocess_image(image_path, input_size=224):
    image = Image.open(image_path).convert("RGB")
    transform = T.Compose([
        T.Resize((input_size, input_size)),
        T.ToTensor(),
        T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
    ])
    image_tensor = transform(image).unsqueeze(0).to("cpu")  # Ensure tensor is on the correct device for FLAVA model
    return image_tensor

def get_visual_embedding(image_tensor):
    with torch.no_grad():
        # FLAVA expects the input tensor on CPU, so no need to move to 'device'
        visual_embedding = flava_model.get_image_features(image_tensor)
    return visual_embedding

def generate_text_from_visual(embedding, initial_prompt="Analyze the graph"):
    # Use a portion of the embedding in the prompt for context
    prompt = f"{initial_prompt}: {embedding.tolist()[:10]} ..."
    response = language_model(
        prompt,
        max_new_tokens=150,  # Set a cap on the number of new tokens generated
        num_return_sequences=1,
        no_repeat_ngram_size=2
    )[0]['generated_text']
    return response


In [4]:
# Execution pipeline
image_path = "/mnt/code/test_l1.png"  # Replace with your image path

# Preprocess the image
image_tensor = preprocess_image(image_path)

# Generate visual embedding using FLAVA
visual_embedding = get_visual_embedding(image_tensor)

# Generate human-interpretable insights
insight = generate_text_from_visual(visual_embedding)
print(f"Insight:\n{insight}")

Token indices sequence length is longer than the specified maximum sequence length for this model (1675580 > 2048). Running this sequence through the model will result in indexing errors
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


OutOfMemoryError: CUDA out of memory. Tried to allocate 15.98 GiB. GPU 0 has a total capacity of 22.18 GiB of which 11.69 GiB is free. Process 1746370 has 10.49 GiB memory in use. Of the allocated memory 10.19 GiB is allocated by PyTorch, and 20.13 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [5]:
torch.cuda.empty_cache()
